{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6281465059350607,
"eval_steps": 500,
"global_step": 850,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.9372913499539586,
"learning_rate": 2.4390243902439027e-06,
"loss": 1.6191,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 1.8879048490473127,
"learning_rate": 4.8780487804878055e-06,
"loss": 1.6982,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 1.857307355920545,
"learning_rate": 7.317073170731707e-06,
"loss": 1.6724,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 1.808250401091683,
"learning_rate": 9.756097560975611e-06,
"loss": 1.647,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 2.5133500505596453,
"learning_rate": 1.2195121951219513e-05,
"loss": 1.6079,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 1.2734146289947597,
"learning_rate": 1.4634146341463415e-05,
"loss": 1.5908,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 1.1812917040861377,
"learning_rate": 1.707317073170732e-05,
"loss": 1.5518,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 1.293637431287248,
"learning_rate": 1.9512195121951222e-05,
"loss": 1.5952,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 1.1620676440097686,
"learning_rate": 2.1951219512195124e-05,
"loss": 1.5493,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 1.3191260666446372,
"learning_rate": 2.4390243902439026e-05,
"loss": 1.5625,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 1.182981202097366,
"learning_rate": 2.682926829268293e-05,
"loss": 1.5498,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 1.0724491677903074,
"learning_rate": 2.926829268292683e-05,
"loss": 1.5547,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 0.9434780094091623,
"learning_rate": 3.170731707317073e-05,
"loss": 1.5327,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 1.0202543546064133,
"learning_rate": 3.414634146341464e-05,
"loss": 1.5933,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 0.996865818341891,
"learning_rate": 3.6585365853658535e-05,
"loss": 1.5796,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 0.9288105887086908,
"learning_rate": 3.9024390243902444e-05,
"loss": 1.4609,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 0.9726608694986103,
"learning_rate": 4.146341463414634e-05,
"loss": 1.5161,
"step": 17
},
{
"epoch": 0.01,
"grad_norm": 0.8619245275928736,
"learning_rate": 4.390243902439025e-05,
"loss": 1.5122,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 0.9215398746800475,
"learning_rate": 4.634146341463415e-05,
"loss": 1.5078,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 0.903097203515963,
"learning_rate": 4.878048780487805e-05,
"loss": 1.4502,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.8761498232482394,
"learning_rate": 5.121951219512195e-05,
"loss": 1.4893,
"step": 21
},
{
"epoch": 0.02,
"grad_norm": 0.8353385747464918,
"learning_rate": 5.365853658536586e-05,
"loss": 1.4717,
"step": 22
},
{
"epoch": 0.02,
"grad_norm": 0.8000291372477917,
"learning_rate": 5.6097560975609764e-05,
"loss": 1.481,
"step": 23
},
{
"epoch": 0.02,
"grad_norm": 0.8452088500727898,
"learning_rate": 5.853658536585366e-05,
"loss": 1.4644,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 0.8829309199222577,
"learning_rate": 6.097560975609756e-05,
"loss": 1.4868,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 0.8162327363449975,
"learning_rate": 6.341463414634146e-05,
"loss": 1.4883,
"step": 26
},
{
"epoch": 0.02,
"grad_norm": 0.7987925882960866,
"learning_rate": 6.585365853658538e-05,
"loss": 1.4268,
"step": 27
},
{
"epoch": 0.02,
"grad_norm": 0.7909140922467949,
"learning_rate": 6.829268292682928e-05,
"loss": 1.4873,
"step": 28
},
{
"epoch": 0.02,
"grad_norm": 0.7560592825415925,
"learning_rate": 7.073170731707317e-05,
"loss": 1.4116,
"step": 29
},
{
"epoch": 0.02,
"grad_norm": 0.7058796878894483,
"learning_rate": 7.317073170731707e-05,
"loss": 1.4023,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.7614550996113684,
"learning_rate": 7.560975609756099e-05,
"loss": 1.4312,
"step": 31
},
{
"epoch": 0.02,
"grad_norm": 0.7531993296256376,
"learning_rate": 7.804878048780489e-05,
"loss": 1.5024,
"step": 32
},
{
"epoch": 0.02,
"grad_norm": 0.7475795582718757,
"learning_rate": 8.048780487804879e-05,
"loss": 1.4363,
"step": 33
},
{
"epoch": 0.03,
"grad_norm": 0.7561530704205457,
"learning_rate": 8.292682926829268e-05,
"loss": 1.4873,
"step": 34
},
{
"epoch": 0.03,
"grad_norm": 0.7606234092420118,
"learning_rate": 8.53658536585366e-05,
"loss": 1.4204,
"step": 35
},
{
"epoch": 0.03,
"grad_norm": 0.7078849092381325,
"learning_rate": 8.78048780487805e-05,
"loss": 1.418,
"step": 36
},
{
"epoch": 0.03,
"grad_norm": 0.7583459620401868,
"learning_rate": 9.02439024390244e-05,
"loss": 1.4365,
"step": 37
},
{
"epoch": 0.03,
"grad_norm": 0.6479336734201823,
"learning_rate": 9.26829268292683e-05,
"loss": 1.3911,
"step": 38
},
{
"epoch": 0.03,
"grad_norm": 0.7138445522030739,
"learning_rate": 9.51219512195122e-05,
"loss": 1.4287,
"step": 39
},
{
"epoch": 0.03,
"grad_norm": 0.6772243082870256,
"learning_rate": 9.75609756097561e-05,
"loss": 1.3779,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.7001769060106223,
"learning_rate": 0.0001,
"loss": 1.3623,
"step": 41
},
{
"epoch": 0.03,
"grad_norm": 0.6593306891481673,
"learning_rate": 9.999985665852258e-05,
"loss": 1.3745,
"step": 42
},
{
"epoch": 0.03,
"grad_norm": 0.7111159325021309,
"learning_rate": 9.999942663491213e-05,
"loss": 1.3799,
"step": 43
},
{
"epoch": 0.03,
"grad_norm": 0.7023696510759943,
"learning_rate": 9.999870993163431e-05,
"loss": 1.4399,
"step": 44
},
{
"epoch": 0.03,
"grad_norm": 0.6736689337950041,
"learning_rate": 9.999770655279843e-05,
"loss": 1.4106,
"step": 45
},
{
"epoch": 0.03,
"grad_norm": 0.6746379997849087,
"learning_rate": 9.999641650415752e-05,
"loss": 1.4409,
"step": 46
},
{
"epoch": 0.03,
"grad_norm": 0.6615592598917496,
"learning_rate": 9.99948397931083e-05,
"loss": 1.3984,
"step": 47
},
{
"epoch": 0.04,
"grad_norm": 0.6538222984665192,
"learning_rate": 9.999297642869105e-05,
"loss": 1.4031,
"step": 48
},
{
"epoch": 0.04,
"grad_norm": 0.6129031974400467,
"learning_rate": 9.999082642158973e-05,
"loss": 1.396,
"step": 49
},
{
"epoch": 0.04,
"grad_norm": 0.6148818612628825,
"learning_rate": 9.998838978413168e-05,
"loss": 1.3574,
"step": 50
},
{
"epoch": 0.04,
"grad_norm": 0.6869612852614861,
"learning_rate": 9.99856665302878e-05,
"loss": 1.3762,
"step": 51
},
{
"epoch": 0.04,
"grad_norm": 0.7410178778694718,
"learning_rate": 9.998265667567226e-05,
"loss": 1.3481,
"step": 52
},
{
"epoch": 0.04,
"grad_norm": 0.6380516168920353,
"learning_rate": 9.997936023754257e-05,
"loss": 1.3513,
"step": 53
},
{
"epoch": 0.04,
"grad_norm": 0.6192351492724488,
"learning_rate": 9.997577723479938e-05,
"loss": 1.3662,
"step": 54
},
{
"epoch": 0.04,
"grad_norm": 0.633774941417789,
"learning_rate": 9.997190768798639e-05,
"loss": 1.3457,
"step": 55
},
{
"epoch": 0.04,
"grad_norm": 0.6016840416873676,
"learning_rate": 9.996775161929027e-05,
"loss": 1.3877,
"step": 56
},
{
"epoch": 0.04,
"grad_norm": 0.638026596140304,
"learning_rate": 9.99633090525405e-05,
"loss": 1.3892,
"step": 57
},
{
"epoch": 0.04,
"grad_norm": 0.5934027179170136,
"learning_rate": 9.995858001320926e-05,
"loss": 1.3223,
"step": 58
},
{
"epoch": 0.04,
"grad_norm": 0.6143195436309025,
"learning_rate": 9.995356452841122e-05,
"loss": 1.3862,
"step": 59
},
{
"epoch": 0.04,
"grad_norm": 0.6076935190423259,
"learning_rate": 9.994826262690347e-05,
"loss": 1.3584,
"step": 60
},
{
"epoch": 0.05,
"grad_norm": 0.6239965555110781,
"learning_rate": 9.994267433908533e-05,
"loss": 1.2771,
"step": 61
},
{
"epoch": 0.05,
"grad_norm": 0.5469871219286494,
"learning_rate": 9.99367996969981e-05,
"loss": 1.3579,
"step": 62
},
{
"epoch": 0.05,
"grad_norm": 0.5975500231663011,
"learning_rate": 9.9930638734325e-05,
"loss": 1.3872,
"step": 63
},
{
"epoch": 0.05,
"grad_norm": 0.6160102854784424,
"learning_rate": 9.992419148639087e-05,
"loss": 1.3831,
"step": 64
},
{
"epoch": 0.05,
"grad_norm": 0.5815474376554662,
"learning_rate": 9.991745799016206e-05,
"loss": 1.3745,
"step": 65
},
{
"epoch": 0.05,
"grad_norm": 0.5994591436721235,
"learning_rate": 9.991043828424612e-05,
"loss": 1.396,
"step": 66
},
{
"epoch": 0.05,
"grad_norm": 0.5896523240727669,
"learning_rate": 9.990313240889167e-05,
"loss": 1.3608,
"step": 67
},
{
"epoch": 0.05,
"grad_norm": 0.6062100949214702,
"learning_rate": 9.989554040598807e-05,
"loss": 1.2996,
"step": 68
},
{
"epoch": 0.05,
"grad_norm": 0.5941049216825265,
"learning_rate": 9.988766231906533e-05,
"loss": 1.4106,
"step": 69
},
{
"epoch": 0.05,
"grad_norm": 0.5604128113953568,
"learning_rate": 9.987949819329365e-05,
"loss": 1.3931,
"step": 70
},
{
"epoch": 0.05,
"grad_norm": 0.5519277490096212,
"learning_rate": 9.98710480754834e-05,
"loss": 1.3691,
"step": 71
},
{
"epoch": 0.05,
"grad_norm": 0.5900021330626725,
"learning_rate": 9.986231201408467e-05,
"loss": 1.4058,
"step": 72
},
{
"epoch": 0.05,
"grad_norm": 0.5699754681306506,
"learning_rate": 9.985329005918702e-05,
"loss": 1.355,
"step": 73
},
{
"epoch": 0.05,
"grad_norm": 0.593149750992695,
"learning_rate": 9.98439822625193e-05,
"loss": 1.3545,
"step": 74
},
{
"epoch": 0.06,
"grad_norm": 0.5824626045065218,
"learning_rate": 9.983438867744923e-05,
"loss": 1.3896,
"step": 75
},
{
"epoch": 0.06,
"grad_norm": 0.5900786393120402,
"learning_rate": 9.982450935898316e-05,
"loss": 1.3716,
"step": 76
},
{
"epoch": 0.06,
"grad_norm": 0.5688141367114475,
"learning_rate": 9.981434436376572e-05,
"loss": 1.3921,
"step": 77
},
{
"epoch": 0.06,
"grad_norm": 0.557565379686218,
"learning_rate": 9.980389375007955e-05,
"loss": 1.3506,
"step": 78
},
{
"epoch": 0.06,
"grad_norm": 0.5740715320740841,
"learning_rate": 9.979315757784488e-05,
"loss": 1.2917,
"step": 79
},
{
"epoch": 0.06,
"grad_norm": 0.5717745274109229,
"learning_rate": 9.97821359086193e-05,
"loss": 1.3154,
"step": 80
},
{
"epoch": 0.06,
"grad_norm": 0.609615875256831,
"learning_rate": 9.977082880559725e-05,
"loss": 1.3328,
"step": 81
},
{
"epoch": 0.06,
"grad_norm": 0.5777864702702744,
"learning_rate": 9.975923633360985e-05,
"loss": 1.3599,
"step": 82
},
{
"epoch": 0.06,
"grad_norm": 0.575948499045498,
"learning_rate": 9.974735855912436e-05,
"loss": 1.4038,
"step": 83
},
{
"epoch": 0.06,
"grad_norm": 0.550693122074238,
"learning_rate": 9.97351955502439e-05,
"loss": 1.3203,
"step": 84
},
{
"epoch": 0.06,
"grad_norm": 0.5561601283605949,
"learning_rate": 9.972274737670701e-05,
"loss": 1.3477,
"step": 85
},
{
"epoch": 0.06,
"grad_norm": 0.5601251180421914,
"learning_rate": 9.971001410988728e-05,
"loss": 1.333,
"step": 86
},
{
"epoch": 0.06,
"grad_norm": 0.6207004745075507,
"learning_rate": 9.969699582279292e-05,
"loss": 1.4048,
"step": 87
},
{
"epoch": 0.07,
"grad_norm": 0.5475040554880181,
"learning_rate": 9.968369259006634e-05,
"loss": 1.3208,
"step": 88
},
{
"epoch": 0.07,
"grad_norm": 0.6054670378552847,
"learning_rate": 9.967010448798375e-05,
"loss": 1.4131,
"step": 89
},
{
"epoch": 0.07,
"grad_norm": 0.5486336748948858,
"learning_rate": 9.965623159445471e-05,
"loss": 1.3843,
"step": 90
},
{
"epoch": 0.07,
"grad_norm": 0.585603864758025,
"learning_rate": 9.964207398902163e-05,
"loss": 1.3186,
"step": 91
},
{
"epoch": 0.07,
"grad_norm": 0.5412960874208915,
"learning_rate": 9.96276317528594e-05,
"loss": 1.2861,
"step": 92
},
{
"epoch": 0.07,
"grad_norm": 0.5442105369162202,
"learning_rate": 9.96129049687749e-05,
"loss": 1.3262,
"step": 93
},
{
"epoch": 0.07,
"grad_norm": 0.5816978676309428,
"learning_rate": 9.959789372120649e-05,
"loss": 1.3279,
"step": 94
},
{
"epoch": 0.07,
"grad_norm": 0.5557519862862452,
"learning_rate": 9.958259809622352e-05,
"loss": 1.3672,
"step": 95
},
{
"epoch": 0.07,
"grad_norm": 0.5666965195077155,
"learning_rate": 9.956701818152591e-05,
"loss": 1.3203,
"step": 96
},
{
"epoch": 0.07,
"grad_norm": 0.5354511291609182,
"learning_rate": 9.955115406644356e-05,
"loss": 1.3081,
"step": 97
},
{
"epoch": 0.07,
"grad_norm": 0.5685729288533676,
"learning_rate": 9.953500584193592e-05,
"loss": 1.3452,
"step": 98
},
{
"epoch": 0.07,
"grad_norm": 0.5922446508548838,
"learning_rate": 9.95185736005914e-05,
"loss": 1.3682,
"step": 99
},
{
"epoch": 0.07,
"grad_norm": 0.5837642463681222,
"learning_rate": 9.950185743662685e-05,
"loss": 1.3691,
"step": 100
},
{
"epoch": 0.07,
"grad_norm": 0.5761448966076219,
"learning_rate": 9.948485744588709e-05,
"loss": 1.3281,
"step": 101
},
{
"epoch": 0.08,
"grad_norm": 0.553490008569796,
"learning_rate": 9.946757372584423e-05,
"loss": 1.292,
"step": 102
},
{
"epoch": 0.08,
"grad_norm": 0.5686477341821499,
"learning_rate": 9.945000637559727e-05,
"loss": 1.3486,
"step": 103
},
{
"epoch": 0.08,
"grad_norm": 0.5772487636958804,
"learning_rate": 9.943215549587138e-05,
"loss": 1.3425,
"step": 104
},
{
"epoch": 0.08,
"grad_norm": 0.5758207849461601,
"learning_rate": 9.941402118901744e-05,
"loss": 1.3701,
"step": 105
},
{
"epoch": 0.08,
"grad_norm": 0.5582099537521159,
"learning_rate": 9.939560355901136e-05,
"loss": 1.3794,
"step": 106
},
{
"epoch": 0.08,
"grad_norm": 0.5336197399728324,
"learning_rate": 9.937690271145354e-05,
"loss": 1.3179,
"step": 107
},
{
"epoch": 0.08,
"grad_norm": 0.539037656457371,
"learning_rate": 9.935791875356832e-05,
"loss": 1.3071,
"step": 108
},
{
"epoch": 0.08,
"grad_norm": 0.5584770963502244,
"learning_rate": 9.933865179420321e-05,
"loss": 1.3945,
"step": 109
},
{
"epoch": 0.08,
"grad_norm": 0.5364047388288558,
"learning_rate": 9.931910194382837e-05,
"loss": 1.3462,
"step": 110
},
{
"epoch": 0.08,
"grad_norm": 0.5956933567804931,
"learning_rate": 9.929926931453599e-05,
"loss": 1.2585,
"step": 111
},
{
"epoch": 0.08,
"grad_norm": 0.5548298244830802,
"learning_rate": 9.927915402003964e-05,
"loss": 1.3765,
"step": 112
},
{
"epoch": 0.08,
"grad_norm": 0.5528131728204222,
"learning_rate": 9.92587561756735e-05,
"loss": 1.3452,
"step": 113
},
{
"epoch": 0.08,
"grad_norm": 0.5181397205586854,
"learning_rate": 9.92380758983919e-05,
"loss": 1.2671,
"step": 114
},
{
"epoch": 0.08,
"grad_norm": 0.5429954425262675,
"learning_rate": 9.921711330676848e-05,
"loss": 1.3574,
"step": 115
},
{
"epoch": 0.09,
"grad_norm": 0.5523231773869766,
"learning_rate": 9.919586852099562e-05,
"loss": 1.3184,
"step": 116
},
{
"epoch": 0.09,
"grad_norm": 0.5583959107787768,
"learning_rate": 9.917434166288364e-05,
"loss": 1.3442,
"step": 117
},
{
"epoch": 0.09,
"grad_norm": 0.5850081526075311,
"learning_rate": 9.915253285586024e-05,
"loss": 1.3477,
"step": 118
},
{
"epoch": 0.09,
"grad_norm": 0.5498743192645993,
"learning_rate": 9.913044222496966e-05,
"loss": 1.3398,
"step": 119
},
{
"epoch": 0.09,
"grad_norm": 0.5853233345937257,
"learning_rate": 9.910806989687206e-05,
"loss": 1.3276,
"step": 120
},
{
"epoch": 0.09,
"grad_norm": 0.559389561256856,
"learning_rate": 9.908541599984276e-05,
"loss": 1.3462,
"step": 121
},
{
"epoch": 0.09,
"grad_norm": 0.5298088621667728,
"learning_rate": 9.906248066377143e-05,
"loss": 1.2568,
"step": 122
},
{
"epoch": 0.09,
"grad_norm": 0.5731884986496186,
"learning_rate": 9.903926402016153e-05,
"loss": 1.3394,
"step": 123
},
{
"epoch": 0.09,
"grad_norm": 0.5549155957971303,
"learning_rate": 9.901576620212933e-05,
"loss": 1.311,
"step": 124
},
{
"epoch": 0.09,
"grad_norm": 0.5620092141236146,
"learning_rate": 9.899198734440335e-05,
"loss": 1.291,
"step": 125
},
{
"epoch": 0.09,
"grad_norm": 0.5405164924320079,
"learning_rate": 9.896792758332341e-05,
"loss": 1.248,
"step": 126
},
{
"epoch": 0.09,
"grad_norm": 0.5602202105737174,
"learning_rate": 9.894358705684002e-05,
"loss": 1.3115,
"step": 127
},
{
"epoch": 0.09,
"grad_norm": 0.5580296998093701,
"learning_rate": 9.891896590451344e-05,
"loss": 1.2947,
"step": 128
},
{
"epoch": 0.1,
"grad_norm": 0.5755635897570144,
"learning_rate": 9.889406426751296e-05,
"loss": 1.3086,
"step": 129
},
{
"epoch": 0.1,
"grad_norm": 0.6025851962917577,
"learning_rate": 9.886888228861608e-05,
"loss": 1.3447,
"step": 130
},
{
"epoch": 0.1,
"grad_norm": 0.5660419268974345,
"learning_rate": 9.88434201122077e-05,
"loss": 1.3232,
"step": 131
},
{
"epoch": 0.1,
"grad_norm": 0.5495648120402916,
"learning_rate": 9.881767788427925e-05,
"loss": 1.3096,
"step": 132
},
{
"epoch": 0.1,
"grad_norm": 0.5577872798163368,
"learning_rate": 9.879165575242787e-05,
"loss": 1.291,
"step": 133
},
{
"epoch": 0.1,
"grad_norm": 0.5540620803629338,
"learning_rate": 9.876535386585561e-05,
"loss": 1.335,
"step": 134
},
{
"epoch": 0.1,
"grad_norm": 0.5573425731012122,
"learning_rate": 9.873877237536853e-05,
"loss": 1.2327,
"step": 135
},
{
"epoch": 0.1,
"grad_norm": 0.5827857038389533,
"learning_rate": 9.871191143337582e-05,
"loss": 1.3333,
"step": 136
},
{
"epoch": 0.1,
"grad_norm": 0.5897883061496167,
"learning_rate": 9.868477119388896e-05,
"loss": 1.3076,
"step": 137
},
{
"epoch": 0.1,
"grad_norm": 0.5800275384221499,
"learning_rate": 9.865735181252085e-05,
"loss": 1.3188,
"step": 138
},
{
"epoch": 0.1,
"grad_norm": 0.5605765677262206,
"learning_rate": 9.862965344648485e-05,
"loss": 1.3086,
"step": 139
},
{
"epoch": 0.1,
"grad_norm": 0.5432447170586258,
"learning_rate": 9.860167625459398e-05,
"loss": 1.2861,
"step": 140
},
{
"epoch": 0.1,
"grad_norm": 0.5687257803544524,
"learning_rate": 9.85734203972599e-05,
"loss": 1.2839,
"step": 141
},
{
"epoch": 0.1,
"grad_norm": 0.5475328993701518,
"learning_rate": 9.854488603649206e-05,
"loss": 1.3169,
"step": 142
},
{
"epoch": 0.11,
"grad_norm": 0.5408143803639806,
"learning_rate": 9.851607333589677e-05,
"loss": 1.3374,
"step": 143
},
{
"epoch": 0.11,
"grad_norm": 0.5350053494827027,
"learning_rate": 9.848698246067623e-05,
"loss": 1.2888,
"step": 144
},
{
"epoch": 0.11,
"grad_norm": 0.5642075781884446,
"learning_rate": 9.84576135776276e-05,
"loss": 1.3105,
"step": 145
},
{
"epoch": 0.11,
"grad_norm": 0.5725161088840623,
"learning_rate": 9.842796685514203e-05,
"loss": 1.3516,
"step": 146
},
{
"epoch": 0.11,
"grad_norm": 0.5837888943455876,
"learning_rate": 9.839804246320375e-05,
"loss": 1.2871,
"step": 147
},
{
"epoch": 0.11,
"grad_norm": 0.5833329842842448,
"learning_rate": 9.836784057338899e-05,
"loss": 1.3232,
"step": 148
},
{
"epoch": 0.11,
"grad_norm": 0.5244172538585695,
"learning_rate": 9.833736135886512e-05,
"loss": 1.2568,
"step": 149
},
{
"epoch": 0.11,
"grad_norm": 0.5163576076330887,
"learning_rate": 9.830660499438955e-05,
"loss": 1.2759,
"step": 150
},
{
"epoch": 0.11,
"grad_norm": 0.5617840717093857,
"learning_rate": 9.827557165630879e-05,
"loss": 1.2524,
"step": 151
},
{
"epoch": 0.11,
"grad_norm": 0.547220410155329,
"learning_rate": 9.824426152255741e-05,
"loss": 1.312,
"step": 152
},
{
"epoch": 0.11,
"grad_norm": 0.5715922980351898,
"learning_rate": 9.821267477265705e-05,
"loss": 1.335,
"step": 153
},
{
"epoch": 0.11,
"grad_norm": 0.5626236612178414,
"learning_rate": 9.818081158771538e-05,
"loss": 1.3633,
"step": 154
},
{
"epoch": 0.11,
"grad_norm": 0.556817713740677,
"learning_rate": 9.814867215042502e-05,
"loss": 1.3345,
"step": 155
},
{
"epoch": 0.12,
"grad_norm": 0.5658424328358594,
"learning_rate": 9.811625664506259e-05,
"loss": 1.3325,
"step": 156
},
{
"epoch": 0.12,
"grad_norm": 0.5518987143292007,
"learning_rate": 9.808356525748748e-05,
"loss": 1.3179,
"step": 157
},
{
"epoch": 0.12,
"grad_norm": 0.5509045139485853,
"learning_rate": 9.805059817514101e-05,
"loss": 1.3276,
"step": 158
},
{
"epoch": 0.12,
"grad_norm": 0.5612999607711056,
"learning_rate": 9.801735558704517e-05,
"loss": 1.2192,
"step": 159
},
{
"epoch": 0.12,
"grad_norm": 0.530326353544212,
"learning_rate": 9.798383768380164e-05,
"loss": 1.2988,
"step": 160
},
{
"epoch": 0.12,
"grad_norm": 0.5524425336112486,
"learning_rate": 9.795004465759065e-05,
"loss": 1.2622,
"step": 161
},
{
"epoch": 0.12,
"grad_norm": 0.5121240819278214,
"learning_rate": 9.791597670216989e-05,
"loss": 1.2603,
"step": 162
},
{
"epoch": 0.12,
"grad_norm": 0.5262701595678754,
"learning_rate": 9.78816340128734e-05,
"loss": 1.22,
"step": 163
},
{
"epoch": 0.12,
"grad_norm": 0.5866254674193113,
"learning_rate": 9.784701678661045e-05,
"loss": 1.311,
"step": 164
},
{
"epoch": 0.12,
"grad_norm": 0.567120419528464,
"learning_rate": 9.781212522186443e-05,
"loss": 1.3145,
"step": 165
},
{
"epoch": 0.12,
"grad_norm": 0.5704512174009239,
"learning_rate": 9.777695951869164e-05,
"loss": 1.2612,
"step": 166
},
{
"epoch": 0.12,
"grad_norm": 0.5359884622353506,
"learning_rate": 9.774151987872027e-05,
"loss": 1.2117,
"step": 167
},
{
"epoch": 0.12,
"grad_norm": 0.5772321074843504,
"learning_rate": 9.770580650514914e-05,
"loss": 1.3525,
"step": 168
},
{
"epoch": 0.12,
"grad_norm": 0.5316876920831217,
"learning_rate": 9.766981960274653e-05,
"loss": 1.3442,
"step": 169
},
{
"epoch": 0.13,
"grad_norm": 0.5622203218145027,
"learning_rate": 9.763355937784909e-05,
"loss": 1.2964,
"step": 170
},
{
"epoch": 0.13,
"grad_norm": 0.5614932814360857,
"learning_rate": 9.759702603836059e-05,
"loss": 1.3389,
"step": 171
},
{
"epoch": 0.13,
"grad_norm": 0.568962837143467,
"learning_rate": 9.756021979375071e-05,
"loss": 1.3174,
"step": 172
},
{
"epoch": 0.13,
"grad_norm": 0.5382419139994956,
"learning_rate": 9.752314085505395e-05,
"loss": 1.3125,
"step": 173
},
{
"epoch": 0.13,
"grad_norm": 0.5677837729549118,
"learning_rate": 9.748578943486828e-05,
"loss": 1.2871,
"step": 174
},
{
"epoch": 0.13,
"grad_norm": 0.5602612877442024,
"learning_rate": 9.744816574735405e-05,
"loss": 1.3438,
"step": 175
},
{
"epoch": 0.13,
"grad_norm": 0.5735194400650546,
"learning_rate": 9.74102700082326e-05,
"loss": 1.3208,
"step": 176
},
{
"epoch": 0.13,
"grad_norm": 0.5670876099448275,
"learning_rate": 9.737210243478521e-05,
"loss": 1.2969,
"step": 177
},
{
"epoch": 0.13,
"grad_norm": 0.5450536272385241,
"learning_rate": 9.733366324585175e-05,
"loss": 1.2673,
"step": 178
},
{
"epoch": 0.13,
"grad_norm": 0.5340701964695135,
"learning_rate": 9.72949526618294e-05,
"loss": 1.3403,
"step": 179
},
{
"epoch": 0.13,
"grad_norm": 0.5422933717116616,
"learning_rate": 9.725597090467144e-05,
"loss": 1.2539,
"step": 180
},
{
"epoch": 0.13,
"grad_norm": 0.5680150103490264,
"learning_rate": 9.721671819788602e-05,
"loss": 1.3149,
"step": 181
},
{
"epoch": 0.13,
"grad_norm": 0.560101859043945,
"learning_rate": 9.717719476653475e-05,
"loss": 1.321,
"step": 182
},
{
"epoch": 0.14,
"grad_norm": 0.5267278121510764,
"learning_rate": 9.71374008372315e-05,
"loss": 1.2227,
"step": 183
},
{
"epoch": 0.14,
"grad_norm": 0.5687530339596342,
"learning_rate": 9.709733663814113e-05,
"loss": 1.3159,
"step": 184
},
{
"epoch": 0.14,
"grad_norm": 0.5321503974993333,
"learning_rate": 9.705700239897809e-05,
"loss": 1.3188,
"step": 185
},
{
"epoch": 0.14,
"grad_norm": 0.5593956329311583,
"learning_rate": 9.701639835100513e-05,
"loss": 1.249,
"step": 186
},
{
"epoch": 0.14,
"grad_norm": 0.5591047172889141,
"learning_rate": 9.697552472703205e-05,
"loss": 1.2756,
"step": 187
},
{
"epoch": 0.14,
"grad_norm": 0.5543029039316694,
"learning_rate": 9.693438176141425e-05,
"loss": 1.2915,
"step": 188
},
{
"epoch": 0.14,
"grad_norm": 0.5494961227055172,
"learning_rate": 9.68929696900515e-05,
"loss": 1.313,
"step": 189
},
{
"epoch": 0.14,
"grad_norm": 0.5541252042617403,
"learning_rate": 9.685128875038647e-05,
"loss": 1.2754,
"step": 190
},
{
"epoch": 0.14,
"grad_norm": 0.5163534781462605,
"learning_rate": 9.680933918140348e-05,
"loss": 1.2681,
"step": 191
},
{
"epoch": 0.14,
"grad_norm": 0.537157272716453,
"learning_rate": 9.676712122362706e-05,
"loss": 1.2551,
"step": 192
},
{
"epoch": 0.14,
"grad_norm": 0.5397175193183968,
"learning_rate": 9.672463511912055e-05,
"loss": 1.2822,
"step": 193
},
{
"epoch": 0.14,
"grad_norm": 0.5488691397441863,
"learning_rate": 9.668188111148484e-05,
"loss": 1.283,
"step": 194
},
{
"epoch": 0.14,
"grad_norm": 0.5905761212464122,
"learning_rate": 9.66388594458568e-05,
"loss": 1.2896,
"step": 195
},
{
"epoch": 0.14,
"grad_norm": 0.580369444338734,
"learning_rate": 9.659557036890801e-05,
"loss": 1.3416,
"step": 196
},
{
"epoch": 0.15,
"grad_norm": 0.5262728809847318,
"learning_rate": 9.655201412884327e-05,
"loss": 1.2554,
"step": 197
},
{
"epoch": 0.15,
"grad_norm": 0.5375550652008795,
"learning_rate": 9.650819097539922e-05,
"loss": 1.2612,
"step": 198
},
{
"epoch": 0.15,
"grad_norm": 0.5208197207069616,
"learning_rate": 9.646410115984289e-05,
"loss": 1.2358,
"step": 199
},
{
"epoch": 0.15,
"grad_norm": 0.5409371788748774,
"learning_rate": 9.641974493497024e-05,
"loss": 1.3262,
"step": 200
},
{
"epoch": 0.15,
"grad_norm": 0.5389211233425135,
"learning_rate": 9.637512255510475e-05,
"loss": 1.2729,
"step": 201
},
{
"epoch": 0.15,
"grad_norm": 0.5501782779153785,
"learning_rate": 9.633023427609591e-05,
"loss": 1.2322,
"step": 202
},
{
"epoch": 0.15,
"grad_norm": 0.5678681105856288,
"learning_rate": 9.628508035531785e-05,
"loss": 1.3721,
"step": 203
},
{
"epoch": 0.15,
"grad_norm": 0.5559621306210715,
"learning_rate": 9.623966105166772e-05,
"loss": 1.3267,
"step": 204
},
{
"epoch": 0.15,
"grad_norm": 0.5417687907113425,
"learning_rate": 9.619397662556435e-05,
"loss": 1.2666,
"step": 205
},
{
"epoch": 0.15,
"grad_norm": 0.5546614199696198,
"learning_rate": 9.614802733894665e-05,
"loss": 1.3389,
"step": 206
},
{
"epoch": 0.15,
"grad_norm": 0.5594799442475286,
"learning_rate": 9.610181345527217e-05,
"loss": 1.2671,
"step": 207
},
{
"epoch": 0.15,
"grad_norm": 0.5852167375394156,
"learning_rate": 9.605533523951558e-05,
"loss": 1.3335,
"step": 208
},
{
"epoch": 0.15,
"grad_norm": 0.5465110917787175,
"learning_rate": 9.600859295816708e-05,
"loss": 1.3096,
"step": 209
},
{
"epoch": 0.16,
"grad_norm": 0.5704616015169348,
"learning_rate": 9.596158687923104e-05,
"loss": 1.3022,
"step": 210
},
{
"epoch": 0.16,
"grad_norm": 0.5617616139462727,
"learning_rate": 9.591431727222424e-05,
"loss": 1.3159,
"step": 211
},
{
"epoch": 0.16,
"grad_norm": 0.5465602681324426,
"learning_rate": 9.586678440817453e-05,
"loss": 1.2708,
"step": 212
},
{
"epoch": 0.16,
"grad_norm": 0.5864421378413351,
"learning_rate": 9.581898855961912e-05,
"loss": 1.2607,
"step": 213
},
{
"epoch": 0.16,
"grad_norm": 0.556548001041405,
"learning_rate": 9.577093000060312e-05,
"loss": 1.3081,
"step": 214
},
{
"epoch": 0.16,
"grad_norm": 0.5642842704902283,
"learning_rate": 9.572260900667794e-05,
"loss": 1.2759,
"step": 215
},
{
"epoch": 0.16,
"grad_norm": 0.5486665255067006,
"learning_rate": 9.567402585489963e-05,
"loss": 1.2104,
"step": 216
},
{
"epoch": 0.16,
"grad_norm": 0.5361207508020517,
"learning_rate": 9.56251808238275e-05,
"loss": 1.2451,
"step": 217
},
{
"epoch": 0.16,
"grad_norm": 0.5149380805556683,
"learning_rate": 9.557607419352226e-05,
"loss": 1.2778,
"step": 218
},
{
"epoch": 0.16,
"grad_norm": 0.5469266902951428,
"learning_rate": 9.552670624554461e-05,
"loss": 1.2617,
"step": 219
},
{
"epoch": 0.16,
"grad_norm": 0.5430295319416,
"learning_rate": 9.54770772629535e-05,
"loss": 1.2915,
"step": 220
},
{
"epoch": 0.16,
"grad_norm": 0.5744217791056692,
"learning_rate": 9.542718753030463e-05,
"loss": 1.3281,
"step": 221
},
{
"epoch": 0.16,
"grad_norm": 0.5587545969611539,
"learning_rate": 9.537703733364871e-05,
"loss": 1.2837,
"step": 222
},
{
"epoch": 0.16,
"grad_norm": 0.5288053303373643,
"learning_rate": 9.532662696052985e-05,
"loss": 1.2949,
"step": 223
},
{
"epoch": 0.17,
"grad_norm": 0.5791175310063906,
"learning_rate": 9.527595669998399e-05,
"loss": 1.2917,
"step": 224
},
{
"epoch": 0.17,
"grad_norm": 0.5250029719207272,
"learning_rate": 9.522502684253709e-05,
"loss": 1.2375,
"step": 225
},
{
"epoch": 0.17,
"grad_norm": 0.5177601049436101,
"learning_rate": 9.517383768020361e-05,
"loss": 1.2695,
"step": 226
},
{
"epoch": 0.17,
"grad_norm": 0.5554993860583297,
"learning_rate": 9.512238950648474e-05,
"loss": 1.2917,
"step": 227
},
{
"epoch": 0.17,
"grad_norm": 0.5738329488665082,
"learning_rate": 9.507068261636679e-05,
"loss": 1.2944,
"step": 228
},
{
"epoch": 0.17,
"grad_norm": 0.5562896023700302,
"learning_rate": 9.501871730631942e-05,
"loss": 1.3296,
"step": 229
},
{
"epoch": 0.17,
"grad_norm": 0.5416347008024398,
"learning_rate": 9.496649387429404e-05,
"loss": 1.2437,
"step": 230
},
{
"epoch": 0.17,
"grad_norm": 0.5699356753997783,
"learning_rate": 9.491401261972195e-05,
"loss": 1.2705,
"step": 231
},
{
"epoch": 0.17,
"grad_norm": 0.5481624625613764,
"learning_rate": 9.486127384351282e-05,
"loss": 1.3779,
"step": 232
},
{
"epoch": 0.17,
"grad_norm": 0.5688206917165098,
"learning_rate": 9.480827784805278e-05,
"loss": 1.2754,
"step": 233
},
{
"epoch": 0.17,
"grad_norm": 0.5490377714658476,
"learning_rate": 9.475502493720283e-05,
"loss": 1.3125,
"step": 234
},
{
"epoch": 0.17,
"grad_norm": 0.5355672804730123,
"learning_rate": 9.470151541629699e-05,
"loss": 1.2627,
"step": 235
},
{
"epoch": 0.17,
"grad_norm": 0.5905840590902287,
"learning_rate": 9.464774959214063e-05,
"loss": 1.3027,
"step": 236
},
{
"epoch": 0.18,
"grad_norm": 0.56064622426517,
"learning_rate": 9.459372777300864e-05,
"loss": 1.2065,
"step": 237
},
{
"epoch": 0.18,
"grad_norm": 0.5568610691565873,
"learning_rate": 9.45394502686437e-05,
"loss": 1.3223,
"step": 238
},
{
"epoch": 0.18,
"grad_norm": 0.5300725401389981,
"learning_rate": 9.448491739025454e-05,
"loss": 1.2805,
"step": 239
},
{
"epoch": 0.18,
"grad_norm": 0.5519662242216672,
"learning_rate": 9.44301294505141e-05,
"loss": 1.2371,
"step": 240
},
{
"epoch": 0.18,
"grad_norm": 0.5402101018249572,
"learning_rate": 9.437508676355773e-05,
"loss": 1.2749,
"step": 241
},
{
"epoch": 0.18,
"grad_norm": 0.5389383005608104,
"learning_rate": 9.431978964498143e-05,
"loss": 1.2876,
"step": 242
},
{
"epoch": 0.18,
"grad_norm": 0.5310718244911751,
"learning_rate": 9.426423841184005e-05,
"loss": 1.3057,
"step": 243
},
{
"epoch": 0.18,
"grad_norm": 0.5454082533825911,
"learning_rate": 9.420843338264542e-05,
"loss": 1.2578,
"step": 244
},
{
"epoch": 0.18,
"grad_norm": 0.565349361879851,
"learning_rate": 9.415237487736452e-05,
"loss": 1.3306,
"step": 245
},
{
"epoch": 0.18,
"grad_norm": 0.5224746893789486,
"learning_rate": 9.409606321741775e-05,
"loss": 1.2598,
"step": 246
},
{
"epoch": 0.18,
"grad_norm": 0.5440997273729092,
"learning_rate": 9.403949872567695e-05,
"loss": 1.2749,
"step": 247
},
{
"epoch": 0.18,
"grad_norm": 0.5668696203741111,
"learning_rate": 9.398268172646365e-05,
"loss": 1.2739,
"step": 248
},
{
"epoch": 0.18,
"grad_norm": 0.538410569856225,
"learning_rate": 9.392561254554713e-05,
"loss": 1.2734,
"step": 249
},
{
"epoch": 0.18,
"grad_norm": 0.5458663263053075,
"learning_rate": 9.386829151014262e-05,
"loss": 1.3101,
"step": 250
},
{
"epoch": 0.19,
"grad_norm": 0.537905713825921,
"learning_rate": 9.381071894890941e-05,
"loss": 1.2666,
"step": 251
},
{
"epoch": 0.19,
"grad_norm": 0.5288916095430457,
"learning_rate": 9.375289519194894e-05,
"loss": 1.2666,
"step": 252
},
{
"epoch": 0.19,
"grad_norm": 0.5335913282729025,
"learning_rate": 9.369482057080292e-05,
"loss": 1.2886,
"step": 253
},
{
"epoch": 0.19,
"grad_norm": 0.5523824410197196,
"learning_rate": 9.363649541845142e-05,
"loss": 1.2571,
"step": 254
},
{
"epoch": 0.19,
"grad_norm": 0.5912264857528259,
"learning_rate": 9.357792006931098e-05,
"loss": 1.261,
"step": 255
},
{
"epoch": 0.19,
"grad_norm": 0.5594499774840426,
"learning_rate": 9.35190948592327e-05,
"loss": 1.3027,
"step": 256
},
{
"epoch": 0.19,
"grad_norm": 0.5379207919206825,
"learning_rate": 9.346002012550027e-05,
"loss": 1.2983,
"step": 257
},
{
"epoch": 0.19,
"grad_norm": 0.5455629199690059,
"learning_rate": 9.340069620682806e-05,
"loss": 1.2695,
"step": 258
},
{
"epoch": 0.19,
"grad_norm": 0.5471737544580354,
"learning_rate": 9.334112344335924e-05,
"loss": 1.3047,
"step": 259
},
{
"epoch": 0.19,
"grad_norm": 0.5397100655209365,
"learning_rate": 9.328130217666366e-05,
"loss": 1.2896,
"step": 260
},
{
"epoch": 0.19,
"grad_norm": 0.5636004509867364,
"learning_rate": 9.322123274973613e-05,
"loss": 1.3501,
"step": 261
},
{
"epoch": 0.19,
"grad_norm": 0.5605154015144495,
"learning_rate": 9.316091550699424e-05,
"loss": 1.2983,
"step": 262
},
{
"epoch": 0.19,
"grad_norm": 0.5461515781521593,
"learning_rate": 9.310035079427651e-05,
"loss": 1.269,
"step": 263
},
{
"epoch": 0.2,
"grad_norm": 0.5175024878789147,
"learning_rate": 9.303953895884033e-05,
"loss": 1.1653,
"step": 264
},
{
"epoch": 0.2,
"grad_norm": 0.5224669601631107,
"learning_rate": 9.297848034936006e-05,
"loss": 1.2554,
"step": 265
},
{
"epoch": 0.2,
"grad_norm": 0.5444106809363777,
"learning_rate": 9.291717531592494e-05,
"loss": 1.293,
"step": 266
},
{
"epoch": 0.2,
"grad_norm": 0.5287552712313793,
"learning_rate": 9.285562421003715e-05,
"loss": 1.2651,
"step": 267
},
{
"epoch": 0.2,
"grad_norm": 0.5381309609110954,
"learning_rate": 9.279382738460971e-05,
"loss": 1.2812,
"step": 268
},
{
"epoch": 0.2,
"grad_norm": 0.5528803396804242,
"learning_rate": 9.273178519396459e-05,
"loss": 1.3149,
"step": 269
},
{
"epoch": 0.2,
"grad_norm": 0.5270531797880375,
"learning_rate": 9.266949799383053e-05,
"loss": 1.2615,
"step": 270
},
{
"epoch": 0.2,
"grad_norm": 0.5488129774725259,
"learning_rate": 9.260696614134114e-05,
"loss": 1.2837,
"step": 271
},
{
"epoch": 0.2,
"grad_norm": 0.5335083589116082,
"learning_rate": 9.254418999503271e-05,
"loss": 1.2339,
"step": 272
},
{
"epoch": 0.2,
"grad_norm": 0.5974061497388541,
"learning_rate": 9.248116991484229e-05,
"loss": 1.2825,
"step": 273
},
{
"epoch": 0.2,
"grad_norm": 0.5381713380415607,
"learning_rate": 9.241790626210549e-05,
"loss": 1.1895,
"step": 274
},
{
"epoch": 0.2,
"grad_norm": 0.5384430847504001,
"learning_rate": 9.235439939955457e-05,
"loss": 1.2358,
"step": 275
},
{
"epoch": 0.2,
"grad_norm": 0.5256588888016233,
"learning_rate": 9.229064969131621e-05,
"loss": 1.2407,
"step": 276
},
{
"epoch": 0.2,
"grad_norm": 0.5242296953154587,
"learning_rate": 9.222665750290953e-05,
"loss": 1.2832,
"step": 277
},
{
"epoch": 0.21,
"grad_norm": 0.5224106607183625,
"learning_rate": 9.216242320124388e-05,
"loss": 1.2388,
"step": 278
},
{
"epoch": 0.21,
"grad_norm": 0.540400861953043,
"learning_rate": 9.20979471546169e-05,
"loss": 1.2695,
"step": 279
},
{
"epoch": 0.21,
"grad_norm": 0.5289483661482471,
"learning_rate": 9.203322973271223e-05,
"loss": 1.2832,
"step": 280
},
{
"epoch": 0.21,
"grad_norm": 0.5376637104674151,
"learning_rate": 9.19682713065975e-05,
"loss": 1.2783,
"step": 281
},
{
"epoch": 0.21,
"grad_norm": 0.5547766359095799,
"learning_rate": 9.19030722487222e-05,
"loss": 1.2515,
"step": 282
},
{
"epoch": 0.21,
"grad_norm": 0.5431030883095361,
"learning_rate": 9.183763293291549e-05,
"loss": 1.2346,
"step": 283
},
{
"epoch": 0.21,
"grad_norm": 0.5767856753870191,
"learning_rate": 9.17719537343841e-05,
"loss": 1.2974,
"step": 284
},
{
"epoch": 0.21,
"grad_norm": 0.5356401648893151,
"learning_rate": 9.170603502971016e-05,
"loss": 1.2532,
"step": 285
},
{
"epoch": 0.21,
"grad_norm": 0.5528695803408737,
"learning_rate": 9.163987719684907e-05,
"loss": 1.3442,
"step": 286
},
{
"epoch": 0.21,
"grad_norm": 0.5356080125920785,
"learning_rate": 9.157348061512727e-05,
"loss": 1.2686,
"step": 287
},
{
"epoch": 0.21,
"grad_norm": 0.5778656916381988,
"learning_rate": 9.150684566524012e-05,
"loss": 1.2041,
"step": 288
},
{
"epoch": 0.21,
"grad_norm": 0.5328749801157324,
"learning_rate": 9.143997272924973e-05,
"loss": 1.2437,
"step": 289
},
{
"epoch": 0.21,
"grad_norm": 0.5656275076768376,
"learning_rate": 9.13728621905827e-05,
"loss": 1.2886,
"step": 290
},
{
"epoch": 0.22,
"grad_norm": 0.5655646337419664,
"learning_rate": 9.130551443402799e-05,
"loss": 1.2783,
"step": 291
},
{
"epoch": 0.22,
"grad_norm": 0.567975953014803,
"learning_rate": 9.123792984573466e-05,
"loss": 1.3223,
"step": 292
},
{
"epoch": 0.22,
"grad_norm": 0.5361585380833186,
"learning_rate": 9.117010881320973e-05,
"loss": 1.2231,
"step": 293
},
{
"epoch": 0.22,
"grad_norm": 0.5527612532950269,
"learning_rate": 9.110205172531585e-05,
"loss": 1.3506,
"step": 294
},
{
"epoch": 0.22,
"grad_norm": 0.5330323483779986,
"learning_rate": 9.103375897226918e-05,
"loss": 1.2974,
"step": 295
},
{
"epoch": 0.22,
"grad_norm": 0.541076058179259,
"learning_rate": 9.096523094563708e-05,
"loss": 1.2617,
"step": 296
},
{
"epoch": 0.22,
"grad_norm": 0.5340836977689315,
"learning_rate": 9.089646803833589e-05,
"loss": 1.2603,
"step": 297
},
{
"epoch": 0.22,
"grad_norm": 0.5383753245320845,
"learning_rate": 9.082747064462867e-05,
"loss": 1.2583,
"step": 298
},
{
"epoch": 0.22,
"grad_norm": 0.5192836861689345,
"learning_rate": 9.075823916012298e-05,
"loss": 1.2568,
"step": 299
},
{
"epoch": 0.22,
"grad_norm": 0.5744817919271316,
"learning_rate": 9.068877398176852e-05,
"loss": 1.2131,
"step": 300
},
{
"epoch": 0.22,
"grad_norm": 0.5323047093147705,
"learning_rate": 9.061907550785498e-05,
"loss": 1.2783,
"step": 301
},
{
"epoch": 0.22,
"grad_norm": 0.5607328564400242,
"learning_rate": 9.054914413800961e-05,
"loss": 1.3398,
"step": 302
},
{
"epoch": 0.22,
"grad_norm": 0.5782257895199574,
"learning_rate": 9.047898027319507e-05,
"loss": 1.2759,
"step": 303
},
{
"epoch": 0.22,
"grad_norm": 0.546644793451931,
"learning_rate": 9.040858431570702e-05,
"loss": 1.2632,
"step": 304
},
{
"epoch": 0.23,
"grad_norm": 0.5535852227341702,
"learning_rate": 9.033795666917191e-05,
"loss": 1.312,
"step": 305
},
{
"epoch": 0.23,
"grad_norm": 0.5371002551511538,
"learning_rate": 9.026709773854457e-05,
"loss": 1.2593,
"step": 306
},
{
"epoch": 0.23,
"grad_norm": 0.5394441228369942,
"learning_rate": 9.019600793010597e-05,
"loss": 1.269,
"step": 307
},
{
"epoch": 0.23,
"grad_norm": 0.5512445550522174,
"learning_rate": 9.012468765146079e-05,
"loss": 1.2686,
"step": 308
},
{
"epoch": 0.23,
"grad_norm": 0.5043850111181398,
"learning_rate": 9.005313731153524e-05,
"loss": 1.2363,
"step": 309
},
{
"epoch": 0.23,
"grad_norm": 0.5294693808157453,
"learning_rate": 8.998135732057458e-05,
"loss": 1.2725,
"step": 310
},
{
"epoch": 0.23,
"grad_norm": 0.5235449664008548,
"learning_rate": 8.990934809014077e-05,
"loss": 1.249,
"step": 311
},
{
"epoch": 0.23,
"grad_norm": 0.5228082226582549,
"learning_rate": 8.983711003311024e-05,
"loss": 1.2153,
"step": 312
},
{
"epoch": 0.23,
"grad_norm": 0.5525620828249341,
"learning_rate": 8.976464356367134e-05,
"loss": 1.2136,
"step": 313
},
{
"epoch": 0.23,
"grad_norm": 0.5605215996168639,
"learning_rate": 8.96919490973221e-05,
"loss": 1.271,
"step": 314
},
{
"epoch": 0.23,
"grad_norm": 0.5277359930208506,
"learning_rate": 8.961902705086785e-05,
"loss": 1.1836,
"step": 315
},
{
"epoch": 0.23,
"grad_norm": 0.5405930304733125,
"learning_rate": 8.954587784241871e-05,
"loss": 1.2705,
"step": 316
},
{
"epoch": 0.23,
"grad_norm": 0.5248476194932483,
"learning_rate": 8.947250189138731e-05,
"loss": 1.2607,
"step": 317
},
{
"epoch": 0.24,
"grad_norm": 0.573678896783169,
"learning_rate": 8.939889961848634e-05,
"loss": 1.2727,
"step": 318
},
{
"epoch": 0.24,
"grad_norm": 0.5773485095137408,
"learning_rate": 8.932507144572616e-05,
"loss": 1.2607,
"step": 319
},
{
"epoch": 0.24,
"grad_norm": 0.5633980526681968,
"learning_rate": 8.925101779641232e-05,
"loss": 1.1917,
"step": 320
},
{
"epoch": 0.24,
"grad_norm": 0.5300371631849218,
"learning_rate": 8.917673909514322e-05,
"loss": 1.3105,
"step": 321
},
{
"epoch": 0.24,
"grad_norm": 0.5310192196200603,
"learning_rate": 8.910223576780758e-05,
"loss": 1.2808,
"step": 322
},
{
"epoch": 0.24,
"grad_norm": 0.5234569464366723,
"learning_rate": 8.902750824158212e-05,
"loss": 1.2468,
"step": 323
},
{
"epoch": 0.24,
"grad_norm": 0.5473770126434013,
"learning_rate": 8.895255694492896e-05,
"loss": 1.2676,
"step": 324
},
{
"epoch": 0.24,
"grad_norm": 0.5670393642092653,
"learning_rate": 8.887738230759333e-05,
"loss": 1.2456,
"step": 325
},
{
"epoch": 0.24,
"grad_norm": 0.5484650752546845,
"learning_rate": 8.880198476060095e-05,
"loss": 1.251,
"step": 326
},
{
"epoch": 0.24,
"grad_norm": 0.5569076336735002,
"learning_rate": 8.872636473625565e-05,
"loss": 1.272,
"step": 327
},
{
"epoch": 0.24,
"grad_norm": 0.5237290090420638,
"learning_rate": 8.865052266813685e-05,
"loss": 1.2822,
"step": 328
},
{
"epoch": 0.24,
"grad_norm": 0.5507489271814671,
"learning_rate": 8.857445899109715e-05,
"loss": 1.2783,
"step": 329
},
{
"epoch": 0.24,
"grad_norm": 0.5527246685898635,
"learning_rate": 8.849817414125973e-05,
"loss": 1.2705,
"step": 330
},
{
"epoch": 0.24,
"grad_norm": 0.5544016696123183,
"learning_rate": 8.84216685560159e-05,
"loss": 1.2856,
"step": 331
},
{
"epoch": 0.25,
"grad_norm": 0.5424146088216879,
"learning_rate": 8.834494267402263e-05,
"loss": 1.2202,
"step": 332
},
{
"epoch": 0.25,
"grad_norm": 0.5323806898987287,
"learning_rate": 8.826799693519996e-05,
"loss": 1.248,
"step": 333
},
{
"epoch": 0.25,
"grad_norm": 0.5595146324987165,
"learning_rate": 8.819083178072852e-05,
"loss": 1.1672,
"step": 334
},
{
"epoch": 0.25,
"grad_norm": 0.5854406580169095,
"learning_rate": 8.811344765304698e-05,
"loss": 1.2146,
"step": 335
},
{
"epoch": 0.25,
"grad_norm": 0.5697562446019094,
"learning_rate": 8.80358449958496e-05,
"loss": 1.2568,
"step": 336
},
{
"epoch": 0.25,
"grad_norm": 0.5538906977604374,
"learning_rate": 8.795802425408352e-05,
"loss": 1.2544,
"step": 337
},
{
"epoch": 0.25,
"grad_norm": 0.5211793067308176,
"learning_rate": 8.787998587394637e-05,
"loss": 1.2183,
"step": 338
},
{
"epoch": 0.25,
"grad_norm": 0.5732446722628473,
"learning_rate": 8.780173030288359e-05,
"loss": 1.3057,
"step": 339
},
{
"epoch": 0.25,
"grad_norm": 0.5352980539739127,
"learning_rate": 8.772325798958597e-05,
"loss": 1.2598,
"step": 340
},
{
"epoch": 0.25,
"grad_norm": 0.5234917926015726,
"learning_rate": 8.7644569383987e-05,
"loss": 1.1982,
"step": 341
},
{
"epoch": 0.25,
"grad_norm": 0.5844314852721842,
"learning_rate": 8.75656649372603e-05,
"loss": 1.2656,
"step": 342
},
{
"epoch": 0.25,
"grad_norm": 0.5646854448914282,
"learning_rate": 8.748654510181709e-05,
"loss": 1.21,
"step": 343
},
{
"epoch": 0.25,
"grad_norm": 0.5216723813831847,
"learning_rate": 8.740721033130352e-05,
"loss": 1.2329,
"step": 344
},
{
"epoch": 0.25,
"grad_norm": 0.5099027314874095,
"learning_rate": 8.732766108059813e-05,
"loss": 1.2236,
"step": 345
},
{
"epoch": 0.26,
"grad_norm": 0.5188769999186538,
"learning_rate": 8.72478978058092e-05,
"loss": 1.2905,
"step": 346
},
{
"epoch": 0.26,
"grad_norm": 0.5245157404984339,
"learning_rate": 8.716792096427217e-05,
"loss": 1.2339,
"step": 347
},
{
"epoch": 0.26,
"grad_norm": 0.5160205485678449,
"learning_rate": 8.708773101454697e-05,
"loss": 1.2524,
"step": 348
},
{
"epoch": 0.26,
"grad_norm": 0.510633107323387,
"learning_rate": 8.700732841641542e-05,
"loss": 1.2756,
"step": 349
},
{
"epoch": 0.26,
"grad_norm": 0.5097028901140956,
"learning_rate": 8.692671363087863e-05,
"loss": 1.2539,
"step": 350
},
{
"epoch": 0.26,
"grad_norm": 0.5506040438253419,
"learning_rate": 8.68458871201543e-05,
"loss": 1.1733,
"step": 351
},
{
"epoch": 0.26,
"grad_norm": 0.5339837805003954,
"learning_rate": 8.676484934767409e-05,
"loss": 1.1919,
"step": 352
},
{
"epoch": 0.26,
"grad_norm": 0.5243053855032012,
"learning_rate": 8.668360077808093e-05,
"loss": 1.2637,
"step": 353
},
{
"epoch": 0.26,
"grad_norm": 0.5475923045103417,
"learning_rate": 8.660214187722646e-05,
"loss": 1.2583,
"step": 354
},
{
"epoch": 0.26,
"grad_norm": 0.5139607250185231,
"learning_rate": 8.652047311216822e-05,
"loss": 1.2939,
"step": 355
},
{
"epoch": 0.26,
"grad_norm": 0.5310090229071474,
"learning_rate": 8.64385949511671e-05,
"loss": 1.2788,
"step": 356
},
{
"epoch": 0.26,
"grad_norm": 0.5531120494965365,
"learning_rate": 8.635650786368452e-05,
"loss": 1.25,
"step": 357
},
{
"epoch": 0.26,
"grad_norm": 0.5315969577054235,
"learning_rate": 8.627421232037989e-05,
"loss": 1.2357,
"step": 358
},
{
"epoch": 0.27,
"grad_norm": 0.5266216921573422,
"learning_rate": 8.619170879310779e-05,
"loss": 1.2729,
"step": 359
},
{
"epoch": 0.27,
"grad_norm": 0.5593055072800345,
"learning_rate": 8.61089977549153e-05,
"loss": 1.2529,
"step": 360
},
{
"epoch": 0.27,
"grad_norm": 0.5596710951308123,
"learning_rate": 8.602607968003935e-05,
"loss": 1.2725,
"step": 361
},
{
"epoch": 0.27,
"grad_norm": 0.5433552854623133,
"learning_rate": 8.59429550439039e-05,
"loss": 1.2446,
"step": 362
},
{
"epoch": 0.27,
"grad_norm": 0.5818949631250041,
"learning_rate": 8.585962432311727e-05,
"loss": 1.2998,
"step": 363
},
{
"epoch": 0.27,
"grad_norm": 0.514243535892493,
"learning_rate": 8.577608799546942e-05,
"loss": 1.23,
"step": 364
},
{
"epoch": 0.27,
"grad_norm": 0.5465838481685172,
"learning_rate": 8.569234653992916e-05,
"loss": 1.2532,
"step": 365
},
{
"epoch": 0.27,
"grad_norm": 0.519563471824199,
"learning_rate": 8.560840043664144e-05,
"loss": 1.2607,
"step": 366
},
{
"epoch": 0.27,
"grad_norm": 0.5334398982863738,
"learning_rate": 8.552425016692464e-05,
"loss": 1.2363,
"step": 367
},
{
"epoch": 0.27,
"grad_norm": 0.5530652812053678,
"learning_rate": 8.543989621326768e-05,
"loss": 1.2681,
"step": 368
},
{
"epoch": 0.27,
"grad_norm": 0.5502954863671434,
"learning_rate": 8.535533905932738e-05,
"loss": 1.1721,
"step": 369
},
{
"epoch": 0.27,
"grad_norm": 0.5180001078920966,
"learning_rate": 8.527057918992565e-05,
"loss": 1.2139,
"step": 370
},
{
"epoch": 0.27,
"grad_norm": 0.5333180911534254,
"learning_rate": 8.518561709104667e-05,
"loss": 1.2461,
"step": 371
},
{
"epoch": 0.27,
"grad_norm": 0.5479350107655593,
"learning_rate": 8.510045324983417e-05,
"loss": 1.2512,
"step": 372
},
{
"epoch": 0.28,
"grad_norm": 0.5246093324411485,
"learning_rate": 8.501508815458855e-05,
"loss": 1.1787,
"step": 373
},
{
"epoch": 0.28,
"grad_norm": 0.50033135264865,
"learning_rate": 8.492952229476421e-05,
"loss": 1.2271,
"step": 374
},
{
"epoch": 0.28,
"grad_norm": 0.5418162221365314,
"learning_rate": 8.484375616096658e-05,
"loss": 1.2383,
"step": 375
},
{
"epoch": 0.28,
"grad_norm": 0.516783670359288,
"learning_rate": 8.475779024494945e-05,
"loss": 1.2681,
"step": 376
},
{
"epoch": 0.28,
"grad_norm": 0.5298750460233759,
"learning_rate": 8.467162503961208e-05,
"loss": 1.2451,
"step": 377
},
{
"epoch": 0.28,
"grad_norm": 0.5149476400550106,
"learning_rate": 8.45852610389964e-05,
"loss": 1.23,
"step": 378
},
{
"epoch": 0.28,
"grad_norm": 0.5268563601419046,
"learning_rate": 8.449869873828411e-05,
"loss": 1.2129,
"step": 379
},
{
"epoch": 0.28,
"grad_norm": 0.5357435202461692,
"learning_rate": 8.441193863379396e-05,
"loss": 1.2881,
"step": 380
},
{
"epoch": 0.28,
"grad_norm": 0.5407114377511073,
"learning_rate": 8.432498122297878e-05,
"loss": 1.2559,
"step": 381
},
{
"epoch": 0.28,
"grad_norm": 0.5376253272809564,
"learning_rate": 8.423782700442277e-05,
"loss": 1.2346,
"step": 382
},
{
"epoch": 0.28,
"grad_norm": 0.5378153063595059,
"learning_rate": 8.415047647783847e-05,
"loss": 1.2031,
"step": 383
},
{
"epoch": 0.28,
"grad_norm": 0.514779002563088,
"learning_rate": 8.406293014406403e-05,
"loss": 1.2056,
"step": 384
},
{
"epoch": 0.28,
"grad_norm": 0.5659231392943161,
"learning_rate": 8.397518850506028e-05,
"loss": 1.2346,
"step": 385
},
{
"epoch": 0.29,
"grad_norm": 0.5483974446090379,
"learning_rate": 8.388725206390788e-05,
"loss": 1.2974,
"step": 386
},
{
"epoch": 0.29,
"grad_norm": 0.5297423113703096,
"learning_rate": 8.379912132480441e-05,
"loss": 1.2427,
"step": 387
},
{
"epoch": 0.29,
"grad_norm": 0.5339239833592698,
"learning_rate": 8.371079679306146e-05,
"loss": 1.2788,
"step": 388
},
{
"epoch": 0.29,
"grad_norm": 0.5346762752364651,
"learning_rate": 8.36222789751018e-05,
"loss": 1.2329,
"step": 389
},
{
"epoch": 0.29,
"grad_norm": 0.5267945253503268,
"learning_rate": 8.353356837845642e-05,
"loss": 1.3101,
"step": 390
},
{
"epoch": 0.29,
"grad_norm": 0.5227678407329124,
"learning_rate": 8.344466551176164e-05,
"loss": 1.2544,
"step": 391
},
{
"epoch": 0.29,
"grad_norm": 0.5351886972585579,
"learning_rate": 8.335557088475618e-05,
"loss": 1.2036,
"step": 392
},
{
"epoch": 0.29,
"grad_norm": 0.547855768363372,
"learning_rate": 8.326628500827826e-05,
"loss": 1.2256,
"step": 393
},
{
"epoch": 0.29,
"grad_norm": 0.5232912428703006,
"learning_rate": 8.31768083942627e-05,
"loss": 1.2524,
"step": 394
},
{
"epoch": 0.29,
"grad_norm": 0.5355407135538937,
"learning_rate": 8.308714155573785e-05,
"loss": 1.1904,
"step": 395
},
{
"epoch": 0.29,
"grad_norm": 0.5398818834520477,
"learning_rate": 8.29972850068228e-05,
"loss": 1.2544,
"step": 396
},
{
"epoch": 0.29,
"grad_norm": 0.5365767973671521,
"learning_rate": 8.290723926272439e-05,
"loss": 1.2378,
"step": 397
},
{
"epoch": 0.29,
"grad_norm": 0.5505960932890972,
"learning_rate": 8.281700483973421e-05,
"loss": 1.2471,
"step": 398
},
{
"epoch": 0.29,
"grad_norm": 0.5479428166637395,
"learning_rate": 8.272658225522569e-05,
"loss": 1.2607,
"step": 399
},
{
"epoch": 0.3,
"grad_norm": 0.5764125413085645,
"learning_rate": 8.263597202765109e-05,
"loss": 1.2888,
"step": 400
},
{
"epoch": 0.3,
"grad_norm": 0.5193462362673806,
"learning_rate": 8.254517467653858e-05,
"loss": 1.1882,
"step": 401
},
{
"epoch": 0.3,
"grad_norm": 0.5374168368793678,
"learning_rate": 8.245419072248919e-05,
"loss": 1.2358,
"step": 402
},
{
"epoch": 0.3,
"grad_norm": 0.5560345573494497,
"learning_rate": 8.236302068717392e-05,
"loss": 1.3,
"step": 403
},
{
"epoch": 0.3,
"grad_norm": 0.5223138605512301,
"learning_rate": 8.227166509333068e-05,
"loss": 1.2559,
"step": 404
},
{
"epoch": 0.3,
"grad_norm": 0.5009208364979428,
"learning_rate": 8.218012446476128e-05,
"loss": 1.2617,
"step": 405
},
{
"epoch": 0.3,
"grad_norm": 0.509867725986647,
"learning_rate": 8.208839932632849e-05,
"loss": 1.2715,
"step": 406
},
{
"epoch": 0.3,
"grad_norm": 0.5190782935920448,
"learning_rate": 8.199649020395298e-05,
"loss": 1.2183,
"step": 407
},
{
"epoch": 0.3,
"grad_norm": 0.551317848502644,
"learning_rate": 8.190439762461033e-05,
"loss": 1.2241,
"step": 408
},
{
"epoch": 0.3,
"grad_norm": 0.5299140869699253,
"learning_rate": 8.181212211632799e-05,
"loss": 1.1746,
"step": 409
},
{
"epoch": 0.3,
"grad_norm": 0.5161200175965883,
"learning_rate": 8.171966420818228e-05,
"loss": 1.2544,
"step": 410
},
{
"epoch": 0.3,
"grad_norm": 0.5368310977870265,
"learning_rate": 8.162702443029531e-05,
"loss": 1.2505,
"step": 411
},
{
"epoch": 0.3,
"grad_norm": 0.5392135585371384,
"learning_rate": 8.153420331383199e-05,
"loss": 1.2378,
"step": 412
},
{
"epoch": 0.31,
"grad_norm": 0.5652426070182841,
"learning_rate": 8.144120139099697e-05,
"loss": 1.2788,
"step": 413
},
{
"epoch": 0.31,
"grad_norm": 0.5264883521440279,
"learning_rate": 8.134801919503154e-05,
"loss": 1.2432,
"step": 414
},
{
"epoch": 0.31,
"grad_norm": 0.5391198787958846,
"learning_rate": 8.125465726021069e-05,
"loss": 1.2642,
"step": 415
},
{
"epoch": 0.31,
"grad_norm": 0.5447234901673647,
"learning_rate": 8.116111612183989e-05,
"loss": 1.2598,
"step": 416
},
{
"epoch": 0.31,
"grad_norm": 0.5239448356746366,
"learning_rate": 8.106739631625217e-05,
"loss": 1.2383,
"step": 417
},
{
"epoch": 0.31,
"grad_norm": 0.522466994953917,
"learning_rate": 8.09734983808049e-05,
"loss": 1.21,
"step": 418
},
{
"epoch": 0.31,
"grad_norm": 0.49320728726020635,
"learning_rate": 8.087942285387688e-05,
"loss": 1.1643,
"step": 419
},
{
"epoch": 0.31,
"grad_norm": 0.538615135680076,
"learning_rate": 8.07851702748651e-05,
"loss": 1.2485,
"step": 420
},
{
"epoch": 0.31,
"grad_norm": 0.5546864636999657,
"learning_rate": 8.06907411841817e-05,
"loss": 1.1887,
"step": 421
},
{
"epoch": 0.31,
"grad_norm": 0.5337150121699967,
"learning_rate": 8.05961361232509e-05,
"loss": 1.2378,
"step": 422
},
{
"epoch": 0.31,
"grad_norm": 0.5548120199862732,
"learning_rate": 8.050135563450587e-05,
"loss": 1.2129,
"step": 423
},
{
"epoch": 0.31,
"grad_norm": 0.5491477319207145,
"learning_rate": 8.040640026138562e-05,
"loss": 1.2615,
"step": 424
},
{
"epoch": 0.31,
"grad_norm": 0.5292609791678348,
"learning_rate": 8.03112705483319e-05,
"loss": 1.1963,
"step": 425
},
{
"epoch": 0.31,
"grad_norm": 0.5386073890465884,
"learning_rate": 8.021596704078605e-05,
"loss": 1.2822,
"step": 426
},
{
"epoch": 0.32,
"grad_norm": 0.5208877771953219,
"learning_rate": 8.012049028518589e-05,
"loss": 1.2468,
"step": 427
},
{
"epoch": 0.32,
"grad_norm": 0.5300893442105213,
"learning_rate": 8.002484082896257e-05,
"loss": 1.2141,
"step": 428
},
{
"epoch": 0.32,
"grad_norm": 0.5426660622332912,
"learning_rate": 7.992901922053752e-05,
"loss": 1.2083,
"step": 429
},
{
"epoch": 0.32,
"grad_norm": 0.5280778314237736,
"learning_rate": 7.983302600931911e-05,
"loss": 1.2556,
"step": 430
},
{
"epoch": 0.32,
"grad_norm": 0.5303015472910759,
"learning_rate": 7.973686174569972e-05,
"loss": 1.2246,
"step": 431
},
{
"epoch": 0.32,
"grad_norm": 0.5385117857553907,
"learning_rate": 7.964052698105247e-05,
"loss": 1.2544,
"step": 432
},
{
"epoch": 0.32,
"grad_norm": 0.5175160927509813,
"learning_rate": 7.954402226772804e-05,
"loss": 1.1724,
"step": 433
},
{
"epoch": 0.32,
"grad_norm": 0.5167307050244405,
"learning_rate": 7.944734815905154e-05,
"loss": 1.228,
"step": 434
},
{
"epoch": 0.32,
"grad_norm": 0.533666702216578,
"learning_rate": 7.93505052093194e-05,
"loss": 1.2349,
"step": 435
},
{
"epoch": 0.32,
"grad_norm": 0.5259498652131873,
"learning_rate": 7.925349397379604e-05,
"loss": 1.2415,
"step": 436
},
{
"epoch": 0.32,
"grad_norm": 0.5445977576017799,
"learning_rate": 7.915631500871083e-05,
"loss": 1.2065,
"step": 437
},
{
"epoch": 0.32,
"grad_norm": 0.5649990455410109,
"learning_rate": 7.905896887125482e-05,
"loss": 1.2417,
"step": 438
},
{
"epoch": 0.32,
"grad_norm": 0.5260513948557283,
"learning_rate": 7.896145611957759e-05,
"loss": 1.1918,
"step": 439
},
{
"epoch": 0.33,
"grad_norm": 0.5258410063287358,
"learning_rate": 7.8863777312784e-05,
"loss": 1.2124,
"step": 440
},
{
"epoch": 0.33,
"grad_norm": 0.5434644442116746,
"learning_rate": 7.876593301093104e-05,
"loss": 1.2349,
"step": 441
},
{
"epoch": 0.33,
"grad_norm": 0.5462561748612222,
"learning_rate": 7.866792377502457e-05,
"loss": 1.2373,
"step": 442
},
{
"epoch": 0.33,
"grad_norm": 0.5661256454549024,
"learning_rate": 7.856975016701615e-05,
"loss": 1.2334,
"step": 443
},
{
"epoch": 0.33,
"grad_norm": 0.5517524055311237,
"learning_rate": 7.847141274979977e-05,
"loss": 1.2549,
"step": 444
},
{
"epoch": 0.33,
"grad_norm": 0.5588533911643465,
"learning_rate": 7.837291208720866e-05,
"loss": 1.248,
"step": 445
},
{
"epoch": 0.33,
"grad_norm": 0.5432341108696274,
"learning_rate": 7.827424874401203e-05,
"loss": 1.207,
"step": 446
},
{
"epoch": 0.33,
"grad_norm": 0.5185655878803792,
"learning_rate": 7.81754232859119e-05,
"loss": 1.2087,
"step": 447
},
{
"epoch": 0.33,
"grad_norm": 0.546989000271988,
"learning_rate": 7.807643627953969e-05,
"loss": 1.2852,
"step": 448
},
{
"epoch": 0.33,
"grad_norm": 0.5609807732483688,
"learning_rate": 7.797728829245321e-05,
"loss": 1.23,
"step": 449
},
{
"epoch": 0.33,
"grad_norm": 0.5290536891546959,
"learning_rate": 7.787797989313317e-05,
"loss": 1.1687,
"step": 450
},
{
"epoch": 0.33,
"grad_norm": 0.527486366572943,
"learning_rate": 7.777851165098012e-05,
"loss": 1.2349,
"step": 451
},
{
"epoch": 0.33,
"grad_norm": 0.5444668761845415,
"learning_rate": 7.767888413631101e-05,
"loss": 1.248,
"step": 452
},
{
"epoch": 0.33,
"grad_norm": 0.5194113505588946,
"learning_rate": 7.757909792035608e-05,
"loss": 1.3081,
"step": 453
},
{
"epoch": 0.34,
"grad_norm": 0.5174613130879753,
"learning_rate": 7.747915357525545e-05,
"loss": 1.2046,
"step": 454
},
{
"epoch": 0.34,
"grad_norm": 0.5535670191712191,
"learning_rate": 7.737905167405595e-05,
"loss": 1.2136,
"step": 455
},
{
"epoch": 0.34,
"grad_norm": 0.546209627520353,
"learning_rate": 7.727879279070773e-05,
"loss": 1.2097,
"step": 456
},
{
"epoch": 0.34,
"grad_norm": 0.5221397456131871,
"learning_rate": 7.717837750006106e-05,
"loss": 1.2832,
"step": 457
},
{
"epoch": 0.34,
"grad_norm": 0.5380906003507856,
"learning_rate": 7.7077806377863e-05,
"loss": 1.1807,
"step": 458
},
{
"epoch": 0.34,
"grad_norm": 0.546159089637007,
"learning_rate": 7.697708000075403e-05,
"loss": 1.262,
"step": 459
},
{
"epoch": 0.34,
"grad_norm": 0.5378903447286532,
"learning_rate": 7.687619894626493e-05,
"loss": 1.2639,
"step": 460
},
{
"epoch": 0.34,
"grad_norm": 0.5183593724417229,
"learning_rate": 7.677516379281321e-05,
"loss": 1.2344,
"step": 461
},
{
"epoch": 0.34,
"grad_norm": 0.5110004203240966,
"learning_rate": 7.667397511970005e-05,
"loss": 1.2144,
"step": 462
},
{
"epoch": 0.34,
"grad_norm": 0.5237401648978784,
"learning_rate": 7.657263350710676e-05,
"loss": 1.1992,
"step": 463
},
{
"epoch": 0.34,
"grad_norm": 0.5458624581753624,
"learning_rate": 7.647113953609163e-05,
"loss": 1.252,
"step": 464
},
{
"epoch": 0.34,
"grad_norm": 0.55612272064723,
"learning_rate": 7.636949378858646e-05,
"loss": 1.188,
"step": 465
},
{
"epoch": 0.34,
"grad_norm": 0.5578526299155908,
"learning_rate": 7.626769684739337e-05,
"loss": 1.1951,
"step": 466
},
{
"epoch": 0.35,
"grad_norm": 0.5092511020982519,
"learning_rate": 7.616574929618125e-05,
"loss": 1.1543,
"step": 467
},
{
"epoch": 0.35,
"grad_norm": 0.5348616024567703,
"learning_rate": 7.606365171948267e-05,
"loss": 1.2368,
"step": 468
},
{
"epoch": 0.35,
"grad_norm": 0.532298079012496,
"learning_rate": 7.596140470269029e-05,
"loss": 1.2107,
"step": 469
},
{
"epoch": 0.35,
"grad_norm": 0.5514395726265122,
"learning_rate": 7.585900883205364e-05,
"loss": 1.241,
"step": 470
},
{
"epoch": 0.35,
"grad_norm": 0.5539874834294591,
"learning_rate": 7.575646469467575e-05,
"loss": 1.2249,
"step": 471
},
{
"epoch": 0.35,
"grad_norm": 0.5141238427544136,
"learning_rate": 7.565377287850977e-05,
"loss": 1.21,
"step": 472
},
{
"epoch": 0.35,
"grad_norm": 0.526119772429715,
"learning_rate": 7.555093397235552e-05,
"loss": 1.2141,
"step": 473
},
{
"epoch": 0.35,
"grad_norm": 0.5239544155150679,
"learning_rate": 7.544794856585626e-05,
"loss": 1.2446,
"step": 474
},
{
"epoch": 0.35,
"grad_norm": 0.5116743183638587,
"learning_rate": 7.53448172494952e-05,
"loss": 1.2251,
"step": 475
},
{
"epoch": 0.35,
"grad_norm": 0.5465278452905271,
"learning_rate": 7.524154061459215e-05,
"loss": 1.1744,
"step": 476
},
{
"epoch": 0.35,
"grad_norm": 0.5242898434746838,
"learning_rate": 7.51381192533001e-05,
"loss": 1.2305,
"step": 477
},
{
"epoch": 0.35,
"grad_norm": 0.5524906450650563,
"learning_rate": 7.503455375860192e-05,
"loss": 1.271,
"step": 478
},
{
"epoch": 0.35,
"grad_norm": 0.5422094091125237,
"learning_rate": 7.493084472430682e-05,
"loss": 1.2983,
"step": 479
},
{
"epoch": 0.35,
"grad_norm": 0.5100606069460412,
"learning_rate": 7.482699274504708e-05,
"loss": 1.1914,
"step": 480
},
{
"epoch": 0.36,
"grad_norm": 0.5258246755815246,
"learning_rate": 7.472299841627451e-05,
"loss": 1.1948,
"step": 481
},
{
"epoch": 0.36,
"grad_norm": 0.5183104456102203,
"learning_rate": 7.461886233425717e-05,
"loss": 1.1658,
"step": 482
},
{
"epoch": 0.36,
"grad_norm": 0.5283305385961874,
"learning_rate": 7.451458509607582e-05,
"loss": 1.2378,
"step": 483
},
{
"epoch": 0.36,
"grad_norm": 0.5552677702446687,
"learning_rate": 7.441016729962064e-05,
"loss": 1.1938,
"step": 484
},
{
"epoch": 0.36,
"grad_norm": 0.5198625616185957,
"learning_rate": 7.430560954358764e-05,
"loss": 1.2515,
"step": 485
},
{
"epoch": 0.36,
"grad_norm": 0.524907115545136,
"learning_rate": 7.420091242747536e-05,
"loss": 1.2437,
"step": 486
},
{
"epoch": 0.36,
"grad_norm": 0.520819742542826,
"learning_rate": 7.409607655158139e-05,
"loss": 1.2764,
"step": 487
},
{
"epoch": 0.36,
"grad_norm": 0.5297968503831433,
"learning_rate": 7.399110251699887e-05,
"loss": 1.2529,
"step": 488
},
{
"epoch": 0.36,
"grad_norm": 0.5214545833543685,
"learning_rate": 7.388599092561315e-05,
"loss": 1.2979,
"step": 489
},
{
"epoch": 0.36,
"grad_norm": 0.5158994351772959,
"learning_rate": 7.378074238009826e-05,
"loss": 1.2363,
"step": 490
},
{
"epoch": 0.36,
"grad_norm": 0.49265767229951024,
"learning_rate": 7.367535748391349e-05,
"loss": 1.228,
"step": 491
},
{
"epoch": 0.36,
"grad_norm": 0.5308141896787576,
"learning_rate": 7.35698368412999e-05,
"loss": 1.2527,
"step": 492
},
{
"epoch": 0.36,
"grad_norm": 0.5185543266636785,
"learning_rate": 7.346418105727686e-05,
"loss": 1.2192,
"step": 493
},
{
"epoch": 0.37,
"grad_norm": 0.5231300605729964,
"learning_rate": 7.335839073763865e-05,
"loss": 1.2065,
"step": 494
},
{
"epoch": 0.37,
"grad_norm": 0.5399567824066669,
"learning_rate": 7.325246648895088e-05,
"loss": 1.2563,
"step": 495
},
{
"epoch": 0.37,
"grad_norm": 0.5239942836551379,
"learning_rate": 7.31464089185471e-05,
"loss": 1.2549,
"step": 496
},
{
"epoch": 0.37,
"grad_norm": 0.5367247940798874,
"learning_rate": 7.304021863452524e-05,
"loss": 1.2061,
"step": 497
},
{
"epoch": 0.37,
"grad_norm": 0.5404506218621764,
"learning_rate": 7.293389624574422e-05,
"loss": 1.2142,
"step": 498
},
{
"epoch": 0.37,
"grad_norm": 0.5055969660442964,
"learning_rate": 7.282744236182034e-05,
"loss": 1.2451,
"step": 499
},
{
"epoch": 0.37,
"grad_norm": 0.5423433133756662,
"learning_rate": 7.27208575931239e-05,
"loss": 1.2012,
"step": 500
},
{
"epoch": 0.37,
"grad_norm": 0.5291351969461193,
"learning_rate": 7.26141425507756e-05,
"loss": 1.1768,
"step": 501
},
{
"epoch": 0.37,
"grad_norm": 0.5217703642849318,
"learning_rate": 7.250729784664316e-05,
"loss": 1.209,
"step": 502
},
{
"epoch": 0.37,
"grad_norm": 0.5201622197991884,
"learning_rate": 7.240032409333764e-05,
"loss": 1.2031,
"step": 503
},
{
"epoch": 0.37,
"grad_norm": 0.5281271991799672,
"learning_rate": 7.22932219042101e-05,
"loss": 1.1987,
"step": 504
},
{
"epoch": 0.37,
"grad_norm": 0.5573441678253518,
"learning_rate": 7.218599189334799e-05,
"loss": 1.2739,
"step": 505
},
{
"epoch": 0.37,
"grad_norm": 0.5665017191299871,
"learning_rate": 7.207863467557162e-05,
"loss": 1.2773,
"step": 506
},
{
"epoch": 0.37,
"grad_norm": 0.5325104774494102,
"learning_rate": 7.19711508664307e-05,
"loss": 1.209,
"step": 507
},
{
"epoch": 0.38,
"grad_norm": 0.518792873366363,
"learning_rate": 7.186354108220072e-05,
"loss": 1.2173,
"step": 508
},
{
"epoch": 0.38,
"grad_norm": 0.530762745727063,
"learning_rate": 7.175580593987951e-05,
"loss": 1.2466,
"step": 509
},
{
"epoch": 0.38,
"grad_norm": 0.5140061528285057,
"learning_rate": 7.164794605718366e-05,
"loss": 1.2139,
"step": 510
},
{
"epoch": 0.38,
"grad_norm": 0.5194168189274216,
"learning_rate": 7.153996205254495e-05,
"loss": 1.2476,
"step": 511
},
{
"epoch": 0.38,
"grad_norm": 0.5487088087238914,
"learning_rate": 7.143185454510686e-05,
"loss": 1.2251,
"step": 512
},
{
"epoch": 0.38,
"grad_norm": 0.49449833617368844,
"learning_rate": 7.1323624154721e-05,
"loss": 1.2021,
"step": 513
},
{
"epoch": 0.38,
"grad_norm": 0.5209680110441622,
"learning_rate": 7.121527150194349e-05,
"loss": 1.229,
"step": 514
},
{
"epoch": 0.38,
"grad_norm": 0.5179658980514732,
"learning_rate": 7.110679720803156e-05,
"loss": 1.2324,
"step": 515
},
{
"epoch": 0.38,
"grad_norm": 0.5237224991500224,
"learning_rate": 7.099820189493977e-05,
"loss": 1.269,
"step": 516
},
{
"epoch": 0.38,
"grad_norm": 0.5302189416292129,
"learning_rate": 7.088948618531667e-05,
"loss": 1.2041,
"step": 517
},
{
"epoch": 0.38,
"grad_norm": 0.5384341108312423,
"learning_rate": 7.078065070250106e-05,
"loss": 1.1746,
"step": 518
},
{
"epoch": 0.38,
"grad_norm": 0.5521437637462966,
"learning_rate": 7.067169607051851e-05,
"loss": 1.2886,
"step": 519
},
{
"epoch": 0.38,
"grad_norm": 0.5328288678743964,
"learning_rate": 7.056262291407772e-05,
"loss": 1.1877,
"step": 520
},
{
"epoch": 0.39,
"grad_norm": 0.5359494830051162,
"learning_rate": 7.045343185856701e-05,
"loss": 1.2202,
"step": 521
},
{
"epoch": 0.39,
"grad_norm": 0.5288532232218185,
"learning_rate": 7.034412353005063e-05,
"loss": 1.21,
"step": 522
},
{
"epoch": 0.39,
"grad_norm": 0.5512085122241619,
"learning_rate": 7.02346985552653e-05,
"loss": 1.2798,
"step": 523
},
{
"epoch": 0.39,
"grad_norm": 0.533944460040126,
"learning_rate": 7.01251575616165e-05,
"loss": 1.2539,
"step": 524
},
{
"epoch": 0.39,
"grad_norm": 0.5837632563221825,
"learning_rate": 7.0015501177175e-05,
"loss": 1.1335,
"step": 525
},
{
"epoch": 0.39,
"grad_norm": 0.5228757012149624,
"learning_rate": 6.990573003067304e-05,
"loss": 1.1689,
"step": 526
},
{
"epoch": 0.39,
"grad_norm": 0.5276157144534642,
"learning_rate": 6.979584475150103e-05,
"loss": 1.1667,
"step": 527
},
{
"epoch": 0.39,
"grad_norm": 0.5240007425873849,
"learning_rate": 6.968584596970364e-05,
"loss": 1.2256,
"step": 528
},
{
"epoch": 0.39,
"grad_norm": 0.5358932103718607,
"learning_rate": 6.957573431597646e-05,
"loss": 1.2534,
"step": 529
},
{
"epoch": 0.39,
"grad_norm": 0.5252949355084833,
"learning_rate": 6.946551042166209e-05,
"loss": 1.2139,
"step": 530
},
{
"epoch": 0.39,
"grad_norm": 0.5281892436988233,
"learning_rate": 6.935517491874683e-05,
"loss": 1.1924,
"step": 531
},
{
"epoch": 0.39,
"grad_norm": 0.5343708524890692,
"learning_rate": 6.92447284398568e-05,
"loss": 1.1919,
"step": 532
},
{
"epoch": 0.39,
"grad_norm": 0.5061436298543835,
"learning_rate": 6.91341716182545e-05,
"loss": 1.231,
"step": 533
},
{
"epoch": 0.39,
"grad_norm": 0.5526887595468994,
"learning_rate": 6.902350508783502e-05,
"loss": 1.21,
"step": 534
},
{
"epoch": 0.4,
"grad_norm": 0.5436193017022867,
"learning_rate": 6.89127294831225e-05,
"loss": 1.1917,
"step": 535
},
{
"epoch": 0.4,
"grad_norm": 0.5362337988101724,
"learning_rate": 6.880184543926655e-05,
"loss": 1.2441,
"step": 536
},
{
"epoch": 0.4,
"grad_norm": 0.5061527382765968,
"learning_rate": 6.869085359203844e-05,
"loss": 1.2168,
"step": 537
},
{
"epoch": 0.4,
"grad_norm": 0.5387899893879528,
"learning_rate": 6.85797545778276e-05,
"loss": 1.2363,
"step": 538
},
{
"epoch": 0.4,
"grad_norm": 0.5397275428078208,
"learning_rate": 6.84685490336379e-05,
"loss": 1.2261,
"step": 539
},
{
"epoch": 0.4,
"grad_norm": 0.5431245391709038,
"learning_rate": 6.835723759708401e-05,
"loss": 1.2319,
"step": 540
},
{
"epoch": 0.4,
"grad_norm": 0.5397663035045336,
"learning_rate": 6.824582090638777e-05,
"loss": 1.1814,
"step": 541
},
{
"epoch": 0.4,
"grad_norm": 0.528133649622763,
"learning_rate": 6.81342996003745e-05,
"loss": 1.1558,
"step": 542
},
{
"epoch": 0.4,
"grad_norm": 0.5440860624522684,
"learning_rate": 6.802267431846934e-05,
"loss": 1.1782,
"step": 543
},
{
"epoch": 0.4,
"grad_norm": 0.5501235346149123,
"learning_rate": 6.791094570069365e-05,
"loss": 1.228,
"step": 544
},
{
"epoch": 0.4,
"grad_norm": 0.5252345463299878,
"learning_rate": 6.779911438766116e-05,
"loss": 1.229,
"step": 545
},
{
"epoch": 0.4,
"grad_norm": 0.5190121721686506,
"learning_rate": 6.768718102057457e-05,
"loss": 1.2314,
"step": 546
},
{
"epoch": 0.4,
"grad_norm": 0.547000882661627,
"learning_rate": 6.757514624122158e-05,
"loss": 1.2126,
"step": 547
},
{
"epoch": 0.4,
"grad_norm": 0.5302137816217907,
"learning_rate": 6.746301069197148e-05,
"loss": 1.2268,
"step": 548
},
{
"epoch": 0.41,
"grad_norm": 0.5320509154114407,
"learning_rate": 6.735077501577126e-05,
"loss": 1.2153,
"step": 549
},
{
"epoch": 0.41,
"grad_norm": 0.5333187052856754,
"learning_rate": 6.723843985614201e-05,
"loss": 1.2029,
"step": 550
},
{
"epoch": 0.41,
"grad_norm": 0.5319396258291598,
"learning_rate": 6.712600585717525e-05,
"loss": 1.2437,
"step": 551
},
{
"epoch": 0.41,
"grad_norm": 0.5284174311932452,
"learning_rate": 6.701347366352922e-05,
"loss": 1.1968,
"step": 552
},
{
"epoch": 0.41,
"grad_norm": 0.543152971365775,
"learning_rate": 6.690084392042513e-05,
"loss": 1.249,
"step": 553
},
{
"epoch": 0.41,
"grad_norm": 0.5247392915658112,
"learning_rate": 6.678811727364355e-05,
"loss": 1.2651,
"step": 554
},
{
"epoch": 0.41,
"grad_norm": 0.5179262710835614,
"learning_rate": 6.667529436952063e-05,
"loss": 1.2461,
"step": 555
},
{
"epoch": 0.41,
"grad_norm": 0.5133921620807488,
"learning_rate": 6.656237585494448e-05,
"loss": 1.2083,
"step": 556
},
{
"epoch": 0.41,
"grad_norm": 0.5256137110789443,
"learning_rate": 6.644936237735128e-05,
"loss": 1.2119,
"step": 557
},
{
"epoch": 0.41,
"grad_norm": 0.5214795791380175,
"learning_rate": 6.633625458472187e-05,
"loss": 1.2197,
"step": 558
},
{
"epoch": 0.41,
"grad_norm": 0.5229372966716483,
"learning_rate": 6.622305312557773e-05,
"loss": 1.2197,
"step": 559
},
{
"epoch": 0.41,
"grad_norm": 0.5384219888080702,
"learning_rate": 6.610975864897746e-05,
"loss": 1.1738,
"step": 560
},
{
"epoch": 0.41,
"grad_norm": 0.5129183996620766,
"learning_rate": 6.599637180451294e-05,
"loss": 1.187,
"step": 561
},
{
"epoch": 0.42,
"grad_norm": 0.5418619404098455,
"learning_rate": 6.588289324230573e-05,
"loss": 1.2241,
"step": 562
},
{
"epoch": 0.42,
"grad_norm": 0.5366757957084506,
"learning_rate": 6.576932361300315e-05,
"loss": 1.2046,
"step": 563
},
{
"epoch": 0.42,
"grad_norm": 0.5373777152526157,
"learning_rate": 6.56556635677748e-05,
"loss": 1.2346,
"step": 564
},
{
"epoch": 0.42,
"grad_norm": 0.5280975504983476,
"learning_rate": 6.55419137583086e-05,
"loss": 1.2603,
"step": 565
},
{
"epoch": 0.42,
"grad_norm": 0.5511664229732176,
"learning_rate": 6.54280748368072e-05,
"loss": 1.2651,
"step": 566
},
{
"epoch": 0.42,
"grad_norm": 0.5411418066732246,
"learning_rate": 6.531414745598416e-05,
"loss": 1.1736,
"step": 567
},
{
"epoch": 0.42,
"grad_norm": 0.5511035451407554,
"learning_rate": 6.52001322690602e-05,
"loss": 1.1938,
"step": 568
},
{
"epoch": 0.42,
"grad_norm": 0.5392555071766858,
"learning_rate": 6.508602992975963e-05,
"loss": 1.2078,
"step": 569
},
{
"epoch": 0.42,
"grad_norm": 0.5415809502190669,
"learning_rate": 6.497184109230628e-05,
"loss": 1.2539,
"step": 570
},
{
"epoch": 0.42,
"grad_norm": 0.5261628211448517,
"learning_rate": 6.485756641142005e-05,
"loss": 1.1973,
"step": 571
},
{
"epoch": 0.42,
"grad_norm": 0.5122917176901964,
"learning_rate": 6.474320654231298e-05,
"loss": 1.1953,
"step": 572
},
{
"epoch": 0.42,
"grad_norm": 0.5172038523312286,
"learning_rate": 6.462876214068562e-05,
"loss": 1.186,
"step": 573
},
{
"epoch": 0.42,
"grad_norm": 0.5447216965187213,
"learning_rate": 6.451423386272312e-05,
"loss": 1.1936,
"step": 574
},
{
"epoch": 0.42,
"grad_norm": 0.5284834777209215,
"learning_rate": 6.43996223650916e-05,
"loss": 1.2319,
"step": 575
},
{
"epoch": 0.43,
"grad_norm": 0.5247856790151539,
"learning_rate": 6.42849283049343e-05,
"loss": 1.2529,
"step": 576
},
{
"epoch": 0.43,
"grad_norm": 0.5132885862155795,
"learning_rate": 6.417015233986786e-05,
"loss": 1.2468,
"step": 577
},
{
"epoch": 0.43,
"grad_norm": 0.5271464626093211,
"learning_rate": 6.405529512797857e-05,
"loss": 1.2354,
"step": 578
},
{
"epoch": 0.43,
"grad_norm": 0.5181230940813101,
"learning_rate": 6.394035732781847e-05,
"loss": 1.2002,
"step": 579
},
{
"epoch": 0.43,
"grad_norm": 0.46861475519511914,
"learning_rate": 6.382533959840177e-05,
"loss": 1.1367,
"step": 580
},
{
"epoch": 0.43,
"grad_norm": 0.5423503544753885,
"learning_rate": 6.371024259920091e-05,
"loss": 1.2109,
"step": 581
},
{
"epoch": 0.43,
"grad_norm": 0.545921477636058,
"learning_rate": 6.359506699014286e-05,
"loss": 1.2637,
"step": 582
},
{
"epoch": 0.43,
"grad_norm": 0.5005214281343369,
"learning_rate": 6.347981343160526e-05,
"loss": 1.2363,
"step": 583
},
{
"epoch": 0.43,
"grad_norm": 0.533087285382092,
"learning_rate": 6.336448258441275e-05,
"loss": 1.2388,
"step": 584
},
{
"epoch": 0.43,
"grad_norm": 0.5403536091742609,
"learning_rate": 6.32490751098331e-05,
"loss": 1.2114,
"step": 585
},
{
"epoch": 0.43,
"grad_norm": 0.5173286496306992,
"learning_rate": 6.31335916695734e-05,
"loss": 1.2051,
"step": 586
},
{
"epoch": 0.43,
"grad_norm": 0.5108214225535918,
"learning_rate": 6.301803292577635e-05,
"loss": 1.2168,
"step": 587
},
{
"epoch": 0.43,
"grad_norm": 0.5651518401766661,
"learning_rate": 6.290239954101638e-05,
"loss": 1.198,
"step": 588
},
{
"epoch": 0.44,
"grad_norm": 0.5099927600510906,
"learning_rate": 6.27866921782959e-05,
"loss": 1.2197,
"step": 589
},
{
"epoch": 0.44,
"grad_norm": 0.5123343726205575,
"learning_rate": 6.26709115010415e-05,
"loss": 1.2061,
"step": 590
},
{
"epoch": 0.44,
"grad_norm": 0.512591998951558,
"learning_rate": 6.255505817310009e-05,
"loss": 1.2388,
"step": 591
},
{
"epoch": 0.44,
"grad_norm": 0.5225197287073076,
"learning_rate": 6.243913285873517e-05,
"loss": 1.2329,
"step": 592
},
{
"epoch": 0.44,
"grad_norm": 0.5222125162050069,
"learning_rate": 6.232313622262296e-05,
"loss": 1.2441,
"step": 593
},
{
"epoch": 0.44,
"grad_norm": 0.5418033392398351,
"learning_rate": 6.220706892984865e-05,
"loss": 1.2026,
"step": 594
},
{
"epoch": 0.44,
"grad_norm": 0.5266898929761351,
"learning_rate": 6.209093164590252e-05,
"loss": 1.1775,
"step": 595
},
{
"epoch": 0.44,
"grad_norm": 0.5283784401712713,
"learning_rate": 6.197472503667616e-05,
"loss": 1.2583,
"step": 596
},
{
"epoch": 0.44,
"grad_norm": 0.5116587016310039,
"learning_rate": 6.185844976845866e-05,
"loss": 1.1968,
"step": 597
},
{
"epoch": 0.44,
"grad_norm": 0.5209965666242403,
"learning_rate": 6.174210650793276e-05,
"loss": 1.2109,
"step": 598
},
{
"epoch": 0.44,
"grad_norm": 0.5249837932309613,
"learning_rate": 6.162569592217105e-05,
"loss": 1.2256,
"step": 599
},
{
"epoch": 0.44,
"grad_norm": 0.5649846885821955,
"learning_rate": 6.150921867863215e-05,
"loss": 1.2126,
"step": 600
},
{
"epoch": 0.44,
"grad_norm": 0.5248287330210495,
"learning_rate": 6.139267544515689e-05,
"loss": 1.2061,
"step": 601
},
{
"epoch": 0.44,
"grad_norm": 0.5110179369381546,
"learning_rate": 6.127606688996441e-05,
"loss": 1.1577,
"step": 602
},
{
"epoch": 0.45,
"grad_norm": 0.528029303414983,
"learning_rate": 6.115939368164841e-05,
"loss": 1.1946,
"step": 603
},
{
"epoch": 0.45,
"grad_norm": 0.49902918198896545,
"learning_rate": 6.104265648917333e-05,
"loss": 1.1584,
"step": 604
},
{
"epoch": 0.45,
"grad_norm": 0.527430257732573,
"learning_rate": 6.09258559818704e-05,
"loss": 1.2051,
"step": 605
},
{
"epoch": 0.45,
"grad_norm": 0.502700467053209,
"learning_rate": 6.080899282943391e-05,
"loss": 1.1887,
"step": 606
},
{
"epoch": 0.45,
"grad_norm": 0.5346580850373724,
"learning_rate": 6.069206770191736e-05,
"loss": 1.1743,
"step": 607
},
{
"epoch": 0.45,
"grad_norm": 0.5522125179234447,
"learning_rate": 6.057508126972956e-05,
"loss": 1.2302,
"step": 608
},
{
"epoch": 0.45,
"grad_norm": 0.5304033863048495,
"learning_rate": 6.045803420363084e-05,
"loss": 1.2178,
"step": 609
},
{
"epoch": 0.45,
"grad_norm": 0.5281578945127866,
"learning_rate": 6.0340927174729166e-05,
"loss": 1.1973,
"step": 610
},
{
"epoch": 0.45,
"grad_norm": 0.5041828566959785,
"learning_rate": 6.022376085447632e-05,
"loss": 1.165,
"step": 611
},
{
"epoch": 0.45,
"grad_norm": 0.5288011581588896,
"learning_rate": 6.010653591466403e-05,
"loss": 1.1545,
"step": 612
},
{
"epoch": 0.45,
"grad_norm": 0.521653269091181,
"learning_rate": 5.998925302742017e-05,
"loss": 1.2549,
"step": 613
},
{
"epoch": 0.45,
"grad_norm": 0.5619947052616066,
"learning_rate": 5.987191286520479e-05,
"loss": 1.1687,
"step": 614
},
{
"epoch": 0.45,
"grad_norm": 0.5300667110541245,
"learning_rate": 5.9754516100806423e-05,
"loss": 1.2144,
"step": 615
},
{
"epoch": 0.46,
"grad_norm": 0.5347193779684026,
"learning_rate": 5.963706340733807e-05,
"loss": 1.1968,
"step": 616
},
{
"epoch": 0.46,
"grad_norm": 0.5375871113609256,
"learning_rate": 5.951955545823342e-05,
"loss": 1.2368,
"step": 617
},
{
"epoch": 0.46,
"grad_norm": 0.5446630041892937,
"learning_rate": 5.940199292724303e-05,
"loss": 1.2158,
"step": 618
},
{
"epoch": 0.46,
"grad_norm": 0.5257818739796485,
"learning_rate": 5.928437648843036e-05,
"loss": 1.187,
"step": 619
},
{
"epoch": 0.46,
"grad_norm": 0.5375817200542558,
"learning_rate": 5.9166706816167975e-05,
"loss": 1.1763,
"step": 620
},
{
"epoch": 0.46,
"grad_norm": 0.5402262250501304,
"learning_rate": 5.9048984585133646e-05,
"loss": 1.2432,
"step": 621
},
{
"epoch": 0.46,
"grad_norm": 0.530579494619579,
"learning_rate": 5.893121047030654e-05,
"loss": 1.2144,
"step": 622
},
{
"epoch": 0.46,
"grad_norm": 0.5076122557355204,
"learning_rate": 5.881338514696326e-05,
"loss": 1.2095,
"step": 623
},
{
"epoch": 0.46,
"grad_norm": 0.4899581867055723,
"learning_rate": 5.8695509290674066e-05,
"loss": 1.2222,
"step": 624
},
{
"epoch": 0.46,
"grad_norm": 0.5394084458543575,
"learning_rate": 5.8577583577298924e-05,
"loss": 1.2222,
"step": 625
},
{
"epoch": 0.46,
"grad_norm": 0.5259701967463519,
"learning_rate": 5.845960868298366e-05,
"loss": 1.187,
"step": 626
},
{
"epoch": 0.46,
"grad_norm": 0.5480519405802691,
"learning_rate": 5.834158528415611e-05,
"loss": 1.2129,
"step": 627
},
{
"epoch": 0.46,
"grad_norm": 0.536966588512764,
"learning_rate": 5.822351405752221e-05,
"loss": 1.2107,
"step": 628
},
{
"epoch": 0.46,
"grad_norm": 0.5274033257539729,
"learning_rate": 5.810539568006213e-05,
"loss": 1.1978,
"step": 629
},
{
"epoch": 0.47,
"grad_norm": 0.5203874948597922,
"learning_rate": 5.798723082902636e-05,
"loss": 1.208,
"step": 630
},
{
"epoch": 0.47,
"grad_norm": 0.5237759093105727,
"learning_rate": 5.786902018193189e-05,
"loss": 1.1877,
"step": 631
},
{
"epoch": 0.47,
"grad_norm": 0.5286040401812486,
"learning_rate": 5.7750764416558265e-05,
"loss": 1.1719,
"step": 632
},
{
"epoch": 0.47,
"grad_norm": 0.557845934995181,
"learning_rate": 5.7632464210943726e-05,
"loss": 1.1843,
"step": 633
},
{
"epoch": 0.47,
"grad_norm": 0.4929172146341576,
"learning_rate": 5.7514120243381345e-05,
"loss": 1.1611,
"step": 634
},
{
"epoch": 0.47,
"grad_norm": 0.5019009243225188,
"learning_rate": 5.739573319241505e-05,
"loss": 1.1682,
"step": 635
},
{
"epoch": 0.47,
"grad_norm": 0.5169183497566494,
"learning_rate": 5.727730373683586e-05,
"loss": 1.1885,
"step": 636
},
{
"epoch": 0.47,
"grad_norm": 0.5437611527948173,
"learning_rate": 5.71588325556779e-05,
"loss": 1.2017,
"step": 637
},
{
"epoch": 0.47,
"grad_norm": 0.5190195068877017,
"learning_rate": 5.704032032821454e-05,
"loss": 1.2185,
"step": 638
},
{
"epoch": 0.47,
"grad_norm": 0.5410389986252376,
"learning_rate": 5.692176773395446e-05,
"loss": 1.1852,
"step": 639
},
{
"epoch": 0.47,
"grad_norm": 0.5249369187472087,
"learning_rate": 5.6803175452637856e-05,
"loss": 1.2124,
"step": 640
},
{
"epoch": 0.47,
"grad_norm": 0.5364568247518677,
"learning_rate": 5.668454416423242e-05,
"loss": 1.1746,
"step": 641
},
{
"epoch": 0.47,
"grad_norm": 0.5234053514776408,
"learning_rate": 5.656587454892954e-05,
"loss": 1.1624,
"step": 642
},
{
"epoch": 0.48,
"grad_norm": 0.5177351393040158,
"learning_rate": 5.64471672871403e-05,
"loss": 1.175,
"step": 643
},
{
"epoch": 0.48,
"grad_norm": 0.5132442395510229,
"learning_rate": 5.632842305949171e-05,
"loss": 1.1545,
"step": 644
},
{
"epoch": 0.48,
"grad_norm": 0.5755509016701162,
"learning_rate": 5.620964254682266e-05,
"loss": 1.1897,
"step": 645
},
{
"epoch": 0.48,
"grad_norm": 0.5735541920509374,
"learning_rate": 5.6090826430180136e-05,
"loss": 1.2273,
"step": 646
},
{
"epoch": 0.48,
"grad_norm": 0.5559949685579456,
"learning_rate": 5.597197539081523e-05,
"loss": 1.2139,
"step": 647
},
{
"epoch": 0.48,
"grad_norm": 0.5233474966056798,
"learning_rate": 5.585309011017931e-05,
"loss": 1.209,
"step": 648
},
{
"epoch": 0.48,
"grad_norm": 0.5331263666529451,
"learning_rate": 5.573417126992003e-05,
"loss": 1.2026,
"step": 649
},
{
"epoch": 0.48,
"grad_norm": 0.5284076713653038,
"learning_rate": 5.5615219551877474e-05,
"loss": 1.1929,
"step": 650
},
{
"epoch": 0.48,
"grad_norm": 0.5225197521082877,
"learning_rate": 5.5496235638080254e-05,
"loss": 1.2612,
"step": 651
},
{
"epoch": 0.48,
"grad_norm": 0.5448728387339711,
"learning_rate": 5.5377220210741564e-05,
"loss": 1.2524,
"step": 652
},
{
"epoch": 0.48,
"grad_norm": 0.544623429616519,
"learning_rate": 5.52581739522553e-05,
"loss": 1.1814,
"step": 653
},
{
"epoch": 0.48,
"grad_norm": 0.5084514498279094,
"learning_rate": 5.5139097545192106e-05,
"loss": 1.1697,
"step": 654
},
{
"epoch": 0.48,
"grad_norm": 0.5355004661553598,
"learning_rate": 5.501999167229553e-05,
"loss": 1.2114,
"step": 655
},
{
"epoch": 0.48,
"grad_norm": 0.5276675034062545,
"learning_rate": 5.490085701647805e-05,
"loss": 1.1797,
"step": 656
},
{
"epoch": 0.49,
"grad_norm": 0.5332008126336106,
"learning_rate": 5.478169426081712e-05,
"loss": 1.2554,
"step": 657
},
{
"epoch": 0.49,
"grad_norm": 0.5023203699566182,
"learning_rate": 5.466250408855141e-05,
"loss": 1.1709,
"step": 658
},
{
"epoch": 0.49,
"grad_norm": 0.5331713252598932,
"learning_rate": 5.4543287183076706e-05,
"loss": 1.1819,
"step": 659
},
{
"epoch": 0.49,
"grad_norm": 0.5388207882005664,
"learning_rate": 5.4424044227942116e-05,
"loss": 1.1807,
"step": 660
},
{
"epoch": 0.49,
"grad_norm": 0.5245074391898623,
"learning_rate": 5.43047759068461e-05,
"loss": 1.2075,
"step": 661
},
{
"epoch": 0.49,
"grad_norm": 0.5034157178900629,
"learning_rate": 5.418548290363253e-05,
"loss": 1.1531,
"step": 662
},
{
"epoch": 0.49,
"grad_norm": 0.5291965390843301,
"learning_rate": 5.4066165902286836e-05,
"loss": 1.1948,
"step": 663
},
{
"epoch": 0.49,
"grad_norm": 0.5380603983560069,
"learning_rate": 5.394682558693204e-05,
"loss": 1.1597,
"step": 664
},
{
"epoch": 0.49,
"grad_norm": 0.514738579503193,
"learning_rate": 5.38274626418248e-05,
"loss": 1.1665,
"step": 665
},
{
"epoch": 0.49,
"grad_norm": 0.5249920622684557,
"learning_rate": 5.370807775135155e-05,
"loss": 1.1714,
"step": 666
},
{
"epoch": 0.49,
"grad_norm": 0.5349647230499153,
"learning_rate": 5.3588671600024585e-05,
"loss": 1.2334,
"step": 667
},
{
"epoch": 0.49,
"grad_norm": 0.5105922595928688,
"learning_rate": 5.346924487247804e-05,
"loss": 1.1584,
"step": 668
},
{
"epoch": 0.49,
"grad_norm": 0.5234259945759657,
"learning_rate": 5.334979825346409e-05,
"loss": 1.2466,
"step": 669
},
{
"epoch": 0.5,
"grad_norm": 0.5243549208324105,
"learning_rate": 5.3230332427848896e-05,
"loss": 1.1267,
"step": 670
},
{
"epoch": 0.5,
"grad_norm": 0.4932073653558184,
"learning_rate": 5.3110848080608796e-05,
"loss": 1.2285,
"step": 671
},
{
"epoch": 0.5,
"grad_norm": 0.5459222364102359,
"learning_rate": 5.2991345896826286e-05,
"loss": 1.2004,
"step": 672
},
{
"epoch": 0.5,
"grad_norm": 0.5003475271863167,
"learning_rate": 5.287182656168618e-05,
"loss": 1.2095,
"step": 673
},
{
"epoch": 0.5,
"grad_norm": 0.5335295764756873,
"learning_rate": 5.275229076047156e-05,
"loss": 1.2378,
"step": 674
},
{
"epoch": 0.5,
"grad_norm": 0.540414767903529,
"learning_rate": 5.2632739178559995e-05,
"loss": 1.2729,
"step": 675
},
{
"epoch": 0.5,
"grad_norm": 0.5248378699380909,
"learning_rate": 5.2513172501419484e-05,
"loss": 1.2241,
"step": 676
},
{
"epoch": 0.5,
"grad_norm": 0.5156441002749715,
"learning_rate": 5.2393591414604604e-05,
"loss": 1.1694,
"step": 677
},
{
"epoch": 0.5,
"grad_norm": 0.5390705120322237,
"learning_rate": 5.2273996603752525e-05,
"loss": 1.1514,
"step": 678
},
{
"epoch": 0.5,
"grad_norm": 0.5166127902709079,
"learning_rate": 5.215438875457914e-05,
"loss": 1.2319,
"step": 679
},
{
"epoch": 0.5,
"grad_norm": 0.5410034611024472,
"learning_rate": 5.2034768552875065e-05,
"loss": 1.1907,
"step": 680
},
{
"epoch": 0.5,
"grad_norm": 0.5123713990889422,
"learning_rate": 5.191513668450178e-05,
"loss": 1.1875,
"step": 681
},
{
"epoch": 0.5,
"grad_norm": 0.5654147275689173,
"learning_rate": 5.1795493835387596e-05,
"loss": 1.2041,
"step": 682
},
{
"epoch": 0.5,
"grad_norm": 0.5513961102479871,
"learning_rate": 5.167584069152388e-05,
"loss": 1.1975,
"step": 683
},
{
"epoch": 0.51,
"grad_norm": 0.518731962211893,
"learning_rate": 5.1556177938960915e-05,
"loss": 1.1626,
"step": 684
},
{
"epoch": 0.51,
"grad_norm": 0.506940133998132,
"learning_rate": 5.143650626380416e-05,
"loss": 1.1787,
"step": 685
},
{
"epoch": 0.51,
"grad_norm": 0.5039047094560937,
"learning_rate": 5.131682635221019e-05,
"loss": 1.186,
"step": 686
},
{
"epoch": 0.51,
"grad_norm": 0.529708919316274,
"learning_rate": 5.1197138890382835e-05,
"loss": 1.1699,
"step": 687
},
{
"epoch": 0.51,
"grad_norm": 0.5035014630527274,
"learning_rate": 5.107744456456919e-05,
"loss": 1.1772,
"step": 688
},
{
"epoch": 0.51,
"grad_norm": 0.5211130941939317,
"learning_rate": 5.095774406105571e-05,
"loss": 1.1968,
"step": 689
},
{
"epoch": 0.51,
"grad_norm": 0.530245386156086,
"learning_rate": 5.0838038066164285e-05,
"loss": 1.2026,
"step": 690
},
{
"epoch": 0.51,
"grad_norm": 0.5180702095559839,
"learning_rate": 5.071832726624828e-05,
"loss": 1.2012,
"step": 691
},
{
"epoch": 0.51,
"grad_norm": 0.534958707515583,
"learning_rate": 5.05986123476886e-05,
"loss": 1.2432,
"step": 692
},
{
"epoch": 0.51,
"grad_norm": 0.5368819061361736,
"learning_rate": 5.0478893996889796e-05,
"loss": 1.2236,
"step": 693
},
{
"epoch": 0.51,
"grad_norm": 0.5057485472728565,
"learning_rate": 5.0359172900276063e-05,
"loss": 1.207,
"step": 694
},
{
"epoch": 0.51,
"grad_norm": 0.5202027391543886,
"learning_rate": 5.023944974428738e-05,
"loss": 1.2354,
"step": 695
},
{
"epoch": 0.51,
"grad_norm": 0.544371058943266,
"learning_rate": 5.011972521537547e-05,
"loss": 1.1855,
"step": 696
},
{
"epoch": 0.52,
"grad_norm": 0.5303936746115522,
"learning_rate": 5e-05,
"loss": 1.2092,
"step": 697
},
{
"epoch": 0.52,
"grad_norm": 0.495598830706189,
"learning_rate": 4.988027478462454e-05,
"loss": 1.1836,
"step": 698
},
{
"epoch": 0.52,
"grad_norm": 0.5437430838938269,
"learning_rate": 4.976055025571264e-05,
"loss": 1.2188,
"step": 699
},
{
"epoch": 0.52,
"grad_norm": 0.5252042006485972,
"learning_rate": 4.9640827099723935e-05,
"loss": 1.2124,
"step": 700
},
{
"epoch": 0.52,
"grad_norm": 0.5398226187875242,
"learning_rate": 4.9521106003110216e-05,
"loss": 1.2209,
"step": 701
},
{
"epoch": 0.52,
"grad_norm": 0.5079952168227562,
"learning_rate": 4.940138765231141e-05,
"loss": 1.1743,
"step": 702
},
{
"epoch": 0.52,
"grad_norm": 0.5158907255425175,
"learning_rate": 4.9281672733751746e-05,
"loss": 1.2085,
"step": 703
},
{
"epoch": 0.52,
"grad_norm": 0.5013889963926388,
"learning_rate": 4.916196193383572e-05,
"loss": 1.1658,
"step": 704
},
{
"epoch": 0.52,
"grad_norm": 0.5171868116327017,
"learning_rate": 4.9042255938944296e-05,
"loss": 1.2349,
"step": 705
},
{
"epoch": 0.52,
"grad_norm": 0.5474012589077731,
"learning_rate": 4.892255543543083e-05,
"loss": 1.1753,
"step": 706
},
{
"epoch": 0.52,
"grad_norm": 0.5045254893774415,
"learning_rate": 4.880286110961718e-05,
"loss": 1.1792,
"step": 707
},
{
"epoch": 0.52,
"grad_norm": 0.5241296772261623,
"learning_rate": 4.8683173647789806e-05,
"loss": 1.2039,
"step": 708
},
{
"epoch": 0.52,
"grad_norm": 0.5377092905019106,
"learning_rate": 4.856349373619585e-05,
"loss": 1.2764,
"step": 709
},
{
"epoch": 0.52,
"grad_norm": 0.5040778956509001,
"learning_rate": 4.8443822061039104e-05,
"loss": 1.2085,
"step": 710
},
{
"epoch": 0.53,
"grad_norm": 0.5431014555604886,
"learning_rate": 4.832415930847615e-05,
"loss": 1.2173,
"step": 711
},
{
"epoch": 0.53,
"grad_norm": 0.5469411284628397,
"learning_rate": 4.8204506164612395e-05,
"loss": 1.21,
"step": 712
},
{
"epoch": 0.53,
"grad_norm": 0.5208640142266422,
"learning_rate": 4.8084863315498234e-05,
"loss": 1.188,
"step": 713
},
{
"epoch": 0.53,
"grad_norm": 0.5780919783724002,
"learning_rate": 4.796523144712494e-05,
"loss": 1.1865,
"step": 714
},
{
"epoch": 0.53,
"grad_norm": 0.5295824611977981,
"learning_rate": 4.7845611245420876e-05,
"loss": 1.1875,
"step": 715
},
{
"epoch": 0.53,
"grad_norm": 0.5173865374490104,
"learning_rate": 4.772600339624748e-05,
"loss": 1.2109,
"step": 716
},
{
"epoch": 0.53,
"grad_norm": 0.5187771855719726,
"learning_rate": 4.760640858539541e-05,
"loss": 1.2383,
"step": 717
},
{
"epoch": 0.53,
"grad_norm": 0.5393951085471508,
"learning_rate": 4.748682749858053e-05,
"loss": 1.1885,
"step": 718
},
{
"epoch": 0.53,
"grad_norm": 0.537367418139691,
"learning_rate": 4.736726082144002e-05,
"loss": 1.2236,
"step": 719
},
{
"epoch": 0.53,
"grad_norm": 0.5185124710093608,
"learning_rate": 4.724770923952844e-05,
"loss": 1.1631,
"step": 720
},
{
"epoch": 0.53,
"grad_norm": 0.5157988812615701,
"learning_rate": 4.712817343831384e-05,
"loss": 1.1992,
"step": 721
},
{
"epoch": 0.53,
"grad_norm": 0.5272520854702486,
"learning_rate": 4.7008654103173726e-05,
"loss": 1.1929,
"step": 722
},
{
"epoch": 0.53,
"grad_norm": 0.5204627551027494,
"learning_rate": 4.688915191939123e-05,
"loss": 1.1821,
"step": 723
},
{
"epoch": 0.54,
"grad_norm": 0.49476093725588927,
"learning_rate": 4.676966757215112e-05,
"loss": 1.1553,
"step": 724
},
{
"epoch": 0.54,
"grad_norm": 0.5030550735915269,
"learning_rate": 4.665020174653592e-05,
"loss": 1.2148,
"step": 725
},
{
"epoch": 0.54,
"grad_norm": 0.5478488157879362,
"learning_rate": 4.6530755127521964e-05,
"loss": 1.1348,
"step": 726
},
{
"epoch": 0.54,
"grad_norm": 0.5405275109821088,
"learning_rate": 4.641132839997543e-05,
"loss": 1.2026,
"step": 727
},
{
"epoch": 0.54,
"grad_norm": 0.5084690696751637,
"learning_rate": 4.6291922248648456e-05,
"loss": 1.1958,
"step": 728
},
{
"epoch": 0.54,
"grad_norm": 0.5317489022142494,
"learning_rate": 4.6172537358175214e-05,
"loss": 1.1831,
"step": 729
},
{
"epoch": 0.54,
"grad_norm": 0.5323411101783979,
"learning_rate": 4.605317441306798e-05,
"loss": 1.2026,
"step": 730
},
{
"epoch": 0.54,
"grad_norm": 0.5459396348813691,
"learning_rate": 4.5933834097713176e-05,
"loss": 1.1663,
"step": 731
},
{
"epoch": 0.54,
"grad_norm": 0.5135095266993602,
"learning_rate": 4.5814517096367473e-05,
"loss": 1.1655,
"step": 732
},
{
"epoch": 0.54,
"grad_norm": 0.5568849517359646,
"learning_rate": 4.569522409315392e-05,
"loss": 1.1525,
"step": 733
},
{
"epoch": 0.54,
"grad_norm": 0.5138355980368158,
"learning_rate": 4.557595577205789e-05,
"loss": 1.1843,
"step": 734
},
{
"epoch": 0.54,
"grad_norm": 0.5393996532433413,
"learning_rate": 4.5456712816923305e-05,
"loss": 1.0713,
"step": 735
},
{
"epoch": 0.54,
"grad_norm": 0.5193084343570704,
"learning_rate": 4.53374959114486e-05,
"loss": 1.1804,
"step": 736
},
{
"epoch": 0.54,
"grad_norm": 0.5397982411978588,
"learning_rate": 4.521830573918289e-05,
"loss": 1.1926,
"step": 737
},
{
"epoch": 0.55,
"grad_norm": 0.5150645040364817,
"learning_rate": 4.509914298352197e-05,
"loss": 1.2241,
"step": 738
},
{
"epoch": 0.55,
"grad_norm": 0.5402009273066287,
"learning_rate": 4.498000832770448e-05,
"loss": 1.1858,
"step": 739
},
{
"epoch": 0.55,
"grad_norm": 0.5288014513975406,
"learning_rate": 4.4860902454807905e-05,
"loss": 1.2068,
"step": 740
},
{
"epoch": 0.55,
"grad_norm": 0.5313475996762929,
"learning_rate": 4.474182604774471e-05,
"loss": 1.2124,
"step": 741
},
{
"epoch": 0.55,
"grad_norm": 0.5065472165255681,
"learning_rate": 4.462277978925845e-05,
"loss": 1.1577,
"step": 742
},
{
"epoch": 0.55,
"grad_norm": 0.5032574551100435,
"learning_rate": 4.450376436191975e-05,
"loss": 1.1982,
"step": 743
},
{
"epoch": 0.55,
"grad_norm": 0.5040707372615817,
"learning_rate": 4.4384780448122545e-05,
"loss": 1.1943,
"step": 744
},
{
"epoch": 0.55,
"grad_norm": 0.5331187646017562,
"learning_rate": 4.4265828730079987e-05,
"loss": 1.1526,
"step": 745
},
{
"epoch": 0.55,
"grad_norm": 0.5131407652546381,
"learning_rate": 4.41469098898207e-05,
"loss": 1.2163,
"step": 746
},
{
"epoch": 0.55,
"grad_norm": 0.5157964582279311,
"learning_rate": 4.402802460918478e-05,
"loss": 1.1904,
"step": 747
},
{
"epoch": 0.55,
"grad_norm": 0.5208452898515039,
"learning_rate": 4.3909173569819876e-05,
"loss": 1.1956,
"step": 748
},
{
"epoch": 0.55,
"grad_norm": 0.5232461323122678,
"learning_rate": 4.379035745317735e-05,
"loss": 1.1724,
"step": 749
},
{
"epoch": 0.55,
"grad_norm": 0.542552124469573,
"learning_rate": 4.36715769405083e-05,
"loss": 1.1228,
"step": 750
},
{
"epoch": 0.55,
"grad_norm": 0.5346283177958724,
"learning_rate": 4.355283271285971e-05,
"loss": 1.186,
"step": 751
},
{
"epoch": 0.56,
"grad_norm": 0.5236313560893754,
"learning_rate": 4.3434125451070476e-05,
"loss": 1.188,
"step": 752
},
{
"epoch": 0.56,
"grad_norm": 0.527272024389816,
"learning_rate": 4.331545583576758e-05,
"loss": 1.2153,
"step": 753
},
{
"epoch": 0.56,
"grad_norm": 0.5178793305131995,
"learning_rate": 4.319682454736215e-05,
"loss": 1.1328,
"step": 754
},
{
"epoch": 0.56,
"grad_norm": 0.524324489977986,
"learning_rate": 4.3078232266045545e-05,
"loss": 1.2371,
"step": 755
},
{
"epoch": 0.56,
"grad_norm": 0.5499694260527771,
"learning_rate": 4.295967967178549e-05,
"loss": 1.1917,
"step": 756
},
{
"epoch": 0.56,
"grad_norm": 0.5213679611913336,
"learning_rate": 4.28411674443221e-05,
"loss": 1.2158,
"step": 757
},
{
"epoch": 0.56,
"grad_norm": 0.5393429683806125,
"learning_rate": 4.2722696263164144e-05,
"loss": 1.1995,
"step": 758
},
{
"epoch": 0.56,
"grad_norm": 0.5118410909810224,
"learning_rate": 4.2604266807584964e-05,
"loss": 1.2349,
"step": 759
},
{
"epoch": 0.56,
"grad_norm": 0.524011083731795,
"learning_rate": 4.248587975661869e-05,
"loss": 1.2646,
"step": 760
},
{
"epoch": 0.56,
"grad_norm": 0.5173452587219315,
"learning_rate": 4.236753578905627e-05,
"loss": 1.208,
"step": 761
},
{
"epoch": 0.56,
"grad_norm": 0.5167330609755211,
"learning_rate": 4.224923558344175e-05,
"loss": 1.1797,
"step": 762
},
{
"epoch": 0.56,
"grad_norm": 0.550248520354795,
"learning_rate": 4.213097981806813e-05,
"loss": 1.1846,
"step": 763
},
{
"epoch": 0.56,
"grad_norm": 0.5460378705748443,
"learning_rate": 4.201276917097366e-05,
"loss": 1.2178,
"step": 764
},
{
"epoch": 0.57,
"grad_norm": 0.5267339874426903,
"learning_rate": 4.189460431993788e-05,
"loss": 1.1768,
"step": 765
},
{
"epoch": 0.57,
"grad_norm": 0.5437720753946003,
"learning_rate": 4.177648594247779e-05,
"loss": 1.2441,
"step": 766
},
{
"epoch": 0.57,
"grad_norm": 0.5252191540697293,
"learning_rate": 4.16584147158439e-05,
"loss": 1.1792,
"step": 767
},
{
"epoch": 0.57,
"grad_norm": 0.5337200734106656,
"learning_rate": 4.154039131701636e-05,
"loss": 1.2075,
"step": 768
},
{
"epoch": 0.57,
"grad_norm": 0.5365313218518762,
"learning_rate": 4.142241642270108e-05,
"loss": 1.2107,
"step": 769
},
{
"epoch": 0.57,
"grad_norm": 0.5184771196996488,
"learning_rate": 4.130449070932594e-05,
"loss": 1.2153,
"step": 770
},
{
"epoch": 0.57,
"grad_norm": 0.5112516194042346,
"learning_rate": 4.1186614853036745e-05,
"loss": 1.1616,
"step": 771
},
{
"epoch": 0.57,
"grad_norm": 0.5302215023509379,
"learning_rate": 4.106878952969348e-05,
"loss": 1.1472,
"step": 772
},
{
"epoch": 0.57,
"grad_norm": 0.5280678011239818,
"learning_rate": 4.095101541486636e-05,
"loss": 1.1479,
"step": 773
},
{
"epoch": 0.57,
"grad_norm": 0.5152968875861379,
"learning_rate": 4.083329318383204e-05,
"loss": 1.208,
"step": 774
},
{
"epoch": 0.57,
"grad_norm": 0.508664511327192,
"learning_rate": 4.071562351156966e-05,
"loss": 1.1924,
"step": 775
},
{
"epoch": 0.57,
"grad_norm": 0.5194622063261768,
"learning_rate": 4.0598007072756985e-05,
"loss": 1.2046,
"step": 776
},
{
"epoch": 0.57,
"grad_norm": 0.49313616744642114,
"learning_rate": 4.0480444541766576e-05,
"loss": 1.1738,
"step": 777
},
{
"epoch": 0.57,
"grad_norm": 0.540199963132762,
"learning_rate": 4.036293659266195e-05,
"loss": 1.1504,
"step": 778
},
{
"epoch": 0.58,
"grad_norm": 0.5381187717110268,
"learning_rate": 4.0245483899193595e-05,
"loss": 1.2437,
"step": 779
},
{
"epoch": 0.58,
"grad_norm": 0.5458507343059239,
"learning_rate": 4.012808713479522e-05,
"loss": 1.1672,
"step": 780
},
{
"epoch": 0.58,
"grad_norm": 0.5384963973167503,
"learning_rate": 4.001074697257986e-05,
"loss": 1.145,
"step": 781
},
{
"epoch": 0.58,
"grad_norm": 0.5017592357988295,
"learning_rate": 3.989346408533597e-05,
"loss": 1.2007,
"step": 782
},
{
"epoch": 0.58,
"grad_norm": 0.539898755162606,
"learning_rate": 3.977623914552369e-05,
"loss": 1.2273,
"step": 783
},
{
"epoch": 0.58,
"grad_norm": 0.5164118794312718,
"learning_rate": 3.9659072825270846e-05,
"loss": 1.1465,
"step": 784
},
{
"epoch": 0.58,
"grad_norm": 0.5310682930134909,
"learning_rate": 3.954196579636918e-05,
"loss": 1.1819,
"step": 785
},
{
"epoch": 0.58,
"grad_norm": 0.5319333488209632,
"learning_rate": 3.942491873027043e-05,
"loss": 1.2075,
"step": 786
},
{
"epoch": 0.58,
"grad_norm": 0.5188554776078524,
"learning_rate": 3.930793229808264e-05,
"loss": 1.1719,
"step": 787
},
{
"epoch": 0.58,
"grad_norm": 0.5367955841699225,
"learning_rate": 3.91910071705661e-05,
"loss": 1.1899,
"step": 788
},
{
"epoch": 0.58,
"grad_norm": 0.5252956080861287,
"learning_rate": 3.907414401812963e-05,
"loss": 1.1462,
"step": 789
},
{
"epoch": 0.58,
"grad_norm": 0.5311293624616692,
"learning_rate": 3.895734351082668e-05,
"loss": 1.2456,
"step": 790
},
{
"epoch": 0.58,
"grad_norm": 0.5325842888765375,
"learning_rate": 3.884060631835159e-05,
"loss": 1.189,
"step": 791
},
{
"epoch": 0.59,
"grad_norm": 0.5284769775552955,
"learning_rate": 3.872393311003561e-05,
"loss": 1.228,
"step": 792
},
{
"epoch": 0.59,
"grad_norm": 0.5343101478131803,
"learning_rate": 3.8607324554843136e-05,
"loss": 1.1914,
"step": 793
},
{
"epoch": 0.59,
"grad_norm": 0.5080727323917363,
"learning_rate": 3.8490781321367846e-05,
"loss": 1.1733,
"step": 794
},
{
"epoch": 0.59,
"grad_norm": 0.5370043159248648,
"learning_rate": 3.837430407782896e-05,
"loss": 1.1039,
"step": 795
},
{
"epoch": 0.59,
"grad_norm": 0.5331791903830669,
"learning_rate": 3.825789349206726e-05,
"loss": 1.2041,
"step": 796
},
{
"epoch": 0.59,
"grad_norm": 0.5071964500400644,
"learning_rate": 3.814155023154136e-05,
"loss": 1.1702,
"step": 797
},
{
"epoch": 0.59,
"grad_norm": 0.5392895066482576,
"learning_rate": 3.802527496332384e-05,
"loss": 1.1963,
"step": 798
},
{
"epoch": 0.59,
"grad_norm": 0.5464813906886878,
"learning_rate": 3.790906835409749e-05,
"loss": 1.2029,
"step": 799
},
{
"epoch": 0.59,
"grad_norm": 0.5249376656220847,
"learning_rate": 3.7792931070151364e-05,
"loss": 1.248,
"step": 800
},
{
"epoch": 0.59,
"grad_norm": 0.5378846518914249,
"learning_rate": 3.7676863777377054e-05,
"loss": 1.2261,
"step": 801
},
{
"epoch": 0.59,
"grad_norm": 0.5385685739172994,
"learning_rate": 3.756086714126483e-05,
"loss": 1.2095,
"step": 802
},
{
"epoch": 0.59,
"grad_norm": 0.5252465967407262,
"learning_rate": 3.744494182689992e-05,
"loss": 1.1636,
"step": 803
},
{
"epoch": 0.59,
"grad_norm": 0.5012134647828765,
"learning_rate": 3.732908849895852e-05,
"loss": 1.1692,
"step": 804
},
{
"epoch": 0.59,
"grad_norm": 0.5143929015268887,
"learning_rate": 3.721330782170411e-05,
"loss": 1.1702,
"step": 805
},
{
"epoch": 0.6,
"grad_norm": 0.5203850321521901,
"learning_rate": 3.7097600458983636e-05,
"loss": 1.1982,
"step": 806
},
{
"epoch": 0.6,
"grad_norm": 0.5166201582687144,
"learning_rate": 3.698196707422366e-05,
"loss": 1.1953,
"step": 807
},
{
"epoch": 0.6,
"grad_norm": 0.5524102155079722,
"learning_rate": 3.6866408330426616e-05,
"loss": 1.2336,
"step": 808
},
{
"epoch": 0.6,
"grad_norm": 0.5267182791604813,
"learning_rate": 3.675092489016693e-05,
"loss": 1.1724,
"step": 809
},
{
"epoch": 0.6,
"grad_norm": 0.5369656792035445,
"learning_rate": 3.663551741558726e-05,
"loss": 1.1228,
"step": 810
},
{
"epoch": 0.6,
"grad_norm": 0.5310024406548113,
"learning_rate": 3.652018656839474e-05,
"loss": 1.1482,
"step": 811
},
{
"epoch": 0.6,
"grad_norm": 0.521245938688013,
"learning_rate": 3.640493300985716e-05,
"loss": 1.1917,
"step": 812
},
{
"epoch": 0.6,
"grad_norm": 0.527913613792371,
"learning_rate": 3.62897574007991e-05,
"loss": 1.2241,
"step": 813
},
{
"epoch": 0.6,
"grad_norm": 0.5535932262594306,
"learning_rate": 3.6174660401598224e-05,
"loss": 1.2168,
"step": 814
},
{
"epoch": 0.6,
"grad_norm": 0.5351897402729301,
"learning_rate": 3.605964267218154e-05,
"loss": 1.2161,
"step": 815
},
{
"epoch": 0.6,
"grad_norm": 0.5259902908188178,
"learning_rate": 3.594470487202145e-05,
"loss": 1.1648,
"step": 816
},
{
"epoch": 0.6,
"grad_norm": 0.5136724289192951,
"learning_rate": 3.582984766013215e-05,
"loss": 1.166,
"step": 817
},
{
"epoch": 0.6,
"grad_norm": 0.5649961128233411,
"learning_rate": 3.571507169506571e-05,
"loss": 1.1541,
"step": 818
},
{
"epoch": 0.61,
"grad_norm": 0.5366938931373112,
"learning_rate": 3.5600377634908415e-05,
"loss": 1.1584,
"step": 819
},
{
"epoch": 0.61,
"grad_norm": 0.5373815741572574,
"learning_rate": 3.5485766137276894e-05,
"loss": 1.1323,
"step": 820
},
{
"epoch": 0.61,
"grad_norm": 0.5193905378142675,
"learning_rate": 3.537123785931439e-05,
"loss": 1.2085,
"step": 821
},
{
"epoch": 0.61,
"grad_norm": 0.5055475172545549,
"learning_rate": 3.525679345768703e-05,
"loss": 1.166,
"step": 822
},
{
"epoch": 0.61,
"grad_norm": 0.5386208555803574,
"learning_rate": 3.514243358857997e-05,
"loss": 1.135,
"step": 823
},
{
"epoch": 0.61,
"grad_norm": 0.5345574878558741,
"learning_rate": 3.502815890769374e-05,
"loss": 1.2314,
"step": 824
},
{
"epoch": 0.61,
"grad_norm": 0.5549276090530098,
"learning_rate": 3.4913970070240386e-05,
"loss": 1.1853,
"step": 825
},
{
"epoch": 0.61,
"grad_norm": 0.5367912093076593,
"learning_rate": 3.479986773093979e-05,
"loss": 1.2192,
"step": 826
},
{
"epoch": 0.61,
"grad_norm": 0.5314792285668634,
"learning_rate": 3.468585254401586e-05,
"loss": 1.199,
"step": 827
},
{
"epoch": 0.61,
"grad_norm": 0.5163677737549888,
"learning_rate": 3.457192516319281e-05,
"loss": 1.2036,
"step": 828
},
{
"epoch": 0.61,
"grad_norm": 0.572875872188477,
"learning_rate": 3.4458086241691415e-05,
"loss": 1.1809,
"step": 829
},
{
"epoch": 0.61,
"grad_norm": 0.5224596702587158,
"learning_rate": 3.4344336432225207e-05,
"loss": 1.144,
"step": 830
},
{
"epoch": 0.61,
"grad_norm": 0.5249153625845075,
"learning_rate": 3.423067638699684e-05,
"loss": 1.1772,
"step": 831
},
{
"epoch": 0.61,
"grad_norm": 0.5134500335674911,
"learning_rate": 3.4117106757694284e-05,
"loss": 1.1597,
"step": 832
},
{
"epoch": 0.62,
"grad_norm": 0.518006942913236,
"learning_rate": 3.4003628195487057e-05,
"loss": 1.2224,
"step": 833
},
{
"epoch": 0.62,
"grad_norm": 0.5077417974903964,
"learning_rate": 3.3890241351022544e-05,
"loss": 1.2124,
"step": 834
},
{
"epoch": 0.62,
"grad_norm": 0.5411241634452162,
"learning_rate": 3.3776946874422263e-05,
"loss": 1.2119,
"step": 835
},
{
"epoch": 0.62,
"grad_norm": 0.5553674706637377,
"learning_rate": 3.3663745415278134e-05,
"loss": 1.1492,
"step": 836
},
{
"epoch": 0.62,
"grad_norm": 0.5291067133636735,
"learning_rate": 3.355063762264873e-05,
"loss": 1.2158,
"step": 837
},
{
"epoch": 0.62,
"grad_norm": 0.5027390017500571,
"learning_rate": 3.3437624145055557e-05,
"loss": 1.1638,
"step": 838
},
{
"epoch": 0.62,
"grad_norm": 0.5156132801251352,
"learning_rate": 3.3324705630479355e-05,
"loss": 1.2163,
"step": 839
},
{
"epoch": 0.62,
"grad_norm": 0.5174605453587071,
"learning_rate": 3.3211882726356445e-05,
"loss": 1.2043,
"step": 840
},
{
"epoch": 0.62,
"grad_norm": 0.5251979976431941,
"learning_rate": 3.309915607957487e-05,
"loss": 1.1218,
"step": 841
},
{
"epoch": 0.62,
"grad_norm": 0.5498596263242338,
"learning_rate": 3.298652633647079e-05,
"loss": 1.1301,
"step": 842
},
{
"epoch": 0.62,
"grad_norm": 0.5298767559857231,
"learning_rate": 3.287399414282474e-05,
"loss": 1.2332,
"step": 843
},
{
"epoch": 0.62,
"grad_norm": 0.5540135983930895,
"learning_rate": 3.2761560143857994e-05,
"loss": 1.2324,
"step": 844
},
{
"epoch": 0.62,
"grad_norm": 0.5138557457478535,
"learning_rate": 3.2649224984228756e-05,
"loss": 1.1919,
"step": 845
},
{
"epoch": 0.63,
"grad_norm": 0.4994811286863287,
"learning_rate": 3.253698930802853e-05,
"loss": 1.1689,
"step": 846
},
{
"epoch": 0.63,
"grad_norm": 0.5313959573566741,
"learning_rate": 3.242485375877841e-05,
"loss": 1.1299,
"step": 847
},
{
"epoch": 0.63,
"grad_norm": 0.4994270221658335,
"learning_rate": 3.231281897942544e-05,
"loss": 1.1433,
"step": 848
},
{
"epoch": 0.63,
"grad_norm": 0.5588116507391775,
"learning_rate": 3.2200885612338845e-05,
"loss": 1.2288,
"step": 849
},
{
"epoch": 0.63,
"grad_norm": 0.5364716488949063,
"learning_rate": 3.2089054299306375e-05,
"loss": 1.207,
"step": 850
}
],
"logging_steps": 1.0,
"max_steps": 1353,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"total_flos": 7.13056619210788e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}