Aura-NeMo-12B / trainer_state.json
jeiku's picture
Upload 10 files
a720218 verified
raw
history blame
148 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9947460595446582,
"eval_steps": 500,
"global_step": 855,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0035026269702276708,
"grad_norm": 52.48405456542969,
"learning_rate": 4.651162790697675e-07,
"loss": 4.5474,
"step": 1
},
{
"epoch": 0.0070052539404553416,
"grad_norm": 15.457415580749512,
"learning_rate": 9.30232558139535e-07,
"loss": 3.8101,
"step": 2
},
{
"epoch": 0.010507880910683012,
"grad_norm": 38.854068756103516,
"learning_rate": 1.3953488372093025e-06,
"loss": 4.5719,
"step": 3
},
{
"epoch": 0.014010507880910683,
"grad_norm": 32.52230453491211,
"learning_rate": 1.86046511627907e-06,
"loss": 4.2939,
"step": 4
},
{
"epoch": 0.017513134851138354,
"grad_norm": 62.512908935546875,
"learning_rate": 2.3255813953488376e-06,
"loss": 4.9511,
"step": 5
},
{
"epoch": 0.021015761821366025,
"grad_norm": 14.109648704528809,
"learning_rate": 2.790697674418605e-06,
"loss": 3.9829,
"step": 6
},
{
"epoch": 0.024518388791593695,
"grad_norm": 39.390140533447266,
"learning_rate": 3.2558139534883724e-06,
"loss": 3.8983,
"step": 7
},
{
"epoch": 0.028021015761821366,
"grad_norm": 47.82828903198242,
"learning_rate": 3.72093023255814e-06,
"loss": 4.5586,
"step": 8
},
{
"epoch": 0.03152364273204904,
"grad_norm": 27.250408172607422,
"learning_rate": 4.186046511627907e-06,
"loss": 3.2116,
"step": 9
},
{
"epoch": 0.03502626970227671,
"grad_norm": 15.391385078430176,
"learning_rate": 4.651162790697675e-06,
"loss": 3.864,
"step": 10
},
{
"epoch": 0.03852889667250438,
"grad_norm": 15.911999702453613,
"learning_rate": 5.116279069767442e-06,
"loss": 3.4548,
"step": 11
},
{
"epoch": 0.04203152364273205,
"grad_norm": 11.446131706237793,
"learning_rate": 5.58139534883721e-06,
"loss": 3.3845,
"step": 12
},
{
"epoch": 0.04553415061295972,
"grad_norm": 21.21734619140625,
"learning_rate": 6.046511627906977e-06,
"loss": 3.9722,
"step": 13
},
{
"epoch": 0.04903677758318739,
"grad_norm": 19.188844680786133,
"learning_rate": 6.511627906976745e-06,
"loss": 4.4815,
"step": 14
},
{
"epoch": 0.05253940455341506,
"grad_norm": 15.476765632629395,
"learning_rate": 6.976744186046513e-06,
"loss": 3.9449,
"step": 15
},
{
"epoch": 0.05604203152364273,
"grad_norm": 8.746248245239258,
"learning_rate": 7.44186046511628e-06,
"loss": 3.0432,
"step": 16
},
{
"epoch": 0.0595446584938704,
"grad_norm": 14.563529014587402,
"learning_rate": 7.906976744186048e-06,
"loss": 3.4222,
"step": 17
},
{
"epoch": 0.06304728546409807,
"grad_norm": 9.088067054748535,
"learning_rate": 8.372093023255815e-06,
"loss": 3.2242,
"step": 18
},
{
"epoch": 0.06654991243432574,
"grad_norm": 18.267414093017578,
"learning_rate": 8.837209302325582e-06,
"loss": 3.4081,
"step": 19
},
{
"epoch": 0.07005253940455342,
"grad_norm": 19.622844696044922,
"learning_rate": 9.30232558139535e-06,
"loss": 3.1812,
"step": 20
},
{
"epoch": 0.07355516637478109,
"grad_norm": 14.69324016571045,
"learning_rate": 9.767441860465117e-06,
"loss": 2.7359,
"step": 21
},
{
"epoch": 0.07705779334500876,
"grad_norm": 16.522323608398438,
"learning_rate": 1.0232558139534884e-05,
"loss": 3.3083,
"step": 22
},
{
"epoch": 0.08056042031523643,
"grad_norm": 27.648794174194336,
"learning_rate": 1.0697674418604651e-05,
"loss": 3.7333,
"step": 23
},
{
"epoch": 0.0840630472854641,
"grad_norm": 18.54423713684082,
"learning_rate": 1.116279069767442e-05,
"loss": 3.013,
"step": 24
},
{
"epoch": 0.08756567425569177,
"grad_norm": 26.133230209350586,
"learning_rate": 1.1627906976744187e-05,
"loss": 3.5883,
"step": 25
},
{
"epoch": 0.09106830122591944,
"grad_norm": 11.354973793029785,
"learning_rate": 1.2093023255813954e-05,
"loss": 3.2263,
"step": 26
},
{
"epoch": 0.09457092819614711,
"grad_norm": 28.07408332824707,
"learning_rate": 1.2558139534883723e-05,
"loss": 3.4547,
"step": 27
},
{
"epoch": 0.09807355516637478,
"grad_norm": 34.68488693237305,
"learning_rate": 1.302325581395349e-05,
"loss": 4.8204,
"step": 28
},
{
"epoch": 0.10157618213660245,
"grad_norm": 11.963549613952637,
"learning_rate": 1.3488372093023257e-05,
"loss": 3.1184,
"step": 29
},
{
"epoch": 0.10507880910683012,
"grad_norm": 16.531707763671875,
"learning_rate": 1.3953488372093025e-05,
"loss": 3.4912,
"step": 30
},
{
"epoch": 0.1085814360770578,
"grad_norm": 20.2814998626709,
"learning_rate": 1.441860465116279e-05,
"loss": 3.6366,
"step": 31
},
{
"epoch": 0.11208406304728546,
"grad_norm": 19.060791015625,
"learning_rate": 1.488372093023256e-05,
"loss": 3.1751,
"step": 32
},
{
"epoch": 0.11558669001751314,
"grad_norm": 11.113019943237305,
"learning_rate": 1.5348837209302328e-05,
"loss": 3.1345,
"step": 33
},
{
"epoch": 0.1190893169877408,
"grad_norm": 17.613746643066406,
"learning_rate": 1.5813953488372095e-05,
"loss": 3.4793,
"step": 34
},
{
"epoch": 0.12259194395796848,
"grad_norm": 10.551315307617188,
"learning_rate": 1.6279069767441862e-05,
"loss": 3.0938,
"step": 35
},
{
"epoch": 0.12609457092819615,
"grad_norm": 30.171554565429688,
"learning_rate": 1.674418604651163e-05,
"loss": 4.7277,
"step": 36
},
{
"epoch": 0.1295971978984238,
"grad_norm": 12.733338356018066,
"learning_rate": 1.7209302325581396e-05,
"loss": 3.4881,
"step": 37
},
{
"epoch": 0.1330998248686515,
"grad_norm": 18.498449325561523,
"learning_rate": 1.7674418604651163e-05,
"loss": 4.3251,
"step": 38
},
{
"epoch": 0.13660245183887915,
"grad_norm": 18.08729362487793,
"learning_rate": 1.813953488372093e-05,
"loss": 3.8999,
"step": 39
},
{
"epoch": 0.14010507880910683,
"grad_norm": 12.728877067565918,
"learning_rate": 1.86046511627907e-05,
"loss": 3.3771,
"step": 40
},
{
"epoch": 0.1436077057793345,
"grad_norm": 19.182889938354492,
"learning_rate": 1.9069767441860468e-05,
"loss": 4.3478,
"step": 41
},
{
"epoch": 0.14711033274956217,
"grad_norm": 14.268775939941406,
"learning_rate": 1.9534883720930235e-05,
"loss": 3.8552,
"step": 42
},
{
"epoch": 0.15061295971978983,
"grad_norm": 32.71632766723633,
"learning_rate": 2e-05,
"loss": 2.8895,
"step": 43
},
{
"epoch": 0.15411558669001751,
"grad_norm": 15.663018226623535,
"learning_rate": 2.046511627906977e-05,
"loss": 3.9284,
"step": 44
},
{
"epoch": 0.15761821366024517,
"grad_norm": 12.49007797241211,
"learning_rate": 2.093023255813954e-05,
"loss": 3.397,
"step": 45
},
{
"epoch": 0.16112084063047286,
"grad_norm": 18.972797393798828,
"learning_rate": 2.1395348837209303e-05,
"loss": 3.4676,
"step": 46
},
{
"epoch": 0.1646234676007005,
"grad_norm": 17.138303756713867,
"learning_rate": 2.186046511627907e-05,
"loss": 3.5788,
"step": 47
},
{
"epoch": 0.1681260945709282,
"grad_norm": 13.487051010131836,
"learning_rate": 2.232558139534884e-05,
"loss": 3.618,
"step": 48
},
{
"epoch": 0.17162872154115585,
"grad_norm": 17.096302032470703,
"learning_rate": 2.2790697674418607e-05,
"loss": 3.3029,
"step": 49
},
{
"epoch": 0.17513134851138354,
"grad_norm": 9.788946151733398,
"learning_rate": 2.3255813953488374e-05,
"loss": 2.7551,
"step": 50
},
{
"epoch": 0.1786339754816112,
"grad_norm": 20.02460479736328,
"learning_rate": 2.3720930232558144e-05,
"loss": 2.5158,
"step": 51
},
{
"epoch": 0.18213660245183888,
"grad_norm": 13.219809532165527,
"learning_rate": 2.4186046511627908e-05,
"loss": 2.8995,
"step": 52
},
{
"epoch": 0.18563922942206654,
"grad_norm": 14.657979011535645,
"learning_rate": 2.4651162790697675e-05,
"loss": 3.384,
"step": 53
},
{
"epoch": 0.18914185639229422,
"grad_norm": 20.963659286499023,
"learning_rate": 2.5116279069767445e-05,
"loss": 3.3458,
"step": 54
},
{
"epoch": 0.19264448336252188,
"grad_norm": 12.124914169311523,
"learning_rate": 2.5581395348837212e-05,
"loss": 3.0666,
"step": 55
},
{
"epoch": 0.19614711033274956,
"grad_norm": 11.544058799743652,
"learning_rate": 2.604651162790698e-05,
"loss": 2.8553,
"step": 56
},
{
"epoch": 0.19964973730297722,
"grad_norm": 16.375518798828125,
"learning_rate": 2.6511627906976743e-05,
"loss": 3.2603,
"step": 57
},
{
"epoch": 0.2031523642732049,
"grad_norm": 14.338220596313477,
"learning_rate": 2.6976744186046514e-05,
"loss": 3.6208,
"step": 58
},
{
"epoch": 0.20665499124343256,
"grad_norm": 10.745012283325195,
"learning_rate": 2.744186046511628e-05,
"loss": 3.0888,
"step": 59
},
{
"epoch": 0.21015761821366025,
"grad_norm": 11.644838333129883,
"learning_rate": 2.790697674418605e-05,
"loss": 2.7899,
"step": 60
},
{
"epoch": 0.2136602451838879,
"grad_norm": 10.003059387207031,
"learning_rate": 2.8372093023255818e-05,
"loss": 3.314,
"step": 61
},
{
"epoch": 0.2171628721541156,
"grad_norm": 24.950681686401367,
"learning_rate": 2.883720930232558e-05,
"loss": 3.5718,
"step": 62
},
{
"epoch": 0.22066549912434325,
"grad_norm": 13.138785362243652,
"learning_rate": 2.930232558139535e-05,
"loss": 2.705,
"step": 63
},
{
"epoch": 0.22416812609457093,
"grad_norm": 10.878084182739258,
"learning_rate": 2.976744186046512e-05,
"loss": 3.1842,
"step": 64
},
{
"epoch": 0.2276707530647986,
"grad_norm": 16.99833106994629,
"learning_rate": 3.0232558139534886e-05,
"loss": 3.5814,
"step": 65
},
{
"epoch": 0.23117338003502627,
"grad_norm": 15.462761878967285,
"learning_rate": 3.0697674418604656e-05,
"loss": 3.1773,
"step": 66
},
{
"epoch": 0.23467600700525393,
"grad_norm": 13.846227645874023,
"learning_rate": 3.116279069767442e-05,
"loss": 2.386,
"step": 67
},
{
"epoch": 0.2381786339754816,
"grad_norm": 19.097707748413086,
"learning_rate": 3.162790697674419e-05,
"loss": 2.8701,
"step": 68
},
{
"epoch": 0.24168126094570927,
"grad_norm": 13.607734680175781,
"learning_rate": 3.2093023255813954e-05,
"loss": 3.9503,
"step": 69
},
{
"epoch": 0.24518388791593695,
"grad_norm": 11.81222152709961,
"learning_rate": 3.2558139534883724e-05,
"loss": 2.8775,
"step": 70
},
{
"epoch": 0.2486865148861646,
"grad_norm": 18.59412956237793,
"learning_rate": 3.3023255813953495e-05,
"loss": 3.9074,
"step": 71
},
{
"epoch": 0.2521891418563923,
"grad_norm": 25.35107421875,
"learning_rate": 3.348837209302326e-05,
"loss": 3.8891,
"step": 72
},
{
"epoch": 0.25569176882662,
"grad_norm": 16.84481430053711,
"learning_rate": 3.395348837209302e-05,
"loss": 3.8725,
"step": 73
},
{
"epoch": 0.2591943957968476,
"grad_norm": 17.052255630493164,
"learning_rate": 3.441860465116279e-05,
"loss": 3.0218,
"step": 74
},
{
"epoch": 0.2626970227670753,
"grad_norm": 36.11460876464844,
"learning_rate": 3.488372093023256e-05,
"loss": 4.1979,
"step": 75
},
{
"epoch": 0.266199649737303,
"grad_norm": 10.964736938476562,
"learning_rate": 3.5348837209302326e-05,
"loss": 3.1011,
"step": 76
},
{
"epoch": 0.26970227670753066,
"grad_norm": 13.844853401184082,
"learning_rate": 3.58139534883721e-05,
"loss": 3.6975,
"step": 77
},
{
"epoch": 0.2732049036777583,
"grad_norm": 12.19601821899414,
"learning_rate": 3.627906976744186e-05,
"loss": 3.1434,
"step": 78
},
{
"epoch": 0.276707530647986,
"grad_norm": 12.09930419921875,
"learning_rate": 3.674418604651163e-05,
"loss": 3.3209,
"step": 79
},
{
"epoch": 0.28021015761821366,
"grad_norm": 16.452251434326172,
"learning_rate": 3.72093023255814e-05,
"loss": 2.9085,
"step": 80
},
{
"epoch": 0.28371278458844135,
"grad_norm": 10.51562213897705,
"learning_rate": 3.7674418604651165e-05,
"loss": 3.3766,
"step": 81
},
{
"epoch": 0.287215411558669,
"grad_norm": 10.312739372253418,
"learning_rate": 3.8139534883720935e-05,
"loss": 3.2133,
"step": 82
},
{
"epoch": 0.29071803852889666,
"grad_norm": 15.232990264892578,
"learning_rate": 3.8604651162790706e-05,
"loss": 4.067,
"step": 83
},
{
"epoch": 0.29422066549912435,
"grad_norm": 13.83324146270752,
"learning_rate": 3.906976744186047e-05,
"loss": 3.2642,
"step": 84
},
{
"epoch": 0.29772329246935203,
"grad_norm": 14.54284954071045,
"learning_rate": 3.953488372093023e-05,
"loss": 3.2644,
"step": 85
},
{
"epoch": 0.30122591943957966,
"grad_norm": 18.364408493041992,
"learning_rate": 4e-05,
"loss": 3.0344,
"step": 86
},
{
"epoch": 0.30472854640980734,
"grad_norm": 11.513346672058105,
"learning_rate": 3.99998331037949e-05,
"loss": 2.5797,
"step": 87
},
{
"epoch": 0.30823117338003503,
"grad_norm": 19.537710189819336,
"learning_rate": 3.9999332417965036e-05,
"loss": 3.5762,
"step": 88
},
{
"epoch": 0.3117338003502627,
"grad_norm": 16.66529655456543,
"learning_rate": 3.999849795086666e-05,
"loss": 3.181,
"step": 89
},
{
"epoch": 0.31523642732049034,
"grad_norm": 8.610281944274902,
"learning_rate": 3.999732971642672e-05,
"loss": 2.9271,
"step": 90
},
{
"epoch": 0.318739054290718,
"grad_norm": 26.244932174682617,
"learning_rate": 3.999582773414259e-05,
"loss": 2.9479,
"step": 91
},
{
"epoch": 0.3222416812609457,
"grad_norm": 10.8726167678833,
"learning_rate": 3.99939920290818e-05,
"loss": 2.5413,
"step": 92
},
{
"epoch": 0.3257443082311734,
"grad_norm": 12.799675941467285,
"learning_rate": 3.999182263188156e-05,
"loss": 3.3469,
"step": 93
},
{
"epoch": 0.329246935201401,
"grad_norm": 13.35167121887207,
"learning_rate": 3.998931957874829e-05,
"loss": 3.5572,
"step": 94
},
{
"epoch": 0.3327495621716287,
"grad_norm": 10.501457214355469,
"learning_rate": 3.998648291145701e-05,
"loss": 2.5409,
"step": 95
},
{
"epoch": 0.3362521891418564,
"grad_norm": 13.109798431396484,
"learning_rate": 3.99833126773506e-05,
"loss": 3.4712,
"step": 96
},
{
"epoch": 0.3397548161120841,
"grad_norm": 12.54065227508545,
"learning_rate": 3.9979808929339066e-05,
"loss": 3.4943,
"step": 97
},
{
"epoch": 0.3432574430823117,
"grad_norm": 18.664926528930664,
"learning_rate": 3.997597172589865e-05,
"loss": 4.0018,
"step": 98
},
{
"epoch": 0.3467600700525394,
"grad_norm": 9.06640911102295,
"learning_rate": 3.9971801131070805e-05,
"loss": 3.5354,
"step": 99
},
{
"epoch": 0.3502626970227671,
"grad_norm": 10.051581382751465,
"learning_rate": 3.996729721446118e-05,
"loss": 3.6732,
"step": 100
},
{
"epoch": 0.35376532399299476,
"grad_norm": 10.670345306396484,
"learning_rate": 3.996246005123843e-05,
"loss": 3.422,
"step": 101
},
{
"epoch": 0.3572679509632224,
"grad_norm": 12.128016471862793,
"learning_rate": 3.995728972213299e-05,
"loss": 3.37,
"step": 102
},
{
"epoch": 0.3607705779334501,
"grad_norm": 15.131268501281738,
"learning_rate": 3.995178631343567e-05,
"loss": 3.8201,
"step": 103
},
{
"epoch": 0.36427320490367776,
"grad_norm": 11.882003784179688,
"learning_rate": 3.994594991699629e-05,
"loss": 2.4535,
"step": 104
},
{
"epoch": 0.36777583187390545,
"grad_norm": 13.227598190307617,
"learning_rate": 3.993978063022208e-05,
"loss": 3.5045,
"step": 105
},
{
"epoch": 0.3712784588441331,
"grad_norm": 9.773946762084961,
"learning_rate": 3.99332785560761e-05,
"loss": 2.7874,
"step": 106
},
{
"epoch": 0.37478108581436076,
"grad_norm": 27.06616973876953,
"learning_rate": 3.99264438030755e-05,
"loss": 3.155,
"step": 107
},
{
"epoch": 0.37828371278458844,
"grad_norm": 9.595654487609863,
"learning_rate": 3.991927648528971e-05,
"loss": 2.9589,
"step": 108
},
{
"epoch": 0.38178633975481613,
"grad_norm": 14.137282371520996,
"learning_rate": 3.9911776722338544e-05,
"loss": 3.6373,
"step": 109
},
{
"epoch": 0.38528896672504376,
"grad_norm": 11.451871871948242,
"learning_rate": 3.990394463939021e-05,
"loss": 3.41,
"step": 110
},
{
"epoch": 0.38879159369527144,
"grad_norm": 11.46674919128418,
"learning_rate": 3.989578036715919e-05,
"loss": 3.5972,
"step": 111
},
{
"epoch": 0.3922942206654991,
"grad_norm": 16.71035385131836,
"learning_rate": 3.9887284041904085e-05,
"loss": 3.734,
"step": 112
},
{
"epoch": 0.3957968476357268,
"grad_norm": 12.229024887084961,
"learning_rate": 3.987845580542535e-05,
"loss": 3.4147,
"step": 113
},
{
"epoch": 0.39929947460595444,
"grad_norm": 19.44122886657715,
"learning_rate": 3.9869295805062906e-05,
"loss": 3.7344,
"step": 114
},
{
"epoch": 0.4028021015761821,
"grad_norm": 19.81208038330078,
"learning_rate": 3.9859804193693665e-05,
"loss": 4.4243,
"step": 115
},
{
"epoch": 0.4063047285464098,
"grad_norm": 12.858543395996094,
"learning_rate": 3.9849981129729026e-05,
"loss": 4.4602,
"step": 116
},
{
"epoch": 0.4098073555166375,
"grad_norm": 14.306174278259277,
"learning_rate": 3.983982677711221e-05,
"loss": 3.147,
"step": 117
},
{
"epoch": 0.4133099824868651,
"grad_norm": 9.356593132019043,
"learning_rate": 3.98293413053155e-05,
"loss": 3.8658,
"step": 118
},
{
"epoch": 0.4168126094570928,
"grad_norm": 9.299378395080566,
"learning_rate": 3.981852488933743e-05,
"loss": 4.0554,
"step": 119
},
{
"epoch": 0.4203152364273205,
"grad_norm": 9.975719451904297,
"learning_rate": 3.9807377709699895e-05,
"loss": 4.179,
"step": 120
},
{
"epoch": 0.4238178633975482,
"grad_norm": 13.278733253479004,
"learning_rate": 3.9795899952445086e-05,
"loss": 2.7921,
"step": 121
},
{
"epoch": 0.4273204903677758,
"grad_norm": 10.308444023132324,
"learning_rate": 3.9784091809132424e-05,
"loss": 3.8932,
"step": 122
},
{
"epoch": 0.4308231173380035,
"grad_norm": 17.627866744995117,
"learning_rate": 3.9771953476835323e-05,
"loss": 3.3324,
"step": 123
},
{
"epoch": 0.4343257443082312,
"grad_norm": 14.405228614807129,
"learning_rate": 3.975948515813795e-05,
"loss": 3.3283,
"step": 124
},
{
"epoch": 0.43782837127845886,
"grad_norm": 6.409677028656006,
"learning_rate": 3.9746687061131826e-05,
"loss": 2.5538,
"step": 125
},
{
"epoch": 0.4413309982486865,
"grad_norm": 21.610538482666016,
"learning_rate": 3.973355939941232e-05,
"loss": 3.2504,
"step": 126
},
{
"epoch": 0.4448336252189142,
"grad_norm": 11.399911880493164,
"learning_rate": 3.972010239207512e-05,
"loss": 3.8677,
"step": 127
},
{
"epoch": 0.44833625218914186,
"grad_norm": 11.416419982910156,
"learning_rate": 3.970631626371258e-05,
"loss": 2.8155,
"step": 128
},
{
"epoch": 0.45183887915936954,
"grad_norm": 11.055480003356934,
"learning_rate": 3.969220124440995e-05,
"loss": 3.0696,
"step": 129
},
{
"epoch": 0.4553415061295972,
"grad_norm": 24.55332374572754,
"learning_rate": 3.9677757569741544e-05,
"loss": 4.3794,
"step": 130
},
{
"epoch": 0.45884413309982486,
"grad_norm": 14.385886192321777,
"learning_rate": 3.966298548076681e-05,
"loss": 2.9191,
"step": 131
},
{
"epoch": 0.46234676007005254,
"grad_norm": 16.254535675048828,
"learning_rate": 3.9647885224026306e-05,
"loss": 3.7145,
"step": 132
},
{
"epoch": 0.4658493870402802,
"grad_norm": 17.73900604248047,
"learning_rate": 3.96324570515376e-05,
"loss": 3.6577,
"step": 133
},
{
"epoch": 0.46935201401050786,
"grad_norm": 10.10382080078125,
"learning_rate": 3.961670122079101e-05,
"loss": 3.0099,
"step": 134
},
{
"epoch": 0.47285464098073554,
"grad_norm": 19.65964698791504,
"learning_rate": 3.960061799474539e-05,
"loss": 3.1526,
"step": 135
},
{
"epoch": 0.4763572679509632,
"grad_norm": 8.950761795043945,
"learning_rate": 3.958420764182368e-05,
"loss": 3.0122,
"step": 136
},
{
"epoch": 0.4798598949211909,
"grad_norm": 20.611316680908203,
"learning_rate": 3.956747043590844e-05,
"loss": 4.1029,
"step": 137
},
{
"epoch": 0.48336252189141854,
"grad_norm": 11.411933898925781,
"learning_rate": 3.9550406656337286e-05,
"loss": 3.5931,
"step": 138
},
{
"epoch": 0.4868651488616462,
"grad_norm": 14.644675254821777,
"learning_rate": 3.953301658789822e-05,
"loss": 3.5152,
"step": 139
},
{
"epoch": 0.4903677758318739,
"grad_norm": 16.666471481323242,
"learning_rate": 3.9515300520824877e-05,
"loss": 3.8511,
"step": 140
},
{
"epoch": 0.4938704028021016,
"grad_norm": 8.767988204956055,
"learning_rate": 3.949725875079171e-05,
"loss": 2.9792,
"step": 141
},
{
"epoch": 0.4973730297723292,
"grad_norm": 7.37055778503418,
"learning_rate": 3.947889157890901e-05,
"loss": 2.7419,
"step": 142
},
{
"epoch": 0.500875656742557,
"grad_norm": 9.221246719360352,
"learning_rate": 3.9460199311717905e-05,
"loss": 3.4633,
"step": 143
},
{
"epoch": 0.5043782837127846,
"grad_norm": 8.684052467346191,
"learning_rate": 3.944118226118523e-05,
"loss": 2.51,
"step": 144
},
{
"epoch": 0.5078809106830122,
"grad_norm": 13.942377090454102,
"learning_rate": 3.942184074469835e-05,
"loss": 2.778,
"step": 145
},
{
"epoch": 0.51138353765324,
"grad_norm": 8.383426666259766,
"learning_rate": 3.940217508505984e-05,
"loss": 3.2499,
"step": 146
},
{
"epoch": 0.5148861646234676,
"grad_norm": 11.05321979522705,
"learning_rate": 3.938218561048209e-05,
"loss": 2.8737,
"step": 147
},
{
"epoch": 0.5183887915936952,
"grad_norm": 15.219361305236816,
"learning_rate": 3.936187265458185e-05,
"loss": 3.2706,
"step": 148
},
{
"epoch": 0.521891418563923,
"grad_norm": 12.02111530303955,
"learning_rate": 3.9341236556374635e-05,
"loss": 2.4901,
"step": 149
},
{
"epoch": 0.5253940455341506,
"grad_norm": 12.307247161865234,
"learning_rate": 3.93202776602691e-05,
"loss": 3.0426,
"step": 150
},
{
"epoch": 0.5288966725043783,
"grad_norm": 17.756227493286133,
"learning_rate": 3.929899631606128e-05,
"loss": 3.8817,
"step": 151
},
{
"epoch": 0.532399299474606,
"grad_norm": 15.380708694458008,
"learning_rate": 3.9277392878928714e-05,
"loss": 2.2106,
"step": 152
},
{
"epoch": 0.5359019264448336,
"grad_norm": 11.024133682250977,
"learning_rate": 3.925546770942457e-05,
"loss": 3.1595,
"step": 153
},
{
"epoch": 0.5394045534150613,
"grad_norm": 12.819958686828613,
"learning_rate": 3.9233221173471616e-05,
"loss": 2.6395,
"step": 154
},
{
"epoch": 0.542907180385289,
"grad_norm": 9.612881660461426,
"learning_rate": 3.9210653642356094e-05,
"loss": 3.2601,
"step": 155
},
{
"epoch": 0.5464098073555166,
"grad_norm": 16.474605560302734,
"learning_rate": 3.9187765492721534e-05,
"loss": 3.6263,
"step": 156
},
{
"epoch": 0.5499124343257443,
"grad_norm": 7.583954334259033,
"learning_rate": 3.916455710656246e-05,
"loss": 3.0502,
"step": 157
},
{
"epoch": 0.553415061295972,
"grad_norm": 10.567022323608398,
"learning_rate": 3.914102887121804e-05,
"loss": 2.5924,
"step": 158
},
{
"epoch": 0.5569176882661997,
"grad_norm": 9.386385917663574,
"learning_rate": 3.911718117936559e-05,
"loss": 2.6265,
"step": 159
},
{
"epoch": 0.5604203152364273,
"grad_norm": 11.7581787109375,
"learning_rate": 3.9093014429014034e-05,
"loss": 3.7777,
"step": 160
},
{
"epoch": 0.563922942206655,
"grad_norm": 9.84007453918457,
"learning_rate": 3.906852902349726e-05,
"loss": 3.1266,
"step": 161
},
{
"epoch": 0.5674255691768827,
"grad_norm": 18.16061782836914,
"learning_rate": 3.9043725371467406e-05,
"loss": 3.04,
"step": 162
},
{
"epoch": 0.5709281961471103,
"grad_norm": 8.399232864379883,
"learning_rate": 3.9018603886888e-05,
"loss": 3.1878,
"step": 163
},
{
"epoch": 0.574430823117338,
"grad_norm": 28.759723663330078,
"learning_rate": 3.89931649890271e-05,
"loss": 2.6301,
"step": 164
},
{
"epoch": 0.5779334500875657,
"grad_norm": 13.141151428222656,
"learning_rate": 3.896740910245024e-05,
"loss": 2.3805,
"step": 165
},
{
"epoch": 0.5814360770577933,
"grad_norm": 9.281489372253418,
"learning_rate": 3.894133665701341e-05,
"loss": 2.4934,
"step": 166
},
{
"epoch": 0.5849387040280211,
"grad_norm": 12.659279823303223,
"learning_rate": 3.8914948087855816e-05,
"loss": 2.809,
"step": 167
},
{
"epoch": 0.5884413309982487,
"grad_norm": 9.575944900512695,
"learning_rate": 3.888824383539267e-05,
"loss": 3.4147,
"step": 168
},
{
"epoch": 0.5919439579684763,
"grad_norm": 12.01468276977539,
"learning_rate": 3.886122434530782e-05,
"loss": 4.2984,
"step": 169
},
{
"epoch": 0.5954465849387041,
"grad_norm": 17.843990325927734,
"learning_rate": 3.8833890068546284e-05,
"loss": 3.9301,
"step": 170
},
{
"epoch": 0.5989492119089317,
"grad_norm": 10.337265014648438,
"learning_rate": 3.8806241461306774e-05,
"loss": 2.9173,
"step": 171
},
{
"epoch": 0.6024518388791593,
"grad_norm": 10.002184867858887,
"learning_rate": 3.877827898503406e-05,
"loss": 3.1344,
"step": 172
},
{
"epoch": 0.6059544658493871,
"grad_norm": 9.526917457580566,
"learning_rate": 3.875000310641125e-05,
"loss": 2.7306,
"step": 173
},
{
"epoch": 0.6094570928196147,
"grad_norm": 15.251102447509766,
"learning_rate": 3.8721414297352044e-05,
"loss": 2.6925,
"step": 174
},
{
"epoch": 0.6129597197898424,
"grad_norm": 15.054774284362793,
"learning_rate": 3.86925130349928e-05,
"loss": 2.9571,
"step": 175
},
{
"epoch": 0.6164623467600701,
"grad_norm": 6.836494445800781,
"learning_rate": 3.866329980168462e-05,
"loss": 2.8877,
"step": 176
},
{
"epoch": 0.6199649737302977,
"grad_norm": 66.23248291015625,
"learning_rate": 3.863377508498529e-05,
"loss": 3.5541,
"step": 177
},
{
"epoch": 0.6234676007005254,
"grad_norm": 8.304655075073242,
"learning_rate": 3.860393937765112e-05,
"loss": 2.7798,
"step": 178
},
{
"epoch": 0.626970227670753,
"grad_norm": 12.527859687805176,
"learning_rate": 3.8573793177628746e-05,
"loss": 2.7926,
"step": 179
},
{
"epoch": 0.6304728546409807,
"grad_norm": 15.866973876953125,
"learning_rate": 3.854333698804681e-05,
"loss": 2.7737,
"step": 180
},
{
"epoch": 0.6339754816112084,
"grad_norm": 9.761703491210938,
"learning_rate": 3.8512571317207545e-05,
"loss": 2.9336,
"step": 181
},
{
"epoch": 0.637478108581436,
"grad_norm": 19.601985931396484,
"learning_rate": 3.848149667857834e-05,
"loss": 3.7819,
"step": 182
},
{
"epoch": 0.6409807355516638,
"grad_norm": 8.4579439163208,
"learning_rate": 3.845011359078311e-05,
"loss": 3.8168,
"step": 183
},
{
"epoch": 0.6444833625218914,
"grad_norm": 9.35549545288086,
"learning_rate": 3.841842257759368e-05,
"loss": 3.1408,
"step": 184
},
{
"epoch": 0.647985989492119,
"grad_norm": 11.803943634033203,
"learning_rate": 3.8386424167921044e-05,
"loss": 3.3967,
"step": 185
},
{
"epoch": 0.6514886164623468,
"grad_norm": 10.127033233642578,
"learning_rate": 3.83541188958065e-05,
"loss": 3.4275,
"step": 186
},
{
"epoch": 0.6549912434325744,
"grad_norm": 15.094486236572266,
"learning_rate": 3.83215073004128e-05,
"loss": 3.5596,
"step": 187
},
{
"epoch": 0.658493870402802,
"grad_norm": 5.772601127624512,
"learning_rate": 3.8288589926015085e-05,
"loss": 1.9319,
"step": 188
},
{
"epoch": 0.6619964973730298,
"grad_norm": 6.778163433074951,
"learning_rate": 3.825536732199183e-05,
"loss": 2.1441,
"step": 189
},
{
"epoch": 0.6654991243432574,
"grad_norm": 28.084945678710938,
"learning_rate": 3.822184004281571e-05,
"loss": 3.33,
"step": 190
},
{
"epoch": 0.6690017513134852,
"grad_norm": 24.842683792114258,
"learning_rate": 3.818800864804428e-05,
"loss": 3.7081,
"step": 191
},
{
"epoch": 0.6725043782837128,
"grad_norm": 24.76456069946289,
"learning_rate": 3.815387370231068e-05,
"loss": 3.9048,
"step": 192
},
{
"epoch": 0.6760070052539404,
"grad_norm": 16.95564842224121,
"learning_rate": 3.8119435775314195e-05,
"loss": 3.8501,
"step": 193
},
{
"epoch": 0.6795096322241682,
"grad_norm": 14.68836784362793,
"learning_rate": 3.808469544181078e-05,
"loss": 2.9176,
"step": 194
},
{
"epoch": 0.6830122591943958,
"grad_norm": 12.532986640930176,
"learning_rate": 3.804965328160339e-05,
"loss": 2.9567,
"step": 195
},
{
"epoch": 0.6865148861646234,
"grad_norm": 12.127058982849121,
"learning_rate": 3.801430987953239e-05,
"loss": 2.9253,
"step": 196
},
{
"epoch": 0.6900175131348512,
"grad_norm": 16.993106842041016,
"learning_rate": 3.797866582546577e-05,
"loss": 3.7068,
"step": 197
},
{
"epoch": 0.6935201401050788,
"grad_norm": 8.354787826538086,
"learning_rate": 3.794272171428923e-05,
"loss": 2.4388,
"step": 198
},
{
"epoch": 0.6970227670753065,
"grad_norm": 20.402502059936523,
"learning_rate": 3.790647814589637e-05,
"loss": 3.7772,
"step": 199
},
{
"epoch": 0.7005253940455342,
"grad_norm": 12.042376518249512,
"learning_rate": 3.7869935725178574e-05,
"loss": 2.7738,
"step": 200
},
{
"epoch": 0.7040280210157618,
"grad_norm": 9.22277545928955,
"learning_rate": 3.7833095062015e-05,
"loss": 2.7279,
"step": 201
},
{
"epoch": 0.7075306479859895,
"grad_norm": 16.998899459838867,
"learning_rate": 3.7795956771262315e-05,
"loss": 3.5442,
"step": 202
},
{
"epoch": 0.7110332749562172,
"grad_norm": 17.048124313354492,
"learning_rate": 3.77585214727445e-05,
"loss": 3.5084,
"step": 203
},
{
"epoch": 0.7145359019264448,
"grad_norm": 18.509761810302734,
"learning_rate": 3.77207897912425e-05,
"loss": 3.6591,
"step": 204
},
{
"epoch": 0.7180385288966725,
"grad_norm": 13.900635719299316,
"learning_rate": 3.768276235648373e-05,
"loss": 3.0523,
"step": 205
},
{
"epoch": 0.7215411558669002,
"grad_norm": 8.05377197265625,
"learning_rate": 3.764443980313168e-05,
"loss": 2.7942,
"step": 206
},
{
"epoch": 0.7250437828371279,
"grad_norm": 8.520611763000488,
"learning_rate": 3.760582277077519e-05,
"loss": 3.3686,
"step": 207
},
{
"epoch": 0.7285464098073555,
"grad_norm": 10.90099811553955,
"learning_rate": 3.7566911903917896e-05,
"loss": 3.7414,
"step": 208
},
{
"epoch": 0.7320490367775832,
"grad_norm": 11.195393562316895,
"learning_rate": 3.752770785196739e-05,
"loss": 3.1357,
"step": 209
},
{
"epoch": 0.7355516637478109,
"grad_norm": 8.983715057373047,
"learning_rate": 3.748821126922442e-05,
"loss": 3.5049,
"step": 210
},
{
"epoch": 0.7390542907180385,
"grad_norm": 20.425249099731445,
"learning_rate": 3.744842281487198e-05,
"loss": 4.4525,
"step": 211
},
{
"epoch": 0.7425569176882661,
"grad_norm": 11.836134910583496,
"learning_rate": 3.740834315296426e-05,
"loss": 2.8092,
"step": 212
},
{
"epoch": 0.7460595446584939,
"grad_norm": 20.9670352935791,
"learning_rate": 3.73679729524156e-05,
"loss": 3.6011,
"step": 213
},
{
"epoch": 0.7495621716287215,
"grad_norm": 6.383330821990967,
"learning_rate": 3.7327312886989345e-05,
"loss": 1.8692,
"step": 214
},
{
"epoch": 0.7530647985989493,
"grad_norm": 10.559625625610352,
"learning_rate": 3.728636363528654e-05,
"loss": 3.7165,
"step": 215
},
{
"epoch": 0.7565674255691769,
"grad_norm": 20.34081268310547,
"learning_rate": 3.724512588073467e-05,
"loss": 3.1359,
"step": 216
},
{
"epoch": 0.7600700525394045,
"grad_norm": 12.730659484863281,
"learning_rate": 3.72036003115762e-05,
"loss": 3.0624,
"step": 217
},
{
"epoch": 0.7635726795096323,
"grad_norm": 15.829120635986328,
"learning_rate": 3.7161787620857124e-05,
"loss": 3.6834,
"step": 218
},
{
"epoch": 0.7670753064798599,
"grad_norm": 14.584449768066406,
"learning_rate": 3.711968850641539e-05,
"loss": 3.8689,
"step": 219
},
{
"epoch": 0.7705779334500875,
"grad_norm": 11.392684936523438,
"learning_rate": 3.707730367086923e-05,
"loss": 3.0575,
"step": 220
},
{
"epoch": 0.7740805604203153,
"grad_norm": 7.998726844787598,
"learning_rate": 3.703463382160546e-05,
"loss": 3.0557,
"step": 221
},
{
"epoch": 0.7775831873905429,
"grad_norm": 8.74062728881836,
"learning_rate": 3.699167967076769e-05,
"loss": 2.7828,
"step": 222
},
{
"epoch": 0.7810858143607706,
"grad_norm": 9.26952075958252,
"learning_rate": 3.694844193524438e-05,
"loss": 2.6036,
"step": 223
},
{
"epoch": 0.7845884413309983,
"grad_norm": 20.752344131469727,
"learning_rate": 3.690492133665695e-05,
"loss": 4.1958,
"step": 224
},
{
"epoch": 0.7880910683012259,
"grad_norm": 8.477578163146973,
"learning_rate": 3.686111860134765e-05,
"loss": 3.269,
"step": 225
},
{
"epoch": 0.7915936952714536,
"grad_norm": 12.902408599853516,
"learning_rate": 3.681703446036751e-05,
"loss": 2.917,
"step": 226
},
{
"epoch": 0.7950963222416813,
"grad_norm": 5.842979431152344,
"learning_rate": 3.677266964946414e-05,
"loss": 2.2078,
"step": 227
},
{
"epoch": 0.7985989492119089,
"grad_norm": 6.6894755363464355,
"learning_rate": 3.672802490906937e-05,
"loss": 2.2943,
"step": 228
},
{
"epoch": 0.8021015761821366,
"grad_norm": 12.93032455444336,
"learning_rate": 3.6683100984286994e-05,
"loss": 3.2858,
"step": 229
},
{
"epoch": 0.8056042031523643,
"grad_norm": 16.70753288269043,
"learning_rate": 3.663789862488025e-05,
"loss": 3.2081,
"step": 230
},
{
"epoch": 0.809106830122592,
"grad_norm": 9.086612701416016,
"learning_rate": 3.659241858525938e-05,
"loss": 3.0141,
"step": 231
},
{
"epoch": 0.8126094570928196,
"grad_norm": 13.422370910644531,
"learning_rate": 3.6546661624468976e-05,
"loss": 3.8186,
"step": 232
},
{
"epoch": 0.8161120840630472,
"grad_norm": 20.846773147583008,
"learning_rate": 3.6500628506175353e-05,
"loss": 4.2557,
"step": 233
},
{
"epoch": 0.819614711033275,
"grad_norm": 11.69239616394043,
"learning_rate": 3.645431999865379e-05,
"loss": 2.3714,
"step": 234
},
{
"epoch": 0.8231173380035026,
"grad_norm": 16.386404037475586,
"learning_rate": 3.6407736874775694e-05,
"loss": 4.0155,
"step": 235
},
{
"epoch": 0.8266199649737302,
"grad_norm": 16.324825286865234,
"learning_rate": 3.6360879911995735e-05,
"loss": 4.509,
"step": 236
},
{
"epoch": 0.830122591943958,
"grad_norm": 7.930265426635742,
"learning_rate": 3.631374989233883e-05,
"loss": 2.2893,
"step": 237
},
{
"epoch": 0.8336252189141856,
"grad_norm": 9.163013458251953,
"learning_rate": 3.626634760238712e-05,
"loss": 2.7087,
"step": 238
},
{
"epoch": 0.8371278458844134,
"grad_norm": 10.82229995727539,
"learning_rate": 3.621867383326685e-05,
"loss": 3.1594,
"step": 239
},
{
"epoch": 0.840630472854641,
"grad_norm": 13.09811019897461,
"learning_rate": 3.617072938063513e-05,
"loss": 3.5272,
"step": 240
},
{
"epoch": 0.8441330998248686,
"grad_norm": 11.873635292053223,
"learning_rate": 3.6122515044666676e-05,
"loss": 3.0622,
"step": 241
},
{
"epoch": 0.8476357267950964,
"grad_norm": 11.226723670959473,
"learning_rate": 3.607403163004046e-05,
"loss": 2.3039,
"step": 242
},
{
"epoch": 0.851138353765324,
"grad_norm": 8.14344596862793,
"learning_rate": 3.602527994592627e-05,
"loss": 1.8582,
"step": 243
},
{
"epoch": 0.8546409807355516,
"grad_norm": 15.325976371765137,
"learning_rate": 3.5976260805971216e-05,
"loss": 2.7251,
"step": 244
},
{
"epoch": 0.8581436077057794,
"grad_norm": 9.784257888793945,
"learning_rate": 3.5926975028286145e-05,
"loss": 2.6414,
"step": 245
},
{
"epoch": 0.861646234676007,
"grad_norm": 12.891518592834473,
"learning_rate": 3.587742343543198e-05,
"loss": 3.6431,
"step": 246
},
{
"epoch": 0.8651488616462347,
"grad_norm": 12.887101173400879,
"learning_rate": 3.5827606854405995e-05,
"loss": 2.1479,
"step": 247
},
{
"epoch": 0.8686514886164624,
"grad_norm": 8.942181587219238,
"learning_rate": 3.577752611662803e-05,
"loss": 2.7682,
"step": 248
},
{
"epoch": 0.87215411558669,
"grad_norm": 10.70305061340332,
"learning_rate": 3.5727182057926594e-05,
"loss": 1.6457,
"step": 249
},
{
"epoch": 0.8756567425569177,
"grad_norm": 20.12733268737793,
"learning_rate": 3.567657551852492e-05,
"loss": 3.6745,
"step": 250
},
{
"epoch": 0.8791593695271454,
"grad_norm": 11.07908821105957,
"learning_rate": 3.5625707343026943e-05,
"loss": 3.3239,
"step": 251
},
{
"epoch": 0.882661996497373,
"grad_norm": 17.630199432373047,
"learning_rate": 3.557457838040321e-05,
"loss": 3.6983,
"step": 252
},
{
"epoch": 0.8861646234676007,
"grad_norm": 13.158476829528809,
"learning_rate": 3.552318948397671e-05,
"loss": 2.9929,
"step": 253
},
{
"epoch": 0.8896672504378283,
"grad_norm": 18.34935188293457,
"learning_rate": 3.547154151140862e-05,
"loss": 2.9516,
"step": 254
},
{
"epoch": 0.8931698774080561,
"grad_norm": 9.750235557556152,
"learning_rate": 3.5419635324683996e-05,
"loss": 3.2622,
"step": 255
},
{
"epoch": 0.8966725043782837,
"grad_norm": 13.19760513305664,
"learning_rate": 3.5367471790097395e-05,
"loss": 3.4669,
"step": 256
},
{
"epoch": 0.9001751313485113,
"grad_norm": 13.441046714782715,
"learning_rate": 3.5315051778238425e-05,
"loss": 3.3668,
"step": 257
},
{
"epoch": 0.9036777583187391,
"grad_norm": 14.647550582885742,
"learning_rate": 3.526237616397718e-05,
"loss": 3.6369,
"step": 258
},
{
"epoch": 0.9071803852889667,
"grad_norm": 18.50444984436035,
"learning_rate": 3.520944582644968e-05,
"loss": 3.3555,
"step": 259
},
{
"epoch": 0.9106830122591943,
"grad_norm": 13.95717716217041,
"learning_rate": 3.515626164904317e-05,
"loss": 3.3408,
"step": 260
},
{
"epoch": 0.9141856392294221,
"grad_norm": 10.345568656921387,
"learning_rate": 3.510282451938139e-05,
"loss": 3.7049,
"step": 261
},
{
"epoch": 0.9176882661996497,
"grad_norm": 16.413713455200195,
"learning_rate": 3.5049135329309746e-05,
"loss": 3.2264,
"step": 262
},
{
"epoch": 0.9211908931698775,
"grad_norm": 8.679278373718262,
"learning_rate": 3.499519497488046e-05,
"loss": 4.0202,
"step": 263
},
{
"epoch": 0.9246935201401051,
"grad_norm": 13.41882038116455,
"learning_rate": 3.4941004356337566e-05,
"loss": 3.2065,
"step": 264
},
{
"epoch": 0.9281961471103327,
"grad_norm": 25.00071144104004,
"learning_rate": 3.488656437810193e-05,
"loss": 3.2281,
"step": 265
},
{
"epoch": 0.9316987740805605,
"grad_norm": 8.694498062133789,
"learning_rate": 3.4831875948756115e-05,
"loss": 2.7791,
"step": 266
},
{
"epoch": 0.9352014010507881,
"grad_norm": 8.920536041259766,
"learning_rate": 3.477693998102927e-05,
"loss": 2.8562,
"step": 267
},
{
"epoch": 0.9387040280210157,
"grad_norm": 13.899048805236816,
"learning_rate": 3.472175739178184e-05,
"loss": 2.759,
"step": 268
},
{
"epoch": 0.9422066549912435,
"grad_norm": 17.14980125427246,
"learning_rate": 3.4666329101990305e-05,
"loss": 3.4728,
"step": 269
},
{
"epoch": 0.9457092819614711,
"grad_norm": 12.87341022491455,
"learning_rate": 3.461065603673178e-05,
"loss": 3.6059,
"step": 270
},
{
"epoch": 0.9492119089316988,
"grad_norm": 17.3095703125,
"learning_rate": 3.45547391251686e-05,
"loss": 4.1194,
"step": 271
},
{
"epoch": 0.9527145359019265,
"grad_norm": 12.699674606323242,
"learning_rate": 3.4498579300532803e-05,
"loss": 2.7921,
"step": 272
},
{
"epoch": 0.9562171628721541,
"grad_norm": 11.289113998413086,
"learning_rate": 3.444217750011054e-05,
"loss": 2.6439,
"step": 273
},
{
"epoch": 0.9597197898423818,
"grad_norm": 10.24327564239502,
"learning_rate": 3.438553466522647e-05,
"loss": 3.1692,
"step": 274
},
{
"epoch": 0.9632224168126094,
"grad_norm": 7.742884159088135,
"learning_rate": 3.4328651741227997e-05,
"loss": 3.4076,
"step": 275
},
{
"epoch": 0.9667250437828371,
"grad_norm": 18.92707061767578,
"learning_rate": 3.4271529677469546e-05,
"loss": 3.0746,
"step": 276
},
{
"epoch": 0.9702276707530648,
"grad_norm": 9.217591285705566,
"learning_rate": 3.421416942729668e-05,
"loss": 2.6312,
"step": 277
},
{
"epoch": 0.9737302977232924,
"grad_norm": 18.405925750732422,
"learning_rate": 3.4156571948030206e-05,
"loss": 2.9252,
"step": 278
},
{
"epoch": 0.9772329246935202,
"grad_norm": 17.508012771606445,
"learning_rate": 3.40987382009502e-05,
"loss": 3.0746,
"step": 279
},
{
"epoch": 0.9807355516637478,
"grad_norm": 11.160324096679688,
"learning_rate": 3.4040669151279945e-05,
"loss": 3.445,
"step": 280
},
{
"epoch": 0.9842381786339754,
"grad_norm": 19.364736557006836,
"learning_rate": 3.3982365768169856e-05,
"loss": 3.5094,
"step": 281
},
{
"epoch": 0.9877408056042032,
"grad_norm": 13.00851821899414,
"learning_rate": 3.392382902468126e-05,
"loss": 3.5123,
"step": 282
},
{
"epoch": 0.9912434325744308,
"grad_norm": 7.9619550704956055,
"learning_rate": 3.38650598977702e-05,
"loss": 2.4051,
"step": 283
},
{
"epoch": 0.9947460595446584,
"grad_norm": 14.466473579406738,
"learning_rate": 3.380605936827109e-05,
"loss": 3.7423,
"step": 284
},
{
"epoch": 0.9982486865148862,
"grad_norm": 11.79201889038086,
"learning_rate": 3.374682842088039e-05,
"loss": 3.7114,
"step": 285
},
{
"epoch": 1.001751313485114,
"grad_norm": 8.342573165893555,
"learning_rate": 3.3687368044140125e-05,
"loss": 2.7779,
"step": 286
},
{
"epoch": 1.0052539404553416,
"grad_norm": 7.175319194793701,
"learning_rate": 3.362767923042142e-05,
"loss": 2.4556,
"step": 287
},
{
"epoch": 1.0087565674255692,
"grad_norm": 9.89644718170166,
"learning_rate": 3.356776297590794e-05,
"loss": 1.8897,
"step": 288
},
{
"epoch": 1.0122591943957968,
"grad_norm": 8.86550521850586,
"learning_rate": 3.3507620280579215e-05,
"loss": 1.3645,
"step": 289
},
{
"epoch": 1.0157618213660244,
"grad_norm": 7.4619855880737305,
"learning_rate": 3.3447252148194014e-05,
"loss": 1.0798,
"step": 290
},
{
"epoch": 1.0192644483362523,
"grad_norm": 14.268264770507812,
"learning_rate": 3.338665958627356e-05,
"loss": 3.0727,
"step": 291
},
{
"epoch": 1.02276707530648,
"grad_norm": 9.76750373840332,
"learning_rate": 3.332584360608471e-05,
"loss": 1.7199,
"step": 292
},
{
"epoch": 1.0262697022767076,
"grad_norm": 18.44536018371582,
"learning_rate": 3.32648052226231e-05,
"loss": 2.139,
"step": 293
},
{
"epoch": 1.0297723292469352,
"grad_norm": 11.522726058959961,
"learning_rate": 3.320354545459619e-05,
"loss": 0.8731,
"step": 294
},
{
"epoch": 1.0332749562171628,
"grad_norm": 10.644336700439453,
"learning_rate": 3.314206532440625e-05,
"loss": 0.7796,
"step": 295
},
{
"epoch": 1.0367775831873904,
"grad_norm": 9.179198265075684,
"learning_rate": 3.3080365858133335e-05,
"loss": 0.4986,
"step": 296
},
{
"epoch": 1.0402802101576183,
"grad_norm": 14.007453918457031,
"learning_rate": 3.301844808551811e-05,
"loss": 0.7696,
"step": 297
},
{
"epoch": 1.043782837127846,
"grad_norm": 15.713719367980957,
"learning_rate": 3.295631303994471e-05,
"loss": 0.8191,
"step": 298
},
{
"epoch": 1.0472854640980735,
"grad_norm": 24.446319580078125,
"learning_rate": 3.289396175842346e-05,
"loss": 1.3376,
"step": 299
},
{
"epoch": 1.0507880910683012,
"grad_norm": 18.64075469970703,
"learning_rate": 3.2831395281573605e-05,
"loss": 0.9942,
"step": 300
},
{
"epoch": 1.0542907180385288,
"grad_norm": 27.9279842376709,
"learning_rate": 3.276861465360587e-05,
"loss": 2.8875,
"step": 301
},
{
"epoch": 1.0577933450087567,
"grad_norm": 13.904041290283203,
"learning_rate": 3.270562092230514e-05,
"loss": 1.2454,
"step": 302
},
{
"epoch": 1.0612959719789843,
"grad_norm": 15.944894790649414,
"learning_rate": 3.264241513901287e-05,
"loss": 0.6807,
"step": 303
},
{
"epoch": 1.064798598949212,
"grad_norm": 31.337432861328125,
"learning_rate": 3.2578998358609595e-05,
"loss": 1.8232,
"step": 304
},
{
"epoch": 1.0683012259194395,
"grad_norm": 14.853253364562988,
"learning_rate": 3.251537163949732e-05,
"loss": 2.0792,
"step": 305
},
{
"epoch": 1.0718038528896672,
"grad_norm": 11.485796928405762,
"learning_rate": 3.245153604358184e-05,
"loss": 1.0212,
"step": 306
},
{
"epoch": 1.0753064798598948,
"grad_norm": 10.490492820739746,
"learning_rate": 3.238749263625503e-05,
"loss": 0.937,
"step": 307
},
{
"epoch": 1.0788091068301227,
"grad_norm": 10.13619613647461,
"learning_rate": 3.2323242486377056e-05,
"loss": 1.597,
"step": 308
},
{
"epoch": 1.0823117338003503,
"grad_norm": 17.22042465209961,
"learning_rate": 3.2258786666258524e-05,
"loss": 1.7487,
"step": 309
},
{
"epoch": 1.085814360770578,
"grad_norm": 6.475918769836426,
"learning_rate": 3.219412625164262e-05,
"loss": 0.645,
"step": 310
},
{
"epoch": 1.0893169877408055,
"grad_norm": 12.947412490844727,
"learning_rate": 3.212926232168712e-05,
"loss": 2.1994,
"step": 311
},
{
"epoch": 1.0928196147110332,
"grad_norm": 9.059617042541504,
"learning_rate": 3.2064195958946406e-05,
"loss": 1.1179,
"step": 312
},
{
"epoch": 1.096322241681261,
"grad_norm": 8.391510009765625,
"learning_rate": 3.1998928249353394e-05,
"loss": 1.2839,
"step": 313
},
{
"epoch": 1.0998248686514887,
"grad_norm": 11.310205459594727,
"learning_rate": 3.193346028220136e-05,
"loss": 1.6124,
"step": 314
},
{
"epoch": 1.1033274956217163,
"grad_norm": 10.316460609436035,
"learning_rate": 3.1867793150125844e-05,
"loss": 0.9333,
"step": 315
},
{
"epoch": 1.106830122591944,
"grad_norm": 13.748690605163574,
"learning_rate": 3.180192794908636e-05,
"loss": 1.2929,
"step": 316
},
{
"epoch": 1.1103327495621715,
"grad_norm": 6.411694526672363,
"learning_rate": 3.173586577834812e-05,
"loss": 0.5326,
"step": 317
},
{
"epoch": 1.1138353765323994,
"grad_norm": 16.91478729248047,
"learning_rate": 3.166960774046369e-05,
"loss": 1.9846,
"step": 318
},
{
"epoch": 1.117338003502627,
"grad_norm": 10.801108360290527,
"learning_rate": 3.160315494125457e-05,
"loss": 1.1411,
"step": 319
},
{
"epoch": 1.1208406304728546,
"grad_norm": 28.392133712768555,
"learning_rate": 3.153650848979276e-05,
"loss": 2.0427,
"step": 320
},
{
"epoch": 1.1243432574430823,
"grad_norm": 13.292547225952148,
"learning_rate": 3.146966949838224e-05,
"loss": 1.7108,
"step": 321
},
{
"epoch": 1.12784588441331,
"grad_norm": 18.15999412536621,
"learning_rate": 3.140263908254042e-05,
"loss": 1.8602,
"step": 322
},
{
"epoch": 1.1313485113835378,
"grad_norm": 13.406962394714355,
"learning_rate": 3.13354183609795e-05,
"loss": 0.7865,
"step": 323
},
{
"epoch": 1.1348511383537654,
"grad_norm": 10.282164573669434,
"learning_rate": 3.126800845558782e-05,
"loss": 1.0902,
"step": 324
},
{
"epoch": 1.138353765323993,
"grad_norm": 12.66031551361084,
"learning_rate": 3.1200410491411105e-05,
"loss": 1.9786,
"step": 325
},
{
"epoch": 1.1418563922942206,
"grad_norm": 13.621040344238281,
"learning_rate": 3.1132625596633734e-05,
"loss": 1.8057,
"step": 326
},
{
"epoch": 1.1453590192644483,
"grad_norm": 13.055573463439941,
"learning_rate": 3.1064654902559875e-05,
"loss": 1.4152,
"step": 327
},
{
"epoch": 1.1488616462346761,
"grad_norm": 9.842079162597656,
"learning_rate": 3.099649954359462e-05,
"loss": 1.4,
"step": 328
},
{
"epoch": 1.1523642732049038,
"grad_norm": 5.827723026275635,
"learning_rate": 3.0928160657225044e-05,
"loss": 0.4828,
"step": 329
},
{
"epoch": 1.1558669001751314,
"grad_norm": 14.849542617797852,
"learning_rate": 3.085963938400122e-05,
"loss": 2.3794,
"step": 330
},
{
"epoch": 1.159369527145359,
"grad_norm": 11.149800300598145,
"learning_rate": 3.079093686751721e-05,
"loss": 1.6826,
"step": 331
},
{
"epoch": 1.1628721541155866,
"grad_norm": 12.088582992553711,
"learning_rate": 3.072205425439193e-05,
"loss": 1.2977,
"step": 332
},
{
"epoch": 1.1663747810858143,
"grad_norm": 9.657583236694336,
"learning_rate": 3.0652992694250055e-05,
"loss": 1.1603,
"step": 333
},
{
"epoch": 1.1698774080560421,
"grad_norm": 14.046825408935547,
"learning_rate": 3.0583753339702816e-05,
"loss": 0.9929,
"step": 334
},
{
"epoch": 1.1733800350262698,
"grad_norm": 8.845903396606445,
"learning_rate": 3.0514337346328768e-05,
"loss": 0.764,
"step": 335
},
{
"epoch": 1.1768826619964974,
"grad_norm": 8.96368408203125,
"learning_rate": 3.04447458726545e-05,
"loss": 0.8944,
"step": 336
},
{
"epoch": 1.180385288966725,
"grad_norm": 12.853364944458008,
"learning_rate": 3.0374980080135292e-05,
"loss": 1.6053,
"step": 337
},
{
"epoch": 1.1838879159369526,
"grad_norm": 11.627004623413086,
"learning_rate": 3.0305041133135745e-05,
"loss": 1.8025,
"step": 338
},
{
"epoch": 1.1873905429071803,
"grad_norm": 13.911933898925781,
"learning_rate": 3.0234930198910346e-05,
"loss": 1.7916,
"step": 339
},
{
"epoch": 1.1908931698774081,
"grad_norm": 19.774444580078125,
"learning_rate": 3.0164648447583977e-05,
"loss": 0.8744,
"step": 340
},
{
"epoch": 1.1943957968476357,
"grad_norm": 14.323461532592773,
"learning_rate": 3.0094197052132414e-05,
"loss": 1.6673,
"step": 341
},
{
"epoch": 1.1978984238178634,
"grad_norm": 13.647873878479004,
"learning_rate": 3.002357718836268e-05,
"loss": 1.2698,
"step": 342
},
{
"epoch": 1.201401050788091,
"grad_norm": 17.837255477905273,
"learning_rate": 2.9952790034893534e-05,
"loss": 2.0078,
"step": 343
},
{
"epoch": 1.2049036777583186,
"grad_norm": 13.053879737854004,
"learning_rate": 2.988183677313568e-05,
"loss": 1.9082,
"step": 344
},
{
"epoch": 1.2084063047285465,
"grad_norm": 10.994709014892578,
"learning_rate": 2.981071858727215e-05,
"loss": 1.9751,
"step": 345
},
{
"epoch": 1.2119089316987741,
"grad_norm": 10.098788261413574,
"learning_rate": 2.9739436664238464e-05,
"loss": 0.869,
"step": 346
},
{
"epoch": 1.2154115586690017,
"grad_norm": 14.354055404663086,
"learning_rate": 2.9667992193702865e-05,
"loss": 3.0441,
"step": 347
},
{
"epoch": 1.2189141856392294,
"grad_norm": 9.282268524169922,
"learning_rate": 2.9596386368046466e-05,
"loss": 0.9235,
"step": 348
},
{
"epoch": 1.222416812609457,
"grad_norm": 15.490865707397461,
"learning_rate": 2.9524620382343313e-05,
"loss": 1.7872,
"step": 349
},
{
"epoch": 1.2259194395796849,
"grad_norm": 11.079964637756348,
"learning_rate": 2.945269543434048e-05,
"loss": 1.258,
"step": 350
},
{
"epoch": 1.2294220665499125,
"grad_norm": 11.71127986907959,
"learning_rate": 2.9380612724438048e-05,
"loss": 1.9081,
"step": 351
},
{
"epoch": 1.2329246935201401,
"grad_norm": 11.974512100219727,
"learning_rate": 2.93083734556691e-05,
"loss": 1.5215,
"step": 352
},
{
"epoch": 1.2364273204903677,
"grad_norm": 8.614048957824707,
"learning_rate": 2.9235978833679604e-05,
"loss": 0.9446,
"step": 353
},
{
"epoch": 1.2399299474605954,
"grad_norm": 9.980677604675293,
"learning_rate": 2.916343006670834e-05,
"loss": 1.3451,
"step": 354
},
{
"epoch": 1.2434325744308232,
"grad_norm": 6.885375499725342,
"learning_rate": 2.9090728365566687e-05,
"loss": 0.3762,
"step": 355
},
{
"epoch": 1.2469352014010509,
"grad_norm": 9.016031265258789,
"learning_rate": 2.9017874943618465e-05,
"loss": 1.0411,
"step": 356
},
{
"epoch": 1.2504378283712785,
"grad_norm": 17.361536026000977,
"learning_rate": 2.8944871016759616e-05,
"loss": 2.4325,
"step": 357
},
{
"epoch": 1.253940455341506,
"grad_norm": 11.617949485778809,
"learning_rate": 2.887171780339799e-05,
"loss": 0.9287,
"step": 358
},
{
"epoch": 1.2574430823117337,
"grad_norm": 14.221748352050781,
"learning_rate": 2.879841652443295e-05,
"loss": 0.9818,
"step": 359
},
{
"epoch": 1.2609457092819616,
"grad_norm": 11.184192657470703,
"learning_rate": 2.8724968403235038e-05,
"loss": 1.5024,
"step": 360
},
{
"epoch": 1.2644483362521892,
"grad_norm": 17.768596649169922,
"learning_rate": 2.8651374665625507e-05,
"loss": 1.4243,
"step": 361
},
{
"epoch": 1.2679509632224168,
"grad_norm": 17.50186538696289,
"learning_rate": 2.857763653985592e-05,
"loss": 1.8902,
"step": 362
},
{
"epoch": 1.2714535901926445,
"grad_norm": 18.82529640197754,
"learning_rate": 2.8503755256587608e-05,
"loss": 1.1333,
"step": 363
},
{
"epoch": 1.274956217162872,
"grad_norm": 8.98492431640625,
"learning_rate": 2.8429732048871152e-05,
"loss": 0.6526,
"step": 364
},
{
"epoch": 1.2784588441331,
"grad_norm": 11.492777824401855,
"learning_rate": 2.83555681521258e-05,
"loss": 0.9215,
"step": 365
},
{
"epoch": 1.2819614711033274,
"grad_norm": 10.343647003173828,
"learning_rate": 2.8281264804118848e-05,
"loss": 0.7322,
"step": 366
},
{
"epoch": 1.2854640980735552,
"grad_norm": 16.036224365234375,
"learning_rate": 2.8206823244944966e-05,
"loss": 1.6768,
"step": 367
},
{
"epoch": 1.2889667250437828,
"grad_norm": 28.502471923828125,
"learning_rate": 2.8132244717005545e-05,
"loss": 1.3644,
"step": 368
},
{
"epoch": 1.2924693520140105,
"grad_norm": 19.65816879272461,
"learning_rate": 2.8057530464987883e-05,
"loss": 1.1389,
"step": 369
},
{
"epoch": 1.295971978984238,
"grad_norm": 15.79315185546875,
"learning_rate": 2.7982681735844532e-05,
"loss": 1.4806,
"step": 370
},
{
"epoch": 1.2994746059544657,
"grad_norm": 11.223061561584473,
"learning_rate": 2.7907699778772345e-05,
"loss": 0.761,
"step": 371
},
{
"epoch": 1.3029772329246936,
"grad_norm": 14.808821678161621,
"learning_rate": 2.783258584519175e-05,
"loss": 1.2235,
"step": 372
},
{
"epoch": 1.3064798598949212,
"grad_norm": 14.618925094604492,
"learning_rate": 2.7757341188725786e-05,
"loss": 2.7571,
"step": 373
},
{
"epoch": 1.3099824868651488,
"grad_norm": 14.738235473632812,
"learning_rate": 2.7681967065179212e-05,
"loss": 1.443,
"step": 374
},
{
"epoch": 1.3134851138353765,
"grad_norm": 11.83553695678711,
"learning_rate": 2.7606464732517548e-05,
"loss": 1.0723,
"step": 375
},
{
"epoch": 1.316987740805604,
"grad_norm": 11.12104606628418,
"learning_rate": 2.753083545084608e-05,
"loss": 2.5116,
"step": 376
},
{
"epoch": 1.320490367775832,
"grad_norm": 9.57516860961914,
"learning_rate": 2.7455080482388817e-05,
"loss": 0.9263,
"step": 377
},
{
"epoch": 1.3239929947460596,
"grad_norm": 9.881256103515625,
"learning_rate": 2.7379201091467425e-05,
"loss": 1.1542,
"step": 378
},
{
"epoch": 1.3274956217162872,
"grad_norm": 12.401742935180664,
"learning_rate": 2.7303198544480155e-05,
"loss": 2.2727,
"step": 379
},
{
"epoch": 1.3309982486865148,
"grad_norm": 19.053503036499023,
"learning_rate": 2.7227074109880662e-05,
"loss": 1.367,
"step": 380
},
{
"epoch": 1.3345008756567425,
"grad_norm": 10.002474784851074,
"learning_rate": 2.7150829058156886e-05,
"loss": 0.9753,
"step": 381
},
{
"epoch": 1.3380035026269703,
"grad_norm": 9.08540153503418,
"learning_rate": 2.707446466180979e-05,
"loss": 1.0053,
"step": 382
},
{
"epoch": 1.341506129597198,
"grad_norm": 9.479249000549316,
"learning_rate": 2.699798219533218e-05,
"loss": 1.0939,
"step": 383
},
{
"epoch": 1.3450087565674256,
"grad_norm": 7.634374618530273,
"learning_rate": 2.6921382935187393e-05,
"loss": 0.7234,
"step": 384
},
{
"epoch": 1.3485113835376532,
"grad_norm": 10.296411514282227,
"learning_rate": 2.6844668159788015e-05,
"loss": 1.1631,
"step": 385
},
{
"epoch": 1.3520140105078808,
"grad_norm": 11.623985290527344,
"learning_rate": 2.6767839149474533e-05,
"loss": 1.2477,
"step": 386
},
{
"epoch": 1.3555166374781087,
"grad_norm": 11.007031440734863,
"learning_rate": 2.6690897186493972e-05,
"loss": 1.1851,
"step": 387
},
{
"epoch": 1.3590192644483363,
"grad_norm": 12.727971076965332,
"learning_rate": 2.66138435549785e-05,
"loss": 1.2956,
"step": 388
},
{
"epoch": 1.362521891418564,
"grad_norm": 11.943631172180176,
"learning_rate": 2.6536679540923977e-05,
"loss": 1.3679,
"step": 389
},
{
"epoch": 1.3660245183887916,
"grad_norm": 14.202005386352539,
"learning_rate": 2.6459406432168525e-05,
"loss": 1.1852,
"step": 390
},
{
"epoch": 1.3695271453590192,
"grad_norm": 8.993060111999512,
"learning_rate": 2.6382025518371e-05,
"loss": 0.7622,
"step": 391
},
{
"epoch": 1.373029772329247,
"grad_norm": 21.68409538269043,
"learning_rate": 2.6304538090989488e-05,
"loss": 1.4973,
"step": 392
},
{
"epoch": 1.3765323992994747,
"grad_norm": 17.034027099609375,
"learning_rate": 2.6226945443259742e-05,
"loss": 1.6121,
"step": 393
},
{
"epoch": 1.3800350262697023,
"grad_norm": 16.731870651245117,
"learning_rate": 2.6149248870173618e-05,
"loss": 1.2101,
"step": 394
},
{
"epoch": 1.38353765323993,
"grad_norm": 17.604406356811523,
"learning_rate": 2.6071449668457426e-05,
"loss": 1.6075,
"step": 395
},
{
"epoch": 1.3870402802101576,
"grad_norm": 8.639128684997559,
"learning_rate": 2.5993549136550326e-05,
"loss": 0.7712,
"step": 396
},
{
"epoch": 1.3905429071803854,
"grad_norm": 17.338247299194336,
"learning_rate": 2.5915548574582622e-05,
"loss": 1.3931,
"step": 397
},
{
"epoch": 1.3940455341506128,
"grad_norm": 10.645469665527344,
"learning_rate": 2.583744928435411e-05,
"loss": 0.7641,
"step": 398
},
{
"epoch": 1.3975481611208407,
"grad_norm": 13.74074935913086,
"learning_rate": 2.5759252569312282e-05,
"loss": 1.5749,
"step": 399
},
{
"epoch": 1.4010507880910683,
"grad_norm": 11.76068115234375,
"learning_rate": 2.5680959734530663e-05,
"loss": 0.7086,
"step": 400
},
{
"epoch": 1.404553415061296,
"grad_norm": 8.68877124786377,
"learning_rate": 2.5602572086686935e-05,
"loss": 0.9056,
"step": 401
},
{
"epoch": 1.4080560420315236,
"grad_norm": 10.641279220581055,
"learning_rate": 2.5524090934041203e-05,
"loss": 1.1332,
"step": 402
},
{
"epoch": 1.4115586690017512,
"grad_norm": 12.472097396850586,
"learning_rate": 2.544551758641412e-05,
"loss": 1.6939,
"step": 403
},
{
"epoch": 1.415061295971979,
"grad_norm": 6.378911972045898,
"learning_rate": 2.5366853355165036e-05,
"loss": 0.4686,
"step": 404
},
{
"epoch": 1.4185639229422067,
"grad_norm": 14.799468994140625,
"learning_rate": 2.5288099553170123e-05,
"loss": 1.4848,
"step": 405
},
{
"epoch": 1.4220665499124343,
"grad_norm": 16.260482788085938,
"learning_rate": 2.5209257494800454e-05,
"loss": 2.419,
"step": 406
},
{
"epoch": 1.425569176882662,
"grad_norm": 16.565237045288086,
"learning_rate": 2.5130328495900046e-05,
"loss": 0.7131,
"step": 407
},
{
"epoch": 1.4290718038528896,
"grad_norm": 14.373163223266602,
"learning_rate": 2.505131387376396e-05,
"loss": 1.1584,
"step": 408
},
{
"epoch": 1.4325744308231174,
"grad_norm": 7.218545436859131,
"learning_rate": 2.4972214947116235e-05,
"loss": 0.6338,
"step": 409
},
{
"epoch": 1.436077057793345,
"grad_norm": 16.41472816467285,
"learning_rate": 2.489303303608796e-05,
"loss": 0.5099,
"step": 410
},
{
"epoch": 1.4395796847635727,
"grad_norm": 11.42400074005127,
"learning_rate": 2.4813769462195164e-05,
"loss": 1.1297,
"step": 411
},
{
"epoch": 1.4430823117338003,
"grad_norm": 15.395111083984375,
"learning_rate": 2.473442554831682e-05,
"loss": 0.5138,
"step": 412
},
{
"epoch": 1.446584938704028,
"grad_norm": 22.0821533203125,
"learning_rate": 2.4655002618672742e-05,
"loss": 2.1099,
"step": 413
},
{
"epoch": 1.4500875656742558,
"grad_norm": 13.769172668457031,
"learning_rate": 2.4575501998801487e-05,
"loss": 1.1711,
"step": 414
},
{
"epoch": 1.4535901926444834,
"grad_norm": 11.085646629333496,
"learning_rate": 2.4495925015538225e-05,
"loss": 1.5601,
"step": 415
},
{
"epoch": 1.457092819614711,
"grad_norm": 15.132203102111816,
"learning_rate": 2.4416272996992614e-05,
"loss": 2.413,
"step": 416
},
{
"epoch": 1.4605954465849387,
"grad_norm": 9.576119422912598,
"learning_rate": 2.4336547272526615e-05,
"loss": 1.0864,
"step": 417
},
{
"epoch": 1.4640980735551663,
"grad_norm": 10.316338539123535,
"learning_rate": 2.4256749172732317e-05,
"loss": 1.3956,
"step": 418
},
{
"epoch": 1.4676007005253942,
"grad_norm": 14.369219779968262,
"learning_rate": 2.4176880029409717e-05,
"loss": 1.4616,
"step": 419
},
{
"epoch": 1.4711033274956218,
"grad_norm": 11.594289779663086,
"learning_rate": 2.4096941175544514e-05,
"loss": 1.387,
"step": 420
},
{
"epoch": 1.4746059544658494,
"grad_norm": 13.779914855957031,
"learning_rate": 2.4016933945285842e-05,
"loss": 0.8608,
"step": 421
},
{
"epoch": 1.478108581436077,
"grad_norm": 11.94723129272461,
"learning_rate": 2.3936859673924e-05,
"loss": 0.7525,
"step": 422
},
{
"epoch": 1.4816112084063047,
"grad_norm": 22.267183303833008,
"learning_rate": 2.3856719697868202e-05,
"loss": 0.5829,
"step": 423
},
{
"epoch": 1.4851138353765325,
"grad_norm": 11.79100227355957,
"learning_rate": 2.3776515354624238e-05,
"loss": 1.3584,
"step": 424
},
{
"epoch": 1.4886164623467601,
"grad_norm": 14.353073120117188,
"learning_rate": 2.3696247982772146e-05,
"loss": 1.8197,
"step": 425
},
{
"epoch": 1.4921190893169878,
"grad_norm": 8.39911937713623,
"learning_rate": 2.3615918921943916e-05,
"loss": 1.4571,
"step": 426
},
{
"epoch": 1.4956217162872154,
"grad_norm": 16.525100708007812,
"learning_rate": 2.3535529512801083e-05,
"loss": 1.1179,
"step": 427
},
{
"epoch": 1.499124343257443,
"grad_norm": 11.500781059265137,
"learning_rate": 2.345508109701238e-05,
"loss": 1.1656,
"step": 428
},
{
"epoch": 1.5026269702276709,
"grad_norm": 22.588510513305664,
"learning_rate": 2.337457501723134e-05,
"loss": 1.6281,
"step": 429
},
{
"epoch": 1.5061295971978983,
"grad_norm": 10.75312328338623,
"learning_rate": 2.3294012617073874e-05,
"loss": 1.3504,
"step": 430
},
{
"epoch": 1.5096322241681261,
"grad_norm": 12.925887107849121,
"learning_rate": 2.321339524109588e-05,
"loss": 0.9368,
"step": 431
},
{
"epoch": 1.5131348511383538,
"grad_norm": 10.410888671875,
"learning_rate": 2.313272423477076e-05,
"loss": 0.9173,
"step": 432
},
{
"epoch": 1.5166374781085814,
"grad_norm": 10.296022415161133,
"learning_rate": 2.3052000944467e-05,
"loss": 0.5623,
"step": 433
},
{
"epoch": 1.5201401050788093,
"grad_norm": 9.609498977661133,
"learning_rate": 2.2971226717425677e-05,
"loss": 1.3033,
"step": 434
},
{
"epoch": 1.5236427320490367,
"grad_norm": 10.810608863830566,
"learning_rate": 2.2890402901737997e-05,
"loss": 1.5715,
"step": 435
},
{
"epoch": 1.5271453590192645,
"grad_norm": 11.768206596374512,
"learning_rate": 2.2809530846322762e-05,
"loss": 0.7532,
"step": 436
},
{
"epoch": 1.5306479859894921,
"grad_norm": 14.947583198547363,
"learning_rate": 2.272861190090389e-05,
"loss": 1.1824,
"step": 437
},
{
"epoch": 1.5341506129597198,
"grad_norm": 7.3182454109191895,
"learning_rate": 2.2647647415987874e-05,
"loss": 0.8167,
"step": 438
},
{
"epoch": 1.5376532399299476,
"grad_norm": 30.437318801879883,
"learning_rate": 2.256663874284124e-05,
"loss": 2.3371,
"step": 439
},
{
"epoch": 1.541155866900175,
"grad_norm": 9.188960075378418,
"learning_rate": 2.2485587233468004e-05,
"loss": 0.7048,
"step": 440
},
{
"epoch": 1.5446584938704029,
"grad_norm": 13.94011402130127,
"learning_rate": 2.2404494240587097e-05,
"loss": 1.1928,
"step": 441
},
{
"epoch": 1.5481611208406305,
"grad_norm": 12.979096412658691,
"learning_rate": 2.2323361117609793e-05,
"loss": 0.995,
"step": 442
},
{
"epoch": 1.5516637478108581,
"grad_norm": 10.994502067565918,
"learning_rate": 2.224218921861713e-05,
"loss": 1.1992,
"step": 443
},
{
"epoch": 1.5551663747810858,
"grad_norm": 10.981045722961426,
"learning_rate": 2.2160979898337302e-05,
"loss": 0.824,
"step": 444
},
{
"epoch": 1.5586690017513134,
"grad_norm": 16.379398345947266,
"learning_rate": 2.2079734512123033e-05,
"loss": 1.4839,
"step": 445
},
{
"epoch": 1.5621716287215412,
"grad_norm": 12.315567970275879,
"learning_rate": 2.1998454415929e-05,
"loss": 1.313,
"step": 446
},
{
"epoch": 1.5656742556917689,
"grad_norm": 28.550540924072266,
"learning_rate": 2.1917140966289155e-05,
"loss": 1.549,
"step": 447
},
{
"epoch": 1.5691768826619965,
"grad_norm": 22.290969848632812,
"learning_rate": 2.183579552029412e-05,
"loss": 1.1429,
"step": 448
},
{
"epoch": 1.5726795096322241,
"grad_norm": 9.34792709350586,
"learning_rate": 2.175441943556852e-05,
"loss": 0.661,
"step": 449
},
{
"epoch": 1.5761821366024518,
"grad_norm": 7.548702239990234,
"learning_rate": 2.167301407024832e-05,
"loss": 0.4753,
"step": 450
},
{
"epoch": 1.5796847635726796,
"grad_norm": 18.04340362548828,
"learning_rate": 2.159158078295818e-05,
"loss": 1.06,
"step": 451
},
{
"epoch": 1.583187390542907,
"grad_norm": 11.318816184997559,
"learning_rate": 2.1510120932788766e-05,
"loss": 0.5676,
"step": 452
},
{
"epoch": 1.5866900175131349,
"grad_norm": 10.302929878234863,
"learning_rate": 2.1428635879274056e-05,
"loss": 0.8679,
"step": 453
},
{
"epoch": 1.5901926444833625,
"grad_norm": 15.172941207885742,
"learning_rate": 2.134712698236868e-05,
"loss": 2.0982,
"step": 454
},
{
"epoch": 1.5936952714535901,
"grad_norm": 9.808448791503906,
"learning_rate": 2.1265595602425182e-05,
"loss": 0.4706,
"step": 455
},
{
"epoch": 1.597197898423818,
"grad_norm": 25.450702667236328,
"learning_rate": 2.1184043100171367e-05,
"loss": 1.0416,
"step": 456
},
{
"epoch": 1.6007005253940454,
"grad_norm": 16.862489700317383,
"learning_rate": 2.1102470836687532e-05,
"loss": 1.4711,
"step": 457
},
{
"epoch": 1.6042031523642732,
"grad_norm": 14.062199592590332,
"learning_rate": 2.1020880173383822e-05,
"loss": 1.8195,
"step": 458
},
{
"epoch": 1.6077057793345009,
"grad_norm": 17.54671859741211,
"learning_rate": 2.0939272471977422e-05,
"loss": 2.3865,
"step": 459
},
{
"epoch": 1.6112084063047285,
"grad_norm": 14.068134307861328,
"learning_rate": 2.0857649094469912e-05,
"loss": 1.135,
"step": 460
},
{
"epoch": 1.6147110332749564,
"grad_norm": 18.243608474731445,
"learning_rate": 2.077601140312449e-05,
"loss": 1.3811,
"step": 461
},
{
"epoch": 1.6182136602451838,
"grad_norm": 9.20577335357666,
"learning_rate": 2.0694360760443236e-05,
"loss": 0.946,
"step": 462
},
{
"epoch": 1.6217162872154116,
"grad_norm": 10.629399299621582,
"learning_rate": 2.061269852914439e-05,
"loss": 0.9641,
"step": 463
},
{
"epoch": 1.6252189141856392,
"grad_norm": 12.289473533630371,
"learning_rate": 2.0531026072139606e-05,
"loss": 1.6794,
"step": 464
},
{
"epoch": 1.6287215411558669,
"grad_norm": 6.732086658477783,
"learning_rate": 2.0449344752511197e-05,
"loss": 0.467,
"step": 465
},
{
"epoch": 1.6322241681260947,
"grad_norm": 7.626420974731445,
"learning_rate": 2.036765593348939e-05,
"loss": 0.821,
"step": 466
},
{
"epoch": 1.6357267950963221,
"grad_norm": 7.220170497894287,
"learning_rate": 2.0285960978429576e-05,
"loss": 0.7901,
"step": 467
},
{
"epoch": 1.63922942206655,
"grad_norm": 12.389723777770996,
"learning_rate": 2.020426125078955e-05,
"loss": 1.4001,
"step": 468
},
{
"epoch": 1.6427320490367776,
"grad_norm": 7.4357171058654785,
"learning_rate": 2.012255811410677e-05,
"loss": 0.7152,
"step": 469
},
{
"epoch": 1.6462346760070052,
"grad_norm": 6.436247825622559,
"learning_rate": 2.0040852931975565e-05,
"loss": 0.6231,
"step": 470
},
{
"epoch": 1.649737302977233,
"grad_norm": 13.278726577758789,
"learning_rate": 1.9959147068024435e-05,
"loss": 1.2873,
"step": 471
},
{
"epoch": 1.6532399299474605,
"grad_norm": 12.931772232055664,
"learning_rate": 1.9877441885893233e-05,
"loss": 1.2275,
"step": 472
},
{
"epoch": 1.6567425569176883,
"grad_norm": 8.590225219726562,
"learning_rate": 1.9795738749210452e-05,
"loss": 0.5315,
"step": 473
},
{
"epoch": 1.660245183887916,
"grad_norm": 16.391572952270508,
"learning_rate": 1.971403902157043e-05,
"loss": 1.7926,
"step": 474
},
{
"epoch": 1.6637478108581436,
"grad_norm": 9.678115844726562,
"learning_rate": 1.9632344066510615e-05,
"loss": 1.1219,
"step": 475
},
{
"epoch": 1.6672504378283712,
"grad_norm": 12.876395225524902,
"learning_rate": 1.9550655247488806e-05,
"loss": 1.5153,
"step": 476
},
{
"epoch": 1.6707530647985989,
"grad_norm": 12.242756843566895,
"learning_rate": 1.94689739278604e-05,
"loss": 1.1275,
"step": 477
},
{
"epoch": 1.6742556917688267,
"grad_norm": 9.798916816711426,
"learning_rate": 1.9387301470855616e-05,
"loss": 1.3813,
"step": 478
},
{
"epoch": 1.6777583187390543,
"grad_norm": 7.478942394256592,
"learning_rate": 1.9305639239556774e-05,
"loss": 0.5468,
"step": 479
},
{
"epoch": 1.681260945709282,
"grad_norm": 13.860644340515137,
"learning_rate": 1.922398859687552e-05,
"loss": 1.3181,
"step": 480
},
{
"epoch": 1.6847635726795096,
"grad_norm": 10.224373817443848,
"learning_rate": 1.914235090553009e-05,
"loss": 1.3759,
"step": 481
},
{
"epoch": 1.6882661996497372,
"grad_norm": 13.883950233459473,
"learning_rate": 1.906072752802258e-05,
"loss": 1.993,
"step": 482
},
{
"epoch": 1.691768826619965,
"grad_norm": 13.97096061706543,
"learning_rate": 1.8979119826616185e-05,
"loss": 1.236,
"step": 483
},
{
"epoch": 1.6952714535901925,
"grad_norm": 10.855804443359375,
"learning_rate": 1.8897529163312475e-05,
"loss": 0.9965,
"step": 484
},
{
"epoch": 1.6987740805604203,
"grad_norm": 14.875264167785645,
"learning_rate": 1.8815956899828643e-05,
"loss": 1.0194,
"step": 485
},
{
"epoch": 1.702276707530648,
"grad_norm": 16.03706932067871,
"learning_rate": 1.873440439757482e-05,
"loss": 1.2932,
"step": 486
},
{
"epoch": 1.7057793345008756,
"grad_norm": 20.21481704711914,
"learning_rate": 1.8652873017631325e-05,
"loss": 1.2415,
"step": 487
},
{
"epoch": 1.7092819614711035,
"grad_norm": 20.23651123046875,
"learning_rate": 1.8571364120725947e-05,
"loss": 1.7298,
"step": 488
},
{
"epoch": 1.7127845884413309,
"grad_norm": 13.188529968261719,
"learning_rate": 1.8489879067211237e-05,
"loss": 1.1726,
"step": 489
},
{
"epoch": 1.7162872154115587,
"grad_norm": 9.632075309753418,
"learning_rate": 1.8408419217041825e-05,
"loss": 0.5913,
"step": 490
},
{
"epoch": 1.7197898423817863,
"grad_norm": 14.285216331481934,
"learning_rate": 1.832698592975168e-05,
"loss": 1.5428,
"step": 491
},
{
"epoch": 1.723292469352014,
"grad_norm": 19.85175132751465,
"learning_rate": 1.8245580564431486e-05,
"loss": 2.9951,
"step": 492
},
{
"epoch": 1.7267950963222418,
"grad_norm": 9.290555000305176,
"learning_rate": 1.8164204479705884e-05,
"loss": 0.8093,
"step": 493
},
{
"epoch": 1.7302977232924692,
"grad_norm": 10.422537803649902,
"learning_rate": 1.8082859033710855e-05,
"loss": 1.3501,
"step": 494
},
{
"epoch": 1.733800350262697,
"grad_norm": 5.478065013885498,
"learning_rate": 1.800154558407101e-05,
"loss": 0.4231,
"step": 495
},
{
"epoch": 1.7373029772329247,
"grad_norm": 9.295846939086914,
"learning_rate": 1.792026548787697e-05,
"loss": 1.4291,
"step": 496
},
{
"epoch": 1.7408056042031523,
"grad_norm": 9.88309097290039,
"learning_rate": 1.78390201016627e-05,
"loss": 0.8173,
"step": 497
},
{
"epoch": 1.7443082311733802,
"grad_norm": 15.843478202819824,
"learning_rate": 1.7757810781382875e-05,
"loss": 1.7793,
"step": 498
},
{
"epoch": 1.7478108581436076,
"grad_norm": 12.055654525756836,
"learning_rate": 1.767663888239021e-05,
"loss": 0.4423,
"step": 499
},
{
"epoch": 1.7513134851138354,
"grad_norm": 6.999086380004883,
"learning_rate": 1.759550575941291e-05,
"loss": 0.561,
"step": 500
},
{
"epoch": 1.754816112084063,
"grad_norm": 10.667252540588379,
"learning_rate": 1.7514412766531995e-05,
"loss": 1.246,
"step": 501
},
{
"epoch": 1.7583187390542907,
"grad_norm": 10.708955764770508,
"learning_rate": 1.7433361257158764e-05,
"loss": 0.9596,
"step": 502
},
{
"epoch": 1.7618213660245186,
"grad_norm": 12.158077239990234,
"learning_rate": 1.735235258401213e-05,
"loss": 0.5837,
"step": 503
},
{
"epoch": 1.765323992994746,
"grad_norm": 9.02990436553955,
"learning_rate": 1.7271388099096115e-05,
"loss": 1.3399,
"step": 504
},
{
"epoch": 1.7688266199649738,
"grad_norm": 13.258699417114258,
"learning_rate": 1.7190469153677248e-05,
"loss": 1.6198,
"step": 505
},
{
"epoch": 1.7723292469352014,
"grad_norm": 12.630635261535645,
"learning_rate": 1.710959709826201e-05,
"loss": 1.5737,
"step": 506
},
{
"epoch": 1.775831873905429,
"grad_norm": 6.134144306182861,
"learning_rate": 1.7028773282574326e-05,
"loss": 0.5122,
"step": 507
},
{
"epoch": 1.7793345008756567,
"grad_norm": 11.753387451171875,
"learning_rate": 1.6947999055533006e-05,
"loss": 1.0792,
"step": 508
},
{
"epoch": 1.7828371278458843,
"grad_norm": 4.01309061050415,
"learning_rate": 1.6867275765229248e-05,
"loss": 0.1886,
"step": 509
},
{
"epoch": 1.7863397548161122,
"grad_norm": 9.992500305175781,
"learning_rate": 1.678660475890413e-05,
"loss": 1.2433,
"step": 510
},
{
"epoch": 1.7898423817863398,
"grad_norm": 15.76321792602539,
"learning_rate": 1.6705987382926126e-05,
"loss": 1.9447,
"step": 511
},
{
"epoch": 1.7933450087565674,
"grad_norm": 16.34722900390625,
"learning_rate": 1.6625424982768663e-05,
"loss": 1.7307,
"step": 512
},
{
"epoch": 1.796847635726795,
"grad_norm": 8.70386028289795,
"learning_rate": 1.6544918902987623e-05,
"loss": 0.7112,
"step": 513
},
{
"epoch": 1.8003502626970227,
"grad_norm": 23.811084747314453,
"learning_rate": 1.646447048719892e-05,
"loss": 1.7522,
"step": 514
},
{
"epoch": 1.8038528896672505,
"grad_norm": 9.175934791564941,
"learning_rate": 1.6384081078056094e-05,
"loss": 1.4078,
"step": 515
},
{
"epoch": 1.807355516637478,
"grad_norm": 11.81799602508545,
"learning_rate": 1.6303752017227854e-05,
"loss": 0.7936,
"step": 516
},
{
"epoch": 1.8108581436077058,
"grad_norm": 10.776399612426758,
"learning_rate": 1.6223484645375772e-05,
"loss": 1.6203,
"step": 517
},
{
"epoch": 1.8143607705779334,
"grad_norm": 14.465993881225586,
"learning_rate": 1.6143280302131804e-05,
"loss": 2.6888,
"step": 518
},
{
"epoch": 1.817863397548161,
"grad_norm": 10.451493263244629,
"learning_rate": 1.606314032607601e-05,
"loss": 0.9084,
"step": 519
},
{
"epoch": 1.821366024518389,
"grad_norm": 11.41504192352295,
"learning_rate": 1.598306605471417e-05,
"loss": 0.6167,
"step": 520
},
{
"epoch": 1.8248686514886163,
"grad_norm": 7.008203983306885,
"learning_rate": 1.590305882445549e-05,
"loss": 0.4968,
"step": 521
},
{
"epoch": 1.8283712784588442,
"grad_norm": 15.883779525756836,
"learning_rate": 1.5823119970590283e-05,
"loss": 1.2069,
"step": 522
},
{
"epoch": 1.8318739054290718,
"grad_norm": 13.613566398620605,
"learning_rate": 1.5743250827267686e-05,
"loss": 1.4072,
"step": 523
},
{
"epoch": 1.8353765323992994,
"grad_norm": 19.81671142578125,
"learning_rate": 1.5663452727473388e-05,
"loss": 1.2059,
"step": 524
},
{
"epoch": 1.8388791593695273,
"grad_norm": 9.457639694213867,
"learning_rate": 1.558372700300739e-05,
"loss": 0.8344,
"step": 525
},
{
"epoch": 1.8423817863397547,
"grad_norm": 18.07425880432129,
"learning_rate": 1.5504074984461775e-05,
"loss": 2.2088,
"step": 526
},
{
"epoch": 1.8458844133099825,
"grad_norm": 22.424962997436523,
"learning_rate": 1.542449800119852e-05,
"loss": 1.6593,
"step": 527
},
{
"epoch": 1.8493870402802102,
"grad_norm": 16.835895538330078,
"learning_rate": 1.5344997381327265e-05,
"loss": 0.8791,
"step": 528
},
{
"epoch": 1.8528896672504378,
"grad_norm": 11.287313461303711,
"learning_rate": 1.5265574451683186e-05,
"loss": 0.8198,
"step": 529
},
{
"epoch": 1.8563922942206657,
"grad_norm": 25.34825897216797,
"learning_rate": 1.5186230537804846e-05,
"loss": 1.4374,
"step": 530
},
{
"epoch": 1.859894921190893,
"grad_norm": 19.87013816833496,
"learning_rate": 1.5106966963912047e-05,
"loss": 1.2982,
"step": 531
},
{
"epoch": 1.863397548161121,
"grad_norm": 15.213961601257324,
"learning_rate": 1.5027785052883766e-05,
"loss": 0.515,
"step": 532
},
{
"epoch": 1.8669001751313485,
"grad_norm": 17.84857749938965,
"learning_rate": 1.4948686126236046e-05,
"loss": 0.9899,
"step": 533
},
{
"epoch": 1.8704028021015762,
"grad_norm": 10.338643074035645,
"learning_rate": 1.4869671504099961e-05,
"loss": 0.9608,
"step": 534
},
{
"epoch": 1.873905429071804,
"grad_norm": 26.768953323364258,
"learning_rate": 1.4790742505199558e-05,
"loss": 3.1679,
"step": 535
},
{
"epoch": 1.8774080560420314,
"grad_norm": 8.469191551208496,
"learning_rate": 1.4711900446829879e-05,
"loss": 0.4198,
"step": 536
},
{
"epoch": 1.8809106830122593,
"grad_norm": 12.908656120300293,
"learning_rate": 1.4633146644834966e-05,
"loss": 1.1931,
"step": 537
},
{
"epoch": 1.884413309982487,
"grad_norm": 13.781142234802246,
"learning_rate": 1.4554482413585888e-05,
"loss": 1.6451,
"step": 538
},
{
"epoch": 1.8879159369527145,
"grad_norm": 14.904067993164062,
"learning_rate": 1.4475909065958802e-05,
"loss": 1.1193,
"step": 539
},
{
"epoch": 1.8914185639229422,
"grad_norm": 13.490738868713379,
"learning_rate": 1.4397427913313076e-05,
"loss": 1.4777,
"step": 540
},
{
"epoch": 1.8949211908931698,
"grad_norm": 18.033458709716797,
"learning_rate": 1.4319040265469339e-05,
"loss": 0.9049,
"step": 541
},
{
"epoch": 1.8984238178633976,
"grad_norm": 13.456507682800293,
"learning_rate": 1.424074743068772e-05,
"loss": 3.3861,
"step": 542
},
{
"epoch": 1.9019264448336253,
"grad_norm": 12.611906051635742,
"learning_rate": 1.41625507156459e-05,
"loss": 1.6322,
"step": 543
},
{
"epoch": 1.905429071803853,
"grad_norm": 8.617958068847656,
"learning_rate": 1.4084451425417385e-05,
"loss": 0.4577,
"step": 544
},
{
"epoch": 1.9089316987740805,
"grad_norm": 14.570892333984375,
"learning_rate": 1.4006450863449683e-05,
"loss": 2.1014,
"step": 545
},
{
"epoch": 1.9124343257443082,
"grad_norm": 8.288101196289062,
"learning_rate": 1.3928550331542576e-05,
"loss": 0.6519,
"step": 546
},
{
"epoch": 1.915936952714536,
"grad_norm": 7.784424304962158,
"learning_rate": 1.3850751129826385e-05,
"loss": 0.6649,
"step": 547
},
{
"epoch": 1.9194395796847634,
"grad_norm": 10.094883918762207,
"learning_rate": 1.3773054556740261e-05,
"loss": 1.3525,
"step": 548
},
{
"epoch": 1.9229422066549913,
"grad_norm": 7.993988990783691,
"learning_rate": 1.369546190901052e-05,
"loss": 0.5407,
"step": 549
},
{
"epoch": 1.926444833625219,
"grad_norm": 15.709915161132812,
"learning_rate": 1.3617974481629008e-05,
"loss": 1.8723,
"step": 550
},
{
"epoch": 1.9299474605954465,
"grad_norm": 9.49156379699707,
"learning_rate": 1.3540593567831477e-05,
"loss": 0.5052,
"step": 551
},
{
"epoch": 1.9334500875656744,
"grad_norm": 13.911781311035156,
"learning_rate": 1.3463320459076028e-05,
"loss": 1.2281,
"step": 552
},
{
"epoch": 1.9369527145359018,
"grad_norm": 7.616586685180664,
"learning_rate": 1.3386156445021507e-05,
"loss": 0.763,
"step": 553
},
{
"epoch": 1.9404553415061296,
"grad_norm": 12.074398040771484,
"learning_rate": 1.3309102813506035e-05,
"loss": 1.1219,
"step": 554
},
{
"epoch": 1.9439579684763573,
"grad_norm": 18.879173278808594,
"learning_rate": 1.3232160850525477e-05,
"loss": 1.2152,
"step": 555
},
{
"epoch": 1.947460595446585,
"grad_norm": 8.424382209777832,
"learning_rate": 1.3155331840211992e-05,
"loss": 1.1847,
"step": 556
},
{
"epoch": 1.9509632224168127,
"grad_norm": 9.01657485961914,
"learning_rate": 1.3078617064812612e-05,
"loss": 0.722,
"step": 557
},
{
"epoch": 1.9544658493870402,
"grad_norm": 7.651902198791504,
"learning_rate": 1.3002017804667824e-05,
"loss": 0.3635,
"step": 558
},
{
"epoch": 1.957968476357268,
"grad_norm": 10.501350402832031,
"learning_rate": 1.2925535338190218e-05,
"loss": 1.3115,
"step": 559
},
{
"epoch": 1.9614711033274956,
"grad_norm": 10.333868026733398,
"learning_rate": 1.2849170941843126e-05,
"loss": 1.0494,
"step": 560
},
{
"epoch": 1.9649737302977233,
"grad_norm": 14.399115562438965,
"learning_rate": 1.2772925890119339e-05,
"loss": 1.755,
"step": 561
},
{
"epoch": 1.9684763572679511,
"grad_norm": 9.45056438446045,
"learning_rate": 1.269680145551985e-05,
"loss": 1.8133,
"step": 562
},
{
"epoch": 1.9719789842381785,
"grad_norm": 12.52590560913086,
"learning_rate": 1.2620798908532578e-05,
"loss": 0.8922,
"step": 563
},
{
"epoch": 1.9754816112084064,
"grad_norm": 20.283485412597656,
"learning_rate": 1.2544919517611191e-05,
"loss": 1.8731,
"step": 564
},
{
"epoch": 1.978984238178634,
"grad_norm": 16.538997650146484,
"learning_rate": 1.2469164549153922e-05,
"loss": 0.9377,
"step": 565
},
{
"epoch": 1.9824868651488616,
"grad_norm": 11.674798965454102,
"learning_rate": 1.239353526748245e-05,
"loss": 0.907,
"step": 566
},
{
"epoch": 1.9859894921190895,
"grad_norm": 11.961647033691406,
"learning_rate": 1.2318032934820794e-05,
"loss": 1.5268,
"step": 567
},
{
"epoch": 1.989492119089317,
"grad_norm": 10.949502944946289,
"learning_rate": 1.2242658811274222e-05,
"loss": 1.4954,
"step": 568
},
{
"epoch": 1.9929947460595447,
"grad_norm": 6.9351935386657715,
"learning_rate": 1.2167414154808258e-05,
"loss": 0.4464,
"step": 569
},
{
"epoch": 1.9964973730297724,
"grad_norm": 8.37053394317627,
"learning_rate": 1.2092300221227661e-05,
"loss": 0.642,
"step": 570
},
{
"epoch": 2.0,
"grad_norm": 15.300679206848145,
"learning_rate": 1.201731826415548e-05,
"loss": 1.9645,
"step": 571
},
{
"epoch": 2.003502626970228,
"grad_norm": 4.111511707305908,
"learning_rate": 1.1942469535012115e-05,
"loss": 0.2615,
"step": 572
},
{
"epoch": 2.0070052539404553,
"grad_norm": 3.611501455307007,
"learning_rate": 1.1867755282994464e-05,
"loss": 0.2108,
"step": 573
},
{
"epoch": 2.010507880910683,
"grad_norm": 6.634884834289551,
"learning_rate": 1.1793176755055037e-05,
"loss": 0.8019,
"step": 574
},
{
"epoch": 2.0140105078809105,
"grad_norm": 11.04240608215332,
"learning_rate": 1.1718735195881159e-05,
"loss": 1.0215,
"step": 575
},
{
"epoch": 2.0175131348511384,
"grad_norm": 2.4960787296295166,
"learning_rate": 1.164443184787421e-05,
"loss": 0.2112,
"step": 576
},
{
"epoch": 2.021015761821366,
"grad_norm": 6.72168493270874,
"learning_rate": 1.1570267951128853e-05,
"loss": 0.2771,
"step": 577
},
{
"epoch": 2.0245183887915936,
"grad_norm": 7.282315254211426,
"learning_rate": 1.1496244743412395e-05,
"loss": 0.3761,
"step": 578
},
{
"epoch": 2.0280210157618215,
"grad_norm": 7.026294708251953,
"learning_rate": 1.1422363460144087e-05,
"loss": 0.207,
"step": 579
},
{
"epoch": 2.031523642732049,
"grad_norm": 9.867234230041504,
"learning_rate": 1.1348625334374501e-05,
"loss": 0.4578,
"step": 580
},
{
"epoch": 2.0350262697022767,
"grad_norm": 6.709160327911377,
"learning_rate": 1.1275031596764975e-05,
"loss": 0.2914,
"step": 581
},
{
"epoch": 2.0385288966725046,
"grad_norm": 6.458284854888916,
"learning_rate": 1.1201583475567046e-05,
"loss": 0.6003,
"step": 582
},
{
"epoch": 2.042031523642732,
"grad_norm": 6.811922550201416,
"learning_rate": 1.1128282196602011e-05,
"loss": 0.4343,
"step": 583
},
{
"epoch": 2.04553415061296,
"grad_norm": 13.446271896362305,
"learning_rate": 1.105512898324039e-05,
"loss": 0.1694,
"step": 584
},
{
"epoch": 2.0490367775831873,
"grad_norm": 3.9686508178710938,
"learning_rate": 1.098212505638155e-05,
"loss": 0.2998,
"step": 585
},
{
"epoch": 2.052539404553415,
"grad_norm": 11.05303955078125,
"learning_rate": 1.0909271634433317e-05,
"loss": 0.268,
"step": 586
},
{
"epoch": 2.056042031523643,
"grad_norm": 14.222155570983887,
"learning_rate": 1.0836569933291663e-05,
"loss": 0.4609,
"step": 587
},
{
"epoch": 2.0595446584938704,
"grad_norm": 5.8074259757995605,
"learning_rate": 1.0764021166320403e-05,
"loss": 0.3518,
"step": 588
},
{
"epoch": 2.063047285464098,
"grad_norm": 7.573509216308594,
"learning_rate": 1.0691626544330913e-05,
"loss": 0.1935,
"step": 589
},
{
"epoch": 2.0665499124343256,
"grad_norm": 7.35963773727417,
"learning_rate": 1.0619387275561957e-05,
"loss": 0.142,
"step": 590
},
{
"epoch": 2.0700525394045535,
"grad_norm": 26.056726455688477,
"learning_rate": 1.0547304565659528e-05,
"loss": 0.5094,
"step": 591
},
{
"epoch": 2.073555166374781,
"grad_norm": 12.973404884338379,
"learning_rate": 1.0475379617656692e-05,
"loss": 0.4237,
"step": 592
},
{
"epoch": 2.0770577933450087,
"grad_norm": 15.458786964416504,
"learning_rate": 1.0403613631953544e-05,
"loss": 0.3591,
"step": 593
},
{
"epoch": 2.0805604203152366,
"grad_norm": 9.443426132202148,
"learning_rate": 1.0332007806297137e-05,
"loss": 0.3133,
"step": 594
},
{
"epoch": 2.084063047285464,
"grad_norm": 11.876457214355469,
"learning_rate": 1.0260563335761548e-05,
"loss": 0.191,
"step": 595
},
{
"epoch": 2.087565674255692,
"grad_norm": 16.497386932373047,
"learning_rate": 1.0189281412727863e-05,
"loss": 0.3268,
"step": 596
},
{
"epoch": 2.0910683012259192,
"grad_norm": 6.713415622711182,
"learning_rate": 1.0118163226864324e-05,
"loss": 0.1679,
"step": 597
},
{
"epoch": 2.094570928196147,
"grad_norm": 8.332857131958008,
"learning_rate": 1.0047209965106471e-05,
"loss": 0.3882,
"step": 598
},
{
"epoch": 2.098073555166375,
"grad_norm": 15.426718711853027,
"learning_rate": 9.976422811637323e-06,
"loss": 0.6588,
"step": 599
},
{
"epoch": 2.1015761821366024,
"grad_norm": 19.400615692138672,
"learning_rate": 9.9058029478676e-06,
"loss": 0.5528,
"step": 600
},
{
"epoch": 2.10507880910683,
"grad_norm": 10.129969596862793,
"learning_rate": 9.835351552416026e-06,
"loss": 0.4625,
"step": 601
},
{
"epoch": 2.1085814360770576,
"grad_norm": 12.683780670166016,
"learning_rate": 9.765069801089658e-06,
"loss": 0.7214,
"step": 602
},
{
"epoch": 2.1120840630472855,
"grad_norm": 9.236724853515625,
"learning_rate": 9.694958866864262e-06,
"loss": 0.4221,
"step": 603
},
{
"epoch": 2.1155866900175133,
"grad_norm": 5.072461128234863,
"learning_rate": 9.62501991986472e-06,
"loss": 0.1352,
"step": 604
},
{
"epoch": 2.1190893169877407,
"grad_norm": 5.266534805297852,
"learning_rate": 9.555254127345506e-06,
"loss": 0.3825,
"step": 605
},
{
"epoch": 2.1225919439579686,
"grad_norm": 2.0258290767669678,
"learning_rate": 9.485662653671237e-06,
"loss": 0.1042,
"step": 606
},
{
"epoch": 2.126094570928196,
"grad_norm": 7.214817047119141,
"learning_rate": 9.416246660297186e-06,
"loss": 0.1874,
"step": 607
},
{
"epoch": 2.129597197898424,
"grad_norm": 14.512842178344727,
"learning_rate": 9.347007305749954e-06,
"loss": 0.2769,
"step": 608
},
{
"epoch": 2.1330998248686517,
"grad_norm": 12.86308765411377,
"learning_rate": 9.277945745608076e-06,
"loss": 0.4056,
"step": 609
},
{
"epoch": 2.136602451838879,
"grad_norm": 8.935503959655762,
"learning_rate": 9.2090631324828e-06,
"loss": 0.267,
"step": 610
},
{
"epoch": 2.140105078809107,
"grad_norm": 10.452750205993652,
"learning_rate": 9.140360615998789e-06,
"loss": 0.9766,
"step": 611
},
{
"epoch": 2.1436077057793343,
"grad_norm": 12.40889835357666,
"learning_rate": 9.071839342774966e-06,
"loss": 0.6476,
"step": 612
},
{
"epoch": 2.147110332749562,
"grad_norm": 7.821516513824463,
"learning_rate": 9.003500456405385e-06,
"loss": 0.2597,
"step": 613
},
{
"epoch": 2.1506129597197896,
"grad_norm": 32.6134147644043,
"learning_rate": 8.935345097440131e-06,
"loss": 0.3772,
"step": 614
},
{
"epoch": 2.1541155866900175,
"grad_norm": 8.472585678100586,
"learning_rate": 8.867374403366276e-06,
"loss": 0.2884,
"step": 615
},
{
"epoch": 2.1576182136602453,
"grad_norm": 8.967446327209473,
"learning_rate": 8.7995895085889e-06,
"loss": 0.2359,
"step": 616
},
{
"epoch": 2.1611208406304727,
"grad_norm": 4.8656415939331055,
"learning_rate": 8.731991544412184e-06,
"loss": 0.201,
"step": 617
},
{
"epoch": 2.1646234676007006,
"grad_norm": 17.89019012451172,
"learning_rate": 8.6645816390205e-06,
"loss": 0.5396,
"step": 618
},
{
"epoch": 2.168126094570928,
"grad_norm": 4.795694351196289,
"learning_rate": 8.597360917459585e-06,
"loss": 0.3591,
"step": 619
},
{
"epoch": 2.171628721541156,
"grad_norm": 9.445202827453613,
"learning_rate": 8.53033050161776e-06,
"loss": 0.275,
"step": 620
},
{
"epoch": 2.1751313485113837,
"grad_norm": 12.556199073791504,
"learning_rate": 8.46349151020725e-06,
"loss": 0.3625,
"step": 621
},
{
"epoch": 2.178633975481611,
"grad_norm": 3.0647783279418945,
"learning_rate": 8.396845058745433e-06,
"loss": 0.1984,
"step": 622
},
{
"epoch": 2.182136602451839,
"grad_norm": 13.919934272766113,
"learning_rate": 8.330392259536315e-06,
"loss": 0.3918,
"step": 623
},
{
"epoch": 2.1856392294220663,
"grad_norm": 5.176346302032471,
"learning_rate": 8.264134221651879e-06,
"loss": 0.3719,
"step": 624
},
{
"epoch": 2.189141856392294,
"grad_norm": 6.9213762283325195,
"learning_rate": 8.198072050913643e-06,
"loss": 0.3471,
"step": 625
},
{
"epoch": 2.192644483362522,
"grad_norm": 2.5226728916168213,
"learning_rate": 8.132206849874166e-06,
"loss": 0.2453,
"step": 626
},
{
"epoch": 2.1961471103327495,
"grad_norm": 4.744520664215088,
"learning_rate": 8.066539717798648e-06,
"loss": 0.2357,
"step": 627
},
{
"epoch": 2.1996497373029773,
"grad_norm": 7.503551006317139,
"learning_rate": 8.001071750646615e-06,
"loss": 0.5094,
"step": 628
},
{
"epoch": 2.2031523642732047,
"grad_norm": 5.823261737823486,
"learning_rate": 7.935804041053594e-06,
"loss": 0.2253,
"step": 629
},
{
"epoch": 2.2066549912434326,
"grad_norm": 10.464018821716309,
"learning_rate": 7.870737678312887e-06,
"loss": 0.3054,
"step": 630
},
{
"epoch": 2.2101576182136604,
"grad_norm": 4.4330735206604,
"learning_rate": 7.805873748357387e-06,
"loss": 0.1626,
"step": 631
},
{
"epoch": 2.213660245183888,
"grad_norm": 4.565890312194824,
"learning_rate": 7.74121333374148e-06,
"loss": 0.2295,
"step": 632
},
{
"epoch": 2.2171628721541157,
"grad_norm": 8.96017074584961,
"learning_rate": 7.67675751362295e-06,
"loss": 0.6048,
"step": 633
},
{
"epoch": 2.220665499124343,
"grad_norm": 10.378385543823242,
"learning_rate": 7.612507363744972e-06,
"loss": 0.8332,
"step": 634
},
{
"epoch": 2.224168126094571,
"grad_norm": 7.120364665985107,
"learning_rate": 7.548463956418157e-06,
"loss": 0.2532,
"step": 635
},
{
"epoch": 2.227670753064799,
"grad_norm": 3.8531832695007324,
"learning_rate": 7.484628360502683e-06,
"loss": 0.5231,
"step": 636
},
{
"epoch": 2.231173380035026,
"grad_norm": 8.29166316986084,
"learning_rate": 7.421001641390406e-06,
"loss": 0.74,
"step": 637
},
{
"epoch": 2.234676007005254,
"grad_norm": 14.644600868225098,
"learning_rate": 7.357584860987139e-06,
"loss": 0.712,
"step": 638
},
{
"epoch": 2.2381786339754814,
"grad_norm": 8.683403968811035,
"learning_rate": 7.2943790776948645e-06,
"loss": 0.1712,
"step": 639
},
{
"epoch": 2.2416812609457093,
"grad_norm": 11.98154067993164,
"learning_rate": 7.231385346394133e-06,
"loss": 1.1342,
"step": 640
},
{
"epoch": 2.245183887915937,
"grad_norm": 5.7232136726379395,
"learning_rate": 7.168604718426408e-06,
"loss": 0.2113,
"step": 641
},
{
"epoch": 2.2486865148861646,
"grad_norm": 7.635982990264893,
"learning_rate": 7.106038241576541e-06,
"loss": 0.2895,
"step": 642
},
{
"epoch": 2.2521891418563924,
"grad_norm": 16.10260772705078,
"learning_rate": 7.043686960055292e-06,
"loss": 0.2543,
"step": 643
},
{
"epoch": 2.25569176882662,
"grad_norm": 11.05922794342041,
"learning_rate": 6.981551914481894e-06,
"loss": 0.431,
"step": 644
},
{
"epoch": 2.2591943957968477,
"grad_norm": 5.137984275817871,
"learning_rate": 6.919634141866674e-06,
"loss": 0.2008,
"step": 645
},
{
"epoch": 2.2626970227670755,
"grad_norm": 10.650192260742188,
"learning_rate": 6.857934675593758e-06,
"loss": 0.4342,
"step": 646
},
{
"epoch": 2.266199649737303,
"grad_norm": 19.132814407348633,
"learning_rate": 6.79645454540381e-06,
"loss": 0.4469,
"step": 647
},
{
"epoch": 2.2697022767075308,
"grad_norm": 7.868581771850586,
"learning_rate": 6.735194777376901e-06,
"loss": 0.4569,
"step": 648
},
{
"epoch": 2.273204903677758,
"grad_norm": 5.338744163513184,
"learning_rate": 6.6741563939152946e-06,
"loss": 0.5621,
"step": 649
},
{
"epoch": 2.276707530647986,
"grad_norm": 8.34805679321289,
"learning_rate": 6.61334041372645e-06,
"loss": 0.5147,
"step": 650
},
{
"epoch": 2.280210157618214,
"grad_norm": 7.190115451812744,
"learning_rate": 6.552747851805989e-06,
"loss": 0.3024,
"step": 651
},
{
"epoch": 2.2837127845884413,
"grad_norm": 6.776481628417969,
"learning_rate": 6.492379719420787e-06,
"loss": 0.1938,
"step": 652
},
{
"epoch": 2.287215411558669,
"grad_norm": 8.968748092651367,
"learning_rate": 6.432237024092065e-06,
"loss": 0.2207,
"step": 653
},
{
"epoch": 2.2907180385288965,
"grad_norm": 4.696019172668457,
"learning_rate": 6.372320769578581e-06,
"loss": 0.3357,
"step": 654
},
{
"epoch": 2.2942206654991244,
"grad_norm": 1.6001213788986206,
"learning_rate": 6.312631955859878e-06,
"loss": 0.1304,
"step": 655
},
{
"epoch": 2.2977232924693523,
"grad_norm": 5.230320930480957,
"learning_rate": 6.253171579119619e-06,
"loss": 0.4285,
"step": 656
},
{
"epoch": 2.3012259194395797,
"grad_norm": 12.429251670837402,
"learning_rate": 6.1939406317289165e-06,
"loss": 0.4969,
"step": 657
},
{
"epoch": 2.3047285464098075,
"grad_norm": 6.5600266456604,
"learning_rate": 6.134940102229811e-06,
"loss": 0.256,
"step": 658
},
{
"epoch": 2.308231173380035,
"grad_norm": 8.233277320861816,
"learning_rate": 6.076170975318745e-06,
"loss": 0.2827,
"step": 659
},
{
"epoch": 2.3117338003502628,
"grad_norm": 6.555598735809326,
"learning_rate": 6.0176342318301515e-06,
"loss": 0.32,
"step": 660
},
{
"epoch": 2.31523642732049,
"grad_norm": 8.620915412902832,
"learning_rate": 5.959330848720062e-06,
"loss": 0.1916,
"step": 661
},
{
"epoch": 2.318739054290718,
"grad_norm": 9.808154106140137,
"learning_rate": 5.901261799049807e-06,
"loss": 0.2856,
"step": 662
},
{
"epoch": 2.322241681260946,
"grad_norm": 8.030816078186035,
"learning_rate": 5.843428051969797e-06,
"loss": 0.3204,
"step": 663
},
{
"epoch": 2.3257443082311733,
"grad_norm": 5.74130916595459,
"learning_rate": 5.785830572703326e-06,
"loss": 0.2252,
"step": 664
},
{
"epoch": 2.329246935201401,
"grad_norm": 7.626296043395996,
"learning_rate": 5.728470322530464e-06,
"loss": 0.2844,
"step": 665
},
{
"epoch": 2.3327495621716285,
"grad_norm": 13.404093742370605,
"learning_rate": 5.671348258772007e-06,
"loss": 0.5269,
"step": 666
},
{
"epoch": 2.3362521891418564,
"grad_norm": 14.98967170715332,
"learning_rate": 5.614465334773533e-06,
"loss": 0.6109,
"step": 667
},
{
"epoch": 2.3397548161120842,
"grad_norm": 11.958736419677734,
"learning_rate": 5.5578224998894625e-06,
"loss": 0.214,
"step": 668
},
{
"epoch": 2.3432574430823117,
"grad_norm": 11.929584503173828,
"learning_rate": 5.501420699467204e-06,
"loss": 0.7827,
"step": 669
},
{
"epoch": 2.3467600700525395,
"grad_norm": 19.051668167114258,
"learning_rate": 5.445260874831402e-06,
"loss": 0.4145,
"step": 670
},
{
"epoch": 2.350262697022767,
"grad_norm": 8.200482368469238,
"learning_rate": 5.389343963268225e-06,
"loss": 0.1799,
"step": 671
},
{
"epoch": 2.3537653239929948,
"grad_norm": 9.073884963989258,
"learning_rate": 5.3336708980097e-06,
"loss": 0.1788,
"step": 672
},
{
"epoch": 2.357267950963222,
"grad_norm": 15.959501266479492,
"learning_rate": 5.2782426082181645e-06,
"loss": 0.659,
"step": 673
},
{
"epoch": 2.36077057793345,
"grad_norm": 7.414675712585449,
"learning_rate": 5.223060018970733e-06,
"loss": 0.335,
"step": 674
},
{
"epoch": 2.364273204903678,
"grad_norm": 9.217352867126465,
"learning_rate": 5.168124051243893e-06,
"loss": 0.4305,
"step": 675
},
{
"epoch": 2.3677758318739053,
"grad_norm": 12.484313011169434,
"learning_rate": 5.1134356218980866e-06,
"loss": 0.5912,
"step": 676
},
{
"epoch": 2.371278458844133,
"grad_norm": 9.667016983032227,
"learning_rate": 5.058995643662443e-06,
"loss": 0.4746,
"step": 677
},
{
"epoch": 2.3747810858143605,
"grad_norm": 15.732196807861328,
"learning_rate": 5.004805025119546e-06,
"loss": 0.4182,
"step": 678
},
{
"epoch": 2.3782837127845884,
"grad_norm": 5.530452728271484,
"learning_rate": 4.950864670690257e-06,
"loss": 0.3474,
"step": 679
},
{
"epoch": 2.3817863397548162,
"grad_norm": 7.507744312286377,
"learning_rate": 4.897175480618619e-06,
"loss": 0.2377,
"step": 680
},
{
"epoch": 2.3852889667250436,
"grad_norm": 0.8480662703514099,
"learning_rate": 4.8437383509568324e-06,
"loss": 0.1067,
"step": 681
},
{
"epoch": 2.3887915936952715,
"grad_norm": 8.764185905456543,
"learning_rate": 4.790554173550319e-06,
"loss": 0.4602,
"step": 682
},
{
"epoch": 2.392294220665499,
"grad_norm": 8.413671493530273,
"learning_rate": 4.737623836022822e-06,
"loss": 0.4463,
"step": 683
},
{
"epoch": 2.3957968476357268,
"grad_norm": 6.1764068603515625,
"learning_rate": 4.684948221761583e-06,
"loss": 0.354,
"step": 684
},
{
"epoch": 2.3992994746059546,
"grad_norm": 5.312464714050293,
"learning_rate": 4.632528209902607e-06,
"loss": 0.1352,
"step": 685
},
{
"epoch": 2.402802101576182,
"grad_norm": 10.079058647155762,
"learning_rate": 4.580364675316014e-06,
"loss": 0.4412,
"step": 686
},
{
"epoch": 2.40630472854641,
"grad_norm": 6.57410192489624,
"learning_rate": 4.528458488591385e-06,
"loss": 0.6779,
"step": 687
},
{
"epoch": 2.4098073555166373,
"grad_norm": 7.1317009925842285,
"learning_rate": 4.476810516023293e-06,
"loss": 0.1367,
"step": 688
},
{
"epoch": 2.413309982486865,
"grad_norm": 9.098307609558105,
"learning_rate": 4.425421619596786e-06,
"loss": 0.4837,
"step": 689
},
{
"epoch": 2.416812609457093,
"grad_norm": 12.327546119689941,
"learning_rate": 4.374292656973058e-06,
"loss": 0.2527,
"step": 690
},
{
"epoch": 2.4203152364273204,
"grad_norm": 5.387122631072998,
"learning_rate": 4.323424481475085e-06,
"loss": 0.2458,
"step": 691
},
{
"epoch": 2.4238178633975482,
"grad_norm": 9.385303497314453,
"learning_rate": 4.272817942073408e-06,
"loss": 0.2781,
"step": 692
},
{
"epoch": 2.4273204903677756,
"grad_norm": 3.463916778564453,
"learning_rate": 4.2224738833719695e-06,
"loss": 0.115,
"step": 693
},
{
"epoch": 2.4308231173380035,
"grad_norm": 10.816231727600098,
"learning_rate": 4.172393145594009e-06,
"loss": 0.2425,
"step": 694
},
{
"epoch": 2.4343257443082313,
"grad_norm": 6.746379375457764,
"learning_rate": 4.122576564568031e-06,
"loss": 0.2906,
"step": 695
},
{
"epoch": 2.4378283712784588,
"grad_norm": 1.8436380624771118,
"learning_rate": 4.073024971713859e-06,
"loss": 0.0874,
"step": 696
},
{
"epoch": 2.4413309982486866,
"grad_norm": 4.27414608001709,
"learning_rate": 4.023739194028784e-06,
"loss": 0.254,
"step": 697
},
{
"epoch": 2.444833625218914,
"grad_norm": 18.09396743774414,
"learning_rate": 3.974720054073733e-06,
"loss": 0.7811,
"step": 698
},
{
"epoch": 2.448336252189142,
"grad_norm": 13.307997703552246,
"learning_rate": 3.925968369959548e-06,
"loss": 0.5651,
"step": 699
},
{
"epoch": 2.4518388791593697,
"grad_norm": 11.251547813415527,
"learning_rate": 3.877484955333326e-06,
"loss": 0.7353,
"step": 700
},
{
"epoch": 2.455341506129597,
"grad_norm": 5.3191046714782715,
"learning_rate": 3.829270619364873e-06,
"loss": 0.2166,
"step": 701
},
{
"epoch": 2.458844133099825,
"grad_norm": 17.071352005004883,
"learning_rate": 3.781326166733148e-06,
"loss": 0.3852,
"step": 702
},
{
"epoch": 2.4623467600700524,
"grad_norm": 42.43695831298828,
"learning_rate": 3.7336523976128812e-06,
"loss": 0.8098,
"step": 703
},
{
"epoch": 2.4658493870402802,
"grad_norm": 15.574095726013184,
"learning_rate": 3.686250107661176e-06,
"loss": 0.6515,
"step": 704
},
{
"epoch": 2.469352014010508,
"grad_norm": 5.729953765869141,
"learning_rate": 3.6391200880042754e-06,
"loss": 0.4977,
"step": 705
},
{
"epoch": 2.4728546409807355,
"grad_norm": 7.877871513366699,
"learning_rate": 3.5922631252243132e-06,
"loss": 0.2416,
"step": 706
},
{
"epoch": 2.4763572679509633,
"grad_norm": 6.631842613220215,
"learning_rate": 3.5456800013462166e-06,
"loss": 0.216,
"step": 707
},
{
"epoch": 2.4798598949211907,
"grad_norm": 11.070491790771484,
"learning_rate": 3.499371493824646e-06,
"loss": 0.5384,
"step": 708
},
{
"epoch": 2.4833625218914186,
"grad_norm": 8.516698837280273,
"learning_rate": 3.4533383755310257e-06,
"loss": 0.3293,
"step": 709
},
{
"epoch": 2.4868651488616464,
"grad_norm": 4.230687618255615,
"learning_rate": 3.407581414740626e-06,
"loss": 0.1136,
"step": 710
},
{
"epoch": 2.490367775831874,
"grad_norm": 6.080500602722168,
"learning_rate": 3.362101375119755e-06,
"loss": 0.434,
"step": 711
},
{
"epoch": 2.4938704028021017,
"grad_norm": 6.872626781463623,
"learning_rate": 3.316899015713009e-06,
"loss": 0.1984,
"step": 712
},
{
"epoch": 2.497373029772329,
"grad_norm": 8.121652603149414,
"learning_rate": 3.2719750909306303e-06,
"loss": 0.2077,
"step": 713
},
{
"epoch": 2.500875656742557,
"grad_norm": 7.671825408935547,
"learning_rate": 3.2273303505358667e-06,
"loss": 0.1965,
"step": 714
},
{
"epoch": 2.504378283712785,
"grad_norm": 5.99457311630249,
"learning_rate": 3.182965539632492e-06,
"loss": 0.2283,
"step": 715
},
{
"epoch": 2.507880910683012,
"grad_norm": 12.298001289367676,
"learning_rate": 3.138881398652358e-06,
"loss": 0.2892,
"step": 716
},
{
"epoch": 2.51138353765324,
"grad_norm": 5.526932716369629,
"learning_rate": 3.0950786633430564e-06,
"loss": 0.208,
"step": 717
},
{
"epoch": 2.5148861646234675,
"grad_norm": 7.946258544921875,
"learning_rate": 3.05155806475562e-06,
"loss": 1.0295,
"step": 718
},
{
"epoch": 2.5183887915936953,
"grad_norm": 7.10674524307251,
"learning_rate": 3.008320329232317e-06,
"loss": 0.3865,
"step": 719
},
{
"epoch": 2.521891418563923,
"grad_norm": 5.112523555755615,
"learning_rate": 2.965366178394542e-06,
"loss": 0.1231,
"step": 720
},
{
"epoch": 2.5253940455341506,
"grad_norm": 7.176146030426025,
"learning_rate": 2.922696329130783e-06,
"loss": 0.4793,
"step": 721
},
{
"epoch": 2.5288966725043784,
"grad_norm": 3.4290506839752197,
"learning_rate": 2.8803114935846178e-06,
"loss": 0.301,
"step": 722
},
{
"epoch": 2.532399299474606,
"grad_norm": 6.2297210693359375,
"learning_rate": 2.8382123791428795e-06,
"loss": 0.1753,
"step": 723
},
{
"epoch": 2.5359019264448337,
"grad_norm": 5.336044788360596,
"learning_rate": 2.7963996884238034e-06,
"loss": 0.2035,
"step": 724
},
{
"epoch": 2.5394045534150615,
"grad_norm": 7.6555891036987305,
"learning_rate": 2.754874119265336e-06,
"loss": 0.359,
"step": 725
},
{
"epoch": 2.542907180385289,
"grad_norm": 8.17387866973877,
"learning_rate": 2.7136363647134655e-06,
"loss": 0.7163,
"step": 726
},
{
"epoch": 2.5464098073555164,
"grad_norm": 7.864010334014893,
"learning_rate": 2.6726871130106637e-06,
"loss": 0.3539,
"step": 727
},
{
"epoch": 2.549912434325744,
"grad_norm": 6.670307159423828,
"learning_rate": 2.6320270475844023e-06,
"loss": 0.19,
"step": 728
},
{
"epoch": 2.553415061295972,
"grad_norm": 7.956202507019043,
"learning_rate": 2.5916568470357486e-06,
"loss": 0.4138,
"step": 729
},
{
"epoch": 2.5569176882662,
"grad_norm": 2.7980082035064697,
"learning_rate": 2.5515771851280248e-06,
"loss": 0.1076,
"step": 730
},
{
"epoch": 2.5604203152364273,
"grad_norm": 9.661548614501953,
"learning_rate": 2.5117887307755775e-06,
"loss": 0.2974,
"step": 731
},
{
"epoch": 2.5639229422066547,
"grad_norm": 8.196139335632324,
"learning_rate": 2.472292148032609e-06,
"loss": 0.3056,
"step": 732
},
{
"epoch": 2.5674255691768826,
"grad_norm": 7.685372829437256,
"learning_rate": 2.433088096082108e-06,
"loss": 0.3848,
"step": 733
},
{
"epoch": 2.5709281961471104,
"grad_norm": 6.782233715057373,
"learning_rate": 2.3941772292248146e-06,
"loss": 0.1736,
"step": 734
},
{
"epoch": 2.574430823117338,
"grad_norm": 3.7146668434143066,
"learning_rate": 2.3555601968683273e-06,
"loss": 0.265,
"step": 735
},
{
"epoch": 2.5779334500875657,
"grad_norm": 6.972784996032715,
"learning_rate": 2.317237643516268e-06,
"loss": 0.189,
"step": 736
},
{
"epoch": 2.581436077057793,
"grad_norm": 3.0132017135620117,
"learning_rate": 2.2792102087575053e-06,
"loss": 0.1583,
"step": 737
},
{
"epoch": 2.584938704028021,
"grad_norm": 5.253518104553223,
"learning_rate": 2.2414785272555005e-06,
"loss": 0.3482,
"step": 738
},
{
"epoch": 2.588441330998249,
"grad_norm": 4.389585018157959,
"learning_rate": 2.204043228737689e-06,
"loss": 0.2313,
"step": 739
},
{
"epoch": 2.591943957968476,
"grad_norm": 11.996556282043457,
"learning_rate": 2.166904937985006e-06,
"loss": 0.4344,
"step": 740
},
{
"epoch": 2.595446584938704,
"grad_norm": 9.555136680603027,
"learning_rate": 2.1300642748214284e-06,
"loss": 0.2702,
"step": 741
},
{
"epoch": 2.5989492119089315,
"grad_norm": 10.910072326660156,
"learning_rate": 2.0935218541036373e-06,
"loss": 0.3017,
"step": 742
},
{
"epoch": 2.6024518388791593,
"grad_norm": 4.974386692047119,
"learning_rate": 2.057278285710771e-06,
"loss": 0.3118,
"step": 743
},
{
"epoch": 2.605954465849387,
"grad_norm": 7.93409538269043,
"learning_rate": 2.0213341745342373e-06,
"loss": 0.4271,
"step": 744
},
{
"epoch": 2.6094570928196146,
"grad_norm": 3.470421314239502,
"learning_rate": 1.9856901204676093e-06,
"loss": 0.167,
"step": 745
},
{
"epoch": 2.6129597197898424,
"grad_norm": 11.024426460266113,
"learning_rate": 1.950346718396614e-06,
"loss": 0.5929,
"step": 746
},
{
"epoch": 2.61646234676007,
"grad_norm": 4.000313758850098,
"learning_rate": 1.915304558189226e-06,
"loss": 0.4232,
"step": 747
},
{
"epoch": 2.6199649737302977,
"grad_norm": 4.251917839050293,
"learning_rate": 1.8805642246858035e-06,
"loss": 0.332,
"step": 748
},
{
"epoch": 2.6234676007005255,
"grad_norm": 14.466004371643066,
"learning_rate": 1.846126297689328e-06,
"loss": 0.5515,
"step": 749
},
{
"epoch": 2.626970227670753,
"grad_norm": 7.978627681732178,
"learning_rate": 1.8119913519557264e-06,
"loss": 0.2431,
"step": 750
},
{
"epoch": 2.630472854640981,
"grad_norm": 8.531673431396484,
"learning_rate": 1.7781599571842956e-06,
"loss": 0.4857,
"step": 751
},
{
"epoch": 2.633975481611208,
"grad_norm": 6.550644874572754,
"learning_rate": 1.7446326780081713e-06,
"loss": 0.5275,
"step": 752
},
{
"epoch": 2.637478108581436,
"grad_norm": 8.497308731079102,
"learning_rate": 1.7114100739849249e-06,
"loss": 0.3287,
"step": 753
},
{
"epoch": 2.640980735551664,
"grad_norm": 3.7064123153686523,
"learning_rate": 1.678492699587204e-06,
"loss": 0.1416,
"step": 754
},
{
"epoch": 2.6444833625218913,
"grad_norm": 1.5083993673324585,
"learning_rate": 1.6458811041935007e-06,
"loss": 0.1093,
"step": 755
},
{
"epoch": 2.647985989492119,
"grad_norm": 6.478839874267578,
"learning_rate": 1.6135758320789641e-06,
"loss": 0.3954,
"step": 756
},
{
"epoch": 2.6514886164623466,
"grad_norm": 9.450393676757812,
"learning_rate": 1.5815774224063218e-06,
"loss": 0.4096,
"step": 757
},
{
"epoch": 2.6549912434325744,
"grad_norm": 5.808910369873047,
"learning_rate": 1.5498864092168941e-06,
"loss": 0.2206,
"step": 758
},
{
"epoch": 2.6584938704028023,
"grad_norm": 3.4636878967285156,
"learning_rate": 1.5185033214216649e-06,
"loss": 0.3483,
"step": 759
},
{
"epoch": 2.6619964973730297,
"grad_norm": 11.797184944152832,
"learning_rate": 1.4874286827924599e-06,
"loss": 0.625,
"step": 760
},
{
"epoch": 2.6654991243432575,
"grad_norm": 11.192656517028809,
"learning_rate": 1.456663011953201e-06,
"loss": 0.2856,
"step": 761
},
{
"epoch": 2.669001751313485,
"grad_norm": 17.491886138916016,
"learning_rate": 1.4262068223712587e-06,
"loss": 0.6158,
"step": 762
},
{
"epoch": 2.672504378283713,
"grad_norm": 8.262967109680176,
"learning_rate": 1.3960606223488848e-06,
"loss": 0.4461,
"step": 763
},
{
"epoch": 2.6760070052539406,
"grad_norm": 25.136058807373047,
"learning_rate": 1.3662249150147155e-06,
"loss": 0.8565,
"step": 764
},
{
"epoch": 2.679509632224168,
"grad_norm": 8.498688697814941,
"learning_rate": 1.3367001983153837e-06,
"loss": 0.405,
"step": 765
},
{
"epoch": 2.683012259194396,
"grad_norm": 3.612231492996216,
"learning_rate": 1.3074869650072053e-06,
"loss": 0.2578,
"step": 766
},
{
"epoch": 2.6865148861646233,
"grad_norm": 6.869486331939697,
"learning_rate": 1.2785857026479609e-06,
"loss": 0.3389,
"step": 767
},
{
"epoch": 2.690017513134851,
"grad_norm": 9.957880973815918,
"learning_rate": 1.2499968935887474e-06,
"loss": 0.2081,
"step": 768
},
{
"epoch": 2.693520140105079,
"grad_norm": 4.789759635925293,
"learning_rate": 1.221721014965942e-06,
"loss": 0.3021,
"step": 769
},
{
"epoch": 2.6970227670753064,
"grad_norm": 4.189669609069824,
"learning_rate": 1.1937585386932281e-06,
"loss": 0.1325,
"step": 770
},
{
"epoch": 2.7005253940455343,
"grad_norm": 12.296935081481934,
"learning_rate": 1.1661099314537228e-06,
"loss": 0.1675,
"step": 771
},
{
"epoch": 2.7040280210157617,
"grad_norm": 1.7808837890625,
"learning_rate": 1.1387756546921902e-06,
"loss": 0.0979,
"step": 772
},
{
"epoch": 2.7075306479859895,
"grad_norm": 9.73708438873291,
"learning_rate": 1.1117561646073316e-06,
"loss": 0.3332,
"step": 773
},
{
"epoch": 2.7110332749562174,
"grad_norm": 10.950447082519531,
"learning_rate": 1.085051912144186e-06,
"loss": 0.4934,
"step": 774
},
{
"epoch": 2.714535901926445,
"grad_norm": 7.5403618812561035,
"learning_rate": 1.0586633429865966e-06,
"loss": 0.2746,
"step": 775
},
{
"epoch": 2.7180385288966726,
"grad_norm": 7.8029279708862305,
"learning_rate": 1.0325908975497657e-06,
"loss": 0.4395,
"step": 776
},
{
"epoch": 2.7215411558669,
"grad_norm": 4.058933258056641,
"learning_rate": 1.0068350109729063e-06,
"loss": 0.1307,
"step": 777
},
{
"epoch": 2.725043782837128,
"grad_norm": 7.272954940795898,
"learning_rate": 9.813961131120009e-07,
"loss": 0.204,
"step": 778
},
{
"epoch": 2.7285464098073557,
"grad_norm": 5.053043842315674,
"learning_rate": 9.562746285326007e-07,
"loss": 0.1737,
"step": 779
},
{
"epoch": 2.732049036777583,
"grad_norm": 4.146439075469971,
"learning_rate": 9.314709765027441e-07,
"loss": 0.3323,
"step": 780
},
{
"epoch": 2.735551663747811,
"grad_norm": 9.863334655761719,
"learning_rate": 9.069855709859743e-07,
"loss": 0.3481,
"step": 781
},
{
"epoch": 2.7390542907180384,
"grad_norm": 9.322667121887207,
"learning_rate": 8.828188206344168e-07,
"loss": 0.2881,
"step": 782
},
{
"epoch": 2.7425569176882663,
"grad_norm": 17.841449737548828,
"learning_rate": 8.589711287819624e-07,
"loss": 0.5504,
"step": 783
},
{
"epoch": 2.746059544658494,
"grad_norm": 5.811645984649658,
"learning_rate": 8.354428934375414e-07,
"loss": 0.2561,
"step": 784
},
{
"epoch": 2.7495621716287215,
"grad_norm": 8.1492338180542,
"learning_rate": 8.122345072784687e-07,
"loss": 0.3995,
"step": 785
},
{
"epoch": 2.7530647985989494,
"grad_norm": 16.187278747558594,
"learning_rate": 7.893463576439076e-07,
"loss": 0.7552,
"step": 786
},
{
"epoch": 2.7565674255691768,
"grad_norm": 3.650089979171753,
"learning_rate": 7.667788265283871e-07,
"loss": 0.2169,
"step": 787
},
{
"epoch": 2.7600700525394046,
"grad_norm": 6.4505743980407715,
"learning_rate": 7.445322905754326e-07,
"loss": 0.3677,
"step": 788
},
{
"epoch": 2.7635726795096325,
"grad_norm": 5.248019695281982,
"learning_rate": 7.226071210712926e-07,
"loss": 0.1273,
"step": 789
},
{
"epoch": 2.76707530647986,
"grad_norm": 5.4500837326049805,
"learning_rate": 7.010036839387257e-07,
"loss": 0.2377,
"step": 790
},
{
"epoch": 2.7705779334500873,
"grad_norm": 11.434654235839844,
"learning_rate": 6.797223397308994e-07,
"loss": 0.436,
"step": 791
},
{
"epoch": 2.774080560420315,
"grad_norm": 3.954472064971924,
"learning_rate": 6.587634436253654e-07,
"loss": 0.3242,
"step": 792
},
{
"epoch": 2.777583187390543,
"grad_norm": 7.352035045623779,
"learning_rate": 6.381273454181536e-07,
"loss": 0.4541,
"step": 793
},
{
"epoch": 2.781085814360771,
"grad_norm": 8.057424545288086,
"learning_rate": 6.178143895179123e-07,
"loss": 0.3616,
"step": 794
},
{
"epoch": 2.7845884413309983,
"grad_norm": 21.68024253845215,
"learning_rate": 5.978249149401638e-07,
"loss": 0.6163,
"step": 795
},
{
"epoch": 2.7880910683012257,
"grad_norm": 1.6756069660186768,
"learning_rate": 5.781592553016513e-07,
"loss": 0.0949,
"step": 796
},
{
"epoch": 2.7915936952714535,
"grad_norm": 15.292009353637695,
"learning_rate": 5.588177388147741e-07,
"loss": 0.195,
"step": 797
},
{
"epoch": 2.7950963222416814,
"grad_norm": 5.428353786468506,
"learning_rate": 5.398006882821016e-07,
"loss": 0.4268,
"step": 798
},
{
"epoch": 2.7985989492119088,
"grad_norm": 4.934905529022217,
"learning_rate": 5.211084210909922e-07,
"loss": 0.4207,
"step": 799
},
{
"epoch": 2.8021015761821366,
"grad_norm": 32.14579391479492,
"learning_rate": 5.027412492082895e-07,
"loss": 1.2459,
"step": 800
},
{
"epoch": 2.805604203152364,
"grad_norm": 5.984612464904785,
"learning_rate": 4.846994791751236e-07,
"loss": 0.2008,
"step": 801
},
{
"epoch": 2.809106830122592,
"grad_norm": 8.9047269821167,
"learning_rate": 4.669834121017891e-07,
"loss": 0.2059,
"step": 802
},
{
"epoch": 2.8126094570928197,
"grad_norm": 12.699865341186523,
"learning_rate": 4.495933436627198e-07,
"loss": 0.2789,
"step": 803
},
{
"epoch": 2.816112084063047,
"grad_norm": 6.121337413787842,
"learning_rate": 4.3252956409156164e-07,
"loss": 0.334,
"step": 804
},
{
"epoch": 2.819614711033275,
"grad_norm": 22.385419845581055,
"learning_rate": 4.1579235817632126e-07,
"loss": 0.7606,
"step": 805
},
{
"epoch": 2.8231173380035024,
"grad_norm": 2.840463161468506,
"learning_rate": 3.993820052546116e-07,
"loss": 0.2278,
"step": 806
},
{
"epoch": 2.8266199649737302,
"grad_norm": 9.669700622558594,
"learning_rate": 3.832987792089937e-07,
"loss": 0.2599,
"step": 807
},
{
"epoch": 2.830122591943958,
"grad_norm": 6.415764331817627,
"learning_rate": 3.6754294846240934e-07,
"loss": 0.4672,
"step": 808
},
{
"epoch": 2.8336252189141855,
"grad_norm": 12.454601287841797,
"learning_rate": 3.5211477597369535e-07,
"loss": 0.3072,
"step": 809
},
{
"epoch": 2.8371278458844134,
"grad_norm": 3.790520191192627,
"learning_rate": 3.370145192331964e-07,
"loss": 0.2364,
"step": 810
},
{
"epoch": 2.8406304728546408,
"grad_norm": 6.003750801086426,
"learning_rate": 3.222424302584615e-07,
"loss": 0.1876,
"step": 811
},
{
"epoch": 2.8441330998248686,
"grad_norm": 8.819941520690918,
"learning_rate": 3.0779875559005636e-07,
"loss": 0.4489,
"step": 812
},
{
"epoch": 2.8476357267950965,
"grad_norm": 9.377801895141602,
"learning_rate": 2.936837362874245e-07,
"loss": 0.3392,
"step": 813
},
{
"epoch": 2.851138353765324,
"grad_norm": 5.5884246826171875,
"learning_rate": 2.798976079248861e-07,
"loss": 0.3151,
"step": 814
},
{
"epoch": 2.8546409807355517,
"grad_norm": 29.302658081054688,
"learning_rate": 2.664406005876874e-07,
"loss": 0.6223,
"step": 815
},
{
"epoch": 2.858143607705779,
"grad_norm": 9.293455123901367,
"learning_rate": 2.5331293886817764e-07,
"loss": 0.379,
"step": 816
},
{
"epoch": 2.861646234676007,
"grad_norm": 3.9380757808685303,
"learning_rate": 2.4051484186204953e-07,
"loss": 0.3059,
"step": 817
},
{
"epoch": 2.865148861646235,
"grad_norm": 8.622139930725098,
"learning_rate": 2.2804652316468e-07,
"loss": 0.3891,
"step": 818
},
{
"epoch": 2.8686514886164622,
"grad_norm": 5.322511196136475,
"learning_rate": 2.1590819086758197e-07,
"loss": 0.3623,
"step": 819
},
{
"epoch": 2.87215411558669,
"grad_norm": 3.123744249343872,
"learning_rate": 2.0410004755491597e-07,
"loss": 0.0961,
"step": 820
},
{
"epoch": 2.8756567425569175,
"grad_norm": 8.360587120056152,
"learning_rate": 1.926222903001085e-07,
"loss": 0.4422,
"step": 821
},
{
"epoch": 2.8791593695271454,
"grad_norm": 8.696887969970703,
"learning_rate": 1.814751106625745e-07,
"loss": 0.3083,
"step": 822
},
{
"epoch": 2.882661996497373,
"grad_norm": 9.53421401977539,
"learning_rate": 1.7065869468450902e-07,
"loss": 0.4419,
"step": 823
},
{
"epoch": 2.8861646234676006,
"grad_norm": 8.00709056854248,
"learning_rate": 1.6017322288779614e-07,
"loss": 0.3319,
"step": 824
},
{
"epoch": 2.8896672504378285,
"grad_norm": 2.6990621089935303,
"learning_rate": 1.5001887027097594e-07,
"loss": 0.1647,
"step": 825
},
{
"epoch": 2.893169877408056,
"grad_norm": 6.979440689086914,
"learning_rate": 1.4019580630634244e-07,
"loss": 0.3581,
"step": 826
},
{
"epoch": 2.8966725043782837,
"grad_norm": 7.536319732666016,
"learning_rate": 1.307041949371013e-07,
"loss": 0.3931,
"step": 827
},
{
"epoch": 2.9001751313485116,
"grad_norm": 11.709566116333008,
"learning_rate": 1.2154419457464984e-07,
"loss": 0.6539,
"step": 828
},
{
"epoch": 2.903677758318739,
"grad_norm": 3.197880744934082,
"learning_rate": 1.1271595809591696e-07,
"loss": 0.2004,
"step": 829
},
{
"epoch": 2.907180385288967,
"grad_norm": 3.852081775665283,
"learning_rate": 1.0421963284081848e-07,
"loss": 0.2795,
"step": 830
},
{
"epoch": 2.9106830122591942,
"grad_norm": 2.3727164268493652,
"learning_rate": 9.60553606097947e-08,
"loss": 0.2725,
"step": 831
},
{
"epoch": 2.914185639229422,
"grad_norm": 23.161338806152344,
"learning_rate": 8.822327766145889e-08,
"loss": 0.7655,
"step": 832
},
{
"epoch": 2.91768826619965,
"grad_norm": 2.332108974456787,
"learning_rate": 8.072351471029471e-08,
"loss": 0.1882,
"step": 833
},
{
"epoch": 2.9211908931698773,
"grad_norm": 10.980060577392578,
"learning_rate": 7.355619692450688e-08,
"loss": 0.2514,
"step": 834
},
{
"epoch": 2.924693520140105,
"grad_norm": 6.918917179107666,
"learning_rate": 6.672144392390501e-08,
"loss": 0.1826,
"step": 835
},
{
"epoch": 2.9281961471103326,
"grad_norm": 11.677573204040527,
"learning_rate": 6.021936977792298e-08,
"loss": 0.4053,
"step": 836
},
{
"epoch": 2.9316987740805605,
"grad_norm": 9.314156532287598,
"learning_rate": 5.405008300371384e-08,
"loss": 0.3143,
"step": 837
},
{
"epoch": 2.9352014010507883,
"grad_norm": 5.6505045890808105,
"learning_rate": 4.821368656433123e-08,
"loss": 0.2023,
"step": 838
},
{
"epoch": 2.9387040280210157,
"grad_norm": 11.518875122070312,
"learning_rate": 4.2710277867015204e-08,
"loss": 0.3932,
"step": 839
},
{
"epoch": 2.9422066549912436,
"grad_norm": 4.484135150909424,
"learning_rate": 3.753994876156908e-08,
"loss": 0.2197,
"step": 840
},
{
"epoch": 2.945709281961471,
"grad_norm": 17.19270133972168,
"learning_rate": 3.270278553882511e-08,
"loss": 1.0666,
"step": 841
},
{
"epoch": 2.949211908931699,
"grad_norm": 2.880202293395996,
"learning_rate": 2.8198868929201206e-08,
"loss": 0.1144,
"step": 842
},
{
"epoch": 2.9527145359019267,
"grad_norm": 15.712217330932617,
"learning_rate": 2.402827410135533e-08,
"loss": 0.3268,
"step": 843
},
{
"epoch": 2.956217162872154,
"grad_norm": 4.731568813323975,
"learning_rate": 2.0191070660935396e-08,
"loss": 0.2733,
"step": 844
},
{
"epoch": 2.959719789842382,
"grad_norm": 9.0734224319458,
"learning_rate": 1.668732264940687e-08,
"loss": 0.2679,
"step": 845
},
{
"epoch": 2.9632224168126093,
"grad_norm": 13.673662185668945,
"learning_rate": 1.3517088542995827e-08,
"loss": 0.6694,
"step": 846
},
{
"epoch": 2.966725043782837,
"grad_norm": 9.462847709655762,
"learning_rate": 1.068042125170754e-08,
"loss": 0.2576,
"step": 847
},
{
"epoch": 2.970227670753065,
"grad_norm": 5.057038307189941,
"learning_rate": 8.177368118440499e-09,
"loss": 0.1687,
"step": 848
},
{
"epoch": 2.9737302977232924,
"grad_norm": 4.47308874130249,
"learning_rate": 6.007970918202599e-09,
"loss": 0.2286,
"step": 849
},
{
"epoch": 2.9772329246935203,
"grad_norm": 2.5368804931640625,
"learning_rate": 4.172265857411706e-09,
"loss": 0.2175,
"step": 850
},
{
"epoch": 2.9807355516637477,
"grad_norm": 6.23403263092041,
"learning_rate": 2.670283573285026e-09,
"loss": 0.2142,
"step": 851
},
{
"epoch": 2.9842381786339756,
"grad_norm": 9.441424369812012,
"learning_rate": 1.502049133341732e-09,
"loss": 0.3126,
"step": 852
},
{
"epoch": 2.9877408056042034,
"grad_norm": 5.0686163902282715,
"learning_rate": 6.675820349655304e-10,
"loss": 0.2322,
"step": 853
},
{
"epoch": 2.991243432574431,
"grad_norm": 17.517122268676758,
"learning_rate": 1.6689620510046412e-10,
"loss": 0.6579,
"step": 854
},
{
"epoch": 2.9947460595446582,
"grad_norm": 7.781464576721191,
"learning_rate": 0.0,
"loss": 0.3209,
"step": 855
}
],
"logging_steps": 1,
"max_steps": 855,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9951049450045440.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}