GENIE_en_8b / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9939263407279775,
"eval_steps": 500,
"global_step": 1086,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027568382511307344,
"grad_norm": 7.122869968414307,
"learning_rate": 6.060606060606061e-07,
"loss": 0.4664,
"step": 1
},
{
"epoch": 0.005513676502261469,
"grad_norm": 7.916077136993408,
"learning_rate": 1.2121212121212122e-06,
"loss": 0.4771,
"step": 2
},
{
"epoch": 0.008270514753392204,
"grad_norm": 7.081696033477783,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.4614,
"step": 3
},
{
"epoch": 0.011027353004522938,
"grad_norm": 5.813568115234375,
"learning_rate": 2.4242424242424244e-06,
"loss": 0.4299,
"step": 4
},
{
"epoch": 0.013784191255653673,
"grad_norm": 4.66733455657959,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.3728,
"step": 5
},
{
"epoch": 0.01654102950678441,
"grad_norm": 6.349670886993408,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.3623,
"step": 6
},
{
"epoch": 0.01929786775791514,
"grad_norm": 3.68088436126709,
"learning_rate": 4.242424242424243e-06,
"loss": 0.2288,
"step": 7
},
{
"epoch": 0.022054706009045875,
"grad_norm": 2.508946418762207,
"learning_rate": 4.848484848484849e-06,
"loss": 0.1955,
"step": 8
},
{
"epoch": 0.02481154426017661,
"grad_norm": 7.334312915802002,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.1915,
"step": 9
},
{
"epoch": 0.027568382511307346,
"grad_norm": 6.7029619216918945,
"learning_rate": 6.060606060606061e-06,
"loss": 0.1546,
"step": 10
},
{
"epoch": 0.030325220762438078,
"grad_norm": 4.743812561035156,
"learning_rate": 6.666666666666667e-06,
"loss": 0.1565,
"step": 11
},
{
"epoch": 0.03308205901356882,
"grad_norm": 1.9065254926681519,
"learning_rate": 7.272727272727273e-06,
"loss": 0.1344,
"step": 12
},
{
"epoch": 0.03583889726469955,
"grad_norm": 1.0832651853561401,
"learning_rate": 7.87878787878788e-06,
"loss": 0.1244,
"step": 13
},
{
"epoch": 0.03859573551583028,
"grad_norm": 1.2000082731246948,
"learning_rate": 8.484848484848486e-06,
"loss": 0.1115,
"step": 14
},
{
"epoch": 0.04135257376696102,
"grad_norm": 0.937786877155304,
"learning_rate": 9.090909090909091e-06,
"loss": 0.1026,
"step": 15
},
{
"epoch": 0.04410941201809175,
"grad_norm": 1.0514339208602905,
"learning_rate": 9.696969696969698e-06,
"loss": 0.0976,
"step": 16
},
{
"epoch": 0.04686625026922248,
"grad_norm": 3.448519706726074,
"learning_rate": 1.0303030303030304e-05,
"loss": 0.1035,
"step": 17
},
{
"epoch": 0.04962308852035322,
"grad_norm": 1.5411934852600098,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.0969,
"step": 18
},
{
"epoch": 0.05237992677148395,
"grad_norm": 0.6320972442626953,
"learning_rate": 1.1515151515151517e-05,
"loss": 0.0884,
"step": 19
},
{
"epoch": 0.05513676502261469,
"grad_norm": 0.967760443687439,
"learning_rate": 1.2121212121212122e-05,
"loss": 0.0851,
"step": 20
},
{
"epoch": 0.057893603273745424,
"grad_norm": 0.5067126750946045,
"learning_rate": 1.2727272727272728e-05,
"loss": 0.079,
"step": 21
},
{
"epoch": 0.060650441524876156,
"grad_norm": 0.5082628726959229,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0787,
"step": 22
},
{
"epoch": 0.0634072797760069,
"grad_norm": 1.0926717519760132,
"learning_rate": 1.3939393939393942e-05,
"loss": 0.0751,
"step": 23
},
{
"epoch": 0.06616411802713763,
"grad_norm": 0.43889403343200684,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.0696,
"step": 24
},
{
"epoch": 0.06892095627826836,
"grad_norm": 0.3727055788040161,
"learning_rate": 1.5151515151515153e-05,
"loss": 0.0693,
"step": 25
},
{
"epoch": 0.0716777945293991,
"grad_norm": 0.4280584454536438,
"learning_rate": 1.575757575757576e-05,
"loss": 0.0727,
"step": 26
},
{
"epoch": 0.07443463278052984,
"grad_norm": 0.36678358912467957,
"learning_rate": 1.6363636363636366e-05,
"loss": 0.0654,
"step": 27
},
{
"epoch": 0.07719147103166056,
"grad_norm": 0.3085872530937195,
"learning_rate": 1.6969696969696972e-05,
"loss": 0.0614,
"step": 28
},
{
"epoch": 0.0799483092827913,
"grad_norm": 0.2715630829334259,
"learning_rate": 1.7575757575757576e-05,
"loss": 0.0598,
"step": 29
},
{
"epoch": 0.08270514753392204,
"grad_norm": 0.36059409379959106,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.0602,
"step": 30
},
{
"epoch": 0.08546198578505276,
"grad_norm": 0.3720782995223999,
"learning_rate": 1.8787878787878792e-05,
"loss": 0.0602,
"step": 31
},
{
"epoch": 0.0882188240361835,
"grad_norm": 0.24194574356079102,
"learning_rate": 1.9393939393939395e-05,
"loss": 0.0553,
"step": 32
},
{
"epoch": 0.09097566228731424,
"grad_norm": 0.5713000893592834,
"learning_rate": 2e-05,
"loss": 0.0566,
"step": 33
},
{
"epoch": 0.09373250053844497,
"grad_norm": 0.4913846552371979,
"learning_rate": 1.9999955494602408e-05,
"loss": 0.0582,
"step": 34
},
{
"epoch": 0.0964893387895757,
"grad_norm": 0.26757943630218506,
"learning_rate": 1.999982197880577e-05,
"loss": 0.0544,
"step": 35
},
{
"epoch": 0.09924617704070644,
"grad_norm": 0.31889644265174866,
"learning_rate": 1.9999599453798523e-05,
"loss": 0.055,
"step": 36
},
{
"epoch": 0.10200301529183717,
"grad_norm": 0.26153671741485596,
"learning_rate": 1.9999287921561385e-05,
"loss": 0.0508,
"step": 37
},
{
"epoch": 0.1047598535429679,
"grad_norm": 0.2633945345878601,
"learning_rate": 1.9998887384867323e-05,
"loss": 0.0506,
"step": 38
},
{
"epoch": 0.10751669179409865,
"grad_norm": 0.22508633136749268,
"learning_rate": 1.9998397847281548e-05,
"loss": 0.0511,
"step": 39
},
{
"epoch": 0.11027353004522938,
"grad_norm": 0.23420976102352142,
"learning_rate": 1.9997819313161476e-05,
"loss": 0.0504,
"step": 40
},
{
"epoch": 0.11303036829636011,
"grad_norm": 0.3166053295135498,
"learning_rate": 1.9997151787656678e-05,
"loss": 0.0564,
"step": 41
},
{
"epoch": 0.11578720654749085,
"grad_norm": 0.19986656308174133,
"learning_rate": 1.9996395276708856e-05,
"loss": 0.0512,
"step": 42
},
{
"epoch": 0.11854404479862159,
"grad_norm": 0.274948388338089,
"learning_rate": 1.9995549787051772e-05,
"loss": 0.0489,
"step": 43
},
{
"epoch": 0.12130088304975231,
"grad_norm": 0.23236143589019775,
"learning_rate": 1.9994615326211203e-05,
"loss": 0.0491,
"step": 44
},
{
"epoch": 0.12405772130088305,
"grad_norm": 0.22261761128902435,
"learning_rate": 1.9993591902504854e-05,
"loss": 0.048,
"step": 45
},
{
"epoch": 0.1268145595520138,
"grad_norm": 0.22965197265148163,
"learning_rate": 1.9992479525042305e-05,
"loss": 0.0473,
"step": 46
},
{
"epoch": 0.12957139780314453,
"grad_norm": 0.22254930436611176,
"learning_rate": 1.9991278203724908e-05,
"loss": 0.0468,
"step": 47
},
{
"epoch": 0.13232823605427527,
"grad_norm": 0.18861782550811768,
"learning_rate": 1.9989987949245725e-05,
"loss": 0.0446,
"step": 48
},
{
"epoch": 0.13508507430540598,
"grad_norm": 0.2599838078022003,
"learning_rate": 1.9988608773089413e-05,
"loss": 0.0467,
"step": 49
},
{
"epoch": 0.13784191255653672,
"grad_norm": 0.24845938384532928,
"learning_rate": 1.998714068753213e-05,
"loss": 0.0461,
"step": 50
},
{
"epoch": 0.14059875080766746,
"grad_norm": 0.18050189316272736,
"learning_rate": 1.9985583705641418e-05,
"loss": 0.0459,
"step": 51
},
{
"epoch": 0.1433555890587982,
"grad_norm": 0.2283959537744522,
"learning_rate": 1.9983937841276103e-05,
"loss": 0.0452,
"step": 52
},
{
"epoch": 0.14611242730992893,
"grad_norm": 0.21824775636196136,
"learning_rate": 1.9982203109086153e-05,
"loss": 0.044,
"step": 53
},
{
"epoch": 0.14886926556105967,
"grad_norm": 0.2457500696182251,
"learning_rate": 1.998037952451255e-05,
"loss": 0.0445,
"step": 54
},
{
"epoch": 0.15162610381219038,
"grad_norm": 0.2404290735721588,
"learning_rate": 1.9978467103787176e-05,
"loss": 0.0461,
"step": 55
},
{
"epoch": 0.15438294206332112,
"grad_norm": 0.24108456075191498,
"learning_rate": 1.9976465863932632e-05,
"loss": 0.044,
"step": 56
},
{
"epoch": 0.15713978031445186,
"grad_norm": 0.19976645708084106,
"learning_rate": 1.9974375822762117e-05,
"loss": 0.0434,
"step": 57
},
{
"epoch": 0.1598966185655826,
"grad_norm": 0.20052076876163483,
"learning_rate": 1.9972196998879254e-05,
"loss": 0.0442,
"step": 58
},
{
"epoch": 0.16265345681671334,
"grad_norm": 0.19367817044258118,
"learning_rate": 1.996992941167792e-05,
"loss": 0.0436,
"step": 59
},
{
"epoch": 0.16541029506784408,
"grad_norm": 0.21227090060710907,
"learning_rate": 1.9967573081342103e-05,
"loss": 0.0439,
"step": 60
},
{
"epoch": 0.1681671333189748,
"grad_norm": 0.20237918198108673,
"learning_rate": 1.9965128028845676e-05,
"loss": 0.0414,
"step": 61
},
{
"epoch": 0.17092397157010553,
"grad_norm": 0.16752171516418457,
"learning_rate": 1.9962594275952246e-05,
"loss": 0.0426,
"step": 62
},
{
"epoch": 0.17368080982123626,
"grad_norm": 0.1880505383014679,
"learning_rate": 1.9959971845214953e-05,
"loss": 0.0414,
"step": 63
},
{
"epoch": 0.176437648072367,
"grad_norm": 0.2021344006061554,
"learning_rate": 1.995726075997626e-05,
"loss": 0.0423,
"step": 64
},
{
"epoch": 0.17919448632349774,
"grad_norm": 0.15245415270328522,
"learning_rate": 1.9954461044367752e-05,
"loss": 0.0413,
"step": 65
},
{
"epoch": 0.18195132457462848,
"grad_norm": 0.17980217933654785,
"learning_rate": 1.9951572723309918e-05,
"loss": 0.0421,
"step": 66
},
{
"epoch": 0.18470816282575922,
"grad_norm": 0.1832534670829773,
"learning_rate": 1.994859582251194e-05,
"loss": 0.0416,
"step": 67
},
{
"epoch": 0.18746500107688993,
"grad_norm": 0.6252762079238892,
"learning_rate": 1.9945530368471444e-05,
"loss": 0.047,
"step": 68
},
{
"epoch": 0.19022183932802067,
"grad_norm": 0.17044250667095184,
"learning_rate": 1.9942376388474282e-05,
"loss": 0.0406,
"step": 69
},
{
"epoch": 0.1929786775791514,
"grad_norm": 0.1817871630191803,
"learning_rate": 1.9939133910594276e-05,
"loss": 0.042,
"step": 70
},
{
"epoch": 0.19573551583028215,
"grad_norm": 0.1658797264099121,
"learning_rate": 1.9935802963692988e-05,
"loss": 0.041,
"step": 71
},
{
"epoch": 0.1984923540814129,
"grad_norm": 0.18323002755641937,
"learning_rate": 1.9932383577419432e-05,
"loss": 0.0407,
"step": 72
},
{
"epoch": 0.20124919233254362,
"grad_norm": 0.200252503156662,
"learning_rate": 1.992887578220984e-05,
"loss": 0.0427,
"step": 73
},
{
"epoch": 0.20400603058367434,
"grad_norm": 0.19649459421634674,
"learning_rate": 1.9925279609287384e-05,
"loss": 0.0439,
"step": 74
},
{
"epoch": 0.20676286883480507,
"grad_norm": 0.17329928278923035,
"learning_rate": 1.9921595090661872e-05,
"loss": 0.0415,
"step": 75
},
{
"epoch": 0.2095197070859358,
"grad_norm": 0.4066649377346039,
"learning_rate": 1.9917822259129508e-05,
"loss": 0.0416,
"step": 76
},
{
"epoch": 0.21227654533706655,
"grad_norm": 0.1922263205051422,
"learning_rate": 1.991396114827256e-05,
"loss": 0.041,
"step": 77
},
{
"epoch": 0.2150333835881973,
"grad_norm": 0.1683184802532196,
"learning_rate": 1.9910011792459086e-05,
"loss": 0.0411,
"step": 78
},
{
"epoch": 0.21779022183932803,
"grad_norm": 0.43947067856788635,
"learning_rate": 1.9905974226842614e-05,
"loss": 0.0393,
"step": 79
},
{
"epoch": 0.22054706009045877,
"grad_norm": 0.22764648497104645,
"learning_rate": 1.9901848487361834e-05,
"loss": 0.0415,
"step": 80
},
{
"epoch": 0.22330389834158948,
"grad_norm": 0.1933886855840683,
"learning_rate": 1.989763461074029e-05,
"loss": 0.0404,
"step": 81
},
{
"epoch": 0.22606073659272022,
"grad_norm": 0.1945038139820099,
"learning_rate": 1.989333263448602e-05,
"loss": 0.0398,
"step": 82
},
{
"epoch": 0.22881757484385096,
"grad_norm": 0.21237650513648987,
"learning_rate": 1.9888942596891267e-05,
"loss": 0.0406,
"step": 83
},
{
"epoch": 0.2315744130949817,
"grad_norm": 0.19435830414295197,
"learning_rate": 1.9884464537032103e-05,
"loss": 0.0397,
"step": 84
},
{
"epoch": 0.23433125134611243,
"grad_norm": 0.17015162110328674,
"learning_rate": 1.9879898494768093e-05,
"loss": 0.0398,
"step": 85
},
{
"epoch": 0.23708808959724317,
"grad_norm": 0.19542446732521057,
"learning_rate": 1.987524451074194e-05,
"loss": 0.0386,
"step": 86
},
{
"epoch": 0.23984492784837388,
"grad_norm": 0.23729243874549866,
"learning_rate": 1.9870502626379127e-05,
"loss": 0.0406,
"step": 87
},
{
"epoch": 0.24260176609950462,
"grad_norm": 0.18620969355106354,
"learning_rate": 1.9865672883887553e-05,
"loss": 0.0395,
"step": 88
},
{
"epoch": 0.24535860435063536,
"grad_norm": 0.254768043756485,
"learning_rate": 1.9860755326257127e-05,
"loss": 0.0405,
"step": 89
},
{
"epoch": 0.2481154426017661,
"grad_norm": 0.2777726650238037,
"learning_rate": 1.985574999725943e-05,
"loss": 0.0406,
"step": 90
},
{
"epoch": 0.2508722808528968,
"grad_norm": 0.23211295902729034,
"learning_rate": 1.985065694144728e-05,
"loss": 0.0389,
"step": 91
},
{
"epoch": 0.2536291191040276,
"grad_norm": 0.22822345793247223,
"learning_rate": 1.9845476204154387e-05,
"loss": 0.0393,
"step": 92
},
{
"epoch": 0.2563859573551583,
"grad_norm": 0.2358836829662323,
"learning_rate": 1.9840207831494903e-05,
"loss": 0.0383,
"step": 93
},
{
"epoch": 0.25914279560628906,
"grad_norm": 0.2143673151731491,
"learning_rate": 1.9834851870363024e-05,
"loss": 0.0411,
"step": 94
},
{
"epoch": 0.26189963385741977,
"grad_norm": 0.21156637370586395,
"learning_rate": 1.9829408368432592e-05,
"loss": 0.0393,
"step": 95
},
{
"epoch": 0.26465647210855053,
"grad_norm": 0.22637519240379333,
"learning_rate": 1.9823877374156647e-05,
"loss": 0.0375,
"step": 96
},
{
"epoch": 0.26741331035968124,
"grad_norm": 0.2607465386390686,
"learning_rate": 1.9818258936767013e-05,
"loss": 0.0371,
"step": 97
},
{
"epoch": 0.27017014861081196,
"grad_norm": 0.1963801383972168,
"learning_rate": 1.9812553106273848e-05,
"loss": 0.0377,
"step": 98
},
{
"epoch": 0.2729269868619427,
"grad_norm": 0.1716557741165161,
"learning_rate": 1.98067599334652e-05,
"loss": 0.0378,
"step": 99
},
{
"epoch": 0.27568382511307343,
"grad_norm": 0.18233723938465118,
"learning_rate": 1.980087946990656e-05,
"loss": 0.0385,
"step": 100
},
{
"epoch": 0.2784406633642042,
"grad_norm": 0.19480392336845398,
"learning_rate": 1.9794911767940405e-05,
"loss": 0.0377,
"step": 101
},
{
"epoch": 0.2811975016153349,
"grad_norm": 0.21586693823337555,
"learning_rate": 1.978885688068572e-05,
"loss": 0.0366,
"step": 102
},
{
"epoch": 0.2839543398664656,
"grad_norm": 0.4663015305995941,
"learning_rate": 1.9782714862037544e-05,
"loss": 0.0397,
"step": 103
},
{
"epoch": 0.2867111781175964,
"grad_norm": 0.43779700994491577,
"learning_rate": 1.977648576666647e-05,
"loss": 0.0405,
"step": 104
},
{
"epoch": 0.2894680163687271,
"grad_norm": 0.15922895073890686,
"learning_rate": 1.977016965001817e-05,
"loss": 0.0365,
"step": 105
},
{
"epoch": 0.29222485461985787,
"grad_norm": 0.20054024457931519,
"learning_rate": 1.9763766568312906e-05,
"loss": 0.0397,
"step": 106
},
{
"epoch": 0.2949816928709886,
"grad_norm": 0.23604053258895874,
"learning_rate": 1.9757276578545013e-05,
"loss": 0.0379,
"step": 107
},
{
"epoch": 0.29773853112211934,
"grad_norm": 0.22856947779655457,
"learning_rate": 1.9750699738482403e-05,
"loss": 0.041,
"step": 108
},
{
"epoch": 0.30049536937325005,
"grad_norm": 0.1889025717973709,
"learning_rate": 1.974403610666606e-05,
"loss": 0.0376,
"step": 109
},
{
"epoch": 0.30325220762438077,
"grad_norm": 0.16721414029598236,
"learning_rate": 1.9737285742409495e-05,
"loss": 0.0393,
"step": 110
},
{
"epoch": 0.30600904587551153,
"grad_norm": 0.19392457604408264,
"learning_rate": 1.973044870579824e-05,
"loss": 0.0382,
"step": 111
},
{
"epoch": 0.30876588412664224,
"grad_norm": 0.17549856007099152,
"learning_rate": 1.97235250576893e-05,
"loss": 0.0386,
"step": 112
},
{
"epoch": 0.311522722377773,
"grad_norm": 0.15814520418643951,
"learning_rate": 1.971651485971062e-05,
"loss": 0.0387,
"step": 113
},
{
"epoch": 0.3142795606289037,
"grad_norm": 0.5654919147491455,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.0403,
"step": 114
},
{
"epoch": 0.3170363988800345,
"grad_norm": 0.16663803160190582,
"learning_rate": 1.9702235064507175e-05,
"loss": 0.0355,
"step": 115
},
{
"epoch": 0.3197932371311652,
"grad_norm": 0.17256957292556763,
"learning_rate": 1.9694965594388008e-05,
"loss": 0.0379,
"step": 116
},
{
"epoch": 0.3225500753822959,
"grad_norm": 0.23029914498329163,
"learning_rate": 1.9687609828609156e-05,
"loss": 0.0384,
"step": 117
},
{
"epoch": 0.3253069136334267,
"grad_norm": 0.1971399337053299,
"learning_rate": 1.9680167832644868e-05,
"loss": 0.0385,
"step": 118
},
{
"epoch": 0.3280637518845574,
"grad_norm": 0.21078315377235413,
"learning_rate": 1.9672639672736947e-05,
"loss": 0.0386,
"step": 119
},
{
"epoch": 0.33082059013568815,
"grad_norm": 0.20524373650550842,
"learning_rate": 1.966502541589414e-05,
"loss": 0.0365,
"step": 120
},
{
"epoch": 0.33357742838681886,
"grad_norm": 0.17085903882980347,
"learning_rate": 1.9657325129891558e-05,
"loss": 0.0375,
"step": 121
},
{
"epoch": 0.3363342666379496,
"grad_norm": 0.20739462971687317,
"learning_rate": 1.9649538883270053e-05,
"loss": 0.0367,
"step": 122
},
{
"epoch": 0.33909110488908034,
"grad_norm": 0.17642727494239807,
"learning_rate": 1.9641666745335626e-05,
"loss": 0.037,
"step": 123
},
{
"epoch": 0.34184794314021105,
"grad_norm": 0.18456482887268066,
"learning_rate": 1.9633708786158803e-05,
"loss": 0.038,
"step": 124
},
{
"epoch": 0.3446047813913418,
"grad_norm": 0.22288407385349274,
"learning_rate": 1.962566507657402e-05,
"loss": 0.0378,
"step": 125
},
{
"epoch": 0.34736161964247253,
"grad_norm": 0.20137764513492584,
"learning_rate": 1.961753568817896e-05,
"loss": 0.0371,
"step": 126
},
{
"epoch": 0.3501184578936033,
"grad_norm": 0.16192878782749176,
"learning_rate": 1.9609320693333967e-05,
"loss": 0.0375,
"step": 127
},
{
"epoch": 0.352875296144734,
"grad_norm": 0.7257514595985413,
"learning_rate": 1.960102016516136e-05,
"loss": 0.0393,
"step": 128
},
{
"epoch": 0.3556321343958647,
"grad_norm": 0.15738032758235931,
"learning_rate": 1.9592634177544803e-05,
"loss": 0.0374,
"step": 129
},
{
"epoch": 0.3583889726469955,
"grad_norm": 0.15156516432762146,
"learning_rate": 1.9584162805128636e-05,
"loss": 0.0351,
"step": 130
},
{
"epoch": 0.3611458108981262,
"grad_norm": 0.1443106234073639,
"learning_rate": 1.9575606123317215e-05,
"loss": 0.0359,
"step": 131
},
{
"epoch": 0.36390264914925696,
"grad_norm": 0.16379106044769287,
"learning_rate": 1.9566964208274254e-05,
"loss": 0.0376,
"step": 132
},
{
"epoch": 0.3666594874003877,
"grad_norm": 0.17260539531707764,
"learning_rate": 1.9558237136922117e-05,
"loss": 0.0382,
"step": 133
},
{
"epoch": 0.36941632565151844,
"grad_norm": 0.1595146209001541,
"learning_rate": 1.954942498694117e-05,
"loss": 0.0378,
"step": 134
},
{
"epoch": 0.37217316390264915,
"grad_norm": 0.1662372648715973,
"learning_rate": 1.9540527836769047e-05,
"loss": 0.036,
"step": 135
},
{
"epoch": 0.37493000215377986,
"grad_norm": 0.17359688878059387,
"learning_rate": 1.95315457656e-05,
"loss": 0.0352,
"step": 136
},
{
"epoch": 0.37768684040491063,
"grad_norm": 0.15664531290531158,
"learning_rate": 1.9522478853384154e-05,
"loss": 0.0372,
"step": 137
},
{
"epoch": 0.38044367865604134,
"grad_norm": 0.14340853691101074,
"learning_rate": 1.951332718082682e-05,
"loss": 0.0378,
"step": 138
},
{
"epoch": 0.3832005169071721,
"grad_norm": 0.16062387824058533,
"learning_rate": 1.950409082938776e-05,
"loss": 0.036,
"step": 139
},
{
"epoch": 0.3859573551583028,
"grad_norm": 0.16279537975788116,
"learning_rate": 1.949476988128047e-05,
"loss": 0.0373,
"step": 140
},
{
"epoch": 0.38871419340943353,
"grad_norm": 0.18597126007080078,
"learning_rate": 1.9485364419471454e-05,
"loss": 0.0367,
"step": 141
},
{
"epoch": 0.3914710316605643,
"grad_norm": 0.6993559002876282,
"learning_rate": 1.947587452767948e-05,
"loss": 0.0453,
"step": 142
},
{
"epoch": 0.394227869911695,
"grad_norm": 0.20166553556919098,
"learning_rate": 1.946630029037482e-05,
"loss": 0.0366,
"step": 143
},
{
"epoch": 0.3969847081628258,
"grad_norm": 0.16907399892807007,
"learning_rate": 1.9456641792778527e-05,
"loss": 0.036,
"step": 144
},
{
"epoch": 0.3997415464139565,
"grad_norm": 0.19966886937618256,
"learning_rate": 1.9446899120861653e-05,
"loss": 0.0348,
"step": 145
},
{
"epoch": 0.40249838466508725,
"grad_norm": 0.25474515557289124,
"learning_rate": 1.94370723613445e-05,
"loss": 0.0388,
"step": 146
},
{
"epoch": 0.40525522291621796,
"grad_norm": 0.2152235358953476,
"learning_rate": 1.9427161601695833e-05,
"loss": 0.0371,
"step": 147
},
{
"epoch": 0.40801206116734867,
"grad_norm": 0.15147989988327026,
"learning_rate": 1.941716693013211e-05,
"loss": 0.0374,
"step": 148
},
{
"epoch": 0.41076889941847944,
"grad_norm": 0.18094661831855774,
"learning_rate": 1.94070884356167e-05,
"loss": 0.036,
"step": 149
},
{
"epoch": 0.41352573766961015,
"grad_norm": 0.20204827189445496,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.0383,
"step": 150
},
{
"epoch": 0.4162825759207409,
"grad_norm": 0.24904154241085052,
"learning_rate": 1.938668033731406e-05,
"loss": 0.0368,
"step": 151
},
{
"epoch": 0.4190394141718716,
"grad_norm": 0.15912267565727234,
"learning_rate": 1.9376350915180935e-05,
"loss": 0.0352,
"step": 152
},
{
"epoch": 0.4217962524230024,
"grad_norm": 0.1520577222108841,
"learning_rate": 1.9365938033402715e-05,
"loss": 0.036,
"step": 153
},
{
"epoch": 0.4245530906741331,
"grad_norm": 0.1562613546848297,
"learning_rate": 1.9355441784665295e-05,
"loss": 0.0367,
"step": 154
},
{
"epoch": 0.4273099289252638,
"grad_norm": 0.13876181840896606,
"learning_rate": 1.9344862262396612e-05,
"loss": 0.0354,
"step": 155
},
{
"epoch": 0.4300667671763946,
"grad_norm": 0.15534985065460205,
"learning_rate": 1.933419956076584e-05,
"loss": 0.0358,
"step": 156
},
{
"epoch": 0.4328236054275253,
"grad_norm": 0.15327094495296478,
"learning_rate": 1.932345377468253e-05,
"loss": 0.0341,
"step": 157
},
{
"epoch": 0.43558044367865606,
"grad_norm": 0.1699693650007248,
"learning_rate": 1.9312624999795784e-05,
"loss": 0.0378,
"step": 158
},
{
"epoch": 0.43833728192978677,
"grad_norm": 0.12956377863883972,
"learning_rate": 1.9301713332493386e-05,
"loss": 0.0351,
"step": 159
},
{
"epoch": 0.44109412018091754,
"grad_norm": 0.14530932903289795,
"learning_rate": 1.929071886990095e-05,
"loss": 0.0357,
"step": 160
},
{
"epoch": 0.44385095843204825,
"grad_norm": 0.15227288007736206,
"learning_rate": 1.9279641709881067e-05,
"loss": 0.0363,
"step": 161
},
{
"epoch": 0.44660779668317896,
"grad_norm": 0.165790393948555,
"learning_rate": 1.926848195103242e-05,
"loss": 0.0354,
"step": 162
},
{
"epoch": 0.4493646349343097,
"grad_norm": 0.16768325865268707,
"learning_rate": 1.9257239692688907e-05,
"loss": 0.0335,
"step": 163
},
{
"epoch": 0.45212147318544044,
"grad_norm": 0.147464320063591,
"learning_rate": 1.9245915034918763e-05,
"loss": 0.0339,
"step": 164
},
{
"epoch": 0.4548783114365712,
"grad_norm": 0.1301703006029129,
"learning_rate": 1.923450807852367e-05,
"loss": 0.0337,
"step": 165
},
{
"epoch": 0.4576351496877019,
"grad_norm": 0.1339288204908371,
"learning_rate": 1.922301892503785e-05,
"loss": 0.0353,
"step": 166
},
{
"epoch": 0.4603919879388326,
"grad_norm": 0.16490085422992706,
"learning_rate": 1.9211447676727174e-05,
"loss": 0.0346,
"step": 167
},
{
"epoch": 0.4631488261899634,
"grad_norm": 0.24428266286849976,
"learning_rate": 1.9199794436588244e-05,
"loss": 0.0355,
"step": 168
},
{
"epoch": 0.4659056644410941,
"grad_norm": 0.40584465861320496,
"learning_rate": 1.9188059308347475e-05,
"loss": 0.0397,
"step": 169
},
{
"epoch": 0.46866250269222487,
"grad_norm": 0.17336200177669525,
"learning_rate": 1.9176242396460184e-05,
"loss": 0.036,
"step": 170
},
{
"epoch": 0.4714193409433556,
"grad_norm": 0.22676712274551392,
"learning_rate": 1.916434380610963e-05,
"loss": 0.0354,
"step": 171
},
{
"epoch": 0.47417617919448635,
"grad_norm": 0.19564731419086456,
"learning_rate": 1.9152363643206126e-05,
"loss": 0.0341,
"step": 172
},
{
"epoch": 0.47693301744561706,
"grad_norm": 0.16920173168182373,
"learning_rate": 1.9140302014386044e-05,
"loss": 0.036,
"step": 173
},
{
"epoch": 0.47968985569674777,
"grad_norm": 0.1923132985830307,
"learning_rate": 1.912815902701091e-05,
"loss": 0.0334,
"step": 174
},
{
"epoch": 0.48244669394787854,
"grad_norm": 0.18520843982696533,
"learning_rate": 1.911593478916641e-05,
"loss": 0.0343,
"step": 175
},
{
"epoch": 0.48520353219900925,
"grad_norm": 0.2041483074426651,
"learning_rate": 1.9103629409661468e-05,
"loss": 0.0361,
"step": 176
},
{
"epoch": 0.48796037045014,
"grad_norm": 0.17951691150665283,
"learning_rate": 1.909124299802724e-05,
"loss": 0.0346,
"step": 177
},
{
"epoch": 0.4907172087012707,
"grad_norm": 0.17455315589904785,
"learning_rate": 1.9078775664516157e-05,
"loss": 0.0328,
"step": 178
},
{
"epoch": 0.4934740469524015,
"grad_norm": 0.149070605635643,
"learning_rate": 1.906622752010095e-05,
"loss": 0.0353,
"step": 179
},
{
"epoch": 0.4962308852035322,
"grad_norm": 0.13988004624843597,
"learning_rate": 1.9053598676473656e-05,
"loss": 0.0335,
"step": 180
},
{
"epoch": 0.4989877234546629,
"grad_norm": 0.12514030933380127,
"learning_rate": 1.904088924604461e-05,
"loss": 0.0341,
"step": 181
},
{
"epoch": 0.5017445617057936,
"grad_norm": 0.14698189496994019,
"learning_rate": 1.9028099341941457e-05,
"loss": 0.0347,
"step": 182
},
{
"epoch": 0.5045013999569244,
"grad_norm": 0.1326190084218979,
"learning_rate": 1.9015229078008163e-05,
"loss": 0.0344,
"step": 183
},
{
"epoch": 0.5072582382080552,
"grad_norm": 0.1399255394935608,
"learning_rate": 1.900227856880396e-05,
"loss": 0.0338,
"step": 184
},
{
"epoch": 0.5100150764591859,
"grad_norm": 0.17403383553028107,
"learning_rate": 1.898924792960237e-05,
"loss": 0.034,
"step": 185
},
{
"epoch": 0.5127719147103166,
"grad_norm": 0.14294308423995972,
"learning_rate": 1.8976137276390145e-05,
"loss": 0.0335,
"step": 186
},
{
"epoch": 0.5155287529614473,
"grad_norm": 0.13353058695793152,
"learning_rate": 1.8962946725866246e-05,
"loss": 0.0325,
"step": 187
},
{
"epoch": 0.5182855912125781,
"grad_norm": 0.1507033258676529,
"learning_rate": 1.8949676395440818e-05,
"loss": 0.0354,
"step": 188
},
{
"epoch": 0.5210424294637088,
"grad_norm": 0.14498627185821533,
"learning_rate": 1.8936326403234125e-05,
"loss": 0.0327,
"step": 189
},
{
"epoch": 0.5237992677148395,
"grad_norm": 0.15840047597885132,
"learning_rate": 1.892289686807551e-05,
"loss": 0.0332,
"step": 190
},
{
"epoch": 0.5265561059659702,
"grad_norm": 0.14784608781337738,
"learning_rate": 1.8909387909502335e-05,
"loss": 0.0324,
"step": 191
},
{
"epoch": 0.5293129442171011,
"grad_norm": 0.13484050333499908,
"learning_rate": 1.8895799647758912e-05,
"loss": 0.0336,
"step": 192
},
{
"epoch": 0.5320697824682318,
"grad_norm": 0.17485234141349792,
"learning_rate": 1.888213220379544e-05,
"loss": 0.0335,
"step": 193
},
{
"epoch": 0.5348266207193625,
"grad_norm": 0.14618845283985138,
"learning_rate": 1.8868385699266928e-05,
"loss": 0.0328,
"step": 194
},
{
"epoch": 0.5375834589704932,
"grad_norm": 0.12785577774047852,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.0335,
"step": 195
},
{
"epoch": 0.5403402972216239,
"grad_norm": 0.16961853206157684,
"learning_rate": 1.8840655998652324e-05,
"loss": 0.0358,
"step": 196
},
{
"epoch": 0.5430971354727547,
"grad_norm": 0.14937855303287506,
"learning_rate": 1.8826673049390508e-05,
"loss": 0.0336,
"step": 197
},
{
"epoch": 0.5458539737238854,
"grad_norm": 0.14861802756786346,
"learning_rate": 1.881261153320999e-05,
"loss": 0.0346,
"step": 198
},
{
"epoch": 0.5486108119750162,
"grad_norm": 0.13903887569904327,
"learning_rate": 1.8798471575273445e-05,
"loss": 0.0319,
"step": 199
},
{
"epoch": 0.5513676502261469,
"grad_norm": 0.5592267513275146,
"learning_rate": 1.8784253301441767e-05,
"loss": 0.0367,
"step": 200
},
{
"epoch": 0.5541244884772776,
"grad_norm": 0.12278741598129272,
"learning_rate": 1.8769956838272937e-05,
"loss": 0.0326,
"step": 201
},
{
"epoch": 0.5568813267284084,
"grad_norm": 0.16035325825214386,
"learning_rate": 1.8755582313020912e-05,
"loss": 0.035,
"step": 202
},
{
"epoch": 0.5596381649795391,
"grad_norm": 0.15898433327674866,
"learning_rate": 1.8741129853634483e-05,
"loss": 0.0336,
"step": 203
},
{
"epoch": 0.5623950032306698,
"grad_norm": 0.14194312691688538,
"learning_rate": 1.8726599588756144e-05,
"loss": 0.0327,
"step": 204
},
{
"epoch": 0.5651518414818005,
"grad_norm": 0.14086908102035522,
"learning_rate": 1.8711991647720936e-05,
"loss": 0.034,
"step": 205
},
{
"epoch": 0.5679086797329312,
"grad_norm": 0.13377539813518524,
"learning_rate": 1.8697306160555303e-05,
"loss": 0.0337,
"step": 206
},
{
"epoch": 0.5706655179840621,
"grad_norm": 0.13268251717090607,
"learning_rate": 1.868254325797594e-05,
"loss": 0.0329,
"step": 207
},
{
"epoch": 0.5734223562351928,
"grad_norm": 0.13881011307239532,
"learning_rate": 1.8667703071388607e-05,
"loss": 0.0339,
"step": 208
},
{
"epoch": 0.5761791944863235,
"grad_norm": 0.15615610778331757,
"learning_rate": 1.8652785732886988e-05,
"loss": 0.0345,
"step": 209
},
{
"epoch": 0.5789360327374542,
"grad_norm": 0.3299935460090637,
"learning_rate": 1.8637791375251505e-05,
"loss": 0.0353,
"step": 210
},
{
"epoch": 0.581692870988585,
"grad_norm": 0.13316752016544342,
"learning_rate": 1.862272013194812e-05,
"loss": 0.0325,
"step": 211
},
{
"epoch": 0.5844497092397157,
"grad_norm": 0.13292838633060455,
"learning_rate": 1.8607572137127175e-05,
"loss": 0.0342,
"step": 212
},
{
"epoch": 0.5872065474908464,
"grad_norm": 0.1690252423286438,
"learning_rate": 1.859234752562217e-05,
"loss": 0.0333,
"step": 213
},
{
"epoch": 0.5899633857419772,
"grad_norm": 0.13950853049755096,
"learning_rate": 1.8577046432948586e-05,
"loss": 0.032,
"step": 214
},
{
"epoch": 0.5927202239931079,
"grad_norm": 0.12849925458431244,
"learning_rate": 1.8561668995302668e-05,
"loss": 0.0323,
"step": 215
},
{
"epoch": 0.5954770622442387,
"grad_norm": 0.22346273064613342,
"learning_rate": 1.8546215349560204e-05,
"loss": 0.0346,
"step": 216
},
{
"epoch": 0.5982339004953694,
"grad_norm": 0.12624599039554596,
"learning_rate": 1.853068563327533e-05,
"loss": 0.0328,
"step": 217
},
{
"epoch": 0.6009907387465001,
"grad_norm": 0.14228811860084534,
"learning_rate": 1.851507998467929e-05,
"loss": 0.0334,
"step": 218
},
{
"epoch": 0.6037475769976308,
"grad_norm": 0.1500948965549469,
"learning_rate": 1.849939854267919e-05,
"loss": 0.0346,
"step": 219
},
{
"epoch": 0.6065044152487615,
"grad_norm": 0.17958402633666992,
"learning_rate": 1.8483641446856798e-05,
"loss": 0.0341,
"step": 220
},
{
"epoch": 0.6092612534998924,
"grad_norm": 0.15875360369682312,
"learning_rate": 1.8467808837467277e-05,
"loss": 0.0354,
"step": 221
},
{
"epoch": 0.6120180917510231,
"grad_norm": 0.13992802798748016,
"learning_rate": 1.845190085543795e-05,
"loss": 0.0346,
"step": 222
},
{
"epoch": 0.6147749300021538,
"grad_norm": 0.14051182568073273,
"learning_rate": 1.843591764236702e-05,
"loss": 0.0325,
"step": 223
},
{
"epoch": 0.6175317682532845,
"grad_norm": 0.13775014877319336,
"learning_rate": 1.841985934052234e-05,
"loss": 0.0323,
"step": 224
},
{
"epoch": 0.6202886065044152,
"grad_norm": 0.1298038810491562,
"learning_rate": 1.840372609284013e-05,
"loss": 0.0322,
"step": 225
},
{
"epoch": 0.623045444755546,
"grad_norm": 0.14838284254074097,
"learning_rate": 1.8387518042923715e-05,
"loss": 0.0334,
"step": 226
},
{
"epoch": 0.6258022830066767,
"grad_norm": 0.1384505033493042,
"learning_rate": 1.8371235335042236e-05,
"loss": 0.0332,
"step": 227
},
{
"epoch": 0.6285591212578074,
"grad_norm": 0.1435280740261078,
"learning_rate": 1.8354878114129368e-05,
"loss": 0.0323,
"step": 228
},
{
"epoch": 0.6313159595089382,
"grad_norm": 0.19593602418899536,
"learning_rate": 1.833844652578203e-05,
"loss": 0.033,
"step": 229
},
{
"epoch": 0.634072797760069,
"grad_norm": 0.11905160546302795,
"learning_rate": 1.832194071625911e-05,
"loss": 0.0329,
"step": 230
},
{
"epoch": 0.6368296360111997,
"grad_norm": 0.13935434818267822,
"learning_rate": 1.8305360832480118e-05,
"loss": 0.0326,
"step": 231
},
{
"epoch": 0.6395864742623304,
"grad_norm": 0.13467997312545776,
"learning_rate": 1.8288707022023926e-05,
"loss": 0.0323,
"step": 232
},
{
"epoch": 0.6423433125134611,
"grad_norm": 0.13442496955394745,
"learning_rate": 1.827197943312742e-05,
"loss": 0.0326,
"step": 233
},
{
"epoch": 0.6451001507645918,
"grad_norm": 0.13342751562595367,
"learning_rate": 1.82551782146842e-05,
"loss": 0.0326,
"step": 234
},
{
"epoch": 0.6478569890157226,
"grad_norm": 0.11170811951160431,
"learning_rate": 1.8238303516243253e-05,
"loss": 0.0329,
"step": 235
},
{
"epoch": 0.6506138272668534,
"grad_norm": 0.109471395611763,
"learning_rate": 1.8221355488007606e-05,
"loss": 0.0312,
"step": 236
},
{
"epoch": 0.6533706655179841,
"grad_norm": 0.10828150063753128,
"learning_rate": 1.8204334280833005e-05,
"loss": 0.0317,
"step": 237
},
{
"epoch": 0.6561275037691148,
"grad_norm": 0.1234995648264885,
"learning_rate": 1.8187240046226576e-05,
"loss": 0.0316,
"step": 238
},
{
"epoch": 0.6588843420202455,
"grad_norm": 0.12001900374889374,
"learning_rate": 1.817007293634545e-05,
"loss": 0.0318,
"step": 239
},
{
"epoch": 0.6616411802713763,
"grad_norm": 0.12810060381889343,
"learning_rate": 1.8152833103995443e-05,
"loss": 0.0315,
"step": 240
},
{
"epoch": 0.664398018522507,
"grad_norm": 0.12769843637943268,
"learning_rate": 1.8135520702629677e-05,
"loss": 0.0324,
"step": 241
},
{
"epoch": 0.6671548567736377,
"grad_norm": 0.12816517055034637,
"learning_rate": 1.8118135886347207e-05,
"loss": 0.0311,
"step": 242
},
{
"epoch": 0.6699116950247684,
"grad_norm": 0.1479124128818512,
"learning_rate": 1.8100678809891668e-05,
"loss": 0.0322,
"step": 243
},
{
"epoch": 0.6726685332758991,
"grad_norm": 0.1499953418970108,
"learning_rate": 1.8083149628649887e-05,
"loss": 0.0322,
"step": 244
},
{
"epoch": 0.67542537152703,
"grad_norm": 0.16143152117729187,
"learning_rate": 1.8065548498650495e-05,
"loss": 0.0327,
"step": 245
},
{
"epoch": 0.6781822097781607,
"grad_norm": 0.11848675459623337,
"learning_rate": 1.8047875576562556e-05,
"loss": 0.0323,
"step": 246
},
{
"epoch": 0.6809390480292914,
"grad_norm": 0.14054562151432037,
"learning_rate": 1.803013101969415e-05,
"loss": 0.0311,
"step": 247
},
{
"epoch": 0.6836958862804221,
"grad_norm": 0.14287616312503815,
"learning_rate": 1.801231498599099e-05,
"loss": 0.0321,
"step": 248
},
{
"epoch": 0.6864527245315529,
"grad_norm": 0.1472490429878235,
"learning_rate": 1.7994427634035016e-05,
"loss": 0.0324,
"step": 249
},
{
"epoch": 0.6892095627826836,
"grad_norm": 0.2156575620174408,
"learning_rate": 1.7976469123042955e-05,
"loss": 0.0331,
"step": 250
},
{
"epoch": 0.6919664010338143,
"grad_norm": 0.15800388157367706,
"learning_rate": 1.7958439612864954e-05,
"loss": 0.0319,
"step": 251
},
{
"epoch": 0.6947232392849451,
"grad_norm": 0.13451936841011047,
"learning_rate": 1.7940339263983112e-05,
"loss": 0.032,
"step": 252
},
{
"epoch": 0.6974800775360758,
"grad_norm": 0.12370197474956512,
"learning_rate": 1.7922168237510076e-05,
"loss": 0.0328,
"step": 253
},
{
"epoch": 0.7002369157872066,
"grad_norm": 0.1429441124200821,
"learning_rate": 1.7903926695187595e-05,
"loss": 0.0318,
"step": 254
},
{
"epoch": 0.7029937540383373,
"grad_norm": 0.13566642999649048,
"learning_rate": 1.7885614799385086e-05,
"loss": 0.0322,
"step": 255
},
{
"epoch": 0.705750592289468,
"grad_norm": 0.11789460480213165,
"learning_rate": 1.78672327130982e-05,
"loss": 0.0316,
"step": 256
},
{
"epoch": 0.7085074305405987,
"grad_norm": 0.14814262092113495,
"learning_rate": 1.7848780599947334e-05,
"loss": 0.033,
"step": 257
},
{
"epoch": 0.7112642687917294,
"grad_norm": 0.12498864531517029,
"learning_rate": 1.7830258624176224e-05,
"loss": 0.0321,
"step": 258
},
{
"epoch": 0.7140211070428603,
"grad_norm": 0.12611018121242523,
"learning_rate": 1.7811666950650445e-05,
"loss": 0.0318,
"step": 259
},
{
"epoch": 0.716777945293991,
"grad_norm": 0.1379752904176712,
"learning_rate": 1.7793005744855967e-05,
"loss": 0.0343,
"step": 260
},
{
"epoch": 0.7195347835451217,
"grad_norm": 0.12495708465576172,
"learning_rate": 1.777427517289766e-05,
"loss": 0.0337,
"step": 261
},
{
"epoch": 0.7222916217962524,
"grad_norm": 0.11509250104427338,
"learning_rate": 1.775547540149784e-05,
"loss": 0.0303,
"step": 262
},
{
"epoch": 0.7250484600473831,
"grad_norm": 0.14822109043598175,
"learning_rate": 1.7736606597994763e-05,
"loss": 0.0329,
"step": 263
},
{
"epoch": 0.7278052982985139,
"grad_norm": 0.1274510771036148,
"learning_rate": 1.7717668930341152e-05,
"loss": 0.0321,
"step": 264
},
{
"epoch": 0.7305621365496446,
"grad_norm": 0.13247767090797424,
"learning_rate": 1.769866256710269e-05,
"loss": 0.0327,
"step": 265
},
{
"epoch": 0.7333189748007753,
"grad_norm": 0.1502748280763626,
"learning_rate": 1.767958767745653e-05,
"loss": 0.0336,
"step": 266
},
{
"epoch": 0.7360758130519061,
"grad_norm": 0.11968521028757095,
"learning_rate": 1.766044443118978e-05,
"loss": 0.032,
"step": 267
},
{
"epoch": 0.7388326513030369,
"grad_norm": 0.12810976803302765,
"learning_rate": 1.7641232998698e-05,
"loss": 0.0317,
"step": 268
},
{
"epoch": 0.7415894895541676,
"grad_norm": 0.13857212662696838,
"learning_rate": 1.7621953550983677e-05,
"loss": 0.03,
"step": 269
},
{
"epoch": 0.7443463278052983,
"grad_norm": 0.12155446410179138,
"learning_rate": 1.7602606259654704e-05,
"loss": 0.0317,
"step": 270
},
{
"epoch": 0.747103166056429,
"grad_norm": 0.13759441673755646,
"learning_rate": 1.7583191296922866e-05,
"loss": 0.0312,
"step": 271
},
{
"epoch": 0.7498600043075597,
"grad_norm": 0.18140603601932526,
"learning_rate": 1.7563708835602286e-05,
"loss": 0.0316,
"step": 272
},
{
"epoch": 0.7526168425586905,
"grad_norm": 0.16475705802440643,
"learning_rate": 1.7544159049107902e-05,
"loss": 0.0306,
"step": 273
},
{
"epoch": 0.7553736808098213,
"grad_norm": 0.13975286483764648,
"learning_rate": 1.7524542111453923e-05,
"loss": 0.0335,
"step": 274
},
{
"epoch": 0.758130519060952,
"grad_norm": 0.14528636634349823,
"learning_rate": 1.7504858197252263e-05,
"loss": 0.032,
"step": 275
},
{
"epoch": 0.7608873573120827,
"grad_norm": 0.15437592566013336,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.033,
"step": 276
},
{
"epoch": 0.7636441955632134,
"grad_norm": 0.11566643416881561,
"learning_rate": 1.746529014063286e-05,
"loss": 0.0305,
"step": 277
},
{
"epoch": 0.7664010338143442,
"grad_norm": 0.1528697907924652,
"learning_rate": 1.7445406350413533e-05,
"loss": 0.0301,
"step": 278
},
{
"epoch": 0.7691578720654749,
"grad_norm": 0.13452287018299103,
"learning_rate": 1.7425456288040236e-05,
"loss": 0.0312,
"step": 279
},
{
"epoch": 0.7719147103166056,
"grad_norm": 0.13913044333457947,
"learning_rate": 1.740544013109005e-05,
"loss": 0.0328,
"step": 280
},
{
"epoch": 0.7746715485677363,
"grad_norm": 0.11884239315986633,
"learning_rate": 1.738535805772838e-05,
"loss": 0.0299,
"step": 281
},
{
"epoch": 0.7774283868188671,
"grad_norm": 0.13907060027122498,
"learning_rate": 1.736521024670737e-05,
"loss": 0.0326,
"step": 282
},
{
"epoch": 0.7801852250699979,
"grad_norm": 0.13899970054626465,
"learning_rate": 1.7344996877364282e-05,
"loss": 0.0329,
"step": 283
},
{
"epoch": 0.7829420633211286,
"grad_norm": 0.13733629882335663,
"learning_rate": 1.732471812961992e-05,
"loss": 0.0317,
"step": 284
},
{
"epoch": 0.7856989015722593,
"grad_norm": 0.12010809779167175,
"learning_rate": 1.7304374183977032e-05,
"loss": 0.0315,
"step": 285
},
{
"epoch": 0.78845573982339,
"grad_norm": 0.1484297811985016,
"learning_rate": 1.72839652215187e-05,
"loss": 0.0321,
"step": 286
},
{
"epoch": 0.7912125780745208,
"grad_norm": 0.12623994052410126,
"learning_rate": 1.7263491423906716e-05,
"loss": 0.0319,
"step": 287
},
{
"epoch": 0.7939694163256515,
"grad_norm": 0.11906126886606216,
"learning_rate": 1.7242952973379983e-05,
"loss": 0.0305,
"step": 288
},
{
"epoch": 0.7967262545767823,
"grad_norm": 0.1313122808933258,
"learning_rate": 1.7222350052752883e-05,
"loss": 0.0319,
"step": 289
},
{
"epoch": 0.799483092827913,
"grad_norm": 0.1219288781285286,
"learning_rate": 1.720168284541365e-05,
"loss": 0.0297,
"step": 290
},
{
"epoch": 0.8022399310790437,
"grad_norm": 0.16674523055553436,
"learning_rate": 1.7180951535322742e-05,
"loss": 0.0322,
"step": 291
},
{
"epoch": 0.8049967693301745,
"grad_norm": 0.21712636947631836,
"learning_rate": 1.7160156307011197e-05,
"loss": 0.0318,
"step": 292
},
{
"epoch": 0.8077536075813052,
"grad_norm": 0.1368701457977295,
"learning_rate": 1.7139297345578992e-05,
"loss": 0.0302,
"step": 293
},
{
"epoch": 0.8105104458324359,
"grad_norm": 0.1643868386745453,
"learning_rate": 1.7118374836693407e-05,
"loss": 0.0328,
"step": 294
},
{
"epoch": 0.8132672840835666,
"grad_norm": 0.11599249392747879,
"learning_rate": 1.7097388966587355e-05,
"loss": 0.0303,
"step": 295
},
{
"epoch": 0.8160241223346973,
"grad_norm": 0.29641032218933105,
"learning_rate": 1.7076339922057736e-05,
"loss": 0.034,
"step": 296
},
{
"epoch": 0.8187809605858282,
"grad_norm": 0.1458274871110916,
"learning_rate": 1.705522789046377e-05,
"loss": 0.0318,
"step": 297
},
{
"epoch": 0.8215377988369589,
"grad_norm": 0.12504877150058746,
"learning_rate": 1.7034053059725325e-05,
"loss": 0.0321,
"step": 298
},
{
"epoch": 0.8242946370880896,
"grad_norm": 0.16158223152160645,
"learning_rate": 1.7012815618321256e-05,
"loss": 0.0326,
"step": 299
},
{
"epoch": 0.8270514753392203,
"grad_norm": 0.12104715406894684,
"learning_rate": 1.6991515755287715e-05,
"loss": 0.031,
"step": 300
},
{
"epoch": 0.8298083135903511,
"grad_norm": 0.1327955275774002,
"learning_rate": 1.697015366021648e-05,
"loss": 0.033,
"step": 301
},
{
"epoch": 0.8325651518414818,
"grad_norm": 0.12627461552619934,
"learning_rate": 1.694872952325326e-05,
"loss": 0.0305,
"step": 302
},
{
"epoch": 0.8353219900926125,
"grad_norm": 0.13003765046596527,
"learning_rate": 1.6927243535095995e-05,
"loss": 0.0313,
"step": 303
},
{
"epoch": 0.8380788283437433,
"grad_norm": 0.15389683842658997,
"learning_rate": 1.690569588699318e-05,
"loss": 0.0314,
"step": 304
},
{
"epoch": 0.840835666594874,
"grad_norm": 0.1295783370733261,
"learning_rate": 1.6884086770742138e-05,
"loss": 0.0311,
"step": 305
},
{
"epoch": 0.8435925048460048,
"grad_norm": 0.14465399086475372,
"learning_rate": 1.686241637868734e-05,
"loss": 0.0305,
"step": 306
},
{
"epoch": 0.8463493430971355,
"grad_norm": 0.12934014201164246,
"learning_rate": 1.6840684903718658e-05,
"loss": 0.0315,
"step": 307
},
{
"epoch": 0.8491061813482662,
"grad_norm": 0.13668671250343323,
"learning_rate": 1.681889253926969e-05,
"loss": 0.0308,
"step": 308
},
{
"epoch": 0.8518630195993969,
"grad_norm": 0.1645808219909668,
"learning_rate": 1.6797039479315994e-05,
"loss": 0.0328,
"step": 309
},
{
"epoch": 0.8546198578505276,
"grad_norm": 0.1372358649969101,
"learning_rate": 1.67751259183734e-05,
"loss": 0.0309,
"step": 310
},
{
"epoch": 0.8573766961016585,
"grad_norm": 0.151007741689682,
"learning_rate": 1.675315205149626e-05,
"loss": 0.033,
"step": 311
},
{
"epoch": 0.8601335343527892,
"grad_norm": 0.18326100707054138,
"learning_rate": 1.67311180742757e-05,
"loss": 0.0314,
"step": 312
},
{
"epoch": 0.8628903726039199,
"grad_norm": 0.12580075860023499,
"learning_rate": 1.6709024182837917e-05,
"loss": 0.0314,
"step": 313
},
{
"epoch": 0.8656472108550506,
"grad_norm": 0.14372077584266663,
"learning_rate": 1.6686870573842388e-05,
"loss": 0.0298,
"step": 314
},
{
"epoch": 0.8684040491061813,
"grad_norm": 0.1365354061126709,
"learning_rate": 1.6664657444480145e-05,
"loss": 0.0304,
"step": 315
},
{
"epoch": 0.8711608873573121,
"grad_norm": 0.17227758467197418,
"learning_rate": 1.6642384992472026e-05,
"loss": 0.0311,
"step": 316
},
{
"epoch": 0.8739177256084428,
"grad_norm": 0.1729755699634552,
"learning_rate": 1.6620053416066892e-05,
"loss": 0.0315,
"step": 317
},
{
"epoch": 0.8766745638595735,
"grad_norm": 0.1310206949710846,
"learning_rate": 1.6597662914039885e-05,
"loss": 0.0313,
"step": 318
},
{
"epoch": 0.8794314021107043,
"grad_norm": 0.15664447844028473,
"learning_rate": 1.657521368569064e-05,
"loss": 0.0304,
"step": 319
},
{
"epoch": 0.8821882403618351,
"grad_norm": 0.16371680796146393,
"learning_rate": 1.6552705930841523e-05,
"loss": 0.0306,
"step": 320
},
{
"epoch": 0.8849450786129658,
"grad_norm": 0.132271409034729,
"learning_rate": 1.653013984983585e-05,
"loss": 0.0315,
"step": 321
},
{
"epoch": 0.8877019168640965,
"grad_norm": 0.1789659708738327,
"learning_rate": 1.6507515643536113e-05,
"loss": 0.0304,
"step": 322
},
{
"epoch": 0.8904587551152272,
"grad_norm": 0.12446186691522598,
"learning_rate": 1.6484833513322155e-05,
"loss": 0.0297,
"step": 323
},
{
"epoch": 0.8932155933663579,
"grad_norm": 0.13840149343013763,
"learning_rate": 1.6462093661089432e-05,
"loss": 0.0302,
"step": 324
},
{
"epoch": 0.8959724316174887,
"grad_norm": 0.12458626925945282,
"learning_rate": 1.643929628924717e-05,
"loss": 0.0321,
"step": 325
},
{
"epoch": 0.8987292698686195,
"grad_norm": 0.13855692744255066,
"learning_rate": 1.6416441600716593e-05,
"loss": 0.0305,
"step": 326
},
{
"epoch": 0.9014861081197502,
"grad_norm": 0.1461232304573059,
"learning_rate": 1.6393529798929103e-05,
"loss": 0.03,
"step": 327
},
{
"epoch": 0.9042429463708809,
"grad_norm": 0.11444980651140213,
"learning_rate": 1.637056108782446e-05,
"loss": 0.0292,
"step": 328
},
{
"epoch": 0.9069997846220116,
"grad_norm": 0.1289425641298294,
"learning_rate": 1.6347535671848998e-05,
"loss": 0.0305,
"step": 329
},
{
"epoch": 0.9097566228731424,
"grad_norm": 0.12594164907932281,
"learning_rate": 1.6324453755953772e-05,
"loss": 0.0288,
"step": 330
},
{
"epoch": 0.9125134611242731,
"grad_norm": 0.41084402799606323,
"learning_rate": 1.6301315545592753e-05,
"loss": 0.0299,
"step": 331
},
{
"epoch": 0.9152702993754038,
"grad_norm": 0.13620588183403015,
"learning_rate": 1.627812124672099e-05,
"loss": 0.0302,
"step": 332
},
{
"epoch": 0.9180271376265345,
"grad_norm": 0.10916028916835785,
"learning_rate": 1.6254871065792776e-05,
"loss": 0.031,
"step": 333
},
{
"epoch": 0.9207839758776653,
"grad_norm": 0.1257525086402893,
"learning_rate": 1.623156520975983e-05,
"loss": 0.0313,
"step": 334
},
{
"epoch": 0.9235408141287961,
"grad_norm": 0.10521648079156876,
"learning_rate": 1.620820388606942e-05,
"loss": 0.0296,
"step": 335
},
{
"epoch": 0.9262976523799268,
"grad_norm": 0.11178401112556458,
"learning_rate": 1.618478730266255e-05,
"loss": 0.0307,
"step": 336
},
{
"epoch": 0.9290544906310575,
"grad_norm": 0.11989890038967133,
"learning_rate": 1.6161315667972095e-05,
"loss": 0.0298,
"step": 337
},
{
"epoch": 0.9318113288821882,
"grad_norm": 0.12103229016065598,
"learning_rate": 1.6137789190920938e-05,
"loss": 0.0302,
"step": 338
},
{
"epoch": 0.934568167133319,
"grad_norm": 0.10210815072059631,
"learning_rate": 1.6114208080920125e-05,
"loss": 0.029,
"step": 339
},
{
"epoch": 0.9373250053844497,
"grad_norm": 0.12528973817825317,
"learning_rate": 1.6090572547866983e-05,
"loss": 0.0303,
"step": 340
},
{
"epoch": 0.9400818436355804,
"grad_norm": 0.12944018840789795,
"learning_rate": 1.606688280214328e-05,
"loss": 0.031,
"step": 341
},
{
"epoch": 0.9428386818867112,
"grad_norm": 0.1170840710401535,
"learning_rate": 1.6043139054613326e-05,
"loss": 0.0304,
"step": 342
},
{
"epoch": 0.9455955201378419,
"grad_norm": 0.1317935287952423,
"learning_rate": 1.60193415166221e-05,
"loss": 0.0309,
"step": 343
},
{
"epoch": 0.9483523583889727,
"grad_norm": 0.10986040532588959,
"learning_rate": 1.599549039999338e-05,
"loss": 0.0306,
"step": 344
},
{
"epoch": 0.9511091966401034,
"grad_norm": 0.11340153217315674,
"learning_rate": 1.5971585917027864e-05,
"loss": 0.0293,
"step": 345
},
{
"epoch": 0.9538660348912341,
"grad_norm": 0.1250225007534027,
"learning_rate": 1.594762828050124e-05,
"loss": 0.029,
"step": 346
},
{
"epoch": 0.9566228731423648,
"grad_norm": 0.1400870531797409,
"learning_rate": 1.5923617703662346e-05,
"loss": 0.0313,
"step": 347
},
{
"epoch": 0.9593797113934955,
"grad_norm": 0.10701585561037064,
"learning_rate": 1.5899554400231233e-05,
"loss": 0.0316,
"step": 348
},
{
"epoch": 0.9621365496446264,
"grad_norm": 0.1186312660574913,
"learning_rate": 1.587543858439727e-05,
"loss": 0.0307,
"step": 349
},
{
"epoch": 0.9648933878957571,
"grad_norm": 0.10630480200052261,
"learning_rate": 1.585127047081727e-05,
"loss": 0.0297,
"step": 350
},
{
"epoch": 0.9676502261468878,
"grad_norm": 0.11205119639635086,
"learning_rate": 1.5827050274613512e-05,
"loss": 0.0297,
"step": 351
},
{
"epoch": 0.9704070643980185,
"grad_norm": 0.11059113591909409,
"learning_rate": 1.580277821137191e-05,
"loss": 0.0296,
"step": 352
},
{
"epoch": 0.9731639026491492,
"grad_norm": 0.13033725321292877,
"learning_rate": 1.577845449714001e-05,
"loss": 0.0303,
"step": 353
},
{
"epoch": 0.97592074090028,
"grad_norm": 0.10061930865049362,
"learning_rate": 1.5754079348425137e-05,
"loss": 0.0296,
"step": 354
},
{
"epoch": 0.9786775791514107,
"grad_norm": 0.11028820276260376,
"learning_rate": 1.5729652982192428e-05,
"loss": 0.0284,
"step": 355
},
{
"epoch": 0.9814344174025414,
"grad_norm": 0.11424390226602554,
"learning_rate": 1.5705175615862906e-05,
"loss": 0.0293,
"step": 356
},
{
"epoch": 0.9841912556536722,
"grad_norm": 0.14579032361507416,
"learning_rate": 1.568064746731156e-05,
"loss": 0.0299,
"step": 357
},
{
"epoch": 0.986948093904803,
"grad_norm": 0.11156564950942993,
"learning_rate": 1.5656068754865388e-05,
"loss": 0.0298,
"step": 358
},
{
"epoch": 0.9897049321559337,
"grad_norm": 0.10791637748479843,
"learning_rate": 1.5631439697301464e-05,
"loss": 0.03,
"step": 359
},
{
"epoch": 0.9924617704070644,
"grad_norm": 0.11196880787611008,
"learning_rate": 1.560676051384499e-05,
"loss": 0.0289,
"step": 360
},
{
"epoch": 0.9952186086581951,
"grad_norm": 0.11056679487228394,
"learning_rate": 1.558203142416734e-05,
"loss": 0.0285,
"step": 361
},
{
"epoch": 0.9979754469093258,
"grad_norm": 0.1029677763581276,
"learning_rate": 1.5557252648384103e-05,
"loss": 0.0303,
"step": 362
},
{
"epoch": 1.0007322851604565,
"grad_norm": 0.12087027728557587,
"learning_rate": 1.553242440705314e-05,
"loss": 0.0297,
"step": 363
},
{
"epoch": 1.0034891234115872,
"grad_norm": 0.11372894048690796,
"learning_rate": 1.5507546921172595e-05,
"loss": 0.0251,
"step": 364
},
{
"epoch": 1.0062459616627182,
"grad_norm": 0.11590228229761124,
"learning_rate": 1.548262041217895e-05,
"loss": 0.0244,
"step": 365
},
{
"epoch": 1.009002799913849,
"grad_norm": 0.129511296749115,
"learning_rate": 1.5457645101945046e-05,
"loss": 0.0247,
"step": 366
},
{
"epoch": 1.0117596381649796,
"grad_norm": 0.12303431332111359,
"learning_rate": 1.5432621212778105e-05,
"loss": 0.0247,
"step": 367
},
{
"epoch": 1.0145164764161103,
"grad_norm": 0.1147053986787796,
"learning_rate": 1.5407548967417755e-05,
"loss": 0.0241,
"step": 368
},
{
"epoch": 1.017273314667241,
"grad_norm": 0.117189422249794,
"learning_rate": 1.538242858903404e-05,
"loss": 0.0244,
"step": 369
},
{
"epoch": 1.0200301529183717,
"grad_norm": 0.1118486151099205,
"learning_rate": 1.5357260301225448e-05,
"loss": 0.0249,
"step": 370
},
{
"epoch": 1.0227869911695024,
"grad_norm": 0.4378014802932739,
"learning_rate": 1.5332044328016916e-05,
"loss": 0.0262,
"step": 371
},
{
"epoch": 1.0255438294206332,
"grad_norm": 0.12007389217615128,
"learning_rate": 1.530678089385782e-05,
"loss": 0.0256,
"step": 372
},
{
"epoch": 1.0283006676717639,
"grad_norm": 0.13402129709720612,
"learning_rate": 1.5281470223619995e-05,
"loss": 0.0254,
"step": 373
},
{
"epoch": 1.0310575059228946,
"grad_norm": 0.1433870643377304,
"learning_rate": 1.525611254259574e-05,
"loss": 0.0251,
"step": 374
},
{
"epoch": 1.0338143441740255,
"grad_norm": 0.116049624979496,
"learning_rate": 1.5230708076495777e-05,
"loss": 0.0246,
"step": 375
},
{
"epoch": 1.0365711824251562,
"grad_norm": 0.10888892412185669,
"learning_rate": 1.5205257051447291e-05,
"loss": 0.0248,
"step": 376
},
{
"epoch": 1.039328020676287,
"grad_norm": 0.1329452395439148,
"learning_rate": 1.5179759693991869e-05,
"loss": 0.0248,
"step": 377
},
{
"epoch": 1.0420848589274176,
"grad_norm": 0.12392039597034454,
"learning_rate": 1.5154216231083522e-05,
"loss": 0.0255,
"step": 378
},
{
"epoch": 1.0448416971785484,
"grad_norm": 0.11657357215881348,
"learning_rate": 1.5128626890086647e-05,
"loss": 0.0247,
"step": 379
},
{
"epoch": 1.047598535429679,
"grad_norm": 0.11587107181549072,
"learning_rate": 1.5102991898774e-05,
"loss": 0.0256,
"step": 380
},
{
"epoch": 1.0503553736808098,
"grad_norm": 0.5603076219558716,
"learning_rate": 1.507731148532468e-05,
"loss": 0.0252,
"step": 381
},
{
"epoch": 1.0531122119319405,
"grad_norm": 0.11775552481412888,
"learning_rate": 1.505158587832208e-05,
"loss": 0.0246,
"step": 382
},
{
"epoch": 1.0558690501830712,
"grad_norm": 0.10402605682611465,
"learning_rate": 1.5025815306751888e-05,
"loss": 0.0238,
"step": 383
},
{
"epoch": 1.058625888434202,
"grad_norm": 0.1355026811361313,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0262,
"step": 384
},
{
"epoch": 1.0613827266853328,
"grad_norm": 0.11940840631723404,
"learning_rate": 1.497414018785052e-05,
"loss": 0.0246,
"step": 385
},
{
"epoch": 1.0641395649364636,
"grad_norm": 0.12173695117235184,
"learning_rate": 1.4948236100483688e-05,
"loss": 0.025,
"step": 386
},
{
"epoch": 1.0668964031875943,
"grad_norm": 0.11082287132740021,
"learning_rate": 1.492228796847385e-05,
"loss": 0.0258,
"step": 387
},
{
"epoch": 1.069653241438725,
"grad_norm": 0.10860753059387207,
"learning_rate": 1.4896296022787386e-05,
"loss": 0.0244,
"step": 388
},
{
"epoch": 1.0724100796898557,
"grad_norm": 0.10953570157289505,
"learning_rate": 1.4870260494780679e-05,
"loss": 0.0246,
"step": 389
},
{
"epoch": 1.0751669179409864,
"grad_norm": 0.10230018198490143,
"learning_rate": 1.4844181616198028e-05,
"loss": 0.0249,
"step": 390
},
{
"epoch": 1.077923756192117,
"grad_norm": 0.10772014409303665,
"learning_rate": 1.4818059619169606e-05,
"loss": 0.0247,
"step": 391
},
{
"epoch": 1.0806805944432478,
"grad_norm": 0.11978469043970108,
"learning_rate": 1.479189473620939e-05,
"loss": 0.025,
"step": 392
},
{
"epoch": 1.0834374326943785,
"grad_norm": 0.10893766582012177,
"learning_rate": 1.4765687200213079e-05,
"loss": 0.0239,
"step": 393
},
{
"epoch": 1.0861942709455095,
"grad_norm": 0.1162242516875267,
"learning_rate": 1.4739437244456039e-05,
"loss": 0.024,
"step": 394
},
{
"epoch": 1.0889511091966402,
"grad_norm": 0.11603809893131256,
"learning_rate": 1.4713145102591209e-05,
"loss": 0.0255,
"step": 395
},
{
"epoch": 1.0917079474477709,
"grad_norm": 0.11185269802808762,
"learning_rate": 1.4686811008647037e-05,
"loss": 0.0244,
"step": 396
},
{
"epoch": 1.0944647856989016,
"grad_norm": 0.11362854391336441,
"learning_rate": 1.4660435197025391e-05,
"loss": 0.0248,
"step": 397
},
{
"epoch": 1.0972216239500323,
"grad_norm": 0.12096700072288513,
"learning_rate": 1.463401790249946e-05,
"loss": 0.0247,
"step": 398
},
{
"epoch": 1.099978462201163,
"grad_norm": 0.12074792385101318,
"learning_rate": 1.4607559360211688e-05,
"loss": 0.025,
"step": 399
},
{
"epoch": 1.1027353004522937,
"grad_norm": 0.13158197700977325,
"learning_rate": 1.4581059805671662e-05,
"loss": 0.026,
"step": 400
},
{
"epoch": 1.1054921387034244,
"grad_norm": 0.1625628024339676,
"learning_rate": 1.4554519474754025e-05,
"loss": 0.0249,
"step": 401
},
{
"epoch": 1.1082489769545552,
"grad_norm": 0.11231596022844315,
"learning_rate": 1.4527938603696376e-05,
"loss": 0.0246,
"step": 402
},
{
"epoch": 1.111005815205686,
"grad_norm": 0.10914017260074615,
"learning_rate": 1.4501317429097155e-05,
"loss": 0.0247,
"step": 403
},
{
"epoch": 1.1137626534568168,
"grad_norm": 0.11855874955654144,
"learning_rate": 1.4474656187913558e-05,
"loss": 0.0255,
"step": 404
},
{
"epoch": 1.1165194917079475,
"grad_norm": 0.10382278263568878,
"learning_rate": 1.4447955117459414e-05,
"loss": 0.0235,
"step": 405
},
{
"epoch": 1.1192763299590782,
"grad_norm": 0.11612187325954437,
"learning_rate": 1.4421214455403071e-05,
"loss": 0.0253,
"step": 406
},
{
"epoch": 1.122033168210209,
"grad_norm": 0.11434487253427505,
"learning_rate": 1.4394434439765295e-05,
"loss": 0.0256,
"step": 407
},
{
"epoch": 1.1247900064613396,
"grad_norm": 0.14260295033454895,
"learning_rate": 1.436761530891713e-05,
"loss": 0.0246,
"step": 408
},
{
"epoch": 1.1275468447124704,
"grad_norm": 0.11852490156888962,
"learning_rate": 1.4340757301577787e-05,
"loss": 0.0242,
"step": 409
},
{
"epoch": 1.130303682963601,
"grad_norm": 0.19492119550704956,
"learning_rate": 1.4313860656812537e-05,
"loss": 0.0254,
"step": 410
},
{
"epoch": 1.1330605212147318,
"grad_norm": 0.14018341898918152,
"learning_rate": 1.4286925614030542e-05,
"loss": 0.0265,
"step": 411
},
{
"epoch": 1.1358173594658627,
"grad_norm": 0.11521740257740021,
"learning_rate": 1.425995241298277e-05,
"loss": 0.0247,
"step": 412
},
{
"epoch": 1.1385741977169934,
"grad_norm": 0.10554037988185883,
"learning_rate": 1.423294129375982e-05,
"loss": 0.0251,
"step": 413
},
{
"epoch": 1.1413310359681241,
"grad_norm": 0.12840139865875244,
"learning_rate": 1.4205892496789816e-05,
"loss": 0.0248,
"step": 414
},
{
"epoch": 1.1440878742192548,
"grad_norm": 0.11721502244472504,
"learning_rate": 1.4178806262836252e-05,
"loss": 0.0259,
"step": 415
},
{
"epoch": 1.1468447124703856,
"grad_norm": 0.14833590388298035,
"learning_rate": 1.4151682832995846e-05,
"loss": 0.0238,
"step": 416
},
{
"epoch": 1.1496015507215163,
"grad_norm": 0.10142626613378525,
"learning_rate": 1.4124522448696407e-05,
"loss": 0.0243,
"step": 417
},
{
"epoch": 1.152358388972647,
"grad_norm": 0.13285695016384125,
"learning_rate": 1.4097325351694674e-05,
"loss": 0.0248,
"step": 418
},
{
"epoch": 1.1551152272237777,
"grad_norm": 0.10552552342414856,
"learning_rate": 1.407009178407417e-05,
"loss": 0.0245,
"step": 419
},
{
"epoch": 1.1578720654749084,
"grad_norm": 0.14879612624645233,
"learning_rate": 1.404282198824305e-05,
"loss": 0.0242,
"step": 420
},
{
"epoch": 1.160628903726039,
"grad_norm": 0.13807976245880127,
"learning_rate": 1.4015516206931932e-05,
"loss": 0.0241,
"step": 421
},
{
"epoch": 1.1633857419771698,
"grad_norm": 0.12726476788520813,
"learning_rate": 1.3988174683191744e-05,
"loss": 0.0244,
"step": 422
},
{
"epoch": 1.1661425802283008,
"grad_norm": 0.11876961588859558,
"learning_rate": 1.396079766039157e-05,
"loss": 0.0241,
"step": 423
},
{
"epoch": 1.1688994184794315,
"grad_norm": 0.13200323283672333,
"learning_rate": 1.393338538221646e-05,
"loss": 0.0254,
"step": 424
},
{
"epoch": 1.1716562567305622,
"grad_norm": 0.1211104691028595,
"learning_rate": 1.3905938092665283e-05,
"loss": 0.0253,
"step": 425
},
{
"epoch": 1.1744130949816929,
"grad_norm": 0.13063450157642365,
"learning_rate": 1.387845603604855e-05,
"loss": 0.0251,
"step": 426
},
{
"epoch": 1.1771699332328236,
"grad_norm": 0.12195156514644623,
"learning_rate": 1.385093945698623e-05,
"loss": 0.0243,
"step": 427
},
{
"epoch": 1.1799267714839543,
"grad_norm": 0.15483883023262024,
"learning_rate": 1.382338860040558e-05,
"loss": 0.0245,
"step": 428
},
{
"epoch": 1.182683609735085,
"grad_norm": 0.11185739189386368,
"learning_rate": 1.3795803711538966e-05,
"loss": 0.0248,
"step": 429
},
{
"epoch": 1.1854404479862157,
"grad_norm": 0.11978735029697418,
"learning_rate": 1.3768185035921677e-05,
"loss": 0.0242,
"step": 430
},
{
"epoch": 1.1881972862373464,
"grad_norm": 0.11775299161672592,
"learning_rate": 1.374053281938974e-05,
"loss": 0.0245,
"step": 431
},
{
"epoch": 1.1909541244884774,
"grad_norm": 0.10916128754615784,
"learning_rate": 1.3712847308077737e-05,
"loss": 0.024,
"step": 432
},
{
"epoch": 1.193710962739608,
"grad_norm": 0.12981654703617096,
"learning_rate": 1.3685128748416603e-05,
"loss": 0.0251,
"step": 433
},
{
"epoch": 1.1964678009907388,
"grad_norm": 0.1111079528927803,
"learning_rate": 1.3657377387131443e-05,
"loss": 0.0242,
"step": 434
},
{
"epoch": 1.1992246392418695,
"grad_norm": 0.2875465154647827,
"learning_rate": 1.3629593471239328e-05,
"loss": 0.0246,
"step": 435
},
{
"epoch": 1.2019814774930002,
"grad_norm": 0.10963843017816544,
"learning_rate": 1.3601777248047105e-05,
"loss": 0.0245,
"step": 436
},
{
"epoch": 1.204738315744131,
"grad_norm": 0.15255135297775269,
"learning_rate": 1.3573928965149188e-05,
"loss": 0.0254,
"step": 437
},
{
"epoch": 1.2074951539952616,
"grad_norm": 0.12013466656208038,
"learning_rate": 1.3546048870425356e-05,
"loss": 0.0252,
"step": 438
},
{
"epoch": 1.2102519922463923,
"grad_norm": 0.11045455187559128,
"learning_rate": 1.3518137212038554e-05,
"loss": 0.0233,
"step": 439
},
{
"epoch": 1.213008830497523,
"grad_norm": 0.11393214017152786,
"learning_rate": 1.3490194238432665e-05,
"loss": 0.0253,
"step": 440
},
{
"epoch": 1.215765668748654,
"grad_norm": 0.39521655440330505,
"learning_rate": 1.346222019833033e-05,
"loss": 0.0243,
"step": 441
},
{
"epoch": 1.2185225069997847,
"grad_norm": 0.11597371101379395,
"learning_rate": 1.3434215340730692e-05,
"loss": 0.0247,
"step": 442
},
{
"epoch": 1.2212793452509154,
"grad_norm": 0.11905563622713089,
"learning_rate": 1.340617991490722e-05,
"loss": 0.0246,
"step": 443
},
{
"epoch": 1.2240361835020461,
"grad_norm": 0.11847221851348877,
"learning_rate": 1.3378114170405473e-05,
"loss": 0.0244,
"step": 444
},
{
"epoch": 1.2267930217531768,
"grad_norm": 0.11068813502788544,
"learning_rate": 1.335001835704087e-05,
"loss": 0.0252,
"step": 445
},
{
"epoch": 1.2295498600043075,
"grad_norm": 0.1128387600183487,
"learning_rate": 1.3321892724896483e-05,
"loss": 0.025,
"step": 446
},
{
"epoch": 1.2323066982554383,
"grad_norm": 0.10336292535066605,
"learning_rate": 1.3293737524320798e-05,
"loss": 0.0245,
"step": 447
},
{
"epoch": 1.235063536506569,
"grad_norm": 0.11005783826112747,
"learning_rate": 1.3265553005925494e-05,
"loss": 0.0253,
"step": 448
},
{
"epoch": 1.2378203747576997,
"grad_norm": 0.125181645154953,
"learning_rate": 1.3237339420583213e-05,
"loss": 0.0248,
"step": 449
},
{
"epoch": 1.2405772130088306,
"grad_norm": 0.1266736388206482,
"learning_rate": 1.3209097019425317e-05,
"loss": 0.0255,
"step": 450
},
{
"epoch": 1.2433340512599613,
"grad_norm": 0.13304491341114044,
"learning_rate": 1.3180826053839668e-05,
"loss": 0.0249,
"step": 451
},
{
"epoch": 1.246090889511092,
"grad_norm": 0.09845131635665894,
"learning_rate": 1.315252677546838e-05,
"loss": 0.0242,
"step": 452
},
{
"epoch": 1.2488477277622227,
"grad_norm": 0.11469008028507233,
"learning_rate": 1.3124199436205575e-05,
"loss": 0.0252,
"step": 453
},
{
"epoch": 1.2516045660133535,
"grad_norm": 0.11134012043476105,
"learning_rate": 1.309584428819516e-05,
"loss": 0.0235,
"step": 454
},
{
"epoch": 1.2543614042644842,
"grad_norm": 0.10330889374017715,
"learning_rate": 1.3067461583828553e-05,
"loss": 0.0251,
"step": 455
},
{
"epoch": 1.2571182425156149,
"grad_norm": 0.10626427829265594,
"learning_rate": 1.303905157574247e-05,
"loss": 0.0232,
"step": 456
},
{
"epoch": 1.2598750807667456,
"grad_norm": 0.10066665709018707,
"learning_rate": 1.3010614516816652e-05,
"loss": 0.0244,
"step": 457
},
{
"epoch": 1.2626319190178763,
"grad_norm": 0.0952746570110321,
"learning_rate": 1.2982150660171613e-05,
"loss": 0.0231,
"step": 458
},
{
"epoch": 1.2653887572690072,
"grad_norm": 0.11685788631439209,
"learning_rate": 1.2953660259166413e-05,
"loss": 0.024,
"step": 459
},
{
"epoch": 1.2681455955201377,
"grad_norm": 0.10656964778900146,
"learning_rate": 1.2925143567396374e-05,
"loss": 0.0253,
"step": 460
},
{
"epoch": 1.2709024337712687,
"grad_norm": 0.12450357526540756,
"learning_rate": 1.2896600838690838e-05,
"loss": 0.0241,
"step": 461
},
{
"epoch": 1.2736592720223994,
"grad_norm": 0.10607194900512695,
"learning_rate": 1.2868032327110904e-05,
"loss": 0.0238,
"step": 462
},
{
"epoch": 1.27641611027353,
"grad_norm": 0.12448973953723907,
"learning_rate": 1.2839438286947163e-05,
"loss": 0.0245,
"step": 463
},
{
"epoch": 1.2791729485246608,
"grad_norm": 0.11434370279312134,
"learning_rate": 1.2810818972717438e-05,
"loss": 0.0236,
"step": 464
},
{
"epoch": 1.2819297867757915,
"grad_norm": 0.11489138007164001,
"learning_rate": 1.2782174639164528e-05,
"loss": 0.0241,
"step": 465
},
{
"epoch": 1.2846866250269222,
"grad_norm": 0.11225125193595886,
"learning_rate": 1.2753505541253917e-05,
"loss": 0.0238,
"step": 466
},
{
"epoch": 1.287443463278053,
"grad_norm": 0.11676695942878723,
"learning_rate": 1.272481193417153e-05,
"loss": 0.0251,
"step": 467
},
{
"epoch": 1.2902003015291836,
"grad_norm": 0.10919535160064697,
"learning_rate": 1.269609407332144e-05,
"loss": 0.0248,
"step": 468
},
{
"epoch": 1.2929571397803143,
"grad_norm": 0.11963889002799988,
"learning_rate": 1.2667352214323614e-05,
"loss": 0.0253,
"step": 469
},
{
"epoch": 1.2957139780314453,
"grad_norm": 0.11035304516553879,
"learning_rate": 1.2638586613011625e-05,
"loss": 0.0241,
"step": 470
},
{
"epoch": 1.298470816282576,
"grad_norm": 0.1180749386548996,
"learning_rate": 1.2609797525430374e-05,
"loss": 0.0257,
"step": 471
},
{
"epoch": 1.3012276545337067,
"grad_norm": 0.11271151155233383,
"learning_rate": 1.258098520783382e-05,
"loss": 0.024,
"step": 472
},
{
"epoch": 1.3039844927848374,
"grad_norm": 0.10727877169847488,
"learning_rate": 1.2552149916682695e-05,
"loss": 0.0244,
"step": 473
},
{
"epoch": 1.3067413310359681,
"grad_norm": 0.10205347836017609,
"learning_rate": 1.2523291908642219e-05,
"loss": 0.0235,
"step": 474
},
{
"epoch": 1.3094981692870988,
"grad_norm": 0.10410701483488083,
"learning_rate": 1.2494411440579814e-05,
"loss": 0.0245,
"step": 475
},
{
"epoch": 1.3122550075382295,
"grad_norm": 0.10330906510353088,
"learning_rate": 1.2465508769562824e-05,
"loss": 0.0246,
"step": 476
},
{
"epoch": 1.3150118457893603,
"grad_norm": 0.10153473168611526,
"learning_rate": 1.243658415285622e-05,
"loss": 0.024,
"step": 477
},
{
"epoch": 1.317768684040491,
"grad_norm": 0.13269540667533875,
"learning_rate": 1.240763784792032e-05,
"loss": 0.0265,
"step": 478
},
{
"epoch": 1.320525522291622,
"grad_norm": 0.1011882796883583,
"learning_rate": 1.2378670112408482e-05,
"loss": 0.0244,
"step": 479
},
{
"epoch": 1.3232823605427526,
"grad_norm": 0.10707399994134903,
"learning_rate": 1.2349681204164823e-05,
"loss": 0.0239,
"step": 480
},
{
"epoch": 1.3260391987938833,
"grad_norm": 0.1126994714140892,
"learning_rate": 1.2320671381221923e-05,
"loss": 0.0256,
"step": 481
},
{
"epoch": 1.328796037045014,
"grad_norm": 0.10313718020915985,
"learning_rate": 1.229164090179852e-05,
"loss": 0.0243,
"step": 482
},
{
"epoch": 1.3315528752961447,
"grad_norm": 0.1096411719918251,
"learning_rate": 1.2262590024297226e-05,
"loss": 0.0242,
"step": 483
},
{
"epoch": 1.3343097135472755,
"grad_norm": 0.11218464374542236,
"learning_rate": 1.2233519007302201e-05,
"loss": 0.0243,
"step": 484
},
{
"epoch": 1.3370665517984062,
"grad_norm": 0.09545588493347168,
"learning_rate": 1.2204428109576888e-05,
"loss": 0.0235,
"step": 485
},
{
"epoch": 1.3398233900495369,
"grad_norm": 0.10384158790111542,
"learning_rate": 1.2175317590061676e-05,
"loss": 0.0239,
"step": 486
},
{
"epoch": 1.3425802283006676,
"grad_norm": 0.10288365185260773,
"learning_rate": 1.2146187707871617e-05,
"loss": 0.0246,
"step": 487
},
{
"epoch": 1.3453370665517985,
"grad_norm": 0.12427128106355667,
"learning_rate": 1.211703872229411e-05,
"loss": 0.0247,
"step": 488
},
{
"epoch": 1.348093904802929,
"grad_norm": 0.11715073883533478,
"learning_rate": 1.2087870892786588e-05,
"loss": 0.0245,
"step": 489
},
{
"epoch": 1.35085074305406,
"grad_norm": 0.10031214356422424,
"learning_rate": 1.2058684478974226e-05,
"loss": 0.0237,
"step": 490
},
{
"epoch": 1.3536075813051907,
"grad_norm": 0.13713191449642181,
"learning_rate": 1.2029479740647613e-05,
"loss": 0.0226,
"step": 491
},
{
"epoch": 1.3563644195563214,
"grad_norm": 0.10992252826690674,
"learning_rate": 1.2000256937760446e-05,
"loss": 0.0254,
"step": 492
},
{
"epoch": 1.359121257807452,
"grad_norm": 0.12964554131031036,
"learning_rate": 1.1971016330427217e-05,
"loss": 0.0249,
"step": 493
},
{
"epoch": 1.3618780960585828,
"grad_norm": 0.10349434614181519,
"learning_rate": 1.1941758178920899e-05,
"loss": 0.0243,
"step": 494
},
{
"epoch": 1.3646349343097135,
"grad_norm": 0.13970984518527985,
"learning_rate": 1.1912482743670624e-05,
"loss": 0.024,
"step": 495
},
{
"epoch": 1.3673917725608442,
"grad_norm": 0.10153508186340332,
"learning_rate": 1.188319028525937e-05,
"loss": 0.0237,
"step": 496
},
{
"epoch": 1.3701486108119751,
"grad_norm": 0.12652255594730377,
"learning_rate": 1.1853881064421634e-05,
"loss": 0.0248,
"step": 497
},
{
"epoch": 1.3729054490631056,
"grad_norm": 0.1191074326634407,
"learning_rate": 1.1824555342041129e-05,
"loss": 0.0244,
"step": 498
},
{
"epoch": 1.3756622873142366,
"grad_norm": 0.11915133893489838,
"learning_rate": 1.1795213379148436e-05,
"loss": 0.0232,
"step": 499
},
{
"epoch": 1.3784191255653673,
"grad_norm": 0.11027630418539047,
"learning_rate": 1.17658554369187e-05,
"loss": 0.0255,
"step": 500
},
{
"epoch": 1.381175963816498,
"grad_norm": 0.1299276500940323,
"learning_rate": 1.1736481776669307e-05,
"loss": 0.0243,
"step": 501
},
{
"epoch": 1.3839328020676287,
"grad_norm": 0.10648605227470398,
"learning_rate": 1.1707092659857531e-05,
"loss": 0.024,
"step": 502
},
{
"epoch": 1.3866896403187594,
"grad_norm": 0.12794575095176697,
"learning_rate": 1.1677688348078244e-05,
"loss": 0.0241,
"step": 503
},
{
"epoch": 1.3894464785698901,
"grad_norm": 0.11318343877792358,
"learning_rate": 1.1648269103061567e-05,
"loss": 0.0239,
"step": 504
},
{
"epoch": 1.3922033168210208,
"grad_norm": 0.10124088078737259,
"learning_rate": 1.1618835186670532e-05,
"loss": 0.0231,
"step": 505
},
{
"epoch": 1.3949601550721518,
"grad_norm": 0.10141489654779434,
"learning_rate": 1.1589386860898773e-05,
"loss": 0.0246,
"step": 506
},
{
"epoch": 1.3977169933232823,
"grad_norm": 0.11699099838733673,
"learning_rate": 1.155992438786818e-05,
"loss": 0.0237,
"step": 507
},
{
"epoch": 1.4004738315744132,
"grad_norm": 0.10175668448209763,
"learning_rate": 1.1530448029826566e-05,
"loss": 0.023,
"step": 508
},
{
"epoch": 1.403230669825544,
"grad_norm": 0.12864287197589874,
"learning_rate": 1.1500958049145342e-05,
"loss": 0.0247,
"step": 509
},
{
"epoch": 1.4059875080766746,
"grad_norm": 0.8590685129165649,
"learning_rate": 1.1471454708317163e-05,
"loss": 0.0278,
"step": 510
},
{
"epoch": 1.4087443463278053,
"grad_norm": 0.11892825365066528,
"learning_rate": 1.1441938269953619e-05,
"loss": 0.0251,
"step": 511
},
{
"epoch": 1.411501184578936,
"grad_norm": 0.12736280262470245,
"learning_rate": 1.1412408996782871e-05,
"loss": 0.024,
"step": 512
},
{
"epoch": 1.4142580228300667,
"grad_norm": 0.09979959577322006,
"learning_rate": 1.1382867151647333e-05,
"loss": 0.0234,
"step": 513
},
{
"epoch": 1.4170148610811975,
"grad_norm": 0.11944937705993652,
"learning_rate": 1.1353312997501313e-05,
"loss": 0.0241,
"step": 514
},
{
"epoch": 1.4197716993323282,
"grad_norm": 0.09399650245904922,
"learning_rate": 1.1323746797408687e-05,
"loss": 0.0229,
"step": 515
},
{
"epoch": 1.4225285375834589,
"grad_norm": 0.11699047684669495,
"learning_rate": 1.1294168814540554e-05,
"loss": 0.025,
"step": 516
},
{
"epoch": 1.4252853758345898,
"grad_norm": 0.137738898396492,
"learning_rate": 1.1264579312172896e-05,
"loss": 0.0253,
"step": 517
},
{
"epoch": 1.4280422140857205,
"grad_norm": 0.11010394990444183,
"learning_rate": 1.123497855368422e-05,
"loss": 0.0245,
"step": 518
},
{
"epoch": 1.4307990523368512,
"grad_norm": 0.09499143809080124,
"learning_rate": 1.1205366802553231e-05,
"loss": 0.0232,
"step": 519
},
{
"epoch": 1.433555890587982,
"grad_norm": 0.11812635511159897,
"learning_rate": 1.1175744322356488e-05,
"loss": 0.0238,
"step": 520
},
{
"epoch": 1.4363127288391127,
"grad_norm": 0.10630928725004196,
"learning_rate": 1.1146111376766033e-05,
"loss": 0.0247,
"step": 521
},
{
"epoch": 1.4390695670902434,
"grad_norm": 0.11619951575994492,
"learning_rate": 1.1116468229547079e-05,
"loss": 0.024,
"step": 522
},
{
"epoch": 1.441826405341374,
"grad_norm": 0.09959083050489426,
"learning_rate": 1.1086815144555633e-05,
"loss": 0.0244,
"step": 523
},
{
"epoch": 1.4445832435925048,
"grad_norm": 0.10520867258310318,
"learning_rate": 1.105715238573616e-05,
"loss": 0.0248,
"step": 524
},
{
"epoch": 1.4473400818436355,
"grad_norm": 0.09931975603103638,
"learning_rate": 1.1027480217119245e-05,
"loss": 0.0237,
"step": 525
},
{
"epoch": 1.4500969200947664,
"grad_norm": 0.10330688953399658,
"learning_rate": 1.0997798902819208e-05,
"loss": 0.0245,
"step": 526
},
{
"epoch": 1.452853758345897,
"grad_norm": 0.09967660158872604,
"learning_rate": 1.0968108707031792e-05,
"loss": 0.0245,
"step": 527
},
{
"epoch": 1.4556105965970279,
"grad_norm": 0.11198758333921432,
"learning_rate": 1.0938409894031793e-05,
"loss": 0.0243,
"step": 528
},
{
"epoch": 1.4583674348481586,
"grad_norm": 0.8267385959625244,
"learning_rate": 1.0908702728170706e-05,
"loss": 0.0254,
"step": 529
},
{
"epoch": 1.4611242730992893,
"grad_norm": 0.09910894185304642,
"learning_rate": 1.0878987473874381e-05,
"loss": 0.0238,
"step": 530
},
{
"epoch": 1.46388111135042,
"grad_norm": 0.10041078925132751,
"learning_rate": 1.084926439564065e-05,
"loss": 0.0246,
"step": 531
},
{
"epoch": 1.4666379496015507,
"grad_norm": 0.1009271889925003,
"learning_rate": 1.0819533758037002e-05,
"loss": 0.0241,
"step": 532
},
{
"epoch": 1.4693947878526814,
"grad_norm": 0.10631167888641357,
"learning_rate": 1.0789795825698206e-05,
"loss": 0.0249,
"step": 533
},
{
"epoch": 1.4721516261038121,
"grad_norm": 0.1118827611207962,
"learning_rate": 1.0760050863323961e-05,
"loss": 0.0244,
"step": 534
},
{
"epoch": 1.474908464354943,
"grad_norm": 0.11578582972288132,
"learning_rate": 1.0730299135676545e-05,
"loss": 0.0239,
"step": 535
},
{
"epoch": 1.4776653026060735,
"grad_norm": 0.102451853454113,
"learning_rate": 1.0700540907578447e-05,
"loss": 0.0228,
"step": 536
},
{
"epoch": 1.4804221408572045,
"grad_norm": 0.09739474207162857,
"learning_rate": 1.0670776443910024e-05,
"loss": 0.0235,
"step": 537
},
{
"epoch": 1.4831789791083352,
"grad_norm": 0.1013341024518013,
"learning_rate": 1.0641006009607137e-05,
"loss": 0.0241,
"step": 538
},
{
"epoch": 1.485935817359466,
"grad_norm": 0.10632984340190887,
"learning_rate": 1.0611229869658785e-05,
"loss": 0.0235,
"step": 539
},
{
"epoch": 1.4886926556105966,
"grad_norm": 0.115880087018013,
"learning_rate": 1.0581448289104759e-05,
"loss": 0.0238,
"step": 540
},
{
"epoch": 1.4914494938617273,
"grad_norm": 0.1438998281955719,
"learning_rate": 1.0551661533033274e-05,
"loss": 0.0249,
"step": 541
},
{
"epoch": 1.494206332112858,
"grad_norm": 0.10747671127319336,
"learning_rate": 1.052186986657862e-05,
"loss": 0.0239,
"step": 542
},
{
"epoch": 1.4969631703639887,
"grad_norm": 0.11497830599546432,
"learning_rate": 1.0492073554918782e-05,
"loss": 0.024,
"step": 543
},
{
"epoch": 1.4997200086151197,
"grad_norm": 0.11187911778688431,
"learning_rate": 1.0462272863273105e-05,
"loss": 0.0237,
"step": 544
},
{
"epoch": 1.5024768468662502,
"grad_norm": 0.10226496309041977,
"learning_rate": 1.0432468056899909e-05,
"loss": 0.0226,
"step": 545
},
{
"epoch": 1.505233685117381,
"grad_norm": 0.12337212264537811,
"learning_rate": 1.0402659401094154e-05,
"loss": 0.0237,
"step": 546
},
{
"epoch": 1.5079905233685116,
"grad_norm": 0.1370334029197693,
"learning_rate": 1.0372847161185047e-05,
"loss": 0.0246,
"step": 547
},
{
"epoch": 1.5107473616196425,
"grad_norm": 0.1170111671090126,
"learning_rate": 1.0343031602533713e-05,
"loss": 0.0232,
"step": 548
},
{
"epoch": 1.5135041998707732,
"grad_norm": 0.10803765803575516,
"learning_rate": 1.0313212990530804e-05,
"loss": 0.0241,
"step": 549
},
{
"epoch": 1.516261038121904,
"grad_norm": 0.19091671705245972,
"learning_rate": 1.028339159059416e-05,
"loss": 0.025,
"step": 550
},
{
"epoch": 1.5190178763730346,
"grad_norm": 0.11063394695520401,
"learning_rate": 1.0253567668166436e-05,
"loss": 0.024,
"step": 551
},
{
"epoch": 1.5217747146241654,
"grad_norm": 0.11760086566209793,
"learning_rate": 1.0223741488712732e-05,
"loss": 0.0239,
"step": 552
},
{
"epoch": 1.5245315528752963,
"grad_norm": 0.10287030041217804,
"learning_rate": 1.0193913317718245e-05,
"loss": 0.0237,
"step": 553
},
{
"epoch": 1.5272883911264268,
"grad_norm": 0.1050201803445816,
"learning_rate": 1.0164083420685898e-05,
"loss": 0.0244,
"step": 554
},
{
"epoch": 1.5300452293775577,
"grad_norm": 0.10405981540679932,
"learning_rate": 1.0134252063133976e-05,
"loss": 0.0231,
"step": 555
},
{
"epoch": 1.5328020676286882,
"grad_norm": 0.09600961208343506,
"learning_rate": 1.0104419510593764e-05,
"loss": 0.0222,
"step": 556
},
{
"epoch": 1.5355589058798191,
"grad_norm": 0.2610664367675781,
"learning_rate": 1.0074586028607184e-05,
"loss": 0.0242,
"step": 557
},
{
"epoch": 1.5383157441309498,
"grad_norm": 0.10342807322740555,
"learning_rate": 1.0044751882724436e-05,
"loss": 0.0226,
"step": 558
},
{
"epoch": 1.5410725823820806,
"grad_norm": 0.11091192066669464,
"learning_rate": 1.0014917338501618e-05,
"loss": 0.0234,
"step": 559
},
{
"epoch": 1.5438294206332113,
"grad_norm": 0.10212964564561844,
"learning_rate": 9.985082661498384e-06,
"loss": 0.0241,
"step": 560
},
{
"epoch": 1.546586258884342,
"grad_norm": 0.12537163496017456,
"learning_rate": 9.955248117275566e-06,
"loss": 0.026,
"step": 561
},
{
"epoch": 1.549343097135473,
"grad_norm": 0.10132145881652832,
"learning_rate": 9.925413971392816e-06,
"loss": 0.0237,
"step": 562
},
{
"epoch": 1.5520999353866034,
"grad_norm": 0.11508084088563919,
"learning_rate": 9.89558048940624e-06,
"loss": 0.0235,
"step": 563
},
{
"epoch": 1.5548567736377343,
"grad_norm": 0.11934718489646912,
"learning_rate": 9.865747936866027e-06,
"loss": 0.0234,
"step": 564
},
{
"epoch": 1.5576136118888648,
"grad_norm": 0.10267031192779541,
"learning_rate": 9.835916579314105e-06,
"loss": 0.0235,
"step": 565
},
{
"epoch": 1.5603704501399958,
"grad_norm": 0.10713180154561996,
"learning_rate": 9.806086682281759e-06,
"loss": 0.0235,
"step": 566
},
{
"epoch": 1.5631272883911265,
"grad_norm": 0.11230739206075668,
"learning_rate": 9.776258511287271e-06,
"loss": 0.0237,
"step": 567
},
{
"epoch": 1.5658841266422572,
"grad_norm": 0.11151719093322754,
"learning_rate": 9.746432331833568e-06,
"loss": 0.0232,
"step": 568
},
{
"epoch": 1.568640964893388,
"grad_norm": 0.10779338330030441,
"learning_rate": 9.716608409405843e-06,
"loss": 0.0236,
"step": 569
},
{
"epoch": 1.5713978031445186,
"grad_norm": 0.10462480783462524,
"learning_rate": 9.6867870094692e-06,
"loss": 0.0232,
"step": 570
},
{
"epoch": 1.5741546413956493,
"grad_norm": 0.11873821914196014,
"learning_rate": 9.65696839746629e-06,
"loss": 0.0235,
"step": 571
},
{
"epoch": 1.57691147964678,
"grad_norm": 0.10550446808338165,
"learning_rate": 9.627152838814954e-06,
"loss": 0.0231,
"step": 572
},
{
"epoch": 1.579668317897911,
"grad_norm": 0.10466040670871735,
"learning_rate": 9.597340598905851e-06,
"loss": 0.0241,
"step": 573
},
{
"epoch": 1.5824251561490414,
"grad_norm": 0.09853997081518173,
"learning_rate": 9.567531943100093e-06,
"loss": 0.0227,
"step": 574
},
{
"epoch": 1.5851819944001724,
"grad_norm": 0.4732920825481415,
"learning_rate": 9.537727136726899e-06,
"loss": 0.0275,
"step": 575
},
{
"epoch": 1.587938832651303,
"grad_norm": 0.11485108733177185,
"learning_rate": 9.50792644508122e-06,
"loss": 0.0229,
"step": 576
},
{
"epoch": 1.5906956709024338,
"grad_norm": 0.10848281532526016,
"learning_rate": 9.478130133421381e-06,
"loss": 0.0237,
"step": 577
},
{
"epoch": 1.5934525091535645,
"grad_norm": 0.10901174694299698,
"learning_rate": 9.448338466966727e-06,
"loss": 0.0233,
"step": 578
},
{
"epoch": 1.5962093474046952,
"grad_norm": 0.10002460330724716,
"learning_rate": 9.418551710895243e-06,
"loss": 0.0227,
"step": 579
},
{
"epoch": 1.598966185655826,
"grad_norm": 0.10787548869848251,
"learning_rate": 9.388770130341217e-06,
"loss": 0.0235,
"step": 580
},
{
"epoch": 1.6017230239069566,
"grad_norm": 0.10888142138719559,
"learning_rate": 9.358993990392864e-06,
"loss": 0.0235,
"step": 581
},
{
"epoch": 1.6044798621580876,
"grad_norm": 0.11493578553199768,
"learning_rate": 9.329223556089976e-06,
"loss": 0.0244,
"step": 582
},
{
"epoch": 1.607236700409218,
"grad_norm": 0.1134362742304802,
"learning_rate": 9.299459092421558e-06,
"loss": 0.0251,
"step": 583
},
{
"epoch": 1.609993538660349,
"grad_norm": 0.10327161103487015,
"learning_rate": 9.269700864323462e-06,
"loss": 0.024,
"step": 584
},
{
"epoch": 1.6127503769114795,
"grad_norm": 0.10661885142326355,
"learning_rate": 9.239949136676042e-06,
"loss": 0.0237,
"step": 585
},
{
"epoch": 1.6155072151626104,
"grad_norm": 0.11634720116853714,
"learning_rate": 9.210204174301797e-06,
"loss": 0.0239,
"step": 586
},
{
"epoch": 1.6182640534137411,
"grad_norm": 0.10218880325555801,
"learning_rate": 9.180466241963e-06,
"loss": 0.0231,
"step": 587
},
{
"epoch": 1.6210208916648718,
"grad_norm": 0.10723837465047836,
"learning_rate": 9.15073560435935e-06,
"loss": 0.0233,
"step": 588
},
{
"epoch": 1.6237777299160026,
"grad_norm": 0.10045973211526871,
"learning_rate": 9.121012526125625e-06,
"loss": 0.0229,
"step": 589
},
{
"epoch": 1.6265345681671333,
"grad_norm": 0.1160292997956276,
"learning_rate": 9.091297271829296e-06,
"loss": 0.024,
"step": 590
},
{
"epoch": 1.6292914064182642,
"grad_norm": 0.0929265022277832,
"learning_rate": 9.061590105968208e-06,
"loss": 0.022,
"step": 591
},
{
"epoch": 1.6320482446693947,
"grad_norm": 0.09764169901609421,
"learning_rate": 9.03189129296821e-06,
"loss": 0.0237,
"step": 592
},
{
"epoch": 1.6348050829205256,
"grad_norm": 0.10847126692533493,
"learning_rate": 9.002201097180796e-06,
"loss": 0.0229,
"step": 593
},
{
"epoch": 1.637561921171656,
"grad_norm": 0.10012809187173843,
"learning_rate": 8.97251978288076e-06,
"loss": 0.0237,
"step": 594
},
{
"epoch": 1.640318759422787,
"grad_norm": 0.10542436689138412,
"learning_rate": 8.942847614263842e-06,
"loss": 0.0225,
"step": 595
},
{
"epoch": 1.6430755976739178,
"grad_norm": 0.10059002041816711,
"learning_rate": 8.91318485544437e-06,
"loss": 0.0229,
"step": 596
},
{
"epoch": 1.6458324359250485,
"grad_norm": 0.09918685257434845,
"learning_rate": 8.883531770452924e-06,
"loss": 0.023,
"step": 597
},
{
"epoch": 1.6485892741761792,
"grad_norm": 0.7748421430587769,
"learning_rate": 8.853888623233967e-06,
"loss": 0.0313,
"step": 598
},
{
"epoch": 1.6513461124273099,
"grad_norm": 0.1017061397433281,
"learning_rate": 8.824255677643518e-06,
"loss": 0.0237,
"step": 599
},
{
"epoch": 1.6541029506784408,
"grad_norm": 0.10307373106479645,
"learning_rate": 8.79463319744677e-06,
"loss": 0.0238,
"step": 600
},
{
"epoch": 1.6568597889295713,
"grad_norm": 0.09575623273849487,
"learning_rate": 8.765021446315784e-06,
"loss": 0.0227,
"step": 601
},
{
"epoch": 1.6596166271807022,
"grad_norm": 0.10236816108226776,
"learning_rate": 8.735420687827107e-06,
"loss": 0.0242,
"step": 602
},
{
"epoch": 1.6623734654318327,
"grad_norm": 0.1926388442516327,
"learning_rate": 8.705831185459446e-06,
"loss": 0.0269,
"step": 603
},
{
"epoch": 1.6651303036829637,
"grad_norm": 0.10154667496681213,
"learning_rate": 8.676253202591318e-06,
"loss": 0.0238,
"step": 604
},
{
"epoch": 1.6678871419340944,
"grad_norm": 0.10679133236408234,
"learning_rate": 8.646687002498692e-06,
"loss": 0.0235,
"step": 605
},
{
"epoch": 1.670643980185225,
"grad_norm": 0.1166776716709137,
"learning_rate": 8.617132848352672e-06,
"loss": 0.0228,
"step": 606
},
{
"epoch": 1.6734008184363558,
"grad_norm": 0.10293897241353989,
"learning_rate": 8.58759100321713e-06,
"loss": 0.022,
"step": 607
},
{
"epoch": 1.6761576566874865,
"grad_norm": 0.3156561255455017,
"learning_rate": 8.558061730046385e-06,
"loss": 0.0259,
"step": 608
},
{
"epoch": 1.6789144949386172,
"grad_norm": 0.10844126343727112,
"learning_rate": 8.528545291682839e-06,
"loss": 0.0227,
"step": 609
},
{
"epoch": 1.681671333189748,
"grad_norm": 0.10488175600767136,
"learning_rate": 8.499041950854665e-06,
"loss": 0.0239,
"step": 610
},
{
"epoch": 1.6844281714408789,
"grad_norm": 0.12386349588632584,
"learning_rate": 8.469551970173437e-06,
"loss": 0.0238,
"step": 611
},
{
"epoch": 1.6871850096920094,
"grad_norm": 0.11962109059095383,
"learning_rate": 8.440075612131823e-06,
"loss": 0.0241,
"step": 612
},
{
"epoch": 1.6899418479431403,
"grad_norm": 0.09642872214317322,
"learning_rate": 8.410613139101229e-06,
"loss": 0.0227,
"step": 613
},
{
"epoch": 1.692698686194271,
"grad_norm": 0.10245215147733688,
"learning_rate": 8.38116481332947e-06,
"loss": 0.0224,
"step": 614
},
{
"epoch": 1.6954555244454017,
"grad_norm": 0.10805162042379379,
"learning_rate": 8.351730896938438e-06,
"loss": 0.024,
"step": 615
},
{
"epoch": 1.6982123626965324,
"grad_norm": 0.10161999613046646,
"learning_rate": 8.322311651921759e-06,
"loss": 0.0228,
"step": 616
},
{
"epoch": 1.7009692009476631,
"grad_norm": 0.10508878529071808,
"learning_rate": 8.292907340142472e-06,
"loss": 0.0241,
"step": 617
},
{
"epoch": 1.7037260391987938,
"grad_norm": 0.10359475761651993,
"learning_rate": 8.263518223330698e-06,
"loss": 0.0241,
"step": 618
},
{
"epoch": 1.7064828774499246,
"grad_norm": 0.11690201610326767,
"learning_rate": 8.2341445630813e-06,
"loss": 0.0236,
"step": 619
},
{
"epoch": 1.7092397157010555,
"grad_norm": 0.1021987721323967,
"learning_rate": 8.204786620851569e-06,
"loss": 0.0227,
"step": 620
},
{
"epoch": 1.711996553952186,
"grad_norm": 0.10730181634426117,
"learning_rate": 8.175444657958875e-06,
"loss": 0.0221,
"step": 621
},
{
"epoch": 1.714753392203317,
"grad_norm": 0.10022248327732086,
"learning_rate": 8.146118935578367e-06,
"loss": 0.0239,
"step": 622
},
{
"epoch": 1.7175102304544474,
"grad_norm": 0.11165450513362885,
"learning_rate": 8.116809714740634e-06,
"loss": 0.0229,
"step": 623
},
{
"epoch": 1.7202670687055783,
"grad_norm": 1.2205007076263428,
"learning_rate": 8.087517256329376e-06,
"loss": 0.024,
"step": 624
},
{
"epoch": 1.723023906956709,
"grad_norm": 0.09469296783208847,
"learning_rate": 8.058241821079106e-06,
"loss": 0.0234,
"step": 625
},
{
"epoch": 1.7257807452078398,
"grad_norm": 0.09778100997209549,
"learning_rate": 8.028983669572786e-06,
"loss": 0.0229,
"step": 626
},
{
"epoch": 1.7285375834589705,
"grad_norm": 0.10135500133037567,
"learning_rate": 7.999743062239557e-06,
"loss": 0.0227,
"step": 627
},
{
"epoch": 1.7312944217101012,
"grad_norm": 0.0966990739107132,
"learning_rate": 7.97052025935239e-06,
"loss": 0.0232,
"step": 628
},
{
"epoch": 1.734051259961232,
"grad_norm": 0.09644000977277756,
"learning_rate": 7.941315521025776e-06,
"loss": 0.023,
"step": 629
},
{
"epoch": 1.7368080982123626,
"grad_norm": 0.10050094872713089,
"learning_rate": 7.912129107213417e-06,
"loss": 0.022,
"step": 630
},
{
"epoch": 1.7395649364634935,
"grad_norm": 0.09904894977807999,
"learning_rate": 7.882961277705897e-06,
"loss": 0.0224,
"step": 631
},
{
"epoch": 1.742321774714624,
"grad_norm": 0.1084563285112381,
"learning_rate": 7.853812292128386e-06,
"loss": 0.0228,
"step": 632
},
{
"epoch": 1.745078612965755,
"grad_norm": 0.11320846527814865,
"learning_rate": 7.824682409938328e-06,
"loss": 0.0235,
"step": 633
},
{
"epoch": 1.7478354512168857,
"grad_norm": 0.1111724004149437,
"learning_rate": 7.795571890423116e-06,
"loss": 0.023,
"step": 634
},
{
"epoch": 1.7505922894680164,
"grad_norm": 0.09731971472501755,
"learning_rate": 7.766480992697802e-06,
"loss": 0.0227,
"step": 635
},
{
"epoch": 1.753349127719147,
"grad_norm": 0.11039575189352036,
"learning_rate": 7.73740997570278e-06,
"loss": 0.0225,
"step": 636
},
{
"epoch": 1.7561059659702778,
"grad_norm": 0.10182961076498032,
"learning_rate": 7.708359098201483e-06,
"loss": 0.022,
"step": 637
},
{
"epoch": 1.7588628042214087,
"grad_norm": 0.103904128074646,
"learning_rate": 7.67932861877808e-06,
"loss": 0.0227,
"step": 638
},
{
"epoch": 1.7616196424725392,
"grad_norm": 0.12430708110332489,
"learning_rate": 7.650318795835179e-06,
"loss": 0.0233,
"step": 639
},
{
"epoch": 1.7643764807236701,
"grad_norm": 0.09785618633031845,
"learning_rate": 7.621329887591519e-06,
"loss": 0.0236,
"step": 640
},
{
"epoch": 1.7671333189748006,
"grad_norm": 0.10391392558813095,
"learning_rate": 7.592362152079684e-06,
"loss": 0.0228,
"step": 641
},
{
"epoch": 1.7698901572259316,
"grad_norm": 0.33653295040130615,
"learning_rate": 7.563415847143782e-06,
"loss": 0.0281,
"step": 642
},
{
"epoch": 1.7726469954770623,
"grad_norm": 0.12001726031303406,
"learning_rate": 7.5344912304371785e-06,
"loss": 0.0229,
"step": 643
},
{
"epoch": 1.775403833728193,
"grad_norm": 0.12422462552785873,
"learning_rate": 7.505588559420188e-06,
"loss": 0.0238,
"step": 644
},
{
"epoch": 1.7781606719793237,
"grad_norm": 0.09668347984552383,
"learning_rate": 7.476708091357783e-06,
"loss": 0.0224,
"step": 645
},
{
"epoch": 1.7809175102304544,
"grad_norm": 0.11820239573717117,
"learning_rate": 7.447850083317308e-06,
"loss": 0.023,
"step": 646
},
{
"epoch": 1.7836743484815853,
"grad_norm": 0.10456949472427368,
"learning_rate": 7.419014792166182e-06,
"loss": 0.0239,
"step": 647
},
{
"epoch": 1.7864311867327158,
"grad_norm": 0.11468762159347534,
"learning_rate": 7.39020247456963e-06,
"loss": 0.0229,
"step": 648
},
{
"epoch": 1.7891880249838468,
"grad_norm": 0.09829288721084595,
"learning_rate": 7.361413386988379e-06,
"loss": 0.0231,
"step": 649
},
{
"epoch": 1.7919448632349773,
"grad_norm": 0.10345254838466644,
"learning_rate": 7.332647785676388e-06,
"loss": 0.0231,
"step": 650
},
{
"epoch": 1.7947017014861082,
"grad_norm": 0.09892752021551132,
"learning_rate": 7.303905926678565e-06,
"loss": 0.0234,
"step": 651
},
{
"epoch": 1.797458539737239,
"grad_norm": 0.09165947884321213,
"learning_rate": 7.275188065828476e-06,
"loss": 0.022,
"step": 652
},
{
"epoch": 1.8002153779883696,
"grad_norm": 0.09468812495470047,
"learning_rate": 7.246494458746085e-06,
"loss": 0.0223,
"step": 653
},
{
"epoch": 1.8029722162395003,
"grad_norm": 0.1075858324766159,
"learning_rate": 7.217825360835475e-06,
"loss": 0.0233,
"step": 654
},
{
"epoch": 1.805729054490631,
"grad_norm": 0.10927974432706833,
"learning_rate": 7.189181027282561e-06,
"loss": 0.0232,
"step": 655
},
{
"epoch": 1.8084858927417617,
"grad_norm": 0.10256339609622955,
"learning_rate": 7.160561713052843e-06,
"loss": 0.0237,
"step": 656
},
{
"epoch": 1.8112427309928925,
"grad_norm": 0.11086180061101913,
"learning_rate": 7.131967672889101e-06,
"loss": 0.0234,
"step": 657
},
{
"epoch": 1.8139995692440234,
"grad_norm": 0.10017542541027069,
"learning_rate": 7.103399161309165e-06,
"loss": 0.0225,
"step": 658
},
{
"epoch": 1.8167564074951539,
"grad_norm": 0.10286585986614227,
"learning_rate": 7.074856432603628e-06,
"loss": 0.0225,
"step": 659
},
{
"epoch": 1.8195132457462848,
"grad_norm": 0.10518784821033478,
"learning_rate": 7.04633974083359e-06,
"loss": 0.0234,
"step": 660
},
{
"epoch": 1.8222700839974153,
"grad_norm": 0.10116968303918839,
"learning_rate": 7.017849339828388e-06,
"loss": 0.0233,
"step": 661
},
{
"epoch": 1.8250269222485462,
"grad_norm": 0.10069679468870163,
"learning_rate": 6.989385483183355e-06,
"loss": 0.0234,
"step": 662
},
{
"epoch": 1.827783760499677,
"grad_norm": 0.09656015038490295,
"learning_rate": 6.960948424257532e-06,
"loss": 0.0225,
"step": 663
},
{
"epoch": 1.8305405987508077,
"grad_norm": 0.10670676827430725,
"learning_rate": 6.9325384161714485e-06,
"loss": 0.0231,
"step": 664
},
{
"epoch": 1.8332974370019384,
"grad_norm": 0.12347907572984695,
"learning_rate": 6.904155711804843e-06,
"loss": 0.0235,
"step": 665
},
{
"epoch": 1.836054275253069,
"grad_norm": 0.09916937351226807,
"learning_rate": 6.8758005637944245e-06,
"loss": 0.0229,
"step": 666
},
{
"epoch": 1.8388111135042,
"grad_norm": 0.09796298295259476,
"learning_rate": 6.8474732245316245e-06,
"loss": 0.0234,
"step": 667
},
{
"epoch": 1.8415679517553305,
"grad_norm": 0.1063319593667984,
"learning_rate": 6.819173946160336e-06,
"loss": 0.0232,
"step": 668
},
{
"epoch": 1.8443247900064614,
"grad_norm": 0.09941181540489197,
"learning_rate": 6.7909029805746855e-06,
"loss": 0.0234,
"step": 669
},
{
"epoch": 1.847081628257592,
"grad_norm": 0.09763844311237335,
"learning_rate": 6.762660579416791e-06,
"loss": 0.0237,
"step": 670
},
{
"epoch": 1.8498384665087229,
"grad_norm": 0.094894640147686,
"learning_rate": 6.734446994074507e-06,
"loss": 0.0216,
"step": 671
},
{
"epoch": 1.8525953047598536,
"grad_norm": 0.10713546723127365,
"learning_rate": 6.706262475679205e-06,
"loss": 0.0232,
"step": 672
},
{
"epoch": 1.8553521430109843,
"grad_norm": 0.09470642358064651,
"learning_rate": 6.678107275103519e-06,
"loss": 0.0231,
"step": 673
},
{
"epoch": 1.858108981262115,
"grad_norm": 0.09599259495735168,
"learning_rate": 6.649981642959133e-06,
"loss": 0.023,
"step": 674
},
{
"epoch": 1.8608658195132457,
"grad_norm": 0.09776968508958817,
"learning_rate": 6.62188582959453e-06,
"loss": 0.0224,
"step": 675
},
{
"epoch": 1.8636226577643766,
"grad_norm": 0.09774177521467209,
"learning_rate": 6.593820085092782e-06,
"loss": 0.0222,
"step": 676
},
{
"epoch": 1.8663794960155071,
"grad_norm": 0.10560328513383865,
"learning_rate": 6.565784659269314e-06,
"loss": 0.0233,
"step": 677
},
{
"epoch": 1.869136334266638,
"grad_norm": 0.09616672247648239,
"learning_rate": 6.537779801669677e-06,
"loss": 0.023,
"step": 678
},
{
"epoch": 1.8718931725177685,
"grad_norm": 0.10657691955566406,
"learning_rate": 6.509805761567336e-06,
"loss": 0.0232,
"step": 679
},
{
"epoch": 1.8746500107688995,
"grad_norm": 0.09221483767032623,
"learning_rate": 6.481862787961448e-06,
"loss": 0.0227,
"step": 680
},
{
"epoch": 1.8774068490200302,
"grad_norm": 0.08830998837947845,
"learning_rate": 6.453951129574644e-06,
"loss": 0.0221,
"step": 681
},
{
"epoch": 1.880163687271161,
"grad_norm": 0.08807796984910965,
"learning_rate": 6.4260710348508115e-06,
"loss": 0.0224,
"step": 682
},
{
"epoch": 1.8829205255222916,
"grad_norm": 0.08681437373161316,
"learning_rate": 6.3982227519528986e-06,
"loss": 0.0216,
"step": 683
},
{
"epoch": 1.8856773637734223,
"grad_norm": 0.09876042604446411,
"learning_rate": 6.370406528760675e-06,
"loss": 0.0228,
"step": 684
},
{
"epoch": 1.8884342020245533,
"grad_norm": 0.10514353215694427,
"learning_rate": 6.34262261286856e-06,
"loss": 0.0219,
"step": 685
},
{
"epoch": 1.8911910402756837,
"grad_norm": 0.0905836671590805,
"learning_rate": 6.3148712515833985e-06,
"loss": 0.022,
"step": 686
},
{
"epoch": 1.8939478785268147,
"grad_norm": 0.09052827209234238,
"learning_rate": 6.287152691922264e-06,
"loss": 0.0216,
"step": 687
},
{
"epoch": 1.8967047167779452,
"grad_norm": 0.101600281894207,
"learning_rate": 6.259467180610262e-06,
"loss": 0.0233,
"step": 688
},
{
"epoch": 1.899461555029076,
"grad_norm": 0.10384192317724228,
"learning_rate": 6.231814964078327e-06,
"loss": 0.0233,
"step": 689
},
{
"epoch": 1.9022183932802068,
"grad_norm": 0.1043287143111229,
"learning_rate": 6.204196288461037e-06,
"loss": 0.0224,
"step": 690
},
{
"epoch": 1.9049752315313375,
"grad_norm": 0.41572827100753784,
"learning_rate": 6.176611399594422e-06,
"loss": 0.0241,
"step": 691
},
{
"epoch": 1.9077320697824682,
"grad_norm": 0.0979122593998909,
"learning_rate": 6.149060543013772e-06,
"loss": 0.0222,
"step": 692
},
{
"epoch": 1.910488908033599,
"grad_norm": 0.09667398780584335,
"learning_rate": 6.121543963951453e-06,
"loss": 0.0232,
"step": 693
},
{
"epoch": 1.9132457462847297,
"grad_norm": 0.10258089005947113,
"learning_rate": 6.094061907334718e-06,
"loss": 0.023,
"step": 694
},
{
"epoch": 1.9160025845358604,
"grad_norm": 0.09479233622550964,
"learning_rate": 6.066614617783542e-06,
"loss": 0.0225,
"step": 695
},
{
"epoch": 1.9187594227869913,
"grad_norm": 0.09578879177570343,
"learning_rate": 6.039202339608432e-06,
"loss": 0.0226,
"step": 696
},
{
"epoch": 1.9215162610381218,
"grad_norm": 0.09669813513755798,
"learning_rate": 6.0118253168082555e-06,
"loss": 0.0228,
"step": 697
},
{
"epoch": 1.9242730992892527,
"grad_norm": 0.09967345744371414,
"learning_rate": 5.984483793068072e-06,
"loss": 0.023,
"step": 698
},
{
"epoch": 1.9270299375403832,
"grad_norm": 0.09882698953151703,
"learning_rate": 5.957178011756952e-06,
"loss": 0.0239,
"step": 699
},
{
"epoch": 1.9297867757915141,
"grad_norm": 0.10672547668218613,
"learning_rate": 5.92990821592583e-06,
"loss": 0.0235,
"step": 700
},
{
"epoch": 1.9325436140426449,
"grad_norm": 0.10070157796144485,
"learning_rate": 5.902674648305329e-06,
"loss": 0.0223,
"step": 701
},
{
"epoch": 1.9353004522937756,
"grad_norm": 0.1068674623966217,
"learning_rate": 5.875477551303596e-06,
"loss": 0.0236,
"step": 702
},
{
"epoch": 1.9380572905449063,
"grad_norm": 0.1479007601737976,
"learning_rate": 5.848317167004159e-06,
"loss": 0.024,
"step": 703
},
{
"epoch": 1.940814128796037,
"grad_norm": 0.09680446237325668,
"learning_rate": 5.8211937371637525e-06,
"loss": 0.0224,
"step": 704
},
{
"epoch": 1.943570967047168,
"grad_norm": 0.09958844631910324,
"learning_rate": 5.794107503210187e-06,
"loss": 0.0228,
"step": 705
},
{
"epoch": 1.9463278052982984,
"grad_norm": 0.10216110944747925,
"learning_rate": 5.767058706240183e-06,
"loss": 0.0217,
"step": 706
},
{
"epoch": 1.9490846435494293,
"grad_norm": 0.09057790040969849,
"learning_rate": 5.740047587017232e-06,
"loss": 0.0225,
"step": 707
},
{
"epoch": 1.9518414818005598,
"grad_norm": 0.10457273572683334,
"learning_rate": 5.713074385969457e-06,
"loss": 0.0222,
"step": 708
},
{
"epoch": 1.9545983200516908,
"grad_norm": 0.1035657674074173,
"learning_rate": 5.686139343187468e-06,
"loss": 0.0229,
"step": 709
},
{
"epoch": 1.9573551583028215,
"grad_norm": 0.1085587665438652,
"learning_rate": 5.659242698422213e-06,
"loss": 0.0222,
"step": 710
},
{
"epoch": 1.9601119965539522,
"grad_norm": 0.10512305051088333,
"learning_rate": 5.632384691082874e-06,
"loss": 0.023,
"step": 711
},
{
"epoch": 1.962868834805083,
"grad_norm": 0.1115739643573761,
"learning_rate": 5.605565560234707e-06,
"loss": 0.0228,
"step": 712
},
{
"epoch": 1.9656256730562136,
"grad_norm": 0.11410468816757202,
"learning_rate": 5.578785544596928e-06,
"loss": 0.0234,
"step": 713
},
{
"epoch": 1.9683825113073445,
"grad_norm": 0.10378382354974747,
"learning_rate": 5.55204488254059e-06,
"loss": 0.0242,
"step": 714
},
{
"epoch": 1.971139349558475,
"grad_norm": 0.11169740557670593,
"learning_rate": 5.525343812086445e-06,
"loss": 0.0227,
"step": 715
},
{
"epoch": 1.973896187809606,
"grad_norm": 0.11073627322912216,
"learning_rate": 5.498682570902849e-06,
"loss": 0.0227,
"step": 716
},
{
"epoch": 1.9766530260607365,
"grad_norm": 0.10393860191106796,
"learning_rate": 5.47206139630363e-06,
"loss": 0.0231,
"step": 717
},
{
"epoch": 1.9794098643118674,
"grad_norm": 0.09541879594326019,
"learning_rate": 5.445480525245976e-06,
"loss": 0.0235,
"step": 718
},
{
"epoch": 1.982166702562998,
"grad_norm": 0.10563753545284271,
"learning_rate": 5.418940194328344e-06,
"loss": 0.0232,
"step": 719
},
{
"epoch": 1.9849235408141288,
"grad_norm": 0.09059736877679825,
"learning_rate": 5.3924406397883174e-06,
"loss": 0.0221,
"step": 720
},
{
"epoch": 1.9876803790652595,
"grad_norm": 0.0970776304602623,
"learning_rate": 5.365982097500545e-06,
"loss": 0.0219,
"step": 721
},
{
"epoch": 1.9904372173163902,
"grad_norm": 0.10448465496301651,
"learning_rate": 5.339564802974615e-06,
"loss": 0.0218,
"step": 722
},
{
"epoch": 1.9931940555675212,
"grad_norm": 0.11694996803998947,
"learning_rate": 5.313188991352964e-06,
"loss": 0.0223,
"step": 723
},
{
"epoch": 1.9959508938186517,
"grad_norm": 0.10104145109653473,
"learning_rate": 5.286854897408793e-06,
"loss": 0.0229,
"step": 724
},
{
"epoch": 1.9987077320697826,
"grad_norm": 0.10007411241531372,
"learning_rate": 5.2605627555439635e-06,
"loss": 0.0229,
"step": 725
},
{
"epoch": 2.001464570320913,
"grad_norm": 0.10868417471647263,
"learning_rate": 5.234312799786921e-06,
"loss": 0.0199,
"step": 726
},
{
"epoch": 2.004221408572044,
"grad_norm": 0.1075412854552269,
"learning_rate": 5.208105263790611e-06,
"loss": 0.0169,
"step": 727
},
{
"epoch": 2.0069782468231745,
"grad_norm": 0.09774283319711685,
"learning_rate": 5.181940380830393e-06,
"loss": 0.0166,
"step": 728
},
{
"epoch": 2.0097350850743054,
"grad_norm": 0.09378799796104431,
"learning_rate": 5.155818383801976e-06,
"loss": 0.0169,
"step": 729
},
{
"epoch": 2.0124919233254364,
"grad_norm": 0.09300612658262253,
"learning_rate": 5.129739505219325e-06,
"loss": 0.0175,
"step": 730
},
{
"epoch": 2.015248761576567,
"grad_norm": 0.10933335870504379,
"learning_rate": 5.103703977212615e-06,
"loss": 0.0172,
"step": 731
},
{
"epoch": 2.018005599827698,
"grad_norm": 0.10995355993509293,
"learning_rate": 5.077712031526153e-06,
"loss": 0.0165,
"step": 732
},
{
"epoch": 2.0207624380788283,
"grad_norm": 0.0987633690237999,
"learning_rate": 5.051763899516313e-06,
"loss": 0.0169,
"step": 733
},
{
"epoch": 2.023519276329959,
"grad_norm": 0.10309556126594543,
"learning_rate": 5.025859812149481e-06,
"loss": 0.0166,
"step": 734
},
{
"epoch": 2.0262761145810897,
"grad_norm": 0.10243550688028336,
"learning_rate": 5.000000000000003e-06,
"loss": 0.0176,
"step": 735
},
{
"epoch": 2.0290329528322206,
"grad_norm": 0.10226277261972427,
"learning_rate": 4.9741846932481154e-06,
"loss": 0.0163,
"step": 736
},
{
"epoch": 2.031789791083351,
"grad_norm": 0.10129394382238388,
"learning_rate": 4.94841412167792e-06,
"loss": 0.0167,
"step": 737
},
{
"epoch": 2.034546629334482,
"grad_norm": 0.09485170245170593,
"learning_rate": 4.922688514675325e-06,
"loss": 0.0167,
"step": 738
},
{
"epoch": 2.0373034675856125,
"grad_norm": 0.10152100771665573,
"learning_rate": 4.8970081012260014e-06,
"loss": 0.0169,
"step": 739
},
{
"epoch": 2.0400603058367435,
"grad_norm": 0.10325178503990173,
"learning_rate": 4.8713731099133576e-06,
"loss": 0.0172,
"step": 740
},
{
"epoch": 2.0428171440878744,
"grad_norm": 0.10417655855417252,
"learning_rate": 4.845783768916482e-06,
"loss": 0.0167,
"step": 741
},
{
"epoch": 2.045573982339005,
"grad_norm": 0.0955713763833046,
"learning_rate": 4.820240306008136e-06,
"loss": 0.0164,
"step": 742
},
{
"epoch": 2.048330820590136,
"grad_norm": 0.10096914321184158,
"learning_rate": 4.794742948552716e-06,
"loss": 0.0164,
"step": 743
},
{
"epoch": 2.0510876588412663,
"grad_norm": 0.11180409044027328,
"learning_rate": 4.769291923504226e-06,
"loss": 0.0169,
"step": 744
},
{
"epoch": 2.0538444970923972,
"grad_norm": 0.10208172351121902,
"learning_rate": 4.743887457404268e-06,
"loss": 0.0159,
"step": 745
},
{
"epoch": 2.0566013353435277,
"grad_norm": 0.10103499889373779,
"learning_rate": 4.718529776380009e-06,
"loss": 0.0161,
"step": 746
},
{
"epoch": 2.0593581735946587,
"grad_norm": 0.09129516035318375,
"learning_rate": 4.693219106142186e-06,
"loss": 0.0164,
"step": 747
},
{
"epoch": 2.062115011845789,
"grad_norm": 0.10983917117118835,
"learning_rate": 4.66795567198309e-06,
"loss": 0.0165,
"step": 748
},
{
"epoch": 2.06487185009692,
"grad_norm": 0.10270462185144424,
"learning_rate": 4.642739698774555e-06,
"loss": 0.0171,
"step": 749
},
{
"epoch": 2.067628688348051,
"grad_norm": 0.10552225261926651,
"learning_rate": 4.617571410965964e-06,
"loss": 0.0167,
"step": 750
},
{
"epoch": 2.0703855265991815,
"grad_norm": 0.10517138242721558,
"learning_rate": 4.59245103258225e-06,
"loss": 0.0164,
"step": 751
},
{
"epoch": 2.0731423648503124,
"grad_norm": 0.09321381151676178,
"learning_rate": 4.567378787221896e-06,
"loss": 0.0163,
"step": 752
},
{
"epoch": 2.075899203101443,
"grad_norm": 0.10226123034954071,
"learning_rate": 4.542354898054953e-06,
"loss": 0.0168,
"step": 753
},
{
"epoch": 2.078656041352574,
"grad_norm": 0.09658444672822952,
"learning_rate": 4.517379587821049e-06,
"loss": 0.0161,
"step": 754
},
{
"epoch": 2.0814128796037044,
"grad_norm": 0.08875004947185516,
"learning_rate": 4.492453078827409e-06,
"loss": 0.0158,
"step": 755
},
{
"epoch": 2.0841697178548353,
"grad_norm": 0.10765478014945984,
"learning_rate": 4.467575592946865e-06,
"loss": 0.016,
"step": 756
},
{
"epoch": 2.086926556105966,
"grad_norm": 0.09269033372402191,
"learning_rate": 4.442747351615899e-06,
"loss": 0.0156,
"step": 757
},
{
"epoch": 2.0896833943570967,
"grad_norm": 0.099794402718544,
"learning_rate": 4.417968575832664e-06,
"loss": 0.0155,
"step": 758
},
{
"epoch": 2.0924402326082276,
"grad_norm": 0.10920445621013641,
"learning_rate": 4.393239486155011e-06,
"loss": 0.0164,
"step": 759
},
{
"epoch": 2.095197070859358,
"grad_norm": 0.10013429075479507,
"learning_rate": 4.3685603026985356e-06,
"loss": 0.0159,
"step": 760
},
{
"epoch": 2.097953909110489,
"grad_norm": 0.1004144474864006,
"learning_rate": 4.343931245134616e-06,
"loss": 0.0171,
"step": 761
},
{
"epoch": 2.1007107473616196,
"grad_norm": 0.11513813585042953,
"learning_rate": 4.319352532688444e-06,
"loss": 0.0167,
"step": 762
},
{
"epoch": 2.1034675856127505,
"grad_norm": 0.10025149583816528,
"learning_rate": 4.294824384137096e-06,
"loss": 0.0167,
"step": 763
},
{
"epoch": 2.106224423863881,
"grad_norm": 0.13849525153636932,
"learning_rate": 4.270347017807575e-06,
"loss": 0.0173,
"step": 764
},
{
"epoch": 2.108981262115012,
"grad_norm": 0.10839790850877762,
"learning_rate": 4.245920651574864e-06,
"loss": 0.0164,
"step": 765
},
{
"epoch": 2.1117381003661424,
"grad_norm": 0.09577582031488419,
"learning_rate": 4.221545502859994e-06,
"loss": 0.0158,
"step": 766
},
{
"epoch": 2.1144949386172733,
"grad_norm": 0.1021181270480156,
"learning_rate": 4.197221788628096e-06,
"loss": 0.0163,
"step": 767
},
{
"epoch": 2.117251776868404,
"grad_norm": 0.10877656936645508,
"learning_rate": 4.172949725386488e-06,
"loss": 0.0159,
"step": 768
},
{
"epoch": 2.1200086151195348,
"grad_norm": 0.09782122820615768,
"learning_rate": 4.148729529182736e-06,
"loss": 0.0167,
"step": 769
},
{
"epoch": 2.1227654533706657,
"grad_norm": 0.10682158172130585,
"learning_rate": 4.124561415602729e-06,
"loss": 0.0171,
"step": 770
},
{
"epoch": 2.125522291621796,
"grad_norm": 0.10475680232048035,
"learning_rate": 4.100445599768774e-06,
"loss": 0.0164,
"step": 771
},
{
"epoch": 2.128279129872927,
"grad_norm": 0.09578834474086761,
"learning_rate": 4.0763822963376585e-06,
"loss": 0.0158,
"step": 772
},
{
"epoch": 2.1310359681240576,
"grad_norm": 0.09797288477420807,
"learning_rate": 4.0523717194987634e-06,
"loss": 0.0158,
"step": 773
},
{
"epoch": 2.1337928063751885,
"grad_norm": 0.1016959697008133,
"learning_rate": 4.028414082972141e-06,
"loss": 0.0168,
"step": 774
},
{
"epoch": 2.136549644626319,
"grad_norm": 0.09790080785751343,
"learning_rate": 4.004509600006618e-06,
"loss": 0.0155,
"step": 775
},
{
"epoch": 2.13930648287745,
"grad_norm": 0.1030542254447937,
"learning_rate": 3.980658483377903e-06,
"loss": 0.0166,
"step": 776
},
{
"epoch": 2.142063321128581,
"grad_norm": 0.11205972731113434,
"learning_rate": 3.956860945386677e-06,
"loss": 0.0158,
"step": 777
},
{
"epoch": 2.1448201593797114,
"grad_norm": 0.09531433135271072,
"learning_rate": 3.9331171978567206e-06,
"loss": 0.0154,
"step": 778
},
{
"epoch": 2.1475769976308423,
"grad_norm": 0.09850120544433594,
"learning_rate": 3.909427452133017e-06,
"loss": 0.0162,
"step": 779
},
{
"epoch": 2.150333835881973,
"grad_norm": 0.11114904284477234,
"learning_rate": 3.885791919079878e-06,
"loss": 0.016,
"step": 780
},
{
"epoch": 2.1530906741331037,
"grad_norm": 0.10965460538864136,
"learning_rate": 3.862210809079061e-06,
"loss": 0.0156,
"step": 781
},
{
"epoch": 2.155847512384234,
"grad_norm": 0.09533203393220901,
"learning_rate": 3.838684332027908e-06,
"loss": 0.0163,
"step": 782
},
{
"epoch": 2.158604350635365,
"grad_norm": 0.09494911879301071,
"learning_rate": 3.815212697337451e-06,
"loss": 0.0158,
"step": 783
},
{
"epoch": 2.1613611888864956,
"grad_norm": 0.10040943324565887,
"learning_rate": 3.7917961139305835e-06,
"loss": 0.0162,
"step": 784
},
{
"epoch": 2.1641180271376266,
"grad_norm": 0.09985315054655075,
"learning_rate": 3.7684347902401753e-06,
"loss": 0.0164,
"step": 785
},
{
"epoch": 2.166874865388757,
"grad_norm": 0.09866054356098175,
"learning_rate": 3.745128934207225e-06,
"loss": 0.0161,
"step": 786
},
{
"epoch": 2.169631703639888,
"grad_norm": 0.09665759652853012,
"learning_rate": 3.7218787532790167e-06,
"loss": 0.0163,
"step": 787
},
{
"epoch": 2.172388541891019,
"grad_norm": 0.09830871224403381,
"learning_rate": 3.6986844544072496e-06,
"loss": 0.0165,
"step": 788
},
{
"epoch": 2.1751453801421494,
"grad_norm": 0.10809045284986496,
"learning_rate": 3.6755462440462288e-06,
"loss": 0.0167,
"step": 789
},
{
"epoch": 2.1779022183932804,
"grad_norm": 0.09549186378717422,
"learning_rate": 3.6524643281510018e-06,
"loss": 0.0164,
"step": 790
},
{
"epoch": 2.180659056644411,
"grad_norm": 0.09945723414421082,
"learning_rate": 3.6294389121755404e-06,
"loss": 0.0159,
"step": 791
},
{
"epoch": 2.1834158948955418,
"grad_norm": 0.09856431931257248,
"learning_rate": 3.606470201070904e-06,
"loss": 0.0164,
"step": 792
},
{
"epoch": 2.1861727331466723,
"grad_norm": 0.09308286756277084,
"learning_rate": 3.58355839928341e-06,
"loss": 0.0155,
"step": 793
},
{
"epoch": 2.188929571397803,
"grad_norm": 0.10592252761125565,
"learning_rate": 3.560703710752833e-06,
"loss": 0.0167,
"step": 794
},
{
"epoch": 2.1916864096489337,
"grad_norm": 0.10037653893232346,
"learning_rate": 3.5379063389105727e-06,
"loss": 0.0167,
"step": 795
},
{
"epoch": 2.1944432479000646,
"grad_norm": 0.1029176115989685,
"learning_rate": 3.515166486677848e-06,
"loss": 0.0173,
"step": 796
},
{
"epoch": 2.1972000861511956,
"grad_norm": 0.10505726933479309,
"learning_rate": 3.4924843564638946e-06,
"loss": 0.0164,
"step": 797
},
{
"epoch": 2.199956924402326,
"grad_norm": 0.12057758867740631,
"learning_rate": 3.4698601501641517e-06,
"loss": 0.0168,
"step": 798
},
{
"epoch": 2.202713762653457,
"grad_norm": 0.09361850470304489,
"learning_rate": 3.447294069158481e-06,
"loss": 0.0155,
"step": 799
},
{
"epoch": 2.2054706009045875,
"grad_norm": 0.10088173300027847,
"learning_rate": 3.424786314309365e-06,
"loss": 0.0157,
"step": 800
},
{
"epoch": 2.2082274391557184,
"grad_norm": 0.09545209258794785,
"learning_rate": 3.4023370859601192e-06,
"loss": 0.016,
"step": 801
},
{
"epoch": 2.210984277406849,
"grad_norm": 0.10133402049541473,
"learning_rate": 3.3799465839331103e-06,
"loss": 0.0167,
"step": 802
},
{
"epoch": 2.21374111565798,
"grad_norm": 0.10037866234779358,
"learning_rate": 3.3576150075279757e-06,
"loss": 0.0166,
"step": 803
},
{
"epoch": 2.2164979539091103,
"grad_norm": 0.13703982532024384,
"learning_rate": 3.335342555519855e-06,
"loss": 0.0167,
"step": 804
},
{
"epoch": 2.2192547921602412,
"grad_norm": 0.10422453284263611,
"learning_rate": 3.313129426157613e-06,
"loss": 0.016,
"step": 805
},
{
"epoch": 2.222011630411372,
"grad_norm": 0.09957727044820786,
"learning_rate": 3.290975817162082e-06,
"loss": 0.0156,
"step": 806
},
{
"epoch": 2.2247684686625027,
"grad_norm": 0.0947730764746666,
"learning_rate": 3.2688819257242963e-06,
"loss": 0.0157,
"step": 807
},
{
"epoch": 2.2275253069136336,
"grad_norm": 0.10239588469266891,
"learning_rate": 3.246847948503744e-06,
"loss": 0.016,
"step": 808
},
{
"epoch": 2.230282145164764,
"grad_norm": 0.10617274791002274,
"learning_rate": 3.2248740816266012e-06,
"loss": 0.0158,
"step": 809
},
{
"epoch": 2.233038983415895,
"grad_norm": 0.10229507088661194,
"learning_rate": 3.2029605206840088e-06,
"loss": 0.0162,
"step": 810
},
{
"epoch": 2.2357958216670255,
"grad_norm": 0.09557145833969116,
"learning_rate": 3.181107460730314e-06,
"loss": 0.0157,
"step": 811
},
{
"epoch": 2.2385526599181564,
"grad_norm": 0.11345598846673965,
"learning_rate": 3.1593150962813425e-06,
"loss": 0.0167,
"step": 812
},
{
"epoch": 2.241309498169287,
"grad_norm": 0.11662036180496216,
"learning_rate": 3.1375836213126653e-06,
"loss": 0.0168,
"step": 813
},
{
"epoch": 2.244066336420418,
"grad_norm": 0.10055527836084366,
"learning_rate": 3.115913229257864e-06,
"loss": 0.0165,
"step": 814
},
{
"epoch": 2.2468231746715484,
"grad_norm": 0.10372263938188553,
"learning_rate": 3.0943041130068243e-06,
"loss": 0.0165,
"step": 815
},
{
"epoch": 2.2495800129226793,
"grad_norm": 0.10061642527580261,
"learning_rate": 3.0727564649040066e-06,
"loss": 0.0157,
"step": 816
},
{
"epoch": 2.25233685117381,
"grad_norm": 0.10070322453975677,
"learning_rate": 3.0512704767467417e-06,
"loss": 0.016,
"step": 817
},
{
"epoch": 2.2550936894249407,
"grad_norm": 0.09765329211950302,
"learning_rate": 3.0298463397835223e-06,
"loss": 0.0158,
"step": 818
},
{
"epoch": 2.2578505276760716,
"grad_norm": 0.09247851371765137,
"learning_rate": 3.008484244712286e-06,
"loss": 0.0161,
"step": 819
},
{
"epoch": 2.260607365927202,
"grad_norm": 0.21034474670886993,
"learning_rate": 2.987184381678747e-06,
"loss": 0.0165,
"step": 820
},
{
"epoch": 2.263364204178333,
"grad_norm": 0.1013868898153305,
"learning_rate": 2.965946940274678e-06,
"loss": 0.0164,
"step": 821
},
{
"epoch": 2.2661210424294636,
"grad_norm": 0.10465855151414871,
"learning_rate": 2.9447721095362325e-06,
"loss": 0.0162,
"step": 822
},
{
"epoch": 2.2688778806805945,
"grad_norm": 0.10128065943717957,
"learning_rate": 2.9236600779422674e-06,
"loss": 0.0162,
"step": 823
},
{
"epoch": 2.2716347189317254,
"grad_norm": 0.1028231531381607,
"learning_rate": 2.902611033412648e-06,
"loss": 0.0167,
"step": 824
},
{
"epoch": 2.274391557182856,
"grad_norm": 0.09928223490715027,
"learning_rate": 2.8816251633065963e-06,
"loss": 0.0163,
"step": 825
},
{
"epoch": 2.277148395433987,
"grad_norm": 0.1016731932759285,
"learning_rate": 2.8607026544210115e-06,
"loss": 0.0156,
"step": 826
},
{
"epoch": 2.2799052336851173,
"grad_norm": 0.11646661907434464,
"learning_rate": 2.8398436929888085e-06,
"loss": 0.0165,
"step": 827
},
{
"epoch": 2.2826620719362483,
"grad_norm": 0.1035120040178299,
"learning_rate": 2.819048464677261e-06,
"loss": 0.0163,
"step": 828
},
{
"epoch": 2.2854189101873787,
"grad_norm": 0.10418447107076645,
"learning_rate": 2.798317154586352e-06,
"loss": 0.016,
"step": 829
},
{
"epoch": 2.2881757484385097,
"grad_norm": 0.095628522336483,
"learning_rate": 2.7776499472471184e-06,
"loss": 0.0155,
"step": 830
},
{
"epoch": 2.29093258668964,
"grad_norm": 0.09392069280147552,
"learning_rate": 2.7570470266200177e-06,
"loss": 0.0155,
"step": 831
},
{
"epoch": 2.293689424940771,
"grad_norm": 0.10431677848100662,
"learning_rate": 2.736508576093285e-06,
"loss": 0.0163,
"step": 832
},
{
"epoch": 2.2964462631919016,
"grad_norm": 0.1353488713502884,
"learning_rate": 2.716034778481301e-06,
"loss": 0.0164,
"step": 833
},
{
"epoch": 2.2992031014430325,
"grad_norm": 0.100525863468647,
"learning_rate": 2.69562581602297e-06,
"loss": 0.0161,
"step": 834
},
{
"epoch": 2.3019599396941635,
"grad_norm": 0.0952887013554573,
"learning_rate": 2.675281870380082e-06,
"loss": 0.0155,
"step": 835
},
{
"epoch": 2.304716777945294,
"grad_norm": 0.13243414461612701,
"learning_rate": 2.65500312263572e-06,
"loss": 0.0159,
"step": 836
},
{
"epoch": 2.307473616196425,
"grad_norm": 0.09811341762542725,
"learning_rate": 2.6347897532926293e-06,
"loss": 0.0157,
"step": 837
},
{
"epoch": 2.3102304544475554,
"grad_norm": 0.10353419929742813,
"learning_rate": 2.6146419422716174e-06,
"loss": 0.0157,
"step": 838
},
{
"epoch": 2.3129872926986863,
"grad_norm": 0.09999054670333862,
"learning_rate": 2.594559868909956e-06,
"loss": 0.0157,
"step": 839
},
{
"epoch": 2.315744130949817,
"grad_norm": 0.10014519840478897,
"learning_rate": 2.5745437119597704e-06,
"loss": 0.0162,
"step": 840
},
{
"epoch": 2.3185009692009477,
"grad_norm": 0.09995284676551819,
"learning_rate": 2.554593649586469e-06,
"loss": 0.0163,
"step": 841
},
{
"epoch": 2.321257807452078,
"grad_norm": 0.10019443184137344,
"learning_rate": 2.5347098593671417e-06,
"loss": 0.0161,
"step": 842
},
{
"epoch": 2.324014645703209,
"grad_norm": 0.09603513777256012,
"learning_rate": 2.514892518288988e-06,
"loss": 0.0162,
"step": 843
},
{
"epoch": 2.3267714839543396,
"grad_norm": 0.09885770082473755,
"learning_rate": 2.49514180274774e-06,
"loss": 0.0158,
"step": 844
},
{
"epoch": 2.3295283222054706,
"grad_norm": 0.09956613928079605,
"learning_rate": 2.4754578885460813e-06,
"loss": 0.0156,
"step": 845
},
{
"epoch": 2.3322851604566015,
"grad_norm": 0.09634565562009811,
"learning_rate": 2.455840950892099e-06,
"loss": 0.0158,
"step": 846
},
{
"epoch": 2.335041998707732,
"grad_norm": 0.09536509960889816,
"learning_rate": 2.436291164397715e-06,
"loss": 0.0161,
"step": 847
},
{
"epoch": 2.337798836958863,
"grad_norm": 0.09601006656885147,
"learning_rate": 2.416808703077135e-06,
"loss": 0.0167,
"step": 848
},
{
"epoch": 2.3405556752099934,
"grad_norm": 0.10354162007570267,
"learning_rate": 2.3973937403452983e-06,
"loss": 0.0159,
"step": 849
},
{
"epoch": 2.3433125134611243,
"grad_norm": 0.10057395696640015,
"learning_rate": 2.3780464490163267e-06,
"loss": 0.0163,
"step": 850
},
{
"epoch": 2.346069351712255,
"grad_norm": 0.09877093136310577,
"learning_rate": 2.3587670013020026e-06,
"loss": 0.0159,
"step": 851
},
{
"epoch": 2.3488261899633858,
"grad_norm": 0.11428674310445786,
"learning_rate": 2.339555568810221e-06,
"loss": 0.0162,
"step": 852
},
{
"epoch": 2.3515830282145167,
"grad_norm": 0.10734842717647552,
"learning_rate": 2.3204123225434714e-06,
"loss": 0.0158,
"step": 853
},
{
"epoch": 2.354339866465647,
"grad_norm": 0.09699169546365738,
"learning_rate": 2.3013374328973113e-06,
"loss": 0.0161,
"step": 854
},
{
"epoch": 2.357096704716778,
"grad_norm": 0.09945055842399597,
"learning_rate": 2.28233106965885e-06,
"loss": 0.0164,
"step": 855
},
{
"epoch": 2.3598535429679086,
"grad_norm": 0.09800931066274643,
"learning_rate": 2.2633934020052383e-06,
"loss": 0.0155,
"step": 856
},
{
"epoch": 2.3626103812190395,
"grad_norm": 0.09882977604866028,
"learning_rate": 2.2445245985021613e-06,
"loss": 0.0154,
"step": 857
},
{
"epoch": 2.36536721947017,
"grad_norm": 0.0957275852560997,
"learning_rate": 2.2257248271023424e-06,
"loss": 0.0164,
"step": 858
},
{
"epoch": 2.368124057721301,
"grad_norm": 0.10268919914960861,
"learning_rate": 2.206994255144036e-06,
"loss": 0.0164,
"step": 859
},
{
"epoch": 2.3708808959724315,
"grad_norm": 0.10079904645681381,
"learning_rate": 2.188333049349556e-06,
"loss": 0.0157,
"step": 860
},
{
"epoch": 2.3736377342235624,
"grad_norm": 0.0992964580655098,
"learning_rate": 2.1697413758237785e-06,
"loss": 0.0158,
"step": 861
},
{
"epoch": 2.376394572474693,
"grad_norm": 0.10243627429008484,
"learning_rate": 2.1512194000526676e-06,
"loss": 0.0167,
"step": 862
},
{
"epoch": 2.379151410725824,
"grad_norm": 0.0987883061170578,
"learning_rate": 2.1327672869018036e-06,
"loss": 0.0163,
"step": 863
},
{
"epoch": 2.3819082489769547,
"grad_norm": 0.10566847771406174,
"learning_rate": 2.114385200614912e-06,
"loss": 0.0157,
"step": 864
},
{
"epoch": 2.3846650872280852,
"grad_norm": 0.09772182255983353,
"learning_rate": 2.0960733048124082e-06,
"loss": 0.0153,
"step": 865
},
{
"epoch": 2.387421925479216,
"grad_norm": 0.11045119166374207,
"learning_rate": 2.077831762489927e-06,
"loss": 0.0165,
"step": 866
},
{
"epoch": 2.3901787637303467,
"grad_norm": 0.1000387892127037,
"learning_rate": 2.0596607360168897e-06,
"loss": 0.0163,
"step": 867
},
{
"epoch": 2.3929356019814776,
"grad_norm": 0.09834710508584976,
"learning_rate": 2.0415603871350476e-06,
"loss": 0.0162,
"step": 868
},
{
"epoch": 2.395692440232608,
"grad_norm": 0.10450678318738937,
"learning_rate": 2.023530876957045e-06,
"loss": 0.0155,
"step": 869
},
{
"epoch": 2.398449278483739,
"grad_norm": 0.11825753003358841,
"learning_rate": 2.0055723659649907e-06,
"loss": 0.0157,
"step": 870
},
{
"epoch": 2.40120611673487,
"grad_norm": 0.09847236424684525,
"learning_rate": 1.987685014009011e-06,
"loss": 0.0158,
"step": 871
},
{
"epoch": 2.4039629549860004,
"grad_norm": 0.09447720646858215,
"learning_rate": 1.9698689803058523e-06,
"loss": 0.0159,
"step": 872
},
{
"epoch": 2.406719793237131,
"grad_norm": 0.09634242951869965,
"learning_rate": 1.952124423437447e-06,
"loss": 0.016,
"step": 873
},
{
"epoch": 2.409476631488262,
"grad_norm": 0.10218273848295212,
"learning_rate": 1.934451501349507e-06,
"loss": 0.0153,
"step": 874
},
{
"epoch": 2.412233469739393,
"grad_norm": 0.0962023138999939,
"learning_rate": 1.9168503713501184e-06,
"loss": 0.0156,
"step": 875
},
{
"epoch": 2.4149903079905233,
"grad_norm": 0.09034065902233124,
"learning_rate": 1.8993211901083353e-06,
"loss": 0.0152,
"step": 876
},
{
"epoch": 2.417747146241654,
"grad_norm": 0.1083788052201271,
"learning_rate": 1.8818641136527959e-06,
"loss": 0.0165,
"step": 877
},
{
"epoch": 2.4205039844927847,
"grad_norm": 0.09179603308439255,
"learning_rate": 1.8644792973703252e-06,
"loss": 0.0154,
"step": 878
},
{
"epoch": 2.4232608227439156,
"grad_norm": 0.1054045706987381,
"learning_rate": 1.8471668960045575e-06,
"loss": 0.0162,
"step": 879
},
{
"epoch": 2.426017660995046,
"grad_norm": 0.10657548904418945,
"learning_rate": 1.8299270636545518e-06,
"loss": 0.0149,
"step": 880
},
{
"epoch": 2.428774499246177,
"grad_norm": 0.09735960513353348,
"learning_rate": 1.8127599537734297e-06,
"loss": 0.0161,
"step": 881
},
{
"epoch": 2.431531337497308,
"grad_norm": 0.11152695119380951,
"learning_rate": 1.7956657191669969e-06,
"loss": 0.0164,
"step": 882
},
{
"epoch": 2.4342881757484385,
"grad_norm": 0.09965367615222931,
"learning_rate": 1.7786445119923967e-06,
"loss": 0.0154,
"step": 883
},
{
"epoch": 2.4370450139995694,
"grad_norm": 0.09934116899967194,
"learning_rate": 1.7616964837567497e-06,
"loss": 0.0159,
"step": 884
},
{
"epoch": 2.4398018522507,
"grad_norm": 0.10009805858135223,
"learning_rate": 1.7448217853158e-06,
"loss": 0.0159,
"step": 885
},
{
"epoch": 2.442558690501831,
"grad_norm": 0.0922827422618866,
"learning_rate": 1.7280205668725814e-06,
"loss": 0.0156,
"step": 886
},
{
"epoch": 2.4453155287529613,
"grad_norm": 0.1045307070016861,
"learning_rate": 1.7112929779760768e-06,
"loss": 0.0156,
"step": 887
},
{
"epoch": 2.4480723670040923,
"grad_norm": 0.09331434965133667,
"learning_rate": 1.6946391675198838e-06,
"loss": 0.0155,
"step": 888
},
{
"epoch": 2.4508292052552227,
"grad_norm": 0.09583646059036255,
"learning_rate": 1.6780592837408926e-06,
"loss": 0.0156,
"step": 889
},
{
"epoch": 2.4535860435063537,
"grad_norm": 0.0972675010561943,
"learning_rate": 1.6615534742179684e-06,
"loss": 0.0158,
"step": 890
},
{
"epoch": 2.456342881757484,
"grad_norm": 0.09544920176267624,
"learning_rate": 1.6451218858706374e-06,
"loss": 0.0155,
"step": 891
},
{
"epoch": 2.459099720008615,
"grad_norm": 0.0940442830324173,
"learning_rate": 1.6287646649577672e-06,
"loss": 0.0154,
"step": 892
},
{
"epoch": 2.461856558259746,
"grad_norm": 0.09687791019678116,
"learning_rate": 1.6124819570762862e-06,
"loss": 0.0152,
"step": 893
},
{
"epoch": 2.4646133965108765,
"grad_norm": 0.09892424196004868,
"learning_rate": 1.5962739071598709e-06,
"loss": 0.0162,
"step": 894
},
{
"epoch": 2.4673702347620075,
"grad_norm": 0.09757326543331146,
"learning_rate": 1.5801406594776625e-06,
"loss": 0.0154,
"step": 895
},
{
"epoch": 2.470127073013138,
"grad_norm": 0.22701092064380646,
"learning_rate": 1.5640823576329844e-06,
"loss": 0.017,
"step": 896
},
{
"epoch": 2.472883911264269,
"grad_norm": 0.094330795109272,
"learning_rate": 1.5480991445620541e-06,
"loss": 0.0151,
"step": 897
},
{
"epoch": 2.4756407495153994,
"grad_norm": 0.09668730944395065,
"learning_rate": 1.5321911625327224e-06,
"loss": 0.0155,
"step": 898
},
{
"epoch": 2.4783975877665303,
"grad_norm": 0.09876801818609238,
"learning_rate": 1.5163585531432046e-06,
"loss": 0.0157,
"step": 899
},
{
"epoch": 2.4811544260176612,
"grad_norm": 0.1035366952419281,
"learning_rate": 1.500601457320814e-06,
"loss": 0.0151,
"step": 900
},
{
"epoch": 2.4839112642687917,
"grad_norm": 0.0975731685757637,
"learning_rate": 1.4849200153207176e-06,
"loss": 0.0155,
"step": 901
},
{
"epoch": 2.4866681025199227,
"grad_norm": 0.0988369807600975,
"learning_rate": 1.4693143667246713e-06,
"loss": 0.0157,
"step": 902
},
{
"epoch": 2.489424940771053,
"grad_norm": 0.10393507033586502,
"learning_rate": 1.453784650439798e-06,
"loss": 0.0159,
"step": 903
},
{
"epoch": 2.492181779022184,
"grad_norm": 0.09848541766405106,
"learning_rate": 1.4383310046973365e-06,
"loss": 0.0156,
"step": 904
},
{
"epoch": 2.4949386172733146,
"grad_norm": 0.09526592493057251,
"learning_rate": 1.4229535670514162e-06,
"loss": 0.0159,
"step": 905
},
{
"epoch": 2.4976954555244455,
"grad_norm": 0.256574809551239,
"learning_rate": 1.407652474377832e-06,
"loss": 0.0174,
"step": 906
},
{
"epoch": 2.500452293775576,
"grad_norm": 0.0996571034193039,
"learning_rate": 1.3924278628728305e-06,
"loss": 0.0164,
"step": 907
},
{
"epoch": 2.503209132026707,
"grad_norm": 0.10017193108797073,
"learning_rate": 1.3772798680518828e-06,
"loss": 0.0156,
"step": 908
},
{
"epoch": 2.5059659702778374,
"grad_norm": 0.09400206059217453,
"learning_rate": 1.3622086247484989e-06,
"loss": 0.0157,
"step": 909
},
{
"epoch": 2.5087228085289683,
"grad_norm": 0.09920066595077515,
"learning_rate": 1.3472142671130139e-06,
"loss": 0.0156,
"step": 910
},
{
"epoch": 2.5114796467800993,
"grad_norm": 0.09991924464702606,
"learning_rate": 1.3322969286113973e-06,
"loss": 0.0162,
"step": 911
},
{
"epoch": 2.5142364850312298,
"grad_norm": 0.09943817555904388,
"learning_rate": 1.3174567420240647e-06,
"loss": 0.0158,
"step": 912
},
{
"epoch": 2.5169933232823607,
"grad_norm": 0.10019614547491074,
"learning_rate": 1.3026938394446976e-06,
"loss": 0.0153,
"step": 913
},
{
"epoch": 2.519750161533491,
"grad_norm": 0.09503661096096039,
"learning_rate": 1.2880083522790654e-06,
"loss": 0.0158,
"step": 914
},
{
"epoch": 2.522506999784622,
"grad_norm": 0.09582358598709106,
"learning_rate": 1.273400411243857e-06,
"loss": 0.0155,
"step": 915
},
{
"epoch": 2.5252638380357526,
"grad_norm": 0.09465383738279343,
"learning_rate": 1.2588701463655172e-06,
"loss": 0.0154,
"step": 916
},
{
"epoch": 2.5280206762868835,
"grad_norm": 0.10085918009281158,
"learning_rate": 1.2444176869790925e-06,
"loss": 0.016,
"step": 917
},
{
"epoch": 2.5307775145380145,
"grad_norm": 0.10438365489244461,
"learning_rate": 1.2300431617270669e-06,
"loss": 0.0153,
"step": 918
},
{
"epoch": 2.533534352789145,
"grad_norm": 0.1272774487733841,
"learning_rate": 1.2157466985582367e-06,
"loss": 0.016,
"step": 919
},
{
"epoch": 2.5362911910402755,
"grad_norm": 0.10423656553030014,
"learning_rate": 1.2015284247265567e-06,
"loss": 0.0163,
"step": 920
},
{
"epoch": 2.5390480292914064,
"grad_norm": 0.09617140144109726,
"learning_rate": 1.1873884667900125e-06,
"loss": 0.0157,
"step": 921
},
{
"epoch": 2.5418048675425373,
"grad_norm": 0.09703914076089859,
"learning_rate": 1.1733269506094957e-06,
"loss": 0.0162,
"step": 922
},
{
"epoch": 2.544561705793668,
"grad_norm": 0.09608243405818939,
"learning_rate": 1.1593440013476775e-06,
"loss": 0.0151,
"step": 923
},
{
"epoch": 2.5473185440447987,
"grad_norm": 0.107514888048172,
"learning_rate": 1.1454397434679022e-06,
"loss": 0.0158,
"step": 924
},
{
"epoch": 2.5500753822959292,
"grad_norm": 0.09657265990972519,
"learning_rate": 1.1316143007330739e-06,
"loss": 0.0153,
"step": 925
},
{
"epoch": 2.55283222054706,
"grad_norm": 0.08969084173440933,
"learning_rate": 1.1178677962045604e-06,
"loss": 0.0154,
"step": 926
},
{
"epoch": 2.5555890587981906,
"grad_norm": 0.09439155459403992,
"learning_rate": 1.1042003522410882e-06,
"loss": 0.0157,
"step": 927
},
{
"epoch": 2.5583458970493216,
"grad_norm": 0.10195891559123993,
"learning_rate": 1.090612090497668e-06,
"loss": 0.0164,
"step": 928
},
{
"epoch": 2.5611027353004525,
"grad_norm": 0.10047302395105362,
"learning_rate": 1.077103131924493e-06,
"loss": 0.016,
"step": 929
},
{
"epoch": 2.563859573551583,
"grad_norm": 0.10115232318639755,
"learning_rate": 1.0636735967658785e-06,
"loss": 0.0158,
"step": 930
},
{
"epoch": 2.5666164118027135,
"grad_norm": 0.09630636125802994,
"learning_rate": 1.0503236045591857e-06,
"loss": 0.0156,
"step": 931
},
{
"epoch": 2.5693732500538444,
"grad_norm": 0.09693799912929535,
"learning_rate": 1.037053274133758e-06,
"loss": 0.0153,
"step": 932
},
{
"epoch": 2.5721300883049754,
"grad_norm": 0.09665205329656601,
"learning_rate": 1.0238627236098619e-06,
"loss": 0.0151,
"step": 933
},
{
"epoch": 2.574886926556106,
"grad_norm": 0.0942736491560936,
"learning_rate": 1.0107520703976325e-06,
"loss": 0.0151,
"step": 934
},
{
"epoch": 2.577643764807237,
"grad_norm": 0.10749771445989609,
"learning_rate": 9.977214311960404e-07,
"loss": 0.016,
"step": 935
},
{
"epoch": 2.5804006030583673,
"grad_norm": 0.09226176887750626,
"learning_rate": 9.8477092199184e-07,
"loss": 0.0152,
"step": 936
},
{
"epoch": 2.583157441309498,
"grad_norm": 0.10179731994867325,
"learning_rate": 9.719006580585444e-07,
"loss": 0.0158,
"step": 937
},
{
"epoch": 2.5859142795606287,
"grad_norm": 0.09366576373577118,
"learning_rate": 9.591107539553945e-07,
"loss": 0.0159,
"step": 938
},
{
"epoch": 2.5886711178117596,
"grad_norm": 0.097577303647995,
"learning_rate": 9.464013235263458e-07,
"loss": 0.0157,
"step": 939
},
{
"epoch": 2.5914279560628906,
"grad_norm": 0.10417082905769348,
"learning_rate": 9.337724798990489e-07,
"loss": 0.018,
"step": 940
},
{
"epoch": 2.594184794314021,
"grad_norm": 0.1003030315041542,
"learning_rate": 9.212243354838435e-07,
"loss": 0.016,
"step": 941
},
{
"epoch": 2.596941632565152,
"grad_norm": 0.09599234908819199,
"learning_rate": 9.08757001972762e-07,
"loss": 0.0154,
"step": 942
},
{
"epoch": 2.5996984708162825,
"grad_norm": 0.09770214557647705,
"learning_rate": 8.963705903385344e-07,
"loss": 0.016,
"step": 943
},
{
"epoch": 2.6024553090674134,
"grad_norm": 0.09537551552057266,
"learning_rate": 8.8406521083359e-07,
"loss": 0.0158,
"step": 944
},
{
"epoch": 2.605212147318544,
"grad_norm": 0.09884826838970184,
"learning_rate": 8.71840972989092e-07,
"loss": 0.0156,
"step": 945
},
{
"epoch": 2.607968985569675,
"grad_norm": 0.09631089121103287,
"learning_rate": 8.596979856139553e-07,
"loss": 0.0157,
"step": 946
},
{
"epoch": 2.6107258238208058,
"grad_norm": 0.09276318550109863,
"learning_rate": 8.476363567938751e-07,
"loss": 0.0151,
"step": 947
},
{
"epoch": 2.6134826620719362,
"grad_norm": 0.10186196863651276,
"learning_rate": 8.356561938903707e-07,
"loss": 0.0151,
"step": 948
},
{
"epoch": 2.6162395003230667,
"grad_norm": 0.0975264310836792,
"learning_rate": 8.237576035398198e-07,
"loss": 0.0156,
"step": 949
},
{
"epoch": 2.6189963385741977,
"grad_norm": 0.10101190954446793,
"learning_rate": 8.119406916525252e-07,
"loss": 0.016,
"step": 950
},
{
"epoch": 2.6217531768253286,
"grad_norm": 0.10287696123123169,
"learning_rate": 8.002055634117578e-07,
"loss": 0.0157,
"step": 951
},
{
"epoch": 2.624510015076459,
"grad_norm": 0.0948198139667511,
"learning_rate": 7.885523232728287e-07,
"loss": 0.0158,
"step": 952
},
{
"epoch": 2.62726685332759,
"grad_norm": 0.10126490145921707,
"learning_rate": 7.769810749621532e-07,
"loss": 0.0162,
"step": 953
},
{
"epoch": 2.6300236915787205,
"grad_norm": 0.09267105907201767,
"learning_rate": 7.654919214763357e-07,
"loss": 0.0155,
"step": 954
},
{
"epoch": 2.6327805298298514,
"grad_norm": 0.10376208275556564,
"learning_rate": 7.540849650812409e-07,
"loss": 0.0154,
"step": 955
},
{
"epoch": 2.635537368080982,
"grad_norm": 0.09337103366851807,
"learning_rate": 7.427603073110967e-07,
"loss": 0.0157,
"step": 956
},
{
"epoch": 2.638294206332113,
"grad_norm": 0.08885804563760757,
"learning_rate": 7.315180489675822e-07,
"loss": 0.0153,
"step": 957
},
{
"epoch": 2.641051044583244,
"grad_norm": 0.09852619469165802,
"learning_rate": 7.203582901189332e-07,
"loss": 0.0152,
"step": 958
},
{
"epoch": 2.6438078828343743,
"grad_norm": 0.0962531641125679,
"learning_rate": 7.092811300990521e-07,
"loss": 0.0153,
"step": 959
},
{
"epoch": 2.6465647210855052,
"grad_norm": 0.09466900676488876,
"learning_rate": 6.98286667506618e-07,
"loss": 0.0152,
"step": 960
},
{
"epoch": 2.6493215593366357,
"grad_norm": 0.09106676280498505,
"learning_rate": 6.87375000204219e-07,
"loss": 0.0147,
"step": 961
},
{
"epoch": 2.6520783975877666,
"grad_norm": 0.09509000927209854,
"learning_rate": 6.765462253174715e-07,
"loss": 0.0155,
"step": 962
},
{
"epoch": 2.654835235838897,
"grad_norm": 0.09613344073295593,
"learning_rate": 6.658004392341633e-07,
"loss": 0.0155,
"step": 963
},
{
"epoch": 2.657592074090028,
"grad_norm": 0.09688057750463486,
"learning_rate": 6.551377376033896e-07,
"loss": 0.0157,
"step": 964
},
{
"epoch": 2.660348912341159,
"grad_norm": 0.09640353918075562,
"learning_rate": 6.445582153347074e-07,
"loss": 0.0159,
"step": 965
},
{
"epoch": 2.6631057505922895,
"grad_norm": 0.10333269834518433,
"learning_rate": 6.340619665972847e-07,
"loss": 0.0158,
"step": 966
},
{
"epoch": 2.66586258884342,
"grad_norm": 0.09676087647676468,
"learning_rate": 6.236490848190657e-07,
"loss": 0.0156,
"step": 967
},
{
"epoch": 2.668619427094551,
"grad_norm": 0.09745439141988754,
"learning_rate": 6.133196626859406e-07,
"loss": 0.0163,
"step": 968
},
{
"epoch": 2.671376265345682,
"grad_norm": 0.0986739918589592,
"learning_rate": 6.030737921409169e-07,
"loss": 0.0163,
"step": 969
},
{
"epoch": 2.6741331035968123,
"grad_norm": 0.09318210184574127,
"learning_rate": 5.929115643833005e-07,
"loss": 0.0158,
"step": 970
},
{
"epoch": 2.6768899418479433,
"grad_norm": 0.09318020939826965,
"learning_rate": 5.828330698678908e-07,
"loss": 0.0153,
"step": 971
},
{
"epoch": 2.6796467800990738,
"grad_norm": 0.0958879366517067,
"learning_rate": 5.728383983041696e-07,
"loss": 0.0159,
"step": 972
},
{
"epoch": 2.6824036183502047,
"grad_norm": 0.10100314021110535,
"learning_rate": 5.629276386555016e-07,
"loss": 0.0152,
"step": 973
},
{
"epoch": 2.685160456601335,
"grad_norm": 0.0920896902680397,
"learning_rate": 5.531008791383485e-07,
"loss": 0.0153,
"step": 974
},
{
"epoch": 2.687917294852466,
"grad_norm": 0.09028688073158264,
"learning_rate": 5.43358207221476e-07,
"loss": 0.0156,
"step": 975
},
{
"epoch": 2.690674133103597,
"grad_norm": 0.09332931786775589,
"learning_rate": 5.336997096251816e-07,
"loss": 0.0151,
"step": 976
},
{
"epoch": 2.6934309713547275,
"grad_norm": 0.10222768783569336,
"learning_rate": 5.241254723205225e-07,
"loss": 0.0153,
"step": 977
},
{
"epoch": 2.696187809605858,
"grad_norm": 0.10587936639785767,
"learning_rate": 5.146355805285452e-07,
"loss": 0.0151,
"step": 978
},
{
"epoch": 2.698944647856989,
"grad_norm": 0.09399544447660446,
"learning_rate": 5.052301187195296e-07,
"loss": 0.0156,
"step": 979
},
{
"epoch": 2.70170148610812,
"grad_norm": 0.09084748476743698,
"learning_rate": 4.959091706122431e-07,
"loss": 0.0152,
"step": 980
},
{
"epoch": 2.7044583243592504,
"grad_norm": 0.09691156446933746,
"learning_rate": 4.866728191731829e-07,
"loss": 0.0156,
"step": 981
},
{
"epoch": 2.7072151626103813,
"grad_norm": 0.09536400437355042,
"learning_rate": 4.775211466158469e-07,
"loss": 0.0157,
"step": 982
},
{
"epoch": 2.709972000861512,
"grad_norm": 0.0932604968547821,
"learning_rate": 4.6845423440000315e-07,
"loss": 0.0151,
"step": 983
},
{
"epoch": 2.7127288391126427,
"grad_norm": 0.0943024605512619,
"learning_rate": 4.594721632309551e-07,
"loss": 0.0157,
"step": 984
},
{
"epoch": 2.715485677363773,
"grad_norm": 0.11024822294712067,
"learning_rate": 4.505750130588371e-07,
"loss": 0.0159,
"step": 985
},
{
"epoch": 2.718242515614904,
"grad_norm": 0.09148909151554108,
"learning_rate": 4.4176286307788475e-07,
"loss": 0.0152,
"step": 986
},
{
"epoch": 2.720999353866035,
"grad_norm": 0.10189063847064972,
"learning_rate": 4.3303579172574884e-07,
"loss": 0.0164,
"step": 987
},
{
"epoch": 2.7237561921171656,
"grad_norm": 0.09687193483114243,
"learning_rate": 4.243938766827849e-07,
"loss": 0.0156,
"step": 988
},
{
"epoch": 2.7265130303682965,
"grad_norm": 0.0970255509018898,
"learning_rate": 4.1583719487136575e-07,
"loss": 0.0154,
"step": 989
},
{
"epoch": 2.729269868619427,
"grad_norm": 0.08850499987602234,
"learning_rate": 4.0736582245519795e-07,
"loss": 0.0151,
"step": 990
},
{
"epoch": 2.732026706870558,
"grad_norm": 0.10004325211048126,
"learning_rate": 3.9897983483863866e-07,
"loss": 0.0157,
"step": 991
},
{
"epoch": 2.7347835451216884,
"grad_norm": 0.09604191035032272,
"learning_rate": 3.9067930666603304e-07,
"loss": 0.0154,
"step": 992
},
{
"epoch": 2.7375403833728194,
"grad_norm": 0.096500463783741,
"learning_rate": 3.824643118210403e-07,
"loss": 0.0159,
"step": 993
},
{
"epoch": 2.7402972216239503,
"grad_norm": 0.08940698951482773,
"learning_rate": 3.743349234259841e-07,
"loss": 0.0155,
"step": 994
},
{
"epoch": 2.7430540598750808,
"grad_norm": 0.09379703551530838,
"learning_rate": 3.662912138411967e-07,
"loss": 0.0153,
"step": 995
},
{
"epoch": 2.7458108981262113,
"grad_norm": 0.09963972121477127,
"learning_rate": 3.5833325466437697e-07,
"loss": 0.0156,
"step": 996
},
{
"epoch": 2.748567736377342,
"grad_norm": 0.09487731754779816,
"learning_rate": 3.5046111672995097e-07,
"loss": 0.0158,
"step": 997
},
{
"epoch": 2.751324574628473,
"grad_norm": 0.09154865145683289,
"learning_rate": 3.426748701084448e-07,
"loss": 0.0152,
"step": 998
},
{
"epoch": 2.7540814128796036,
"grad_norm": 0.09428098797798157,
"learning_rate": 3.349745841058605e-07,
"loss": 0.0155,
"step": 999
},
{
"epoch": 2.7568382511307346,
"grad_norm": 0.09710869193077087,
"learning_rate": 3.2736032726305546e-07,
"loss": 0.0158,
"step": 1000
},
{
"epoch": 2.759595089381865,
"grad_norm": 0.10234350711107254,
"learning_rate": 3.198321673551341e-07,
"loss": 0.0159,
"step": 1001
},
{
"epoch": 2.762351927632996,
"grad_norm": 0.09612155705690384,
"learning_rate": 3.1239017139084725e-07,
"loss": 0.0162,
"step": 1002
},
{
"epoch": 2.7651087658841265,
"grad_norm": 0.10025949776172638,
"learning_rate": 3.050344056119925e-07,
"loss": 0.0158,
"step": 1003
},
{
"epoch": 2.7678656041352574,
"grad_norm": 0.09752298891544342,
"learning_rate": 2.977649354928258e-07,
"loss": 0.0153,
"step": 1004
},
{
"epoch": 2.7706224423863883,
"grad_norm": 0.09761834889650345,
"learning_rate": 2.905818257394799e-07,
"loss": 0.0155,
"step": 1005
},
{
"epoch": 2.773379280637519,
"grad_norm": 0.09161604940891266,
"learning_rate": 2.834851402893857e-07,
"loss": 0.0148,
"step": 1006
},
{
"epoch": 2.7761361188886493,
"grad_norm": 0.09183601289987564,
"learning_rate": 2.764749423107027e-07,
"loss": 0.0151,
"step": 1007
},
{
"epoch": 2.7788929571397802,
"grad_norm": 0.09860436618328094,
"learning_rate": 2.6955129420176193e-07,
"loss": 0.0165,
"step": 1008
},
{
"epoch": 2.781649795390911,
"grad_norm": 0.09372715651988983,
"learning_rate": 2.627142575905062e-07,
"loss": 0.0154,
"step": 1009
},
{
"epoch": 2.7844066336420417,
"grad_norm": 0.10581711679697037,
"learning_rate": 2.559638933339414e-07,
"loss": 0.0157,
"step": 1010
},
{
"epoch": 2.7871634718931726,
"grad_norm": 0.09646962583065033,
"learning_rate": 2.493002615175977e-07,
"loss": 0.0156,
"step": 1011
},
{
"epoch": 2.7899203101443035,
"grad_norm": 0.09317374974489212,
"learning_rate": 2.4272342145499006e-07,
"loss": 0.0154,
"step": 1012
},
{
"epoch": 2.792677148395434,
"grad_norm": 0.09786811470985413,
"learning_rate": 2.3623343168709624e-07,
"loss": 0.0157,
"step": 1013
},
{
"epoch": 2.7954339866465645,
"grad_norm": 0.09875071793794632,
"learning_rate": 2.2983034998182997e-07,
"loss": 0.0157,
"step": 1014
},
{
"epoch": 2.7981908248976954,
"grad_norm": 0.09668000042438507,
"learning_rate": 2.235142333335316e-07,
"loss": 0.0151,
"step": 1015
},
{
"epoch": 2.8009476631488264,
"grad_norm": 0.09402566403150558,
"learning_rate": 2.1728513796245855e-07,
"loss": 0.0154,
"step": 1016
},
{
"epoch": 2.803704501399957,
"grad_norm": 0.09588855504989624,
"learning_rate": 2.11143119314281e-07,
"loss": 0.0157,
"step": 1017
},
{
"epoch": 2.806461339651088,
"grad_norm": 0.09455292671918869,
"learning_rate": 2.0508823205959815e-07,
"loss": 0.0154,
"step": 1018
},
{
"epoch": 2.8092181779022183,
"grad_norm": 0.09076821804046631,
"learning_rate": 1.991205300934429e-07,
"loss": 0.0154,
"step": 1019
},
{
"epoch": 2.811975016153349,
"grad_norm": 0.09466226398944855,
"learning_rate": 1.9324006653480332e-07,
"loss": 0.0152,
"step": 1020
},
{
"epoch": 2.8147318544044797,
"grad_norm": 0.10528804361820221,
"learning_rate": 1.874468937261531e-07,
"loss": 0.0157,
"step": 1021
},
{
"epoch": 2.8174886926556106,
"grad_norm": 0.09444960951805115,
"learning_rate": 1.8174106323298634e-07,
"loss": 0.0154,
"step": 1022
},
{
"epoch": 2.8202455309067416,
"grad_norm": 0.09570340067148209,
"learning_rate": 1.761226258433524e-07,
"loss": 0.0157,
"step": 1023
},
{
"epoch": 2.823002369157872,
"grad_norm": 0.09762073308229446,
"learning_rate": 1.7059163156740943e-07,
"loss": 0.0155,
"step": 1024
},
{
"epoch": 2.8257592074090025,
"grad_norm": 0.09936973452568054,
"learning_rate": 1.6514812963697723e-07,
"loss": 0.0156,
"step": 1025
},
{
"epoch": 2.8285160456601335,
"grad_norm": 0.09471126645803452,
"learning_rate": 1.5979216850509848e-07,
"loss": 0.0151,
"step": 1026
},
{
"epoch": 2.8312728839112644,
"grad_norm": 0.09497305750846863,
"learning_rate": 1.545237958456125e-07,
"loss": 0.0155,
"step": 1027
},
{
"epoch": 2.834029722162395,
"grad_norm": 0.09496881812810898,
"learning_rate": 1.4934305855271892e-07,
"loss": 0.0153,
"step": 1028
},
{
"epoch": 2.836786560413526,
"grad_norm": 0.09368608891963959,
"learning_rate": 1.4425000274057577e-07,
"loss": 0.0157,
"step": 1029
},
{
"epoch": 2.8395433986646563,
"grad_norm": 0.1018013209104538,
"learning_rate": 1.3924467374287432e-07,
"loss": 0.0157,
"step": 1030
},
{
"epoch": 2.8423002369157873,
"grad_norm": 0.09439851343631744,
"learning_rate": 1.343271161124493e-07,
"loss": 0.0159,
"step": 1031
},
{
"epoch": 2.8450570751669177,
"grad_norm": 0.10299389809370041,
"learning_rate": 1.2949737362087156e-07,
"loss": 0.0157,
"step": 1032
},
{
"epoch": 2.8478139134180487,
"grad_norm": 0.10252979397773743,
"learning_rate": 1.247554892580616e-07,
"loss": 0.0159,
"step": 1033
},
{
"epoch": 2.8505707516691796,
"grad_norm": 0.08925472944974899,
"learning_rate": 1.201015052319099e-07,
"loss": 0.0155,
"step": 1034
},
{
"epoch": 2.85332758992031,
"grad_norm": 0.09510352462530136,
"learning_rate": 1.1553546296789952e-07,
"loss": 0.0159,
"step": 1035
},
{
"epoch": 2.856084428171441,
"grad_norm": 0.09258411824703217,
"learning_rate": 1.1105740310873414e-07,
"loss": 0.0151,
"step": 1036
},
{
"epoch": 2.8588412664225715,
"grad_norm": 0.09796860814094543,
"learning_rate": 1.066673655139816e-07,
"loss": 0.0153,
"step": 1037
},
{
"epoch": 2.8615981046737025,
"grad_norm": 0.0932827889919281,
"learning_rate": 1.0236538925971429e-07,
"loss": 0.0155,
"step": 1038
},
{
"epoch": 2.864354942924833,
"grad_norm": 0.09266915917396545,
"learning_rate": 9.815151263816714e-08,
"loss": 0.0153,
"step": 1039
},
{
"epoch": 2.867111781175964,
"grad_norm": 0.09166253358125687,
"learning_rate": 9.402577315738904e-08,
"loss": 0.0154,
"step": 1040
},
{
"epoch": 2.869868619427095,
"grad_norm": 0.10095134377479553,
"learning_rate": 8.99882075409153e-08,
"loss": 0.0161,
"step": 1041
},
{
"epoch": 2.8726254576782253,
"grad_norm": 0.09815669804811478,
"learning_rate": 8.603885172744131e-08,
"loss": 0.0163,
"step": 1042
},
{
"epoch": 2.875382295929356,
"grad_norm": 0.09382443875074387,
"learning_rate": 8.217774087049268e-08,
"loss": 0.016,
"step": 1043
},
{
"epoch": 2.8781391341804867,
"grad_norm": 0.09190834313631058,
"learning_rate": 7.840490933812783e-08,
"loss": 0.0158,
"step": 1044
},
{
"epoch": 2.8808959724316177,
"grad_norm": 0.09133637696504593,
"learning_rate": 7.472039071261927e-08,
"loss": 0.0149,
"step": 1045
},
{
"epoch": 2.883652810682748,
"grad_norm": 0.10109441727399826,
"learning_rate": 7.112421779015944e-08,
"loss": 0.0152,
"step": 1046
},
{
"epoch": 2.886409648933879,
"grad_norm": 0.09567693620920181,
"learning_rate": 6.761642258056977e-08,
"loss": 0.0151,
"step": 1047
},
{
"epoch": 2.8891664871850096,
"grad_norm": 0.17240361869335175,
"learning_rate": 6.419703630701546e-08,
"loss": 0.0174,
"step": 1048
},
{
"epoch": 2.8919233254361405,
"grad_norm": 0.09366879612207413,
"learning_rate": 6.086608940572447e-08,
"loss": 0.0157,
"step": 1049
},
{
"epoch": 2.894680163687271,
"grad_norm": 0.09174374490976334,
"learning_rate": 5.7623611525721155e-08,
"loss": 0.0152,
"step": 1050
},
{
"epoch": 2.897437001938402,
"grad_norm": 0.096317358314991,
"learning_rate": 5.446963152855644e-08,
"loss": 0.0156,
"step": 1051
},
{
"epoch": 2.900193840189533,
"grad_norm": 0.09576837718486786,
"learning_rate": 5.140417748806026e-08,
"loss": 0.0152,
"step": 1052
},
{
"epoch": 2.9029506784406633,
"grad_norm": 0.09430639445781708,
"learning_rate": 4.8427276690081735e-08,
"loss": 0.0152,
"step": 1053
},
{
"epoch": 2.905707516691794,
"grad_norm": 0.10490168631076813,
"learning_rate": 4.553895563225053e-08,
"loss": 0.015,
"step": 1054
},
{
"epoch": 2.9084643549429248,
"grad_norm": 0.0979146957397461,
"learning_rate": 4.2739240023742526e-08,
"loss": 0.0154,
"step": 1055
},
{
"epoch": 2.9112211931940557,
"grad_norm": 0.10030056536197662,
"learning_rate": 4.002815478505007e-08,
"loss": 0.0155,
"step": 1056
},
{
"epoch": 2.913978031445186,
"grad_norm": 0.10702443867921829,
"learning_rate": 3.7405724047756554e-08,
"loss": 0.0163,
"step": 1057
},
{
"epoch": 2.916734869696317,
"grad_norm": 0.09876976162195206,
"learning_rate": 3.487197115432883e-08,
"loss": 0.0147,
"step": 1058
},
{
"epoch": 2.9194917079474476,
"grad_norm": 0.09448473155498505,
"learning_rate": 3.242691865790071e-08,
"loss": 0.0148,
"step": 1059
},
{
"epoch": 2.9222485461985785,
"grad_norm": 0.10175088047981262,
"learning_rate": 3.0070588322079765e-08,
"loss": 0.0165,
"step": 1060
},
{
"epoch": 2.925005384449709,
"grad_norm": 0.09407947957515717,
"learning_rate": 2.780300112074974e-08,
"loss": 0.0154,
"step": 1061
},
{
"epoch": 2.92776222270084,
"grad_norm": 0.09297125786542892,
"learning_rate": 2.5624177237884017e-08,
"loss": 0.0156,
"step": 1062
},
{
"epoch": 2.930519060951971,
"grad_norm": 0.10525539517402649,
"learning_rate": 2.3534136067369094e-08,
"loss": 0.0165,
"step": 1063
},
{
"epoch": 2.9332758992031014,
"grad_norm": 0.09468672424554825,
"learning_rate": 2.1532896212825837e-08,
"loss": 0.0158,
"step": 1064
},
{
"epoch": 2.9360327374542323,
"grad_norm": 0.0892932116985321,
"learning_rate": 1.962047548744961e-08,
"loss": 0.0148,
"step": 1065
},
{
"epoch": 2.938789575705363,
"grad_norm": 0.09384763240814209,
"learning_rate": 1.7796890913850395e-08,
"loss": 0.0154,
"step": 1066
},
{
"epoch": 2.9415464139564937,
"grad_norm": 0.10021128505468369,
"learning_rate": 1.606215872389738e-08,
"loss": 0.0154,
"step": 1067
},
{
"epoch": 2.9443032522076242,
"grad_norm": 0.09487558901309967,
"learning_rate": 1.4416294358582383e-08,
"loss": 0.0151,
"step": 1068
},
{
"epoch": 2.947060090458755,
"grad_norm": 0.09665752202272415,
"learning_rate": 1.2859312467872197e-08,
"loss": 0.0148,
"step": 1069
},
{
"epoch": 2.949816928709886,
"grad_norm": 0.11037097126245499,
"learning_rate": 1.1391226910588693e-08,
"loss": 0.0153,
"step": 1070
},
{
"epoch": 2.9525737669610166,
"grad_norm": 0.101137176156044,
"learning_rate": 1.0012050754277802e-08,
"loss": 0.0153,
"step": 1071
},
{
"epoch": 2.955330605212147,
"grad_norm": 0.09380457550287247,
"learning_rate": 8.721796275095173e-09,
"loss": 0.0152,
"step": 1072
},
{
"epoch": 2.958087443463278,
"grad_norm": 0.09098217636346817,
"learning_rate": 7.520474957699586e-09,
"loss": 0.0149,
"step": 1073
},
{
"epoch": 2.960844281714409,
"grad_norm": 0.0910525918006897,
"learning_rate": 6.40809749514637e-09,
"loss": 0.015,
"step": 1074
},
{
"epoch": 2.9636011199655394,
"grad_norm": 0.09770431369543076,
"learning_rate": 5.384673788797479e-09,
"loss": 0.0151,
"step": 1075
},
{
"epoch": 2.9663579582166704,
"grad_norm": 0.09908950328826904,
"learning_rate": 4.450212948227117e-09,
"loss": 0.0161,
"step": 1076
},
{
"epoch": 2.969114796467801,
"grad_norm": 0.09452182054519653,
"learning_rate": 3.6047232911462506e-09,
"loss": 0.0153,
"step": 1077
},
{
"epoch": 2.971871634718932,
"grad_norm": 0.09382626414299011,
"learning_rate": 2.8482123433248853e-09,
"loss": 0.0153,
"step": 1078
},
{
"epoch": 2.9746284729700623,
"grad_norm": 0.0985400527715683,
"learning_rate": 2.180686838527679e-09,
"loss": 0.0158,
"step": 1079
},
{
"epoch": 2.977385311221193,
"grad_norm": 0.09935043007135391,
"learning_rate": 1.6021527184528761e-09,
"loss": 0.0163,
"step": 1080
},
{
"epoch": 2.980142149472324,
"grad_norm": 0.09754447638988495,
"learning_rate": 1.1126151326779077e-09,
"loss": 0.0156,
"step": 1081
},
{
"epoch": 2.9828989877234546,
"grad_norm": 0.09899445623159409,
"learning_rate": 7.120784386160928e-10,
"loss": 0.0162,
"step": 1082
},
{
"epoch": 2.985655825974585,
"grad_norm": 0.09259970486164093,
"learning_rate": 4.005462014766703e-10,
"loss": 0.0153,
"step": 1083
},
{
"epoch": 2.988412664225716,
"grad_norm": 0.09933052957057953,
"learning_rate": 1.7802119423149244e-10,
"loss": 0.016,
"step": 1084
},
{
"epoch": 2.991169502476847,
"grad_norm": 0.09650570899248123,
"learning_rate": 4.450539759393024e-11,
"loss": 0.0155,
"step": 1085
},
{
"epoch": 2.9939263407279775,
"grad_norm": 0.0921320915222168,
"learning_rate": 0.0,
"loss": 0.0156,
"step": 1086
},
{
"epoch": 2.9939263407279775,
"step": 1086,
"total_flos": 5.826670523626553e+18,
"train_loss": 0.02890731801551597,
"train_runtime": 65667.5663,
"train_samples_per_second": 8.484,
"train_steps_per_second": 0.017
}
],
"logging_steps": 1.0,
"max_steps": 1086,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.826670523626553e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}