diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/checkpoint-2000/trainer_state.json"
@@ -0,0 +1,14033 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 6.135280553420446,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0030745580322828594,
+      "grad_norm": 7.8743672370910645,
+      "learning_rate": 4e-05,
+      "loss": 4.1619,
+      "step": 1
+    },
+    {
+      "epoch": 0.006149116064565719,
+      "grad_norm": 5.61863374710083,
+      "learning_rate": 8e-05,
+      "loss": 4.2003,
+      "step": 2
+    },
+    {
+      "epoch": 0.009223674096848577,
+      "grad_norm": 5.158657550811768,
+      "learning_rate": 0.00012,
+      "loss": 4.8946,
+      "step": 3
+    },
+    {
+      "epoch": 0.012298232129131437,
+      "grad_norm": 6.152472019195557,
+      "learning_rate": 0.00016,
+      "loss": 4.5709,
+      "step": 4
+    },
+    {
+      "epoch": 0.015372790161414296,
+      "grad_norm": 3.8159048557281494,
+      "learning_rate": 0.0002,
+      "loss": 3.525,
+      "step": 5
+    },
+    {
+      "epoch": 0.018447348193697154,
+      "grad_norm": 2.894798517227173,
+      "learning_rate": 0.00019993322203672788,
+      "loss": 2.8446,
+      "step": 6
+    },
+    {
+      "epoch": 0.021521906225980016,
+      "grad_norm": 2.2424588203430176,
+      "learning_rate": 0.00019986644407345576,
+      "loss": 2.3555,
+      "step": 7
+    },
+    {
+      "epoch": 0.024596464258262875,
+      "grad_norm": 2.5049643516540527,
+      "learning_rate": 0.00019979966611018366,
+      "loss": 2.1607,
+      "step": 8
+    },
+    {
+      "epoch": 0.027671022290545733,
+      "grad_norm": 2.0380496978759766,
+      "learning_rate": 0.00019973288814691153,
+      "loss": 2.1771,
+      "step": 9
+    },
+    {
+      "epoch": 0.030745580322828592,
+      "grad_norm": 2.0299413204193115,
+      "learning_rate": 0.0001996661101836394,
+      "loss": 1.5033,
+      "step": 10
+    },
+    {
+      "epoch": 0.033820138355111454,
+      "grad_norm": 2.054259777069092,
+      "learning_rate": 0.00019959933222036728,
+      "loss": 1.4109,
+      "step": 11
+    },
+    {
+      "epoch": 0.03689469638739431,
+      "grad_norm": 1.8728002309799194,
+      "learning_rate": 0.00019953255425709515,
+      "loss": 1.4278,
+      "step": 12
+    },
+    {
+      "epoch": 0.03996925441967717,
+      "grad_norm": 1.6537948846817017,
+      "learning_rate": 0.00019946577629382305,
+      "loss": 1.1342,
+      "step": 13
+    },
+    {
+      "epoch": 0.04304381245196003,
+      "grad_norm": 1.2836942672729492,
+      "learning_rate": 0.00019939899833055092,
+      "loss": 1.3855,
+      "step": 14
+    },
+    {
+      "epoch": 0.04611837048424289,
+      "grad_norm": 1.4211474657058716,
+      "learning_rate": 0.00019933222036727882,
+      "loss": 1.2734,
+      "step": 15
+    },
+    {
+      "epoch": 0.04919292851652575,
+      "grad_norm": 1.298756718635559,
+      "learning_rate": 0.0001992654424040067,
+      "loss": 1.0105,
+      "step": 16
+    },
+    {
+      "epoch": 0.05226748654880861,
+      "grad_norm": 1.2545682191848755,
+      "learning_rate": 0.00019919866444073457,
+      "loss": 0.8059,
+      "step": 17
+    },
+    {
+      "epoch": 0.05534204458109147,
+      "grad_norm": 1.1537564992904663,
+      "learning_rate": 0.00019913188647746244,
+      "loss": 1.0606,
+      "step": 18
+    },
+    {
+      "epoch": 0.05841660261337433,
+      "grad_norm": 1.1393249034881592,
+      "learning_rate": 0.00019906510851419034,
+      "loss": 0.8851,
+      "step": 19
+    },
+    {
+      "epoch": 0.061491160645657184,
+      "grad_norm": 1.2342195510864258,
+      "learning_rate": 0.00019899833055091822,
+      "loss": 0.782,
+      "step": 20
+    },
+    {
+      "epoch": 0.06456571867794005,
+      "grad_norm": 1.1908934116363525,
+      "learning_rate": 0.0001989315525876461,
+      "loss": 0.8126,
+      "step": 21
+    },
+    {
+      "epoch": 0.06764027671022291,
+      "grad_norm": 1.1466214656829834,
+      "learning_rate": 0.00019886477462437396,
+      "loss": 1.0469,
+      "step": 22
+    },
+    {
+      "epoch": 0.07071483474250577,
+      "grad_norm": 1.1647766828536987,
+      "learning_rate": 0.00019879799666110183,
+      "loss": 0.7116,
+      "step": 23
+    },
+    {
+      "epoch": 0.07378939277478862,
+      "grad_norm": 1.043047308921814,
+      "learning_rate": 0.00019873121869782974,
+      "loss": 0.7711,
+      "step": 24
+    },
+    {
+      "epoch": 0.07686395080707148,
+      "grad_norm": 1.2585339546203613,
+      "learning_rate": 0.0001986644407345576,
+      "loss": 0.8884,
+      "step": 25
+    },
+    {
+      "epoch": 0.07993850883935434,
+      "grad_norm": 1.3209491968154907,
+      "learning_rate": 0.00019859766277128548,
+      "loss": 0.899,
+      "step": 26
+    },
+    {
+      "epoch": 0.0830130668716372,
+      "grad_norm": 1.4513576030731201,
+      "learning_rate": 0.00019853088480801335,
+      "loss": 0.7261,
+      "step": 27
+    },
+    {
+      "epoch": 0.08608762490392007,
+      "grad_norm": 1.3187739849090576,
+      "learning_rate": 0.00019846410684474123,
+      "loss": 0.951,
+      "step": 28
+    },
+    {
+      "epoch": 0.08916218293620293,
+      "grad_norm": 1.2414864301681519,
+      "learning_rate": 0.00019839732888146913,
+      "loss": 0.8317,
+      "step": 29
+    },
+    {
+      "epoch": 0.09223674096848578,
+      "grad_norm": 1.1460022926330566,
+      "learning_rate": 0.000198330550918197,
+      "loss": 0.7259,
+      "step": 30
+    },
+    {
+      "epoch": 0.09531129900076864,
+      "grad_norm": 1.5636142492294312,
+      "learning_rate": 0.00019826377295492487,
+      "loss": 1.0959,
+      "step": 31
+    },
+    {
+      "epoch": 0.0983858570330515,
+      "grad_norm": 1.3006511926651,
+      "learning_rate": 0.00019819699499165277,
+      "loss": 0.8215,
+      "step": 32
+    },
+    {
+      "epoch": 0.10146041506533436,
+      "grad_norm": 1.0390676259994507,
+      "learning_rate": 0.00019813021702838065,
+      "loss": 0.6979,
+      "step": 33
+    },
+    {
+      "epoch": 0.10453497309761722,
+      "grad_norm": 1.1039427518844604,
+      "learning_rate": 0.00019806343906510852,
+      "loss": 0.8445,
+      "step": 34
+    },
+    {
+      "epoch": 0.10760953112990007,
+      "grad_norm": 1.3381197452545166,
+      "learning_rate": 0.00019799666110183642,
+      "loss": 0.862,
+      "step": 35
+    },
+    {
+      "epoch": 0.11068408916218293,
+      "grad_norm": 1.2397987842559814,
+      "learning_rate": 0.0001979298831385643,
+      "loss": 0.9947,
+      "step": 36
+    },
+    {
+      "epoch": 0.1137586471944658,
+      "grad_norm": 1.143774151802063,
+      "learning_rate": 0.00019786310517529217,
+      "loss": 0.7655,
+      "step": 37
+    },
+    {
+      "epoch": 0.11683320522674866,
+      "grad_norm": 0.9365643858909607,
+      "learning_rate": 0.00019779632721202004,
+      "loss": 0.669,
+      "step": 38
+    },
+    {
+      "epoch": 0.11990776325903152,
+      "grad_norm": 0.9902568459510803,
+      "learning_rate": 0.00019772954924874791,
+      "loss": 0.828,
+      "step": 39
+    },
+    {
+      "epoch": 0.12298232129131437,
+      "grad_norm": 1.1478556394577026,
+      "learning_rate": 0.00019766277128547581,
+      "loss": 0.8117,
+      "step": 40
+    },
+    {
+      "epoch": 0.12605687932359724,
+      "grad_norm": 1.234010100364685,
+      "learning_rate": 0.0001975959933222037,
+      "loss": 0.6825,
+      "step": 41
+    },
+    {
+      "epoch": 0.1291314373558801,
+      "grad_norm": 0.9467914700508118,
+      "learning_rate": 0.00019752921535893156,
+      "loss": 0.7473,
+      "step": 42
+    },
+    {
+      "epoch": 0.13220599538816294,
+      "grad_norm": 0.8851337432861328,
+      "learning_rate": 0.00019746243739565943,
+      "loss": 0.6269,
+      "step": 43
+    },
+    {
+      "epoch": 0.13528055342044581,
+      "grad_norm": 0.9889845848083496,
+      "learning_rate": 0.0001973956594323873,
+      "loss": 0.8628,
+      "step": 44
+    },
+    {
+      "epoch": 0.13835511145272866,
+      "grad_norm": 0.838996946811676,
+      "learning_rate": 0.0001973288814691152,
+      "loss": 0.5659,
+      "step": 45
+    },
+    {
+      "epoch": 0.14142966948501154,
+      "grad_norm": 0.9662989974021912,
+      "learning_rate": 0.00019726210350584308,
+      "loss": 0.7361,
+      "step": 46
+    },
+    {
+      "epoch": 0.1445042275172944,
+      "grad_norm": 0.9126756191253662,
+      "learning_rate": 0.00019719532554257095,
+      "loss": 0.6841,
+      "step": 47
+    },
+    {
+      "epoch": 0.14757878554957723,
+      "grad_norm": 1.0940887928009033,
+      "learning_rate": 0.00019712854757929883,
+      "loss": 0.7206,
+      "step": 48
+    },
+    {
+      "epoch": 0.1506533435818601,
+      "grad_norm": 0.9076804518699646,
+      "learning_rate": 0.00019706176961602673,
+      "loss": 0.6463,
+      "step": 49
+    },
+    {
+      "epoch": 0.15372790161414296,
+      "grad_norm": 1.1357953548431396,
+      "learning_rate": 0.0001969949916527546,
+      "loss": 0.7941,
+      "step": 50
+    },
+    {
+      "epoch": 0.15680245964642583,
+      "grad_norm": 0.9527478814125061,
+      "learning_rate": 0.0001969282136894825,
+      "loss": 0.5493,
+      "step": 51
+    },
+    {
+      "epoch": 0.15987701767870868,
+      "grad_norm": 1.0596438646316528,
+      "learning_rate": 0.00019686143572621037,
+      "loss": 0.8444,
+      "step": 52
+    },
+    {
+      "epoch": 0.16295157571099156,
+      "grad_norm": 0.8877243995666504,
+      "learning_rate": 0.00019679465776293825,
+      "loss": 0.683,
+      "step": 53
+    },
+    {
+      "epoch": 0.1660261337432744,
+      "grad_norm": 0.959449052810669,
+      "learning_rate": 0.00019672787979966612,
+      "loss": 0.6365,
+      "step": 54
+    },
+    {
+      "epoch": 0.16910069177555725,
+      "grad_norm": 1.0784345865249634,
+      "learning_rate": 0.000196661101836394,
+      "loss": 0.9189,
+      "step": 55
+    },
+    {
+      "epoch": 0.17217524980784013,
+      "grad_norm": 0.7773799896240234,
+      "learning_rate": 0.0001965943238731219,
+      "loss": 0.5239,
+      "step": 56
+    },
+    {
+      "epoch": 0.17524980784012298,
+      "grad_norm": 0.8316354751586914,
+      "learning_rate": 0.00019652754590984977,
+      "loss": 0.5098,
+      "step": 57
+    },
+    {
+      "epoch": 0.17832436587240585,
+      "grad_norm": 0.9249610304832458,
+      "learning_rate": 0.00019646076794657764,
+      "loss": 0.7623,
+      "step": 58
+    },
+    {
+      "epoch": 0.1813989239046887,
+      "grad_norm": 0.9492266178131104,
+      "learning_rate": 0.0001963939899833055,
+      "loss": 0.7771,
+      "step": 59
+    },
+    {
+      "epoch": 0.18447348193697155,
+      "grad_norm": 0.9219992160797119,
+      "learning_rate": 0.00019632721202003339,
+      "loss": 0.7204,
+      "step": 60
+    },
+    {
+      "epoch": 0.18754803996925443,
+      "grad_norm": 1.1604337692260742,
+      "learning_rate": 0.00019626043405676129,
+      "loss": 1.3049,
+      "step": 61
+    },
+    {
+      "epoch": 0.19062259800153727,
+      "grad_norm": 0.8619215488433838,
+      "learning_rate": 0.00019619365609348916,
+      "loss": 0.7158,
+      "step": 62
+    },
+    {
+      "epoch": 0.19369715603382015,
+      "grad_norm": 0.9047840237617493,
+      "learning_rate": 0.00019612687813021703,
+      "loss": 0.7197,
+      "step": 63
+    },
+    {
+      "epoch": 0.196771714066103,
+      "grad_norm": 0.9470083713531494,
+      "learning_rate": 0.0001960601001669449,
+      "loss": 0.7085,
+      "step": 64
+    },
+    {
+      "epoch": 0.19984627209838585,
+      "grad_norm": 0.9106523394584656,
+      "learning_rate": 0.00019599332220367278,
+      "loss": 0.7993,
+      "step": 65
+    },
+    {
+      "epoch": 0.20292083013066872,
+      "grad_norm": 0.7691475749015808,
+      "learning_rate": 0.00019592654424040068,
+      "loss": 0.6885,
+      "step": 66
+    },
+    {
+      "epoch": 0.20599538816295157,
+      "grad_norm": 1.3003575801849365,
+      "learning_rate": 0.00019585976627712855,
+      "loss": 0.8159,
+      "step": 67
+    },
+    {
+      "epoch": 0.20906994619523445,
+      "grad_norm": 0.7156600952148438,
+      "learning_rate": 0.00019579298831385645,
+      "loss": 0.5073,
+      "step": 68
+    },
+    {
+      "epoch": 0.2121445042275173,
+      "grad_norm": 0.8237838745117188,
+      "learning_rate": 0.00019572621035058433,
+      "loss": 0.7599,
+      "step": 69
+    },
+    {
+      "epoch": 0.21521906225980014,
+      "grad_norm": 0.961710512638092,
+      "learning_rate": 0.0001956594323873122,
+      "loss": 0.7606,
+      "step": 70
+    },
+    {
+      "epoch": 0.21829362029208302,
+      "grad_norm": 1.1123751401901245,
+      "learning_rate": 0.00019559265442404007,
+      "loss": 0.6975,
+      "step": 71
+    },
+    {
+      "epoch": 0.22136817832436587,
+      "grad_norm": 0.8143901824951172,
+      "learning_rate": 0.00019552587646076797,
+      "loss": 0.6289,
+      "step": 72
+    },
+    {
+      "epoch": 0.22444273635664874,
+      "grad_norm": 0.845358669757843,
+      "learning_rate": 0.00019545909849749584,
+      "loss": 0.6792,
+      "step": 73
+    },
+    {
+      "epoch": 0.2275172943889316,
+      "grad_norm": 0.6951713562011719,
+      "learning_rate": 0.00019539232053422372,
+      "loss": 0.5856,
+      "step": 74
+    },
+    {
+      "epoch": 0.23059185242121444,
+      "grad_norm": 0.7871154546737671,
+      "learning_rate": 0.0001953255425709516,
+      "loss": 0.5165,
+      "step": 75
+    },
+    {
+      "epoch": 0.23366641045349731,
+      "grad_norm": 0.8228313326835632,
+      "learning_rate": 0.00019525876460767946,
+      "loss": 0.5862,
+      "step": 76
+    },
+    {
+      "epoch": 0.23674096848578016,
+      "grad_norm": 0.8904668688774109,
+      "learning_rate": 0.00019519198664440736,
+      "loss": 0.7879,
+      "step": 77
+    },
+    {
+      "epoch": 0.23981552651806304,
+      "grad_norm": 1.0688227415084839,
+      "learning_rate": 0.00019512520868113524,
+      "loss": 0.8699,
+      "step": 78
+    },
+    {
+      "epoch": 0.24289008455034589,
+      "grad_norm": 0.9055834412574768,
+      "learning_rate": 0.0001950584307178631,
+      "loss": 0.63,
+      "step": 79
+    },
+    {
+      "epoch": 0.24596464258262873,
+      "grad_norm": 0.8711212873458862,
+      "learning_rate": 0.00019499165275459098,
+      "loss": 0.7236,
+      "step": 80
+    },
+    {
+      "epoch": 0.2490392006149116,
+      "grad_norm": 0.8715277910232544,
+      "learning_rate": 0.00019492487479131886,
+      "loss": 0.5725,
+      "step": 81
+    },
+    {
+      "epoch": 0.2521137586471945,
+      "grad_norm": 0.7342225313186646,
+      "learning_rate": 0.00019485809682804673,
+      "loss": 0.6088,
+      "step": 82
+    },
+    {
+      "epoch": 0.25518831667947733,
+      "grad_norm": 1.0883733034133911,
+      "learning_rate": 0.00019479131886477463,
+      "loss": 0.9584,
+      "step": 83
+    },
+    {
+      "epoch": 0.2582628747117602,
+      "grad_norm": 1.0370501279830933,
+      "learning_rate": 0.0001947245409015025,
+      "loss": 0.9084,
+      "step": 84
+    },
+    {
+      "epoch": 0.26133743274404303,
+      "grad_norm": 0.7242286205291748,
+      "learning_rate": 0.0001946577629382304,
+      "loss": 0.5581,
+      "step": 85
+    },
+    {
+      "epoch": 0.2644119907763259,
+      "grad_norm": 1.0035842657089233,
+      "learning_rate": 0.00019459098497495828,
+      "loss": 0.6489,
+      "step": 86
+    },
+    {
+      "epoch": 0.2674865488086088,
+      "grad_norm": 1.13217294216156,
+      "learning_rate": 0.00019452420701168615,
+      "loss": 0.8034,
+      "step": 87
+    },
+    {
+      "epoch": 0.27056110684089163,
+      "grad_norm": 1.004482388496399,
+      "learning_rate": 0.00019445742904841405,
+      "loss": 0.7356,
+      "step": 88
+    },
+    {
+      "epoch": 0.2736356648731745,
+      "grad_norm": 0.8573530316352844,
+      "learning_rate": 0.00019439065108514192,
+      "loss": 0.753,
+      "step": 89
+    },
+    {
+      "epoch": 0.2767102229054573,
+      "grad_norm": 0.9892554879188538,
+      "learning_rate": 0.0001943238731218698,
+      "loss": 0.6642,
+      "step": 90
+    },
+    {
+      "epoch": 0.2797847809377402,
+      "grad_norm": 0.7686218619346619,
+      "learning_rate": 0.00019425709515859767,
+      "loss": 0.7824,
+      "step": 91
+    },
+    {
+      "epoch": 0.2828593389700231,
+      "grad_norm": 0.7348684668540955,
+      "learning_rate": 0.00019419031719532554,
+      "loss": 0.7147,
+      "step": 92
+    },
+    {
+      "epoch": 0.2859338970023059,
+      "grad_norm": 0.6922216415405273,
+      "learning_rate": 0.00019412353923205344,
+      "loss": 0.4838,
+      "step": 93
+    },
+    {
+      "epoch": 0.2890084550345888,
+      "grad_norm": 0.8074454665184021,
+      "learning_rate": 0.00019405676126878132,
+      "loss": 0.78,
+      "step": 94
+    },
+    {
+      "epoch": 0.2920830130668716,
+      "grad_norm": 1.0575733184814453,
+      "learning_rate": 0.0001939899833055092,
+      "loss": 0.9982,
+      "step": 95
+    },
+    {
+      "epoch": 0.29515757109915447,
+      "grad_norm": 0.8220807313919067,
+      "learning_rate": 0.00019392320534223706,
+      "loss": 0.6747,
+      "step": 96
+    },
+    {
+      "epoch": 0.2982321291314374,
+      "grad_norm": 0.9394708275794983,
+      "learning_rate": 0.00019385642737896494,
+      "loss": 0.8472,
+      "step": 97
+    },
+    {
+      "epoch": 0.3013066871637202,
+      "grad_norm": 0.8578686118125916,
+      "learning_rate": 0.0001937896494156928,
+      "loss": 0.7043,
+      "step": 98
+    },
+    {
+      "epoch": 0.30438124519600307,
+      "grad_norm": 0.8185007572174072,
+      "learning_rate": 0.0001937228714524207,
+      "loss": 0.6039,
+      "step": 99
+    },
+    {
+      "epoch": 0.3074558032282859,
+      "grad_norm": 0.829156219959259,
+      "learning_rate": 0.00019365609348914858,
+      "loss": 0.6431,
+      "step": 100
+    },
+    {
+      "epoch": 0.31053036126056877,
+      "grad_norm": 0.7808781266212463,
+      "learning_rate": 0.00019358931552587646,
+      "loss": 0.6859,
+      "step": 101
+    },
+    {
+      "epoch": 0.31360491929285167,
+      "grad_norm": 0.9246749877929688,
+      "learning_rate": 0.00019352253756260436,
+      "loss": 0.7724,
+      "step": 102
+    },
+    {
+      "epoch": 0.3166794773251345,
+      "grad_norm": 0.8568185567855835,
+      "learning_rate": 0.00019345575959933223,
+      "loss": 0.6667,
+      "step": 103
+    },
+    {
+      "epoch": 0.31975403535741737,
+      "grad_norm": 0.862723171710968,
+      "learning_rate": 0.00019338898163606013,
+      "loss": 0.8031,
+      "step": 104
+    },
+    {
+      "epoch": 0.3228285933897002,
+      "grad_norm": 0.7219960689544678,
+      "learning_rate": 0.000193322203672788,
+      "loss": 0.6426,
+      "step": 105
+    },
+    {
+      "epoch": 0.3259031514219831,
+      "grad_norm": 0.7314926385879517,
+      "learning_rate": 0.00019325542570951588,
+      "loss": 0.6216,
+      "step": 106
+    },
+    {
+      "epoch": 0.32897770945426597,
+      "grad_norm": 0.8021729588508606,
+      "learning_rate": 0.00019318864774624375,
+      "loss": 0.794,
+      "step": 107
+    },
+    {
+      "epoch": 0.3320522674865488,
+      "grad_norm": 0.8143153786659241,
+      "learning_rate": 0.00019312186978297162,
+      "loss": 0.619,
+      "step": 108
+    },
+    {
+      "epoch": 0.33512682551883166,
+      "grad_norm": 0.7071205377578735,
+      "learning_rate": 0.00019305509181969952,
+      "loss": 0.607,
+      "step": 109
+    },
+    {
+      "epoch": 0.3382013835511145,
+      "grad_norm": 0.6996274590492249,
+      "learning_rate": 0.0001929883138564274,
+      "loss": 0.7112,
+      "step": 110
+    },
+    {
+      "epoch": 0.3412759415833974,
+      "grad_norm": 0.794760525226593,
+      "learning_rate": 0.00019292153589315527,
+      "loss": 0.6147,
+      "step": 111
+    },
+    {
+      "epoch": 0.34435049961568026,
+      "grad_norm": 0.7364431619644165,
+      "learning_rate": 0.00019285475792988314,
+      "loss": 0.5951,
+      "step": 112
+    },
+    {
+      "epoch": 0.3474250576479631,
+      "grad_norm": 1.001115083694458,
+      "learning_rate": 0.00019278797996661101,
+      "loss": 0.7773,
+      "step": 113
+    },
+    {
+      "epoch": 0.35049961568024596,
+      "grad_norm": 0.69508296251297,
+      "learning_rate": 0.0001927212020033389,
+      "loss": 0.7077,
+      "step": 114
+    },
+    {
+      "epoch": 0.3535741737125288,
+      "grad_norm": 0.7061307430267334,
+      "learning_rate": 0.0001926544240400668,
+      "loss": 0.5519,
+      "step": 115
+    },
+    {
+      "epoch": 0.3566487317448117,
+      "grad_norm": 0.7255828976631165,
+      "learning_rate": 0.00019258764607679466,
+      "loss": 0.6196,
+      "step": 116
+    },
+    {
+      "epoch": 0.35972328977709456,
+      "grad_norm": 0.8059960007667542,
+      "learning_rate": 0.00019252086811352253,
+      "loss": 0.6625,
+      "step": 117
+    },
+    {
+      "epoch": 0.3627978478093774,
+      "grad_norm": 0.7943634986877441,
+      "learning_rate": 0.0001924540901502504,
+      "loss": 0.821,
+      "step": 118
+    },
+    {
+      "epoch": 0.36587240584166025,
+      "grad_norm": 0.8535416126251221,
+      "learning_rate": 0.0001923873121869783,
+      "loss": 0.779,
+      "step": 119
+    },
+    {
+      "epoch": 0.3689469638739431,
+      "grad_norm": 0.745639979839325,
+      "learning_rate": 0.00019232053422370618,
+      "loss": 0.6222,
+      "step": 120
+    },
+    {
+      "epoch": 0.372021521906226,
+      "grad_norm": 0.8718635439872742,
+      "learning_rate": 0.00019225375626043408,
+      "loss": 0.6487,
+      "step": 121
+    },
+    {
+      "epoch": 0.37509607993850885,
+      "grad_norm": 0.6557499170303345,
+      "learning_rate": 0.00019218697829716195,
+      "loss": 0.4837,
+      "step": 122
+    },
+    {
+      "epoch": 0.3781706379707917,
+      "grad_norm": 0.7555654644966125,
+      "learning_rate": 0.00019212020033388983,
+      "loss": 0.7067,
+      "step": 123
+    },
+    {
+      "epoch": 0.38124519600307455,
+      "grad_norm": 0.8583431839942932,
+      "learning_rate": 0.0001920534223706177,
+      "loss": 0.7727,
+      "step": 124
+    },
+    {
+      "epoch": 0.3843197540353574,
+      "grad_norm": 0.8364957571029663,
+      "learning_rate": 0.0001919866444073456,
+      "loss": 0.6632,
+      "step": 125
+    },
+    {
+      "epoch": 0.3873943120676403,
+      "grad_norm": 0.8850215077400208,
+      "learning_rate": 0.00019191986644407347,
+      "loss": 0.6054,
+      "step": 126
+    },
+    {
+      "epoch": 0.39046887009992315,
+      "grad_norm": 0.765125036239624,
+      "learning_rate": 0.00019185308848080135,
+      "loss": 0.514,
+      "step": 127
+    },
+    {
+      "epoch": 0.393543428132206,
+      "grad_norm": 0.9039108753204346,
+      "learning_rate": 0.00019178631051752922,
+      "loss": 0.7446,
+      "step": 128
+    },
+    {
+      "epoch": 0.39661798616448884,
+      "grad_norm": 0.80910724401474,
+      "learning_rate": 0.0001917195325542571,
+      "loss": 0.7129,
+      "step": 129
+    },
+    {
+      "epoch": 0.3996925441967717,
+      "grad_norm": 0.7383053302764893,
+      "learning_rate": 0.00019165275459098497,
+      "loss": 0.6525,
+      "step": 130
+    },
+    {
+      "epoch": 0.4027671022290546,
+      "grad_norm": 0.67941814661026,
+      "learning_rate": 0.00019158597662771287,
+      "loss": 0.4873,
+      "step": 131
+    },
+    {
+      "epoch": 0.40584166026133744,
+      "grad_norm": 0.5803771018981934,
+      "learning_rate": 0.00019151919866444074,
+      "loss": 0.5808,
+      "step": 132
+    },
+    {
+      "epoch": 0.4089162182936203,
+      "grad_norm": 0.7955583930015564,
+      "learning_rate": 0.0001914524207011686,
+      "loss": 0.568,
+      "step": 133
+    },
+    {
+      "epoch": 0.41199077632590314,
+      "grad_norm": 0.768507182598114,
+      "learning_rate": 0.0001913856427378965,
+      "loss": 0.7452,
+      "step": 134
+    },
+    {
+      "epoch": 0.415065334358186,
+      "grad_norm": 0.7801786065101624,
+      "learning_rate": 0.0001913188647746244,
+      "loss": 0.6177,
+      "step": 135
+    },
+    {
+      "epoch": 0.4181398923904689,
+      "grad_norm": 0.7118993401527405,
+      "learning_rate": 0.00019125208681135226,
+      "loss": 0.6292,
+      "step": 136
+    },
+    {
+      "epoch": 0.42121445042275174,
+      "grad_norm": 0.764198362827301,
+      "learning_rate": 0.00019118530884808016,
+      "loss": 0.6508,
+      "step": 137
+    },
+    {
+      "epoch": 0.4242890084550346,
+      "grad_norm": 0.8192620873451233,
+      "learning_rate": 0.00019111853088480803,
+      "loss": 0.7133,
+      "step": 138
+    },
+    {
+      "epoch": 0.42736356648731744,
+      "grad_norm": 0.8090092539787292,
+      "learning_rate": 0.0001910517529215359,
+      "loss": 0.6533,
+      "step": 139
+    },
+    {
+      "epoch": 0.4304381245196003,
+      "grad_norm": 0.6094421148300171,
+      "learning_rate": 0.00019098497495826378,
+      "loss": 0.6756,
+      "step": 140
+    },
+    {
+      "epoch": 0.4335126825518832,
+      "grad_norm": 0.673056423664093,
+      "learning_rate": 0.00019091819699499168,
+      "loss": 0.6727,
+      "step": 141
+    },
+    {
+      "epoch": 0.43658724058416604,
+      "grad_norm": 0.6354759335517883,
+      "learning_rate": 0.00019085141903171955,
+      "loss": 0.6474,
+      "step": 142
+    },
+    {
+      "epoch": 0.4396617986164489,
+      "grad_norm": 0.9268069863319397,
+      "learning_rate": 0.00019078464106844743,
+      "loss": 1.086,
+      "step": 143
+    },
+    {
+      "epoch": 0.44273635664873173,
+      "grad_norm": 0.649411141872406,
+      "learning_rate": 0.0001907178631051753,
+      "loss": 0.6204,
+      "step": 144
+    },
+    {
+      "epoch": 0.4458109146810146,
+      "grad_norm": 0.7348757982254028,
+      "learning_rate": 0.00019065108514190317,
+      "loss": 0.7464,
+      "step": 145
+    },
+    {
+      "epoch": 0.4488854727132975,
+      "grad_norm": 0.6263845562934875,
+      "learning_rate": 0.00019058430717863107,
+      "loss": 0.7217,
+      "step": 146
+    },
+    {
+      "epoch": 0.45196003074558033,
+      "grad_norm": 0.7039823532104492,
+      "learning_rate": 0.00019051752921535895,
+      "loss": 0.792,
+      "step": 147
+    },
+    {
+      "epoch": 0.4550345887778632,
+      "grad_norm": 0.6015087366104126,
+      "learning_rate": 0.00019045075125208682,
+      "loss": 0.6796,
+      "step": 148
+    },
+    {
+      "epoch": 0.45810914681014603,
+      "grad_norm": 0.6295155882835388,
+      "learning_rate": 0.0001903839732888147,
+      "loss": 0.5762,
+      "step": 149
+    },
+    {
+      "epoch": 0.4611837048424289,
+      "grad_norm": 0.6592227816581726,
+      "learning_rate": 0.00019031719532554257,
+      "loss": 0.662,
+      "step": 150
+    },
+    {
+      "epoch": 0.4642582628747118,
+      "grad_norm": 0.7038462162017822,
+      "learning_rate": 0.00019025041736227044,
+      "loss": 0.5505,
+      "step": 151
+    },
+    {
+      "epoch": 0.46733282090699463,
+      "grad_norm": 0.7902334332466125,
+      "learning_rate": 0.00019018363939899834,
+      "loss": 0.7457,
+      "step": 152
+    },
+    {
+      "epoch": 0.4704073789392775,
+      "grad_norm": 0.673903226852417,
+      "learning_rate": 0.0001901168614357262,
+      "loss": 0.6595,
+      "step": 153
+    },
+    {
+      "epoch": 0.4734819369715603,
+      "grad_norm": 0.7488313913345337,
+      "learning_rate": 0.0001900500834724541,
+      "loss": 0.8645,
+      "step": 154
+    },
+    {
+      "epoch": 0.4765564950038432,
+      "grad_norm": 0.9577059149742126,
+      "learning_rate": 0.00018998330550918199,
+      "loss": 0.9233,
+      "step": 155
+    },
+    {
+      "epoch": 0.4796310530361261,
+      "grad_norm": 0.6935007572174072,
+      "learning_rate": 0.00018991652754590986,
+      "loss": 0.654,
+      "step": 156
+    },
+    {
+      "epoch": 0.4827056110684089,
+      "grad_norm": 0.7638063430786133,
+      "learning_rate": 0.00018984974958263776,
+      "loss": 0.7632,
+      "step": 157
+    },
+    {
+      "epoch": 0.48578016910069177,
+      "grad_norm": 0.6244141459465027,
+      "learning_rate": 0.00018978297161936563,
+      "loss": 0.476,
+      "step": 158
+    },
+    {
+      "epoch": 0.4888547271329746,
+      "grad_norm": 0.9423524141311646,
+      "learning_rate": 0.0001897161936560935,
+      "loss": 0.7204,
+      "step": 159
+    },
+    {
+      "epoch": 0.49192928516525747,
+      "grad_norm": 0.8479251861572266,
+      "learning_rate": 0.00018964941569282138,
+      "loss": 0.7546,
+      "step": 160
+    },
+    {
+      "epoch": 0.49500384319754037,
+      "grad_norm": 0.7143809795379639,
+      "learning_rate": 0.00018958263772954925,
+      "loss": 0.5741,
+      "step": 161
+    },
+    {
+      "epoch": 0.4980784012298232,
+      "grad_norm": 0.7384529709815979,
+      "learning_rate": 0.00018951585976627715,
+      "loss": 0.5389,
+      "step": 162
+    },
+    {
+      "epoch": 0.5011529592621061,
+      "grad_norm": 0.8297166228294373,
+      "learning_rate": 0.00018944908180300502,
+      "loss": 0.7686,
+      "step": 163
+    },
+    {
+      "epoch": 0.504227517294389,
+      "grad_norm": 0.9101626873016357,
+      "learning_rate": 0.0001893823038397329,
+      "loss": 0.709,
+      "step": 164
+    },
+    {
+      "epoch": 0.5073020753266718,
+      "grad_norm": 0.8472141027450562,
+      "learning_rate": 0.00018931552587646077,
+      "loss": 0.7436,
+      "step": 165
+    },
+    {
+      "epoch": 0.5103766333589547,
+      "grad_norm": 0.7950085401535034,
+      "learning_rate": 0.00018924874791318864,
+      "loss": 0.6462,
+      "step": 166
+    },
+    {
+      "epoch": 0.5134511913912375,
+      "grad_norm": 0.8818950057029724,
+      "learning_rate": 0.00018918196994991652,
+      "loss": 0.799,
+      "step": 167
+    },
+    {
+      "epoch": 0.5165257494235204,
+      "grad_norm": 0.816806435585022,
+      "learning_rate": 0.00018911519198664442,
+      "loss": 0.5552,
+      "step": 168
+    },
+    {
+      "epoch": 0.5196003074558032,
+      "grad_norm": 0.6618863940238953,
+      "learning_rate": 0.0001890484140233723,
+      "loss": 0.4716,
+      "step": 169
+    },
+    {
+      "epoch": 0.5226748654880861,
+      "grad_norm": 0.6072298288345337,
+      "learning_rate": 0.00018898163606010016,
+      "loss": 0.5535,
+      "step": 170
+    },
+    {
+      "epoch": 0.5257494235203689,
+      "grad_norm": 0.7458838820457458,
+      "learning_rate": 0.00018891485809682806,
+      "loss": 0.8668,
+      "step": 171
+    },
+    {
+      "epoch": 0.5288239815526518,
+      "grad_norm": 0.6389868855476379,
+      "learning_rate": 0.00018884808013355594,
+      "loss": 0.5767,
+      "step": 172
+    },
+    {
+      "epoch": 0.5318985395849347,
+      "grad_norm": 0.6578021049499512,
+      "learning_rate": 0.00018878130217028384,
+      "loss": 0.7959,
+      "step": 173
+    },
+    {
+      "epoch": 0.5349730976172176,
+      "grad_norm": 1.0363503694534302,
+      "learning_rate": 0.0001887145242070117,
+      "loss": 0.6947,
+      "step": 174
+    },
+    {
+      "epoch": 0.5380476556495004,
+      "grad_norm": 0.7046053409576416,
+      "learning_rate": 0.00018864774624373958,
+      "loss": 0.6739,
+      "step": 175
+    },
+    {
+      "epoch": 0.5411222136817833,
+      "grad_norm": 0.8335860967636108,
+      "learning_rate": 0.00018858096828046746,
+      "loss": 0.7296,
+      "step": 176
+    },
+    {
+      "epoch": 0.5441967717140661,
+      "grad_norm": 0.6775506734848022,
+      "learning_rate": 0.00018851419031719533,
+      "loss": 0.5817,
+      "step": 177
+    },
+    {
+      "epoch": 0.547271329746349,
+      "grad_norm": 0.7883867621421814,
+      "learning_rate": 0.00018844741235392323,
+      "loss": 0.7067,
+      "step": 178
+    },
+    {
+      "epoch": 0.5503458877786318,
+      "grad_norm": 0.7405235767364502,
+      "learning_rate": 0.0001883806343906511,
+      "loss": 0.8347,
+      "step": 179
+    },
+    {
+      "epoch": 0.5534204458109147,
+      "grad_norm": 0.7003398537635803,
+      "learning_rate": 0.00018831385642737898,
+      "loss": 0.6322,
+      "step": 180
+    },
+    {
+      "epoch": 0.5564950038431975,
+      "grad_norm": 0.7515572309494019,
+      "learning_rate": 0.00018824707846410685,
+      "loss": 0.6944,
+      "step": 181
+    },
+    {
+      "epoch": 0.5595695618754803,
+      "grad_norm": 0.6841930150985718,
+      "learning_rate": 0.00018818030050083472,
+      "loss": 0.5833,
+      "step": 182
+    },
+    {
+      "epoch": 0.5626441199077633,
+      "grad_norm": 0.6888793706893921,
+      "learning_rate": 0.0001881135225375626,
+      "loss": 0.6509,
+      "step": 183
+    },
+    {
+      "epoch": 0.5657186779400462,
+      "grad_norm": 0.6468893885612488,
+      "learning_rate": 0.0001880467445742905,
+      "loss": 0.5695,
+      "step": 184
+    },
+    {
+      "epoch": 0.568793235972329,
+      "grad_norm": 0.7017901539802551,
+      "learning_rate": 0.00018797996661101837,
+      "loss": 0.5681,
+      "step": 185
+    },
+    {
+      "epoch": 0.5718677940046119,
+      "grad_norm": 0.7171371579170227,
+      "learning_rate": 0.00018791318864774624,
+      "loss": 0.6043,
+      "step": 186
+    },
+    {
+      "epoch": 0.5749423520368947,
+      "grad_norm": 0.77923583984375,
+      "learning_rate": 0.00018784641068447412,
+      "loss": 0.6694,
+      "step": 187
+    },
+    {
+      "epoch": 0.5780169100691775,
+      "grad_norm": 0.7366213202476501,
+      "learning_rate": 0.00018777963272120202,
+      "loss": 0.6366,
+      "step": 188
+    },
+    {
+      "epoch": 0.5810914681014604,
+      "grad_norm": 0.6756160259246826,
+      "learning_rate": 0.0001877128547579299,
+      "loss": 0.6678,
+      "step": 189
+    },
+    {
+      "epoch": 0.5841660261337432,
+      "grad_norm": 0.6736173629760742,
+      "learning_rate": 0.0001876460767946578,
+      "loss": 0.6418,
+      "step": 190
+    },
+    {
+      "epoch": 0.5872405841660261,
+      "grad_norm": 0.7356569170951843,
+      "learning_rate": 0.00018757929883138566,
+      "loss": 0.6235,
+      "step": 191
+    },
+    {
+      "epoch": 0.5903151421983089,
+      "grad_norm": 0.8169667720794678,
+      "learning_rate": 0.00018751252086811354,
+      "loss": 0.768,
+      "step": 192
+    },
+    {
+      "epoch": 0.5933897002305919,
+      "grad_norm": 1.0212959051132202,
+      "learning_rate": 0.0001874457429048414,
+      "loss": 0.7575,
+      "step": 193
+    },
+    {
+      "epoch": 0.5964642582628747,
+      "grad_norm": 0.7294356822967529,
+      "learning_rate": 0.0001873789649415693,
+      "loss": 0.5606,
+      "step": 194
+    },
+    {
+      "epoch": 0.5995388162951576,
+      "grad_norm": 0.8933930397033691,
+      "learning_rate": 0.00018731218697829718,
+      "loss": 0.7284,
+      "step": 195
+    },
+    {
+      "epoch": 0.6026133743274404,
+      "grad_norm": 0.640938937664032,
+      "learning_rate": 0.00018724540901502506,
+      "loss": 0.4718,
+      "step": 196
+    },
+    {
+      "epoch": 0.6056879323597233,
+      "grad_norm": 1.032175064086914,
+      "learning_rate": 0.00018717863105175293,
+      "loss": 0.7647,
+      "step": 197
+    },
+    {
+      "epoch": 0.6087624903920061,
+      "grad_norm": 0.7845223546028137,
+      "learning_rate": 0.0001871118530884808,
+      "loss": 0.657,
+      "step": 198
+    },
+    {
+      "epoch": 0.611837048424289,
+      "grad_norm": 0.7698432803153992,
+      "learning_rate": 0.00018704507512520868,
+      "loss": 0.6231,
+      "step": 199
+    },
+    {
+      "epoch": 0.6149116064565718,
+      "grad_norm": 0.8457287549972534,
+      "learning_rate": 0.00018697829716193658,
+      "loss": 0.5908,
+      "step": 200
+    },
+    {
+      "epoch": 0.6179861644888547,
+      "grad_norm": 0.9682031869888306,
+      "learning_rate": 0.00018691151919866445,
+      "loss": 0.7658,
+      "step": 201
+    },
+    {
+      "epoch": 0.6210607225211375,
+      "grad_norm": 0.7560285925865173,
+      "learning_rate": 0.00018684474123539232,
+      "loss": 0.5672,
+      "step": 202
+    },
+    {
+      "epoch": 0.6241352805534205,
+      "grad_norm": 0.749602198600769,
+      "learning_rate": 0.0001867779632721202,
+      "loss": 0.5424,
+      "step": 203
+    },
+    {
+      "epoch": 0.6272098385857033,
+      "grad_norm": 0.6830094456672668,
+      "learning_rate": 0.00018671118530884807,
+      "loss": 0.6763,
+      "step": 204
+    },
+    {
+      "epoch": 0.6302843966179862,
+      "grad_norm": 0.696247935295105,
+      "learning_rate": 0.00018664440734557597,
+      "loss": 0.5836,
+      "step": 205
+    },
+    {
+      "epoch": 0.633358954650269,
+      "grad_norm": 0.7082201242446899,
+      "learning_rate": 0.00018657762938230384,
+      "loss": 0.6022,
+      "step": 206
+    },
+    {
+      "epoch": 0.6364335126825519,
+      "grad_norm": 0.7224099636077881,
+      "learning_rate": 0.00018651085141903174,
+      "loss": 0.7518,
+      "step": 207
+    },
+    {
+      "epoch": 0.6395080707148347,
+      "grad_norm": 0.6942217946052551,
+      "learning_rate": 0.00018644407345575962,
+      "loss": 0.7052,
+      "step": 208
+    },
+    {
+      "epoch": 0.6425826287471176,
+      "grad_norm": 0.6529689431190491,
+      "learning_rate": 0.0001863772954924875,
+      "loss": 0.5383,
+      "step": 209
+    },
+    {
+      "epoch": 0.6456571867794004,
+      "grad_norm": 0.6160123944282532,
+      "learning_rate": 0.0001863105175292154,
+      "loss": 0.5955,
+      "step": 210
+    },
+    {
+      "epoch": 0.6487317448116833,
+      "grad_norm": 0.6024816036224365,
+      "learning_rate": 0.00018624373956594326,
+      "loss": 0.6733,
+      "step": 211
+    },
+    {
+      "epoch": 0.6518063028439662,
+      "grad_norm": 0.5778756737709045,
+      "learning_rate": 0.00018617696160267113,
+      "loss": 0.5349,
+      "step": 212
+    },
+    {
+      "epoch": 0.6548808608762491,
+      "grad_norm": 0.7351570725440979,
+      "learning_rate": 0.000186110183639399,
+      "loss": 0.795,
+      "step": 213
+    },
+    {
+      "epoch": 0.6579554189085319,
+      "grad_norm": 0.8623232245445251,
+      "learning_rate": 0.00018604340567612688,
+      "loss": 0.7451,
+      "step": 214
+    },
+    {
+      "epoch": 0.6610299769408148,
+      "grad_norm": 0.7850607633590698,
+      "learning_rate": 0.00018597662771285475,
+      "loss": 0.6888,
+      "step": 215
+    },
+    {
+      "epoch": 0.6641045349730976,
+      "grad_norm": 0.687150239944458,
+      "learning_rate": 0.00018590984974958265,
+      "loss": 0.5033,
+      "step": 216
+    },
+    {
+      "epoch": 0.6671790930053805,
+      "grad_norm": 0.532691478729248,
+      "learning_rate": 0.00018584307178631053,
+      "loss": 0.4734,
+      "step": 217
+    },
+    {
+      "epoch": 0.6702536510376633,
+      "grad_norm": 0.7870986461639404,
+      "learning_rate": 0.0001857762938230384,
+      "loss": 0.7304,
+      "step": 218
+    },
+    {
+      "epoch": 0.6733282090699462,
+      "grad_norm": 0.7504063248634338,
+      "learning_rate": 0.00018570951585976627,
+      "loss": 0.7476,
+      "step": 219
+    },
+    {
+      "epoch": 0.676402767102229,
+      "grad_norm": 0.7235811948776245,
+      "learning_rate": 0.00018564273789649415,
+      "loss": 0.5458,
+      "step": 220
+    },
+    {
+      "epoch": 0.6794773251345119,
+      "grad_norm": 0.8325716853141785,
+      "learning_rate": 0.00018557595993322205,
+      "loss": 0.6991,
+      "step": 221
+    },
+    {
+      "epoch": 0.6825518831667948,
+      "grad_norm": 0.7696716785430908,
+      "learning_rate": 0.00018550918196994992,
+      "loss": 0.6374,
+      "step": 222
+    },
+    {
+      "epoch": 0.6856264411990777,
+      "grad_norm": 0.9007569551467896,
+      "learning_rate": 0.0001854424040066778,
+      "loss": 0.7021,
+      "step": 223
+    },
+    {
+      "epoch": 0.6887009992313605,
+      "grad_norm": 0.8389153480529785,
+      "learning_rate": 0.0001853756260434057,
+      "loss": 0.7095,
+      "step": 224
+    },
+    {
+      "epoch": 0.6917755572636434,
+      "grad_norm": 0.8680058121681213,
+      "learning_rate": 0.00018530884808013357,
+      "loss": 0.6494,
+      "step": 225
+    },
+    {
+      "epoch": 0.6948501152959262,
+      "grad_norm": 0.5919209718704224,
+      "learning_rate": 0.00018524207011686147,
+      "loss": 0.4893,
+      "step": 226
+    },
+    {
+      "epoch": 0.6979246733282091,
+      "grad_norm": 0.6116464138031006,
+      "learning_rate": 0.00018517529215358934,
+      "loss": 0.6041,
+      "step": 227
+    },
+    {
+      "epoch": 0.7009992313604919,
+      "grad_norm": 0.6643829941749573,
+      "learning_rate": 0.00018510851419031721,
+      "loss": 0.6262,
+      "step": 228
+    },
+    {
+      "epoch": 0.7040737893927748,
+      "grad_norm": 0.8140367269515991,
+      "learning_rate": 0.0001850417362270451,
+      "loss": 0.6593,
+      "step": 229
+    },
+    {
+      "epoch": 0.7071483474250576,
+      "grad_norm": 0.7204163670539856,
+      "learning_rate": 0.00018497495826377296,
+      "loss": 0.6194,
+      "step": 230
+    },
+    {
+      "epoch": 0.7102229054573405,
+      "grad_norm": 0.6929581165313721,
+      "learning_rate": 0.00018490818030050083,
+      "loss": 0.638,
+      "step": 231
+    },
+    {
+      "epoch": 0.7132974634896234,
+      "grad_norm": 0.8570030331611633,
+      "learning_rate": 0.00018484140233722873,
+      "loss": 0.7651,
+      "step": 232
+    },
+    {
+      "epoch": 0.7163720215219063,
+      "grad_norm": 0.8367635011672974,
+      "learning_rate": 0.0001847746243739566,
+      "loss": 0.7164,
+      "step": 233
+    },
+    {
+      "epoch": 0.7194465795541891,
+      "grad_norm": 0.730556845664978,
+      "learning_rate": 0.00018470784641068448,
+      "loss": 0.7644,
+      "step": 234
+    },
+    {
+      "epoch": 0.722521137586472,
+      "grad_norm": 0.6781991720199585,
+      "learning_rate": 0.00018464106844741235,
+      "loss": 0.739,
+      "step": 235
+    },
+    {
+      "epoch": 0.7255956956187548,
+      "grad_norm": 0.6006051301956177,
+      "learning_rate": 0.00018457429048414023,
+      "loss": 0.6119,
+      "step": 236
+    },
+    {
+      "epoch": 0.7286702536510377,
+      "grad_norm": 0.7293769717216492,
+      "learning_rate": 0.00018450751252086813,
+      "loss": 0.7567,
+      "step": 237
+    },
+    {
+      "epoch": 0.7317448116833205,
+      "grad_norm": 0.8237872123718262,
+      "learning_rate": 0.000184440734557596,
+      "loss": 0.8348,
+      "step": 238
+    },
+    {
+      "epoch": 0.7348193697156034,
+      "grad_norm": 0.7130082845687866,
+      "learning_rate": 0.00018437395659432387,
+      "loss": 0.7532,
+      "step": 239
+    },
+    {
+      "epoch": 0.7378939277478862,
+      "grad_norm": 0.6330310702323914,
+      "learning_rate": 0.00018430717863105175,
+      "loss": 0.5578,
+      "step": 240
+    },
+    {
+      "epoch": 0.740968485780169,
+      "grad_norm": 0.5756601095199585,
+      "learning_rate": 0.00018424040066777965,
+      "loss": 0.6896,
+      "step": 241
+    },
+    {
+      "epoch": 0.744043043812452,
+      "grad_norm": 0.5361329913139343,
+      "learning_rate": 0.00018417362270450752,
+      "loss": 0.5699,
+      "step": 242
+    },
+    {
+      "epoch": 0.7471176018447349,
+      "grad_norm": 0.8006643056869507,
+      "learning_rate": 0.00018410684474123542,
+      "loss": 0.7754,
+      "step": 243
+    },
+    {
+      "epoch": 0.7501921598770177,
+      "grad_norm": 0.5526012778282166,
+      "learning_rate": 0.0001840400667779633,
+      "loss": 0.652,
+      "step": 244
+    },
+    {
+      "epoch": 0.7532667179093006,
+      "grad_norm": 0.6159217357635498,
+      "learning_rate": 0.00018397328881469117,
+      "loss": 0.6075,
+      "step": 245
+    },
+    {
+      "epoch": 0.7563412759415834,
+      "grad_norm": 0.7183135151863098,
+      "learning_rate": 0.00018390651085141904,
+      "loss": 0.7171,
+      "step": 246
+    },
+    {
+      "epoch": 0.7594158339738662,
+      "grad_norm": 0.5863479375839233,
+      "learning_rate": 0.0001838397328881469,
+      "loss": 0.6779,
+      "step": 247
+    },
+    {
+      "epoch": 0.7624903920061491,
+      "grad_norm": 0.6453308463096619,
+      "learning_rate": 0.0001837729549248748,
+      "loss": 0.6641,
+      "step": 248
+    },
+    {
+      "epoch": 0.765564950038432,
+      "grad_norm": 0.6052024364471436,
+      "learning_rate": 0.00018370617696160269,
+      "loss": 0.5686,
+      "step": 249
+    },
+    {
+      "epoch": 0.7686395080707148,
+      "grad_norm": 0.578968346118927,
+      "learning_rate": 0.00018363939899833056,
+      "loss": 0.6197,
+      "step": 250
+    },
+    {
+      "epoch": 0.7717140661029976,
+      "grad_norm": 0.6560538411140442,
+      "learning_rate": 0.00018357262103505843,
+      "loss": 0.5807,
+      "step": 251
+    },
+    {
+      "epoch": 0.7747886241352806,
+      "grad_norm": 0.5958215594291687,
+      "learning_rate": 0.0001835058430717863,
+      "loss": 0.5999,
+      "step": 252
+    },
+    {
+      "epoch": 0.7778631821675634,
+      "grad_norm": 0.5787363052368164,
+      "learning_rate": 0.0001834390651085142,
+      "loss": 0.4809,
+      "step": 253
+    },
+    {
+      "epoch": 0.7809377401998463,
+      "grad_norm": 0.5788077712059021,
+      "learning_rate": 0.00018337228714524208,
+      "loss": 0.5878,
+      "step": 254
+    },
+    {
+      "epoch": 0.7840122982321291,
+      "grad_norm": 0.6090837121009827,
+      "learning_rate": 0.00018330550918196995,
+      "loss": 0.5489,
+      "step": 255
+    },
+    {
+      "epoch": 0.787086856264412,
+      "grad_norm": 0.7720903754234314,
+      "learning_rate": 0.00018323873121869782,
+      "loss": 0.7388,
+      "step": 256
+    },
+    {
+      "epoch": 0.7901614142966948,
+      "grad_norm": 0.8125558495521545,
+      "learning_rate": 0.0001831719532554257,
+      "loss": 0.7156,
+      "step": 257
+    },
+    {
+      "epoch": 0.7932359723289777,
+      "grad_norm": 0.9323811531066895,
+      "learning_rate": 0.0001831051752921536,
+      "loss": 0.7382,
+      "step": 258
+    },
+    {
+      "epoch": 0.7963105303612605,
+      "grad_norm": 1.0001492500305176,
+      "learning_rate": 0.00018303839732888147,
+      "loss": 0.8477,
+      "step": 259
+    },
+    {
+      "epoch": 0.7993850883935434,
+      "grad_norm": 0.5271466374397278,
+      "learning_rate": 0.00018297161936560937,
+      "loss": 0.6826,
+      "step": 260
+    },
+    {
+      "epoch": 0.8024596464258262,
+      "grad_norm": 0.6705284118652344,
+      "learning_rate": 0.00018290484140233724,
+      "loss": 0.6206,
+      "step": 261
+    },
+    {
+      "epoch": 0.8055342044581092,
+      "grad_norm": 0.6997379064559937,
+      "learning_rate": 0.00018283806343906512,
+      "loss": 0.692,
+      "step": 262
+    },
+    {
+      "epoch": 0.808608762490392,
+      "grad_norm": 0.6880616545677185,
+      "learning_rate": 0.000182771285475793,
+      "loss": 0.4271,
+      "step": 263
+    },
+    {
+      "epoch": 0.8116833205226749,
+      "grad_norm": 0.6490948796272278,
+      "learning_rate": 0.0001827045075125209,
+      "loss": 0.5254,
+      "step": 264
+    },
+    {
+      "epoch": 0.8147578785549577,
+      "grad_norm": 0.6712121963500977,
+      "learning_rate": 0.00018263772954924876,
+      "loss": 0.6695,
+      "step": 265
+    },
+    {
+      "epoch": 0.8178324365872406,
+      "grad_norm": 0.6833428740501404,
+      "learning_rate": 0.00018257095158597664,
+      "loss": 0.6575,
+      "step": 266
+    },
+    {
+      "epoch": 0.8209069946195234,
+      "grad_norm": 0.567756712436676,
+      "learning_rate": 0.0001825041736227045,
+      "loss": 0.5706,
+      "step": 267
+    },
+    {
+      "epoch": 0.8239815526518063,
+      "grad_norm": 0.6579324007034302,
+      "learning_rate": 0.00018243739565943238,
+      "loss": 0.6428,
+      "step": 268
+    },
+    {
+      "epoch": 0.8270561106840891,
+      "grad_norm": 0.6071141958236694,
+      "learning_rate": 0.00018237061769616028,
+      "loss": 0.687,
+      "step": 269
+    },
+    {
+      "epoch": 0.830130668716372,
+      "grad_norm": 0.7748661041259766,
+      "learning_rate": 0.00018230383973288816,
+      "loss": 0.5839,
+      "step": 270
+    },
+    {
+      "epoch": 0.8332052267486548,
+      "grad_norm": 0.8571879267692566,
+      "learning_rate": 0.00018223706176961603,
+      "loss": 0.8454,
+      "step": 271
+    },
+    {
+      "epoch": 0.8362797847809378,
+      "grad_norm": 0.7605094313621521,
+      "learning_rate": 0.0001821702838063439,
+      "loss": 0.6888,
+      "step": 272
+    },
+    {
+      "epoch": 0.8393543428132206,
+      "grad_norm": 0.7963970303535461,
+      "learning_rate": 0.00018210350584307178,
+      "loss": 0.6651,
+      "step": 273
+    },
+    {
+      "epoch": 0.8424289008455035,
+      "grad_norm": 0.8123186826705933,
+      "learning_rate": 0.00018203672787979968,
+      "loss": 0.6378,
+      "step": 274
+    },
+    {
+      "epoch": 0.8455034588777863,
+      "grad_norm": 0.6652207374572754,
+      "learning_rate": 0.00018196994991652755,
+      "loss": 0.6736,
+      "step": 275
+    },
+    {
+      "epoch": 0.8485780169100692,
+      "grad_norm": 0.8622680306434631,
+      "learning_rate": 0.00018190317195325542,
+      "loss": 0.7403,
+      "step": 276
+    },
+    {
+      "epoch": 0.851652574942352,
+      "grad_norm": 0.7115731239318848,
+      "learning_rate": 0.00018183639398998332,
+      "loss": 0.6155,
+      "step": 277
+    },
+    {
+      "epoch": 0.8547271329746349,
+      "grad_norm": 0.8267805576324463,
+      "learning_rate": 0.0001817696160267112,
+      "loss": 0.587,
+      "step": 278
+    },
+    {
+      "epoch": 0.8578016910069177,
+      "grad_norm": 0.6537695527076721,
+      "learning_rate": 0.0001817028380634391,
+      "loss": 0.6371,
+      "step": 279
+    },
+    {
+      "epoch": 0.8608762490392006,
+      "grad_norm": 0.76251620054245,
+      "learning_rate": 0.00018163606010016697,
+      "loss": 0.7431,
+      "step": 280
+    },
+    {
+      "epoch": 0.8639508070714835,
+      "grad_norm": 0.7395420670509338,
+      "learning_rate": 0.00018156928213689484,
+      "loss": 0.687,
+      "step": 281
+    },
+    {
+      "epoch": 0.8670253651037664,
+      "grad_norm": 0.6425495147705078,
+      "learning_rate": 0.00018150250417362272,
+      "loss": 0.5745,
+      "step": 282
+    },
+    {
+      "epoch": 0.8700999231360492,
+      "grad_norm": 0.868341326713562,
+      "learning_rate": 0.0001814357262103506,
+      "loss": 0.8135,
+      "step": 283
+    },
+    {
+      "epoch": 0.8731744811683321,
+      "grad_norm": 0.8760446906089783,
+      "learning_rate": 0.00018136894824707846,
+      "loss": 0.9087,
+      "step": 284
+    },
+    {
+      "epoch": 0.8762490392006149,
+      "grad_norm": 0.5800943970680237,
+      "learning_rate": 0.00018130217028380636,
+      "loss": 0.5347,
+      "step": 285
+    },
+    {
+      "epoch": 0.8793235972328978,
+      "grad_norm": 0.7919514179229736,
+      "learning_rate": 0.00018123539232053424,
+      "loss": 0.5879,
+      "step": 286
+    },
+    {
+      "epoch": 0.8823981552651806,
+      "grad_norm": 0.5620681643486023,
+      "learning_rate": 0.0001811686143572621,
+      "loss": 0.51,
+      "step": 287
+    },
+    {
+      "epoch": 0.8854727132974635,
+      "grad_norm": 0.7460557818412781,
+      "learning_rate": 0.00018110183639398998,
+      "loss": 0.8177,
+      "step": 288
+    },
+    {
+      "epoch": 0.8885472713297463,
+      "grad_norm": 0.774587094783783,
+      "learning_rate": 0.00018103505843071786,
+      "loss": 0.6761,
+      "step": 289
+    },
+    {
+      "epoch": 0.8916218293620292,
+      "grad_norm": 0.6145612597465515,
+      "learning_rate": 0.00018096828046744576,
+      "loss": 0.7094,
+      "step": 290
+    },
+    {
+      "epoch": 0.8946963873943121,
+      "grad_norm": 0.6812341809272766,
+      "learning_rate": 0.00018090150250417363,
+      "loss": 0.8529,
+      "step": 291
+    },
+    {
+      "epoch": 0.897770945426595,
+      "grad_norm": 0.7788804769515991,
+      "learning_rate": 0.0001808347245409015,
+      "loss": 0.6635,
+      "step": 292
+    },
+    {
+      "epoch": 0.9008455034588778,
+      "grad_norm": 0.6078181862831116,
+      "learning_rate": 0.00018076794657762938,
+      "loss": 0.6468,
+      "step": 293
+    },
+    {
+      "epoch": 0.9039200614911607,
+      "grad_norm": 0.6376216411590576,
+      "learning_rate": 0.00018070116861435728,
+      "loss": 0.6725,
+      "step": 294
+    },
+    {
+      "epoch": 0.9069946195234435,
+      "grad_norm": 0.5974690914154053,
+      "learning_rate": 0.00018063439065108515,
+      "loss": 0.5874,
+      "step": 295
+    },
+    {
+      "epoch": 0.9100691775557264,
+      "grad_norm": 0.6442948579788208,
+      "learning_rate": 0.00018056761268781305,
+      "loss": 0.626,
+      "step": 296
+    },
+    {
+      "epoch": 0.9131437355880092,
+      "grad_norm": 0.7131801247596741,
+      "learning_rate": 0.00018050083472454092,
+      "loss": 0.6812,
+      "step": 297
+    },
+    {
+      "epoch": 0.9162182936202921,
+      "grad_norm": 0.823663592338562,
+      "learning_rate": 0.0001804340567612688,
+      "loss": 0.9266,
+      "step": 298
+    },
+    {
+      "epoch": 0.9192928516525749,
+      "grad_norm": 0.7136701345443726,
+      "learning_rate": 0.00018036727879799667,
+      "loss": 0.6225,
+      "step": 299
+    },
+    {
+      "epoch": 0.9223674096848578,
+      "grad_norm": 0.6703259348869324,
+      "learning_rate": 0.00018030050083472454,
+      "loss": 0.6608,
+      "step": 300
+    },
+    {
+      "epoch": 0.9254419677171407,
+      "grad_norm": 0.6696874499320984,
+      "learning_rate": 0.00018023372287145244,
+      "loss": 0.5609,
+      "step": 301
+    },
+    {
+      "epoch": 0.9285165257494236,
+      "grad_norm": 0.6228551268577576,
+      "learning_rate": 0.00018016694490818031,
+      "loss": 0.7327,
+      "step": 302
+    },
+    {
+      "epoch": 0.9315910837817064,
+      "grad_norm": 0.6737201809883118,
+      "learning_rate": 0.0001801001669449082,
+      "loss": 0.6051,
+      "step": 303
+    },
+    {
+      "epoch": 0.9346656418139893,
+      "grad_norm": 0.7718166708946228,
+      "learning_rate": 0.00018003338898163606,
+      "loss": 0.6961,
+      "step": 304
+    },
+    {
+      "epoch": 0.9377401998462721,
+      "grad_norm": 0.9040055871009827,
+      "learning_rate": 0.00017996661101836393,
+      "loss": 0.6448,
+      "step": 305
+    },
+    {
+      "epoch": 0.940814757878555,
+      "grad_norm": 0.7209524512290955,
+      "learning_rate": 0.00017989983305509183,
+      "loss": 0.695,
+      "step": 306
+    },
+    {
+      "epoch": 0.9438893159108378,
+      "grad_norm": 0.6280409693717957,
+      "learning_rate": 0.0001798330550918197,
+      "loss": 0.677,
+      "step": 307
+    },
+    {
+      "epoch": 0.9469638739431206,
+      "grad_norm": 0.715514063835144,
+      "learning_rate": 0.00017976627712854758,
+      "loss": 0.6076,
+      "step": 308
+    },
+    {
+      "epoch": 0.9500384319754035,
+      "grad_norm": 0.6662933230400085,
+      "learning_rate": 0.00017969949916527545,
+      "loss": 0.6453,
+      "step": 309
+    },
+    {
+      "epoch": 0.9531129900076863,
+      "grad_norm": 0.6966415047645569,
+      "learning_rate": 0.00017963272120200333,
+      "loss": 0.7091,
+      "step": 310
+    },
+    {
+      "epoch": 0.9561875480399693,
+      "grad_norm": 0.7018651366233826,
+      "learning_rate": 0.00017956594323873123,
+      "loss": 0.6113,
+      "step": 311
+    },
+    {
+      "epoch": 0.9592621060722522,
+      "grad_norm": 0.5975345373153687,
+      "learning_rate": 0.0001794991652754591,
+      "loss": 0.4381,
+      "step": 312
+    },
+    {
+      "epoch": 0.962336664104535,
+      "grad_norm": 0.7371988296508789,
+      "learning_rate": 0.000179432387312187,
+      "loss": 0.7471,
+      "step": 313
+    },
+    {
+      "epoch": 0.9654112221368178,
+      "grad_norm": 0.5989629030227661,
+      "learning_rate": 0.00017936560934891487,
+      "loss": 0.5947,
+      "step": 314
+    },
+    {
+      "epoch": 0.9684857801691007,
+      "grad_norm": 0.5772401094436646,
+      "learning_rate": 0.00017929883138564275,
+      "loss": 0.5862,
+      "step": 315
+    },
+    {
+      "epoch": 0.9715603382013835,
+      "grad_norm": 0.7896726727485657,
+      "learning_rate": 0.00017923205342237062,
+      "loss": 0.7367,
+      "step": 316
+    },
+    {
+      "epoch": 0.9746348962336664,
+      "grad_norm": 0.9095852375030518,
+      "learning_rate": 0.00017916527545909852,
+      "loss": 0.7899,
+      "step": 317
+    },
+    {
+      "epoch": 0.9777094542659492,
+      "grad_norm": 0.5150197744369507,
+      "learning_rate": 0.0001790984974958264,
+      "loss": 0.4869,
+      "step": 318
+    },
+    {
+      "epoch": 0.9807840122982321,
+      "grad_norm": 0.6638761162757874,
+      "learning_rate": 0.00017903171953255427,
+      "loss": 0.6129,
+      "step": 319
+    },
+    {
+      "epoch": 0.9838585703305149,
+      "grad_norm": 0.738000750541687,
+      "learning_rate": 0.00017896494156928214,
+      "loss": 0.5616,
+      "step": 320
+    },
+    {
+      "epoch": 0.9869331283627979,
+      "grad_norm": 0.6779305934906006,
+      "learning_rate": 0.00017889816360601,
+      "loss": 0.6168,
+      "step": 321
+    },
+    {
+      "epoch": 0.9900076863950807,
+      "grad_norm": 0.5411549806594849,
+      "learning_rate": 0.0001788313856427379,
+      "loss": 0.5456,
+      "step": 322
+    },
+    {
+      "epoch": 0.9930822444273636,
+      "grad_norm": 0.6001323461532593,
+      "learning_rate": 0.0001787646076794658,
+      "loss": 0.7528,
+      "step": 323
+    },
+    {
+      "epoch": 0.9961568024596464,
+      "grad_norm": 0.6542277932167053,
+      "learning_rate": 0.00017869782971619366,
+      "loss": 0.5923,
+      "step": 324
+    },
+    {
+      "epoch": 0.9992313604919293,
+      "grad_norm": 0.6943919658660889,
+      "learning_rate": 0.00017863105175292153,
+      "loss": 1.0933,
+      "step": 325
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.3673266172409058,
+      "learning_rate": 0.0001785642737896494,
+      "loss": 0.675,
+      "step": 326
+    },
+    {
+      "epoch": 1.0030745580322828,
+      "grad_norm": 0.5977247953414917,
+      "learning_rate": 0.0001784974958263773,
+      "loss": 0.6236,
+      "step": 327
+    },
+    {
+      "epoch": 1.0061491160645657,
+      "grad_norm": 0.5625783205032349,
+      "learning_rate": 0.00017843071786310518,
+      "loss": 0.5051,
+      "step": 328
+    },
+    {
+      "epoch": 1.0092236740968485,
+      "grad_norm": 0.5822674036026001,
+      "learning_rate": 0.00017836393989983305,
+      "loss": 0.4488,
+      "step": 329
+    },
+    {
+      "epoch": 1.0122982321291314,
+      "grad_norm": 0.5701442360877991,
+      "learning_rate": 0.00017829716193656095,
+      "loss": 0.5875,
+      "step": 330
+    },
+    {
+      "epoch": 1.0153727901614142,
+      "grad_norm": 0.5955713391304016,
+      "learning_rate": 0.00017823038397328883,
+      "loss": 0.4612,
+      "step": 331
+    },
+    {
+      "epoch": 1.018447348193697,
+      "grad_norm": 0.5202100276947021,
+      "learning_rate": 0.0001781636060100167,
+      "loss": 0.4925,
+      "step": 332
+    },
+    {
+      "epoch": 1.02152190622598,
+      "grad_norm": 0.6523457765579224,
+      "learning_rate": 0.0001780968280467446,
+      "loss": 0.5336,
+      "step": 333
+    },
+    {
+      "epoch": 1.0245964642582628,
+      "grad_norm": 0.653768002986908,
+      "learning_rate": 0.00017803005008347247,
+      "loss": 0.5789,
+      "step": 334
+    },
+    {
+      "epoch": 1.0276710222905456,
+      "grad_norm": 0.680659294128418,
+      "learning_rate": 0.00017796327212020035,
+      "loss": 0.5427,
+      "step": 335
+    },
+    {
+      "epoch": 1.0307455803228285,
+      "grad_norm": 0.6698821783065796,
+      "learning_rate": 0.00017789649415692822,
+      "loss": 0.5877,
+      "step": 336
+    },
+    {
+      "epoch": 1.0338201383551116,
+      "grad_norm": 0.628028392791748,
+      "learning_rate": 0.0001778297161936561,
+      "loss": 0.4996,
+      "step": 337
+    },
+    {
+      "epoch": 1.0368946963873944,
+      "grad_norm": 0.5866581797599792,
+      "learning_rate": 0.000177762938230384,
+      "loss": 0.6283,
+      "step": 338
+    },
+    {
+      "epoch": 1.0399692544196772,
+      "grad_norm": 0.646752119064331,
+      "learning_rate": 0.00017769616026711187,
+      "loss": 0.4585,
+      "step": 339
+    },
+    {
+      "epoch": 1.04304381245196,
+      "grad_norm": 0.678822934627533,
+      "learning_rate": 0.00017762938230383974,
+      "loss": 0.4741,
+      "step": 340
+    },
+    {
+      "epoch": 1.046118370484243,
+      "grad_norm": 0.57511967420578,
+      "learning_rate": 0.0001775626043405676,
+      "loss": 0.576,
+      "step": 341
+    },
+    {
+      "epoch": 1.0491929285165258,
+      "grad_norm": 0.7732008099555969,
+      "learning_rate": 0.00017749582637729548,
+      "loss": 0.6847,
+      "step": 342
+    },
+    {
+      "epoch": 1.0522674865488086,
+      "grad_norm": 0.47226476669311523,
+      "learning_rate": 0.00017742904841402339,
+      "loss": 0.289,
+      "step": 343
+    },
+    {
+      "epoch": 1.0553420445810915,
+      "grad_norm": 0.7770098447799683,
+      "learning_rate": 0.00017736227045075126,
+      "loss": 0.5968,
+      "step": 344
+    },
+    {
+      "epoch": 1.0584166026133743,
+      "grad_norm": 0.8492668867111206,
+      "learning_rate": 0.00017729549248747913,
+      "loss": 0.6167,
+      "step": 345
+    },
+    {
+      "epoch": 1.0614911606456572,
+      "grad_norm": 0.876089870929718,
+      "learning_rate": 0.000177228714524207,
+      "loss": 0.5435,
+      "step": 346
+    },
+    {
+      "epoch": 1.06456571867794,
+      "grad_norm": 0.7883753776550293,
+      "learning_rate": 0.0001771619365609349,
+      "loss": 0.5498,
+      "step": 347
+    },
+    {
+      "epoch": 1.0676402767102229,
+      "grad_norm": 0.785437822341919,
+      "learning_rate": 0.00017709515859766278,
+      "loss": 0.5835,
+      "step": 348
+    },
+    {
+      "epoch": 1.0707148347425057,
+      "grad_norm": 0.6789015531539917,
+      "learning_rate": 0.00017702838063439068,
+      "loss": 0.5492,
+      "step": 349
+    },
+    {
+      "epoch": 1.0737893927747886,
+      "grad_norm": 0.7070201635360718,
+      "learning_rate": 0.00017696160267111855,
+      "loss": 0.5829,
+      "step": 350
+    },
+    {
+      "epoch": 1.0768639508070714,
+      "grad_norm": 0.7011975049972534,
+      "learning_rate": 0.00017689482470784642,
+      "loss": 0.4774,
+      "step": 351
+    },
+    {
+      "epoch": 1.0799385088393543,
+      "grad_norm": 0.7407499551773071,
+      "learning_rate": 0.0001768280467445743,
+      "loss": 0.51,
+      "step": 352
+    },
+    {
+      "epoch": 1.0830130668716371,
+      "grad_norm": 0.672869861125946,
+      "learning_rate": 0.00017676126878130217,
+      "loss": 0.5383,
+      "step": 353
+    },
+    {
+      "epoch": 1.08608762490392,
+      "grad_norm": 0.8781456351280212,
+      "learning_rate": 0.00017669449081803007,
+      "loss": 0.6436,
+      "step": 354
+    },
+    {
+      "epoch": 1.089162182936203,
+      "grad_norm": 0.8077890872955322,
+      "learning_rate": 0.00017662771285475794,
+      "loss": 0.6629,
+      "step": 355
+    },
+    {
+      "epoch": 1.0922367409684859,
+      "grad_norm": 0.7883043885231018,
+      "learning_rate": 0.00017656093489148582,
+      "loss": 0.6393,
+      "step": 356
+    },
+    {
+      "epoch": 1.0953112990007687,
+      "grad_norm": 0.68159419298172,
+      "learning_rate": 0.0001764941569282137,
+      "loss": 0.5835,
+      "step": 357
+    },
+    {
+      "epoch": 1.0983858570330516,
+      "grad_norm": 0.658222496509552,
+      "learning_rate": 0.00017642737896494156,
+      "loss": 0.4921,
+      "step": 358
+    },
+    {
+      "epoch": 1.1014604150653344,
+      "grad_norm": 0.6931422352790833,
+      "learning_rate": 0.00017636060100166946,
+      "loss": 0.5664,
+      "step": 359
+    },
+    {
+      "epoch": 1.1045349730976173,
+      "grad_norm": 0.6795049905776978,
+      "learning_rate": 0.00017629382303839734,
+      "loss": 0.5559,
+      "step": 360
+    },
+    {
+      "epoch": 1.1076095311299001,
+      "grad_norm": 0.9736855030059814,
+      "learning_rate": 0.0001762270450751252,
+      "loss": 0.7188,
+      "step": 361
+    },
+    {
+      "epoch": 1.110684089162183,
+      "grad_norm": 0.6535844802856445,
+      "learning_rate": 0.00017616026711185308,
+      "loss": 0.5894,
+      "step": 362
+    },
+    {
+      "epoch": 1.1137586471944658,
+      "grad_norm": 0.7295445799827576,
+      "learning_rate": 0.00017609348914858096,
+      "loss": 0.6106,
+      "step": 363
+    },
+    {
+      "epoch": 1.1168332052267487,
+      "grad_norm": 0.7204632759094238,
+      "learning_rate": 0.00017602671118530886,
+      "loss": 0.6681,
+      "step": 364
+    },
+    {
+      "epoch": 1.1199077632590315,
+      "grad_norm": 0.64588862657547,
+      "learning_rate": 0.00017595993322203673,
+      "loss": 0.5934,
+      "step": 365
+    },
+    {
+      "epoch": 1.1229823212913144,
+      "grad_norm": 0.6482330560684204,
+      "learning_rate": 0.00017589315525876463,
+      "loss": 0.5082,
+      "step": 366
+    },
+    {
+      "epoch": 1.1260568793235972,
+      "grad_norm": 0.6101349592208862,
+      "learning_rate": 0.0001758263772954925,
+      "loss": 0.4976,
+      "step": 367
+    },
+    {
+      "epoch": 1.12913143735588,
+      "grad_norm": 0.5716677308082581,
+      "learning_rate": 0.00017575959933222038,
+      "loss": 0.3977,
+      "step": 368
+    },
+    {
+      "epoch": 1.132205995388163,
+      "grad_norm": 0.557501494884491,
+      "learning_rate": 0.00017569282136894825,
+      "loss": 0.492,
+      "step": 369
+    },
+    {
+      "epoch": 1.1352805534204458,
+      "grad_norm": 0.7171933054924011,
+      "learning_rate": 0.00017562604340567615,
+      "loss": 0.6809,
+      "step": 370
+    },
+    {
+      "epoch": 1.1383551114527286,
+      "grad_norm": 0.5551110506057739,
+      "learning_rate": 0.00017555926544240402,
+      "loss": 0.4988,
+      "step": 371
+    },
+    {
+      "epoch": 1.1414296694850115,
+      "grad_norm": 0.6553733944892883,
+      "learning_rate": 0.0001754924874791319,
+      "loss": 0.4898,
+      "step": 372
+    },
+    {
+      "epoch": 1.1445042275172943,
+      "grad_norm": 0.69221431016922,
+      "learning_rate": 0.00017542570951585977,
+      "loss": 0.6136,
+      "step": 373
+    },
+    {
+      "epoch": 1.1475787855495772,
+      "grad_norm": 0.5864092707633972,
+      "learning_rate": 0.00017535893155258764,
+      "loss": 0.5688,
+      "step": 374
+    },
+    {
+      "epoch": 1.15065334358186,
+      "grad_norm": 0.756809651851654,
+      "learning_rate": 0.00017529215358931554,
+      "loss": 0.5349,
+      "step": 375
+    },
+    {
+      "epoch": 1.1537279016141428,
+      "grad_norm": 0.6437715291976929,
+      "learning_rate": 0.00017522537562604342,
+      "loss": 0.5303,
+      "step": 376
+    },
+    {
+      "epoch": 1.156802459646426,
+      "grad_norm": 0.5712356567382812,
+      "learning_rate": 0.0001751585976627713,
+      "loss": 0.4698,
+      "step": 377
+    },
+    {
+      "epoch": 1.1598770176787088,
+      "grad_norm": 0.6452774405479431,
+      "learning_rate": 0.00017509181969949916,
+      "loss": 0.6214,
+      "step": 378
+    },
+    {
+      "epoch": 1.1629515757109916,
+      "grad_norm": 0.6589751839637756,
+      "learning_rate": 0.00017502504173622704,
+      "loss": 0.5302,
+      "step": 379
+    },
+    {
+      "epoch": 1.1660261337432745,
+      "grad_norm": 0.6354514360427856,
+      "learning_rate": 0.0001749582637729549,
+      "loss": 0.4941,
+      "step": 380
+    },
+    {
+      "epoch": 1.1691006917755573,
+      "grad_norm": 0.8875218033790588,
+      "learning_rate": 0.0001748914858096828,
+      "loss": 0.5601,
+      "step": 381
+    },
+    {
+      "epoch": 1.1721752498078402,
+      "grad_norm": 0.7112509608268738,
+      "learning_rate": 0.0001748247078464107,
+      "loss": 0.5841,
+      "step": 382
+    },
+    {
+      "epoch": 1.175249807840123,
+      "grad_norm": 0.6991716623306274,
+      "learning_rate": 0.00017475792988313858,
+      "loss": 0.4966,
+      "step": 383
+    },
+    {
+      "epoch": 1.1783243658724059,
+      "grad_norm": 0.8313332200050354,
+      "learning_rate": 0.00017469115191986646,
+      "loss": 0.6796,
+      "step": 384
+    },
+    {
+      "epoch": 1.1813989239046887,
+      "grad_norm": 0.6446208953857422,
+      "learning_rate": 0.00017462437395659433,
+      "loss": 0.4277,
+      "step": 385
+    },
+    {
+      "epoch": 1.1844734819369716,
+      "grad_norm": 0.6382359862327576,
+      "learning_rate": 0.00017455759599332223,
+      "loss": 0.5088,
+      "step": 386
+    },
+    {
+      "epoch": 1.1875480399692544,
+      "grad_norm": 0.8059669733047485,
+      "learning_rate": 0.0001744908180300501,
+      "loss": 0.5724,
+      "step": 387
+    },
+    {
+      "epoch": 1.1906225980015372,
+      "grad_norm": 0.7880392074584961,
+      "learning_rate": 0.00017442404006677798,
+      "loss": 0.6245,
+      "step": 388
+    },
+    {
+      "epoch": 1.19369715603382,
+      "grad_norm": 0.780595600605011,
+      "learning_rate": 0.00017435726210350585,
+      "loss": 0.5629,
+      "step": 389
+    },
+    {
+      "epoch": 1.196771714066103,
+      "grad_norm": 0.8109543323516846,
+      "learning_rate": 0.00017429048414023372,
+      "loss": 0.6815,
+      "step": 390
+    },
+    {
+      "epoch": 1.1998462720983858,
+      "grad_norm": 0.6399725079536438,
+      "learning_rate": 0.00017422370617696162,
+      "loss": 0.5488,
+      "step": 391
+    },
+    {
+      "epoch": 1.2029208301306686,
+      "grad_norm": 0.6464505195617676,
+      "learning_rate": 0.0001741569282136895,
+      "loss": 0.5546,
+      "step": 392
+    },
+    {
+      "epoch": 1.2059953881629515,
+      "grad_norm": 0.7562092542648315,
+      "learning_rate": 0.00017409015025041737,
+      "loss": 0.5998,
+      "step": 393
+    },
+    {
+      "epoch": 1.2090699461952346,
+      "grad_norm": 0.7341581583023071,
+      "learning_rate": 0.00017402337228714524,
+      "loss": 0.6801,
+      "step": 394
+    },
+    {
+      "epoch": 1.2121445042275174,
+      "grad_norm": 0.7949944734573364,
+      "learning_rate": 0.00017395659432387311,
+      "loss": 0.6253,
+      "step": 395
+    },
+    {
+      "epoch": 1.2152190622598003,
+      "grad_norm": 0.6935542225837708,
+      "learning_rate": 0.00017388981636060101,
+      "loss": 0.6417,
+      "step": 396
+    },
+    {
+      "epoch": 1.218293620292083,
+      "grad_norm": 0.6856999397277832,
+      "learning_rate": 0.0001738230383973289,
+      "loss": 0.5247,
+      "step": 397
+    },
+    {
+      "epoch": 1.221368178324366,
+      "grad_norm": 0.5797318816184998,
+      "learning_rate": 0.00017375626043405676,
+      "loss": 0.5061,
+      "step": 398
+    },
+    {
+      "epoch": 1.2244427363566488,
+      "grad_norm": 0.5869422554969788,
+      "learning_rate": 0.00017368948247078466,
+      "loss": 0.5226,
+      "step": 399
+    },
+    {
+      "epoch": 1.2275172943889316,
+      "grad_norm": 0.8467463850975037,
+      "learning_rate": 0.00017362270450751253,
+      "loss": 0.711,
+      "step": 400
+    },
+    {
+      "epoch": 1.2305918524212145,
+      "grad_norm": 0.7549751996994019,
+      "learning_rate": 0.0001735559265442404,
+      "loss": 0.6558,
+      "step": 401
+    },
+    {
+      "epoch": 1.2336664104534973,
+      "grad_norm": 0.6192473769187927,
+      "learning_rate": 0.0001734891485809683,
+      "loss": 0.5697,
+      "step": 402
+    },
+    {
+      "epoch": 1.2367409684857802,
+      "grad_norm": 0.7555997967720032,
+      "learning_rate": 0.00017342237061769618,
+      "loss": 0.5774,
+      "step": 403
+    },
+    {
+      "epoch": 1.239815526518063,
+      "grad_norm": 0.6554675698280334,
+      "learning_rate": 0.00017335559265442405,
+      "loss": 0.653,
+      "step": 404
+    },
+    {
+      "epoch": 1.2428900845503459,
+      "grad_norm": 0.8110440969467163,
+      "learning_rate": 0.00017328881469115193,
+      "loss": 0.6604,
+      "step": 405
+    },
+    {
+      "epoch": 1.2459646425826287,
+      "grad_norm": 0.7523771524429321,
+      "learning_rate": 0.0001732220367278798,
+      "loss": 0.7315,
+      "step": 406
+    },
+    {
+      "epoch": 1.2490392006149116,
+      "grad_norm": 0.7357513308525085,
+      "learning_rate": 0.0001731552587646077,
+      "loss": 0.6301,
+      "step": 407
+    },
+    {
+      "epoch": 1.2521137586471944,
+      "grad_norm": 0.6375721096992493,
+      "learning_rate": 0.00017308848080133557,
+      "loss": 0.4597,
+      "step": 408
+    },
+    {
+      "epoch": 1.2551883166794773,
+      "grad_norm": 0.7142077684402466,
+      "learning_rate": 0.00017302170283806345,
+      "loss": 0.5701,
+      "step": 409
+    },
+    {
+      "epoch": 1.2582628747117601,
+      "grad_norm": 0.5495367646217346,
+      "learning_rate": 0.00017295492487479132,
+      "loss": 0.4168,
+      "step": 410
+    },
+    {
+      "epoch": 1.261337432744043,
+      "grad_norm": 0.6137920618057251,
+      "learning_rate": 0.0001728881469115192,
+      "loss": 0.5961,
+      "step": 411
+    },
+    {
+      "epoch": 1.2644119907763258,
+      "grad_norm": 0.6373696327209473,
+      "learning_rate": 0.0001728213689482471,
+      "loss": 0.6035,
+      "step": 412
+    },
+    {
+      "epoch": 1.2674865488086087,
+      "grad_norm": 0.7311916351318359,
+      "learning_rate": 0.00017275459098497497,
+      "loss": 0.6675,
+      "step": 413
+    },
+    {
+      "epoch": 1.2705611068408915,
+      "grad_norm": 0.5674752593040466,
+      "learning_rate": 0.00017268781302170284,
+      "loss": 0.5865,
+      "step": 414
+    },
+    {
+      "epoch": 1.2736356648731744,
+      "grad_norm": 0.6945238709449768,
+      "learning_rate": 0.0001726210350584307,
+      "loss": 0.5979,
+      "step": 415
+    },
+    {
+      "epoch": 1.2767102229054572,
+      "grad_norm": 0.7307734489440918,
+      "learning_rate": 0.0001725542570951586,
+      "loss": 0.6184,
+      "step": 416
+    },
+    {
+      "epoch": 1.27978478093774,
+      "grad_norm": 0.6113364100456238,
+      "learning_rate": 0.0001724874791318865,
+      "loss": 0.4949,
+      "step": 417
+    },
+    {
+      "epoch": 1.2828593389700231,
+      "grad_norm": 0.8040212988853455,
+      "learning_rate": 0.0001724207011686144,
+      "loss": 0.6111,
+      "step": 418
+    },
+    {
+      "epoch": 1.285933897002306,
+      "grad_norm": 0.6946241855621338,
+      "learning_rate": 0.00017235392320534226,
+      "loss": 0.4845,
+      "step": 419
+    },
+    {
+      "epoch": 1.2890084550345888,
+      "grad_norm": 0.559880256652832,
+      "learning_rate": 0.00017228714524207013,
+      "loss": 0.6224,
+      "step": 420
+    },
+    {
+      "epoch": 1.2920830130668717,
+      "grad_norm": 0.7335419654846191,
+      "learning_rate": 0.000172220367278798,
+      "loss": 0.6117,
+      "step": 421
+    },
+    {
+      "epoch": 1.2951575710991545,
+      "grad_norm": 0.6661849617958069,
+      "learning_rate": 0.00017215358931552588,
+      "loss": 0.5993,
+      "step": 422
+    },
+    {
+      "epoch": 1.2982321291314374,
+      "grad_norm": 0.7723634243011475,
+      "learning_rate": 0.00017208681135225378,
+      "loss": 0.7025,
+      "step": 423
+    },
+    {
+      "epoch": 1.3013066871637202,
+      "grad_norm": 0.5866445302963257,
+      "learning_rate": 0.00017202003338898165,
+      "loss": 0.5205,
+      "step": 424
+    },
+    {
+      "epoch": 1.304381245196003,
+      "grad_norm": 0.9210363030433655,
+      "learning_rate": 0.00017195325542570953,
+      "loss": 0.6247,
+      "step": 425
+    },
+    {
+      "epoch": 1.307455803228286,
+      "grad_norm": 0.6116583943367004,
+      "learning_rate": 0.0001718864774624374,
+      "loss": 0.5366,
+      "step": 426
+    },
+    {
+      "epoch": 1.3105303612605688,
+      "grad_norm": 0.7020177245140076,
+      "learning_rate": 0.00017181969949916527,
+      "loss": 0.4586,
+      "step": 427
+    },
+    {
+      "epoch": 1.3136049192928516,
+      "grad_norm": 0.8982479572296143,
+      "learning_rate": 0.00017175292153589317,
+      "loss": 0.6121,
+      "step": 428
+    },
+    {
+      "epoch": 1.3166794773251345,
+      "grad_norm": 0.6956773996353149,
+      "learning_rate": 0.00017168614357262105,
+      "loss": 0.5002,
+      "step": 429
+    },
+    {
+      "epoch": 1.3197540353574173,
+      "grad_norm": 0.5864204168319702,
+      "learning_rate": 0.00017161936560934892,
+      "loss": 0.4577,
+      "step": 430
+    },
+    {
+      "epoch": 1.3228285933897002,
+      "grad_norm": 0.6034566760063171,
+      "learning_rate": 0.0001715525876460768,
+      "loss": 0.5688,
+      "step": 431
+    },
+    {
+      "epoch": 1.3259031514219832,
+      "grad_norm": 0.7787615060806274,
+      "learning_rate": 0.00017148580968280467,
+      "loss": 0.6904,
+      "step": 432
+    },
+    {
+      "epoch": 1.328977709454266,
+      "grad_norm": 0.6120966076850891,
+      "learning_rate": 0.00017141903171953257,
+      "loss": 0.452,
+      "step": 433
+    },
+    {
+      "epoch": 1.332052267486549,
+      "grad_norm": 0.6668190360069275,
+      "learning_rate": 0.00017135225375626044,
+      "loss": 0.5082,
+      "step": 434
+    },
+    {
+      "epoch": 1.3351268255188318,
+      "grad_norm": 0.660654604434967,
+      "learning_rate": 0.00017128547579298834,
+      "loss": 0.618,
+      "step": 435
+    },
+    {
+      "epoch": 1.3382013835511146,
+      "grad_norm": 0.6356967091560364,
+      "learning_rate": 0.0001712186978297162,
+      "loss": 0.583,
+      "step": 436
+    },
+    {
+      "epoch": 1.3412759415833975,
+      "grad_norm": 0.6737658977508545,
+      "learning_rate": 0.00017115191986644409,
+      "loss": 0.511,
+      "step": 437
+    },
+    {
+      "epoch": 1.3443504996156803,
+      "grad_norm": 0.6208163499832153,
+      "learning_rate": 0.00017108514190317196,
+      "loss": 0.4826,
+      "step": 438
+    },
+    {
+      "epoch": 1.3474250576479632,
+      "grad_norm": 0.570587694644928,
+      "learning_rate": 0.00017101836393989986,
+      "loss": 0.4451,
+      "step": 439
+    },
+    {
+      "epoch": 1.350499615680246,
+      "grad_norm": 0.6985802054405212,
+      "learning_rate": 0.00017095158597662773,
+      "loss": 0.5063,
+      "step": 440
+    },
+    {
+      "epoch": 1.3535741737125289,
+      "grad_norm": 0.6364935040473938,
+      "learning_rate": 0.0001708848080133556,
+      "loss": 0.5311,
+      "step": 441
+    },
+    {
+      "epoch": 1.3566487317448117,
+      "grad_norm": 0.6550077199935913,
+      "learning_rate": 0.00017081803005008348,
+      "loss": 0.4903,
+      "step": 442
+    },
+    {
+      "epoch": 1.3597232897770946,
+      "grad_norm": 0.6158908605575562,
+      "learning_rate": 0.00017075125208681135,
+      "loss": 0.5618,
+      "step": 443
+    },
+    {
+      "epoch": 1.3627978478093774,
+      "grad_norm": 0.6985887885093689,
+      "learning_rate": 0.00017068447412353925,
+      "loss": 0.5663,
+      "step": 444
+    },
+    {
+      "epoch": 1.3658724058416603,
+      "grad_norm": 0.6205439567565918,
+      "learning_rate": 0.00017061769616026712,
+      "loss": 0.4905,
+      "step": 445
+    },
+    {
+      "epoch": 1.368946963873943,
+      "grad_norm": 0.9212015271186829,
+      "learning_rate": 0.000170550918196995,
+      "loss": 0.7055,
+      "step": 446
+    },
+    {
+      "epoch": 1.372021521906226,
+      "grad_norm": 0.5099778175354004,
+      "learning_rate": 0.00017048414023372287,
+      "loss": 0.3754,
+      "step": 447
+    },
+    {
+      "epoch": 1.3750960799385088,
+      "grad_norm": 0.7985131740570068,
+      "learning_rate": 0.00017041736227045074,
+      "loss": 0.6526,
+      "step": 448
+    },
+    {
+      "epoch": 1.3781706379707916,
+      "grad_norm": 0.8302136063575745,
+      "learning_rate": 0.00017035058430717862,
+      "loss": 0.6056,
+      "step": 449
+    },
+    {
+      "epoch": 1.3812451960030745,
+      "grad_norm": 0.7308214902877808,
+      "learning_rate": 0.00017028380634390652,
+      "loss": 0.6452,
+      "step": 450
+    },
+    {
+      "epoch": 1.3843197540353573,
+      "grad_norm": 0.7058115005493164,
+      "learning_rate": 0.0001702170283806344,
+      "loss": 0.6399,
+      "step": 451
+    },
+    {
+      "epoch": 1.3873943120676402,
+      "grad_norm": 0.5836137533187866,
+      "learning_rate": 0.0001701502504173623,
+      "loss": 0.5628,
+      "step": 452
+    },
+    {
+      "epoch": 1.390468870099923,
+      "grad_norm": 0.5505719780921936,
+      "learning_rate": 0.00017008347245409016,
+      "loss": 0.4292,
+      "step": 453
+    },
+    {
+      "epoch": 1.3935434281322059,
+      "grad_norm": 0.7084729671478271,
+      "learning_rate": 0.00017001669449081804,
+      "loss": 0.5762,
+      "step": 454
+    },
+    {
+      "epoch": 1.3966179861644887,
+      "grad_norm": 0.6776607632637024,
+      "learning_rate": 0.00016994991652754594,
+      "loss": 0.4992,
+      "step": 455
+    },
+    {
+      "epoch": 1.3996925441967716,
+      "grad_norm": 0.6364510655403137,
+      "learning_rate": 0.0001698831385642738,
+      "loss": 0.4434,
+      "step": 456
+    },
+    {
+      "epoch": 1.4027671022290547,
+      "grad_norm": 0.6788143515586853,
+      "learning_rate": 0.00016981636060100168,
+      "loss": 0.4134,
+      "step": 457
+    },
+    {
+      "epoch": 1.4058416602613375,
+      "grad_norm": 0.6752612590789795,
+      "learning_rate": 0.00016974958263772956,
+      "loss": 0.5838,
+      "step": 458
+    },
+    {
+      "epoch": 1.4089162182936203,
+      "grad_norm": 0.6687692403793335,
+      "learning_rate": 0.00016968280467445743,
+      "loss": 0.6128,
+      "step": 459
+    },
+    {
+      "epoch": 1.4119907763259032,
+      "grad_norm": 0.8868100047111511,
+      "learning_rate": 0.00016961602671118533,
+      "loss": 0.5903,
+      "step": 460
+    },
+    {
+      "epoch": 1.415065334358186,
+      "grad_norm": 0.7482825517654419,
+      "learning_rate": 0.0001695492487479132,
+      "loss": 0.5034,
+      "step": 461
+    },
+    {
+      "epoch": 1.418139892390469,
+      "grad_norm": 0.5688104033470154,
+      "learning_rate": 0.00016948247078464108,
+      "loss": 0.5084,
+      "step": 462
+    },
+    {
+      "epoch": 1.4212144504227517,
+      "grad_norm": 0.730925440788269,
+      "learning_rate": 0.00016941569282136895,
+      "loss": 0.5267,
+      "step": 463
+    },
+    {
+      "epoch": 1.4242890084550346,
+      "grad_norm": 0.683314859867096,
+      "learning_rate": 0.00016934891485809682,
+      "loss": 0.6047,
+      "step": 464
+    },
+    {
+      "epoch": 1.4273635664873174,
+      "grad_norm": 0.7654600143432617,
+      "learning_rate": 0.0001692821368948247,
+      "loss": 0.5612,
+      "step": 465
+    },
+    {
+      "epoch": 1.4304381245196003,
+      "grad_norm": 0.5215669870376587,
+      "learning_rate": 0.0001692153589315526,
+      "loss": 0.4425,
+      "step": 466
+    },
+    {
+      "epoch": 1.4335126825518831,
+      "grad_norm": 0.8029130697250366,
+      "learning_rate": 0.00016914858096828047,
+      "loss": 0.6748,
+      "step": 467
+    },
+    {
+      "epoch": 1.436587240584166,
+      "grad_norm": 0.7310311794281006,
+      "learning_rate": 0.00016908180300500834,
+      "loss": 0.7158,
+      "step": 468
+    },
+    {
+      "epoch": 1.4396617986164488,
+      "grad_norm": 0.6347652077674866,
+      "learning_rate": 0.00016901502504173624,
+      "loss": 0.447,
+      "step": 469
+    },
+    {
+      "epoch": 1.4427363566487317,
+      "grad_norm": 0.6077408194541931,
+      "learning_rate": 0.00016894824707846412,
+      "loss": 0.5187,
+      "step": 470
+    },
+    {
+      "epoch": 1.4458109146810145,
+      "grad_norm": 0.7281926274299622,
+      "learning_rate": 0.00016888146911519202,
+      "loss": 0.6181,
+      "step": 471
+    },
+    {
+      "epoch": 1.4488854727132976,
+      "grad_norm": 0.7540388107299805,
+      "learning_rate": 0.0001688146911519199,
+      "loss": 0.5303,
+      "step": 472
+    },
+    {
+      "epoch": 1.4519600307455804,
+      "grad_norm": 0.8174847364425659,
+      "learning_rate": 0.00016874791318864776,
+      "loss": 0.733,
+      "step": 473
+    },
+    {
+      "epoch": 1.4550345887778633,
+      "grad_norm": 0.6414505243301392,
+      "learning_rate": 0.00016868113522537564,
+      "loss": 0.5458,
+      "step": 474
+    },
+    {
+      "epoch": 1.4581091468101461,
+      "grad_norm": 0.9108033776283264,
+      "learning_rate": 0.0001686143572621035,
+      "loss": 0.7125,
+      "step": 475
+    },
+    {
+      "epoch": 1.461183704842429,
+      "grad_norm": 0.6116359233856201,
+      "learning_rate": 0.0001685475792988314,
+      "loss": 0.5549,
+      "step": 476
+    },
+    {
+      "epoch": 1.4642582628747118,
+      "grad_norm": 0.821499228477478,
+      "learning_rate": 0.00016848080133555928,
+      "loss": 0.7422,
+      "step": 477
+    },
+    {
+      "epoch": 1.4673328209069947,
+      "grad_norm": 0.5836993455886841,
+      "learning_rate": 0.00016841402337228716,
+      "loss": 0.4829,
+      "step": 478
+    },
+    {
+      "epoch": 1.4704073789392775,
+      "grad_norm": 0.7028072476387024,
+      "learning_rate": 0.00016834724540901503,
+      "loss": 0.4844,
+      "step": 479
+    },
+    {
+      "epoch": 1.4734819369715604,
+      "grad_norm": 0.6338192224502563,
+      "learning_rate": 0.0001682804674457429,
+      "loss": 0.418,
+      "step": 480
+    },
+    {
+      "epoch": 1.4765564950038432,
+      "grad_norm": 0.7174279689788818,
+      "learning_rate": 0.00016821368948247077,
+      "loss": 0.4816,
+      "step": 481
+    },
+    {
+      "epoch": 1.479631053036126,
+      "grad_norm": 0.6590016484260559,
+      "learning_rate": 0.00016814691151919868,
+      "loss": 0.5613,
+      "step": 482
+    },
+    {
+      "epoch": 1.482705611068409,
+      "grad_norm": 0.7180425524711609,
+      "learning_rate": 0.00016808013355592655,
+      "loss": 0.7368,
+      "step": 483
+    },
+    {
+      "epoch": 1.4857801691006918,
+      "grad_norm": 0.7836325168609619,
+      "learning_rate": 0.00016801335559265442,
+      "loss": 0.6126,
+      "step": 484
+    },
+    {
+      "epoch": 1.4888547271329746,
+      "grad_norm": 0.6930490732192993,
+      "learning_rate": 0.0001679465776293823,
+      "loss": 0.6497,
+      "step": 485
+    },
+    {
+      "epoch": 1.4919292851652575,
+      "grad_norm": 0.6975258588790894,
+      "learning_rate": 0.0001678797996661102,
+      "loss": 0.579,
+      "step": 486
+    },
+    {
+      "epoch": 1.4950038431975403,
+      "grad_norm": 0.7456351518630981,
+      "learning_rate": 0.00016781302170283807,
+      "loss": 0.5209,
+      "step": 487
+    },
+    {
+      "epoch": 1.4980784012298232,
+      "grad_norm": 0.6301809549331665,
+      "learning_rate": 0.00016774624373956597,
+      "loss": 0.4117,
+      "step": 488
+    },
+    {
+      "epoch": 1.501152959262106,
+      "grad_norm": 0.9827542304992676,
+      "learning_rate": 0.00016767946577629384,
+      "loss": 0.7722,
+      "step": 489
+    },
+    {
+      "epoch": 1.5042275172943889,
+      "grad_norm": 0.6148912906646729,
+      "learning_rate": 0.00016761268781302171,
+      "loss": 0.6126,
+      "step": 490
+    },
+    {
+      "epoch": 1.5073020753266717,
+      "grad_norm": 0.7233926057815552,
+      "learning_rate": 0.0001675459098497496,
+      "loss": 0.6748,
+      "step": 491
+    },
+    {
+      "epoch": 1.5103766333589546,
+      "grad_norm": 0.7733349204063416,
+      "learning_rate": 0.0001674791318864775,
+      "loss": 0.5462,
+      "step": 492
+    },
+    {
+      "epoch": 1.5134511913912374,
+      "grad_norm": 0.6742725372314453,
+      "learning_rate": 0.00016741235392320536,
+      "loss": 0.6109,
+      "step": 493
+    },
+    {
+      "epoch": 1.5165257494235203,
+      "grad_norm": 0.5742484331130981,
+      "learning_rate": 0.00016734557595993323,
+      "loss": 0.452,
+      "step": 494
+    },
+    {
+      "epoch": 1.519600307455803,
+      "grad_norm": 0.5890893936157227,
+      "learning_rate": 0.0001672787979966611,
+      "loss": 0.5423,
+      "step": 495
+    },
+    {
+      "epoch": 1.522674865488086,
+      "grad_norm": 0.6500853896141052,
+      "learning_rate": 0.00016721202003338898,
+      "loss": 0.5345,
+      "step": 496
+    },
+    {
+      "epoch": 1.5257494235203688,
+      "grad_norm": 0.6630553603172302,
+      "learning_rate": 0.00016714524207011685,
+      "loss": 0.5529,
+      "step": 497
+    },
+    {
+      "epoch": 1.5288239815526516,
+      "grad_norm": 0.72234046459198,
+      "learning_rate": 0.00016707846410684475,
+      "loss": 0.5947,
+      "step": 498
+    },
+    {
+      "epoch": 1.5318985395849347,
+      "grad_norm": 0.7056167125701904,
+      "learning_rate": 0.00016701168614357263,
+      "loss": 0.5464,
+      "step": 499
+    },
+    {
+      "epoch": 1.5349730976172176,
+      "grad_norm": 0.7403351068496704,
+      "learning_rate": 0.0001669449081803005,
+      "loss": 0.5423,
+      "step": 500
+    },
+    {
+      "epoch": 1.5380476556495004,
+      "grad_norm": 0.8917403817176819,
+      "learning_rate": 0.00016687813021702837,
+      "loss": 0.6635,
+      "step": 501
+    },
+    {
+      "epoch": 1.5411222136817833,
+      "grad_norm": 0.5691559910774231,
+      "learning_rate": 0.00016681135225375625,
+      "loss": 0.4648,
+      "step": 502
+    },
+    {
+      "epoch": 1.544196771714066,
+      "grad_norm": 0.7191663980484009,
+      "learning_rate": 0.00016674457429048415,
+      "loss": 0.63,
+      "step": 503
+    },
+    {
+      "epoch": 1.547271329746349,
+      "grad_norm": 0.6063690781593323,
+      "learning_rate": 0.00016667779632721202,
+      "loss": 0.5557,
+      "step": 504
+    },
+    {
+      "epoch": 1.5503458877786318,
+      "grad_norm": 0.6743360161781311,
+      "learning_rate": 0.00016661101836393992,
+      "loss": 0.5346,
+      "step": 505
+    },
+    {
+      "epoch": 1.5534204458109147,
+      "grad_norm": 0.6480421423912048,
+      "learning_rate": 0.0001665442404006678,
+      "loss": 0.5116,
+      "step": 506
+    },
+    {
+      "epoch": 1.5564950038431975,
+      "grad_norm": 0.6903517842292786,
+      "learning_rate": 0.00016647746243739567,
+      "loss": 0.6378,
+      "step": 507
+    },
+    {
+      "epoch": 1.5595695618754803,
+      "grad_norm": 0.6405192613601685,
+      "learning_rate": 0.00016641068447412357,
+      "loss": 0.6756,
+      "step": 508
+    },
+    {
+      "epoch": 1.5626441199077634,
+      "grad_norm": 0.7051334381103516,
+      "learning_rate": 0.00016634390651085144,
+      "loss": 0.4695,
+      "step": 509
+    },
+    {
+      "epoch": 1.5657186779400463,
+      "grad_norm": 0.5805487036705017,
+      "learning_rate": 0.0001662771285475793,
+      "loss": 0.557,
+      "step": 510
+    },
+    {
+      "epoch": 1.5687932359723291,
+      "grad_norm": 0.5971087217330933,
+      "learning_rate": 0.00016621035058430719,
+      "loss": 0.4993,
+      "step": 511
+    },
+    {
+      "epoch": 1.571867794004612,
+      "grad_norm": 0.5403761863708496,
+      "learning_rate": 0.00016614357262103506,
+      "loss": 0.4008,
+      "step": 512
+    },
+    {
+      "epoch": 1.5749423520368948,
+      "grad_norm": 0.8529918193817139,
+      "learning_rate": 0.00016607679465776293,
+      "loss": 0.6232,
+      "step": 513
+    },
+    {
+      "epoch": 1.5780169100691777,
+      "grad_norm": 0.5955516695976257,
+      "learning_rate": 0.00016601001669449083,
+      "loss": 0.5359,
+      "step": 514
+    },
+    {
+      "epoch": 1.5810914681014605,
+      "grad_norm": 0.6873809099197388,
+      "learning_rate": 0.0001659432387312187,
+      "loss": 0.6932,
+      "step": 515
+    },
+    {
+      "epoch": 1.5841660261337434,
+      "grad_norm": 0.7022868394851685,
+      "learning_rate": 0.00016587646076794658,
+      "loss": 0.6894,
+      "step": 516
+    },
+    {
+      "epoch": 1.5872405841660262,
+      "grad_norm": 0.7386640906333923,
+      "learning_rate": 0.00016580968280467445,
+      "loss": 0.4857,
+      "step": 517
+    },
+    {
+      "epoch": 1.590315142198309,
+      "grad_norm": 0.6635391712188721,
+      "learning_rate": 0.00016574290484140233,
+      "loss": 0.5664,
+      "step": 518
+    },
+    {
+      "epoch": 1.593389700230592,
+      "grad_norm": 0.6896170973777771,
+      "learning_rate": 0.00016567612687813023,
+      "loss": 0.4613,
+      "step": 519
+    },
+    {
+      "epoch": 1.5964642582628747,
+      "grad_norm": 0.5555704236030579,
+      "learning_rate": 0.0001656093489148581,
+      "loss": 0.4811,
+      "step": 520
+    },
+    {
+      "epoch": 1.5995388162951576,
+      "grad_norm": 0.7170313596725464,
+      "learning_rate": 0.00016554257095158597,
+      "loss": 0.5939,
+      "step": 521
+    },
+    {
+      "epoch": 1.6026133743274404,
+      "grad_norm": 0.6032419204711914,
+      "learning_rate": 0.00016547579298831387,
+      "loss": 0.554,
+      "step": 522
+    },
+    {
+      "epoch": 1.6056879323597233,
+      "grad_norm": 0.8021843433380127,
+      "learning_rate": 0.00016540901502504175,
+      "loss": 0.6693,
+      "step": 523
+    },
+    {
+      "epoch": 1.6087624903920061,
+      "grad_norm": 0.7321604490280151,
+      "learning_rate": 0.00016534223706176965,
+      "loss": 0.7513,
+      "step": 524
+    },
+    {
+      "epoch": 1.611837048424289,
+      "grad_norm": 0.6060817241668701,
+      "learning_rate": 0.00016527545909849752,
+      "loss": 0.553,
+      "step": 525
+    },
+    {
+      "epoch": 1.6149116064565718,
+      "grad_norm": 0.7783850431442261,
+      "learning_rate": 0.0001652086811352254,
+      "loss": 0.5449,
+      "step": 526
+    },
+    {
+      "epoch": 1.6179861644888547,
+      "grad_norm": 0.8254792094230652,
+      "learning_rate": 0.00016514190317195327,
+      "loss": 0.6773,
+      "step": 527
+    },
+    {
+      "epoch": 1.6210607225211375,
+      "grad_norm": 0.7466058731079102,
+      "learning_rate": 0.00016507512520868114,
+      "loss": 0.6807,
+      "step": 528
+    },
+    {
+      "epoch": 1.6241352805534204,
+      "grad_norm": 0.8844708800315857,
+      "learning_rate": 0.00016500834724540904,
+      "loss": 0.5325,
+      "step": 529
+    },
+    {
+      "epoch": 1.6272098385857032,
+      "grad_norm": 0.8244767189025879,
+      "learning_rate": 0.0001649415692821369,
+      "loss": 0.8234,
+      "step": 530
+    },
+    {
+      "epoch": 1.630284396617986,
+      "grad_norm": 0.6416113376617432,
+      "learning_rate": 0.00016487479131886478,
+      "loss": 0.5985,
+      "step": 531
+    },
+    {
+      "epoch": 1.633358954650269,
+      "grad_norm": 0.4929693341255188,
+      "learning_rate": 0.00016480801335559266,
+      "loss": 0.4375,
+      "step": 532
+    },
+    {
+      "epoch": 1.6364335126825518,
+      "grad_norm": 0.540748655796051,
+      "learning_rate": 0.00016474123539232053,
+      "loss": 0.4758,
+      "step": 533
+    },
+    {
+      "epoch": 1.6395080707148346,
+      "grad_norm": 0.8574146032333374,
+      "learning_rate": 0.0001646744574290484,
+      "loss": 0.7296,
+      "step": 534
+    },
+    {
+      "epoch": 1.6425826287471175,
+      "grad_norm": 0.7862269282341003,
+      "learning_rate": 0.0001646076794657763,
+      "loss": 0.7556,
+      "step": 535
+    },
+    {
+      "epoch": 1.6456571867794003,
+      "grad_norm": 0.6202278137207031,
+      "learning_rate": 0.00016454090150250418,
+      "loss": 0.5431,
+      "step": 536
+    },
+    {
+      "epoch": 1.6487317448116832,
+      "grad_norm": 0.580601155757904,
+      "learning_rate": 0.00016447412353923205,
+      "loss": 0.4694,
+      "step": 537
+    },
+    {
+      "epoch": 1.6518063028439662,
+      "grad_norm": 0.5990520715713501,
+      "learning_rate": 0.00016440734557595992,
+      "loss": 0.5506,
+      "step": 538
+    },
+    {
+      "epoch": 1.654880860876249,
+      "grad_norm": 0.5700373649597168,
+      "learning_rate": 0.00016434056761268782,
+      "loss": 0.6156,
+      "step": 539
+    },
+    {
+      "epoch": 1.657955418908532,
+      "grad_norm": 0.6192472577095032,
+      "learning_rate": 0.0001642737896494157,
+      "loss": 0.5789,
+      "step": 540
+    },
+    {
+      "epoch": 1.6610299769408148,
+      "grad_norm": 0.741287112236023,
+      "learning_rate": 0.0001642070116861436,
+      "loss": 0.5113,
+      "step": 541
+    },
+    {
+      "epoch": 1.6641045349730976,
+      "grad_norm": 0.609207272529602,
+      "learning_rate": 0.00016414023372287147,
+      "loss": 0.5957,
+      "step": 542
+    },
+    {
+      "epoch": 1.6671790930053805,
+      "grad_norm": 0.613161027431488,
+      "learning_rate": 0.00016407345575959934,
+      "loss": 0.5169,
+      "step": 543
+    },
+    {
+      "epoch": 1.6702536510376633,
+      "grad_norm": 0.6057065725326538,
+      "learning_rate": 0.00016400667779632722,
+      "loss": 0.3969,
+      "step": 544
+    },
+    {
+      "epoch": 1.6733282090699462,
+      "grad_norm": 0.6364975571632385,
+      "learning_rate": 0.00016393989983305512,
+      "loss": 0.5941,
+      "step": 545
+    },
+    {
+      "epoch": 1.676402767102229,
+      "grad_norm": 0.6298673152923584,
+      "learning_rate": 0.000163873121869783,
+      "loss": 0.5397,
+      "step": 546
+    },
+    {
+      "epoch": 1.6794773251345119,
+      "grad_norm": 0.5753400921821594,
+      "learning_rate": 0.00016380634390651086,
+      "loss": 0.5225,
+      "step": 547
+    },
+    {
+      "epoch": 1.682551883166795,
+      "grad_norm": 0.47216150164604187,
+      "learning_rate": 0.00016373956594323874,
+      "loss": 0.4412,
+      "step": 548
+    },
+    {
+      "epoch": 1.6856264411990778,
+      "grad_norm": 0.575374960899353,
+      "learning_rate": 0.0001636727879799666,
+      "loss": 0.563,
+      "step": 549
+    },
+    {
+      "epoch": 1.6887009992313606,
+      "grad_norm": 0.6871128678321838,
+      "learning_rate": 0.00016360601001669448,
+      "loss": 0.517,
+      "step": 550
+    },
+    {
+      "epoch": 1.6917755572636435,
+      "grad_norm": 0.6241912841796875,
+      "learning_rate": 0.00016353923205342238,
+      "loss": 0.5816,
+      "step": 551
+    },
+    {
+      "epoch": 1.6948501152959263,
+      "grad_norm": 0.5549102425575256,
+      "learning_rate": 0.00016347245409015026,
+      "loss": 0.5728,
+      "step": 552
+    },
+    {
+      "epoch": 1.6979246733282092,
+      "grad_norm": 0.8817942142486572,
+      "learning_rate": 0.00016340567612687813,
+      "loss": 0.56,
+      "step": 553
+    },
+    {
+      "epoch": 1.700999231360492,
+      "grad_norm": 0.7771773338317871,
+      "learning_rate": 0.000163338898163606,
+      "loss": 0.6107,
+      "step": 554
+    },
+    {
+      "epoch": 1.7040737893927749,
+      "grad_norm": 0.7410566210746765,
+      "learning_rate": 0.00016327212020033388,
+      "loss": 0.6591,
+      "step": 555
+    },
+    {
+      "epoch": 1.7071483474250577,
+      "grad_norm": 0.830802857875824,
+      "learning_rate": 0.00016320534223706178,
+      "loss": 0.6667,
+      "step": 556
+    },
+    {
+      "epoch": 1.7102229054573406,
+      "grad_norm": 0.593959629535675,
+      "learning_rate": 0.00016313856427378965,
+      "loss": 0.5319,
+      "step": 557
+    },
+    {
+      "epoch": 1.7132974634896234,
+      "grad_norm": 0.6377514004707336,
+      "learning_rate": 0.00016307178631051755,
+      "loss": 0.5966,
+      "step": 558
+    },
+    {
+      "epoch": 1.7163720215219063,
+      "grad_norm": 0.6252657771110535,
+      "learning_rate": 0.00016300500834724542,
+      "loss": 0.5337,
+      "step": 559
+    },
+    {
+      "epoch": 1.7194465795541891,
+      "grad_norm": 0.885527491569519,
+      "learning_rate": 0.0001629382303839733,
+      "loss": 0.8417,
+      "step": 560
+    },
+    {
+      "epoch": 1.722521137586472,
+      "grad_norm": 0.5693302154541016,
+      "learning_rate": 0.0001628714524207012,
+      "loss": 0.5038,
+      "step": 561
+    },
+    {
+      "epoch": 1.7255956956187548,
+      "grad_norm": 0.7291401624679565,
+      "learning_rate": 0.00016280467445742907,
+      "loss": 0.6994,
+      "step": 562
+    },
+    {
+      "epoch": 1.7286702536510377,
+      "grad_norm": 0.7223179340362549,
+      "learning_rate": 0.00016273789649415694,
+      "loss": 0.648,
+      "step": 563
+    },
+    {
+      "epoch": 1.7317448116833205,
+      "grad_norm": 0.7139200568199158,
+      "learning_rate": 0.00016267111853088482,
+      "loss": 0.5822,
+      "step": 564
+    },
+    {
+      "epoch": 1.7348193697156034,
+      "grad_norm": 0.5660908222198486,
+      "learning_rate": 0.0001626043405676127,
+      "loss": 0.3695,
+      "step": 565
+    },
+    {
+      "epoch": 1.7378939277478862,
+      "grad_norm": 0.698505163192749,
+      "learning_rate": 0.00016253756260434056,
+      "loss": 0.6601,
+      "step": 566
+    },
+    {
+      "epoch": 1.740968485780169,
+      "grad_norm": 0.5684105753898621,
+      "learning_rate": 0.00016247078464106846,
+      "loss": 0.6013,
+      "step": 567
+    },
+    {
+      "epoch": 1.744043043812452,
+      "grad_norm": 0.645592212677002,
+      "learning_rate": 0.00016240400667779634,
+      "loss": 0.6394,
+      "step": 568
+    },
+    {
+      "epoch": 1.7471176018447347,
+      "grad_norm": 0.6073788404464722,
+      "learning_rate": 0.0001623372287145242,
+      "loss": 0.4312,
+      "step": 569
+    },
+    {
+      "epoch": 1.7501921598770176,
+      "grad_norm": 0.7062597274780273,
+      "learning_rate": 0.00016227045075125208,
+      "loss": 0.5653,
+      "step": 570
+    },
+    {
+      "epoch": 1.7532667179093004,
+      "grad_norm": 0.5822290182113647,
+      "learning_rate": 0.00016220367278797996,
+      "loss": 0.5852,
+      "step": 571
+    },
+    {
+      "epoch": 1.7563412759415833,
+      "grad_norm": 0.6263893842697144,
+      "learning_rate": 0.00016213689482470786,
+      "loss": 0.6102,
+      "step": 572
+    },
+    {
+      "epoch": 1.7594158339738661,
+      "grad_norm": 0.7281681299209595,
+      "learning_rate": 0.00016207011686143573,
+      "loss": 0.6375,
+      "step": 573
+    },
+    {
+      "epoch": 1.762490392006149,
+      "grad_norm": 0.6217925548553467,
+      "learning_rate": 0.0001620033388981636,
+      "loss": 0.603,
+      "step": 574
+    },
+    {
+      "epoch": 1.7655649500384318,
+      "grad_norm": 0.822990357875824,
+      "learning_rate": 0.0001619365609348915,
+      "loss": 0.7641,
+      "step": 575
+    },
+    {
+      "epoch": 1.7686395080707147,
+      "grad_norm": 0.6625170111656189,
+      "learning_rate": 0.00016186978297161938,
+      "loss": 0.5701,
+      "step": 576
+    },
+    {
+      "epoch": 1.7717140661029975,
+      "grad_norm": 0.6847323179244995,
+      "learning_rate": 0.00016180300500834728,
+      "loss": 0.47,
+      "step": 577
+    },
+    {
+      "epoch": 1.7747886241352806,
+      "grad_norm": 0.6274866461753845,
+      "learning_rate": 0.00016173622704507515,
+      "loss": 0.4998,
+      "step": 578
+    },
+    {
+      "epoch": 1.7778631821675634,
+      "grad_norm": 0.7083932161331177,
+      "learning_rate": 0.00016166944908180302,
+      "loss": 0.6362,
+      "step": 579
+    },
+    {
+      "epoch": 1.7809377401998463,
+      "grad_norm": 0.7024930715560913,
+      "learning_rate": 0.0001616026711185309,
+      "loss": 0.6101,
+      "step": 580
+    },
+    {
+      "epoch": 1.7840122982321291,
+      "grad_norm": 0.9053730964660645,
+      "learning_rate": 0.00016153589315525877,
+      "loss": 0.7606,
+      "step": 581
+    },
+    {
+      "epoch": 1.787086856264412,
+      "grad_norm": 1.0986732244491577,
+      "learning_rate": 0.00016146911519198664,
+      "loss": 0.7126,
+      "step": 582
+    },
+    {
+      "epoch": 1.7901614142966948,
+      "grad_norm": 0.6207830309867859,
+      "learning_rate": 0.00016140233722871454,
+      "loss": 0.5338,
+      "step": 583
+    },
+    {
+      "epoch": 1.7932359723289777,
+      "grad_norm": 0.5910727977752686,
+      "learning_rate": 0.00016133555926544241,
+      "loss": 0.4771,
+      "step": 584
+    },
+    {
+      "epoch": 1.7963105303612605,
+      "grad_norm": 0.5598863363265991,
+      "learning_rate": 0.0001612687813021703,
+      "loss": 0.3635,
+      "step": 585
+    },
+    {
+      "epoch": 1.7993850883935434,
+      "grad_norm": 0.7183571457862854,
+      "learning_rate": 0.00016120200333889816,
+      "loss": 0.6022,
+      "step": 586
+    },
+    {
+      "epoch": 1.8024596464258262,
+      "grad_norm": 0.7178698182106018,
+      "learning_rate": 0.00016113522537562603,
+      "loss": 0.5143,
+      "step": 587
+    },
+    {
+      "epoch": 1.8055342044581093,
+      "grad_norm": 0.5767114162445068,
+      "learning_rate": 0.00016106844741235393,
+      "loss": 0.481,
+      "step": 588
+    },
+    {
+      "epoch": 1.8086087624903922,
+      "grad_norm": 0.6642889380455017,
+      "learning_rate": 0.0001610016694490818,
+      "loss": 0.5119,
+      "step": 589
+    },
+    {
+      "epoch": 1.811683320522675,
+      "grad_norm": 0.7314223647117615,
+      "learning_rate": 0.00016093489148580968,
+      "loss": 0.5836,
+      "step": 590
+    },
+    {
+      "epoch": 1.8147578785549578,
+      "grad_norm": 0.6860315799713135,
+      "learning_rate": 0.00016086811352253755,
+      "loss": 0.5669,
+      "step": 591
+    },
+    {
+      "epoch": 1.8178324365872407,
+      "grad_norm": 0.7875143885612488,
+      "learning_rate": 0.00016080133555926545,
+      "loss": 0.6005,
+      "step": 592
+    },
+    {
+      "epoch": 1.8209069946195235,
+      "grad_norm": 0.7283911108970642,
+      "learning_rate": 0.00016073455759599333,
+      "loss": 0.5565,
+      "step": 593
+    },
+    {
+      "epoch": 1.8239815526518064,
+      "grad_norm": 0.5864517092704773,
+      "learning_rate": 0.00016066777963272123,
+      "loss": 0.5659,
+      "step": 594
+    },
+    {
+      "epoch": 1.8270561106840892,
+      "grad_norm": 0.6149706244468689,
+      "learning_rate": 0.0001606010016694491,
+      "loss": 0.5811,
+      "step": 595
+    },
+    {
+      "epoch": 1.830130668716372,
+      "grad_norm": 0.6962308883666992,
+      "learning_rate": 0.00016053422370617697,
+      "loss": 0.6053,
+      "step": 596
+    },
+    {
+      "epoch": 1.833205226748655,
+      "grad_norm": 0.5711308121681213,
+      "learning_rate": 0.00016046744574290485,
+      "loss": 0.4212,
+      "step": 597
+    },
+    {
+      "epoch": 1.8362797847809378,
+      "grad_norm": 0.7618324756622314,
+      "learning_rate": 0.00016040066777963272,
+      "loss": 0.7267,
+      "step": 598
+    },
+    {
+      "epoch": 1.8393543428132206,
+      "grad_norm": 0.7906466126441956,
+      "learning_rate": 0.00016033388981636062,
+      "loss": 0.7056,
+      "step": 599
+    },
+    {
+      "epoch": 1.8424289008455035,
+      "grad_norm": 1.0188270807266235,
+      "learning_rate": 0.0001602671118530885,
+      "loss": 0.634,
+      "step": 600
+    },
+    {
+      "epoch": 1.8455034588777863,
+      "grad_norm": 0.7009850740432739,
+      "learning_rate": 0.00016020033388981637,
+      "loss": 0.4883,
+      "step": 601
+    },
+    {
+      "epoch": 1.8485780169100692,
+      "grad_norm": 0.8244671821594238,
+      "learning_rate": 0.00016013355592654424,
+      "loss": 0.7119,
+      "step": 602
+    },
+    {
+      "epoch": 1.851652574942352,
+      "grad_norm": 0.738471508026123,
+      "learning_rate": 0.0001600667779632721,
+      "loss": 0.6025,
+      "step": 603
+    },
+    {
+      "epoch": 1.8547271329746349,
+      "grad_norm": 0.6964389085769653,
+      "learning_rate": 0.00016,
+      "loss": 0.5,
+      "step": 604
+    },
+    {
+      "epoch": 1.8578016910069177,
+      "grad_norm": 0.5497778654098511,
+      "learning_rate": 0.00015993322203672789,
+      "loss": 0.5629,
+      "step": 605
+    },
+    {
+      "epoch": 1.8608762490392006,
+      "grad_norm": 0.644513726234436,
+      "learning_rate": 0.00015986644407345576,
+      "loss": 0.465,
+      "step": 606
+    },
+    {
+      "epoch": 1.8639508070714834,
+      "grad_norm": 0.6021044254302979,
+      "learning_rate": 0.00015979966611018363,
+      "loss": 0.4142,
+      "step": 607
+    },
+    {
+      "epoch": 1.8670253651037663,
+      "grad_norm": 0.669230043888092,
+      "learning_rate": 0.0001597328881469115,
+      "loss": 0.6143,
+      "step": 608
+    },
+    {
+      "epoch": 1.8700999231360491,
+      "grad_norm": 0.7413586378097534,
+      "learning_rate": 0.0001596661101836394,
+      "loss": 0.6182,
+      "step": 609
+    },
+    {
+      "epoch": 1.873174481168332,
+      "grad_norm": 0.6968368291854858,
+      "learning_rate": 0.00015959933222036728,
+      "loss": 0.5306,
+      "step": 610
+    },
+    {
+      "epoch": 1.8762490392006148,
+      "grad_norm": 0.6736475825309753,
+      "learning_rate": 0.00015953255425709518,
+      "loss": 0.5857,
+      "step": 611
+    },
+    {
+      "epoch": 1.8793235972328977,
+      "grad_norm": 0.6630072593688965,
+      "learning_rate": 0.00015946577629382305,
+      "loss": 0.4775,
+      "step": 612
+    },
+    {
+      "epoch": 1.8823981552651805,
+      "grad_norm": 0.6984624266624451,
+      "learning_rate": 0.00015939899833055093,
+      "loss": 0.5635,
+      "step": 613
+    },
+    {
+      "epoch": 1.8854727132974634,
+      "grad_norm": 0.6280466914176941,
+      "learning_rate": 0.0001593322203672788,
+      "loss": 0.8159,
+      "step": 614
+    },
+    {
+      "epoch": 1.8885472713297462,
+      "grad_norm": 0.7790103554725647,
+      "learning_rate": 0.0001592654424040067,
+      "loss": 0.594,
+      "step": 615
+    },
+    {
+      "epoch": 1.891621829362029,
+      "grad_norm": 0.704753041267395,
+      "learning_rate": 0.00015919866444073457,
+      "loss": 0.5726,
+      "step": 616
+    },
+    {
+      "epoch": 1.8946963873943121,
+      "grad_norm": 0.7425320148468018,
+      "learning_rate": 0.00015913188647746245,
+      "loss": 0.5657,
+      "step": 617
+    },
+    {
+      "epoch": 1.897770945426595,
+      "grad_norm": 0.6058589816093445,
+      "learning_rate": 0.00015906510851419032,
+      "loss": 0.4574,
+      "step": 618
+    },
+    {
+      "epoch": 1.9008455034588778,
+      "grad_norm": 0.811036229133606,
+      "learning_rate": 0.0001589983305509182,
+      "loss": 0.5719,
+      "step": 619
+    },
+    {
+      "epoch": 1.9039200614911607,
+      "grad_norm": 0.5609816908836365,
+      "learning_rate": 0.0001589315525876461,
+      "loss": 0.7197,
+      "step": 620
+    },
+    {
+      "epoch": 1.9069946195234435,
+      "grad_norm": 0.6295925974845886,
+      "learning_rate": 0.00015886477462437397,
+      "loss": 0.514,
+      "step": 621
+    },
+    {
+      "epoch": 1.9100691775557264,
+      "grad_norm": 0.9893009662628174,
+      "learning_rate": 0.00015879799666110184,
+      "loss": 0.6079,
+      "step": 622
+    },
+    {
+      "epoch": 1.9131437355880092,
+      "grad_norm": 0.6634209752082825,
+      "learning_rate": 0.0001587312186978297,
+      "loss": 0.5731,
+      "step": 623
+    },
+    {
+      "epoch": 1.916218293620292,
+      "grad_norm": 0.6897741556167603,
+      "learning_rate": 0.00015866444073455758,
+      "loss": 0.533,
+      "step": 624
+    },
+    {
+      "epoch": 1.919292851652575,
+      "grad_norm": 0.7442365884780884,
+      "learning_rate": 0.00015859766277128548,
+      "loss": 0.5796,
+      "step": 625
+    },
+    {
+      "epoch": 1.9223674096848578,
+      "grad_norm": 0.7648442387580872,
+      "learning_rate": 0.00015853088480801336,
+      "loss": 0.6745,
+      "step": 626
+    },
+    {
+      "epoch": 1.9254419677171408,
+      "grad_norm": 0.6118778586387634,
+      "learning_rate": 0.00015846410684474123,
+      "loss": 0.5577,
+      "step": 627
+    },
+    {
+      "epoch": 1.9285165257494237,
+      "grad_norm": 0.7464010715484619,
+      "learning_rate": 0.00015839732888146913,
+      "loss": 0.6854,
+      "step": 628
+    },
+    {
+      "epoch": 1.9315910837817065,
+      "grad_norm": 0.63694828748703,
+      "learning_rate": 0.000158330550918197,
+      "loss": 0.5794,
+      "step": 629
+    },
+    {
+      "epoch": 1.9346656418139894,
+      "grad_norm": 0.7984501123428345,
+      "learning_rate": 0.00015826377295492488,
+      "loss": 0.8564,
+      "step": 630
+    },
+    {
+      "epoch": 1.9377401998462722,
+      "grad_norm": 0.7075039744377136,
+      "learning_rate": 0.00015819699499165278,
+      "loss": 0.5751,
+      "step": 631
+    },
+    {
+      "epoch": 1.940814757878555,
+      "grad_norm": 0.6514005064964294,
+      "learning_rate": 0.00015813021702838065,
+      "loss": 0.6048,
+      "step": 632
+    },
+    {
+      "epoch": 1.943889315910838,
+      "grad_norm": 0.5643919706344604,
+      "learning_rate": 0.00015806343906510852,
+      "loss": 0.4348,
+      "step": 633
+    },
+    {
+      "epoch": 1.9469638739431208,
+      "grad_norm": 0.7066437005996704,
+      "learning_rate": 0.0001579966611018364,
+      "loss": 0.5822,
+      "step": 634
+    },
+    {
+      "epoch": 1.9500384319754036,
+      "grad_norm": 0.5992090106010437,
+      "learning_rate": 0.00015792988313856427,
+      "loss": 0.5614,
+      "step": 635
+    },
+    {
+      "epoch": 1.9531129900076865,
+      "grad_norm": 0.6332142353057861,
+      "learning_rate": 0.00015786310517529217,
+      "loss": 0.5655,
+      "step": 636
+    },
+    {
+      "epoch": 1.9561875480399693,
+      "grad_norm": 0.5068455934524536,
+      "learning_rate": 0.00015779632721202004,
+      "loss": 0.5389,
+      "step": 637
+    },
+    {
+      "epoch": 1.9592621060722522,
+      "grad_norm": 0.8024671673774719,
+      "learning_rate": 0.00015772954924874792,
+      "loss": 0.7261,
+      "step": 638
+    },
+    {
+      "epoch": 1.962336664104535,
+      "grad_norm": 0.8747161626815796,
+      "learning_rate": 0.0001576627712854758,
+      "loss": 0.6632,
+      "step": 639
+    },
+    {
+      "epoch": 1.9654112221368178,
+      "grad_norm": 0.5946447253227234,
+      "learning_rate": 0.00015759599332220366,
+      "loss": 0.5571,
+      "step": 640
+    },
+    {
+      "epoch": 1.9684857801691007,
+      "grad_norm": 0.7284528017044067,
+      "learning_rate": 0.00015752921535893156,
+      "loss": 0.6314,
+      "step": 641
+    },
+    {
+      "epoch": 1.9715603382013835,
+      "grad_norm": 0.824228823184967,
+      "learning_rate": 0.00015746243739565944,
+      "loss": 0.7593,
+      "step": 642
+    },
+    {
+      "epoch": 1.9746348962336664,
+      "grad_norm": 0.6937350034713745,
+      "learning_rate": 0.0001573956594323873,
+      "loss": 0.6647,
+      "step": 643
+    },
+    {
+      "epoch": 1.9777094542659492,
+      "grad_norm": 0.5793902277946472,
+      "learning_rate": 0.0001573288814691152,
+      "loss": 0.4004,
+      "step": 644
+    },
+    {
+      "epoch": 1.980784012298232,
+      "grad_norm": 0.7415186762809753,
+      "learning_rate": 0.00015726210350584308,
+      "loss": 0.454,
+      "step": 645
+    },
+    {
+      "epoch": 1.983858570330515,
+      "grad_norm": 0.6287279725074768,
+      "learning_rate": 0.00015719532554257096,
+      "loss": 0.6492,
+      "step": 646
+    },
+    {
+      "epoch": 1.9869331283627978,
+      "grad_norm": 0.7581256628036499,
+      "learning_rate": 0.00015712854757929886,
+      "loss": 0.6954,
+      "step": 647
+    },
+    {
+      "epoch": 1.9900076863950806,
+      "grad_norm": 0.7032405734062195,
+      "learning_rate": 0.00015706176961602673,
+      "loss": 0.601,
+      "step": 648
+    },
+    {
+      "epoch": 1.9930822444273635,
+      "grad_norm": 0.9088711142539978,
+      "learning_rate": 0.0001569949916527546,
+      "loss": 0.7629,
+      "step": 649
+    },
+    {
+      "epoch": 1.9961568024596463,
+      "grad_norm": 0.7218103408813477,
+      "learning_rate": 0.00015692821368948248,
+      "loss": 0.666,
+      "step": 650
+    },
+    {
+      "epoch": 1.9992313604919292,
+      "grad_norm": 0.7617568373680115,
+      "learning_rate": 0.00015686143572621035,
+      "loss": 0.6146,
+      "step": 651
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 1.2042289972305298,
+      "learning_rate": 0.00015679465776293825,
+      "loss": 0.5399,
+      "step": 652
+    },
+    {
+      "epoch": 2.003074558032283,
+      "grad_norm": 0.4528297781944275,
+      "learning_rate": 0.00015672787979966612,
+      "loss": 0.359,
+      "step": 653
+    },
+    {
+      "epoch": 2.0061491160645657,
+      "grad_norm": 0.6834194660186768,
+      "learning_rate": 0.000156661101836394,
+      "loss": 0.4381,
+      "step": 654
+    },
+    {
+      "epoch": 2.0092236740968485,
+      "grad_norm": 0.7097493410110474,
+      "learning_rate": 0.00015659432387312187,
+      "loss": 0.5073,
+      "step": 655
+    },
+    {
+      "epoch": 2.0122982321291314,
+      "grad_norm": 0.5966106057167053,
+      "learning_rate": 0.00015652754590984974,
+      "loss": 0.5188,
+      "step": 656
+    },
+    {
+      "epoch": 2.0153727901614142,
+      "grad_norm": 0.5781939029693604,
+      "learning_rate": 0.00015646076794657764,
+      "loss": 0.4936,
+      "step": 657
+    },
+    {
+      "epoch": 2.018447348193697,
+      "grad_norm": 0.6681936979293823,
+      "learning_rate": 0.00015639398998330552,
+      "loss": 0.5091,
+      "step": 658
+    },
+    {
+      "epoch": 2.02152190622598,
+      "grad_norm": 0.7436164617538452,
+      "learning_rate": 0.0001563272120200334,
+      "loss": 0.6323,
+      "step": 659
+    },
+    {
+      "epoch": 2.024596464258263,
+      "grad_norm": 0.59382164478302,
+      "learning_rate": 0.00015626043405676126,
+      "loss": 0.4239,
+      "step": 660
+    },
+    {
+      "epoch": 2.0276710222905456,
+      "grad_norm": 0.659829318523407,
+      "learning_rate": 0.00015619365609348916,
+      "loss": 0.4769,
+      "step": 661
+    },
+    {
+      "epoch": 2.0307455803228285,
+      "grad_norm": 0.6705843806266785,
+      "learning_rate": 0.00015612687813021704,
+      "loss": 0.5814,
+      "step": 662
+    },
+    {
+      "epoch": 2.0338201383551113,
+      "grad_norm": 0.6286864876747131,
+      "learning_rate": 0.00015606010016694494,
+      "loss": 0.3872,
+      "step": 663
+    },
+    {
+      "epoch": 2.036894696387394,
+      "grad_norm": 0.6223423480987549,
+      "learning_rate": 0.0001559933222036728,
+      "loss": 0.5428,
+      "step": 664
+    },
+    {
+      "epoch": 2.039969254419677,
+      "grad_norm": 0.7200874090194702,
+      "learning_rate": 0.00015592654424040068,
+      "loss": 0.5192,
+      "step": 665
+    },
+    {
+      "epoch": 2.04304381245196,
+      "grad_norm": 0.6993906497955322,
+      "learning_rate": 0.00015585976627712856,
+      "loss": 0.523,
+      "step": 666
+    },
+    {
+      "epoch": 2.0461183704842427,
+      "grad_norm": 0.7193444967269897,
+      "learning_rate": 0.00015579298831385643,
+      "loss": 0.4788,
+      "step": 667
+    },
+    {
+      "epoch": 2.0491929285165256,
+      "grad_norm": 0.7082064747810364,
+      "learning_rate": 0.00015572621035058433,
+      "loss": 0.5071,
+      "step": 668
+    },
+    {
+      "epoch": 2.0522674865488084,
+      "grad_norm": 0.6296613812446594,
+      "learning_rate": 0.0001556594323873122,
+      "loss": 0.5415,
+      "step": 669
+    },
+    {
+      "epoch": 2.0553420445810913,
+      "grad_norm": 1.0283185243606567,
+      "learning_rate": 0.00015559265442404007,
+      "loss": 0.4654,
+      "step": 670
+    },
+    {
+      "epoch": 2.058416602613374,
+      "grad_norm": 0.8436565399169922,
+      "learning_rate": 0.00015552587646076795,
+      "loss": 0.6448,
+      "step": 671
+    },
+    {
+      "epoch": 2.061491160645657,
+      "grad_norm": 0.5912400484085083,
+      "learning_rate": 0.00015545909849749582,
+      "loss": 0.4333,
+      "step": 672
+    },
+    {
+      "epoch": 2.0645657186779403,
+      "grad_norm": 0.7355748414993286,
+      "learning_rate": 0.00015539232053422372,
+      "loss": 0.4239,
+      "step": 673
+    },
+    {
+      "epoch": 2.067640276710223,
+      "grad_norm": 0.6488693952560425,
+      "learning_rate": 0.0001553255425709516,
+      "loss": 0.454,
+      "step": 674
+    },
+    {
+      "epoch": 2.070714834742506,
+      "grad_norm": 0.5765907764434814,
+      "learning_rate": 0.00015525876460767947,
+      "loss": 0.4197,
+      "step": 675
+    },
+    {
+      "epoch": 2.073789392774789,
+      "grad_norm": 0.9428765773773193,
+      "learning_rate": 0.00015519198664440734,
+      "loss": 0.6399,
+      "step": 676
+    },
+    {
+      "epoch": 2.0768639508070716,
+      "grad_norm": 0.6274253726005554,
+      "learning_rate": 0.00015512520868113521,
+      "loss": 0.5117,
+      "step": 677
+    },
+    {
+      "epoch": 2.0799385088393545,
+      "grad_norm": 0.6983177065849304,
+      "learning_rate": 0.00015505843071786311,
+      "loss": 0.3869,
+      "step": 678
+    },
+    {
+      "epoch": 2.0830130668716373,
+      "grad_norm": 0.6359655261039734,
+      "learning_rate": 0.000154991652754591,
+      "loss": 0.4617,
+      "step": 679
+    },
+    {
+      "epoch": 2.08608762490392,
+      "grad_norm": 0.6552188992500305,
+      "learning_rate": 0.0001549248747913189,
+      "loss": 0.4573,
+      "step": 680
+    },
+    {
+      "epoch": 2.089162182936203,
+      "grad_norm": 0.7065202593803406,
+      "learning_rate": 0.00015485809682804676,
+      "loss": 0.5351,
+      "step": 681
+    },
+    {
+      "epoch": 2.092236740968486,
+      "grad_norm": 0.7550007700920105,
+      "learning_rate": 0.00015479131886477463,
+      "loss": 0.5607,
+      "step": 682
+    },
+    {
+      "epoch": 2.0953112990007687,
+      "grad_norm": 0.6819210648536682,
+      "learning_rate": 0.0001547245409015025,
+      "loss": 0.4222,
+      "step": 683
+    },
+    {
+      "epoch": 2.0983858570330516,
+      "grad_norm": 0.5584454536437988,
+      "learning_rate": 0.0001546577629382304,
+      "loss": 0.3859,
+      "step": 684
+    },
+    {
+      "epoch": 2.1014604150653344,
+      "grad_norm": 0.7186906337738037,
+      "learning_rate": 0.00015459098497495828,
+      "loss": 0.4977,
+      "step": 685
+    },
+    {
+      "epoch": 2.1045349730976173,
+      "grad_norm": 0.763657808303833,
+      "learning_rate": 0.00015452420701168615,
+      "loss": 0.5239,
+      "step": 686
+    },
+    {
+      "epoch": 2.1076095311299,
+      "grad_norm": 0.6879159212112427,
+      "learning_rate": 0.00015445742904841403,
+      "loss": 0.5443,
+      "step": 687
+    },
+    {
+      "epoch": 2.110684089162183,
+      "grad_norm": 0.5697076916694641,
+      "learning_rate": 0.0001543906510851419,
+      "loss": 0.3364,
+      "step": 688
+    },
+    {
+      "epoch": 2.113758647194466,
+      "grad_norm": 0.6115249991416931,
+      "learning_rate": 0.0001543238731218698,
+      "loss": 0.5267,
+      "step": 689
+    },
+    {
+      "epoch": 2.1168332052267487,
+      "grad_norm": 0.6462056040763855,
+      "learning_rate": 0.00015425709515859767,
+      "loss": 0.4057,
+      "step": 690
+    },
+    {
+      "epoch": 2.1199077632590315,
+      "grad_norm": 0.6328736543655396,
+      "learning_rate": 0.00015419031719532555,
+      "loss": 0.4059,
+      "step": 691
+    },
+    {
+      "epoch": 2.1229823212913144,
+      "grad_norm": 0.6837843656539917,
+      "learning_rate": 0.00015412353923205342,
+      "loss": 0.4288,
+      "step": 692
+    },
+    {
+      "epoch": 2.126056879323597,
+      "grad_norm": 0.6994965672492981,
+      "learning_rate": 0.0001540567612687813,
+      "loss": 0.5656,
+      "step": 693
+    },
+    {
+      "epoch": 2.12913143735588,
+      "grad_norm": 0.6533644795417786,
+      "learning_rate": 0.0001539899833055092,
+      "loss": 0.3702,
+      "step": 694
+    },
+    {
+      "epoch": 2.132205995388163,
+      "grad_norm": 0.6903581023216248,
+      "learning_rate": 0.00015392320534223707,
+      "loss": 0.4216,
+      "step": 695
+    },
+    {
+      "epoch": 2.1352805534204458,
+      "grad_norm": 0.6325581669807434,
+      "learning_rate": 0.00015385642737896494,
+      "loss": 0.3968,
+      "step": 696
+    },
+    {
+      "epoch": 2.1383551114527286,
+      "grad_norm": 0.6790093779563904,
+      "learning_rate": 0.00015378964941569284,
+      "loss": 0.5112,
+      "step": 697
+    },
+    {
+      "epoch": 2.1414296694850115,
+      "grad_norm": 0.8143894672393799,
+      "learning_rate": 0.0001537228714524207,
+      "loss": 0.5103,
+      "step": 698
+    },
+    {
+      "epoch": 2.1445042275172943,
+      "grad_norm": 0.6844452023506165,
+      "learning_rate": 0.00015365609348914859,
+      "loss": 0.4344,
+      "step": 699
+    },
+    {
+      "epoch": 2.147578785549577,
+      "grad_norm": 1.0638381242752075,
+      "learning_rate": 0.00015358931552587649,
+      "loss": 0.5799,
+      "step": 700
+    },
+    {
+      "epoch": 2.15065334358186,
+      "grad_norm": 0.7177916169166565,
+      "learning_rate": 0.00015352253756260436,
+      "loss": 0.5159,
+      "step": 701
+    },
+    {
+      "epoch": 2.153727901614143,
+      "grad_norm": 0.5857630968093872,
+      "learning_rate": 0.00015345575959933223,
+      "loss": 0.452,
+      "step": 702
+    },
+    {
+      "epoch": 2.1568024596464257,
+      "grad_norm": 0.7078539729118347,
+      "learning_rate": 0.0001533889816360601,
+      "loss": 0.5155,
+      "step": 703
+    },
+    {
+      "epoch": 2.1598770176787085,
+      "grad_norm": 0.8432323932647705,
+      "learning_rate": 0.00015332220367278798,
+      "loss": 0.6138,
+      "step": 704
+    },
+    {
+      "epoch": 2.1629515757109914,
+      "grad_norm": 0.6977456212043762,
+      "learning_rate": 0.00015325542570951588,
+      "loss": 0.577,
+      "step": 705
+    },
+    {
+      "epoch": 2.1660261337432742,
+      "grad_norm": 0.72422194480896,
+      "learning_rate": 0.00015318864774624375,
+      "loss": 0.5873,
+      "step": 706
+    },
+    {
+      "epoch": 2.169100691775557,
+      "grad_norm": 0.846378743648529,
+      "learning_rate": 0.00015312186978297163,
+      "loss": 0.5319,
+      "step": 707
+    },
+    {
+      "epoch": 2.17217524980784,
+      "grad_norm": 0.6224305629730225,
+      "learning_rate": 0.0001530550918196995,
+      "loss": 0.6304,
+      "step": 708
+    },
+    {
+      "epoch": 2.175249807840123,
+      "grad_norm": 0.7657787203788757,
+      "learning_rate": 0.00015298831385642737,
+      "loss": 0.531,
+      "step": 709
+    },
+    {
+      "epoch": 2.178324365872406,
+      "grad_norm": 0.8921689987182617,
+      "learning_rate": 0.00015292153589315527,
+      "loss": 0.4925,
+      "step": 710
+    },
+    {
+      "epoch": 2.1813989239046885,
+      "grad_norm": 0.5680480003356934,
+      "learning_rate": 0.00015285475792988315,
+      "loss": 0.3959,
+      "step": 711
+    },
+    {
+      "epoch": 2.1844734819369718,
+      "grad_norm": 0.6384515166282654,
+      "learning_rate": 0.00015278797996661102,
+      "loss": 0.5023,
+      "step": 712
+    },
+    {
+      "epoch": 2.1875480399692546,
+      "grad_norm": 0.523273766040802,
+      "learning_rate": 0.0001527212020033389,
+      "loss": 0.3509,
+      "step": 713
+    },
+    {
+      "epoch": 2.1906225980015375,
+      "grad_norm": 0.6296597719192505,
+      "learning_rate": 0.0001526544240400668,
+      "loss": 0.4695,
+      "step": 714
+    },
+    {
+      "epoch": 2.1936971560338203,
+      "grad_norm": 0.6718856692314148,
+      "learning_rate": 0.00015258764607679466,
+      "loss": 0.4394,
+      "step": 715
+    },
+    {
+      "epoch": 2.196771714066103,
+      "grad_norm": 0.731511116027832,
+      "learning_rate": 0.00015252086811352257,
+      "loss": 0.3299,
+      "step": 716
+    },
+    {
+      "epoch": 2.199846272098386,
+      "grad_norm": 0.7541506886482239,
+      "learning_rate": 0.00015245409015025044,
+      "loss": 0.5043,
+      "step": 717
+    },
+    {
+      "epoch": 2.202920830130669,
+      "grad_norm": 0.8243811726570129,
+      "learning_rate": 0.0001523873121869783,
+      "loss": 0.6253,
+      "step": 718
+    },
+    {
+      "epoch": 2.2059953881629517,
+      "grad_norm": 0.7630672454833984,
+      "learning_rate": 0.00015232053422370618,
+      "loss": 0.4685,
+      "step": 719
+    },
+    {
+      "epoch": 2.2090699461952346,
+      "grad_norm": 0.6123481392860413,
+      "learning_rate": 0.00015225375626043406,
+      "loss": 0.4081,
+      "step": 720
+    },
+    {
+      "epoch": 2.2121445042275174,
+      "grad_norm": 0.6752267479896545,
+      "learning_rate": 0.00015218697829716196,
+      "loss": 0.4342,
+      "step": 721
+    },
+    {
+      "epoch": 2.2152190622598003,
+      "grad_norm": 0.913813054561615,
+      "learning_rate": 0.00015212020033388983,
+      "loss": 0.4762,
+      "step": 722
+    },
+    {
+      "epoch": 2.218293620292083,
+      "grad_norm": 0.7751143574714661,
+      "learning_rate": 0.0001520534223706177,
+      "loss": 0.5079,
+      "step": 723
+    },
+    {
+      "epoch": 2.221368178324366,
+      "grad_norm": 0.8524821996688843,
+      "learning_rate": 0.00015198664440734558,
+      "loss": 0.5464,
+      "step": 724
+    },
+    {
+      "epoch": 2.224442736356649,
+      "grad_norm": 0.8985180258750916,
+      "learning_rate": 0.00015191986644407345,
+      "loss": 0.5276,
+      "step": 725
+    },
+    {
+      "epoch": 2.2275172943889316,
+      "grad_norm": 0.6020591855049133,
+      "learning_rate": 0.00015185308848080135,
+      "loss": 0.3995,
+      "step": 726
+    },
+    {
+      "epoch": 2.2305918524212145,
+      "grad_norm": 0.7074214220046997,
+      "learning_rate": 0.00015178631051752922,
+      "loss": 0.4887,
+      "step": 727
+    },
+    {
+      "epoch": 2.2336664104534973,
+      "grad_norm": 0.7474585771560669,
+      "learning_rate": 0.0001517195325542571,
+      "loss": 0.5474,
+      "step": 728
+    },
+    {
+      "epoch": 2.23674096848578,
+      "grad_norm": 0.6883979439735413,
+      "learning_rate": 0.00015165275459098497,
+      "loss": 0.5503,
+      "step": 729
+    },
+    {
+      "epoch": 2.239815526518063,
+      "grad_norm": 0.6393066644668579,
+      "learning_rate": 0.00015158597662771284,
+      "loss": 0.4356,
+      "step": 730
+    },
+    {
+      "epoch": 2.242890084550346,
+      "grad_norm": 0.6586110591888428,
+      "learning_rate": 0.00015151919866444074,
+      "loss": 0.3659,
+      "step": 731
+    },
+    {
+      "epoch": 2.2459646425826287,
+      "grad_norm": 0.7263343930244446,
+      "learning_rate": 0.00015145242070116862,
+      "loss": 0.4629,
+      "step": 732
+    },
+    {
+      "epoch": 2.2490392006149116,
+      "grad_norm": 0.8680408000946045,
+      "learning_rate": 0.00015138564273789652,
+      "loss": 0.5825,
+      "step": 733
+    },
+    {
+      "epoch": 2.2521137586471944,
+      "grad_norm": 0.5599681735038757,
+      "learning_rate": 0.0001513188647746244,
+      "loss": 0.4886,
+      "step": 734
+    },
+    {
+      "epoch": 2.2551883166794773,
+      "grad_norm": 0.7630482316017151,
+      "learning_rate": 0.00015125208681135226,
+      "loss": 0.496,
+      "step": 735
+    },
+    {
+      "epoch": 2.25826287471176,
+      "grad_norm": 0.6882701516151428,
+      "learning_rate": 0.00015118530884808014,
+      "loss": 0.5949,
+      "step": 736
+    },
+    {
+      "epoch": 2.261337432744043,
+      "grad_norm": 0.7318270802497864,
+      "learning_rate": 0.00015111853088480804,
+      "loss": 0.5267,
+      "step": 737
+    },
+    {
+      "epoch": 2.264411990776326,
+      "grad_norm": 0.8890166878700256,
+      "learning_rate": 0.0001510517529215359,
+      "loss": 0.587,
+      "step": 738
+    },
+    {
+      "epoch": 2.2674865488086087,
+      "grad_norm": 0.735357940196991,
+      "learning_rate": 0.00015098497495826378,
+      "loss": 0.53,
+      "step": 739
+    },
+    {
+      "epoch": 2.2705611068408915,
+      "grad_norm": 0.6169731616973877,
+      "learning_rate": 0.00015091819699499166,
+      "loss": 0.3872,
+      "step": 740
+    },
+    {
+      "epoch": 2.2736356648731744,
+      "grad_norm": 0.6245728135108948,
+      "learning_rate": 0.00015085141903171953,
+      "loss": 0.4331,
+      "step": 741
+    },
+    {
+      "epoch": 2.276710222905457,
+      "grad_norm": 0.6054602265357971,
+      "learning_rate": 0.00015078464106844743,
+      "loss": 0.4014,
+      "step": 742
+    },
+    {
+      "epoch": 2.27978478093774,
+      "grad_norm": 0.6015118956565857,
+      "learning_rate": 0.0001507178631051753,
+      "loss": 0.4942,
+      "step": 743
+    },
+    {
+      "epoch": 2.282859338970023,
+      "grad_norm": 0.7360993027687073,
+      "learning_rate": 0.00015065108514190318,
+      "loss": 0.5544,
+      "step": 744
+    },
+    {
+      "epoch": 2.2859338970023058,
+      "grad_norm": 0.8529961109161377,
+      "learning_rate": 0.00015058430717863105,
+      "loss": 0.5008,
+      "step": 745
+    },
+    {
+      "epoch": 2.2890084550345886,
+      "grad_norm": 0.7723920345306396,
+      "learning_rate": 0.00015051752921535892,
+      "loss": 0.5894,
+      "step": 746
+    },
+    {
+      "epoch": 2.2920830130668715,
+      "grad_norm": 0.8459378480911255,
+      "learning_rate": 0.0001504507512520868,
+      "loss": 0.6038,
+      "step": 747
+    },
+    {
+      "epoch": 2.2951575710991543,
+      "grad_norm": 0.732806384563446,
+      "learning_rate": 0.0001503839732888147,
+      "loss": 0.4986,
+      "step": 748
+    },
+    {
+      "epoch": 2.2982321291314376,
+      "grad_norm": 0.6265669465065002,
+      "learning_rate": 0.00015031719532554257,
+      "loss": 0.3117,
+      "step": 749
+    },
+    {
+      "epoch": 2.30130668716372,
+      "grad_norm": 0.6586902141571045,
+      "learning_rate": 0.00015025041736227047,
+      "loss": 0.3206,
+      "step": 750
+    },
+    {
+      "epoch": 2.3043812451960033,
+      "grad_norm": 0.5551536679267883,
+      "learning_rate": 0.00015018363939899834,
+      "loss": 0.4016,
+      "step": 751
+    },
+    {
+      "epoch": 2.3074558032282857,
+      "grad_norm": 0.8721263408660889,
+      "learning_rate": 0.00015011686143572622,
+      "loss": 0.595,
+      "step": 752
+    },
+    {
+      "epoch": 2.310530361260569,
+      "grad_norm": 0.7609719038009644,
+      "learning_rate": 0.00015005008347245412,
+      "loss": 0.4655,
+      "step": 753
+    },
+    {
+      "epoch": 2.313604919292852,
+      "grad_norm": 0.8068011999130249,
+      "learning_rate": 0.000149983305509182,
+      "loss": 0.5389,
+      "step": 754
+    },
+    {
+      "epoch": 2.3166794773251347,
+      "grad_norm": 0.5893248319625854,
+      "learning_rate": 0.00014991652754590986,
+      "loss": 0.3675,
+      "step": 755
+    },
+    {
+      "epoch": 2.3197540353574175,
+      "grad_norm": 0.4989778399467468,
+      "learning_rate": 0.00014984974958263774,
+      "loss": 0.3376,
+      "step": 756
+    },
+    {
+      "epoch": 2.3228285933897004,
+      "grad_norm": 0.8979980945587158,
+      "learning_rate": 0.0001497829716193656,
+      "loss": 0.4876,
+      "step": 757
+    },
+    {
+      "epoch": 2.3259031514219832,
+      "grad_norm": 0.6380670070648193,
+      "learning_rate": 0.0001497161936560935,
+      "loss": 0.4801,
+      "step": 758
+    },
+    {
+      "epoch": 2.328977709454266,
+      "grad_norm": 0.7083134651184082,
+      "learning_rate": 0.00014964941569282138,
+      "loss": 0.5479,
+      "step": 759
+    },
+    {
+      "epoch": 2.332052267486549,
+      "grad_norm": 0.6810340881347656,
+      "learning_rate": 0.00014958263772954926,
+      "loss": 0.4373,
+      "step": 760
+    },
+    {
+      "epoch": 2.3351268255188318,
+      "grad_norm": 0.7883718013763428,
+      "learning_rate": 0.00014951585976627713,
+      "loss": 0.5557,
+      "step": 761
+    },
+    {
+      "epoch": 2.3382013835511146,
+      "grad_norm": 0.644123375415802,
+      "learning_rate": 0.000149449081803005,
+      "loss": 0.4117,
+      "step": 762
+    },
+    {
+      "epoch": 2.3412759415833975,
+      "grad_norm": 0.8770838975906372,
+      "learning_rate": 0.00014938230383973287,
+      "loss": 0.4398,
+      "step": 763
+    },
+    {
+      "epoch": 2.3443504996156803,
+      "grad_norm": 0.603274405002594,
+      "learning_rate": 0.00014931552587646077,
+      "loss": 0.4261,
+      "step": 764
+    },
+    {
+      "epoch": 2.347425057647963,
+      "grad_norm": 0.7817360162734985,
+      "learning_rate": 0.00014924874791318865,
+      "loss": 0.471,
+      "step": 765
+    },
+    {
+      "epoch": 2.350499615680246,
+      "grad_norm": 0.703245222568512,
+      "learning_rate": 0.00014918196994991652,
+      "loss": 0.4605,
+      "step": 766
+    },
+    {
+      "epoch": 2.353574173712529,
+      "grad_norm": 0.6251977682113647,
+      "learning_rate": 0.00014911519198664442,
+      "loss": 0.4783,
+      "step": 767
+    },
+    {
+      "epoch": 2.3566487317448117,
+      "grad_norm": 0.8665552735328674,
+      "learning_rate": 0.0001490484140233723,
+      "loss": 0.4498,
+      "step": 768
+    },
+    {
+      "epoch": 2.3597232897770946,
+      "grad_norm": 0.7540160417556763,
+      "learning_rate": 0.0001489816360601002,
+      "loss": 0.4788,
+      "step": 769
+    },
+    {
+      "epoch": 2.3627978478093774,
+      "grad_norm": 0.7006065845489502,
+      "learning_rate": 0.00014891485809682807,
+      "loss": 0.4451,
+      "step": 770
+    },
+    {
+      "epoch": 2.3658724058416603,
+      "grad_norm": 0.7307246923446655,
+      "learning_rate": 0.00014884808013355594,
+      "loss": 0.5392,
+      "step": 771
+    },
+    {
+      "epoch": 2.368946963873943,
+      "grad_norm": 0.7006644606590271,
+      "learning_rate": 0.00014878130217028381,
+      "loss": 0.5656,
+      "step": 772
+    },
+    {
+      "epoch": 2.372021521906226,
+      "grad_norm": 0.8450719714164734,
+      "learning_rate": 0.0001487145242070117,
+      "loss": 0.6446,
+      "step": 773
+    },
+    {
+      "epoch": 2.375096079938509,
+      "grad_norm": 0.7223272323608398,
+      "learning_rate": 0.0001486477462437396,
+      "loss": 0.4977,
+      "step": 774
+    },
+    {
+      "epoch": 2.3781706379707916,
+      "grad_norm": 0.7771975994110107,
+      "learning_rate": 0.00014858096828046746,
+      "loss": 0.5423,
+      "step": 775
+    },
+    {
+      "epoch": 2.3812451960030745,
+      "grad_norm": 0.6998997926712036,
+      "learning_rate": 0.00014851419031719533,
+      "loss": 0.4189,
+      "step": 776
+    },
+    {
+      "epoch": 2.3843197540353573,
+      "grad_norm": 0.7170137166976929,
+      "learning_rate": 0.0001484474123539232,
+      "loss": 0.5548,
+      "step": 777
+    },
+    {
+      "epoch": 2.38739431206764,
+      "grad_norm": 0.7737225294113159,
+      "learning_rate": 0.00014838063439065108,
+      "loss": 0.5361,
+      "step": 778
+    },
+    {
+      "epoch": 2.390468870099923,
+      "grad_norm": 0.6768509149551392,
+      "learning_rate": 0.00014831385642737895,
+      "loss": 0.4285,
+      "step": 779
+    },
+    {
+      "epoch": 2.393543428132206,
+      "grad_norm": 0.7848289608955383,
+      "learning_rate": 0.00014824707846410685,
+      "loss": 0.5096,
+      "step": 780
+    },
+    {
+      "epoch": 2.3966179861644887,
+      "grad_norm": 0.7384264469146729,
+      "learning_rate": 0.00014818030050083473,
+      "loss": 0.5718,
+      "step": 781
+    },
+    {
+      "epoch": 2.3996925441967716,
+      "grad_norm": 0.508388876914978,
+      "learning_rate": 0.0001481135225375626,
+      "loss": 0.3681,
+      "step": 782
+    },
+    {
+      "epoch": 2.4027671022290544,
+      "grad_norm": 0.6172118186950684,
+      "learning_rate": 0.00014804674457429047,
+      "loss": 0.3936,
+      "step": 783
+    },
+    {
+      "epoch": 2.4058416602613373,
+      "grad_norm": 0.7471083998680115,
+      "learning_rate": 0.00014797996661101837,
+      "loss": 0.4298,
+      "step": 784
+    },
+    {
+      "epoch": 2.40891621829362,
+      "grad_norm": 0.6412104964256287,
+      "learning_rate": 0.00014791318864774625,
+      "loss": 0.4488,
+      "step": 785
+    },
+    {
+      "epoch": 2.411990776325903,
+      "grad_norm": 0.5242339372634888,
+      "learning_rate": 0.00014784641068447415,
+      "loss": 0.4069,
+      "step": 786
+    },
+    {
+      "epoch": 2.415065334358186,
+      "grad_norm": 0.7063101530075073,
+      "learning_rate": 0.00014777963272120202,
+      "loss": 0.399,
+      "step": 787
+    },
+    {
+      "epoch": 2.418139892390469,
+      "grad_norm": 0.750368595123291,
+      "learning_rate": 0.0001477128547579299,
+      "loss": 0.4841,
+      "step": 788
+    },
+    {
+      "epoch": 2.4212144504227515,
+      "grad_norm": 0.6533263325691223,
+      "learning_rate": 0.00014764607679465777,
+      "loss": 0.5361,
+      "step": 789
+    },
+    {
+      "epoch": 2.424289008455035,
+      "grad_norm": 0.7714757323265076,
+      "learning_rate": 0.00014757929883138567,
+      "loss": 0.5144,
+      "step": 790
+    },
+    {
+      "epoch": 2.427363566487317,
+      "grad_norm": 0.6196386218070984,
+      "learning_rate": 0.00014751252086811354,
+      "loss": 0.3753,
+      "step": 791
+    },
+    {
+      "epoch": 2.4304381245196005,
+      "grad_norm": 0.822083055973053,
+      "learning_rate": 0.0001474457429048414,
+      "loss": 0.5603,
+      "step": 792
+    },
+    {
+      "epoch": 2.4335126825518834,
+      "grad_norm": 0.919624924659729,
+      "learning_rate": 0.00014737896494156929,
+      "loss": 0.7186,
+      "step": 793
+    },
+    {
+      "epoch": 2.436587240584166,
+      "grad_norm": 0.7581265568733215,
+      "learning_rate": 0.00014731218697829716,
+      "loss": 0.4248,
+      "step": 794
+    },
+    {
+      "epoch": 2.439661798616449,
+      "grad_norm": 0.7717792391777039,
+      "learning_rate": 0.00014724540901502506,
+      "loss": 0.5697,
+      "step": 795
+    },
+    {
+      "epoch": 2.442736356648732,
+      "grad_norm": 0.7188724875450134,
+      "learning_rate": 0.00014717863105175293,
+      "loss": 0.5401,
+      "step": 796
+    },
+    {
+      "epoch": 2.4458109146810147,
+      "grad_norm": 0.7343811392784119,
+      "learning_rate": 0.0001471118530884808,
+      "loss": 0.5252,
+      "step": 797
+    },
+    {
+      "epoch": 2.4488854727132976,
+      "grad_norm": 0.8835532665252686,
+      "learning_rate": 0.00014704507512520868,
+      "loss": 0.4707,
+      "step": 798
+    },
+    {
+      "epoch": 2.4519600307455804,
+      "grad_norm": 0.8905605673789978,
+      "learning_rate": 0.00014697829716193655,
+      "loss": 0.6641,
+      "step": 799
+    },
+    {
+      "epoch": 2.4550345887778633,
+      "grad_norm": 0.6634113192558289,
+      "learning_rate": 0.00014691151919866443,
+      "loss": 0.5159,
+      "step": 800
+    },
+    {
+      "epoch": 2.458109146810146,
+      "grad_norm": 0.6292420625686646,
+      "learning_rate": 0.00014684474123539233,
+      "loss": 0.4333,
+      "step": 801
+    },
+    {
+      "epoch": 2.461183704842429,
+      "grad_norm": 0.806917667388916,
+      "learning_rate": 0.0001467779632721202,
+      "loss": 0.4925,
+      "step": 802
+    },
+    {
+      "epoch": 2.464258262874712,
+      "grad_norm": 0.7074801921844482,
+      "learning_rate": 0.0001467111853088481,
+      "loss": 0.5207,
+      "step": 803
+    },
+    {
+      "epoch": 2.4673328209069947,
+      "grad_norm": 0.6873858571052551,
+      "learning_rate": 0.00014664440734557597,
+      "loss": 0.575,
+      "step": 804
+    },
+    {
+      "epoch": 2.4704073789392775,
+      "grad_norm": 0.7576258182525635,
+      "learning_rate": 0.00014657762938230385,
+      "loss": 0.4433,
+      "step": 805
+    },
+    {
+      "epoch": 2.4734819369715604,
+      "grad_norm": 0.8473274111747742,
+      "learning_rate": 0.00014651085141903175,
+      "loss": 0.5218,
+      "step": 806
+    },
+    {
+      "epoch": 2.4765564950038432,
+      "grad_norm": 0.6038965582847595,
+      "learning_rate": 0.00014644407345575962,
+      "loss": 0.3794,
+      "step": 807
+    },
+    {
+      "epoch": 2.479631053036126,
+      "grad_norm": 0.714070200920105,
+      "learning_rate": 0.0001463772954924875,
+      "loss": 0.542,
+      "step": 808
+    },
+    {
+      "epoch": 2.482705611068409,
+      "grad_norm": 0.6756383776664734,
+      "learning_rate": 0.00014631051752921536,
+      "loss": 0.5188,
+      "step": 809
+    },
+    {
+      "epoch": 2.4857801691006918,
+      "grad_norm": 0.6580228209495544,
+      "learning_rate": 0.00014624373956594324,
+      "loss": 0.455,
+      "step": 810
+    },
+    {
+      "epoch": 2.4888547271329746,
+      "grad_norm": 0.7520489692687988,
+      "learning_rate": 0.00014617696160267114,
+      "loss": 0.6004,
+      "step": 811
+    },
+    {
+      "epoch": 2.4919292851652575,
+      "grad_norm": 0.6205190420150757,
+      "learning_rate": 0.000146110183639399,
+      "loss": 0.4561,
+      "step": 812
+    },
+    {
+      "epoch": 2.4950038431975403,
+      "grad_norm": 0.6518359780311584,
+      "learning_rate": 0.00014604340567612688,
+      "loss": 0.4629,
+      "step": 813
+    },
+    {
+      "epoch": 2.498078401229823,
+      "grad_norm": 0.8324114680290222,
+      "learning_rate": 0.00014597662771285476,
+      "loss": 0.4107,
+      "step": 814
+    },
+    {
+      "epoch": 2.501152959262106,
+      "grad_norm": 0.62924724817276,
+      "learning_rate": 0.00014590984974958263,
+      "loss": 0.4224,
+      "step": 815
+    },
+    {
+      "epoch": 2.504227517294389,
+      "grad_norm": 0.6838513612747192,
+      "learning_rate": 0.0001458430717863105,
+      "loss": 0.4847,
+      "step": 816
+    },
+    {
+      "epoch": 2.5073020753266717,
+      "grad_norm": 0.5814975500106812,
+      "learning_rate": 0.0001457762938230384,
+      "loss": 0.4064,
+      "step": 817
+    },
+    {
+      "epoch": 2.5103766333589546,
+      "grad_norm": 0.7436339855194092,
+      "learning_rate": 0.00014570951585976628,
+      "loss": 0.4815,
+      "step": 818
+    },
+    {
+      "epoch": 2.5134511913912374,
+      "grad_norm": 0.672369658946991,
+      "learning_rate": 0.00014564273789649415,
+      "loss": 0.494,
+      "step": 819
+    },
+    {
+      "epoch": 2.5165257494235203,
+      "grad_norm": 0.7163512110710144,
+      "learning_rate": 0.00014557595993322205,
+      "loss": 0.3771,
+      "step": 820
+    },
+    {
+      "epoch": 2.519600307455803,
+      "grad_norm": 0.814750611782074,
+      "learning_rate": 0.00014550918196994992,
+      "loss": 0.5173,
+      "step": 821
+    },
+    {
+      "epoch": 2.522674865488086,
+      "grad_norm": 0.8272102475166321,
+      "learning_rate": 0.00014544240400667782,
+      "loss": 0.4774,
+      "step": 822
+    },
+    {
+      "epoch": 2.525749423520369,
+      "grad_norm": 0.7299224734306335,
+      "learning_rate": 0.0001453756260434057,
+      "loss": 0.539,
+      "step": 823
+    },
+    {
+      "epoch": 2.5288239815526516,
+      "grad_norm": 0.6639888882637024,
+      "learning_rate": 0.00014530884808013357,
+      "loss": 0.4437,
+      "step": 824
+    },
+    {
+      "epoch": 2.531898539584935,
+      "grad_norm": 0.5353997945785522,
+      "learning_rate": 0.00014524207011686144,
+      "loss": 0.3794,
+      "step": 825
+    },
+    {
+      "epoch": 2.5349730976172173,
+      "grad_norm": 0.6737149357795715,
+      "learning_rate": 0.00014517529215358932,
+      "loss": 0.5088,
+      "step": 826
+    },
+    {
+      "epoch": 2.5380476556495006,
+      "grad_norm": 0.6940316557884216,
+      "learning_rate": 0.00014510851419031722,
+      "loss": 0.3551,
+      "step": 827
+    },
+    {
+      "epoch": 2.541122213681783,
+      "grad_norm": 0.5293498039245605,
+      "learning_rate": 0.0001450417362270451,
+      "loss": 0.4501,
+      "step": 828
+    },
+    {
+      "epoch": 2.5441967717140663,
+      "grad_norm": 0.8832515478134155,
+      "learning_rate": 0.00014497495826377296,
+      "loss": 0.6148,
+      "step": 829
+    },
+    {
+      "epoch": 2.5472713297463487,
+      "grad_norm": 0.8401015996932983,
+      "learning_rate": 0.00014490818030050084,
+      "loss": 0.5798,
+      "step": 830
+    },
+    {
+      "epoch": 2.550345887778632,
+      "grad_norm": 0.8171026110649109,
+      "learning_rate": 0.0001448414023372287,
+      "loss": 0.4115,
+      "step": 831
+    },
+    {
+      "epoch": 2.5534204458109144,
+      "grad_norm": 0.6658011674880981,
+      "learning_rate": 0.00014477462437395658,
+      "loss": 0.4396,
+      "step": 832
+    },
+    {
+      "epoch": 2.5564950038431977,
+      "grad_norm": 0.6402685046195984,
+      "learning_rate": 0.00014470784641068448,
+      "loss": 0.3923,
+      "step": 833
+    },
+    {
+      "epoch": 2.55956956187548,
+      "grad_norm": 0.7223045229911804,
+      "learning_rate": 0.00014464106844741236,
+      "loss": 0.5936,
+      "step": 834
+    },
+    {
+      "epoch": 2.5626441199077634,
+      "grad_norm": 0.7487578988075256,
+      "learning_rate": 0.00014457429048414023,
+      "loss": 0.6008,
+      "step": 835
+    },
+    {
+      "epoch": 2.5657186779400463,
+      "grad_norm": 0.7661901712417603,
+      "learning_rate": 0.0001445075125208681,
+      "loss": 0.4106,
+      "step": 836
+    },
+    {
+      "epoch": 2.568793235972329,
+      "grad_norm": 0.6282891035079956,
+      "learning_rate": 0.000144440734557596,
+      "loss": 0.3504,
+      "step": 837
+    },
+    {
+      "epoch": 2.571867794004612,
+      "grad_norm": 0.7049952745437622,
+      "learning_rate": 0.00014437395659432388,
+      "loss": 0.5956,
+      "step": 838
+    },
+    {
+      "epoch": 2.574942352036895,
+      "grad_norm": 0.6975913643836975,
+      "learning_rate": 0.00014430717863105178,
+      "loss": 0.4768,
+      "step": 839
+    },
+    {
+      "epoch": 2.5780169100691777,
+      "grad_norm": 0.7281587719917297,
+      "learning_rate": 0.00014424040066777965,
+      "loss": 0.5222,
+      "step": 840
+    },
+    {
+      "epoch": 2.5810914681014605,
+      "grad_norm": 0.864368200302124,
+      "learning_rate": 0.00014417362270450752,
+      "loss": 0.5532,
+      "step": 841
+    },
+    {
+      "epoch": 2.5841660261337434,
+      "grad_norm": 0.634505569934845,
+      "learning_rate": 0.0001441068447412354,
+      "loss": 0.3804,
+      "step": 842
+    },
+    {
+      "epoch": 2.587240584166026,
+      "grad_norm": 0.6007309556007385,
+      "learning_rate": 0.0001440400667779633,
+      "loss": 0.5151,
+      "step": 843
+    },
+    {
+      "epoch": 2.590315142198309,
+      "grad_norm": 0.9483073353767395,
+      "learning_rate": 0.00014397328881469117,
+      "loss": 0.5779,
+      "step": 844
+    },
+    {
+      "epoch": 2.593389700230592,
+      "grad_norm": 0.8563257455825806,
+      "learning_rate": 0.00014390651085141904,
+      "loss": 0.606,
+      "step": 845
+    },
+    {
+      "epoch": 2.5964642582628747,
+      "grad_norm": 0.6220794320106506,
+      "learning_rate": 0.00014383973288814692,
+      "loss": 0.4883,
+      "step": 846
+    },
+    {
+      "epoch": 2.5995388162951576,
+      "grad_norm": 0.6485925912857056,
+      "learning_rate": 0.0001437729549248748,
+      "loss": 0.4443,
+      "step": 847
+    },
+    {
+      "epoch": 2.6026133743274404,
+      "grad_norm": 0.8992952108383179,
+      "learning_rate": 0.00014370617696160266,
+      "loss": 0.6118,
+      "step": 848
+    },
+    {
+      "epoch": 2.6056879323597233,
+      "grad_norm": 0.5959873199462891,
+      "learning_rate": 0.00014363939899833056,
+      "loss": 0.4875,
+      "step": 849
+    },
+    {
+      "epoch": 2.608762490392006,
+      "grad_norm": 0.8172950744628906,
+      "learning_rate": 0.00014357262103505844,
+      "loss": 0.5899,
+      "step": 850
+    },
+    {
+      "epoch": 2.611837048424289,
+      "grad_norm": 1.0087146759033203,
+      "learning_rate": 0.0001435058430717863,
+      "loss": 0.6385,
+      "step": 851
+    },
+    {
+      "epoch": 2.614911606456572,
+      "grad_norm": 0.6918483376502991,
+      "learning_rate": 0.00014343906510851418,
+      "loss": 0.4764,
+      "step": 852
+    },
+    {
+      "epoch": 2.6179861644888547,
+      "grad_norm": 0.8268954753875732,
+      "learning_rate": 0.00014337228714524205,
+      "loss": 0.535,
+      "step": 853
+    },
+    {
+      "epoch": 2.6210607225211375,
+      "grad_norm": 0.8672003746032715,
+      "learning_rate": 0.00014330550918196995,
+      "loss": 0.5815,
+      "step": 854
+    },
+    {
+      "epoch": 2.6241352805534204,
+      "grad_norm": 0.6377939581871033,
+      "learning_rate": 0.00014323873121869783,
+      "loss": 0.4999,
+      "step": 855
+    },
+    {
+      "epoch": 2.6272098385857032,
+      "grad_norm": 0.6987239718437195,
+      "learning_rate": 0.00014317195325542573,
+      "loss": 0.534,
+      "step": 856
+    },
+    {
+      "epoch": 2.630284396617986,
+      "grad_norm": 0.7003011107444763,
+      "learning_rate": 0.0001431051752921536,
+      "loss": 0.5054,
+      "step": 857
+    },
+    {
+      "epoch": 2.633358954650269,
+      "grad_norm": 0.5871327519416809,
+      "learning_rate": 0.00014303839732888147,
+      "loss": 0.425,
+      "step": 858
+    },
+    {
+      "epoch": 2.6364335126825518,
+      "grad_norm": 0.6714287996292114,
+      "learning_rate": 0.00014297161936560937,
+      "loss": 0.5268,
+      "step": 859
+    },
+    {
+      "epoch": 2.6395080707148346,
+      "grad_norm": 0.8090579509735107,
+      "learning_rate": 0.00014290484140233725,
+      "loss": 0.4094,
+      "step": 860
+    },
+    {
+      "epoch": 2.6425826287471175,
+      "grad_norm": 0.6854161620140076,
+      "learning_rate": 0.00014283806343906512,
+      "loss": 0.4385,
+      "step": 861
+    },
+    {
+      "epoch": 2.6456571867794003,
+      "grad_norm": 0.8665665984153748,
+      "learning_rate": 0.000142771285475793,
+      "loss": 0.5323,
+      "step": 862
+    },
+    {
+      "epoch": 2.648731744811683,
+      "grad_norm": 0.6155755519866943,
+      "learning_rate": 0.00014270450751252087,
+      "loss": 0.5086,
+      "step": 863
+    },
+    {
+      "epoch": 2.6518063028439665,
+      "grad_norm": 0.6008875370025635,
+      "learning_rate": 0.00014263772954924874,
+      "loss": 0.4709,
+      "step": 864
+    },
+    {
+      "epoch": 2.654880860876249,
+      "grad_norm": 0.6181650161743164,
+      "learning_rate": 0.00014257095158597664,
+      "loss": 0.4797,
+      "step": 865
+    },
+    {
+      "epoch": 2.657955418908532,
+      "grad_norm": 0.7965251803398132,
+      "learning_rate": 0.00014250417362270451,
+      "loss": 0.5568,
+      "step": 866
+    },
+    {
+      "epoch": 2.6610299769408146,
+      "grad_norm": 0.6701710224151611,
+      "learning_rate": 0.0001424373956594324,
+      "loss": 0.5649,
+      "step": 867
+    },
+    {
+      "epoch": 2.664104534973098,
+      "grad_norm": 0.7391377091407776,
+      "learning_rate": 0.00014237061769616026,
+      "loss": 0.5245,
+      "step": 868
+    },
+    {
+      "epoch": 2.6671790930053803,
+      "grad_norm": 0.6421666741371155,
+      "learning_rate": 0.00014230383973288813,
+      "loss": 0.6095,
+      "step": 869
+    },
+    {
+      "epoch": 2.6702536510376635,
+      "grad_norm": 0.6544116735458374,
+      "learning_rate": 0.00014223706176961603,
+      "loss": 0.4581,
+      "step": 870
+    },
+    {
+      "epoch": 2.673328209069946,
+      "grad_norm": 0.6023032069206238,
+      "learning_rate": 0.0001421702838063439,
+      "loss": 0.3308,
+      "step": 871
+    },
+    {
+      "epoch": 2.6764027671022292,
+      "grad_norm": 0.6281394362449646,
+      "learning_rate": 0.00014210350584307178,
+      "loss": 0.5103,
+      "step": 872
+    },
+    {
+      "epoch": 2.6794773251345116,
+      "grad_norm": 0.7043030261993408,
+      "learning_rate": 0.00014203672787979968,
+      "loss": 0.5985,
+      "step": 873
+    },
+    {
+      "epoch": 2.682551883166795,
+      "grad_norm": 0.5958001613616943,
+      "learning_rate": 0.00014196994991652755,
+      "loss": 0.455,
+      "step": 874
+    },
+    {
+      "epoch": 2.685626441199078,
+      "grad_norm": 0.7591226696968079,
+      "learning_rate": 0.00014190317195325545,
+      "loss": 0.5467,
+      "step": 875
+    },
+    {
+      "epoch": 2.6887009992313606,
+      "grad_norm": 0.8010213375091553,
+      "learning_rate": 0.00014183639398998333,
+      "loss": 0.4531,
+      "step": 876
+    },
+    {
+      "epoch": 2.6917755572636435,
+      "grad_norm": 0.8268343210220337,
+      "learning_rate": 0.0001417696160267112,
+      "loss": 0.5443,
+      "step": 877
+    },
+    {
+      "epoch": 2.6948501152959263,
+      "grad_norm": 0.6514490246772766,
+      "learning_rate": 0.00014170283806343907,
+      "loss": 0.4455,
+      "step": 878
+    },
+    {
+      "epoch": 2.697924673328209,
+      "grad_norm": 1.0831782817840576,
+      "learning_rate": 0.00014163606010016695,
+      "loss": 0.5648,
+      "step": 879
+    },
+    {
+      "epoch": 2.700999231360492,
+      "grad_norm": 0.8194222450256348,
+      "learning_rate": 0.00014156928213689482,
+      "loss": 0.6465,
+      "step": 880
+    },
+    {
+      "epoch": 2.704073789392775,
+      "grad_norm": 0.7758293747901917,
+      "learning_rate": 0.00014150250417362272,
+      "loss": 0.4335,
+      "step": 881
+    },
+    {
+      "epoch": 2.7071483474250577,
+      "grad_norm": 0.596432089805603,
+      "learning_rate": 0.0001414357262103506,
+      "loss": 0.4159,
+      "step": 882
+    },
+    {
+      "epoch": 2.7102229054573406,
+      "grad_norm": 0.7483400702476501,
+      "learning_rate": 0.00014136894824707847,
+      "loss": 0.5471,
+      "step": 883
+    },
+    {
+      "epoch": 2.7132974634896234,
+      "grad_norm": 0.6361656785011292,
+      "learning_rate": 0.00014130217028380634,
+      "loss": 0.4815,
+      "step": 884
+    },
+    {
+      "epoch": 2.7163720215219063,
+      "grad_norm": 0.6137235760688782,
+      "learning_rate": 0.0001412353923205342,
+      "loss": 0.4245,
+      "step": 885
+    },
+    {
+      "epoch": 2.719446579554189,
+      "grad_norm": 0.6101003289222717,
+      "learning_rate": 0.0001411686143572621,
+      "loss": 0.5262,
+      "step": 886
+    },
+    {
+      "epoch": 2.722521137586472,
+      "grad_norm": 0.939014732837677,
+      "learning_rate": 0.00014110183639398999,
+      "loss": 0.5089,
+      "step": 887
+    },
+    {
+      "epoch": 2.725595695618755,
+      "grad_norm": 0.7217115759849548,
+      "learning_rate": 0.00014103505843071786,
+      "loss": 0.5018,
+      "step": 888
+    },
+    {
+      "epoch": 2.7286702536510377,
+      "grad_norm": 0.6515239477157593,
+      "learning_rate": 0.00014096828046744576,
+      "loss": 0.5274,
+      "step": 889
+    },
+    {
+      "epoch": 2.7317448116833205,
+      "grad_norm": 0.7656288743019104,
+      "learning_rate": 0.00014090150250417363,
+      "loss": 0.5395,
+      "step": 890
+    },
+    {
+      "epoch": 2.7348193697156034,
+      "grad_norm": 0.77834552526474,
+      "learning_rate": 0.00014083472454090153,
+      "loss": 0.5201,
+      "step": 891
+    },
+    {
+      "epoch": 2.737893927747886,
+      "grad_norm": 0.9140714406967163,
+      "learning_rate": 0.0001407679465776294,
+      "loss": 0.7341,
+      "step": 892
+    },
+    {
+      "epoch": 2.740968485780169,
+      "grad_norm": 0.8534432649612427,
+      "learning_rate": 0.00014070116861435728,
+      "loss": 0.4747,
+      "step": 893
+    },
+    {
+      "epoch": 2.744043043812452,
+      "grad_norm": 0.8247655034065247,
+      "learning_rate": 0.00014063439065108515,
+      "loss": 0.5811,
+      "step": 894
+    },
+    {
+      "epoch": 2.7471176018447347,
+      "grad_norm": 0.6922281980514526,
+      "learning_rate": 0.00014056761268781303,
+      "loss": 0.5797,
+      "step": 895
+    },
+    {
+      "epoch": 2.7501921598770176,
+      "grad_norm": 0.7262521982192993,
+      "learning_rate": 0.0001405008347245409,
+      "loss": 0.3954,
+      "step": 896
+    },
+    {
+      "epoch": 2.7532667179093004,
+      "grad_norm": 0.7673102021217346,
+      "learning_rate": 0.0001404340567612688,
+      "loss": 0.4929,
+      "step": 897
+    },
+    {
+      "epoch": 2.7563412759415833,
+      "grad_norm": 0.6259851455688477,
+      "learning_rate": 0.00014036727879799667,
+      "loss": 0.4048,
+      "step": 898
+    },
+    {
+      "epoch": 2.759415833973866,
+      "grad_norm": 0.7085642218589783,
+      "learning_rate": 0.00014030050083472454,
+      "loss": 0.4338,
+      "step": 899
+    },
+    {
+      "epoch": 2.762490392006149,
+      "grad_norm": 0.6708558797836304,
+      "learning_rate": 0.00014023372287145242,
+      "loss": 0.5151,
+      "step": 900
+    },
+    {
+      "epoch": 2.765564950038432,
+      "grad_norm": 0.7648240327835083,
+      "learning_rate": 0.0001401669449081803,
+      "loss": 0.5601,
+      "step": 901
+    },
+    {
+      "epoch": 2.7686395080707147,
+      "grad_norm": 0.6803378462791443,
+      "learning_rate": 0.0001401001669449082,
+      "loss": 0.624,
+      "step": 902
+    },
+    {
+      "epoch": 2.7717140661029975,
+      "grad_norm": 0.7478699088096619,
+      "learning_rate": 0.00014003338898163606,
+      "loss": 0.4805,
+      "step": 903
+    },
+    {
+      "epoch": 2.7747886241352804,
+      "grad_norm": 0.6584222316741943,
+      "learning_rate": 0.00013996661101836394,
+      "loss": 0.4451,
+      "step": 904
+    },
+    {
+      "epoch": 2.7778631821675637,
+      "grad_norm": 0.5883088111877441,
+      "learning_rate": 0.0001398998330550918,
+      "loss": 0.3848,
+      "step": 905
+    },
+    {
+      "epoch": 2.780937740199846,
+      "grad_norm": 0.5683791041374207,
+      "learning_rate": 0.0001398330550918197,
+      "loss": 0.3969,
+      "step": 906
+    },
+    {
+      "epoch": 2.7840122982321294,
+      "grad_norm": 0.5645559430122375,
+      "learning_rate": 0.00013976627712854758,
+      "loss": 0.337,
+      "step": 907
+    },
+    {
+      "epoch": 2.7870868562644118,
+      "grad_norm": 0.5845876336097717,
+      "learning_rate": 0.00013969949916527548,
+      "loss": 0.4401,
+      "step": 908
+    },
+    {
+      "epoch": 2.790161414296695,
+      "grad_norm": 0.8455728888511658,
+      "learning_rate": 0.00013963272120200336,
+      "loss": 0.6128,
+      "step": 909
+    },
+    {
+      "epoch": 2.7932359723289775,
+      "grad_norm": 0.8465787172317505,
+      "learning_rate": 0.00013956594323873123,
+      "loss": 0.5862,
+      "step": 910
+    },
+    {
+      "epoch": 2.7963105303612608,
+      "grad_norm": 0.8809154629707336,
+      "learning_rate": 0.0001394991652754591,
+      "loss": 0.677,
+      "step": 911
+    },
+    {
+      "epoch": 2.799385088393543,
+      "grad_norm": 0.6254997849464417,
+      "learning_rate": 0.00013943238731218698,
+      "loss": 0.534,
+      "step": 912
+    },
+    {
+      "epoch": 2.8024596464258265,
+      "grad_norm": 0.6675909757614136,
+      "learning_rate": 0.00013936560934891488,
+      "loss": 0.4258,
+      "step": 913
+    },
+    {
+      "epoch": 2.8055342044581093,
+      "grad_norm": 0.672428548336029,
+      "learning_rate": 0.00013929883138564275,
+      "loss": 0.4659,
+      "step": 914
+    },
+    {
+      "epoch": 2.808608762490392,
+      "grad_norm": 0.7433823943138123,
+      "learning_rate": 0.00013923205342237062,
+      "loss": 0.4804,
+      "step": 915
+    },
+    {
+      "epoch": 2.811683320522675,
+      "grad_norm": 0.6739639639854431,
+      "learning_rate": 0.0001391652754590985,
+      "loss": 0.5034,
+      "step": 916
+    },
+    {
+      "epoch": 2.814757878554958,
+      "grad_norm": 0.7234442234039307,
+      "learning_rate": 0.00013909849749582637,
+      "loss": 0.5953,
+      "step": 917
+    },
+    {
+      "epoch": 2.8178324365872407,
+      "grad_norm": 0.7517747282981873,
+      "learning_rate": 0.00013903171953255427,
+      "loss": 0.4764,
+      "step": 918
+    },
+    {
+      "epoch": 2.8209069946195235,
+      "grad_norm": 0.6552411913871765,
+      "learning_rate": 0.00013896494156928214,
+      "loss": 0.5352,
+      "step": 919
+    },
+    {
+      "epoch": 2.8239815526518064,
+      "grad_norm": 0.5779647827148438,
+      "learning_rate": 0.00013889816360601002,
+      "loss": 0.3907,
+      "step": 920
+    },
+    {
+      "epoch": 2.8270561106840892,
+      "grad_norm": 0.7147451639175415,
+      "learning_rate": 0.0001388313856427379,
+      "loss": 0.4245,
+      "step": 921
+    },
+    {
+      "epoch": 2.830130668716372,
+      "grad_norm": 0.5399389863014221,
+      "learning_rate": 0.00013876460767946576,
+      "loss": 0.4573,
+      "step": 922
+    },
+    {
+      "epoch": 2.833205226748655,
+      "grad_norm": 0.47868096828460693,
+      "learning_rate": 0.00013869782971619366,
+      "loss": 0.4469,
+      "step": 923
+    },
+    {
+      "epoch": 2.836279784780938,
+      "grad_norm": 0.6399335861206055,
+      "learning_rate": 0.00013863105175292154,
+      "loss": 0.4518,
+      "step": 924
+    },
+    {
+      "epoch": 2.8393543428132206,
+      "grad_norm": 0.6514092683792114,
+      "learning_rate": 0.00013856427378964944,
+      "loss": 0.4701,
+      "step": 925
+    },
+    {
+      "epoch": 2.8424289008455035,
+      "grad_norm": 1.0016971826553345,
+      "learning_rate": 0.0001384974958263773,
+      "loss": 0.5146,
+      "step": 926
+    },
+    {
+      "epoch": 2.8455034588777863,
+      "grad_norm": 0.6343466639518738,
+      "learning_rate": 0.00013843071786310518,
+      "loss": 0.5356,
+      "step": 927
+    },
+    {
+      "epoch": 2.848578016910069,
+      "grad_norm": 0.7292190194129944,
+      "learning_rate": 0.00013836393989983308,
+      "loss": 0.4852,
+      "step": 928
+    },
+    {
+      "epoch": 2.851652574942352,
+      "grad_norm": 0.6090812683105469,
+      "learning_rate": 0.00013829716193656096,
+      "loss": 0.3873,
+      "step": 929
+    },
+    {
+      "epoch": 2.854727132974635,
+      "grad_norm": 0.7116502523422241,
+      "learning_rate": 0.00013823038397328883,
+      "loss": 0.4726,
+      "step": 930
+    },
+    {
+      "epoch": 2.8578016910069177,
+      "grad_norm": 0.8313955664634705,
+      "learning_rate": 0.0001381636060100167,
+      "loss": 0.4524,
+      "step": 931
+    },
+    {
+      "epoch": 2.8608762490392006,
+      "grad_norm": 0.7220770120620728,
+      "learning_rate": 0.00013809682804674458,
+      "loss": 0.4448,
+      "step": 932
+    },
+    {
+      "epoch": 2.8639508070714834,
+      "grad_norm": 0.8398887515068054,
+      "learning_rate": 0.00013803005008347245,
+      "loss": 0.4853,
+      "step": 933
+    },
+    {
+      "epoch": 2.8670253651037663,
+      "grad_norm": 0.7636063098907471,
+      "learning_rate": 0.00013796327212020035,
+      "loss": 0.5618,
+      "step": 934
+    },
+    {
+      "epoch": 2.870099923136049,
+      "grad_norm": 0.5957133769989014,
+      "learning_rate": 0.00013789649415692822,
+      "loss": 0.4751,
+      "step": 935
+    },
+    {
+      "epoch": 2.873174481168332,
+      "grad_norm": 0.7475373148918152,
+      "learning_rate": 0.0001378297161936561,
+      "loss": 0.5226,
+      "step": 936
+    },
+    {
+      "epoch": 2.876249039200615,
+      "grad_norm": 0.7718681693077087,
+      "learning_rate": 0.00013776293823038397,
+      "loss": 0.5382,
+      "step": 937
+    },
+    {
+      "epoch": 2.8793235972328977,
+      "grad_norm": 0.7646799087524414,
+      "learning_rate": 0.00013769616026711184,
+      "loss": 0.5466,
+      "step": 938
+    },
+    {
+      "epoch": 2.8823981552651805,
+      "grad_norm": 0.45133599638938904,
+      "learning_rate": 0.00013762938230383974,
+      "loss": 0.3254,
+      "step": 939
+    },
+    {
+      "epoch": 2.8854727132974634,
+      "grad_norm": 0.6464604735374451,
+      "learning_rate": 0.00013756260434056762,
+      "loss": 0.5075,
+      "step": 940
+    },
+    {
+      "epoch": 2.888547271329746,
+      "grad_norm": 0.6089568138122559,
+      "learning_rate": 0.0001374958263772955,
+      "loss": 0.5177,
+      "step": 941
+    },
+    {
+      "epoch": 2.891621829362029,
+      "grad_norm": 0.6696579456329346,
+      "learning_rate": 0.0001374290484140234,
+      "loss": 0.4651,
+      "step": 942
+    },
+    {
+      "epoch": 2.894696387394312,
+      "grad_norm": 0.7825729846954346,
+      "learning_rate": 0.00013736227045075126,
+      "loss": 0.5601,
+      "step": 943
+    },
+    {
+      "epoch": 2.897770945426595,
+      "grad_norm": 0.75175541639328,
+      "learning_rate": 0.00013729549248747916,
+      "loss": 0.4901,
+      "step": 944
+    },
+    {
+      "epoch": 2.9008455034588776,
+      "grad_norm": 0.6651338338851929,
+      "learning_rate": 0.00013722871452420704,
+      "loss": 0.6134,
+      "step": 945
+    },
+    {
+      "epoch": 2.903920061491161,
+      "grad_norm": 0.6632173657417297,
+      "learning_rate": 0.0001371619365609349,
+      "loss": 0.4926,
+      "step": 946
+    },
+    {
+      "epoch": 2.9069946195234433,
+      "grad_norm": 0.666152834892273,
+      "learning_rate": 0.00013709515859766278,
+      "loss": 0.5822,
+      "step": 947
+    },
+    {
+      "epoch": 2.9100691775557266,
+      "grad_norm": 0.779793381690979,
+      "learning_rate": 0.00013702838063439065,
+      "loss": 0.6007,
+      "step": 948
+    },
+    {
+      "epoch": 2.913143735588009,
+      "grad_norm": 0.5794811248779297,
+      "learning_rate": 0.00013696160267111853,
+      "loss": 0.459,
+      "step": 949
+    },
+    {
+      "epoch": 2.9162182936202923,
+      "grad_norm": 0.65561443567276,
+      "learning_rate": 0.00013689482470784643,
+      "loss": 0.5584,
+      "step": 950
+    },
+    {
+      "epoch": 2.9192928516525747,
+      "grad_norm": 0.6967616677284241,
+      "learning_rate": 0.0001368280467445743,
+      "loss": 0.6218,
+      "step": 951
+    },
+    {
+      "epoch": 2.922367409684858,
+      "grad_norm": 0.7796815037727356,
+      "learning_rate": 0.00013676126878130217,
+      "loss": 0.6594,
+      "step": 952
+    },
+    {
+      "epoch": 2.925441967717141,
+      "grad_norm": 0.7640193700790405,
+      "learning_rate": 0.00013669449081803005,
+      "loss": 0.5783,
+      "step": 953
+    },
+    {
+      "epoch": 2.9285165257494237,
+      "grad_norm": 0.870796799659729,
+      "learning_rate": 0.00013662771285475792,
+      "loss": 0.4792,
+      "step": 954
+    },
+    {
+      "epoch": 2.9315910837817065,
+      "grad_norm": 0.8562505841255188,
+      "learning_rate": 0.00013656093489148582,
+      "loss": 0.4662,
+      "step": 955
+    },
+    {
+      "epoch": 2.9346656418139894,
+      "grad_norm": 0.744202196598053,
+      "learning_rate": 0.0001364941569282137,
+      "loss": 0.5733,
+      "step": 956
+    },
+    {
+      "epoch": 2.937740199846272,
+      "grad_norm": 0.7171375155448914,
+      "learning_rate": 0.00013642737896494157,
+      "loss": 0.4559,
+      "step": 957
+    },
+    {
+      "epoch": 2.940814757878555,
+      "grad_norm": 0.6538399457931519,
+      "learning_rate": 0.00013636060100166944,
+      "loss": 0.5757,
+      "step": 958
+    },
+    {
+      "epoch": 2.943889315910838,
+      "grad_norm": 0.7372276782989502,
+      "learning_rate": 0.00013629382303839734,
+      "loss": 0.5342,
+      "step": 959
+    },
+    {
+      "epoch": 2.9469638739431208,
+      "grad_norm": 0.7643387317657471,
+      "learning_rate": 0.00013622704507512521,
+      "loss": 0.4865,
+      "step": 960
+    },
+    {
+      "epoch": 2.9500384319754036,
+      "grad_norm": 0.8265420198440552,
+      "learning_rate": 0.00013616026711185311,
+      "loss": 0.487,
+      "step": 961
+    },
+    {
+      "epoch": 2.9531129900076865,
+      "grad_norm": 0.7020171284675598,
+      "learning_rate": 0.000136093489148581,
+      "loss": 0.4587,
+      "step": 962
+    },
+    {
+      "epoch": 2.9561875480399693,
+      "grad_norm": 0.8034495711326599,
+      "learning_rate": 0.00013602671118530886,
+      "loss": 0.5976,
+      "step": 963
+    },
+    {
+      "epoch": 2.959262106072252,
+      "grad_norm": 0.8909509181976318,
+      "learning_rate": 0.00013595993322203673,
+      "loss": 0.5283,
+      "step": 964
+    },
+    {
+      "epoch": 2.962336664104535,
+      "grad_norm": 0.8513332009315491,
+      "learning_rate": 0.0001358931552587646,
+      "loss": 0.5633,
+      "step": 965
+    },
+    {
+      "epoch": 2.965411222136818,
+      "grad_norm": 0.884508490562439,
+      "learning_rate": 0.0001358263772954925,
+      "loss": 0.5723,
+      "step": 966
+    },
+    {
+      "epoch": 2.9684857801691007,
+      "grad_norm": 0.7936095595359802,
+      "learning_rate": 0.00013575959933222038,
+      "loss": 0.5113,
+      "step": 967
+    },
+    {
+      "epoch": 2.9715603382013835,
+      "grad_norm": 0.9732086658477783,
+      "learning_rate": 0.00013569282136894825,
+      "loss": 0.5064,
+      "step": 968
+    },
+    {
+      "epoch": 2.9746348962336664,
+      "grad_norm": 0.7790175676345825,
+      "learning_rate": 0.00013562604340567613,
+      "loss": 0.653,
+      "step": 969
+    },
+    {
+      "epoch": 2.9777094542659492,
+      "grad_norm": 0.6383731365203857,
+      "learning_rate": 0.000135559265442404,
+      "loss": 0.4153,
+      "step": 970
+    },
+    {
+      "epoch": 2.980784012298232,
+      "grad_norm": 0.6142308115959167,
+      "learning_rate": 0.0001354924874791319,
+      "loss": 0.4416,
+      "step": 971
+    },
+    {
+      "epoch": 2.983858570330515,
+      "grad_norm": 0.8212004899978638,
+      "learning_rate": 0.00013542570951585977,
+      "loss": 0.628,
+      "step": 972
+    },
+    {
+      "epoch": 2.986933128362798,
+      "grad_norm": 0.7956951856613159,
+      "learning_rate": 0.00013535893155258765,
+      "loss": 0.4985,
+      "step": 973
+    },
+    {
+      "epoch": 2.9900076863950806,
+      "grad_norm": 0.6558810472488403,
+      "learning_rate": 0.00013529215358931552,
+      "loss": 0.6306,
+      "step": 974
+    },
+    {
+      "epoch": 2.9930822444273635,
+      "grad_norm": 0.6772769689559937,
+      "learning_rate": 0.0001352253756260434,
+      "loss": 0.4105,
+      "step": 975
+    },
+    {
+      "epoch": 2.9961568024596463,
+      "grad_norm": 0.6904112100601196,
+      "learning_rate": 0.0001351585976627713,
+      "loss": 0.6085,
+      "step": 976
+    },
+    {
+      "epoch": 2.999231360491929,
+      "grad_norm": 1.2956902980804443,
+      "learning_rate": 0.00013509181969949917,
+      "loss": 0.5799,
+      "step": 977
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 1.4253301620483398,
+      "learning_rate": 0.00013502504173622707,
+      "loss": 0.532,
+      "step": 978
+    },
+    {
+      "epoch": 3.003074558032283,
+      "grad_norm": 0.5800220370292664,
+      "learning_rate": 0.00013495826377295494,
+      "loss": 0.3877,
+      "step": 979
+    },
+    {
+      "epoch": 3.0061491160645657,
+      "grad_norm": 0.558982253074646,
+      "learning_rate": 0.0001348914858096828,
+      "loss": 0.3996,
+      "step": 980
+    },
+    {
+      "epoch": 3.0092236740968485,
+      "grad_norm": 0.6223140954971313,
+      "learning_rate": 0.00013482470784641069,
+      "loss": 0.5572,
+      "step": 981
+    },
+    {
+      "epoch": 3.0122982321291314,
+      "grad_norm": 0.522871196269989,
+      "learning_rate": 0.00013475792988313859,
+      "loss": 0.4207,
+      "step": 982
+    },
+    {
+      "epoch": 3.0153727901614142,
+      "grad_norm": 0.8188950419425964,
+      "learning_rate": 0.00013469115191986646,
+      "loss": 0.4837,
+      "step": 983
+    },
+    {
+      "epoch": 3.018447348193697,
+      "grad_norm": 0.5563365817070007,
+      "learning_rate": 0.00013462437395659433,
+      "loss": 0.3388,
+      "step": 984
+    },
+    {
+      "epoch": 3.02152190622598,
+      "grad_norm": 0.6464280486106873,
+      "learning_rate": 0.0001345575959933222,
+      "loss": 0.4506,
+      "step": 985
+    },
+    {
+      "epoch": 3.024596464258263,
+      "grad_norm": 0.815517246723175,
+      "learning_rate": 0.00013449081803005008,
+      "loss": 0.5219,
+      "step": 986
+    },
+    {
+      "epoch": 3.0276710222905456,
+      "grad_norm": 0.6663722395896912,
+      "learning_rate": 0.00013442404006677798,
+      "loss": 0.4144,
+      "step": 987
+    },
+    {
+      "epoch": 3.0307455803228285,
+      "grad_norm": 0.5828370451927185,
+      "learning_rate": 0.00013435726210350585,
+      "loss": 0.3811,
+      "step": 988
+    },
+    {
+      "epoch": 3.0338201383551113,
+      "grad_norm": 0.5832375884056091,
+      "learning_rate": 0.00013429048414023373,
+      "loss": 0.3878,
+      "step": 989
+    },
+    {
+      "epoch": 3.036894696387394,
+      "grad_norm": 0.5275335311889648,
+      "learning_rate": 0.0001342237061769616,
+      "loss": 0.3398,
+      "step": 990
+    },
+    {
+      "epoch": 3.039969254419677,
+      "grad_norm": 0.7779368758201599,
+      "learning_rate": 0.00013415692821368947,
+      "loss": 0.4973,
+      "step": 991
+    },
+    {
+      "epoch": 3.04304381245196,
+      "grad_norm": 0.7502028942108154,
+      "learning_rate": 0.00013409015025041737,
+      "loss": 0.4701,
+      "step": 992
+    },
+    {
+      "epoch": 3.0461183704842427,
+      "grad_norm": 0.7934368848800659,
+      "learning_rate": 0.00013402337228714524,
+      "loss": 0.5111,
+      "step": 993
+    },
+    {
+      "epoch": 3.0491929285165256,
+      "grad_norm": 0.6284624338150024,
+      "learning_rate": 0.00013395659432387312,
+      "loss": 0.3573,
+      "step": 994
+    },
+    {
+      "epoch": 3.0522674865488084,
+      "grad_norm": 0.8872091770172119,
+      "learning_rate": 0.00013388981636060102,
+      "loss": 0.3814,
+      "step": 995
+    },
+    {
+      "epoch": 3.0553420445810913,
+      "grad_norm": 0.5997917652130127,
+      "learning_rate": 0.0001338230383973289,
+      "loss": 0.4055,
+      "step": 996
+    },
+    {
+      "epoch": 3.058416602613374,
+      "grad_norm": 0.5672919154167175,
+      "learning_rate": 0.00013375626043405676,
+      "loss": 0.3655,
+      "step": 997
+    },
+    {
+      "epoch": 3.061491160645657,
+      "grad_norm": 0.7909939885139465,
+      "learning_rate": 0.00013368948247078466,
+      "loss": 0.4561,
+      "step": 998
+    },
+    {
+      "epoch": 3.0645657186779403,
+      "grad_norm": 1.0427160263061523,
+      "learning_rate": 0.00013362270450751254,
+      "loss": 0.5413,
+      "step": 999
+    },
+    {
+      "epoch": 3.067640276710223,
+      "grad_norm": 0.8109893798828125,
+      "learning_rate": 0.0001335559265442404,
+      "loss": 0.4794,
+      "step": 1000
+    },
+    {
+      "epoch": 3.070714834742506,
+      "grad_norm": 0.7566149830818176,
+      "learning_rate": 0.00013348914858096828,
+      "loss": 0.394,
+      "step": 1001
+    },
+    {
+      "epoch": 3.073789392774789,
+      "grad_norm": 0.7242660522460938,
+      "learning_rate": 0.00013342237061769616,
+      "loss": 0.4266,
+      "step": 1002
+    },
+    {
+      "epoch": 3.0768639508070716,
+      "grad_norm": 0.9477089047431946,
+      "learning_rate": 0.00013335559265442406,
+      "loss": 0.4617,
+      "step": 1003
+    },
+    {
+      "epoch": 3.0799385088393545,
+      "grad_norm": 0.949448823928833,
+      "learning_rate": 0.00013328881469115193,
+      "loss": 0.5183,
+      "step": 1004
+    },
+    {
+      "epoch": 3.0830130668716373,
+      "grad_norm": 0.7148897647857666,
+      "learning_rate": 0.0001332220367278798,
+      "loss": 0.4209,
+      "step": 1005
+    },
+    {
+      "epoch": 3.08608762490392,
+      "grad_norm": 0.6903197169303894,
+      "learning_rate": 0.00013315525876460768,
+      "loss": 0.4703,
+      "step": 1006
+    },
+    {
+      "epoch": 3.089162182936203,
+      "grad_norm": 0.7613615393638611,
+      "learning_rate": 0.00013308848080133555,
+      "loss": 0.3055,
+      "step": 1007
+    },
+    {
+      "epoch": 3.092236740968486,
+      "grad_norm": 0.5844465494155884,
+      "learning_rate": 0.00013302170283806345,
+      "loss": 0.2888,
+      "step": 1008
+    },
+    {
+      "epoch": 3.0953112990007687,
+      "grad_norm": 0.772946298122406,
+      "learning_rate": 0.00013295492487479132,
+      "loss": 0.4935,
+      "step": 1009
+    },
+    {
+      "epoch": 3.0983858570330516,
+      "grad_norm": 0.7142703533172607,
+      "learning_rate": 0.0001328881469115192,
+      "loss": 0.443,
+      "step": 1010
+    },
+    {
+      "epoch": 3.1014604150653344,
+      "grad_norm": 0.7844696044921875,
+      "learning_rate": 0.00013282136894824707,
+      "loss": 0.4677,
+      "step": 1011
+    },
+    {
+      "epoch": 3.1045349730976173,
+      "grad_norm": 0.8850453495979309,
+      "learning_rate": 0.00013275459098497497,
+      "loss": 0.4703,
+      "step": 1012
+    },
+    {
+      "epoch": 3.1076095311299,
+      "grad_norm": 0.6243056058883667,
+      "learning_rate": 0.00013268781302170284,
+      "loss": 0.3225,
+      "step": 1013
+    },
+    {
+      "epoch": 3.110684089162183,
+      "grad_norm": 0.5218976736068726,
+      "learning_rate": 0.00013262103505843074,
+      "loss": 0.3424,
+      "step": 1014
+    },
+    {
+      "epoch": 3.113758647194466,
+      "grad_norm": 0.78139728307724,
+      "learning_rate": 0.00013255425709515862,
+      "loss": 0.3779,
+      "step": 1015
+    },
+    {
+      "epoch": 3.1168332052267487,
+      "grad_norm": 0.7067313194274902,
+      "learning_rate": 0.0001324874791318865,
+      "loss": 0.2979,
+      "step": 1016
+    },
+    {
+      "epoch": 3.1199077632590315,
+      "grad_norm": 0.5684396028518677,
+      "learning_rate": 0.00013242070116861436,
+      "loss": 0.3936,
+      "step": 1017
+    },
+    {
+      "epoch": 3.1229823212913144,
+      "grad_norm": 0.7444823384284973,
+      "learning_rate": 0.00013235392320534224,
+      "loss": 0.3771,
+      "step": 1018
+    },
+    {
+      "epoch": 3.126056879323597,
+      "grad_norm": 0.7313172221183777,
+      "learning_rate": 0.00013228714524207014,
+      "loss": 0.4764,
+      "step": 1019
+    },
+    {
+      "epoch": 3.12913143735588,
+      "grad_norm": 0.8631938099861145,
+      "learning_rate": 0.000132220367278798,
+      "loss": 0.4095,
+      "step": 1020
+    },
+    {
+      "epoch": 3.132205995388163,
+      "grad_norm": 0.745307445526123,
+      "learning_rate": 0.00013215358931552588,
+      "loss": 0.3906,
+      "step": 1021
+    },
+    {
+      "epoch": 3.1352805534204458,
+      "grad_norm": 0.7458917498588562,
+      "learning_rate": 0.00013208681135225376,
+      "loss": 0.4967,
+      "step": 1022
+    },
+    {
+      "epoch": 3.1383551114527286,
+      "grad_norm": 0.8067619204521179,
+      "learning_rate": 0.00013202003338898163,
+      "loss": 0.5558,
+      "step": 1023
+    },
+    {
+      "epoch": 3.1414296694850115,
+      "grad_norm": 0.8676497340202332,
+      "learning_rate": 0.00013195325542570953,
+      "loss": 0.4726,
+      "step": 1024
+    },
+    {
+      "epoch": 3.1445042275172943,
+      "grad_norm": 0.8006786108016968,
+      "learning_rate": 0.0001318864774624374,
+      "loss": 0.4352,
+      "step": 1025
+    },
+    {
+      "epoch": 3.147578785549577,
+      "grad_norm": 0.7759934067726135,
+      "learning_rate": 0.00013181969949916528,
+      "loss": 0.4349,
+      "step": 1026
+    },
+    {
+      "epoch": 3.15065334358186,
+      "grad_norm": 0.6713132262229919,
+      "learning_rate": 0.00013175292153589315,
+      "loss": 0.4152,
+      "step": 1027
+    },
+    {
+      "epoch": 3.153727901614143,
+      "grad_norm": 0.8547674417495728,
+      "learning_rate": 0.00013168614357262102,
+      "loss": 0.3794,
+      "step": 1028
+    },
+    {
+      "epoch": 3.1568024596464257,
+      "grad_norm": 0.8227840065956116,
+      "learning_rate": 0.00013161936560934892,
+      "loss": 0.4816,
+      "step": 1029
+    },
+    {
+      "epoch": 3.1598770176787085,
+      "grad_norm": 0.7303609251976013,
+      "learning_rate": 0.0001315525876460768,
+      "loss": 0.4662,
+      "step": 1030
+    },
+    {
+      "epoch": 3.1629515757109914,
+      "grad_norm": 0.7921698689460754,
+      "learning_rate": 0.0001314858096828047,
+      "loss": 0.4455,
+      "step": 1031
+    },
+    {
+      "epoch": 3.1660261337432742,
+      "grad_norm": 0.662651538848877,
+      "learning_rate": 0.00013141903171953257,
+      "loss": 0.3648,
+      "step": 1032
+    },
+    {
+      "epoch": 3.169100691775557,
+      "grad_norm": 0.8179068565368652,
+      "learning_rate": 0.00013135225375626044,
+      "loss": 0.4263,
+      "step": 1033
+    },
+    {
+      "epoch": 3.17217524980784,
+      "grad_norm": 0.5210088491439819,
+      "learning_rate": 0.00013128547579298832,
+      "loss": 0.3315,
+      "step": 1034
+    },
+    {
+      "epoch": 3.175249807840123,
+      "grad_norm": 0.7272716164588928,
+      "learning_rate": 0.00013121869782971622,
+      "loss": 0.4366,
+      "step": 1035
+    },
+    {
+      "epoch": 3.178324365872406,
+      "grad_norm": 0.8663350343704224,
+      "learning_rate": 0.0001311519198664441,
+      "loss": 0.4821,
+      "step": 1036
+    },
+    {
+      "epoch": 3.1813989239046885,
+      "grad_norm": 0.7915233373641968,
+      "learning_rate": 0.00013108514190317196,
+      "loss": 0.4452,
+      "step": 1037
+    },
+    {
+      "epoch": 3.1844734819369718,
+      "grad_norm": 0.8421617746353149,
+      "learning_rate": 0.00013101836393989983,
+      "loss": 0.4405,
+      "step": 1038
+    },
+    {
+      "epoch": 3.1875480399692546,
+      "grad_norm": 0.6787004470825195,
+      "learning_rate": 0.0001309515859766277,
+      "loss": 0.3901,
+      "step": 1039
+    },
+    {
+      "epoch": 3.1906225980015375,
+      "grad_norm": 0.6390913128852844,
+      "learning_rate": 0.0001308848080133556,
+      "loss": 0.3647,
+      "step": 1040
+    },
+    {
+      "epoch": 3.1936971560338203,
+      "grad_norm": 0.6698052883148193,
+      "learning_rate": 0.00013081803005008348,
+      "loss": 0.3983,
+      "step": 1041
+    },
+    {
+      "epoch": 3.196771714066103,
+      "grad_norm": 0.6419287919998169,
+      "learning_rate": 0.00013075125208681135,
+      "loss": 0.396,
+      "step": 1042
+    },
+    {
+      "epoch": 3.199846272098386,
+      "grad_norm": 0.7305182218551636,
+      "learning_rate": 0.00013068447412353923,
+      "loss": 0.4746,
+      "step": 1043
+    },
+    {
+      "epoch": 3.202920830130669,
+      "grad_norm": 1.1813292503356934,
+      "learning_rate": 0.0001306176961602671,
+      "loss": 0.3741,
+      "step": 1044
+    },
+    {
+      "epoch": 3.2059953881629517,
+      "grad_norm": 0.7310966849327087,
+      "learning_rate": 0.00013055091819699497,
+      "loss": 0.4184,
+      "step": 1045
+    },
+    {
+      "epoch": 3.2090699461952346,
+      "grad_norm": 0.5950028896331787,
+      "learning_rate": 0.00013048414023372287,
+      "loss": 0.3328,
+      "step": 1046
+    },
+    {
+      "epoch": 3.2121445042275174,
+      "grad_norm": 0.5876432657241821,
+      "learning_rate": 0.00013041736227045075,
+      "loss": 0.3244,
+      "step": 1047
+    },
+    {
+      "epoch": 3.2152190622598003,
+      "grad_norm": 0.7231600284576416,
+      "learning_rate": 0.00013035058430717865,
+      "loss": 0.432,
+      "step": 1048
+    },
+    {
+      "epoch": 3.218293620292083,
+      "grad_norm": 0.9304287433624268,
+      "learning_rate": 0.00013028380634390652,
+      "loss": 0.4804,
+      "step": 1049
+    },
+    {
+      "epoch": 3.221368178324366,
+      "grad_norm": 0.7582074999809265,
+      "learning_rate": 0.0001302170283806344,
+      "loss": 0.4306,
+      "step": 1050
+    },
+    {
+      "epoch": 3.224442736356649,
+      "grad_norm": 0.7604076266288757,
+      "learning_rate": 0.0001301502504173623,
+      "loss": 0.4444,
+      "step": 1051
+    },
+    {
+      "epoch": 3.2275172943889316,
+      "grad_norm": 0.9969847202301025,
+      "learning_rate": 0.00013008347245409017,
+      "loss": 0.4617,
+      "step": 1052
+    },
+    {
+      "epoch": 3.2305918524212145,
+      "grad_norm": 1.0649595260620117,
+      "learning_rate": 0.00013001669449081804,
+      "loss": 0.3696,
+      "step": 1053
+    },
+    {
+      "epoch": 3.2336664104534973,
+      "grad_norm": 0.6512593030929565,
+      "learning_rate": 0.00012994991652754591,
+      "loss": 0.3397,
+      "step": 1054
+    },
+    {
+      "epoch": 3.23674096848578,
+      "grad_norm": 0.9685459136962891,
+      "learning_rate": 0.0001298831385642738,
+      "loss": 0.4758,
+      "step": 1055
+    },
+    {
+      "epoch": 3.239815526518063,
+      "grad_norm": 0.8561423420906067,
+      "learning_rate": 0.0001298163606010017,
+      "loss": 0.404,
+      "step": 1056
+    },
+    {
+      "epoch": 3.242890084550346,
+      "grad_norm": 0.7284657955169678,
+      "learning_rate": 0.00012974958263772956,
+      "loss": 0.3061,
+      "step": 1057
+    },
+    {
+      "epoch": 3.2459646425826287,
+      "grad_norm": 0.7802515029907227,
+      "learning_rate": 0.00012968280467445743,
+      "loss": 0.441,
+      "step": 1058
+    },
+    {
+      "epoch": 3.2490392006149116,
+      "grad_norm": 0.7817832231521606,
+      "learning_rate": 0.0001296160267111853,
+      "loss": 0.4344,
+      "step": 1059
+    },
+    {
+      "epoch": 3.2521137586471944,
+      "grad_norm": 0.7306939959526062,
+      "learning_rate": 0.00012954924874791318,
+      "loss": 0.39,
+      "step": 1060
+    },
+    {
+      "epoch": 3.2551883166794773,
+      "grad_norm": 0.6479128003120422,
+      "learning_rate": 0.00012948247078464108,
+      "loss": 0.4264,
+      "step": 1061
+    },
+    {
+      "epoch": 3.25826287471176,
+      "grad_norm": 0.6551531553268433,
+      "learning_rate": 0.00012941569282136895,
+      "loss": 0.3445,
+      "step": 1062
+    },
+    {
+      "epoch": 3.261337432744043,
+      "grad_norm": 0.9258570075035095,
+      "learning_rate": 0.00012934891485809683,
+      "loss": 0.4762,
+      "step": 1063
+    },
+    {
+      "epoch": 3.264411990776326,
+      "grad_norm": 0.6978762149810791,
+      "learning_rate": 0.0001292821368948247,
+      "loss": 0.3335,
+      "step": 1064
+    },
+    {
+      "epoch": 3.2674865488086087,
+      "grad_norm": 0.7362371683120728,
+      "learning_rate": 0.0001292153589315526,
+      "loss": 0.4587,
+      "step": 1065
+    },
+    {
+      "epoch": 3.2705611068408915,
+      "grad_norm": 0.8437744975090027,
+      "learning_rate": 0.00012914858096828047,
+      "loss": 0.3999,
+      "step": 1066
+    },
+    {
+      "epoch": 3.2736356648731744,
+      "grad_norm": 1.0384852886199951,
+      "learning_rate": 0.00012908180300500837,
+      "loss": 0.4975,
+      "step": 1067
+    },
+    {
+      "epoch": 3.276710222905457,
+      "grad_norm": 0.6881088018417358,
+      "learning_rate": 0.00012901502504173625,
+      "loss": 0.3496,
+      "step": 1068
+    },
+    {
+      "epoch": 3.27978478093774,
+      "grad_norm": 0.6974284648895264,
+      "learning_rate": 0.00012894824707846412,
+      "loss": 0.3252,
+      "step": 1069
+    },
+    {
+      "epoch": 3.282859338970023,
+      "grad_norm": 0.6597744822502136,
+      "learning_rate": 0.000128881469115192,
+      "loss": 0.371,
+      "step": 1070
+    },
+    {
+      "epoch": 3.2859338970023058,
+      "grad_norm": 0.8249826431274414,
+      "learning_rate": 0.00012881469115191987,
+      "loss": 0.506,
+      "step": 1071
+    },
+    {
+      "epoch": 3.2890084550345886,
+      "grad_norm": 0.9232259392738342,
+      "learning_rate": 0.00012874791318864777,
+      "loss": 0.4379,
+      "step": 1072
+    },
+    {
+      "epoch": 3.2920830130668715,
+      "grad_norm": 0.7886003851890564,
+      "learning_rate": 0.00012868113522537564,
+      "loss": 0.3461,
+      "step": 1073
+    },
+    {
+      "epoch": 3.2951575710991543,
+      "grad_norm": 0.7593116760253906,
+      "learning_rate": 0.0001286143572621035,
+      "loss": 0.3498,
+      "step": 1074
+    },
+    {
+      "epoch": 3.2982321291314376,
+      "grad_norm": 0.6457827091217041,
+      "learning_rate": 0.00012854757929883139,
+      "loss": 0.4303,
+      "step": 1075
+    },
+    {
+      "epoch": 3.30130668716372,
+      "grad_norm": 0.7233401536941528,
+      "learning_rate": 0.00012848080133555926,
+      "loss": 0.3888,
+      "step": 1076
+    },
+    {
+      "epoch": 3.3043812451960033,
+      "grad_norm": 0.6364323496818542,
+      "learning_rate": 0.00012841402337228716,
+      "loss": 0.4483,
+      "step": 1077
+    },
+    {
+      "epoch": 3.3074558032282857,
+      "grad_norm": 0.7049786448478699,
+      "learning_rate": 0.00012834724540901503,
+      "loss": 0.3465,
+      "step": 1078
+    },
+    {
+      "epoch": 3.310530361260569,
+      "grad_norm": 0.6245574951171875,
+      "learning_rate": 0.0001282804674457429,
+      "loss": 0.2893,
+      "step": 1079
+    },
+    {
+      "epoch": 3.313604919292852,
+      "grad_norm": 0.8154529929161072,
+      "learning_rate": 0.00012821368948247078,
+      "loss": 0.4662,
+      "step": 1080
+    },
+    {
+      "epoch": 3.3166794773251347,
+      "grad_norm": 0.7856273651123047,
+      "learning_rate": 0.00012814691151919865,
+      "loss": 0.4118,
+      "step": 1081
+    },
+    {
+      "epoch": 3.3197540353574175,
+      "grad_norm": 0.7315744757652283,
+      "learning_rate": 0.00012808013355592655,
+      "loss": 0.3951,
+      "step": 1082
+    },
+    {
+      "epoch": 3.3228285933897004,
+      "grad_norm": 0.7132816910743713,
+      "learning_rate": 0.00012801335559265442,
+      "loss": 0.3028,
+      "step": 1083
+    },
+    {
+      "epoch": 3.3259031514219832,
+      "grad_norm": 0.7761401534080505,
+      "learning_rate": 0.00012794657762938233,
+      "loss": 0.4595,
+      "step": 1084
+    },
+    {
+      "epoch": 3.328977709454266,
+      "grad_norm": 0.6712490916252136,
+      "learning_rate": 0.0001278797996661102,
+      "loss": 0.3146,
+      "step": 1085
+    },
+    {
+      "epoch": 3.332052267486549,
+      "grad_norm": 0.6390141248703003,
+      "learning_rate": 0.00012781302170283807,
+      "loss": 0.5137,
+      "step": 1086
+    },
+    {
+      "epoch": 3.3351268255188318,
+      "grad_norm": 0.6065652370452881,
+      "learning_rate": 0.00012774624373956594,
+      "loss": 0.2764,
+      "step": 1087
+    },
+    {
+      "epoch": 3.3382013835511146,
+      "grad_norm": 0.9247648119926453,
+      "learning_rate": 0.00012767946577629384,
+      "loss": 0.6112,
+      "step": 1088
+    },
+    {
+      "epoch": 3.3412759415833975,
+      "grad_norm": 0.8865838050842285,
+      "learning_rate": 0.00012761268781302172,
+      "loss": 0.5974,
+      "step": 1089
+    },
+    {
+      "epoch": 3.3443504996156803,
+      "grad_norm": 0.5885515809059143,
+      "learning_rate": 0.0001275459098497496,
+      "loss": 0.2611,
+      "step": 1090
+    },
+    {
+      "epoch": 3.347425057647963,
+      "grad_norm": 0.814175546169281,
+      "learning_rate": 0.00012747913188647746,
+      "loss": 0.3653,
+      "step": 1091
+    },
+    {
+      "epoch": 3.350499615680246,
+      "grad_norm": 0.6553864479064941,
+      "learning_rate": 0.00012741235392320534,
+      "loss": 0.3619,
+      "step": 1092
+    },
+    {
+      "epoch": 3.353574173712529,
+      "grad_norm": 0.8142261505126953,
+      "learning_rate": 0.00012734557595993324,
+      "loss": 0.5629,
+      "step": 1093
+    },
+    {
+      "epoch": 3.3566487317448117,
+      "grad_norm": 0.8324840664863586,
+      "learning_rate": 0.0001272787979966611,
+      "loss": 0.3358,
+      "step": 1094
+    },
+    {
+      "epoch": 3.3597232897770946,
+      "grad_norm": 0.7623977065086365,
+      "learning_rate": 0.00012721202003338898,
+      "loss": 0.4203,
+      "step": 1095
+    },
+    {
+      "epoch": 3.3627978478093774,
+      "grad_norm": 0.877435564994812,
+      "learning_rate": 0.00012714524207011686,
+      "loss": 0.4465,
+      "step": 1096
+    },
+    {
+      "epoch": 3.3658724058416603,
+      "grad_norm": 0.8097100257873535,
+      "learning_rate": 0.00012707846410684473,
+      "loss": 0.5269,
+      "step": 1097
+    },
+    {
+      "epoch": 3.368946963873943,
+      "grad_norm": 0.6663544178009033,
+      "learning_rate": 0.0001270116861435726,
+      "loss": 0.3844,
+      "step": 1098
+    },
+    {
+      "epoch": 3.372021521906226,
+      "grad_norm": 0.6997086405754089,
+      "learning_rate": 0.0001269449081803005,
+      "loss": 0.3691,
+      "step": 1099
+    },
+    {
+      "epoch": 3.375096079938509,
+      "grad_norm": 0.9864381551742554,
+      "learning_rate": 0.00012687813021702838,
+      "loss": 0.4578,
+      "step": 1100
+    },
+    {
+      "epoch": 3.3781706379707916,
+      "grad_norm": 0.9177810549736023,
+      "learning_rate": 0.00012681135225375628,
+      "loss": 0.5168,
+      "step": 1101
+    },
+    {
+      "epoch": 3.3812451960030745,
+      "grad_norm": 0.7557141184806824,
+      "learning_rate": 0.00012674457429048415,
+      "loss": 0.4225,
+      "step": 1102
+    },
+    {
+      "epoch": 3.3843197540353573,
+      "grad_norm": 0.6452154517173767,
+      "learning_rate": 0.00012667779632721202,
+      "loss": 0.4318,
+      "step": 1103
+    },
+    {
+      "epoch": 3.38739431206764,
+      "grad_norm": 0.7105704545974731,
+      "learning_rate": 0.00012661101836393992,
+      "loss": 0.3128,
+      "step": 1104
+    },
+    {
+      "epoch": 3.390468870099923,
+      "grad_norm": 0.7198373079299927,
+      "learning_rate": 0.0001265442404006678,
+      "loss": 0.4633,
+      "step": 1105
+    },
+    {
+      "epoch": 3.393543428132206,
+      "grad_norm": 0.7540241479873657,
+      "learning_rate": 0.00012647746243739567,
+      "loss": 0.4966,
+      "step": 1106
+    },
+    {
+      "epoch": 3.3966179861644887,
+      "grad_norm": 0.7719717025756836,
+      "learning_rate": 0.00012641068447412354,
+      "loss": 0.4614,
+      "step": 1107
+    },
+    {
+      "epoch": 3.3996925441967716,
+      "grad_norm": 1.0767078399658203,
+      "learning_rate": 0.00012634390651085142,
+      "loss": 0.5644,
+      "step": 1108
+    },
+    {
+      "epoch": 3.4027671022290544,
+      "grad_norm": 0.7565367817878723,
+      "learning_rate": 0.00012627712854757932,
+      "loss": 0.4629,
+      "step": 1109
+    },
+    {
+      "epoch": 3.4058416602613373,
+      "grad_norm": 0.7821168899536133,
+      "learning_rate": 0.0001262103505843072,
+      "loss": 0.4701,
+      "step": 1110
+    },
+    {
+      "epoch": 3.40891621829362,
+      "grad_norm": 0.8335303664207458,
+      "learning_rate": 0.00012614357262103506,
+      "loss": 0.4421,
+      "step": 1111
+    },
+    {
+      "epoch": 3.411990776325903,
+      "grad_norm": 0.6488150954246521,
+      "learning_rate": 0.00012607679465776294,
+      "loss": 0.3861,
+      "step": 1112
+    },
+    {
+      "epoch": 3.415065334358186,
+      "grad_norm": 0.6686526536941528,
+      "learning_rate": 0.0001260100166944908,
+      "loss": 0.4383,
+      "step": 1113
+    },
+    {
+      "epoch": 3.418139892390469,
+      "grad_norm": 0.7704545855522156,
+      "learning_rate": 0.00012594323873121868,
+      "loss": 0.5089,
+      "step": 1114
+    },
+    {
+      "epoch": 3.4212144504227515,
+      "grad_norm": 0.8212313652038574,
+      "learning_rate": 0.00012587646076794658,
+      "loss": 0.3403,
+      "step": 1115
+    },
+    {
+      "epoch": 3.424289008455035,
+      "grad_norm": 0.6802902817726135,
+      "learning_rate": 0.00012580968280467446,
+      "loss": 0.3938,
+      "step": 1116
+    },
+    {
+      "epoch": 3.427363566487317,
+      "grad_norm": 0.6224616169929504,
+      "learning_rate": 0.00012574290484140233,
+      "loss": 0.4623,
+      "step": 1117
+    },
+    {
+      "epoch": 3.4304381245196005,
+      "grad_norm": 0.5823367834091187,
+      "learning_rate": 0.00012567612687813023,
+      "loss": 0.4328,
+      "step": 1118
+    },
+    {
+      "epoch": 3.4335126825518834,
+      "grad_norm": 0.5620308518409729,
+      "learning_rate": 0.0001256093489148581,
+      "loss": 0.3416,
+      "step": 1119
+    },
+    {
+      "epoch": 3.436587240584166,
+      "grad_norm": 0.8712018728256226,
+      "learning_rate": 0.000125542570951586,
+      "loss": 0.3876,
+      "step": 1120
+    },
+    {
+      "epoch": 3.439661798616449,
+      "grad_norm": 0.766313910484314,
+      "learning_rate": 0.00012547579298831388,
+      "loss": 0.5371,
+      "step": 1121
+    },
+    {
+      "epoch": 3.442736356648732,
+      "grad_norm": 0.8842399716377258,
+      "learning_rate": 0.00012540901502504175,
+      "loss": 0.3864,
+      "step": 1122
+    },
+    {
+      "epoch": 3.4458109146810147,
+      "grad_norm": 0.8501667976379395,
+      "learning_rate": 0.00012534223706176962,
+      "loss": 0.5317,
+      "step": 1123
+    },
+    {
+      "epoch": 3.4488854727132976,
+      "grad_norm": 0.8099106550216675,
+      "learning_rate": 0.0001252754590984975,
+      "loss": 0.4038,
+      "step": 1124
+    },
+    {
+      "epoch": 3.4519600307455804,
+      "grad_norm": 0.6638100743293762,
+      "learning_rate": 0.0001252086811352254,
+      "loss": 0.3981,
+      "step": 1125
+    },
+    {
+      "epoch": 3.4550345887778633,
+      "grad_norm": 0.713429868221283,
+      "learning_rate": 0.00012514190317195327,
+      "loss": 0.474,
+      "step": 1126
+    },
+    {
+      "epoch": 3.458109146810146,
+      "grad_norm": 0.6736339330673218,
+      "learning_rate": 0.00012507512520868114,
+      "loss": 0.3764,
+      "step": 1127
+    },
+    {
+      "epoch": 3.461183704842429,
+      "grad_norm": 0.7324123382568359,
+      "learning_rate": 0.00012500834724540902,
+      "loss": 0.4455,
+      "step": 1128
+    },
+    {
+      "epoch": 3.464258262874712,
+      "grad_norm": 0.7468026876449585,
+      "learning_rate": 0.0001249415692821369,
+      "loss": 0.4044,
+      "step": 1129
+    },
+    {
+      "epoch": 3.4673328209069947,
+      "grad_norm": 0.7653748393058777,
+      "learning_rate": 0.00012487479131886476,
+      "loss": 0.4189,
+      "step": 1130
+    },
+    {
+      "epoch": 3.4704073789392775,
+      "grad_norm": 0.8756456971168518,
+      "learning_rate": 0.00012480801335559266,
+      "loss": 0.5738,
+      "step": 1131
+    },
+    {
+      "epoch": 3.4734819369715604,
+      "grad_norm": 0.9344881772994995,
+      "learning_rate": 0.00012474123539232053,
+      "loss": 0.3945,
+      "step": 1132
+    },
+    {
+      "epoch": 3.4765564950038432,
+      "grad_norm": 0.736493706703186,
+      "learning_rate": 0.0001246744574290484,
+      "loss": 0.3714,
+      "step": 1133
+    },
+    {
+      "epoch": 3.479631053036126,
+      "grad_norm": 0.678229570388794,
+      "learning_rate": 0.0001246076794657763,
+      "loss": 0.4412,
+      "step": 1134
+    },
+    {
+      "epoch": 3.482705611068409,
+      "grad_norm": 0.9181579351425171,
+      "learning_rate": 0.00012454090150250418,
+      "loss": 0.4796,
+      "step": 1135
+    },
+    {
+      "epoch": 3.4857801691006918,
+      "grad_norm": 0.7823171019554138,
+      "learning_rate": 0.00012447412353923208,
+      "loss": 0.357,
+      "step": 1136
+    },
+    {
+      "epoch": 3.4888547271329746,
+      "grad_norm": 0.7010154724121094,
+      "learning_rate": 0.00012440734557595995,
+      "loss": 0.3073,
+      "step": 1137
+    },
+    {
+      "epoch": 3.4919292851652575,
+      "grad_norm": 0.8835572004318237,
+      "learning_rate": 0.00012434056761268783,
+      "loss": 0.5125,
+      "step": 1138
+    },
+    {
+      "epoch": 3.4950038431975403,
+      "grad_norm": 1.2275294065475464,
+      "learning_rate": 0.0001242737896494157,
+      "loss": 0.4167,
+      "step": 1139
+    },
+    {
+      "epoch": 3.498078401229823,
+      "grad_norm": 0.7526091933250427,
+      "learning_rate": 0.00012420701168614357,
+      "loss": 0.4264,
+      "step": 1140
+    },
+    {
+      "epoch": 3.501152959262106,
+      "grad_norm": 0.9319266080856323,
+      "learning_rate": 0.00012414023372287147,
+      "loss": 0.5189,
+      "step": 1141
+    },
+    {
+      "epoch": 3.504227517294389,
+      "grad_norm": 0.9764059782028198,
+      "learning_rate": 0.00012407345575959935,
+      "loss": 0.3808,
+      "step": 1142
+    },
+    {
+      "epoch": 3.5073020753266717,
+      "grad_norm": 0.891604483127594,
+      "learning_rate": 0.00012400667779632722,
+      "loss": 0.4733,
+      "step": 1143
+    },
+    {
+      "epoch": 3.5103766333589546,
+      "grad_norm": 0.985975444316864,
+      "learning_rate": 0.0001239398998330551,
+      "loss": 0.4055,
+      "step": 1144
+    },
+    {
+      "epoch": 3.5134511913912374,
+      "grad_norm": 0.9841047525405884,
+      "learning_rate": 0.00012387312186978297,
+      "loss": 0.4578,
+      "step": 1145
+    },
+    {
+      "epoch": 3.5165257494235203,
+      "grad_norm": 0.8037697672843933,
+      "learning_rate": 0.00012380634390651084,
+      "loss": 0.4271,
+      "step": 1146
+    },
+    {
+      "epoch": 3.519600307455803,
+      "grad_norm": 0.9044193625450134,
+      "learning_rate": 0.00012373956594323874,
+      "loss": 0.4559,
+      "step": 1147
+    },
+    {
+      "epoch": 3.522674865488086,
+      "grad_norm": 0.7791280746459961,
+      "learning_rate": 0.00012367278797996661,
+      "loss": 0.4635,
+      "step": 1148
+    },
+    {
+      "epoch": 3.525749423520369,
+      "grad_norm": 0.8410618901252747,
+      "learning_rate": 0.0001236060100166945,
+      "loss": 0.3929,
+      "step": 1149
+    },
+    {
+      "epoch": 3.5288239815526516,
+      "grad_norm": 0.7505420446395874,
+      "learning_rate": 0.00012353923205342236,
+      "loss": 0.4405,
+      "step": 1150
+    },
+    {
+      "epoch": 3.531898539584935,
+      "grad_norm": 0.6377079486846924,
+      "learning_rate": 0.00012347245409015026,
+      "loss": 0.3495,
+      "step": 1151
+    },
+    {
+      "epoch": 3.5349730976172173,
+      "grad_norm": 0.8469225168228149,
+      "learning_rate": 0.00012340567612687813,
+      "loss": 0.4331,
+      "step": 1152
+    },
+    {
+      "epoch": 3.5380476556495006,
+      "grad_norm": 0.5318998694419861,
+      "learning_rate": 0.00012333889816360603,
+      "loss": 0.4056,
+      "step": 1153
+    },
+    {
+      "epoch": 3.541122213681783,
+      "grad_norm": 0.8198487162590027,
+      "learning_rate": 0.0001232721202003339,
+      "loss": 0.5742,
+      "step": 1154
+    },
+    {
+      "epoch": 3.5441967717140663,
+      "grad_norm": 0.8151354789733887,
+      "learning_rate": 0.00012320534223706178,
+      "loss": 0.4746,
+      "step": 1155
+    },
+    {
+      "epoch": 3.5472713297463487,
+      "grad_norm": 0.8636469841003418,
+      "learning_rate": 0.00012313856427378965,
+      "loss": 0.6195,
+      "step": 1156
+    },
+    {
+      "epoch": 3.550345887778632,
+      "grad_norm": 0.9126644730567932,
+      "learning_rate": 0.00012307178631051755,
+      "loss": 0.4336,
+      "step": 1157
+    },
+    {
+      "epoch": 3.5534204458109144,
+      "grad_norm": 0.8281782269477844,
+      "learning_rate": 0.00012300500834724543,
+      "loss": 0.4048,
+      "step": 1158
+    },
+    {
+      "epoch": 3.5564950038431977,
+      "grad_norm": 0.9562798738479614,
+      "learning_rate": 0.0001229382303839733,
+      "loss": 0.4943,
+      "step": 1159
+    },
+    {
+      "epoch": 3.55956956187548,
+      "grad_norm": 0.7244289517402649,
+      "learning_rate": 0.00012287145242070117,
+      "loss": 0.3413,
+      "step": 1160
+    },
+    {
+      "epoch": 3.5626441199077634,
+      "grad_norm": 0.9391937851905823,
+      "learning_rate": 0.00012280467445742905,
+      "loss": 0.415,
+      "step": 1161
+    },
+    {
+      "epoch": 3.5657186779400463,
+      "grad_norm": 0.6994863748550415,
+      "learning_rate": 0.00012273789649415692,
+      "loss": 0.4256,
+      "step": 1162
+    },
+    {
+      "epoch": 3.568793235972329,
+      "grad_norm": 0.7271562814712524,
+      "learning_rate": 0.00012267111853088482,
+      "loss": 0.4266,
+      "step": 1163
+    },
+    {
+      "epoch": 3.571867794004612,
+      "grad_norm": 0.7303061485290527,
+      "learning_rate": 0.0001226043405676127,
+      "loss": 0.4692,
+      "step": 1164
+    },
+    {
+      "epoch": 3.574942352036895,
+      "grad_norm": 1.049743890762329,
+      "learning_rate": 0.00012253756260434057,
+      "loss": 0.5633,
+      "step": 1165
+    },
+    {
+      "epoch": 3.5780169100691777,
+      "grad_norm": 0.6518731713294983,
+      "learning_rate": 0.00012247078464106844,
+      "loss": 0.3834,
+      "step": 1166
+    },
+    {
+      "epoch": 3.5810914681014605,
+      "grad_norm": 0.680600643157959,
+      "learning_rate": 0.0001224040066777963,
+      "loss": 0.4485,
+      "step": 1167
+    },
+    {
+      "epoch": 3.5841660261337434,
+      "grad_norm": 0.6864722967147827,
+      "learning_rate": 0.0001223372287145242,
+      "loss": 0.4605,
+      "step": 1168
+    },
+    {
+      "epoch": 3.587240584166026,
+      "grad_norm": 0.7405598759651184,
+      "learning_rate": 0.00012227045075125209,
+      "loss": 0.4041,
+      "step": 1169
+    },
+    {
+      "epoch": 3.590315142198309,
+      "grad_norm": 0.675830602645874,
+      "learning_rate": 0.00012220367278797999,
+      "loss": 0.3742,
+      "step": 1170
+    },
+    {
+      "epoch": 3.593389700230592,
+      "grad_norm": 0.8901248574256897,
+      "learning_rate": 0.00012213689482470786,
+      "loss": 0.4401,
+      "step": 1171
+    },
+    {
+      "epoch": 3.5964642582628747,
+      "grad_norm": 0.6679547429084778,
+      "learning_rate": 0.00012207011686143572,
+      "loss": 0.4705,
+      "step": 1172
+    },
+    {
+      "epoch": 3.5995388162951576,
+      "grad_norm": 0.8528178930282593,
+      "learning_rate": 0.00012200333889816362,
+      "loss": 0.4257,
+      "step": 1173
+    },
+    {
+      "epoch": 3.6026133743274404,
+      "grad_norm": 0.9046573042869568,
+      "learning_rate": 0.00012193656093489149,
+      "loss": 0.374,
+      "step": 1174
+    },
+    {
+      "epoch": 3.6056879323597233,
+      "grad_norm": 0.6642177700996399,
+      "learning_rate": 0.00012186978297161938,
+      "loss": 0.4346,
+      "step": 1175
+    },
+    {
+      "epoch": 3.608762490392006,
+      "grad_norm": 0.7178785800933838,
+      "learning_rate": 0.00012180300500834725,
+      "loss": 0.395,
+      "step": 1176
+    },
+    {
+      "epoch": 3.611837048424289,
+      "grad_norm": 0.8669521808624268,
+      "learning_rate": 0.00012173622704507512,
+      "loss": 0.5228,
+      "step": 1177
+    },
+    {
+      "epoch": 3.614911606456572,
+      "grad_norm": 0.6138285398483276,
+      "learning_rate": 0.00012166944908180303,
+      "loss": 0.4007,
+      "step": 1178
+    },
+    {
+      "epoch": 3.6179861644888547,
+      "grad_norm": 1.0008139610290527,
+      "learning_rate": 0.0001216026711185309,
+      "loss": 0.4103,
+      "step": 1179
+    },
+    {
+      "epoch": 3.6210607225211375,
+      "grad_norm": 0.666658341884613,
+      "learning_rate": 0.00012153589315525877,
+      "loss": 0.383,
+      "step": 1180
+    },
+    {
+      "epoch": 3.6241352805534204,
+      "grad_norm": 0.8966631293296814,
+      "learning_rate": 0.00012146911519198664,
+      "loss": 0.5084,
+      "step": 1181
+    },
+    {
+      "epoch": 3.6272098385857032,
+      "grad_norm": 0.8953879475593567,
+      "learning_rate": 0.00012140233722871452,
+      "loss": 0.5283,
+      "step": 1182
+    },
+    {
+      "epoch": 3.630284396617986,
+      "grad_norm": 0.7656745314598083,
+      "learning_rate": 0.0001213355592654424,
+      "loss": 0.5076,
+      "step": 1183
+    },
+    {
+      "epoch": 3.633358954650269,
+      "grad_norm": 0.7582895159721375,
+      "learning_rate": 0.00012126878130217029,
+      "loss": 0.4206,
+      "step": 1184
+    },
+    {
+      "epoch": 3.6364335126825518,
+      "grad_norm": 0.8229513764381409,
+      "learning_rate": 0.00012120200333889818,
+      "loss": 0.5156,
+      "step": 1185
+    },
+    {
+      "epoch": 3.6395080707148346,
+      "grad_norm": 0.5379828810691833,
+      "learning_rate": 0.00012113522537562605,
+      "loss": 0.3438,
+      "step": 1186
+    },
+    {
+      "epoch": 3.6425826287471175,
+      "grad_norm": 0.6136037111282349,
+      "learning_rate": 0.00012106844741235392,
+      "loss": 0.4394,
+      "step": 1187
+    },
+    {
+      "epoch": 3.6456571867794003,
+      "grad_norm": 1.137101411819458,
+      "learning_rate": 0.0001210016694490818,
+      "loss": 0.5354,
+      "step": 1188
+    },
+    {
+      "epoch": 3.648731744811683,
+      "grad_norm": 0.6826598048210144,
+      "learning_rate": 0.0001209348914858097,
+      "loss": 0.3609,
+      "step": 1189
+    },
+    {
+      "epoch": 3.6518063028439665,
+      "grad_norm": 0.7083644270896912,
+      "learning_rate": 0.00012086811352253757,
+      "loss": 0.3692,
+      "step": 1190
+    },
+    {
+      "epoch": 3.654880860876249,
+      "grad_norm": 0.8692861199378967,
+      "learning_rate": 0.00012080133555926544,
+      "loss": 0.3899,
+      "step": 1191
+    },
+    {
+      "epoch": 3.657955418908532,
+      "grad_norm": 0.5692325234413147,
+      "learning_rate": 0.00012073455759599333,
+      "loss": 0.4135,
+      "step": 1192
+    },
+    {
+      "epoch": 3.6610299769408146,
+      "grad_norm": 0.6517208218574524,
+      "learning_rate": 0.0001206677796327212,
+      "loss": 0.4944,
+      "step": 1193
+    },
+    {
+      "epoch": 3.664104534973098,
+      "grad_norm": 1.0306694507598877,
+      "learning_rate": 0.0001206010016694491,
+      "loss": 0.3471,
+      "step": 1194
+    },
+    {
+      "epoch": 3.6671790930053803,
+      "grad_norm": 0.6560060977935791,
+      "learning_rate": 0.00012053422370617698,
+      "loss": 0.421,
+      "step": 1195
+    },
+    {
+      "epoch": 3.6702536510376635,
+      "grad_norm": 0.7117607593536377,
+      "learning_rate": 0.00012046744574290485,
+      "loss": 0.3881,
+      "step": 1196
+    },
+    {
+      "epoch": 3.673328209069946,
+      "grad_norm": 0.931069552898407,
+      "learning_rate": 0.00012040066777963272,
+      "loss": 0.5125,
+      "step": 1197
+    },
+    {
+      "epoch": 3.6764027671022292,
+      "grad_norm": 0.7183043956756592,
+      "learning_rate": 0.0001203338898163606,
+      "loss": 0.3876,
+      "step": 1198
+    },
+    {
+      "epoch": 3.6794773251345116,
+      "grad_norm": 0.6167232394218445,
+      "learning_rate": 0.00012026711185308848,
+      "loss": 0.432,
+      "step": 1199
+    },
+    {
+      "epoch": 3.682551883166795,
+      "grad_norm": 0.7681392431259155,
+      "learning_rate": 0.00012020033388981637,
+      "loss": 0.3579,
+      "step": 1200
+    },
+    {
+      "epoch": 3.685626441199078,
+      "grad_norm": 0.6500406861305237,
+      "learning_rate": 0.00012013355592654426,
+      "loss": 0.3566,
+      "step": 1201
+    },
+    {
+      "epoch": 3.6887009992313606,
+      "grad_norm": 0.6759480237960815,
+      "learning_rate": 0.00012006677796327213,
+      "loss": 0.3474,
+      "step": 1202
+    },
+    {
+      "epoch": 3.6917755572636435,
+      "grad_norm": 0.7032824158668518,
+      "learning_rate": 0.00012,
+      "loss": 0.4049,
+      "step": 1203
+    },
+    {
+      "epoch": 3.6948501152959263,
+      "grad_norm": 0.7631069421768188,
+      "learning_rate": 0.00011993322203672788,
+      "loss": 0.3905,
+      "step": 1204
+    },
+    {
+      "epoch": 3.697924673328209,
+      "grad_norm": 0.7755546569824219,
+      "learning_rate": 0.00011986644407345578,
+      "loss": 0.4175,
+      "step": 1205
+    },
+    {
+      "epoch": 3.700999231360492,
+      "grad_norm": 0.5792478919029236,
+      "learning_rate": 0.00011979966611018365,
+      "loss": 0.3005,
+      "step": 1206
+    },
+    {
+      "epoch": 3.704073789392775,
+      "grad_norm": 0.7339358925819397,
+      "learning_rate": 0.00011973288814691152,
+      "loss": 0.417,
+      "step": 1207
+    },
+    {
+      "epoch": 3.7071483474250577,
+      "grad_norm": 0.8882247805595398,
+      "learning_rate": 0.0001196661101836394,
+      "loss": 0.4666,
+      "step": 1208
+    },
+    {
+      "epoch": 3.7102229054573406,
+      "grad_norm": 0.727995753288269,
+      "learning_rate": 0.00011959933222036728,
+      "loss": 0.417,
+      "step": 1209
+    },
+    {
+      "epoch": 3.7132974634896234,
+      "grad_norm": 0.9979139566421509,
+      "learning_rate": 0.00011953255425709517,
+      "loss": 0.4272,
+      "step": 1210
+    },
+    {
+      "epoch": 3.7163720215219063,
+      "grad_norm": 0.9539368152618408,
+      "learning_rate": 0.00011946577629382306,
+      "loss": 0.5065,
+      "step": 1211
+    },
+    {
+      "epoch": 3.719446579554189,
+      "grad_norm": 0.7285603880882263,
+      "learning_rate": 0.00011939899833055093,
+      "loss": 0.4355,
+      "step": 1212
+    },
+    {
+      "epoch": 3.722521137586472,
+      "grad_norm": 0.8624237179756165,
+      "learning_rate": 0.0001193322203672788,
+      "loss": 0.4397,
+      "step": 1213
+    },
+    {
+      "epoch": 3.725595695618755,
+      "grad_norm": 0.9688683748245239,
+      "learning_rate": 0.00011926544240400668,
+      "loss": 0.491,
+      "step": 1214
+    },
+    {
+      "epoch": 3.7286702536510377,
+      "grad_norm": 0.8200318813323975,
+      "learning_rate": 0.00011919866444073455,
+      "loss": 0.4199,
+      "step": 1215
+    },
+    {
+      "epoch": 3.7317448116833205,
+      "grad_norm": 0.7483800053596497,
+      "learning_rate": 0.00011913188647746245,
+      "loss": 0.3424,
+      "step": 1216
+    },
+    {
+      "epoch": 3.7348193697156034,
+      "grad_norm": 1.0238198041915894,
+      "learning_rate": 0.00011906510851419032,
+      "loss": 0.3969,
+      "step": 1217
+    },
+    {
+      "epoch": 3.737893927747886,
+      "grad_norm": 0.924199104309082,
+      "learning_rate": 0.00011899833055091821,
+      "loss": 0.5159,
+      "step": 1218
+    },
+    {
+      "epoch": 3.740968485780169,
+      "grad_norm": 0.8077093362808228,
+      "learning_rate": 0.00011893155258764608,
+      "loss": 0.4923,
+      "step": 1219
+    },
+    {
+      "epoch": 3.744043043812452,
+      "grad_norm": 0.9883623719215393,
+      "learning_rate": 0.00011886477462437396,
+      "loss": 0.5852,
+      "step": 1220
+    },
+    {
+      "epoch": 3.7471176018447347,
+      "grad_norm": 0.8903137445449829,
+      "learning_rate": 0.00011879799666110186,
+      "loss": 0.4461,
+      "step": 1221
+    },
+    {
+      "epoch": 3.7501921598770176,
+      "grad_norm": 0.8356419205665588,
+      "learning_rate": 0.00011873121869782973,
+      "loss": 0.384,
+      "step": 1222
+    },
+    {
+      "epoch": 3.7532667179093004,
+      "grad_norm": 0.6669814586639404,
+      "learning_rate": 0.0001186644407345576,
+      "loss": 0.4374,
+      "step": 1223
+    },
+    {
+      "epoch": 3.7563412759415833,
+      "grad_norm": 0.8386452794075012,
+      "learning_rate": 0.00011859766277128547,
+      "loss": 0.5073,
+      "step": 1224
+    },
+    {
+      "epoch": 3.759415833973866,
+      "grad_norm": 0.7137802243232727,
+      "learning_rate": 0.00011853088480801335,
+      "loss": 0.4911,
+      "step": 1225
+    },
+    {
+      "epoch": 3.762490392006149,
+      "grad_norm": 0.9081368446350098,
+      "learning_rate": 0.00011846410684474125,
+      "loss": 0.4414,
+      "step": 1226
+    },
+    {
+      "epoch": 3.765564950038432,
+      "grad_norm": 0.625066876411438,
+      "learning_rate": 0.00011839732888146912,
+      "loss": 0.3807,
+      "step": 1227
+    },
+    {
+      "epoch": 3.7686395080707147,
+      "grad_norm": 0.7176731824874878,
+      "learning_rate": 0.00011833055091819701,
+      "loss": 0.4615,
+      "step": 1228
+    },
+    {
+      "epoch": 3.7717140661029975,
+      "grad_norm": 0.8574363589286804,
+      "learning_rate": 0.00011826377295492488,
+      "loss": 0.4909,
+      "step": 1229
+    },
+    {
+      "epoch": 3.7747886241352804,
+      "grad_norm": 0.7505884766578674,
+      "learning_rate": 0.00011819699499165275,
+      "loss": 0.5507,
+      "step": 1230
+    },
+    {
+      "epoch": 3.7778631821675637,
+      "grad_norm": 0.6918272972106934,
+      "learning_rate": 0.00011813021702838063,
+      "loss": 0.424,
+      "step": 1231
+    },
+    {
+      "epoch": 3.780937740199846,
+      "grad_norm": 0.4620833992958069,
+      "learning_rate": 0.00011806343906510853,
+      "loss": 0.3651,
+      "step": 1232
+    },
+    {
+      "epoch": 3.7840122982321294,
+      "grad_norm": 0.8369824290275574,
+      "learning_rate": 0.0001179966611018364,
+      "loss": 0.4467,
+      "step": 1233
+    },
+    {
+      "epoch": 3.7870868562644118,
+      "grad_norm": 0.7672296762466431,
+      "learning_rate": 0.00011792988313856427,
+      "loss": 0.475,
+      "step": 1234
+    },
+    {
+      "epoch": 3.790161414296695,
+      "grad_norm": 0.8405357003211975,
+      "learning_rate": 0.00011786310517529216,
+      "loss": 0.5173,
+      "step": 1235
+    },
+    {
+      "epoch": 3.7932359723289775,
+      "grad_norm": 0.7033690810203552,
+      "learning_rate": 0.00011779632721202003,
+      "loss": 0.4607,
+      "step": 1236
+    },
+    {
+      "epoch": 3.7963105303612608,
+      "grad_norm": 0.6288658380508423,
+      "learning_rate": 0.00011772954924874793,
+      "loss": 0.4055,
+      "step": 1237
+    },
+    {
+      "epoch": 3.799385088393543,
+      "grad_norm": 0.690845787525177,
+      "learning_rate": 0.00011766277128547581,
+      "loss": 0.4283,
+      "step": 1238
+    },
+    {
+      "epoch": 3.8024596464258265,
+      "grad_norm": 0.6428495049476624,
+      "learning_rate": 0.00011759599332220368,
+      "loss": 0.3298,
+      "step": 1239
+    },
+    {
+      "epoch": 3.8055342044581093,
+      "grad_norm": 0.658479630947113,
+      "learning_rate": 0.00011752921535893155,
+      "loss": 0.4336,
+      "step": 1240
+    },
+    {
+      "epoch": 3.808608762490392,
+      "grad_norm": 0.7378556728363037,
+      "learning_rate": 0.00011746243739565943,
+      "loss": 0.3664,
+      "step": 1241
+    },
+    {
+      "epoch": 3.811683320522675,
+      "grad_norm": 0.8548963069915771,
+      "learning_rate": 0.00011739565943238733,
+      "loss": 0.4573,
+      "step": 1242
+    },
+    {
+      "epoch": 3.814757878554958,
+      "grad_norm": 0.7019163966178894,
+      "learning_rate": 0.0001173288814691152,
+      "loss": 0.4826,
+      "step": 1243
+    },
+    {
+      "epoch": 3.8178324365872407,
+      "grad_norm": 1.227756142616272,
+      "learning_rate": 0.00011726210350584307,
+      "loss": 0.6298,
+      "step": 1244
+    },
+    {
+      "epoch": 3.8209069946195235,
+      "grad_norm": 0.8075862526893616,
+      "learning_rate": 0.00011719532554257096,
+      "loss": 0.4412,
+      "step": 1245
+    },
+    {
+      "epoch": 3.8239815526518064,
+      "grad_norm": 0.8187466859817505,
+      "learning_rate": 0.00011712854757929883,
+      "loss": 0.4675,
+      "step": 1246
+    },
+    {
+      "epoch": 3.8270561106840892,
+      "grad_norm": 0.916185200214386,
+      "learning_rate": 0.0001170617696160267,
+      "loss": 0.3927,
+      "step": 1247
+    },
+    {
+      "epoch": 3.830130668716372,
+      "grad_norm": 0.8163374066352844,
+      "learning_rate": 0.0001169949916527546,
+      "loss": 0.4642,
+      "step": 1248
+    },
+    {
+      "epoch": 3.833205226748655,
+      "grad_norm": 0.8225308656692505,
+      "learning_rate": 0.00011692821368948248,
+      "loss": 0.5168,
+      "step": 1249
+    },
+    {
+      "epoch": 3.836279784780938,
+      "grad_norm": 0.931461751461029,
+      "learning_rate": 0.00011686143572621035,
+      "loss": 0.3826,
+      "step": 1250
+    },
+    {
+      "epoch": 3.8393543428132206,
+      "grad_norm": 0.8172028064727783,
+      "learning_rate": 0.00011679465776293823,
+      "loss": 0.4679,
+      "step": 1251
+    },
+    {
+      "epoch": 3.8424289008455035,
+      "grad_norm": 0.9193819165229797,
+      "learning_rate": 0.00011672787979966611,
+      "loss": 0.5616,
+      "step": 1252
+    },
+    {
+      "epoch": 3.8455034588777863,
+      "grad_norm": 0.6455274224281311,
+      "learning_rate": 0.000116661101836394,
+      "loss": 0.3846,
+      "step": 1253
+    },
+    {
+      "epoch": 3.848578016910069,
+      "grad_norm": 0.7567316889762878,
+      "learning_rate": 0.00011659432387312189,
+      "loss": 0.3945,
+      "step": 1254
+    },
+    {
+      "epoch": 3.851652574942352,
+      "grad_norm": 0.7793917059898376,
+      "learning_rate": 0.00011652754590984976,
+      "loss": 0.4302,
+      "step": 1255
+    },
+    {
+      "epoch": 3.854727132974635,
+      "grad_norm": 0.7038170099258423,
+      "learning_rate": 0.00011646076794657763,
+      "loss": 0.3853,
+      "step": 1256
+    },
+    {
+      "epoch": 3.8578016910069177,
+      "grad_norm": 0.5196588039398193,
+      "learning_rate": 0.0001163939899833055,
+      "loss": 0.2655,
+      "step": 1257
+    },
+    {
+      "epoch": 3.8608762490392006,
+      "grad_norm": 0.70482337474823,
+      "learning_rate": 0.0001163272120200334,
+      "loss": 0.4513,
+      "step": 1258
+    },
+    {
+      "epoch": 3.8639508070714834,
+      "grad_norm": 0.828891396522522,
+      "learning_rate": 0.00011626043405676128,
+      "loss": 0.4812,
+      "step": 1259
+    },
+    {
+      "epoch": 3.8670253651037663,
+      "grad_norm": 0.7358651161193848,
+      "learning_rate": 0.00011619365609348915,
+      "loss": 0.3227,
+      "step": 1260
+    },
+    {
+      "epoch": 3.870099923136049,
+      "grad_norm": 0.6543817520141602,
+      "learning_rate": 0.00011612687813021703,
+      "loss": 0.3776,
+      "step": 1261
+    },
+    {
+      "epoch": 3.873174481168332,
+      "grad_norm": 0.6436611413955688,
+      "learning_rate": 0.00011606010016694491,
+      "loss": 0.3741,
+      "step": 1262
+    },
+    {
+      "epoch": 3.876249039200615,
+      "grad_norm": 0.6203712224960327,
+      "learning_rate": 0.00011599332220367279,
+      "loss": 0.3884,
+      "step": 1263
+    },
+    {
+      "epoch": 3.8793235972328977,
+      "grad_norm": 0.7520287036895752,
+      "learning_rate": 0.00011592654424040069,
+      "loss": 0.5706,
+      "step": 1264
+    },
+    {
+      "epoch": 3.8823981552651805,
+      "grad_norm": 0.7709315419197083,
+      "learning_rate": 0.00011585976627712856,
+      "loss": 0.3824,
+      "step": 1265
+    },
+    {
+      "epoch": 3.8854727132974634,
+      "grad_norm": 0.6220033764839172,
+      "learning_rate": 0.00011579298831385643,
+      "loss": 0.3308,
+      "step": 1266
+    },
+    {
+      "epoch": 3.888547271329746,
+      "grad_norm": 0.7906895279884338,
+      "learning_rate": 0.0001157262103505843,
+      "loss": 0.5238,
+      "step": 1267
+    },
+    {
+      "epoch": 3.891621829362029,
+      "grad_norm": 0.693013608455658,
+      "learning_rate": 0.00011565943238731218,
+      "loss": 0.5147,
+      "step": 1268
+    },
+    {
+      "epoch": 3.894696387394312,
+      "grad_norm": 0.6043047904968262,
+      "learning_rate": 0.00011559265442404008,
+      "loss": 0.2871,
+      "step": 1269
+    },
+    {
+      "epoch": 3.897770945426595,
+      "grad_norm": 0.560471773147583,
+      "learning_rate": 0.00011552587646076795,
+      "loss": 0.4276,
+      "step": 1270
+    },
+    {
+      "epoch": 3.9008455034588776,
+      "grad_norm": 0.7022919654846191,
+      "learning_rate": 0.00011545909849749584,
+      "loss": 0.4981,
+      "step": 1271
+    },
+    {
+      "epoch": 3.903920061491161,
+      "grad_norm": 0.933049201965332,
+      "learning_rate": 0.00011539232053422371,
+      "loss": 0.4203,
+      "step": 1272
+    },
+    {
+      "epoch": 3.9069946195234433,
+      "grad_norm": 0.6328878998756409,
+      "learning_rate": 0.00011532554257095158,
+      "loss": 0.358,
+      "step": 1273
+    },
+    {
+      "epoch": 3.9100691775557266,
+      "grad_norm": 0.7153301239013672,
+      "learning_rate": 0.00011525876460767948,
+      "loss": 0.4345,
+      "step": 1274
+    },
+    {
+      "epoch": 3.913143735588009,
+      "grad_norm": 0.6789084672927856,
+      "learning_rate": 0.00011519198664440736,
+      "loss": 0.4499,
+      "step": 1275
+    },
+    {
+      "epoch": 3.9162182936202923,
+      "grad_norm": 0.8615806698799133,
+      "learning_rate": 0.00011512520868113523,
+      "loss": 0.5112,
+      "step": 1276
+    },
+    {
+      "epoch": 3.9192928516525747,
+      "grad_norm": 0.9562219381332397,
+      "learning_rate": 0.0001150584307178631,
+      "loss": 0.4993,
+      "step": 1277
+    },
+    {
+      "epoch": 3.922367409684858,
+      "grad_norm": 0.8305587768554688,
+      "learning_rate": 0.00011499165275459098,
+      "loss": 0.4301,
+      "step": 1278
+    },
+    {
+      "epoch": 3.925441967717141,
+      "grad_norm": 0.7225807309150696,
+      "learning_rate": 0.00011492487479131886,
+      "loss": 0.4778,
+      "step": 1279
+    },
+    {
+      "epoch": 3.9285165257494237,
+      "grad_norm": 0.600487470626831,
+      "learning_rate": 0.00011485809682804675,
+      "loss": 0.3729,
+      "step": 1280
+    },
+    {
+      "epoch": 3.9315910837817065,
+      "grad_norm": 0.7126119136810303,
+      "learning_rate": 0.00011479131886477464,
+      "loss": 0.4114,
+      "step": 1281
+    },
+    {
+      "epoch": 3.9346656418139894,
+      "grad_norm": 0.6836767792701721,
+      "learning_rate": 0.00011472454090150251,
+      "loss": 0.3806,
+      "step": 1282
+    },
+    {
+      "epoch": 3.937740199846272,
+      "grad_norm": 0.9370895624160767,
+      "learning_rate": 0.00011465776293823038,
+      "loss": 0.4382,
+      "step": 1283
+    },
+    {
+      "epoch": 3.940814757878555,
+      "grad_norm": 0.5400208234786987,
+      "learning_rate": 0.00011459098497495826,
+      "loss": 0.301,
+      "step": 1284
+    },
+    {
+      "epoch": 3.943889315910838,
+      "grad_norm": 0.7497467994689941,
+      "learning_rate": 0.00011452420701168616,
+      "loss": 0.4158,
+      "step": 1285
+    },
+    {
+      "epoch": 3.9469638739431208,
+      "grad_norm": 0.7468736171722412,
+      "learning_rate": 0.00011445742904841403,
+      "loss": 0.4492,
+      "step": 1286
+    },
+    {
+      "epoch": 3.9500384319754036,
+      "grad_norm": 0.8118924498558044,
+      "learning_rate": 0.0001143906510851419,
+      "loss": 0.4847,
+      "step": 1287
+    },
+    {
+      "epoch": 3.9531129900076865,
+      "grad_norm": 0.6973615288734436,
+      "learning_rate": 0.00011432387312186979,
+      "loss": 0.4148,
+      "step": 1288
+    },
+    {
+      "epoch": 3.9561875480399693,
+      "grad_norm": 0.8795959949493408,
+      "learning_rate": 0.00011425709515859766,
+      "loss": 0.4772,
+      "step": 1289
+    },
+    {
+      "epoch": 3.959262106072252,
+      "grad_norm": 0.8716256618499756,
+      "learning_rate": 0.00011419031719532556,
+      "loss": 0.4586,
+      "step": 1290
+    },
+    {
+      "epoch": 3.962336664104535,
+      "grad_norm": 0.7880982756614685,
+      "learning_rate": 0.00011412353923205344,
+      "loss": 0.4364,
+      "step": 1291
+    },
+    {
+      "epoch": 3.965411222136818,
+      "grad_norm": 0.8473154306411743,
+      "learning_rate": 0.00011405676126878131,
+      "loss": 0.4775,
+      "step": 1292
+    },
+    {
+      "epoch": 3.9684857801691007,
+      "grad_norm": 0.8033487200737,
+      "learning_rate": 0.00011398998330550918,
+      "loss": 0.3901,
+      "step": 1293
+    },
+    {
+      "epoch": 3.9715603382013835,
+      "grad_norm": 0.8566176891326904,
+      "learning_rate": 0.00011392320534223706,
+      "loss": 0.4494,
+      "step": 1294
+    },
+    {
+      "epoch": 3.9746348962336664,
+      "grad_norm": 0.8029381632804871,
+      "learning_rate": 0.00011385642737896493,
+      "loss": 0.3145,
+      "step": 1295
+    },
+    {
+      "epoch": 3.9777094542659492,
+      "grad_norm": 0.7575416564941406,
+      "learning_rate": 0.00011378964941569283,
+      "loss": 0.3394,
+      "step": 1296
+    },
+    {
+      "epoch": 3.980784012298232,
+      "grad_norm": 0.6976135969161987,
+      "learning_rate": 0.0001137228714524207,
+      "loss": 0.5052,
+      "step": 1297
+    },
+    {
+      "epoch": 3.983858570330515,
+      "grad_norm": 0.6242879629135132,
+      "learning_rate": 0.00011365609348914859,
+      "loss": 0.3319,
+      "step": 1298
+    },
+    {
+      "epoch": 3.986933128362798,
+      "grad_norm": 0.8205263614654541,
+      "learning_rate": 0.00011358931552587646,
+      "loss": 0.4923,
+      "step": 1299
+    },
+    {
+      "epoch": 3.9900076863950806,
+      "grad_norm": 0.6506344079971313,
+      "learning_rate": 0.00011352253756260434,
+      "loss": 0.412,
+      "step": 1300
+    },
+    {
+      "epoch": 3.9930822444273635,
+      "grad_norm": 0.8723356127738953,
+      "learning_rate": 0.00011345575959933224,
+      "loss": 0.4347,
+      "step": 1301
+    },
+    {
+      "epoch": 3.9961568024596463,
+      "grad_norm": 0.7876335978507996,
+      "learning_rate": 0.00011338898163606011,
+      "loss": 0.5355,
+      "step": 1302
+    },
+    {
+      "epoch": 3.999231360491929,
+      "grad_norm": 0.724051833152771,
+      "learning_rate": 0.00011332220367278798,
+      "loss": 0.3937,
+      "step": 1303
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.3604131937026978,
+      "learning_rate": 0.00011325542570951586,
+      "loss": 0.4378,
+      "step": 1304
+    },
+    {
+      "epoch": 4.003074558032283,
+      "grad_norm": 0.5147106647491455,
+      "learning_rate": 0.00011318864774624374,
+      "loss": 0.3391,
+      "step": 1305
+    },
+    {
+      "epoch": 4.006149116064566,
+      "grad_norm": 0.6199834942817688,
+      "learning_rate": 0.00011312186978297163,
+      "loss": 0.3723,
+      "step": 1306
+    },
+    {
+      "epoch": 4.009223674096849,
+      "grad_norm": 0.7257975935935974,
+      "learning_rate": 0.00011305509181969952,
+      "loss": 0.3119,
+      "step": 1307
+    },
+    {
+      "epoch": 4.012298232129131,
+      "grad_norm": 0.6027304530143738,
+      "learning_rate": 0.00011298831385642739,
+      "loss": 0.3193,
+      "step": 1308
+    },
+    {
+      "epoch": 4.015372790161415,
+      "grad_norm": 0.7012556791305542,
+      "learning_rate": 0.00011292153589315526,
+      "loss": 0.5244,
+      "step": 1309
+    },
+    {
+      "epoch": 4.018447348193697,
+      "grad_norm": 0.702237606048584,
+      "learning_rate": 0.00011285475792988314,
+      "loss": 0.2945,
+      "step": 1310
+    },
+    {
+      "epoch": 4.02152190622598,
+      "grad_norm": 0.7039638757705688,
+      "learning_rate": 0.00011278797996661104,
+      "loss": 0.3911,
+      "step": 1311
+    },
+    {
+      "epoch": 4.024596464258263,
+      "grad_norm": 0.6667320132255554,
+      "learning_rate": 0.00011272120200333891,
+      "loss": 0.2977,
+      "step": 1312
+    },
+    {
+      "epoch": 4.027671022290546,
+      "grad_norm": 0.692411482334137,
+      "learning_rate": 0.00011265442404006678,
+      "loss": 0.3628,
+      "step": 1313
+    },
+    {
+      "epoch": 4.0307455803228285,
+      "grad_norm": 0.8840232491493225,
+      "learning_rate": 0.00011258764607679465,
+      "loss": 0.3213,
+      "step": 1314
+    },
+    {
+      "epoch": 4.033820138355112,
+      "grad_norm": 0.6245793104171753,
+      "learning_rate": 0.00011252086811352254,
+      "loss": 0.3689,
+      "step": 1315
+    },
+    {
+      "epoch": 4.036894696387394,
+      "grad_norm": 0.6313285231590271,
+      "learning_rate": 0.00011245409015025041,
+      "loss": 0.302,
+      "step": 1316
+    },
+    {
+      "epoch": 4.0399692544196775,
+      "grad_norm": 0.7105359435081482,
+      "learning_rate": 0.00011238731218697832,
+      "loss": 0.316,
+      "step": 1317
+    },
+    {
+      "epoch": 4.04304381245196,
+      "grad_norm": 0.6478031873703003,
+      "learning_rate": 0.00011232053422370619,
+      "loss": 0.3727,
+      "step": 1318
+    },
+    {
+      "epoch": 4.046118370484243,
+      "grad_norm": 0.4994255602359772,
+      "learning_rate": 0.00011225375626043406,
+      "loss": 0.2032,
+      "step": 1319
+    },
+    {
+      "epoch": 4.049192928516526,
+      "grad_norm": 0.9945188760757446,
+      "learning_rate": 0.00011218697829716193,
+      "loss": 0.5024,
+      "step": 1320
+    },
+    {
+      "epoch": 4.052267486548809,
+      "grad_norm": 0.8060212135314941,
+      "learning_rate": 0.00011212020033388981,
+      "loss": 0.3265,
+      "step": 1321
+    },
+    {
+      "epoch": 4.055342044581091,
+      "grad_norm": 0.8381320238113403,
+      "learning_rate": 0.00011205342237061771,
+      "loss": 0.3818,
+      "step": 1322
+    },
+    {
+      "epoch": 4.058416602613375,
+      "grad_norm": 0.9504372477531433,
+      "learning_rate": 0.00011198664440734558,
+      "loss": 0.4802,
+      "step": 1323
+    },
+    {
+      "epoch": 4.061491160645657,
+      "grad_norm": 0.7901592254638672,
+      "learning_rate": 0.00011191986644407347,
+      "loss": 0.3169,
+      "step": 1324
+    },
+    {
+      "epoch": 4.06456571867794,
+      "grad_norm": 0.7563232779502869,
+      "learning_rate": 0.00011185308848080134,
+      "loss": 0.3226,
+      "step": 1325
+    },
+    {
+      "epoch": 4.067640276710223,
+      "grad_norm": 0.6596280932426453,
+      "learning_rate": 0.00011178631051752921,
+      "loss": 0.2505,
+      "step": 1326
+    },
+    {
+      "epoch": 4.070714834742506,
+      "grad_norm": 0.7296786308288574,
+      "learning_rate": 0.00011171953255425711,
+      "loss": 0.2848,
+      "step": 1327
+    },
+    {
+      "epoch": 4.073789392774788,
+      "grad_norm": 0.8909521102905273,
+      "learning_rate": 0.00011165275459098499,
+      "loss": 0.4322,
+      "step": 1328
+    },
+    {
+      "epoch": 4.076863950807072,
+      "grad_norm": 0.7292854189872742,
+      "learning_rate": 0.00011158597662771286,
+      "loss": 0.291,
+      "step": 1329
+    },
+    {
+      "epoch": 4.079938508839354,
+      "grad_norm": 0.9252512454986572,
+      "learning_rate": 0.00011151919866444073,
+      "loss": 0.2974,
+      "step": 1330
+    },
+    {
+      "epoch": 4.083013066871637,
+      "grad_norm": 0.7636522054672241,
+      "learning_rate": 0.00011145242070116862,
+      "loss": 0.4483,
+      "step": 1331
+    },
+    {
+      "epoch": 4.08608762490392,
+      "grad_norm": 0.8409242033958435,
+      "learning_rate": 0.0001113856427378965,
+      "loss": 0.4355,
+      "step": 1332
+    },
+    {
+      "epoch": 4.089162182936203,
+      "grad_norm": 0.9327632188796997,
+      "learning_rate": 0.0001113188647746244,
+      "loss": 0.4552,
+      "step": 1333
+    },
+    {
+      "epoch": 4.092236740968485,
+      "grad_norm": 0.7346988916397095,
+      "learning_rate": 0.00011125208681135227,
+      "loss": 0.3438,
+      "step": 1334
+    },
+    {
+      "epoch": 4.095311299000769,
+      "grad_norm": 1.004155158996582,
+      "learning_rate": 0.00011118530884808014,
+      "loss": 0.3276,
+      "step": 1335
+    },
+    {
+      "epoch": 4.098385857033051,
+      "grad_norm": 0.7132447361946106,
+      "learning_rate": 0.00011111853088480801,
+      "loss": 0.3801,
+      "step": 1336
+    },
+    {
+      "epoch": 4.101460415065334,
+      "grad_norm": 0.629642128944397,
+      "learning_rate": 0.00011105175292153589,
+      "loss": 0.2056,
+      "step": 1337
+    },
+    {
+      "epoch": 4.104534973097617,
+      "grad_norm": 0.6974900960922241,
+      "learning_rate": 0.00011098497495826379,
+      "loss": 0.3028,
+      "step": 1338
+    },
+    {
+      "epoch": 4.1076095311299,
+      "grad_norm": 0.7688671946525574,
+      "learning_rate": 0.00011091819699499166,
+      "loss": 0.2914,
+      "step": 1339
+    },
+    {
+      "epoch": 4.1106840891621825,
+      "grad_norm": 0.7950320839881897,
+      "learning_rate": 0.00011085141903171953,
+      "loss": 0.3381,
+      "step": 1340
+    },
+    {
+      "epoch": 4.113758647194466,
+      "grad_norm": 0.8874083757400513,
+      "learning_rate": 0.00011078464106844742,
+      "loss": 0.3343,
+      "step": 1341
+    },
+    {
+      "epoch": 4.116833205226748,
+      "grad_norm": 0.8627938032150269,
+      "learning_rate": 0.00011071786310517529,
+      "loss": 0.3888,
+      "step": 1342
+    },
+    {
+      "epoch": 4.1199077632590315,
+      "grad_norm": 0.7516458034515381,
+      "learning_rate": 0.0001106510851419032,
+      "loss": 0.3158,
+      "step": 1343
+    },
+    {
+      "epoch": 4.122982321291314,
+      "grad_norm": 0.7732129693031311,
+      "learning_rate": 0.00011058430717863107,
+      "loss": 0.3114,
+      "step": 1344
+    },
+    {
+      "epoch": 4.126056879323597,
+      "grad_norm": 0.6700358986854553,
+      "learning_rate": 0.00011051752921535894,
+      "loss": 0.2412,
+      "step": 1345
+    },
+    {
+      "epoch": 4.1291314373558805,
+      "grad_norm": 1.0231423377990723,
+      "learning_rate": 0.00011045075125208681,
+      "loss": 0.3424,
+      "step": 1346
+    },
+    {
+      "epoch": 4.132205995388163,
+      "grad_norm": 0.8192147016525269,
+      "learning_rate": 0.00011038397328881469,
+      "loss": 0.3089,
+      "step": 1347
+    },
+    {
+      "epoch": 4.135280553420446,
+      "grad_norm": 0.7541559934616089,
+      "learning_rate": 0.00011031719532554257,
+      "loss": 0.3552,
+      "step": 1348
+    },
+    {
+      "epoch": 4.138355111452729,
+      "grad_norm": 0.929007887840271,
+      "learning_rate": 0.00011025041736227046,
+      "loss": 0.3173,
+      "step": 1349
+    },
+    {
+      "epoch": 4.141429669485012,
+      "grad_norm": 0.5695236325263977,
+      "learning_rate": 0.00011018363939899835,
+      "loss": 0.3172,
+      "step": 1350
+    },
+    {
+      "epoch": 4.144504227517294,
+      "grad_norm": 0.9651820659637451,
+      "learning_rate": 0.00011011686143572622,
+      "loss": 0.3911,
+      "step": 1351
+    },
+    {
+      "epoch": 4.147578785549578,
+      "grad_norm": 0.7829585075378418,
+      "learning_rate": 0.00011005008347245409,
+      "loss": 0.4412,
+      "step": 1352
+    },
+    {
+      "epoch": 4.15065334358186,
+      "grad_norm": 0.5842923521995544,
+      "learning_rate": 0.00010998330550918197,
+      "loss": 0.32,
+      "step": 1353
+    },
+    {
+      "epoch": 4.153727901614143,
+      "grad_norm": 1.1148773431777954,
+      "learning_rate": 0.00010991652754590987,
+      "loss": 0.4073,
+      "step": 1354
+    },
+    {
+      "epoch": 4.156802459646426,
+      "grad_norm": 0.59675133228302,
+      "learning_rate": 0.00010984974958263774,
+      "loss": 0.2595,
+      "step": 1355
+    },
+    {
+      "epoch": 4.159877017678709,
+      "grad_norm": 0.9646673798561096,
+      "learning_rate": 0.00010978297161936561,
+      "loss": 0.4913,
+      "step": 1356
+    },
+    {
+      "epoch": 4.162951575710991,
+      "grad_norm": 0.8348448276519775,
+      "learning_rate": 0.00010971619365609349,
+      "loss": 0.3936,
+      "step": 1357
+    },
+    {
+      "epoch": 4.166026133743275,
+      "grad_norm": 0.9026066660881042,
+      "learning_rate": 0.00010964941569282137,
+      "loss": 0.4597,
+      "step": 1358
+    },
+    {
+      "epoch": 4.169100691775557,
+      "grad_norm": 1.0557740926742554,
+      "learning_rate": 0.00010958263772954926,
+      "loss": 0.2901,
+      "step": 1359
+    },
+    {
+      "epoch": 4.17217524980784,
+      "grad_norm": 1.6775768995285034,
+      "learning_rate": 0.00010951585976627715,
+      "loss": 0.5577,
+      "step": 1360
+    },
+    {
+      "epoch": 4.175249807840123,
+      "grad_norm": 0.651542603969574,
+      "learning_rate": 0.00010944908180300502,
+      "loss": 0.324,
+      "step": 1361
+    },
+    {
+      "epoch": 4.178324365872406,
+      "grad_norm": 0.8348442912101746,
+      "learning_rate": 0.00010938230383973289,
+      "loss": 0.3474,
+      "step": 1362
+    },
+    {
+      "epoch": 4.1813989239046885,
+      "grad_norm": 0.7684600949287415,
+      "learning_rate": 0.00010931552587646076,
+      "loss": 0.3105,
+      "step": 1363
+    },
+    {
+      "epoch": 4.184473481936972,
+      "grad_norm": 0.8022297620773315,
+      "learning_rate": 0.00010924874791318864,
+      "loss": 0.3591,
+      "step": 1364
+    },
+    {
+      "epoch": 4.187548039969254,
+      "grad_norm": 0.9433055520057678,
+      "learning_rate": 0.00010918196994991654,
+      "loss": 0.3116,
+      "step": 1365
+    },
+    {
+      "epoch": 4.1906225980015375,
+      "grad_norm": 0.9922048449516296,
+      "learning_rate": 0.00010911519198664441,
+      "loss": 0.4071,
+      "step": 1366
+    },
+    {
+      "epoch": 4.19369715603382,
+      "grad_norm": 0.7621304988861084,
+      "learning_rate": 0.0001090484140233723,
+      "loss": 0.3698,
+      "step": 1367
+    },
+    {
+      "epoch": 4.196771714066103,
+      "grad_norm": 0.8218173980712891,
+      "learning_rate": 0.00010898163606010017,
+      "loss": 0.3159,
+      "step": 1368
+    },
+    {
+      "epoch": 4.199846272098386,
+      "grad_norm": 0.9964919090270996,
+      "learning_rate": 0.00010891485809682804,
+      "loss": 0.3501,
+      "step": 1369
+    },
+    {
+      "epoch": 4.202920830130669,
+      "grad_norm": 0.705668032169342,
+      "learning_rate": 0.00010884808013355594,
+      "loss": 0.3418,
+      "step": 1370
+    },
+    {
+      "epoch": 4.205995388162951,
+      "grad_norm": 0.6391593217849731,
+      "learning_rate": 0.00010878130217028382,
+      "loss": 0.2915,
+      "step": 1371
+    },
+    {
+      "epoch": 4.209069946195235,
+      "grad_norm": 1.1502752304077148,
+      "learning_rate": 0.00010871452420701169,
+      "loss": 0.3739,
+      "step": 1372
+    },
+    {
+      "epoch": 4.212144504227517,
+      "grad_norm": 1.0136791467666626,
+      "learning_rate": 0.00010864774624373956,
+      "loss": 0.446,
+      "step": 1373
+    },
+    {
+      "epoch": 4.2152190622598,
+      "grad_norm": 1.116603136062622,
+      "learning_rate": 0.00010858096828046744,
+      "loss": 0.3013,
+      "step": 1374
+    },
+    {
+      "epoch": 4.218293620292083,
+      "grad_norm": 0.8702336549758911,
+      "learning_rate": 0.00010851419031719534,
+      "loss": 0.3708,
+      "step": 1375
+    },
+    {
+      "epoch": 4.221368178324366,
+      "grad_norm": 0.7424792647361755,
+      "learning_rate": 0.00010844741235392321,
+      "loss": 0.3883,
+      "step": 1376
+    },
+    {
+      "epoch": 4.224442736356648,
+      "grad_norm": 0.9215840697288513,
+      "learning_rate": 0.0001083806343906511,
+      "loss": 0.3527,
+      "step": 1377
+    },
+    {
+      "epoch": 4.227517294388932,
+      "grad_norm": 0.728461742401123,
+      "learning_rate": 0.00010831385642737897,
+      "loss": 0.3263,
+      "step": 1378
+    },
+    {
+      "epoch": 4.230591852421214,
+      "grad_norm": 0.6894111037254333,
+      "learning_rate": 0.00010824707846410684,
+      "loss": 0.3055,
+      "step": 1379
+    },
+    {
+      "epoch": 4.233666410453497,
+      "grad_norm": 0.736510694026947,
+      "learning_rate": 0.00010818030050083472,
+      "loss": 0.2888,
+      "step": 1380
+    },
+    {
+      "epoch": 4.23674096848578,
+      "grad_norm": 0.6261756420135498,
+      "learning_rate": 0.00010811352253756262,
+      "loss": 0.3168,
+      "step": 1381
+    },
+    {
+      "epoch": 4.239815526518063,
+      "grad_norm": 0.6462433934211731,
+      "learning_rate": 0.00010804674457429049,
+      "loss": 0.3785,
+      "step": 1382
+    },
+    {
+      "epoch": 4.242890084550346,
+      "grad_norm": 1.0697581768035889,
+      "learning_rate": 0.00010797996661101836,
+      "loss": 0.382,
+      "step": 1383
+    },
+    {
+      "epoch": 4.245964642582629,
+      "grad_norm": 0.8354079723358154,
+      "learning_rate": 0.00010791318864774625,
+      "loss": 0.4826,
+      "step": 1384
+    },
+    {
+      "epoch": 4.249039200614912,
+      "grad_norm": 0.9178540110588074,
+      "learning_rate": 0.00010784641068447412,
+      "loss": 0.3667,
+      "step": 1385
+    },
+    {
+      "epoch": 4.252113758647194,
+      "grad_norm": 0.6986132264137268,
+      "learning_rate": 0.00010777963272120202,
+      "loss": 0.3417,
+      "step": 1386
+    },
+    {
+      "epoch": 4.255188316679478,
+      "grad_norm": 0.6934733390808105,
+      "learning_rate": 0.0001077128547579299,
+      "loss": 0.3177,
+      "step": 1387
+    },
+    {
+      "epoch": 4.25826287471176,
+      "grad_norm": 0.7552710175514221,
+      "learning_rate": 0.00010764607679465777,
+      "loss": 0.4545,
+      "step": 1388
+    },
+    {
+      "epoch": 4.261337432744043,
+      "grad_norm": 0.8772902488708496,
+      "learning_rate": 0.00010757929883138564,
+      "loss": 0.4488,
+      "step": 1389
+    },
+    {
+      "epoch": 4.264411990776326,
+      "grad_norm": 0.6232932806015015,
+      "learning_rate": 0.00010751252086811352,
+      "loss": 0.2655,
+      "step": 1390
+    },
+    {
+      "epoch": 4.267486548808609,
+      "grad_norm": 0.8846897482872009,
+      "learning_rate": 0.00010744574290484142,
+      "loss": 0.354,
+      "step": 1391
+    },
+    {
+      "epoch": 4.2705611068408915,
+      "grad_norm": 0.9057449102401733,
+      "learning_rate": 0.00010737896494156929,
+      "loss": 0.4472,
+      "step": 1392
+    },
+    {
+      "epoch": 4.273635664873175,
+      "grad_norm": 0.9705424308776855,
+      "learning_rate": 0.00010731218697829716,
+      "loss": 0.3624,
+      "step": 1393
+    },
+    {
+      "epoch": 4.276710222905457,
+      "grad_norm": 1.3559931516647339,
+      "learning_rate": 0.00010724540901502505,
+      "loss": 0.4698,
+      "step": 1394
+    },
+    {
+      "epoch": 4.2797847809377405,
+      "grad_norm": 0.8337675333023071,
+      "learning_rate": 0.00010717863105175292,
+      "loss": 0.3795,
+      "step": 1395
+    },
+    {
+      "epoch": 4.282859338970023,
+      "grad_norm": 1.1630418300628662,
+      "learning_rate": 0.0001071118530884808,
+      "loss": 0.4924,
+      "step": 1396
+    },
+    {
+      "epoch": 4.285933897002306,
+      "grad_norm": 0.7302567362785339,
+      "learning_rate": 0.0001070450751252087,
+      "loss": 0.3188,
+      "step": 1397
+    },
+    {
+      "epoch": 4.289008455034589,
+      "grad_norm": 0.7226994037628174,
+      "learning_rate": 0.00010697829716193657,
+      "loss": 0.3188,
+      "step": 1398
+    },
+    {
+      "epoch": 4.292083013066872,
+      "grad_norm": 0.666989266872406,
+      "learning_rate": 0.00010691151919866444,
+      "loss": 0.3394,
+      "step": 1399
+    },
+    {
+      "epoch": 4.295157571099154,
+      "grad_norm": 0.9268330931663513,
+      "learning_rate": 0.00010684474123539232,
+      "loss": 0.4713,
+      "step": 1400
+    },
+    {
+      "epoch": 4.298232129131438,
+      "grad_norm": 0.823275625705719,
+      "learning_rate": 0.0001067779632721202,
+      "loss": 0.3481,
+      "step": 1401
+    },
+    {
+      "epoch": 4.30130668716372,
+      "grad_norm": 0.5804985761642456,
+      "learning_rate": 0.00010671118530884809,
+      "loss": 0.2705,
+      "step": 1402
+    },
+    {
+      "epoch": 4.304381245196003,
+      "grad_norm": 0.539432942867279,
+      "learning_rate": 0.00010664440734557598,
+      "loss": 0.3344,
+      "step": 1403
+    },
+    {
+      "epoch": 4.307455803228286,
+      "grad_norm": 0.6926316618919373,
+      "learning_rate": 0.00010657762938230385,
+      "loss": 0.3133,
+      "step": 1404
+    },
+    {
+      "epoch": 4.310530361260569,
+      "grad_norm": 0.6743838787078857,
+      "learning_rate": 0.00010651085141903172,
+      "loss": 0.3433,
+      "step": 1405
+    },
+    {
+      "epoch": 4.313604919292851,
+      "grad_norm": 1.0226610898971558,
+      "learning_rate": 0.0001064440734557596,
+      "loss": 0.3499,
+      "step": 1406
+    },
+    {
+      "epoch": 4.316679477325135,
+      "grad_norm": 0.9818789958953857,
+      "learning_rate": 0.0001063772954924875,
+      "loss": 0.3106,
+      "step": 1407
+    },
+    {
+      "epoch": 4.319754035357417,
+      "grad_norm": 0.9667727947235107,
+      "learning_rate": 0.00010631051752921537,
+      "loss": 0.3755,
+      "step": 1408
+    },
+    {
+      "epoch": 4.3228285933897,
+      "grad_norm": 0.8136192560195923,
+      "learning_rate": 0.00010624373956594324,
+      "loss": 0.4172,
+      "step": 1409
+    },
+    {
+      "epoch": 4.325903151421983,
+      "grad_norm": 1.4286353588104248,
+      "learning_rate": 0.00010617696160267111,
+      "loss": 0.2508,
+      "step": 1410
+    },
+    {
+      "epoch": 4.328977709454266,
+      "grad_norm": 0.9519496560096741,
+      "learning_rate": 0.000106110183639399,
+      "loss": 0.2951,
+      "step": 1411
+    },
+    {
+      "epoch": 4.3320522674865485,
+      "grad_norm": 1.119429111480713,
+      "learning_rate": 0.00010604340567612687,
+      "loss": 0.415,
+      "step": 1412
+    },
+    {
+      "epoch": 4.335126825518832,
+      "grad_norm": 0.9656046032905579,
+      "learning_rate": 0.00010597662771285477,
+      "loss": 0.3971,
+      "step": 1413
+    },
+    {
+      "epoch": 4.338201383551114,
+      "grad_norm": 0.7389115691184998,
+      "learning_rate": 0.00010590984974958265,
+      "loss": 0.3996,
+      "step": 1414
+    },
+    {
+      "epoch": 4.3412759415833975,
+      "grad_norm": 0.7295717597007751,
+      "learning_rate": 0.00010584307178631052,
+      "loss": 0.3148,
+      "step": 1415
+    },
+    {
+      "epoch": 4.34435049961568,
+      "grad_norm": 1.141958475112915,
+      "learning_rate": 0.0001057762938230384,
+      "loss": 0.3959,
+      "step": 1416
+    },
+    {
+      "epoch": 4.347425057647963,
+      "grad_norm": 0.8199194073677063,
+      "learning_rate": 0.00010570951585976627,
+      "loss": 0.2737,
+      "step": 1417
+    },
+    {
+      "epoch": 4.350499615680246,
+      "grad_norm": 0.9329640865325928,
+      "learning_rate": 0.00010564273789649417,
+      "loss": 0.3366,
+      "step": 1418
+    },
+    {
+      "epoch": 4.353574173712529,
+      "grad_norm": 0.9693445563316345,
+      "learning_rate": 0.00010557595993322204,
+      "loss": 0.325,
+      "step": 1419
+    },
+    {
+      "epoch": 4.356648731744812,
+      "grad_norm": 3.1419506072998047,
+      "learning_rate": 0.00010550918196994993,
+      "loss": 0.4225,
+      "step": 1420
+    },
+    {
+      "epoch": 4.359723289777095,
+      "grad_norm": 0.8056375980377197,
+      "learning_rate": 0.0001054424040066778,
+      "loss": 0.3366,
+      "step": 1421
+    },
+    {
+      "epoch": 4.362797847809377,
+      "grad_norm": 0.9013074636459351,
+      "learning_rate": 0.00010537562604340567,
+      "loss": 0.3814,
+      "step": 1422
+    },
+    {
+      "epoch": 4.36587240584166,
+      "grad_norm": 0.6411908864974976,
+      "learning_rate": 0.00010530884808013357,
+      "loss": 0.3604,
+      "step": 1423
+    },
+    {
+      "epoch": 4.3689469638739435,
+      "grad_norm": 0.7328122854232788,
+      "learning_rate": 0.00010524207011686145,
+      "loss": 0.3706,
+      "step": 1424
+    },
+    {
+      "epoch": 4.372021521906226,
+      "grad_norm": 0.7676102519035339,
+      "learning_rate": 0.00010517529215358932,
+      "loss": 0.3575,
+      "step": 1425
+    },
+    {
+      "epoch": 4.375096079938509,
+      "grad_norm": 0.7656323313713074,
+      "learning_rate": 0.0001051085141903172,
+      "loss": 0.3897,
+      "step": 1426
+    },
+    {
+      "epoch": 4.378170637970792,
+      "grad_norm": 0.8879655599594116,
+      "learning_rate": 0.00010504173622704507,
+      "loss": 0.2235,
+      "step": 1427
+    },
+    {
+      "epoch": 4.381245196003075,
+      "grad_norm": 0.8029223680496216,
+      "learning_rate": 0.00010497495826377295,
+      "loss": 0.4513,
+      "step": 1428
+    },
+    {
+      "epoch": 4.384319754035357,
+      "grad_norm": 0.8824205994606018,
+      "learning_rate": 0.00010490818030050084,
+      "loss": 0.3558,
+      "step": 1429
+    },
+    {
+      "epoch": 4.387394312067641,
+      "grad_norm": 0.6517553329467773,
+      "learning_rate": 0.00010484140233722873,
+      "loss": 0.3304,
+      "step": 1430
+    },
+    {
+      "epoch": 4.390468870099923,
+      "grad_norm": 0.6570941805839539,
+      "learning_rate": 0.0001047746243739566,
+      "loss": 0.3072,
+      "step": 1431
+    },
+    {
+      "epoch": 4.393543428132206,
+      "grad_norm": 0.6981759667396545,
+      "learning_rate": 0.00010470784641068447,
+      "loss": 0.2602,
+      "step": 1432
+    },
+    {
+      "epoch": 4.396617986164489,
+      "grad_norm": 0.999544084072113,
+      "learning_rate": 0.00010464106844741235,
+      "loss": 0.4485,
+      "step": 1433
+    },
+    {
+      "epoch": 4.399692544196772,
+      "grad_norm": 0.6772480010986328,
+      "learning_rate": 0.00010457429048414025,
+      "loss": 0.3789,
+      "step": 1434
+    },
+    {
+      "epoch": 4.402767102229054,
+      "grad_norm": 1.0531984567642212,
+      "learning_rate": 0.00010450751252086812,
+      "loss": 0.2384,
+      "step": 1435
+    },
+    {
+      "epoch": 4.405841660261338,
+      "grad_norm": 0.7211788892745972,
+      "learning_rate": 0.00010444073455759599,
+      "loss": 0.3192,
+      "step": 1436
+    },
+    {
+      "epoch": 4.40891621829362,
+      "grad_norm": 0.9477794170379639,
+      "learning_rate": 0.00010437395659432388,
+      "loss": 0.2762,
+      "step": 1437
+    },
+    {
+      "epoch": 4.411990776325903,
+      "grad_norm": 0.8108130097389221,
+      "learning_rate": 0.00010430717863105175,
+      "loss": 0.3724,
+      "step": 1438
+    },
+    {
+      "epoch": 4.415065334358186,
+      "grad_norm": 1.231468915939331,
+      "learning_rate": 0.00010424040066777965,
+      "loss": 0.4689,
+      "step": 1439
+    },
+    {
+      "epoch": 4.418139892390469,
+      "grad_norm": 1.2272400856018066,
+      "learning_rate": 0.00010417362270450753,
+      "loss": 0.3773,
+      "step": 1440
+    },
+    {
+      "epoch": 4.4212144504227515,
+      "grad_norm": 0.7169706225395203,
+      "learning_rate": 0.0001041068447412354,
+      "loss": 0.2731,
+      "step": 1441
+    },
+    {
+      "epoch": 4.424289008455035,
+      "grad_norm": 0.568555474281311,
+      "learning_rate": 0.00010404006677796327,
+      "loss": 0.3041,
+      "step": 1442
+    },
+    {
+      "epoch": 4.427363566487317,
+      "grad_norm": 1.2105591297149658,
+      "learning_rate": 0.00010397328881469115,
+      "loss": 0.3463,
+      "step": 1443
+    },
+    {
+      "epoch": 4.4304381245196005,
+      "grad_norm": 0.7139995098114014,
+      "learning_rate": 0.00010390651085141905,
+      "loss": 0.3665,
+      "step": 1444
+    },
+    {
+      "epoch": 4.433512682551883,
+      "grad_norm": 0.6359079480171204,
+      "learning_rate": 0.00010383973288814692,
+      "loss": 0.2739,
+      "step": 1445
+    },
+    {
+      "epoch": 4.436587240584166,
+      "grad_norm": 0.8577691316604614,
+      "learning_rate": 0.0001037729549248748,
+      "loss": 0.2478,
+      "step": 1446
+    },
+    {
+      "epoch": 4.439661798616449,
+      "grad_norm": 0.68791264295578,
+      "learning_rate": 0.00010370617696160268,
+      "loss": 0.337,
+      "step": 1447
+    },
+    {
+      "epoch": 4.442736356648732,
+      "grad_norm": 0.7423458695411682,
+      "learning_rate": 0.00010363939899833055,
+      "loss": 0.3899,
+      "step": 1448
+    },
+    {
+      "epoch": 4.445810914681014,
+      "grad_norm": 0.894343912601471,
+      "learning_rate": 0.00010357262103505843,
+      "loss": 0.3282,
+      "step": 1449
+    },
+    {
+      "epoch": 4.448885472713298,
+      "grad_norm": 0.9872162342071533,
+      "learning_rate": 0.00010350584307178633,
+      "loss": 0.3724,
+      "step": 1450
+    },
+    {
+      "epoch": 4.45196003074558,
+      "grad_norm": 0.9836599230766296,
+      "learning_rate": 0.0001034390651085142,
+      "loss": 0.3555,
+      "step": 1451
+    },
+    {
+      "epoch": 4.455034588777863,
+      "grad_norm": 0.9895578622817993,
+      "learning_rate": 0.00010337228714524207,
+      "loss": 0.3823,
+      "step": 1452
+    },
+    {
+      "epoch": 4.458109146810146,
+      "grad_norm": 1.0904133319854736,
+      "learning_rate": 0.00010330550918196994,
+      "loss": 0.4033,
+      "step": 1453
+    },
+    {
+      "epoch": 4.461183704842429,
+      "grad_norm": 0.6034055352210999,
+      "learning_rate": 0.00010323873121869783,
+      "loss": 0.2676,
+      "step": 1454
+    },
+    {
+      "epoch": 4.464258262874711,
+      "grad_norm": 0.7707822322845459,
+      "learning_rate": 0.00010317195325542572,
+      "loss": 0.3143,
+      "step": 1455
+    },
+    {
+      "epoch": 4.467332820906995,
+      "grad_norm": 1.0982093811035156,
+      "learning_rate": 0.0001031051752921536,
+      "loss": 0.3799,
+      "step": 1456
+    },
+    {
+      "epoch": 4.470407378939277,
+      "grad_norm": 2.1675314903259277,
+      "learning_rate": 0.00010303839732888148,
+      "loss": 0.4698,
+      "step": 1457
+    },
+    {
+      "epoch": 4.47348193697156,
+      "grad_norm": 0.8458796143531799,
+      "learning_rate": 0.00010297161936560935,
+      "loss": 0.5113,
+      "step": 1458
+    },
+    {
+      "epoch": 4.476556495003843,
+      "grad_norm": 0.8346131443977356,
+      "learning_rate": 0.00010290484140233722,
+      "loss": 0.3766,
+      "step": 1459
+    },
+    {
+      "epoch": 4.479631053036126,
+      "grad_norm": 0.7935206890106201,
+      "learning_rate": 0.00010283806343906512,
+      "loss": 0.3153,
+      "step": 1460
+    },
+    {
+      "epoch": 4.482705611068409,
+      "grad_norm": 0.8221637606620789,
+      "learning_rate": 0.000102771285475793,
+      "loss": 0.2635,
+      "step": 1461
+    },
+    {
+      "epoch": 4.485780169100692,
+      "grad_norm": 0.5546371936798096,
+      "learning_rate": 0.00010270450751252087,
+      "loss": 0.2976,
+      "step": 1462
+    },
+    {
+      "epoch": 4.488854727132974,
+      "grad_norm": 1.041944146156311,
+      "learning_rate": 0.00010263772954924876,
+      "loss": 0.4004,
+      "step": 1463
+    },
+    {
+      "epoch": 4.4919292851652575,
+      "grad_norm": 0.827978253364563,
+      "learning_rate": 0.00010257095158597663,
+      "loss": 0.335,
+      "step": 1464
+    },
+    {
+      "epoch": 4.495003843197541,
+      "grad_norm": 0.8025320768356323,
+      "learning_rate": 0.0001025041736227045,
+      "loss": 0.3436,
+      "step": 1465
+    },
+    {
+      "epoch": 4.498078401229823,
+      "grad_norm": 0.7182911038398743,
+      "learning_rate": 0.0001024373956594324,
+      "loss": 0.4948,
+      "step": 1466
+    },
+    {
+      "epoch": 4.5011529592621065,
+      "grad_norm": 0.9388545155525208,
+      "learning_rate": 0.00010237061769616028,
+      "loss": 0.3967,
+      "step": 1467
+    },
+    {
+      "epoch": 4.504227517294389,
+      "grad_norm": 1.0608465671539307,
+      "learning_rate": 0.00010230383973288815,
+      "loss": 0.3166,
+      "step": 1468
+    },
+    {
+      "epoch": 4.507302075326672,
+      "grad_norm": 0.9616206288337708,
+      "learning_rate": 0.00010223706176961602,
+      "loss": 0.4008,
+      "step": 1469
+    },
+    {
+      "epoch": 4.510376633358955,
+      "grad_norm": 0.689566433429718,
+      "learning_rate": 0.0001021702838063439,
+      "loss": 0.3611,
+      "step": 1470
+    },
+    {
+      "epoch": 4.513451191391238,
+      "grad_norm": 0.612333357334137,
+      "learning_rate": 0.0001021035058430718,
+      "loss": 0.3755,
+      "step": 1471
+    },
+    {
+      "epoch": 4.51652574942352,
+      "grad_norm": 0.7102506160736084,
+      "learning_rate": 0.00010203672787979967,
+      "loss": 0.3566,
+      "step": 1472
+    },
+    {
+      "epoch": 4.5196003074558035,
+      "grad_norm": 0.7646180391311646,
+      "learning_rate": 0.00010196994991652756,
+      "loss": 0.2881,
+      "step": 1473
+    },
+    {
+      "epoch": 4.522674865488086,
+      "grad_norm": 0.8247338533401489,
+      "learning_rate": 0.00010190317195325543,
+      "loss": 0.3961,
+      "step": 1474
+    },
+    {
+      "epoch": 4.525749423520369,
+      "grad_norm": 0.622003972530365,
+      "learning_rate": 0.0001018363939899833,
+      "loss": 0.3407,
+      "step": 1475
+    },
+    {
+      "epoch": 4.528823981552652,
+      "grad_norm": 0.6311368346214294,
+      "learning_rate": 0.0001017696160267112,
+      "loss": 0.2872,
+      "step": 1476
+    },
+    {
+      "epoch": 4.531898539584935,
+      "grad_norm": 0.8423951268196106,
+      "learning_rate": 0.00010170283806343908,
+      "loss": 0.3465,
+      "step": 1477
+    },
+    {
+      "epoch": 4.534973097617217,
+      "grad_norm": 0.5665594339370728,
+      "learning_rate": 0.00010163606010016695,
+      "loss": 0.3414,
+      "step": 1478
+    },
+    {
+      "epoch": 4.538047655649501,
+      "grad_norm": 0.8207141160964966,
+      "learning_rate": 0.00010156928213689482,
+      "loss": 0.4187,
+      "step": 1479
+    },
+    {
+      "epoch": 4.541122213681783,
+      "grad_norm": 0.5721847414970398,
+      "learning_rate": 0.00010150250417362271,
+      "loss": 0.2909,
+      "step": 1480
+    },
+    {
+      "epoch": 4.544196771714066,
+      "grad_norm": 0.837468147277832,
+      "learning_rate": 0.00010143572621035058,
+      "loss": 0.4037,
+      "step": 1481
+    },
+    {
+      "epoch": 4.547271329746349,
+      "grad_norm": 0.7777520418167114,
+      "learning_rate": 0.00010136894824707848,
+      "loss": 0.4051,
+      "step": 1482
+    },
+    {
+      "epoch": 4.550345887778632,
+      "grad_norm": 1.183840274810791,
+      "learning_rate": 0.00010130217028380636,
+      "loss": 0.414,
+      "step": 1483
+    },
+    {
+      "epoch": 4.553420445810914,
+      "grad_norm": 0.9845882654190063,
+      "learning_rate": 0.00010123539232053423,
+      "loss": 0.3536,
+      "step": 1484
+    },
+    {
+      "epoch": 4.556495003843198,
+      "grad_norm": 0.6358274817466736,
+      "learning_rate": 0.0001011686143572621,
+      "loss": 0.3828,
+      "step": 1485
+    },
+    {
+      "epoch": 4.55956956187548,
+      "grad_norm": 0.8890843391418457,
+      "learning_rate": 0.00010110183639398998,
+      "loss": 0.3399,
+      "step": 1486
+    },
+    {
+      "epoch": 4.562644119907763,
+      "grad_norm": 0.894417941570282,
+      "learning_rate": 0.00010103505843071788,
+      "loss": 0.4613,
+      "step": 1487
+    },
+    {
+      "epoch": 4.565718677940046,
+      "grad_norm": 0.8622507452964783,
+      "learning_rate": 0.00010096828046744575,
+      "loss": 0.2838,
+      "step": 1488
+    },
+    {
+      "epoch": 4.568793235972329,
+      "grad_norm": 0.8701838850975037,
+      "learning_rate": 0.00010090150250417362,
+      "loss": 0.3169,
+      "step": 1489
+    },
+    {
+      "epoch": 4.5718677940046115,
+      "grad_norm": 0.8100345134735107,
+      "learning_rate": 0.00010083472454090151,
+      "loss": 0.3649,
+      "step": 1490
+    },
+    {
+      "epoch": 4.574942352036895,
+      "grad_norm": 0.8611205220222473,
+      "learning_rate": 0.00010076794657762938,
+      "loss": 0.2422,
+      "step": 1491
+    },
+    {
+      "epoch": 4.578016910069177,
+      "grad_norm": 0.8310852646827698,
+      "learning_rate": 0.00010070116861435728,
+      "loss": 0.3244,
+      "step": 1492
+    },
+    {
+      "epoch": 4.5810914681014605,
+      "grad_norm": 0.7983706593513489,
+      "learning_rate": 0.00010063439065108516,
+      "loss": 0.3132,
+      "step": 1493
+    },
+    {
+      "epoch": 4.584166026133743,
+      "grad_norm": 0.6380778551101685,
+      "learning_rate": 0.00010056761268781303,
+      "loss": 0.3557,
+      "step": 1494
+    },
+    {
+      "epoch": 4.587240584166026,
+      "grad_norm": 0.81980299949646,
+      "learning_rate": 0.0001005008347245409,
+      "loss": 0.4001,
+      "step": 1495
+    },
+    {
+      "epoch": 4.590315142198309,
+      "grad_norm": 1.0842241048812866,
+      "learning_rate": 0.00010043405676126878,
+      "loss": 0.3951,
+      "step": 1496
+    },
+    {
+      "epoch": 4.593389700230592,
+      "grad_norm": 0.7225966453552246,
+      "learning_rate": 0.00010036727879799666,
+      "loss": 0.3051,
+      "step": 1497
+    },
+    {
+      "epoch": 4.596464258262875,
+      "grad_norm": 0.7823684811592102,
+      "learning_rate": 0.00010030050083472455,
+      "loss": 0.3113,
+      "step": 1498
+    },
+    {
+      "epoch": 4.599538816295158,
+      "grad_norm": 0.8264310359954834,
+      "learning_rate": 0.00010023372287145244,
+      "loss": 0.4567,
+      "step": 1499
+    },
+    {
+      "epoch": 4.60261337432744,
+      "grad_norm": 1.0230191946029663,
+      "learning_rate": 0.00010016694490818031,
+      "loss": 0.5121,
+      "step": 1500
+    },
+    {
+      "epoch": 4.605687932359723,
+      "grad_norm": 0.7866786122322083,
+      "learning_rate": 0.00010010016694490818,
+      "loss": 0.2891,
+      "step": 1501
+    },
+    {
+      "epoch": 4.608762490392007,
+      "grad_norm": 0.7644535303115845,
+      "learning_rate": 0.00010003338898163605,
+      "loss": 0.2806,
+      "step": 1502
+    },
+    {
+      "epoch": 4.611837048424289,
+      "grad_norm": 0.6497211456298828,
+      "learning_rate": 9.996661101836394e-05,
+      "loss": 0.3869,
+      "step": 1503
+    },
+    {
+      "epoch": 4.614911606456571,
+      "grad_norm": 0.694921612739563,
+      "learning_rate": 9.989983305509183e-05,
+      "loss": 0.3874,
+      "step": 1504
+    },
+    {
+      "epoch": 4.617986164488855,
+      "grad_norm": 0.8609017133712769,
+      "learning_rate": 9.98330550918197e-05,
+      "loss": 0.3816,
+      "step": 1505
+    },
+    {
+      "epoch": 4.621060722521138,
+      "grad_norm": 0.6470094323158264,
+      "learning_rate": 9.976627712854757e-05,
+      "loss": 0.321,
+      "step": 1506
+    },
+    {
+      "epoch": 4.62413528055342,
+      "grad_norm": 0.9883415102958679,
+      "learning_rate": 9.969949916527546e-05,
+      "loss": 0.4452,
+      "step": 1507
+    },
+    {
+      "epoch": 4.627209838585704,
+      "grad_norm": 0.9782819151878357,
+      "learning_rate": 9.963272120200335e-05,
+      "loss": 0.3273,
+      "step": 1508
+    },
+    {
+      "epoch": 4.630284396617986,
+      "grad_norm": 1.2475955486297607,
+      "learning_rate": 9.956594323873122e-05,
+      "loss": 0.3471,
+      "step": 1509
+    },
+    {
+      "epoch": 4.633358954650269,
+      "grad_norm": 0.6427537202835083,
+      "learning_rate": 9.949916527545911e-05,
+      "loss": 0.3775,
+      "step": 1510
+    },
+    {
+      "epoch": 4.636433512682552,
+      "grad_norm": 0.7873066067695618,
+      "learning_rate": 9.943238731218698e-05,
+      "loss": 0.3011,
+      "step": 1511
+    },
+    {
+      "epoch": 4.639508070714835,
+      "grad_norm": 0.71328204870224,
+      "learning_rate": 9.936560934891487e-05,
+      "loss": 0.3559,
+      "step": 1512
+    },
+    {
+      "epoch": 4.6425826287471175,
+      "grad_norm": 0.7703279256820679,
+      "learning_rate": 9.929883138564274e-05,
+      "loss": 0.2484,
+      "step": 1513
+    },
+    {
+      "epoch": 4.645657186779401,
+      "grad_norm": 0.8112149238586426,
+      "learning_rate": 9.923205342237061e-05,
+      "loss": 0.2511,
+      "step": 1514
+    },
+    {
+      "epoch": 4.648731744811683,
+      "grad_norm": 0.729215681552887,
+      "learning_rate": 9.91652754590985e-05,
+      "loss": 0.3465,
+      "step": 1515
+    },
+    {
+      "epoch": 4.6518063028439665,
+      "grad_norm": 0.8515496850013733,
+      "learning_rate": 9.909849749582639e-05,
+      "loss": 0.3523,
+      "step": 1516
+    },
+    {
+      "epoch": 4.654880860876249,
+      "grad_norm": 0.9650899171829224,
+      "learning_rate": 9.903171953255426e-05,
+      "loss": 0.2568,
+      "step": 1517
+    },
+    {
+      "epoch": 4.657955418908532,
+      "grad_norm": 1.0580472946166992,
+      "learning_rate": 9.896494156928215e-05,
+      "loss": 0.3365,
+      "step": 1518
+    },
+    {
+      "epoch": 4.661029976940815,
+      "grad_norm": 0.9089365005493164,
+      "learning_rate": 9.889816360601002e-05,
+      "loss": 0.3948,
+      "step": 1519
+    },
+    {
+      "epoch": 4.664104534973098,
+      "grad_norm": 0.7647799849510193,
+      "learning_rate": 9.883138564273791e-05,
+      "loss": 0.4183,
+      "step": 1520
+    },
+    {
+      "epoch": 4.66717909300538,
+      "grad_norm": 0.9137128591537476,
+      "learning_rate": 9.876460767946578e-05,
+      "loss": 0.4112,
+      "step": 1521
+    },
+    {
+      "epoch": 4.6702536510376635,
+      "grad_norm": 0.7739920616149902,
+      "learning_rate": 9.869782971619365e-05,
+      "loss": 0.3357,
+      "step": 1522
+    },
+    {
+      "epoch": 4.673328209069946,
+      "grad_norm": 0.90510493516922,
+      "learning_rate": 9.863105175292154e-05,
+      "loss": 0.3252,
+      "step": 1523
+    },
+    {
+      "epoch": 4.676402767102229,
+      "grad_norm": 0.7696104645729065,
+      "learning_rate": 9.856427378964941e-05,
+      "loss": 0.3981,
+      "step": 1524
+    },
+    {
+      "epoch": 4.679477325134512,
+      "grad_norm": 0.8543115854263306,
+      "learning_rate": 9.84974958263773e-05,
+      "loss": 0.3611,
+      "step": 1525
+    },
+    {
+      "epoch": 4.682551883166795,
+      "grad_norm": 0.6455523371696472,
+      "learning_rate": 9.843071786310519e-05,
+      "loss": 0.3056,
+      "step": 1526
+    },
+    {
+      "epoch": 4.685626441199077,
+      "grad_norm": 0.827754020690918,
+      "learning_rate": 9.836393989983306e-05,
+      "loss": 0.3799,
+      "step": 1527
+    },
+    {
+      "epoch": 4.688700999231361,
+      "grad_norm": 0.7233520746231079,
+      "learning_rate": 9.829716193656095e-05,
+      "loss": 0.3514,
+      "step": 1528
+    },
+    {
+      "epoch": 4.691775557263643,
+      "grad_norm": 0.6044474244117737,
+      "learning_rate": 9.823038397328882e-05,
+      "loss": 0.3335,
+      "step": 1529
+    },
+    {
+      "epoch": 4.694850115295926,
+      "grad_norm": 0.938494861125946,
+      "learning_rate": 9.816360601001669e-05,
+      "loss": 0.3564,
+      "step": 1530
+    },
+    {
+      "epoch": 4.697924673328209,
+      "grad_norm": 0.7700350880622864,
+      "learning_rate": 9.809682804674458e-05,
+      "loss": 0.3767,
+      "step": 1531
+    },
+    {
+      "epoch": 4.700999231360492,
+      "grad_norm": 0.774013876914978,
+      "learning_rate": 9.803005008347245e-05,
+      "loss": 0.4254,
+      "step": 1532
+    },
+    {
+      "epoch": 4.704073789392774,
+      "grad_norm": 0.987633228302002,
+      "learning_rate": 9.796327212020034e-05,
+      "loss": 0.3796,
+      "step": 1533
+    },
+    {
+      "epoch": 4.707148347425058,
+      "grad_norm": 0.8716994524002075,
+      "learning_rate": 9.789649415692823e-05,
+      "loss": 0.3461,
+      "step": 1534
+    },
+    {
+      "epoch": 4.71022290545734,
+      "grad_norm": 1.3219870328903198,
+      "learning_rate": 9.78297161936561e-05,
+      "loss": 0.3371,
+      "step": 1535
+    },
+    {
+      "epoch": 4.713297463489623,
+      "grad_norm": 0.7755498886108398,
+      "learning_rate": 9.776293823038399e-05,
+      "loss": 0.2963,
+      "step": 1536
+    },
+    {
+      "epoch": 4.716372021521906,
+      "grad_norm": 0.8221181035041809,
+      "learning_rate": 9.769616026711186e-05,
+      "loss": 0.4356,
+      "step": 1537
+    },
+    {
+      "epoch": 4.719446579554189,
+      "grad_norm": 0.7704976797103882,
+      "learning_rate": 9.762938230383973e-05,
+      "loss": 0.3544,
+      "step": 1538
+    },
+    {
+      "epoch": 4.722521137586472,
+      "grad_norm": 0.9037477970123291,
+      "learning_rate": 9.756260434056762e-05,
+      "loss": 0.3885,
+      "step": 1539
+    },
+    {
+      "epoch": 4.725595695618755,
+      "grad_norm": 0.7319322228431702,
+      "learning_rate": 9.749582637729549e-05,
+      "loss": 0.3295,
+      "step": 1540
+    },
+    {
+      "epoch": 4.728670253651037,
+      "grad_norm": 1.0239067077636719,
+      "learning_rate": 9.742904841402337e-05,
+      "loss": 0.4706,
+      "step": 1541
+    },
+    {
+      "epoch": 4.7317448116833205,
+      "grad_norm": 0.8769973516464233,
+      "learning_rate": 9.736227045075125e-05,
+      "loss": 0.3697,
+      "step": 1542
+    },
+    {
+      "epoch": 4.734819369715604,
+      "grad_norm": 0.9179707169532776,
+      "learning_rate": 9.729549248747914e-05,
+      "loss": 0.4451,
+      "step": 1543
+    },
+    {
+      "epoch": 4.737893927747886,
+      "grad_norm": 0.8629128932952881,
+      "learning_rate": 9.722871452420703e-05,
+      "loss": 0.3772,
+      "step": 1544
+    },
+    {
+      "epoch": 4.740968485780169,
+      "grad_norm": 0.7455741763114929,
+      "learning_rate": 9.71619365609349e-05,
+      "loss": 0.3695,
+      "step": 1545
+    },
+    {
+      "epoch": 4.744043043812452,
+      "grad_norm": 0.8288558125495911,
+      "learning_rate": 9.709515859766277e-05,
+      "loss": 0.4511,
+      "step": 1546
+    },
+    {
+      "epoch": 4.747117601844735,
+      "grad_norm": 0.6822009682655334,
+      "learning_rate": 9.702838063439066e-05,
+      "loss": 0.3832,
+      "step": 1547
+    },
+    {
+      "epoch": 4.750192159877018,
+      "grad_norm": 0.7247387766838074,
+      "learning_rate": 9.696160267111853e-05,
+      "loss": 0.3628,
+      "step": 1548
+    },
+    {
+      "epoch": 4.753266717909301,
+      "grad_norm": 0.7800189256668091,
+      "learning_rate": 9.68948247078464e-05,
+      "loss": 0.3548,
+      "step": 1549
+    },
+    {
+      "epoch": 4.756341275941583,
+      "grad_norm": 1.8545207977294922,
+      "learning_rate": 9.682804674457429e-05,
+      "loss": 0.3329,
+      "step": 1550
+    },
+    {
+      "epoch": 4.759415833973867,
+      "grad_norm": 0.8365579843521118,
+      "learning_rate": 9.676126878130218e-05,
+      "loss": 0.3358,
+      "step": 1551
+    },
+    {
+      "epoch": 4.762490392006149,
+      "grad_norm": 0.8753309845924377,
+      "learning_rate": 9.669449081803006e-05,
+      "loss": 0.404,
+      "step": 1552
+    },
+    {
+      "epoch": 4.765564950038432,
+      "grad_norm": 1.0423812866210938,
+      "learning_rate": 9.662771285475794e-05,
+      "loss": 0.393,
+      "step": 1553
+    },
+    {
+      "epoch": 4.768639508070715,
+      "grad_norm": 0.9028570055961609,
+      "learning_rate": 9.656093489148581e-05,
+      "loss": 0.3943,
+      "step": 1554
+    },
+    {
+      "epoch": 4.771714066102998,
+      "grad_norm": 0.9643226265907288,
+      "learning_rate": 9.64941569282137e-05,
+      "loss": 0.4855,
+      "step": 1555
+    },
+    {
+      "epoch": 4.77478862413528,
+      "grad_norm": 0.9107238054275513,
+      "learning_rate": 9.642737896494157e-05,
+      "loss": 0.4372,
+      "step": 1556
+    },
+    {
+      "epoch": 4.777863182167564,
+      "grad_norm": 0.725831925868988,
+      "learning_rate": 9.636060100166944e-05,
+      "loss": 0.3901,
+      "step": 1557
+    },
+    {
+      "epoch": 4.780937740199846,
+      "grad_norm": 0.8662984371185303,
+      "learning_rate": 9.629382303839733e-05,
+      "loss": 0.3486,
+      "step": 1558
+    },
+    {
+      "epoch": 4.784012298232129,
+      "grad_norm": 0.6875986456871033,
+      "learning_rate": 9.62270450751252e-05,
+      "loss": 0.3593,
+      "step": 1559
+    },
+    {
+      "epoch": 4.787086856264412,
+      "grad_norm": 0.6532884836196899,
+      "learning_rate": 9.616026711185309e-05,
+      "loss": 0.3615,
+      "step": 1560
+    },
+    {
+      "epoch": 4.790161414296695,
+      "grad_norm": 0.7729180455207825,
+      "learning_rate": 9.609348914858098e-05,
+      "loss": 0.3268,
+      "step": 1561
+    },
+    {
+      "epoch": 4.7932359723289775,
+      "grad_norm": 1.191616177558899,
+      "learning_rate": 9.602671118530885e-05,
+      "loss": 0.4502,
+      "step": 1562
+    },
+    {
+      "epoch": 4.796310530361261,
+      "grad_norm": 0.7924370765686035,
+      "learning_rate": 9.595993322203674e-05,
+      "loss": 0.4504,
+      "step": 1563
+    },
+    {
+      "epoch": 4.799385088393543,
+      "grad_norm": 0.833450973033905,
+      "learning_rate": 9.589315525876461e-05,
+      "loss": 0.4081,
+      "step": 1564
+    },
+    {
+      "epoch": 4.8024596464258265,
+      "grad_norm": 1.320788025856018,
+      "learning_rate": 9.582637729549248e-05,
+      "loss": 0.3704,
+      "step": 1565
+    },
+    {
+      "epoch": 4.805534204458109,
+      "grad_norm": 0.9833523035049438,
+      "learning_rate": 9.575959933222037e-05,
+      "loss": 0.4213,
+      "step": 1566
+    },
+    {
+      "epoch": 4.808608762490392,
+      "grad_norm": 0.7859238386154175,
+      "learning_rate": 9.569282136894824e-05,
+      "loss": 0.4225,
+      "step": 1567
+    },
+    {
+      "epoch": 4.811683320522675,
+      "grad_norm": 0.6075074076652527,
+      "learning_rate": 9.562604340567613e-05,
+      "loss": 0.3745,
+      "step": 1568
+    },
+    {
+      "epoch": 4.814757878554958,
+      "grad_norm": 0.8117219805717468,
+      "learning_rate": 9.555926544240402e-05,
+      "loss": 0.3762,
+      "step": 1569
+    },
+    {
+      "epoch": 4.81783243658724,
+      "grad_norm": 0.7648201584815979,
+      "learning_rate": 9.549248747913189e-05,
+      "loss": 0.3781,
+      "step": 1570
+    },
+    {
+      "epoch": 4.8209069946195235,
+      "grad_norm": 0.8862608671188354,
+      "learning_rate": 9.542570951585978e-05,
+      "loss": 0.3936,
+      "step": 1571
+    },
+    {
+      "epoch": 4.823981552651806,
+      "grad_norm": 0.8977257609367371,
+      "learning_rate": 9.535893155258765e-05,
+      "loss": 0.4147,
+      "step": 1572
+    },
+    {
+      "epoch": 4.827056110684089,
+      "grad_norm": 0.6795991659164429,
+      "learning_rate": 9.529215358931554e-05,
+      "loss": 0.4196,
+      "step": 1573
+    },
+    {
+      "epoch": 4.830130668716372,
+      "grad_norm": 0.6213774085044861,
+      "learning_rate": 9.522537562604341e-05,
+      "loss": 0.3451,
+      "step": 1574
+    },
+    {
+      "epoch": 4.833205226748655,
+      "grad_norm": 0.8230448961257935,
+      "learning_rate": 9.515859766277128e-05,
+      "loss": 0.3525,
+      "step": 1575
+    },
+    {
+      "epoch": 4.836279784780938,
+      "grad_norm": 1.0086671113967896,
+      "learning_rate": 9.509181969949917e-05,
+      "loss": 0.347,
+      "step": 1576
+    },
+    {
+      "epoch": 4.839354342813221,
+      "grad_norm": 1.0692055225372314,
+      "learning_rate": 9.502504173622706e-05,
+      "loss": 0.3153,
+      "step": 1577
+    },
+    {
+      "epoch": 4.842428900845503,
+      "grad_norm": 0.7910997271537781,
+      "learning_rate": 9.495826377295493e-05,
+      "loss": 0.3721,
+      "step": 1578
+    },
+    {
+      "epoch": 4.845503458877786,
+      "grad_norm": 1.0143672227859497,
+      "learning_rate": 9.489148580968282e-05,
+      "loss": 0.2858,
+      "step": 1579
+    },
+    {
+      "epoch": 4.84857801691007,
+      "grad_norm": 0.8259998559951782,
+      "learning_rate": 9.482470784641069e-05,
+      "loss": 0.4454,
+      "step": 1580
+    },
+    {
+      "epoch": 4.851652574942352,
+      "grad_norm": 0.9319655299186707,
+      "learning_rate": 9.475792988313858e-05,
+      "loss": 0.2912,
+      "step": 1581
+    },
+    {
+      "epoch": 4.854727132974634,
+      "grad_norm": 0.7429136633872986,
+      "learning_rate": 9.469115191986645e-05,
+      "loss": 0.402,
+      "step": 1582
+    },
+    {
+      "epoch": 4.857801691006918,
+      "grad_norm": 0.96834397315979,
+      "learning_rate": 9.462437395659432e-05,
+      "loss": 0.5215,
+      "step": 1583
+    },
+    {
+      "epoch": 4.860876249039201,
+      "grad_norm": 0.7908016443252563,
+      "learning_rate": 9.455759599332221e-05,
+      "loss": 0.4318,
+      "step": 1584
+    },
+    {
+      "epoch": 4.863950807071483,
+      "grad_norm": 0.7773927450180054,
+      "learning_rate": 9.449081803005008e-05,
+      "loss": 0.2145,
+      "step": 1585
+    },
+    {
+      "epoch": 4.867025365103767,
+      "grad_norm": 0.8596830368041992,
+      "learning_rate": 9.442404006677797e-05,
+      "loss": 0.3767,
+      "step": 1586
+    },
+    {
+      "epoch": 4.870099923136049,
+      "grad_norm": 0.9522383213043213,
+      "learning_rate": 9.435726210350586e-05,
+      "loss": 0.4253,
+      "step": 1587
+    },
+    {
+      "epoch": 4.873174481168332,
+      "grad_norm": 0.878300666809082,
+      "learning_rate": 9.429048414023373e-05,
+      "loss": 0.4303,
+      "step": 1588
+    },
+    {
+      "epoch": 4.876249039200615,
+      "grad_norm": 1.1075037717819214,
+      "learning_rate": 9.422370617696162e-05,
+      "loss": 0.2863,
+      "step": 1589
+    },
+    {
+      "epoch": 4.879323597232898,
+      "grad_norm": 1.001308560371399,
+      "learning_rate": 9.415692821368949e-05,
+      "loss": 0.3345,
+      "step": 1590
+    },
+    {
+      "epoch": 4.8823981552651805,
+      "grad_norm": 1.6283379793167114,
+      "learning_rate": 9.409015025041736e-05,
+      "loss": 0.4163,
+      "step": 1591
+    },
+    {
+      "epoch": 4.885472713297464,
+      "grad_norm": 1.2115163803100586,
+      "learning_rate": 9.402337228714525e-05,
+      "loss": 0.3277,
+      "step": 1592
+    },
+    {
+      "epoch": 4.888547271329746,
+      "grad_norm": 0.9039791226387024,
+      "learning_rate": 9.395659432387312e-05,
+      "loss": 0.3978,
+      "step": 1593
+    },
+    {
+      "epoch": 4.8916218293620295,
+      "grad_norm": 0.9173548221588135,
+      "learning_rate": 9.388981636060101e-05,
+      "loss": 0.3724,
+      "step": 1594
+    },
+    {
+      "epoch": 4.894696387394312,
+      "grad_norm": 1.280786395072937,
+      "learning_rate": 9.38230383973289e-05,
+      "loss": 0.504,
+      "step": 1595
+    },
+    {
+      "epoch": 4.897770945426595,
+      "grad_norm": 1.13883638381958,
+      "learning_rate": 9.375626043405677e-05,
+      "loss": 0.2902,
+      "step": 1596
+    },
+    {
+      "epoch": 4.900845503458878,
+      "grad_norm": 0.7668140530586243,
+      "learning_rate": 9.368948247078465e-05,
+      "loss": 0.3323,
+      "step": 1597
+    },
+    {
+      "epoch": 4.903920061491161,
+      "grad_norm": 0.7449612021446228,
+      "learning_rate": 9.362270450751253e-05,
+      "loss": 0.3363,
+      "step": 1598
+    },
+    {
+      "epoch": 4.906994619523443,
+      "grad_norm": 0.9896752834320068,
+      "learning_rate": 9.35559265442404e-05,
+      "loss": 0.4381,
+      "step": 1599
+    },
+    {
+      "epoch": 4.910069177555727,
+      "grad_norm": 0.8678106069564819,
+      "learning_rate": 9.348914858096829e-05,
+      "loss": 0.3658,
+      "step": 1600
+    },
+    {
+      "epoch": 4.913143735588009,
+      "grad_norm": 0.9787524342536926,
+      "learning_rate": 9.342237061769616e-05,
+      "loss": 0.2971,
+      "step": 1601
+    },
+    {
+      "epoch": 4.916218293620292,
+      "grad_norm": 0.8093075752258301,
+      "learning_rate": 9.335559265442403e-05,
+      "loss": 0.3491,
+      "step": 1602
+    },
+    {
+      "epoch": 4.919292851652575,
+      "grad_norm": 1.5801624059677124,
+      "learning_rate": 9.328881469115192e-05,
+      "loss": 0.4743,
+      "step": 1603
+    },
+    {
+      "epoch": 4.922367409684858,
+      "grad_norm": 0.6919710040092468,
+      "learning_rate": 9.322203672787981e-05,
+      "loss": 0.355,
+      "step": 1604
+    },
+    {
+      "epoch": 4.92544196771714,
+      "grad_norm": 0.8053343892097473,
+      "learning_rate": 9.31552587646077e-05,
+      "loss": 0.3963,
+      "step": 1605
+    },
+    {
+      "epoch": 4.928516525749424,
+      "grad_norm": 0.8745597004890442,
+      "learning_rate": 9.308848080133557e-05,
+      "loss": 0.445,
+      "step": 1606
+    },
+    {
+      "epoch": 4.931591083781706,
+      "grad_norm": 0.6608087420463562,
+      "learning_rate": 9.302170283806344e-05,
+      "loss": 0.3536,
+      "step": 1607
+    },
+    {
+      "epoch": 4.934665641813989,
+      "grad_norm": 0.6686768531799316,
+      "learning_rate": 9.295492487479133e-05,
+      "loss": 0.3352,
+      "step": 1608
+    },
+    {
+      "epoch": 4.937740199846272,
+      "grad_norm": 1.0998315811157227,
+      "learning_rate": 9.28881469115192e-05,
+      "loss": 0.4367,
+      "step": 1609
+    },
+    {
+      "epoch": 4.940814757878555,
+      "grad_norm": 1.2435147762298584,
+      "learning_rate": 9.282136894824707e-05,
+      "loss": 0.4703,
+      "step": 1610
+    },
+    {
+      "epoch": 4.9438893159108375,
+      "grad_norm": 0.6086277365684509,
+      "learning_rate": 9.275459098497496e-05,
+      "loss": 0.3431,
+      "step": 1611
+    },
+    {
+      "epoch": 4.946963873943121,
+      "grad_norm": 0.6807745099067688,
+      "learning_rate": 9.268781302170285e-05,
+      "loss": 0.3895,
+      "step": 1612
+    },
+    {
+      "epoch": 4.950038431975403,
+      "grad_norm": 0.9875394105911255,
+      "learning_rate": 9.262103505843073e-05,
+      "loss": 0.4324,
+      "step": 1613
+    },
+    {
+      "epoch": 4.9531129900076865,
+      "grad_norm": 0.7492454051971436,
+      "learning_rate": 9.255425709515861e-05,
+      "loss": 0.3759,
+      "step": 1614
+    },
+    {
+      "epoch": 4.956187548039969,
+      "grad_norm": 0.8571634292602539,
+      "learning_rate": 9.248747913188648e-05,
+      "loss": 0.465,
+      "step": 1615
+    },
+    {
+      "epoch": 4.959262106072252,
+      "grad_norm": 0.8077588677406311,
+      "learning_rate": 9.242070116861437e-05,
+      "loss": 0.4365,
+      "step": 1616
+    },
+    {
+      "epoch": 4.9623366641045354,
+      "grad_norm": 1.1906275749206543,
+      "learning_rate": 9.235392320534224e-05,
+      "loss": 0.4708,
+      "step": 1617
+    },
+    {
+      "epoch": 4.965411222136818,
+      "grad_norm": 0.8662446737289429,
+      "learning_rate": 9.228714524207011e-05,
+      "loss": 0.3778,
+      "step": 1618
+    },
+    {
+      "epoch": 4.9684857801691,
+      "grad_norm": 1.0619062185287476,
+      "learning_rate": 9.2220367278798e-05,
+      "loss": 0.4452,
+      "step": 1619
+    },
+    {
+      "epoch": 4.9715603382013835,
+      "grad_norm": 0.7930029034614563,
+      "learning_rate": 9.215358931552587e-05,
+      "loss": 0.4003,
+      "step": 1620
+    },
+    {
+      "epoch": 4.974634896233667,
+      "grad_norm": 0.7096540331840515,
+      "learning_rate": 9.208681135225376e-05,
+      "loss": 0.3177,
+      "step": 1621
+    },
+    {
+      "epoch": 4.977709454265949,
+      "grad_norm": 0.673022985458374,
+      "learning_rate": 9.202003338898165e-05,
+      "loss": 0.4044,
+      "step": 1622
+    },
+    {
+      "epoch": 4.980784012298232,
+      "grad_norm": 0.7269219756126404,
+      "learning_rate": 9.195325542570952e-05,
+      "loss": 0.3602,
+      "step": 1623
+    },
+    {
+      "epoch": 4.983858570330515,
+      "grad_norm": 0.617123007774353,
+      "learning_rate": 9.18864774624374e-05,
+      "loss": 0.2814,
+      "step": 1624
+    },
+    {
+      "epoch": 4.986933128362798,
+      "grad_norm": 1.1843900680541992,
+      "learning_rate": 9.181969949916528e-05,
+      "loss": 0.4494,
+      "step": 1625
+    },
+    {
+      "epoch": 4.990007686395081,
+      "grad_norm": 0.8236628770828247,
+      "learning_rate": 9.175292153589315e-05,
+      "loss": 0.394,
+      "step": 1626
+    },
+    {
+      "epoch": 4.993082244427364,
+      "grad_norm": 0.7394270896911621,
+      "learning_rate": 9.168614357262104e-05,
+      "loss": 0.3729,
+      "step": 1627
+    },
+    {
+      "epoch": 4.996156802459646,
+      "grad_norm": 1.1829383373260498,
+      "learning_rate": 9.161936560934891e-05,
+      "loss": 0.4782,
+      "step": 1628
+    },
+    {
+      "epoch": 4.99923136049193,
+      "grad_norm": 0.8535853028297424,
+      "learning_rate": 9.15525876460768e-05,
+      "loss": 0.3974,
+      "step": 1629
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 2.5640904903411865,
+      "learning_rate": 9.148580968280469e-05,
+      "loss": 0.4622,
+      "step": 1630
+    },
+    {
+      "epoch": 5.003074558032283,
+      "grad_norm": 0.572605311870575,
+      "learning_rate": 9.141903171953256e-05,
+      "loss": 0.2781,
+      "step": 1631
+    },
+    {
+      "epoch": 5.006149116064566,
+      "grad_norm": 0.623123824596405,
+      "learning_rate": 9.135225375626045e-05,
+      "loss": 0.333,
+      "step": 1632
+    },
+    {
+      "epoch": 5.009223674096849,
+      "grad_norm": 0.6070964932441711,
+      "learning_rate": 9.128547579298832e-05,
+      "loss": 0.2586,
+      "step": 1633
+    },
+    {
+      "epoch": 5.012298232129131,
+      "grad_norm": 0.49212580919265747,
+      "learning_rate": 9.121869782971619e-05,
+      "loss": 0.205,
+      "step": 1634
+    },
+    {
+      "epoch": 5.015372790161415,
+      "grad_norm": 0.6224344372749329,
+      "learning_rate": 9.115191986644408e-05,
+      "loss": 0.3319,
+      "step": 1635
+    },
+    {
+      "epoch": 5.018447348193697,
+      "grad_norm": 0.9165223240852356,
+      "learning_rate": 9.108514190317195e-05,
+      "loss": 0.4182,
+      "step": 1636
+    },
+    {
+      "epoch": 5.02152190622598,
+      "grad_norm": 0.4661840498447418,
+      "learning_rate": 9.101836393989984e-05,
+      "loss": 0.2065,
+      "step": 1637
+    },
+    {
+      "epoch": 5.024596464258263,
+      "grad_norm": 0.5676157474517822,
+      "learning_rate": 9.095158597662771e-05,
+      "loss": 0.2128,
+      "step": 1638
+    },
+    {
+      "epoch": 5.027671022290546,
+      "grad_norm": 0.7022117972373962,
+      "learning_rate": 9.08848080133556e-05,
+      "loss": 0.3238,
+      "step": 1639
+    },
+    {
+      "epoch": 5.0307455803228285,
+      "grad_norm": 0.5956372022628784,
+      "learning_rate": 9.081803005008348e-05,
+      "loss": 0.3295,
+      "step": 1640
+    },
+    {
+      "epoch": 5.033820138355112,
+      "grad_norm": 0.6279803514480591,
+      "learning_rate": 9.075125208681136e-05,
+      "loss": 0.2878,
+      "step": 1641
+    },
+    {
+      "epoch": 5.036894696387394,
+      "grad_norm": 0.624049961566925,
+      "learning_rate": 9.068447412353923e-05,
+      "loss": 0.2526,
+      "step": 1642
+    },
+    {
+      "epoch": 5.0399692544196775,
+      "grad_norm": 1.4161157608032227,
+      "learning_rate": 9.061769616026712e-05,
+      "loss": 0.3716,
+      "step": 1643
+    },
+    {
+      "epoch": 5.04304381245196,
+      "grad_norm": 0.6286519765853882,
+      "learning_rate": 9.055091819699499e-05,
+      "loss": 0.1649,
+      "step": 1644
+    },
+    {
+      "epoch": 5.046118370484243,
+      "grad_norm": 0.9992043972015381,
+      "learning_rate": 9.048414023372288e-05,
+      "loss": 0.3121,
+      "step": 1645
+    },
+    {
+      "epoch": 5.049192928516526,
+      "grad_norm": 0.6721956133842468,
+      "learning_rate": 9.041736227045075e-05,
+      "loss": 0.3164,
+      "step": 1646
+    },
+    {
+      "epoch": 5.052267486548809,
+      "grad_norm": 0.9115797877311707,
+      "learning_rate": 9.035058430717864e-05,
+      "loss": 0.3463,
+      "step": 1647
+    },
+    {
+      "epoch": 5.055342044581091,
+      "grad_norm": 1.3086482286453247,
+      "learning_rate": 9.028380634390652e-05,
+      "loss": 0.3249,
+      "step": 1648
+    },
+    {
+      "epoch": 5.058416602613375,
+      "grad_norm": 0.935232937335968,
+      "learning_rate": 9.02170283806344e-05,
+      "loss": 0.3646,
+      "step": 1649
+    },
+    {
+      "epoch": 5.061491160645657,
+      "grad_norm": 0.6877484917640686,
+      "learning_rate": 9.015025041736227e-05,
+      "loss": 0.33,
+      "step": 1650
+    },
+    {
+      "epoch": 5.06456571867794,
+      "grad_norm": 0.9156901836395264,
+      "learning_rate": 9.008347245409016e-05,
+      "loss": 0.3068,
+      "step": 1651
+    },
+    {
+      "epoch": 5.067640276710223,
+      "grad_norm": 0.8413227796554565,
+      "learning_rate": 9.001669449081803e-05,
+      "loss": 0.4284,
+      "step": 1652
+    },
+    {
+      "epoch": 5.070714834742506,
+      "grad_norm": 1.406680703163147,
+      "learning_rate": 8.994991652754592e-05,
+      "loss": 0.2722,
+      "step": 1653
+    },
+    {
+      "epoch": 5.073789392774788,
+      "grad_norm": 1.240125060081482,
+      "learning_rate": 8.988313856427379e-05,
+      "loss": 0.2941,
+      "step": 1654
+    },
+    {
+      "epoch": 5.076863950807072,
+      "grad_norm": 0.9402351379394531,
+      "learning_rate": 8.981636060100166e-05,
+      "loss": 0.3144,
+      "step": 1655
+    },
+    {
+      "epoch": 5.079938508839354,
+      "grad_norm": 0.972048819065094,
+      "learning_rate": 8.974958263772955e-05,
+      "loss": 0.3534,
+      "step": 1656
+    },
+    {
+      "epoch": 5.083013066871637,
+      "grad_norm": 0.8675603270530701,
+      "learning_rate": 8.968280467445744e-05,
+      "loss": 0.2585,
+      "step": 1657
+    },
+    {
+      "epoch": 5.08608762490392,
+      "grad_norm": 1.2413748502731323,
+      "learning_rate": 8.961602671118531e-05,
+      "loss": 0.3368,
+      "step": 1658
+    },
+    {
+      "epoch": 5.089162182936203,
+      "grad_norm": 0.35579174757003784,
+      "learning_rate": 8.95492487479132e-05,
+      "loss": 0.2431,
+      "step": 1659
+    },
+    {
+      "epoch": 5.092236740968485,
+      "grad_norm": 1.1538844108581543,
+      "learning_rate": 8.948247078464107e-05,
+      "loss": 0.321,
+      "step": 1660
+    },
+    {
+      "epoch": 5.095311299000769,
+      "grad_norm": 1.1886178255081177,
+      "learning_rate": 8.941569282136896e-05,
+      "loss": 0.2492,
+      "step": 1661
+    },
+    {
+      "epoch": 5.098385857033051,
+      "grad_norm": 0.5046135783195496,
+      "learning_rate": 8.934891485809683e-05,
+      "loss": 0.2312,
+      "step": 1662
+    },
+    {
+      "epoch": 5.101460415065334,
+      "grad_norm": 0.644220232963562,
+      "learning_rate": 8.92821368948247e-05,
+      "loss": 0.2498,
+      "step": 1663
+    },
+    {
+      "epoch": 5.104534973097617,
+      "grad_norm": 1.113159418106079,
+      "learning_rate": 8.921535893155259e-05,
+      "loss": 0.3425,
+      "step": 1664
+    },
+    {
+      "epoch": 5.1076095311299,
+      "grad_norm": 0.6977350115776062,
+      "learning_rate": 8.914858096828048e-05,
+      "loss": 0.3435,
+      "step": 1665
+    },
+    {
+      "epoch": 5.1106840891621825,
+      "grad_norm": 0.7484399080276489,
+      "learning_rate": 8.908180300500835e-05,
+      "loss": 0.2939,
+      "step": 1666
+    },
+    {
+      "epoch": 5.113758647194466,
+      "grad_norm": 0.9543803930282593,
+      "learning_rate": 8.901502504173624e-05,
+      "loss": 0.3154,
+      "step": 1667
+    },
+    {
+      "epoch": 5.116833205226748,
+      "grad_norm": 0.9736766219139099,
+      "learning_rate": 8.894824707846411e-05,
+      "loss": 0.2569,
+      "step": 1668
+    },
+    {
+      "epoch": 5.1199077632590315,
+      "grad_norm": 1.1530828475952148,
+      "learning_rate": 8.8881469115192e-05,
+      "loss": 0.3189,
+      "step": 1669
+    },
+    {
+      "epoch": 5.122982321291314,
+      "grad_norm": 0.6867527365684509,
+      "learning_rate": 8.881469115191987e-05,
+      "loss": 0.2784,
+      "step": 1670
+    },
+    {
+      "epoch": 5.126056879323597,
+      "grad_norm": 0.9009777307510376,
+      "learning_rate": 8.874791318864774e-05,
+      "loss": 0.2885,
+      "step": 1671
+    },
+    {
+      "epoch": 5.1291314373558805,
+      "grad_norm": 1.6015172004699707,
+      "learning_rate": 8.868113522537563e-05,
+      "loss": 0.2062,
+      "step": 1672
+    },
+    {
+      "epoch": 5.132205995388163,
+      "grad_norm": 0.7882331013679504,
+      "learning_rate": 8.86143572621035e-05,
+      "loss": 0.3244,
+      "step": 1673
+    },
+    {
+      "epoch": 5.135280553420446,
+      "grad_norm": 0.9412429332733154,
+      "learning_rate": 8.854757929883139e-05,
+      "loss": 0.3171,
+      "step": 1674
+    },
+    {
+      "epoch": 5.138355111452729,
+      "grad_norm": 0.6887170672416687,
+      "learning_rate": 8.848080133555928e-05,
+      "loss": 0.3405,
+      "step": 1675
+    },
+    {
+      "epoch": 5.141429669485012,
+      "grad_norm": 0.7920656800270081,
+      "learning_rate": 8.841402337228715e-05,
+      "loss": 0.3431,
+      "step": 1676
+    },
+    {
+      "epoch": 5.144504227517294,
+      "grad_norm": 0.663131594657898,
+      "learning_rate": 8.834724540901504e-05,
+      "loss": 0.2914,
+      "step": 1677
+    },
+    {
+      "epoch": 5.147578785549578,
+      "grad_norm": 0.9940230250358582,
+      "learning_rate": 8.828046744574291e-05,
+      "loss": 0.398,
+      "step": 1678
+    },
+    {
+      "epoch": 5.15065334358186,
+      "grad_norm": 1.2957160472869873,
+      "learning_rate": 8.821368948247078e-05,
+      "loss": 0.3417,
+      "step": 1679
+    },
+    {
+      "epoch": 5.153727901614143,
+      "grad_norm": 0.8957284092903137,
+      "learning_rate": 8.814691151919867e-05,
+      "loss": 0.2058,
+      "step": 1680
+    },
+    {
+      "epoch": 5.156802459646426,
+      "grad_norm": 1.5182899236679077,
+      "learning_rate": 8.808013355592654e-05,
+      "loss": 0.3196,
+      "step": 1681
+    },
+    {
+      "epoch": 5.159877017678709,
+      "grad_norm": 0.98117995262146,
+      "learning_rate": 8.801335559265443e-05,
+      "loss": 0.3348,
+      "step": 1682
+    },
+    {
+      "epoch": 5.162951575710991,
+      "grad_norm": 0.9935417175292969,
+      "learning_rate": 8.794657762938232e-05,
+      "loss": 0.1992,
+      "step": 1683
+    },
+    {
+      "epoch": 5.166026133743275,
+      "grad_norm": 0.6094648838043213,
+      "learning_rate": 8.787979966611019e-05,
+      "loss": 0.2138,
+      "step": 1684
+    },
+    {
+      "epoch": 5.169100691775557,
+      "grad_norm": 0.7856438755989075,
+      "learning_rate": 8.781302170283808e-05,
+      "loss": 0.2625,
+      "step": 1685
+    },
+    {
+      "epoch": 5.17217524980784,
+      "grad_norm": 0.7598311305046082,
+      "learning_rate": 8.774624373956595e-05,
+      "loss": 0.2635,
+      "step": 1686
+    },
+    {
+      "epoch": 5.175249807840123,
+      "grad_norm": 1.2613056898117065,
+      "learning_rate": 8.767946577629382e-05,
+      "loss": 0.3251,
+      "step": 1687
+    },
+    {
+      "epoch": 5.178324365872406,
+      "grad_norm": 1.7386010885238647,
+      "learning_rate": 8.761268781302171e-05,
+      "loss": 0.2677,
+      "step": 1688
+    },
+    {
+      "epoch": 5.1813989239046885,
+      "grad_norm": 0.7499911189079285,
+      "learning_rate": 8.754590984974958e-05,
+      "loss": 0.2974,
+      "step": 1689
+    },
+    {
+      "epoch": 5.184473481936972,
+      "grad_norm": 0.6865500211715698,
+      "learning_rate": 8.747913188647745e-05,
+      "loss": 0.3266,
+      "step": 1690
+    },
+    {
+      "epoch": 5.187548039969254,
+      "grad_norm": 0.8432502150535583,
+      "learning_rate": 8.741235392320535e-05,
+      "loss": 0.2287,
+      "step": 1691
+    },
+    {
+      "epoch": 5.1906225980015375,
+      "grad_norm": 1.0338119268417358,
+      "learning_rate": 8.734557595993323e-05,
+      "loss": 0.2959,
+      "step": 1692
+    },
+    {
+      "epoch": 5.19369715603382,
+      "grad_norm": 0.7273797988891602,
+      "learning_rate": 8.727879799666111e-05,
+      "loss": 0.3644,
+      "step": 1693
+    },
+    {
+      "epoch": 5.196771714066103,
+      "grad_norm": 0.9218087196350098,
+      "learning_rate": 8.721202003338899e-05,
+      "loss": 0.282,
+      "step": 1694
+    },
+    {
+      "epoch": 5.199846272098386,
+      "grad_norm": 0.49654561281204224,
+      "learning_rate": 8.714524207011686e-05,
+      "loss": 0.2808,
+      "step": 1695
+    },
+    {
+      "epoch": 5.202920830130669,
+      "grad_norm": 1.4503116607666016,
+      "learning_rate": 8.707846410684475e-05,
+      "loss": 0.3207,
+      "step": 1696
+    },
+    {
+      "epoch": 5.205995388162951,
+      "grad_norm": 0.7454671859741211,
+      "learning_rate": 8.701168614357262e-05,
+      "loss": 0.2838,
+      "step": 1697
+    },
+    {
+      "epoch": 5.209069946195235,
+      "grad_norm": 0.7439486980438232,
+      "learning_rate": 8.694490818030051e-05,
+      "loss": 0.2806,
+      "step": 1698
+    },
+    {
+      "epoch": 5.212144504227517,
+      "grad_norm": 1.0724588632583618,
+      "learning_rate": 8.687813021702838e-05,
+      "loss": 0.2592,
+      "step": 1699
+    },
+    {
+      "epoch": 5.2152190622598,
+      "grad_norm": 1.1502668857574463,
+      "learning_rate": 8.681135225375627e-05,
+      "loss": 0.3127,
+      "step": 1700
+    },
+    {
+      "epoch": 5.218293620292083,
+      "grad_norm": 0.7341523170471191,
+      "learning_rate": 8.674457429048415e-05,
+      "loss": 0.304,
+      "step": 1701
+    },
+    {
+      "epoch": 5.221368178324366,
+      "grad_norm": 0.8837214112281799,
+      "learning_rate": 8.667779632721203e-05,
+      "loss": 0.3238,
+      "step": 1702
+    },
+    {
+      "epoch": 5.224442736356648,
+      "grad_norm": 0.9992470145225525,
+      "learning_rate": 8.66110183639399e-05,
+      "loss": 0.2472,
+      "step": 1703
+    },
+    {
+      "epoch": 5.227517294388932,
+      "grad_norm": 0.569851279258728,
+      "learning_rate": 8.654424040066779e-05,
+      "loss": 0.2762,
+      "step": 1704
+    },
+    {
+      "epoch": 5.230591852421214,
+      "grad_norm": 0.9270056486129761,
+      "learning_rate": 8.647746243739566e-05,
+      "loss": 0.3032,
+      "step": 1705
+    },
+    {
+      "epoch": 5.233666410453497,
+      "grad_norm": 0.9279311895370483,
+      "learning_rate": 8.641068447412355e-05,
+      "loss": 0.3044,
+      "step": 1706
+    },
+    {
+      "epoch": 5.23674096848578,
+      "grad_norm": 0.905545175075531,
+      "learning_rate": 8.634390651085142e-05,
+      "loss": 0.3282,
+      "step": 1707
+    },
+    {
+      "epoch": 5.239815526518063,
+      "grad_norm": 0.7078321576118469,
+      "learning_rate": 8.62771285475793e-05,
+      "loss": 0.2272,
+      "step": 1708
+    },
+    {
+      "epoch": 5.242890084550346,
+      "grad_norm": 0.8569689393043518,
+      "learning_rate": 8.62103505843072e-05,
+      "loss": 0.3754,
+      "step": 1709
+    },
+    {
+      "epoch": 5.245964642582629,
+      "grad_norm": 0.9020428657531738,
+      "learning_rate": 8.614357262103507e-05,
+      "loss": 0.3115,
+      "step": 1710
+    },
+    {
+      "epoch": 5.249039200614912,
+      "grad_norm": 0.8380615711212158,
+      "learning_rate": 8.607679465776294e-05,
+      "loss": 0.2889,
+      "step": 1711
+    },
+    {
+      "epoch": 5.252113758647194,
+      "grad_norm": 0.6772667765617371,
+      "learning_rate": 8.601001669449083e-05,
+      "loss": 0.2722,
+      "step": 1712
+    },
+    {
+      "epoch": 5.255188316679478,
+      "grad_norm": 0.9966198801994324,
+      "learning_rate": 8.59432387312187e-05,
+      "loss": 0.3286,
+      "step": 1713
+    },
+    {
+      "epoch": 5.25826287471176,
+      "grad_norm": 0.7050550580024719,
+      "learning_rate": 8.587646076794659e-05,
+      "loss": 0.2526,
+      "step": 1714
+    },
+    {
+      "epoch": 5.261337432744043,
+      "grad_norm": 0.6464506983757019,
+      "learning_rate": 8.580968280467446e-05,
+      "loss": 0.254,
+      "step": 1715
+    },
+    {
+      "epoch": 5.264411990776326,
+      "grad_norm": 0.7716936469078064,
+      "learning_rate": 8.574290484140233e-05,
+      "loss": 0.2742,
+      "step": 1716
+    },
+    {
+      "epoch": 5.267486548808609,
+      "grad_norm": 0.746012806892395,
+      "learning_rate": 8.567612687813022e-05,
+      "loss": 0.4213,
+      "step": 1717
+    },
+    {
+      "epoch": 5.2705611068408915,
+      "grad_norm": 0.8593916893005371,
+      "learning_rate": 8.56093489148581e-05,
+      "loss": 0.3349,
+      "step": 1718
+    },
+    {
+      "epoch": 5.273635664873175,
+      "grad_norm": 0.7389137148857117,
+      "learning_rate": 8.554257095158598e-05,
+      "loss": 0.3449,
+      "step": 1719
+    },
+    {
+      "epoch": 5.276710222905457,
+      "grad_norm": 1.1622214317321777,
+      "learning_rate": 8.547579298831387e-05,
+      "loss": 0.3472,
+      "step": 1720
+    },
+    {
+      "epoch": 5.2797847809377405,
+      "grad_norm": 0.5685468316078186,
+      "learning_rate": 8.540901502504174e-05,
+      "loss": 0.2676,
+      "step": 1721
+    },
+    {
+      "epoch": 5.282859338970023,
+      "grad_norm": 0.8736433982849121,
+      "learning_rate": 8.534223706176963e-05,
+      "loss": 0.2126,
+      "step": 1722
+    },
+    {
+      "epoch": 5.285933897002306,
+      "grad_norm": 0.7043049931526184,
+      "learning_rate": 8.52754590984975e-05,
+      "loss": 0.3439,
+      "step": 1723
+    },
+    {
+      "epoch": 5.289008455034589,
+      "grad_norm": 1.075692057609558,
+      "learning_rate": 8.520868113522537e-05,
+      "loss": 0.4651,
+      "step": 1724
+    },
+    {
+      "epoch": 5.292083013066872,
+      "grad_norm": 0.8230921030044556,
+      "learning_rate": 8.514190317195326e-05,
+      "loss": 0.2736,
+      "step": 1725
+    },
+    {
+      "epoch": 5.295157571099154,
+      "grad_norm": 0.9219911694526672,
+      "learning_rate": 8.507512520868115e-05,
+      "loss": 0.2919,
+      "step": 1726
+    },
+    {
+      "epoch": 5.298232129131438,
+      "grad_norm": 0.6772744059562683,
+      "learning_rate": 8.500834724540902e-05,
+      "loss": 0.2923,
+      "step": 1727
+    },
+    {
+      "epoch": 5.30130668716372,
+      "grad_norm": 1.0341936349868774,
+      "learning_rate": 8.49415692821369e-05,
+      "loss": 0.325,
+      "step": 1728
+    },
+    {
+      "epoch": 5.304381245196003,
+      "grad_norm": 0.6965529322624207,
+      "learning_rate": 8.487479131886478e-05,
+      "loss": 0.2289,
+      "step": 1729
+    },
+    {
+      "epoch": 5.307455803228286,
+      "grad_norm": 0.8680992722511292,
+      "learning_rate": 8.480801335559267e-05,
+      "loss": 0.3257,
+      "step": 1730
+    },
+    {
+      "epoch": 5.310530361260569,
+      "grad_norm": 0.8042769432067871,
+      "learning_rate": 8.474123539232054e-05,
+      "loss": 0.2816,
+      "step": 1731
+    },
+    {
+      "epoch": 5.313604919292851,
+      "grad_norm": 1.2106633186340332,
+      "learning_rate": 8.467445742904841e-05,
+      "loss": 0.2709,
+      "step": 1732
+    },
+    {
+      "epoch": 5.316679477325135,
+      "grad_norm": 0.6236171722412109,
+      "learning_rate": 8.46076794657763e-05,
+      "loss": 0.223,
+      "step": 1733
+    },
+    {
+      "epoch": 5.319754035357417,
+      "grad_norm": 0.7089080214500427,
+      "learning_rate": 8.454090150250417e-05,
+      "loss": 0.3727,
+      "step": 1734
+    },
+    {
+      "epoch": 5.3228285933897,
+      "grad_norm": 0.9685229659080505,
+      "learning_rate": 8.447412353923206e-05,
+      "loss": 0.3011,
+      "step": 1735
+    },
+    {
+      "epoch": 5.325903151421983,
+      "grad_norm": 0.8630408644676208,
+      "learning_rate": 8.440734557595994e-05,
+      "loss": 0.2856,
+      "step": 1736
+    },
+    {
+      "epoch": 5.328977709454266,
+      "grad_norm": 0.8283337950706482,
+      "learning_rate": 8.434056761268782e-05,
+      "loss": 0.2499,
+      "step": 1737
+    },
+    {
+      "epoch": 5.3320522674865485,
+      "grad_norm": 0.6598505973815918,
+      "learning_rate": 8.42737896494157e-05,
+      "loss": 0.3935,
+      "step": 1738
+    },
+    {
+      "epoch": 5.335126825518832,
+      "grad_norm": 0.7599532008171082,
+      "learning_rate": 8.420701168614358e-05,
+      "loss": 0.3502,
+      "step": 1739
+    },
+    {
+      "epoch": 5.338201383551114,
+      "grad_norm": 0.9693306684494019,
+      "learning_rate": 8.414023372287145e-05,
+      "loss": 0.5447,
+      "step": 1740
+    },
+    {
+      "epoch": 5.3412759415833975,
+      "grad_norm": 1.4195016622543335,
+      "learning_rate": 8.407345575959934e-05,
+      "loss": 0.3302,
+      "step": 1741
+    },
+    {
+      "epoch": 5.34435049961568,
+      "grad_norm": 1.2126317024230957,
+      "learning_rate": 8.400667779632721e-05,
+      "loss": 0.355,
+      "step": 1742
+    },
+    {
+      "epoch": 5.347425057647963,
+      "grad_norm": 0.8964106440544128,
+      "learning_rate": 8.39398998330551e-05,
+      "loss": 0.3305,
+      "step": 1743
+    },
+    {
+      "epoch": 5.350499615680246,
+      "grad_norm": 0.5942551493644714,
+      "learning_rate": 8.387312186978298e-05,
+      "loss": 0.2959,
+      "step": 1744
+    },
+    {
+      "epoch": 5.353574173712529,
+      "grad_norm": 0.6881222724914551,
+      "learning_rate": 8.380634390651086e-05,
+      "loss": 0.279,
+      "step": 1745
+    },
+    {
+      "epoch": 5.356648731744812,
+      "grad_norm": 0.6308599710464478,
+      "learning_rate": 8.373956594323874e-05,
+      "loss": 0.2354,
+      "step": 1746
+    },
+    {
+      "epoch": 5.359723289777095,
+      "grad_norm": 0.7263630628585815,
+      "learning_rate": 8.367278797996662e-05,
+      "loss": 0.3054,
+      "step": 1747
+    },
+    {
+      "epoch": 5.362797847809377,
+      "grad_norm": 0.7219898104667664,
+      "learning_rate": 8.360601001669449e-05,
+      "loss": 0.2876,
+      "step": 1748
+    },
+    {
+      "epoch": 5.36587240584166,
+      "grad_norm": 0.8973868489265442,
+      "learning_rate": 8.353923205342238e-05,
+      "loss": 0.3736,
+      "step": 1749
+    },
+    {
+      "epoch": 5.3689469638739435,
+      "grad_norm": 0.757659375667572,
+      "learning_rate": 8.347245409015025e-05,
+      "loss": 0.3041,
+      "step": 1750
+    },
+    {
+      "epoch": 5.372021521906226,
+      "grad_norm": 0.8917866349220276,
+      "learning_rate": 8.340567612687812e-05,
+      "loss": 0.3019,
+      "step": 1751
+    },
+    {
+      "epoch": 5.375096079938509,
+      "grad_norm": 0.6132904291152954,
+      "learning_rate": 8.333889816360601e-05,
+      "loss": 0.2637,
+      "step": 1752
+    },
+    {
+      "epoch": 5.378170637970792,
+      "grad_norm": 0.9521093964576721,
+      "learning_rate": 8.32721202003339e-05,
+      "loss": 0.349,
+      "step": 1753
+    },
+    {
+      "epoch": 5.381245196003075,
+      "grad_norm": 0.503698468208313,
+      "learning_rate": 8.320534223706178e-05,
+      "loss": 0.2973,
+      "step": 1754
+    },
+    {
+      "epoch": 5.384319754035357,
+      "grad_norm": 1.0433109998703003,
+      "learning_rate": 8.313856427378966e-05,
+      "loss": 0.3069,
+      "step": 1755
+    },
+    {
+      "epoch": 5.387394312067641,
+      "grad_norm": 1.374406099319458,
+      "learning_rate": 8.307178631051753e-05,
+      "loss": 0.3536,
+      "step": 1756
+    },
+    {
+      "epoch": 5.390468870099923,
+      "grad_norm": 0.7242358922958374,
+      "learning_rate": 8.300500834724542e-05,
+      "loss": 0.3207,
+      "step": 1757
+    },
+    {
+      "epoch": 5.393543428132206,
+      "grad_norm": 0.8785935640335083,
+      "learning_rate": 8.293823038397329e-05,
+      "loss": 0.3405,
+      "step": 1758
+    },
+    {
+      "epoch": 5.396617986164489,
+      "grad_norm": 1.2258713245391846,
+      "learning_rate": 8.287145242070116e-05,
+      "loss": 0.2641,
+      "step": 1759
+    },
+    {
+      "epoch": 5.399692544196772,
+      "grad_norm": 1.837854027748108,
+      "learning_rate": 8.280467445742905e-05,
+      "loss": 0.3816,
+      "step": 1760
+    },
+    {
+      "epoch": 5.402767102229054,
+      "grad_norm": 0.7135657072067261,
+      "learning_rate": 8.273789649415694e-05,
+      "loss": 0.2461,
+      "step": 1761
+    },
+    {
+      "epoch": 5.405841660261338,
+      "grad_norm": 0.8239970803260803,
+      "learning_rate": 8.267111853088482e-05,
+      "loss": 0.2936,
+      "step": 1762
+    },
+    {
+      "epoch": 5.40891621829362,
+      "grad_norm": 0.6553420424461365,
+      "learning_rate": 8.26043405676127e-05,
+      "loss": 0.3007,
+      "step": 1763
+    },
+    {
+      "epoch": 5.411990776325903,
+      "grad_norm": 1.8108381032943726,
+      "learning_rate": 8.253756260434057e-05,
+      "loss": 0.4522,
+      "step": 1764
+    },
+    {
+      "epoch": 5.415065334358186,
+      "grad_norm": 0.8653173446655273,
+      "learning_rate": 8.247078464106846e-05,
+      "loss": 0.3126,
+      "step": 1765
+    },
+    {
+      "epoch": 5.418139892390469,
+      "grad_norm": 0.7217906713485718,
+      "learning_rate": 8.240400667779633e-05,
+      "loss": 0.2043,
+      "step": 1766
+    },
+    {
+      "epoch": 5.4212144504227515,
+      "grad_norm": 0.90260910987854,
+      "learning_rate": 8.23372287145242e-05,
+      "loss": 0.2376,
+      "step": 1767
+    },
+    {
+      "epoch": 5.424289008455035,
+      "grad_norm": 0.8756963014602661,
+      "learning_rate": 8.227045075125209e-05,
+      "loss": 0.3742,
+      "step": 1768
+    },
+    {
+      "epoch": 5.427363566487317,
+      "grad_norm": 0.7344855070114136,
+      "learning_rate": 8.220367278797996e-05,
+      "loss": 0.3112,
+      "step": 1769
+    },
+    {
+      "epoch": 5.4304381245196005,
+      "grad_norm": 0.7740147113800049,
+      "learning_rate": 8.213689482470785e-05,
+      "loss": 0.2127,
+      "step": 1770
+    },
+    {
+      "epoch": 5.433512682551883,
+      "grad_norm": 0.8592774271965027,
+      "learning_rate": 8.207011686143574e-05,
+      "loss": 0.2716,
+      "step": 1771
+    },
+    {
+      "epoch": 5.436587240584166,
+      "grad_norm": 0.7816128134727478,
+      "learning_rate": 8.200333889816361e-05,
+      "loss": 0.3683,
+      "step": 1772
+    },
+    {
+      "epoch": 5.439661798616449,
+      "grad_norm": 1.1457465887069702,
+      "learning_rate": 8.19365609348915e-05,
+      "loss": 0.2948,
+      "step": 1773
+    },
+    {
+      "epoch": 5.442736356648732,
+      "grad_norm": 2.5698084831237793,
+      "learning_rate": 8.186978297161937e-05,
+      "loss": 0.3148,
+      "step": 1774
+    },
+    {
+      "epoch": 5.445810914681014,
+      "grad_norm": 0.8997441530227661,
+      "learning_rate": 8.180300500834724e-05,
+      "loss": 0.3582,
+      "step": 1775
+    },
+    {
+      "epoch": 5.448885472713298,
+      "grad_norm": 0.6346564888954163,
+      "learning_rate": 8.173622704507513e-05,
+      "loss": 0.2195,
+      "step": 1776
+    },
+    {
+      "epoch": 5.45196003074558,
+      "grad_norm": 1.0303326845169067,
+      "learning_rate": 8.1669449081803e-05,
+      "loss": 0.274,
+      "step": 1777
+    },
+    {
+      "epoch": 5.455034588777863,
+      "grad_norm": 0.8634578585624695,
+      "learning_rate": 8.160267111853089e-05,
+      "loss": 0.3599,
+      "step": 1778
+    },
+    {
+      "epoch": 5.458109146810146,
+      "grad_norm": 1.3686116933822632,
+      "learning_rate": 8.153589315525877e-05,
+      "loss": 0.2781,
+      "step": 1779
+    },
+    {
+      "epoch": 5.461183704842429,
+      "grad_norm": 0.564072847366333,
+      "learning_rate": 8.146911519198665e-05,
+      "loss": 0.2335,
+      "step": 1780
+    },
+    {
+      "epoch": 5.464258262874711,
+      "grad_norm": 0.7149077653884888,
+      "learning_rate": 8.140233722871453e-05,
+      "loss": 0.3076,
+      "step": 1781
+    },
+    {
+      "epoch": 5.467332820906995,
+      "grad_norm": 1.153348684310913,
+      "learning_rate": 8.133555926544241e-05,
+      "loss": 0.3365,
+      "step": 1782
+    },
+    {
+      "epoch": 5.470407378939277,
+      "grad_norm": 0.5060893893241882,
+      "learning_rate": 8.126878130217028e-05,
+      "loss": 0.2537,
+      "step": 1783
+    },
+    {
+      "epoch": 5.47348193697156,
+      "grad_norm": 1.0472662448883057,
+      "learning_rate": 8.120200333889817e-05,
+      "loss": 0.2637,
+      "step": 1784
+    },
+    {
+      "epoch": 5.476556495003843,
+      "grad_norm": 0.6877180337905884,
+      "learning_rate": 8.113522537562604e-05,
+      "loss": 0.3489,
+      "step": 1785
+    },
+    {
+      "epoch": 5.479631053036126,
+      "grad_norm": 0.8638304471969604,
+      "learning_rate": 8.106844741235393e-05,
+      "loss": 0.283,
+      "step": 1786
+    },
+    {
+      "epoch": 5.482705611068409,
+      "grad_norm": 0.828100323677063,
+      "learning_rate": 8.10016694490818e-05,
+      "loss": 0.2685,
+      "step": 1787
+    },
+    {
+      "epoch": 5.485780169100692,
+      "grad_norm": 0.8909431099891663,
+      "learning_rate": 8.093489148580969e-05,
+      "loss": 0.2413,
+      "step": 1788
+    },
+    {
+      "epoch": 5.488854727132974,
+      "grad_norm": 1.0037970542907715,
+      "learning_rate": 8.086811352253757e-05,
+      "loss": 0.3077,
+      "step": 1789
+    },
+    {
+      "epoch": 5.4919292851652575,
+      "grad_norm": 1.3559635877609253,
+      "learning_rate": 8.080133555926545e-05,
+      "loss": 0.3585,
+      "step": 1790
+    },
+    {
+      "epoch": 5.495003843197541,
+      "grad_norm": 0.9360470771789551,
+      "learning_rate": 8.073455759599332e-05,
+      "loss": 0.368,
+      "step": 1791
+    },
+    {
+      "epoch": 5.498078401229823,
+      "grad_norm": 0.8319844007492065,
+      "learning_rate": 8.066777963272121e-05,
+      "loss": 0.2898,
+      "step": 1792
+    },
+    {
+      "epoch": 5.5011529592621065,
+      "grad_norm": 0.6613747477531433,
+      "learning_rate": 8.060100166944908e-05,
+      "loss": 0.2506,
+      "step": 1793
+    },
+    {
+      "epoch": 5.504227517294389,
+      "grad_norm": 0.7393064498901367,
+      "learning_rate": 8.053422370617697e-05,
+      "loss": 0.261,
+      "step": 1794
+    },
+    {
+      "epoch": 5.507302075326672,
+      "grad_norm": 0.8899523019790649,
+      "learning_rate": 8.046744574290484e-05,
+      "loss": 0.3113,
+      "step": 1795
+    },
+    {
+      "epoch": 5.510376633358955,
+      "grad_norm": 1.0439255237579346,
+      "learning_rate": 8.040066777963273e-05,
+      "loss": 0.4196,
+      "step": 1796
+    },
+    {
+      "epoch": 5.513451191391238,
+      "grad_norm": 0.625464916229248,
+      "learning_rate": 8.033388981636061e-05,
+      "loss": 0.2637,
+      "step": 1797
+    },
+    {
+      "epoch": 5.51652574942352,
+      "grad_norm": 1.0431058406829834,
+      "learning_rate": 8.026711185308849e-05,
+      "loss": 0.2172,
+      "step": 1798
+    },
+    {
+      "epoch": 5.5196003074558035,
+      "grad_norm": 1.1402390003204346,
+      "learning_rate": 8.020033388981636e-05,
+      "loss": 0.2572,
+      "step": 1799
+    },
+    {
+      "epoch": 5.522674865488086,
+      "grad_norm": 0.7621378302574158,
+      "learning_rate": 8.013355592654425e-05,
+      "loss": 0.3302,
+      "step": 1800
+    },
+    {
+      "epoch": 5.525749423520369,
+      "grad_norm": 1.0336471796035767,
+      "learning_rate": 8.006677796327212e-05,
+      "loss": 0.2131,
+      "step": 1801
+    },
+    {
+      "epoch": 5.528823981552652,
+      "grad_norm": 1.23903226852417,
+      "learning_rate": 8e-05,
+      "loss": 0.2769,
+      "step": 1802
+    },
+    {
+      "epoch": 5.531898539584935,
+      "grad_norm": 0.8230191469192505,
+      "learning_rate": 7.993322203672788e-05,
+      "loss": 0.2967,
+      "step": 1803
+    },
+    {
+      "epoch": 5.534973097617217,
+      "grad_norm": 0.8352370262145996,
+      "learning_rate": 7.986644407345575e-05,
+      "loss": 0.2303,
+      "step": 1804
+    },
+    {
+      "epoch": 5.538047655649501,
+      "grad_norm": 1.2304105758666992,
+      "learning_rate": 7.979966611018364e-05,
+      "loss": 0.4731,
+      "step": 1805
+    },
+    {
+      "epoch": 5.541122213681783,
+      "grad_norm": 1.0414502620697021,
+      "learning_rate": 7.973288814691153e-05,
+      "loss": 0.3484,
+      "step": 1806
+    },
+    {
+      "epoch": 5.544196771714066,
+      "grad_norm": 1.0510241985321045,
+      "learning_rate": 7.96661101836394e-05,
+      "loss": 0.3604,
+      "step": 1807
+    },
+    {
+      "epoch": 5.547271329746349,
+      "grad_norm": 0.6692155599594116,
+      "learning_rate": 7.959933222036729e-05,
+      "loss": 0.3249,
+      "step": 1808
+    },
+    {
+      "epoch": 5.550345887778632,
+      "grad_norm": 0.9099972248077393,
+      "learning_rate": 7.953255425709516e-05,
+      "loss": 0.3089,
+      "step": 1809
+    },
+    {
+      "epoch": 5.553420445810914,
+      "grad_norm": 0.8659818768501282,
+      "learning_rate": 7.946577629382305e-05,
+      "loss": 0.3146,
+      "step": 1810
+    },
+    {
+      "epoch": 5.556495003843198,
+      "grad_norm": 0.8696914315223694,
+      "learning_rate": 7.939899833055092e-05,
+      "loss": 0.4468,
+      "step": 1811
+    },
+    {
+      "epoch": 5.55956956187548,
+      "grad_norm": 1.1352142095565796,
+      "learning_rate": 7.933222036727879e-05,
+      "loss": 0.2151,
+      "step": 1812
+    },
+    {
+      "epoch": 5.562644119907763,
+      "grad_norm": 0.899272620677948,
+      "learning_rate": 7.926544240400668e-05,
+      "loss": 0.3825,
+      "step": 1813
+    },
+    {
+      "epoch": 5.565718677940046,
+      "grad_norm": 1.0112216472625732,
+      "learning_rate": 7.919866444073457e-05,
+      "loss": 0.4442,
+      "step": 1814
+    },
+    {
+      "epoch": 5.568793235972329,
+      "grad_norm": 0.9360047578811646,
+      "learning_rate": 7.913188647746244e-05,
+      "loss": 0.305,
+      "step": 1815
+    },
+    {
+      "epoch": 5.5718677940046115,
+      "grad_norm": 0.9701045751571655,
+      "learning_rate": 7.906510851419033e-05,
+      "loss": 0.3318,
+      "step": 1816
+    },
+    {
+      "epoch": 5.574942352036895,
+      "grad_norm": 1.0220452547073364,
+      "learning_rate": 7.89983305509182e-05,
+      "loss": 0.3886,
+      "step": 1817
+    },
+    {
+      "epoch": 5.578016910069177,
+      "grad_norm": 1.1464786529541016,
+      "learning_rate": 7.893155258764609e-05,
+      "loss": 0.3478,
+      "step": 1818
+    },
+    {
+      "epoch": 5.5810914681014605,
+      "grad_norm": 0.8255491256713867,
+      "learning_rate": 7.886477462437396e-05,
+      "loss": 0.3586,
+      "step": 1819
+    },
+    {
+      "epoch": 5.584166026133743,
+      "grad_norm": 1.0034533739089966,
+      "learning_rate": 7.879799666110183e-05,
+      "loss": 0.294,
+      "step": 1820
+    },
+    {
+      "epoch": 5.587240584166026,
+      "grad_norm": 0.8229129314422607,
+      "learning_rate": 7.873121869782972e-05,
+      "loss": 0.2546,
+      "step": 1821
+    },
+    {
+      "epoch": 5.590315142198309,
+      "grad_norm": 0.6609354019165039,
+      "learning_rate": 7.86644407345576e-05,
+      "loss": 0.31,
+      "step": 1822
+    },
+    {
+      "epoch": 5.593389700230592,
+      "grad_norm": 0.7836920619010925,
+      "learning_rate": 7.859766277128548e-05,
+      "loss": 0.3437,
+      "step": 1823
+    },
+    {
+      "epoch": 5.596464258262875,
+      "grad_norm": 1.286696195602417,
+      "learning_rate": 7.853088480801337e-05,
+      "loss": 0.2675,
+      "step": 1824
+    },
+    {
+      "epoch": 5.599538816295158,
+      "grad_norm": 0.7446246147155762,
+      "learning_rate": 7.846410684474124e-05,
+      "loss": 0.2127,
+      "step": 1825
+    },
+    {
+      "epoch": 5.60261337432744,
+      "grad_norm": 0.7205875515937805,
+      "learning_rate": 7.839732888146912e-05,
+      "loss": 0.3288,
+      "step": 1826
+    },
+    {
+      "epoch": 5.605687932359723,
+      "grad_norm": 0.8203064203262329,
+      "learning_rate": 7.8330550918197e-05,
+      "loss": 0.337,
+      "step": 1827
+    },
+    {
+      "epoch": 5.608762490392007,
+      "grad_norm": 0.6690270900726318,
+      "learning_rate": 7.826377295492487e-05,
+      "loss": 0.3033,
+      "step": 1828
+    },
+    {
+      "epoch": 5.611837048424289,
+      "grad_norm": 0.5355561375617981,
+      "learning_rate": 7.819699499165276e-05,
+      "loss": 0.3438,
+      "step": 1829
+    },
+    {
+      "epoch": 5.614911606456571,
+      "grad_norm": 0.6979895234107971,
+      "learning_rate": 7.813021702838063e-05,
+      "loss": 0.2798,
+      "step": 1830
+    },
+    {
+      "epoch": 5.617986164488855,
+      "grad_norm": 0.6088154315948486,
+      "learning_rate": 7.806343906510852e-05,
+      "loss": 0.269,
+      "step": 1831
+    },
+    {
+      "epoch": 5.621060722521138,
+      "grad_norm": 1.0615488290786743,
+      "learning_rate": 7.79966611018364e-05,
+      "loss": 0.37,
+      "step": 1832
+    },
+    {
+      "epoch": 5.62413528055342,
+      "grad_norm": 0.8934707641601562,
+      "learning_rate": 7.792988313856428e-05,
+      "loss": 0.3102,
+      "step": 1833
+    },
+    {
+      "epoch": 5.627209838585704,
+      "grad_norm": 0.9596664905548096,
+      "learning_rate": 7.786310517529216e-05,
+      "loss": 0.2823,
+      "step": 1834
+    },
+    {
+      "epoch": 5.630284396617986,
+      "grad_norm": 0.8570690155029297,
+      "learning_rate": 7.779632721202004e-05,
+      "loss": 0.3825,
+      "step": 1835
+    },
+    {
+      "epoch": 5.633358954650269,
+      "grad_norm": 0.7704600095748901,
+      "learning_rate": 7.772954924874791e-05,
+      "loss": 0.35,
+      "step": 1836
+    },
+    {
+      "epoch": 5.636433512682552,
+      "grad_norm": 0.9135782122612,
+      "learning_rate": 7.76627712854758e-05,
+      "loss": 0.3951,
+      "step": 1837
+    },
+    {
+      "epoch": 5.639508070714835,
+      "grad_norm": 1.0562645196914673,
+      "learning_rate": 7.759599332220367e-05,
+      "loss": 0.2882,
+      "step": 1838
+    },
+    {
+      "epoch": 5.6425826287471175,
+      "grad_norm": 1.0557276010513306,
+      "learning_rate": 7.752921535893156e-05,
+      "loss": 0.3554,
+      "step": 1839
+    },
+    {
+      "epoch": 5.645657186779401,
+      "grad_norm": 0.8899006843566895,
+      "learning_rate": 7.746243739565944e-05,
+      "loss": 0.3476,
+      "step": 1840
+    },
+    {
+      "epoch": 5.648731744811683,
+      "grad_norm": 0.8078686594963074,
+      "learning_rate": 7.739565943238732e-05,
+      "loss": 0.2755,
+      "step": 1841
+    },
+    {
+      "epoch": 5.6518063028439665,
+      "grad_norm": 0.8886568546295166,
+      "learning_rate": 7.73288814691152e-05,
+      "loss": 0.3686,
+      "step": 1842
+    },
+    {
+      "epoch": 5.654880860876249,
+      "grad_norm": 1.3097084760665894,
+      "learning_rate": 7.726210350584308e-05,
+      "loss": 0.3381,
+      "step": 1843
+    },
+    {
+      "epoch": 5.657955418908532,
+      "grad_norm": 0.597942590713501,
+      "learning_rate": 7.719532554257095e-05,
+      "loss": 0.2658,
+      "step": 1844
+    },
+    {
+      "epoch": 5.661029976940815,
+      "grad_norm": 0.8042814135551453,
+      "learning_rate": 7.712854757929884e-05,
+      "loss": 0.3169,
+      "step": 1845
+    },
+    {
+      "epoch": 5.664104534973098,
+      "grad_norm": 0.5749388933181763,
+      "learning_rate": 7.706176961602671e-05,
+      "loss": 0.2733,
+      "step": 1846
+    },
+    {
+      "epoch": 5.66717909300538,
+      "grad_norm": 0.6768372654914856,
+      "learning_rate": 7.69949916527546e-05,
+      "loss": 0.3274,
+      "step": 1847
+    },
+    {
+      "epoch": 5.6702536510376635,
+      "grad_norm": 0.9132068157196045,
+      "learning_rate": 7.692821368948247e-05,
+      "loss": 0.326,
+      "step": 1848
+    },
+    {
+      "epoch": 5.673328209069946,
+      "grad_norm": 0.966132640838623,
+      "learning_rate": 7.686143572621036e-05,
+      "loss": 0.2591,
+      "step": 1849
+    },
+    {
+      "epoch": 5.676402767102229,
+      "grad_norm": 1.1084728240966797,
+      "learning_rate": 7.679465776293824e-05,
+      "loss": 0.3073,
+      "step": 1850
+    },
+    {
+      "epoch": 5.679477325134512,
+      "grad_norm": 0.7305138111114502,
+      "learning_rate": 7.672787979966612e-05,
+      "loss": 0.2618,
+      "step": 1851
+    },
+    {
+      "epoch": 5.682551883166795,
+      "grad_norm": 0.637614905834198,
+      "learning_rate": 7.666110183639399e-05,
+      "loss": 0.3106,
+      "step": 1852
+    },
+    {
+      "epoch": 5.685626441199077,
+      "grad_norm": 0.941795289516449,
+      "learning_rate": 7.659432387312188e-05,
+      "loss": 0.3271,
+      "step": 1853
+    },
+    {
+      "epoch": 5.688700999231361,
+      "grad_norm": 0.7227844595909119,
+      "learning_rate": 7.652754590984975e-05,
+      "loss": 0.2718,
+      "step": 1854
+    },
+    {
+      "epoch": 5.691775557263643,
+      "grad_norm": 0.9095995426177979,
+      "learning_rate": 7.646076794657764e-05,
+      "loss": 0.315,
+      "step": 1855
+    },
+    {
+      "epoch": 5.694850115295926,
+      "grad_norm": 1.2558395862579346,
+      "learning_rate": 7.639398998330551e-05,
+      "loss": 0.4484,
+      "step": 1856
+    },
+    {
+      "epoch": 5.697924673328209,
+      "grad_norm": 0.6830787658691406,
+      "learning_rate": 7.63272120200334e-05,
+      "loss": 0.3691,
+      "step": 1857
+    },
+    {
+      "epoch": 5.700999231360492,
+      "grad_norm": 1.5645900964736938,
+      "learning_rate": 7.626043405676128e-05,
+      "loss": 0.3653,
+      "step": 1858
+    },
+    {
+      "epoch": 5.704073789392774,
+      "grad_norm": 0.5888504385948181,
+      "learning_rate": 7.619365609348916e-05,
+      "loss": 0.2683,
+      "step": 1859
+    },
+    {
+      "epoch": 5.707148347425058,
+      "grad_norm": 0.6585515737533569,
+      "learning_rate": 7.612687813021703e-05,
+      "loss": 0.2923,
+      "step": 1860
+    },
+    {
+      "epoch": 5.71022290545734,
+      "grad_norm": 0.8930748701095581,
+      "learning_rate": 7.606010016694492e-05,
+      "loss": 0.3592,
+      "step": 1861
+    },
+    {
+      "epoch": 5.713297463489623,
+      "grad_norm": 0.7318699359893799,
+      "learning_rate": 7.599332220367279e-05,
+      "loss": 0.2895,
+      "step": 1862
+    },
+    {
+      "epoch": 5.716372021521906,
+      "grad_norm": 0.7849537134170532,
+      "learning_rate": 7.592654424040068e-05,
+      "loss": 0.2668,
+      "step": 1863
+    },
+    {
+      "epoch": 5.719446579554189,
+      "grad_norm": 0.9625186920166016,
+      "learning_rate": 7.585976627712855e-05,
+      "loss": 0.3139,
+      "step": 1864
+    },
+    {
+      "epoch": 5.722521137586472,
+      "grad_norm": 0.904823899269104,
+      "learning_rate": 7.579298831385642e-05,
+      "loss": 0.3532,
+      "step": 1865
+    },
+    {
+      "epoch": 5.725595695618755,
+      "grad_norm": 0.974162220954895,
+      "learning_rate": 7.572621035058431e-05,
+      "loss": 0.3306,
+      "step": 1866
+    },
+    {
+      "epoch": 5.728670253651037,
+      "grad_norm": 0.7732940912246704,
+      "learning_rate": 7.56594323873122e-05,
+      "loss": 0.3054,
+      "step": 1867
+    },
+    {
+      "epoch": 5.7317448116833205,
+      "grad_norm": 0.8150412440299988,
+      "learning_rate": 7.559265442404007e-05,
+      "loss": 0.2711,
+      "step": 1868
+    },
+    {
+      "epoch": 5.734819369715604,
+      "grad_norm": 1.464375615119934,
+      "learning_rate": 7.552587646076796e-05,
+      "loss": 0.3856,
+      "step": 1869
+    },
+    {
+      "epoch": 5.737893927747886,
+      "grad_norm": 0.7832287549972534,
+      "learning_rate": 7.545909849749583e-05,
+      "loss": 0.2674,
+      "step": 1870
+    },
+    {
+      "epoch": 5.740968485780169,
+      "grad_norm": 0.8007357716560364,
+      "learning_rate": 7.539232053422371e-05,
+      "loss": 0.1946,
+      "step": 1871
+    },
+    {
+      "epoch": 5.744043043812452,
+      "grad_norm": 0.6958500742912292,
+      "learning_rate": 7.532554257095159e-05,
+      "loss": 0.2734,
+      "step": 1872
+    },
+    {
+      "epoch": 5.747117601844735,
+      "grad_norm": 1.1845893859863281,
+      "learning_rate": 7.525876460767946e-05,
+      "loss": 0.3715,
+      "step": 1873
+    },
+    {
+      "epoch": 5.750192159877018,
+      "grad_norm": 0.7488757967948914,
+      "learning_rate": 7.519198664440735e-05,
+      "loss": 0.2586,
+      "step": 1874
+    },
+    {
+      "epoch": 5.753266717909301,
+      "grad_norm": 1.5162636041641235,
+      "learning_rate": 7.512520868113523e-05,
+      "loss": 0.2977,
+      "step": 1875
+    },
+    {
+      "epoch": 5.756341275941583,
+      "grad_norm": 2.1945416927337646,
+      "learning_rate": 7.505843071786311e-05,
+      "loss": 0.3504,
+      "step": 1876
+    },
+    {
+      "epoch": 5.759415833973867,
+      "grad_norm": 1.0076838731765747,
+      "learning_rate": 7.4991652754591e-05,
+      "loss": 0.3167,
+      "step": 1877
+    },
+    {
+      "epoch": 5.762490392006149,
+      "grad_norm": 0.7844017744064331,
+      "learning_rate": 7.492487479131887e-05,
+      "loss": 0.3902,
+      "step": 1878
+    },
+    {
+      "epoch": 5.765564950038432,
+      "grad_norm": 1.1953024864196777,
+      "learning_rate": 7.485809682804675e-05,
+      "loss": 0.3945,
+      "step": 1879
+    },
+    {
+      "epoch": 5.768639508070715,
+      "grad_norm": 0.6889199614524841,
+      "learning_rate": 7.479131886477463e-05,
+      "loss": 0.3535,
+      "step": 1880
+    },
+    {
+      "epoch": 5.771714066102998,
+      "grad_norm": 0.7977723479270935,
+      "learning_rate": 7.47245409015025e-05,
+      "loss": 0.3421,
+      "step": 1881
+    },
+    {
+      "epoch": 5.77478862413528,
+      "grad_norm": 0.6946485042572021,
+      "learning_rate": 7.465776293823039e-05,
+      "loss": 0.2671,
+      "step": 1882
+    },
+    {
+      "epoch": 5.777863182167564,
+      "grad_norm": 0.7452620267868042,
+      "learning_rate": 7.459098497495826e-05,
+      "loss": 0.3047,
+      "step": 1883
+    },
+    {
+      "epoch": 5.780937740199846,
+      "grad_norm": 0.6193966269493103,
+      "learning_rate": 7.452420701168615e-05,
+      "loss": 0.2053,
+      "step": 1884
+    },
+    {
+      "epoch": 5.784012298232129,
+      "grad_norm": 0.9007473587989807,
+      "learning_rate": 7.445742904841403e-05,
+      "loss": 0.4161,
+      "step": 1885
+    },
+    {
+      "epoch": 5.787086856264412,
+      "grad_norm": 1.1725136041641235,
+      "learning_rate": 7.439065108514191e-05,
+      "loss": 0.2089,
+      "step": 1886
+    },
+    {
+      "epoch": 5.790161414296695,
+      "grad_norm": 0.9010354280471802,
+      "learning_rate": 7.43238731218698e-05,
+      "loss": 0.3207,
+      "step": 1887
+    },
+    {
+      "epoch": 5.7932359723289775,
+      "grad_norm": 1.4096622467041016,
+      "learning_rate": 7.425709515859767e-05,
+      "loss": 0.3003,
+      "step": 1888
+    },
+    {
+      "epoch": 5.796310530361261,
+      "grad_norm": 1.2428261041641235,
+      "learning_rate": 7.419031719532554e-05,
+      "loss": 0.2606,
+      "step": 1889
+    },
+    {
+      "epoch": 5.799385088393543,
+      "grad_norm": 0.9653693437576294,
+      "learning_rate": 7.412353923205343e-05,
+      "loss": 0.2796,
+      "step": 1890
+    },
+    {
+      "epoch": 5.8024596464258265,
+      "grad_norm": 0.9089574217796326,
+      "learning_rate": 7.40567612687813e-05,
+      "loss": 0.2239,
+      "step": 1891
+    },
+    {
+      "epoch": 5.805534204458109,
+      "grad_norm": 1.7680071592330933,
+      "learning_rate": 7.398998330550919e-05,
+      "loss": 0.3336,
+      "step": 1892
+    },
+    {
+      "epoch": 5.808608762490392,
+      "grad_norm": 0.9146047830581665,
+      "learning_rate": 7.392320534223707e-05,
+      "loss": 0.3298,
+      "step": 1893
+    },
+    {
+      "epoch": 5.811683320522675,
+      "grad_norm": 0.6860531568527222,
+      "learning_rate": 7.385642737896495e-05,
+      "loss": 0.3238,
+      "step": 1894
+    },
+    {
+      "epoch": 5.814757878554958,
+      "grad_norm": 1.0863178968429565,
+      "learning_rate": 7.378964941569283e-05,
+      "loss": 0.2665,
+      "step": 1895
+    },
+    {
+      "epoch": 5.81783243658724,
+      "grad_norm": 1.128209114074707,
+      "learning_rate": 7.37228714524207e-05,
+      "loss": 0.3184,
+      "step": 1896
+    },
+    {
+      "epoch": 5.8209069946195235,
+      "grad_norm": 1.4680668115615845,
+      "learning_rate": 7.365609348914858e-05,
+      "loss": 0.3016,
+      "step": 1897
+    },
+    {
+      "epoch": 5.823981552651806,
+      "grad_norm": 1.0566920042037964,
+      "learning_rate": 7.358931552587647e-05,
+      "loss": 0.3585,
+      "step": 1898
+    },
+    {
+      "epoch": 5.827056110684089,
+      "grad_norm": 1.0700082778930664,
+      "learning_rate": 7.352253756260434e-05,
+      "loss": 0.373,
+      "step": 1899
+    },
+    {
+      "epoch": 5.830130668716372,
+      "grad_norm": 0.8159264326095581,
+      "learning_rate": 7.345575959933221e-05,
+      "loss": 0.3609,
+      "step": 1900
+    },
+    {
+      "epoch": 5.833205226748655,
+      "grad_norm": 2.8568453788757324,
+      "learning_rate": 7.33889816360601e-05,
+      "loss": 0.3603,
+      "step": 1901
+    },
+    {
+      "epoch": 5.836279784780938,
+      "grad_norm": 0.8656408190727234,
+      "learning_rate": 7.332220367278799e-05,
+      "loss": 0.2947,
+      "step": 1902
+    },
+    {
+      "epoch": 5.839354342813221,
+      "grad_norm": 2.0064942836761475,
+      "learning_rate": 7.325542570951587e-05,
+      "loss": 0.3535,
+      "step": 1903
+    },
+    {
+      "epoch": 5.842428900845503,
+      "grad_norm": 0.9026947617530823,
+      "learning_rate": 7.318864774624375e-05,
+      "loss": 0.3323,
+      "step": 1904
+    },
+    {
+      "epoch": 5.845503458877786,
+      "grad_norm": 0.9408707022666931,
+      "learning_rate": 7.312186978297162e-05,
+      "loss": 0.2998,
+      "step": 1905
+    },
+    {
+      "epoch": 5.84857801691007,
+      "grad_norm": 1.8345344066619873,
+      "learning_rate": 7.30550918196995e-05,
+      "loss": 0.219,
+      "step": 1906
+    },
+    {
+      "epoch": 5.851652574942352,
+      "grad_norm": 0.784744918346405,
+      "learning_rate": 7.298831385642738e-05,
+      "loss": 0.2207,
+      "step": 1907
+    },
+    {
+      "epoch": 5.854727132974634,
+      "grad_norm": 1.4914350509643555,
+      "learning_rate": 7.292153589315525e-05,
+      "loss": 0.347,
+      "step": 1908
+    },
+    {
+      "epoch": 5.857801691006918,
+      "grad_norm": 0.7770729660987854,
+      "learning_rate": 7.285475792988314e-05,
+      "loss": 0.3199,
+      "step": 1909
+    },
+    {
+      "epoch": 5.860876249039201,
+      "grad_norm": 0.6912123560905457,
+      "learning_rate": 7.278797996661103e-05,
+      "loss": 0.223,
+      "step": 1910
+    },
+    {
+      "epoch": 5.863950807071483,
+      "grad_norm": 0.6402047872543335,
+      "learning_rate": 7.272120200333891e-05,
+      "loss": 0.3393,
+      "step": 1911
+    },
+    {
+      "epoch": 5.867025365103767,
+      "grad_norm": 1.5074280500411987,
+      "learning_rate": 7.265442404006679e-05,
+      "loss": 0.4208,
+      "step": 1912
+    },
+    {
+      "epoch": 5.870099923136049,
+      "grad_norm": 1.1925088167190552,
+      "learning_rate": 7.258764607679466e-05,
+      "loss": 0.3486,
+      "step": 1913
+    },
+    {
+      "epoch": 5.873174481168332,
+      "grad_norm": 0.7136446237564087,
+      "learning_rate": 7.252086811352255e-05,
+      "loss": 0.3058,
+      "step": 1914
+    },
+    {
+      "epoch": 5.876249039200615,
+      "grad_norm": 0.7760949730873108,
+      "learning_rate": 7.245409015025042e-05,
+      "loss": 0.3423,
+      "step": 1915
+    },
+    {
+      "epoch": 5.879323597232898,
+      "grad_norm": 0.7054867744445801,
+      "learning_rate": 7.238731218697829e-05,
+      "loss": 0.3382,
+      "step": 1916
+    },
+    {
+      "epoch": 5.8823981552651805,
+      "grad_norm": 0.799457311630249,
+      "learning_rate": 7.232053422370618e-05,
+      "loss": 0.3117,
+      "step": 1917
+    },
+    {
+      "epoch": 5.885472713297464,
+      "grad_norm": 0.714888334274292,
+      "learning_rate": 7.225375626043405e-05,
+      "loss": 0.3093,
+      "step": 1918
+    },
+    {
+      "epoch": 5.888547271329746,
+      "grad_norm": 0.6139503717422485,
+      "learning_rate": 7.218697829716194e-05,
+      "loss": 0.2015,
+      "step": 1919
+    },
+    {
+      "epoch": 5.8916218293620295,
+      "grad_norm": 1.060932993888855,
+      "learning_rate": 7.212020033388982e-05,
+      "loss": 0.3479,
+      "step": 1920
+    },
+    {
+      "epoch": 5.894696387394312,
+      "grad_norm": 0.7673906683921814,
+      "learning_rate": 7.20534223706177e-05,
+      "loss": 0.261,
+      "step": 1921
+    },
+    {
+      "epoch": 5.897770945426595,
+      "grad_norm": 0.9193598031997681,
+      "learning_rate": 7.198664440734558e-05,
+      "loss": 0.3752,
+      "step": 1922
+    },
+    {
+      "epoch": 5.900845503458878,
+      "grad_norm": 0.8515580296516418,
+      "learning_rate": 7.191986644407346e-05,
+      "loss": 0.3318,
+      "step": 1923
+    },
+    {
+      "epoch": 5.903920061491161,
+      "grad_norm": 0.7641887664794922,
+      "learning_rate": 7.185308848080133e-05,
+      "loss": 0.352,
+      "step": 1924
+    },
+    {
+      "epoch": 5.906994619523443,
+      "grad_norm": 1.0089126825332642,
+      "learning_rate": 7.178631051752922e-05,
+      "loss": 0.262,
+      "step": 1925
+    },
+    {
+      "epoch": 5.910069177555727,
+      "grad_norm": 1.0288993120193481,
+      "learning_rate": 7.171953255425709e-05,
+      "loss": 0.294,
+      "step": 1926
+    },
+    {
+      "epoch": 5.913143735588009,
+      "grad_norm": 1.4110133647918701,
+      "learning_rate": 7.165275459098498e-05,
+      "loss": 0.2831,
+      "step": 1927
+    },
+    {
+      "epoch": 5.916218293620292,
+      "grad_norm": 0.9518840909004211,
+      "learning_rate": 7.158597662771286e-05,
+      "loss": 0.2872,
+      "step": 1928
+    },
+    {
+      "epoch": 5.919292851652575,
+      "grad_norm": 0.8720163106918335,
+      "learning_rate": 7.151919866444074e-05,
+      "loss": 0.2104,
+      "step": 1929
+    },
+    {
+      "epoch": 5.922367409684858,
+      "grad_norm": 1.4843337535858154,
+      "learning_rate": 7.145242070116862e-05,
+      "loss": 0.4018,
+      "step": 1930
+    },
+    {
+      "epoch": 5.92544196771714,
+      "grad_norm": 0.7498276233673096,
+      "learning_rate": 7.13856427378965e-05,
+      "loss": 0.2566,
+      "step": 1931
+    },
+    {
+      "epoch": 5.928516525749424,
+      "grad_norm": 0.856194019317627,
+      "learning_rate": 7.131886477462437e-05,
+      "loss": 0.2937,
+      "step": 1932
+    },
+    {
+      "epoch": 5.931591083781706,
+      "grad_norm": 0.766518771648407,
+      "learning_rate": 7.125208681135226e-05,
+      "loss": 0.2397,
+      "step": 1933
+    },
+    {
+      "epoch": 5.934665641813989,
+      "grad_norm": 0.9151931405067444,
+      "learning_rate": 7.118530884808013e-05,
+      "loss": 0.3297,
+      "step": 1934
+    },
+    {
+      "epoch": 5.937740199846272,
+      "grad_norm": 0.7466654181480408,
+      "learning_rate": 7.111853088480802e-05,
+      "loss": 0.303,
+      "step": 1935
+    },
+    {
+      "epoch": 5.940814757878555,
+      "grad_norm": 0.8686262965202332,
+      "learning_rate": 7.105175292153589e-05,
+      "loss": 0.2854,
+      "step": 1936
+    },
+    {
+      "epoch": 5.9438893159108375,
+      "grad_norm": 0.72053462266922,
+      "learning_rate": 7.098497495826378e-05,
+      "loss": 0.3252,
+      "step": 1937
+    },
+    {
+      "epoch": 5.946963873943121,
+      "grad_norm": 0.9415873885154724,
+      "learning_rate": 7.091819699499166e-05,
+      "loss": 0.2553,
+      "step": 1938
+    },
+    {
+      "epoch": 5.950038431975403,
+      "grad_norm": 0.7902587056159973,
+      "learning_rate": 7.085141903171954e-05,
+      "loss": 0.3969,
+      "step": 1939
+    },
+    {
+      "epoch": 5.9531129900076865,
+      "grad_norm": 0.7759074568748474,
+      "learning_rate": 7.078464106844741e-05,
+      "loss": 0.2123,
+      "step": 1940
+    },
+    {
+      "epoch": 5.956187548039969,
+      "grad_norm": 0.9863756895065308,
+      "learning_rate": 7.07178631051753e-05,
+      "loss": 0.3059,
+      "step": 1941
+    },
+    {
+      "epoch": 5.959262106072252,
+      "grad_norm": 1.1259315013885498,
+      "learning_rate": 7.065108514190317e-05,
+      "loss": 0.276,
+      "step": 1942
+    },
+    {
+      "epoch": 5.9623366641045354,
+      "grad_norm": 0.7862762212753296,
+      "learning_rate": 7.058430717863106e-05,
+      "loss": 0.3132,
+      "step": 1943
+    },
+    {
+      "epoch": 5.965411222136818,
+      "grad_norm": 1.240963101387024,
+      "learning_rate": 7.051752921535893e-05,
+      "loss": 0.3393,
+      "step": 1944
+    },
+    {
+      "epoch": 5.9684857801691,
+      "grad_norm": 0.9186695218086243,
+      "learning_rate": 7.045075125208682e-05,
+      "loss": 0.3061,
+      "step": 1945
+    },
+    {
+      "epoch": 5.9715603382013835,
+      "grad_norm": 0.6734002232551575,
+      "learning_rate": 7.03839732888147e-05,
+      "loss": 0.3114,
+      "step": 1946
+    },
+    {
+      "epoch": 5.974634896233667,
+      "grad_norm": 1.0199098587036133,
+      "learning_rate": 7.031719532554258e-05,
+      "loss": 0.2419,
+      "step": 1947
+    },
+    {
+      "epoch": 5.977709454265949,
+      "grad_norm": 0.8766542673110962,
+      "learning_rate": 7.025041736227045e-05,
+      "loss": 0.3097,
+      "step": 1948
+    },
+    {
+      "epoch": 5.980784012298232,
+      "grad_norm": 1.1471467018127441,
+      "learning_rate": 7.018363939899834e-05,
+      "loss": 0.4017,
+      "step": 1949
+    },
+    {
+      "epoch": 5.983858570330515,
+      "grad_norm": 0.9350420832633972,
+      "learning_rate": 7.011686143572621e-05,
+      "loss": 0.3464,
+      "step": 1950
+    },
+    {
+      "epoch": 5.986933128362798,
+      "grad_norm": 1.6979084014892578,
+      "learning_rate": 7.00500834724541e-05,
+      "loss": 0.3693,
+      "step": 1951
+    },
+    {
+      "epoch": 5.990007686395081,
+      "grad_norm": 0.822771430015564,
+      "learning_rate": 6.998330550918197e-05,
+      "loss": 0.2756,
+      "step": 1952
+    },
+    {
+      "epoch": 5.993082244427364,
+      "grad_norm": 1.6150286197662354,
+      "learning_rate": 6.991652754590986e-05,
+      "loss": 0.3242,
+      "step": 1953
+    },
+    {
+      "epoch": 5.996156802459646,
+      "grad_norm": 1.161136507987976,
+      "learning_rate": 6.984974958263774e-05,
+      "loss": 0.3127,
+      "step": 1954
+    },
+    {
+      "epoch": 5.99923136049193,
+      "grad_norm": 0.7732776403427124,
+      "learning_rate": 6.978297161936562e-05,
+      "loss": 0.2713,
+      "step": 1955
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": NaN,
+      "learning_rate": 6.978297161936562e-05,
+      "loss": 0.2807,
+      "step": 1956
+    },
+    {
+      "epoch": 6.003074558032283,
+      "grad_norm": 0.8221515417098999,
+      "learning_rate": 6.971619365609349e-05,
+      "loss": 0.3441,
+      "step": 1957
+    },
+    {
+      "epoch": 6.006149116064566,
+      "grad_norm": 0.7294884324073792,
+      "learning_rate": 6.964941569282138e-05,
+      "loss": 0.2538,
+      "step": 1958
+    },
+    {
+      "epoch": 6.009223674096849,
+      "grad_norm": 0.48791152238845825,
+      "learning_rate": 6.958263772954925e-05,
+      "loss": 0.2395,
+      "step": 1959
+    },
+    {
+      "epoch": 6.012298232129131,
+      "grad_norm": 0.7092695236206055,
+      "learning_rate": 6.951585976627714e-05,
+      "loss": 0.2032,
+      "step": 1960
+    },
+    {
+      "epoch": 6.015372790161415,
+      "grad_norm": 0.6608801484107971,
+      "learning_rate": 6.944908180300501e-05,
+      "loss": 0.2752,
+      "step": 1961
+    },
+    {
+      "epoch": 6.018447348193697,
+      "grad_norm": 0.7612819671630859,
+      "learning_rate": 6.938230383973288e-05,
+      "loss": 0.2647,
+      "step": 1962
+    },
+    {
+      "epoch": 6.02152190622598,
+      "grad_norm": 0.6396717429161072,
+      "learning_rate": 6.931552587646077e-05,
+      "loss": 0.319,
+      "step": 1963
+    },
+    {
+      "epoch": 6.024596464258263,
+      "grad_norm": 0.6826348304748535,
+      "learning_rate": 6.924874791318865e-05,
+      "loss": 0.1987,
+      "step": 1964
+    },
+    {
+      "epoch": 6.027671022290546,
+      "grad_norm": 0.7029145956039429,
+      "learning_rate": 6.918196994991654e-05,
+      "loss": 0.1958,
+      "step": 1965
+    },
+    {
+      "epoch": 6.0307455803228285,
+      "grad_norm": 0.46820268034935,
+      "learning_rate": 6.911519198664441e-05,
+      "loss": 0.2496,
+      "step": 1966
+    },
+    {
+      "epoch": 6.033820138355112,
+      "grad_norm": 0.6609360575675964,
+      "learning_rate": 6.904841402337229e-05,
+      "loss": 0.2595,
+      "step": 1967
+    },
+    {
+      "epoch": 6.036894696387394,
+      "grad_norm": 1.3419384956359863,
+      "learning_rate": 6.898163606010017e-05,
+      "loss": 0.1628,
+      "step": 1968
+    },
+    {
+      "epoch": 6.0399692544196775,
+      "grad_norm": 1.2338601350784302,
+      "learning_rate": 6.891485809682805e-05,
+      "loss": 0.245,
+      "step": 1969
+    },
+    {
+      "epoch": 6.04304381245196,
+      "grad_norm": 0.9335373640060425,
+      "learning_rate": 6.884808013355592e-05,
+      "loss": 0.2059,
+      "step": 1970
+    },
+    {
+      "epoch": 6.046118370484243,
+      "grad_norm": 0.7417526245117188,
+      "learning_rate": 6.878130217028381e-05,
+      "loss": 0.2704,
+      "step": 1971
+    },
+    {
+      "epoch": 6.049192928516526,
+      "grad_norm": 0.7647474408149719,
+      "learning_rate": 6.87145242070117e-05,
+      "loss": 0.2319,
+      "step": 1972
+    },
+    {
+      "epoch": 6.052267486548809,
+      "grad_norm": 0.6899215579032898,
+      "learning_rate": 6.864774624373958e-05,
+      "loss": 0.1459,
+      "step": 1973
+    },
+    {
+      "epoch": 6.055342044581091,
+      "grad_norm": 0.9612866044044495,
+      "learning_rate": 6.858096828046745e-05,
+      "loss": 0.3229,
+      "step": 1974
+    },
+    {
+      "epoch": 6.058416602613375,
+      "grad_norm": 1.1122326850891113,
+      "learning_rate": 6.851419031719533e-05,
+      "loss": 0.2506,
+      "step": 1975
+    },
+    {
+      "epoch": 6.061491160645657,
+      "grad_norm": 0.929296612739563,
+      "learning_rate": 6.844741235392321e-05,
+      "loss": 0.2081,
+      "step": 1976
+    },
+    {
+      "epoch": 6.06456571867794,
+      "grad_norm": 0.7660003304481506,
+      "learning_rate": 6.838063439065109e-05,
+      "loss": 0.2926,
+      "step": 1977
+    },
+    {
+      "epoch": 6.067640276710223,
+      "grad_norm": 0.5416483879089355,
+      "learning_rate": 6.831385642737896e-05,
+      "loss": 0.3146,
+      "step": 1978
+    },
+    {
+      "epoch": 6.070714834742506,
+      "grad_norm": 1.074669599533081,
+      "learning_rate": 6.824707846410685e-05,
+      "loss": 0.2937,
+      "step": 1979
+    },
+    {
+      "epoch": 6.073789392774788,
+      "grad_norm": 2.4369921684265137,
+      "learning_rate": 6.818030050083472e-05,
+      "loss": 0.2009,
+      "step": 1980
+    },
+    {
+      "epoch": 6.076863950807072,
+      "grad_norm": 1.6385631561279297,
+      "learning_rate": 6.811352253756261e-05,
+      "loss": 0.2339,
+      "step": 1981
+    },
+    {
+      "epoch": 6.079938508839354,
+      "grad_norm": 1.0690921545028687,
+      "learning_rate": 6.80467445742905e-05,
+      "loss": 0.2365,
+      "step": 1982
+    },
+    {
+      "epoch": 6.083013066871637,
+      "grad_norm": 1.094051480293274,
+      "learning_rate": 6.797996661101837e-05,
+      "loss": 0.3067,
+      "step": 1983
+    },
+    {
+      "epoch": 6.08608762490392,
+      "grad_norm": 1.2970466613769531,
+      "learning_rate": 6.791318864774625e-05,
+      "loss": 0.2976,
+      "step": 1984
+    },
+    {
+      "epoch": 6.089162182936203,
+      "grad_norm": 0.678546130657196,
+      "learning_rate": 6.784641068447413e-05,
+      "loss": 0.2391,
+      "step": 1985
+    },
+    {
+      "epoch": 6.092236740968485,
+      "grad_norm": 0.6241523027420044,
+      "learning_rate": 6.7779632721202e-05,
+      "loss": 0.2372,
+      "step": 1986
+    },
+    {
+      "epoch": 6.095311299000769,
+      "grad_norm": 0.8551547527313232,
+      "learning_rate": 6.771285475792989e-05,
+      "loss": 0.2309,
+      "step": 1987
+    },
+    {
+      "epoch": 6.098385857033051,
+      "grad_norm": 1.2978205680847168,
+      "learning_rate": 6.764607679465776e-05,
+      "loss": 0.221,
+      "step": 1988
+    },
+    {
+      "epoch": 6.101460415065334,
+      "grad_norm": 0.48349693417549133,
+      "learning_rate": 6.757929883138565e-05,
+      "loss": 0.2102,
+      "step": 1989
+    },
+    {
+      "epoch": 6.104534973097617,
+      "grad_norm": 0.9165658950805664,
+      "learning_rate": 6.751252086811353e-05,
+      "loss": 0.3046,
+      "step": 1990
+    },
+    {
+      "epoch": 6.1076095311299,
+      "grad_norm": 0.6445243954658508,
+      "learning_rate": 6.74457429048414e-05,
+      "loss": 0.2875,
+      "step": 1991
+    },
+    {
+      "epoch": 6.1106840891621825,
+      "grad_norm": 0.8826196789741516,
+      "learning_rate": 6.737896494156929e-05,
+      "loss": 0.3097,
+      "step": 1992
+    },
+    {
+      "epoch": 6.113758647194466,
+      "grad_norm": 0.7305975556373596,
+      "learning_rate": 6.731218697829717e-05,
+      "loss": 0.2431,
+      "step": 1993
+    },
+    {
+      "epoch": 6.116833205226748,
+      "grad_norm": 0.6952454447746277,
+      "learning_rate": 6.724540901502504e-05,
+      "loss": 0.2939,
+      "step": 1994
+    },
+    {
+      "epoch": 6.1199077632590315,
+      "grad_norm": 0.6714677810668945,
+      "learning_rate": 6.717863105175293e-05,
+      "loss": 0.2164,
+      "step": 1995
+    },
+    {
+      "epoch": 6.122982321291314,
+      "grad_norm": 1.3943935632705688,
+      "learning_rate": 6.71118530884808e-05,
+      "loss": 0.2819,
+      "step": 1996
+    },
+    {
+      "epoch": 6.126056879323597,
+      "grad_norm": 0.8125165700912476,
+      "learning_rate": 6.704507512520869e-05,
+      "loss": 0.2346,
+      "step": 1997
+    },
+    {
+      "epoch": 6.1291314373558805,
+      "grad_norm": 0.8236249089241028,
+      "learning_rate": 6.697829716193656e-05,
+      "loss": 0.2934,
+      "step": 1998
+    },
+    {
+      "epoch": 6.132205995388163,
+      "grad_norm": 0.6829390525817871,
+      "learning_rate": 6.691151919866445e-05,
+      "loss": 0.2319,
+      "step": 1999
+    },
+    {
+      "epoch": 6.135280553420446,
+      "grad_norm": 0.7294898629188538,
+      "learning_rate": 6.684474123539233e-05,
+      "loss": 0.2563,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 3000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.2193823136997376e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}