{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9939263407279775, "eval_steps": 500, "global_step": 1086, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0027568382511307344, "grad_norm": 7.122869968414307, "learning_rate": 6.060606060606061e-07, "loss": 0.4664, "step": 1 }, { "epoch": 0.005513676502261469, "grad_norm": 7.916077136993408, "learning_rate": 1.2121212121212122e-06, "loss": 0.4771, "step": 2 }, { "epoch": 0.008270514753392204, "grad_norm": 7.081696033477783, "learning_rate": 1.8181818181818183e-06, "loss": 0.4614, "step": 3 }, { "epoch": 0.011027353004522938, "grad_norm": 5.813568115234375, "learning_rate": 2.4242424242424244e-06, "loss": 0.4299, "step": 4 }, { "epoch": 0.013784191255653673, "grad_norm": 4.66733455657959, "learning_rate": 3.0303030303030305e-06, "loss": 0.3728, "step": 5 }, { "epoch": 0.01654102950678441, "grad_norm": 6.349670886993408, "learning_rate": 3.6363636363636366e-06, "loss": 0.3623, "step": 6 }, { "epoch": 0.01929786775791514, "grad_norm": 3.68088436126709, "learning_rate": 4.242424242424243e-06, "loss": 0.2288, "step": 7 }, { "epoch": 0.022054706009045875, "grad_norm": 2.508946418762207, "learning_rate": 4.848484848484849e-06, "loss": 0.1955, "step": 8 }, { "epoch": 0.02481154426017661, "grad_norm": 7.334312915802002, "learning_rate": 5.4545454545454545e-06, "loss": 0.1915, "step": 9 }, { "epoch": 0.027568382511307346, "grad_norm": 6.7029619216918945, "learning_rate": 6.060606060606061e-06, "loss": 0.1546, "step": 10 }, { "epoch": 0.030325220762438078, "grad_norm": 4.743812561035156, "learning_rate": 6.666666666666667e-06, "loss": 0.1565, "step": 11 }, { "epoch": 0.03308205901356882, "grad_norm": 1.9065254926681519, "learning_rate": 7.272727272727273e-06, "loss": 0.1344, "step": 12 }, { "epoch": 0.03583889726469955, "grad_norm": 1.0832651853561401, "learning_rate": 7.87878787878788e-06, "loss": 0.1244, "step": 13 }, { "epoch": 0.03859573551583028, "grad_norm": 1.2000082731246948, "learning_rate": 8.484848484848486e-06, "loss": 0.1115, "step": 14 }, { "epoch": 0.04135257376696102, "grad_norm": 0.937786877155304, "learning_rate": 9.090909090909091e-06, "loss": 0.1026, "step": 15 }, { "epoch": 0.04410941201809175, "grad_norm": 1.0514339208602905, "learning_rate": 9.696969696969698e-06, "loss": 0.0976, "step": 16 }, { "epoch": 0.04686625026922248, "grad_norm": 3.448519706726074, "learning_rate": 1.0303030303030304e-05, "loss": 0.1035, "step": 17 }, { "epoch": 0.04962308852035322, "grad_norm": 1.5411934852600098, "learning_rate": 1.0909090909090909e-05, "loss": 0.0969, "step": 18 }, { "epoch": 0.05237992677148395, "grad_norm": 0.6320972442626953, "learning_rate": 1.1515151515151517e-05, "loss": 0.0884, "step": 19 }, { "epoch": 0.05513676502261469, "grad_norm": 0.967760443687439, "learning_rate": 1.2121212121212122e-05, "loss": 0.0851, "step": 20 }, { "epoch": 0.057893603273745424, "grad_norm": 0.5067126750946045, "learning_rate": 1.2727272727272728e-05, "loss": 0.079, "step": 21 }, { "epoch": 0.060650441524876156, "grad_norm": 0.5082628726959229, "learning_rate": 1.3333333333333333e-05, "loss": 0.0787, "step": 22 }, { "epoch": 0.0634072797760069, "grad_norm": 1.0926717519760132, "learning_rate": 1.3939393939393942e-05, "loss": 0.0751, "step": 23 }, { "epoch": 0.06616411802713763, "grad_norm": 0.43889403343200684, "learning_rate": 1.4545454545454546e-05, "loss": 0.0696, "step": 24 }, { "epoch": 0.06892095627826836, "grad_norm": 0.3727055788040161, "learning_rate": 1.5151515151515153e-05, "loss": 0.0693, "step": 25 }, { "epoch": 0.0716777945293991, "grad_norm": 0.4280584454536438, "learning_rate": 1.575757575757576e-05, "loss": 0.0727, "step": 26 }, { "epoch": 0.07443463278052984, "grad_norm": 0.36678358912467957, "learning_rate": 1.6363636363636366e-05, "loss": 0.0654, "step": 27 }, { "epoch": 0.07719147103166056, "grad_norm": 0.3085872530937195, "learning_rate": 1.6969696969696972e-05, "loss": 0.0614, "step": 28 }, { "epoch": 0.0799483092827913, "grad_norm": 0.2715630829334259, "learning_rate": 1.7575757575757576e-05, "loss": 0.0598, "step": 29 }, { "epoch": 0.08270514753392204, "grad_norm": 0.36059409379959106, "learning_rate": 1.8181818181818182e-05, "loss": 0.0602, "step": 30 }, { "epoch": 0.08546198578505276, "grad_norm": 0.3720782995223999, "learning_rate": 1.8787878787878792e-05, "loss": 0.0602, "step": 31 }, { "epoch": 0.0882188240361835, "grad_norm": 0.24194574356079102, "learning_rate": 1.9393939393939395e-05, "loss": 0.0553, "step": 32 }, { "epoch": 0.09097566228731424, "grad_norm": 0.5713000893592834, "learning_rate": 2e-05, "loss": 0.0566, "step": 33 }, { "epoch": 0.09373250053844497, "grad_norm": 0.4913846552371979, "learning_rate": 1.9999955494602408e-05, "loss": 0.0582, "step": 34 }, { "epoch": 0.0964893387895757, "grad_norm": 0.26757943630218506, "learning_rate": 1.999982197880577e-05, "loss": 0.0544, "step": 35 }, { "epoch": 0.09924617704070644, "grad_norm": 0.31889644265174866, "learning_rate": 1.9999599453798523e-05, "loss": 0.055, "step": 36 }, { "epoch": 0.10200301529183717, "grad_norm": 0.26153671741485596, "learning_rate": 1.9999287921561385e-05, "loss": 0.0508, "step": 37 }, { "epoch": 0.1047598535429679, "grad_norm": 0.2633945345878601, "learning_rate": 1.9998887384867323e-05, "loss": 0.0506, "step": 38 }, { "epoch": 0.10751669179409865, "grad_norm": 0.22508633136749268, "learning_rate": 1.9998397847281548e-05, "loss": 0.0511, "step": 39 }, { "epoch": 0.11027353004522938, "grad_norm": 0.23420976102352142, "learning_rate": 1.9997819313161476e-05, "loss": 0.0504, "step": 40 }, { "epoch": 0.11303036829636011, "grad_norm": 0.3166053295135498, "learning_rate": 1.9997151787656678e-05, "loss": 0.0564, "step": 41 }, { "epoch": 0.11578720654749085, "grad_norm": 0.19986656308174133, "learning_rate": 1.9996395276708856e-05, "loss": 0.0512, "step": 42 }, { "epoch": 0.11854404479862159, "grad_norm": 0.274948388338089, "learning_rate": 1.9995549787051772e-05, "loss": 0.0489, "step": 43 }, { "epoch": 0.12130088304975231, "grad_norm": 0.23236143589019775, "learning_rate": 1.9994615326211203e-05, "loss": 0.0491, "step": 44 }, { "epoch": 0.12405772130088305, "grad_norm": 0.22261761128902435, "learning_rate": 1.9993591902504854e-05, "loss": 0.048, "step": 45 }, { "epoch": 0.1268145595520138, "grad_norm": 0.22965197265148163, "learning_rate": 1.9992479525042305e-05, "loss": 0.0473, "step": 46 }, { "epoch": 0.12957139780314453, "grad_norm": 0.22254930436611176, "learning_rate": 1.9991278203724908e-05, "loss": 0.0468, "step": 47 }, { "epoch": 0.13232823605427527, "grad_norm": 0.18861782550811768, "learning_rate": 1.9989987949245725e-05, "loss": 0.0446, "step": 48 }, { "epoch": 0.13508507430540598, "grad_norm": 0.2599838078022003, "learning_rate": 1.9988608773089413e-05, "loss": 0.0467, "step": 49 }, { "epoch": 0.13784191255653672, "grad_norm": 0.24845938384532928, "learning_rate": 1.998714068753213e-05, "loss": 0.0461, "step": 50 }, { "epoch": 0.14059875080766746, "grad_norm": 0.18050189316272736, "learning_rate": 1.9985583705641418e-05, "loss": 0.0459, "step": 51 }, { "epoch": 0.1433555890587982, "grad_norm": 0.2283959537744522, "learning_rate": 1.9983937841276103e-05, "loss": 0.0452, "step": 52 }, { "epoch": 0.14611242730992893, "grad_norm": 0.21824775636196136, "learning_rate": 1.9982203109086153e-05, "loss": 0.044, "step": 53 }, { "epoch": 0.14886926556105967, "grad_norm": 0.2457500696182251, "learning_rate": 1.998037952451255e-05, "loss": 0.0445, "step": 54 }, { "epoch": 0.15162610381219038, "grad_norm": 0.2404290735721588, "learning_rate": 1.9978467103787176e-05, "loss": 0.0461, "step": 55 }, { "epoch": 0.15438294206332112, "grad_norm": 0.24108456075191498, "learning_rate": 1.9976465863932632e-05, "loss": 0.044, "step": 56 }, { "epoch": 0.15713978031445186, "grad_norm": 0.19976645708084106, "learning_rate": 1.9974375822762117e-05, "loss": 0.0434, "step": 57 }, { "epoch": 0.1598966185655826, "grad_norm": 0.20052076876163483, "learning_rate": 1.9972196998879254e-05, "loss": 0.0442, "step": 58 }, { "epoch": 0.16265345681671334, "grad_norm": 0.19367817044258118, "learning_rate": 1.996992941167792e-05, "loss": 0.0436, "step": 59 }, { "epoch": 0.16541029506784408, "grad_norm": 0.21227090060710907, "learning_rate": 1.9967573081342103e-05, "loss": 0.0439, "step": 60 }, { "epoch": 0.1681671333189748, "grad_norm": 0.20237918198108673, "learning_rate": 1.9965128028845676e-05, "loss": 0.0414, "step": 61 }, { "epoch": 0.17092397157010553, "grad_norm": 0.16752171516418457, "learning_rate": 1.9962594275952246e-05, "loss": 0.0426, "step": 62 }, { "epoch": 0.17368080982123626, "grad_norm": 0.1880505383014679, "learning_rate": 1.9959971845214953e-05, "loss": 0.0414, "step": 63 }, { "epoch": 0.176437648072367, "grad_norm": 0.2021344006061554, "learning_rate": 1.995726075997626e-05, "loss": 0.0423, "step": 64 }, { "epoch": 0.17919448632349774, "grad_norm": 0.15245415270328522, "learning_rate": 1.9954461044367752e-05, "loss": 0.0413, "step": 65 }, { "epoch": 0.18195132457462848, "grad_norm": 0.17980217933654785, "learning_rate": 1.9951572723309918e-05, "loss": 0.0421, "step": 66 }, { "epoch": 0.18470816282575922, "grad_norm": 0.1832534670829773, "learning_rate": 1.994859582251194e-05, "loss": 0.0416, "step": 67 }, { "epoch": 0.18746500107688993, "grad_norm": 0.6252762079238892, "learning_rate": 1.9945530368471444e-05, "loss": 0.047, "step": 68 }, { "epoch": 0.19022183932802067, "grad_norm": 0.17044250667095184, "learning_rate": 1.9942376388474282e-05, "loss": 0.0406, "step": 69 }, { "epoch": 0.1929786775791514, "grad_norm": 0.1817871630191803, "learning_rate": 1.9939133910594276e-05, "loss": 0.042, "step": 70 }, { "epoch": 0.19573551583028215, "grad_norm": 0.1658797264099121, "learning_rate": 1.9935802963692988e-05, "loss": 0.041, "step": 71 }, { "epoch": 0.1984923540814129, "grad_norm": 0.18323002755641937, "learning_rate": 1.9932383577419432e-05, "loss": 0.0407, "step": 72 }, { "epoch": 0.20124919233254362, "grad_norm": 0.200252503156662, "learning_rate": 1.992887578220984e-05, "loss": 0.0427, "step": 73 }, { "epoch": 0.20400603058367434, "grad_norm": 0.19649459421634674, "learning_rate": 1.9925279609287384e-05, "loss": 0.0439, "step": 74 }, { "epoch": 0.20676286883480507, "grad_norm": 0.17329928278923035, "learning_rate": 1.9921595090661872e-05, "loss": 0.0415, "step": 75 }, { "epoch": 0.2095197070859358, "grad_norm": 0.4066649377346039, "learning_rate": 1.9917822259129508e-05, "loss": 0.0416, "step": 76 }, { "epoch": 0.21227654533706655, "grad_norm": 0.1922263205051422, "learning_rate": 1.991396114827256e-05, "loss": 0.041, "step": 77 }, { "epoch": 0.2150333835881973, "grad_norm": 0.1683184802532196, "learning_rate": 1.9910011792459086e-05, "loss": 0.0411, "step": 78 }, { "epoch": 0.21779022183932803, "grad_norm": 0.43947067856788635, "learning_rate": 1.9905974226842614e-05, "loss": 0.0393, "step": 79 }, { "epoch": 0.22054706009045877, "grad_norm": 0.22764648497104645, "learning_rate": 1.9901848487361834e-05, "loss": 0.0415, "step": 80 }, { "epoch": 0.22330389834158948, "grad_norm": 0.1933886855840683, "learning_rate": 1.989763461074029e-05, "loss": 0.0404, "step": 81 }, { "epoch": 0.22606073659272022, "grad_norm": 0.1945038139820099, "learning_rate": 1.989333263448602e-05, "loss": 0.0398, "step": 82 }, { "epoch": 0.22881757484385096, "grad_norm": 0.21237650513648987, "learning_rate": 1.9888942596891267e-05, "loss": 0.0406, "step": 83 }, { "epoch": 0.2315744130949817, "grad_norm": 0.19435830414295197, "learning_rate": 1.9884464537032103e-05, "loss": 0.0397, "step": 84 }, { "epoch": 0.23433125134611243, "grad_norm": 0.17015162110328674, "learning_rate": 1.9879898494768093e-05, "loss": 0.0398, "step": 85 }, { "epoch": 0.23708808959724317, "grad_norm": 0.19542446732521057, "learning_rate": 1.987524451074194e-05, "loss": 0.0386, "step": 86 }, { "epoch": 0.23984492784837388, "grad_norm": 0.23729243874549866, "learning_rate": 1.9870502626379127e-05, "loss": 0.0406, "step": 87 }, { "epoch": 0.24260176609950462, "grad_norm": 0.18620969355106354, "learning_rate": 1.9865672883887553e-05, "loss": 0.0395, "step": 88 }, { "epoch": 0.24535860435063536, "grad_norm": 0.254768043756485, "learning_rate": 1.9860755326257127e-05, "loss": 0.0405, "step": 89 }, { "epoch": 0.2481154426017661, "grad_norm": 0.2777726650238037, "learning_rate": 1.985574999725943e-05, "loss": 0.0406, "step": 90 }, { "epoch": 0.2508722808528968, "grad_norm": 0.23211295902729034, "learning_rate": 1.985065694144728e-05, "loss": 0.0389, "step": 91 }, { "epoch": 0.2536291191040276, "grad_norm": 0.22822345793247223, "learning_rate": 1.9845476204154387e-05, "loss": 0.0393, "step": 92 }, { "epoch": 0.2563859573551583, "grad_norm": 0.2358836829662323, "learning_rate": 1.9840207831494903e-05, "loss": 0.0383, "step": 93 }, { "epoch": 0.25914279560628906, "grad_norm": 0.2143673151731491, "learning_rate": 1.9834851870363024e-05, "loss": 0.0411, "step": 94 }, { "epoch": 0.26189963385741977, "grad_norm": 0.21156637370586395, "learning_rate": 1.9829408368432592e-05, "loss": 0.0393, "step": 95 }, { "epoch": 0.26465647210855053, "grad_norm": 0.22637519240379333, "learning_rate": 1.9823877374156647e-05, "loss": 0.0375, "step": 96 }, { "epoch": 0.26741331035968124, "grad_norm": 0.2607465386390686, "learning_rate": 1.9818258936767013e-05, "loss": 0.0371, "step": 97 }, { "epoch": 0.27017014861081196, "grad_norm": 0.1963801383972168, "learning_rate": 1.9812553106273848e-05, "loss": 0.0377, "step": 98 }, { "epoch": 0.2729269868619427, "grad_norm": 0.1716557741165161, "learning_rate": 1.98067599334652e-05, "loss": 0.0378, "step": 99 }, { "epoch": 0.27568382511307343, "grad_norm": 0.18233723938465118, "learning_rate": 1.980087946990656e-05, "loss": 0.0385, "step": 100 }, { "epoch": 0.2784406633642042, "grad_norm": 0.19480392336845398, "learning_rate": 1.9794911767940405e-05, "loss": 0.0377, "step": 101 }, { "epoch": 0.2811975016153349, "grad_norm": 0.21586693823337555, "learning_rate": 1.978885688068572e-05, "loss": 0.0366, "step": 102 }, { "epoch": 0.2839543398664656, "grad_norm": 0.4663015305995941, "learning_rate": 1.9782714862037544e-05, "loss": 0.0397, "step": 103 }, { "epoch": 0.2867111781175964, "grad_norm": 0.43779700994491577, "learning_rate": 1.977648576666647e-05, "loss": 0.0405, "step": 104 }, { "epoch": 0.2894680163687271, "grad_norm": 0.15922895073890686, "learning_rate": 1.977016965001817e-05, "loss": 0.0365, "step": 105 }, { "epoch": 0.29222485461985787, "grad_norm": 0.20054024457931519, "learning_rate": 1.9763766568312906e-05, "loss": 0.0397, "step": 106 }, { "epoch": 0.2949816928709886, "grad_norm": 0.23604053258895874, "learning_rate": 1.9757276578545013e-05, "loss": 0.0379, "step": 107 }, { "epoch": 0.29773853112211934, "grad_norm": 0.22856947779655457, "learning_rate": 1.9750699738482403e-05, "loss": 0.041, "step": 108 }, { "epoch": 0.30049536937325005, "grad_norm": 0.1889025717973709, "learning_rate": 1.974403610666606e-05, "loss": 0.0376, "step": 109 }, { "epoch": 0.30325220762438077, "grad_norm": 0.16721414029598236, "learning_rate": 1.9737285742409495e-05, "loss": 0.0393, "step": 110 }, { "epoch": 0.30600904587551153, "grad_norm": 0.19392457604408264, "learning_rate": 1.973044870579824e-05, "loss": 0.0382, "step": 111 }, { "epoch": 0.30876588412664224, "grad_norm": 0.17549856007099152, "learning_rate": 1.97235250576893e-05, "loss": 0.0386, "step": 112 }, { "epoch": 0.311522722377773, "grad_norm": 0.15814520418643951, "learning_rate": 1.971651485971062e-05, "loss": 0.0387, "step": 113 }, { "epoch": 0.3142795606289037, "grad_norm": 0.5654919147491455, "learning_rate": 1.9709418174260523e-05, "loss": 0.0403, "step": 114 }, { "epoch": 0.3170363988800345, "grad_norm": 0.16663803160190582, "learning_rate": 1.9702235064507175e-05, "loss": 0.0355, "step": 115 }, { "epoch": 0.3197932371311652, "grad_norm": 0.17256957292556763, "learning_rate": 1.9694965594388008e-05, "loss": 0.0379, "step": 116 }, { "epoch": 0.3225500753822959, "grad_norm": 0.23029914498329163, "learning_rate": 1.9687609828609156e-05, "loss": 0.0384, "step": 117 }, { "epoch": 0.3253069136334267, "grad_norm": 0.1971399337053299, "learning_rate": 1.9680167832644868e-05, "loss": 0.0385, "step": 118 }, { "epoch": 0.3280637518845574, "grad_norm": 0.21078315377235413, "learning_rate": 1.9672639672736947e-05, "loss": 0.0386, "step": 119 }, { "epoch": 0.33082059013568815, "grad_norm": 0.20524373650550842, "learning_rate": 1.966502541589414e-05, "loss": 0.0365, "step": 120 }, { "epoch": 0.33357742838681886, "grad_norm": 0.17085903882980347, "learning_rate": 1.9657325129891558e-05, "loss": 0.0375, "step": 121 }, { "epoch": 0.3363342666379496, "grad_norm": 0.20739462971687317, "learning_rate": 1.9649538883270053e-05, "loss": 0.0367, "step": 122 }, { "epoch": 0.33909110488908034, "grad_norm": 0.17642727494239807, "learning_rate": 1.9641666745335626e-05, "loss": 0.037, "step": 123 }, { "epoch": 0.34184794314021105, "grad_norm": 0.18456482887268066, "learning_rate": 1.9633708786158803e-05, "loss": 0.038, "step": 124 }, { "epoch": 0.3446047813913418, "grad_norm": 0.22288407385349274, "learning_rate": 1.962566507657402e-05, "loss": 0.0378, "step": 125 }, { "epoch": 0.34736161964247253, "grad_norm": 0.20137764513492584, "learning_rate": 1.961753568817896e-05, "loss": 0.0371, "step": 126 }, { "epoch": 0.3501184578936033, "grad_norm": 0.16192878782749176, "learning_rate": 1.9609320693333967e-05, "loss": 0.0375, "step": 127 }, { "epoch": 0.352875296144734, "grad_norm": 0.7257514595985413, "learning_rate": 1.960102016516136e-05, "loss": 0.0393, "step": 128 }, { "epoch": 0.3556321343958647, "grad_norm": 0.15738032758235931, "learning_rate": 1.9592634177544803e-05, "loss": 0.0374, "step": 129 }, { "epoch": 0.3583889726469955, "grad_norm": 0.15156516432762146, "learning_rate": 1.9584162805128636e-05, "loss": 0.0351, "step": 130 }, { "epoch": 0.3611458108981262, "grad_norm": 0.1443106234073639, "learning_rate": 1.9575606123317215e-05, "loss": 0.0359, "step": 131 }, { "epoch": 0.36390264914925696, "grad_norm": 0.16379106044769287, "learning_rate": 1.9566964208274254e-05, "loss": 0.0376, "step": 132 }, { "epoch": 0.3666594874003877, "grad_norm": 0.17260539531707764, "learning_rate": 1.9558237136922117e-05, "loss": 0.0382, "step": 133 }, { "epoch": 0.36941632565151844, "grad_norm": 0.1595146209001541, "learning_rate": 1.954942498694117e-05, "loss": 0.0378, "step": 134 }, { "epoch": 0.37217316390264915, "grad_norm": 0.1662372648715973, "learning_rate": 1.9540527836769047e-05, "loss": 0.036, "step": 135 }, { "epoch": 0.37493000215377986, "grad_norm": 0.17359688878059387, "learning_rate": 1.95315457656e-05, "loss": 0.0352, "step": 136 }, { "epoch": 0.37768684040491063, "grad_norm": 0.15664531290531158, "learning_rate": 1.9522478853384154e-05, "loss": 0.0372, "step": 137 }, { "epoch": 0.38044367865604134, "grad_norm": 0.14340853691101074, "learning_rate": 1.951332718082682e-05, "loss": 0.0378, "step": 138 }, { "epoch": 0.3832005169071721, "grad_norm": 0.16062387824058533, "learning_rate": 1.950409082938776e-05, "loss": 0.036, "step": 139 }, { "epoch": 0.3859573551583028, "grad_norm": 0.16279537975788116, "learning_rate": 1.949476988128047e-05, "loss": 0.0373, "step": 140 }, { "epoch": 0.38871419340943353, "grad_norm": 0.18597126007080078, "learning_rate": 1.9485364419471454e-05, "loss": 0.0367, "step": 141 }, { "epoch": 0.3914710316605643, "grad_norm": 0.6993559002876282, "learning_rate": 1.947587452767948e-05, "loss": 0.0453, "step": 142 }, { "epoch": 0.394227869911695, "grad_norm": 0.20166553556919098, "learning_rate": 1.946630029037482e-05, "loss": 0.0366, "step": 143 }, { "epoch": 0.3969847081628258, "grad_norm": 0.16907399892807007, "learning_rate": 1.9456641792778527e-05, "loss": 0.036, "step": 144 }, { "epoch": 0.3997415464139565, "grad_norm": 0.19966886937618256, "learning_rate": 1.9446899120861653e-05, "loss": 0.0348, "step": 145 }, { "epoch": 0.40249838466508725, "grad_norm": 0.25474515557289124, "learning_rate": 1.94370723613445e-05, "loss": 0.0388, "step": 146 }, { "epoch": 0.40525522291621796, "grad_norm": 0.2152235358953476, "learning_rate": 1.9427161601695833e-05, "loss": 0.0371, "step": 147 }, { "epoch": 0.40801206116734867, "grad_norm": 0.15147989988327026, "learning_rate": 1.941716693013211e-05, "loss": 0.0374, "step": 148 }, { "epoch": 0.41076889941847944, "grad_norm": 0.18094661831855774, "learning_rate": 1.94070884356167e-05, "loss": 0.036, "step": 149 }, { "epoch": 0.41352573766961015, "grad_norm": 0.20204827189445496, "learning_rate": 1.9396926207859085e-05, "loss": 0.0383, "step": 150 }, { "epoch": 0.4162825759207409, "grad_norm": 0.24904154241085052, "learning_rate": 1.938668033731406e-05, "loss": 0.0368, "step": 151 }, { "epoch": 0.4190394141718716, "grad_norm": 0.15912267565727234, "learning_rate": 1.9376350915180935e-05, "loss": 0.0352, "step": 152 }, { "epoch": 0.4217962524230024, "grad_norm": 0.1520577222108841, "learning_rate": 1.9365938033402715e-05, "loss": 0.036, "step": 153 }, { "epoch": 0.4245530906741331, "grad_norm": 0.1562613546848297, "learning_rate": 1.9355441784665295e-05, "loss": 0.0367, "step": 154 }, { "epoch": 0.4273099289252638, "grad_norm": 0.13876181840896606, "learning_rate": 1.9344862262396612e-05, "loss": 0.0354, "step": 155 }, { "epoch": 0.4300667671763946, "grad_norm": 0.15534985065460205, "learning_rate": 1.933419956076584e-05, "loss": 0.0358, "step": 156 }, { "epoch": 0.4328236054275253, "grad_norm": 0.15327094495296478, "learning_rate": 1.932345377468253e-05, "loss": 0.0341, "step": 157 }, { "epoch": 0.43558044367865606, "grad_norm": 0.1699693650007248, "learning_rate": 1.9312624999795784e-05, "loss": 0.0378, "step": 158 }, { "epoch": 0.43833728192978677, "grad_norm": 0.12956377863883972, "learning_rate": 1.9301713332493386e-05, "loss": 0.0351, "step": 159 }, { "epoch": 0.44109412018091754, "grad_norm": 0.14530932903289795, "learning_rate": 1.929071886990095e-05, "loss": 0.0357, "step": 160 }, { "epoch": 0.44385095843204825, "grad_norm": 0.15227288007736206, "learning_rate": 1.9279641709881067e-05, "loss": 0.0363, "step": 161 }, { "epoch": 0.44660779668317896, "grad_norm": 0.165790393948555, "learning_rate": 1.926848195103242e-05, "loss": 0.0354, "step": 162 }, { "epoch": 0.4493646349343097, "grad_norm": 0.16768325865268707, "learning_rate": 1.9257239692688907e-05, "loss": 0.0335, "step": 163 }, { "epoch": 0.45212147318544044, "grad_norm": 0.147464320063591, "learning_rate": 1.9245915034918763e-05, "loss": 0.0339, "step": 164 }, { "epoch": 0.4548783114365712, "grad_norm": 0.1301703006029129, "learning_rate": 1.923450807852367e-05, "loss": 0.0337, "step": 165 }, { "epoch": 0.4576351496877019, "grad_norm": 0.1339288204908371, "learning_rate": 1.922301892503785e-05, "loss": 0.0353, "step": 166 }, { "epoch": 0.4603919879388326, "grad_norm": 0.16490085422992706, "learning_rate": 1.9211447676727174e-05, "loss": 0.0346, "step": 167 }, { "epoch": 0.4631488261899634, "grad_norm": 0.24428266286849976, "learning_rate": 1.9199794436588244e-05, "loss": 0.0355, "step": 168 }, { "epoch": 0.4659056644410941, "grad_norm": 0.40584465861320496, "learning_rate": 1.9188059308347475e-05, "loss": 0.0397, "step": 169 }, { "epoch": 0.46866250269222487, "grad_norm": 0.17336200177669525, "learning_rate": 1.9176242396460184e-05, "loss": 0.036, "step": 170 }, { "epoch": 0.4714193409433556, "grad_norm": 0.22676712274551392, "learning_rate": 1.916434380610963e-05, "loss": 0.0354, "step": 171 }, { "epoch": 0.47417617919448635, "grad_norm": 0.19564731419086456, "learning_rate": 1.9152363643206126e-05, "loss": 0.0341, "step": 172 }, { "epoch": 0.47693301744561706, "grad_norm": 0.16920173168182373, "learning_rate": 1.9140302014386044e-05, "loss": 0.036, "step": 173 }, { "epoch": 0.47968985569674777, "grad_norm": 0.1923132985830307, "learning_rate": 1.912815902701091e-05, "loss": 0.0334, "step": 174 }, { "epoch": 0.48244669394787854, "grad_norm": 0.18520843982696533, "learning_rate": 1.911593478916641e-05, "loss": 0.0343, "step": 175 }, { "epoch": 0.48520353219900925, "grad_norm": 0.2041483074426651, "learning_rate": 1.9103629409661468e-05, "loss": 0.0361, "step": 176 }, { "epoch": 0.48796037045014, "grad_norm": 0.17951691150665283, "learning_rate": 1.909124299802724e-05, "loss": 0.0346, "step": 177 }, { "epoch": 0.4907172087012707, "grad_norm": 0.17455315589904785, "learning_rate": 1.9078775664516157e-05, "loss": 0.0328, "step": 178 }, { "epoch": 0.4934740469524015, "grad_norm": 0.149070605635643, "learning_rate": 1.906622752010095e-05, "loss": 0.0353, "step": 179 }, { "epoch": 0.4962308852035322, "grad_norm": 0.13988004624843597, "learning_rate": 1.9053598676473656e-05, "loss": 0.0335, "step": 180 }, { "epoch": 0.4989877234546629, "grad_norm": 0.12514030933380127, "learning_rate": 1.904088924604461e-05, "loss": 0.0341, "step": 181 }, { "epoch": 0.5017445617057936, "grad_norm": 0.14698189496994019, "learning_rate": 1.9028099341941457e-05, "loss": 0.0347, "step": 182 }, { "epoch": 0.5045013999569244, "grad_norm": 0.1326190084218979, "learning_rate": 1.9015229078008163e-05, "loss": 0.0344, "step": 183 }, { "epoch": 0.5072582382080552, "grad_norm": 0.1399255394935608, "learning_rate": 1.900227856880396e-05, "loss": 0.0338, "step": 184 }, { "epoch": 0.5100150764591859, "grad_norm": 0.17403383553028107, "learning_rate": 1.898924792960237e-05, "loss": 0.034, "step": 185 }, { "epoch": 0.5127719147103166, "grad_norm": 0.14294308423995972, "learning_rate": 1.8976137276390145e-05, "loss": 0.0335, "step": 186 }, { "epoch": 0.5155287529614473, "grad_norm": 0.13353058695793152, "learning_rate": 1.8962946725866246e-05, "loss": 0.0325, "step": 187 }, { "epoch": 0.5182855912125781, "grad_norm": 0.1507033258676529, "learning_rate": 1.8949676395440818e-05, "loss": 0.0354, "step": 188 }, { "epoch": 0.5210424294637088, "grad_norm": 0.14498627185821533, "learning_rate": 1.8936326403234125e-05, "loss": 0.0327, "step": 189 }, { "epoch": 0.5237992677148395, "grad_norm": 0.15840047597885132, "learning_rate": 1.892289686807551e-05, "loss": 0.0332, "step": 190 }, { "epoch": 0.5265561059659702, "grad_norm": 0.14784608781337738, "learning_rate": 1.8909387909502335e-05, "loss": 0.0324, "step": 191 }, { "epoch": 0.5293129442171011, "grad_norm": 0.13484050333499908, "learning_rate": 1.8895799647758912e-05, "loss": 0.0336, "step": 192 }, { "epoch": 0.5320697824682318, "grad_norm": 0.17485234141349792, "learning_rate": 1.888213220379544e-05, "loss": 0.0335, "step": 193 }, { "epoch": 0.5348266207193625, "grad_norm": 0.14618845283985138, "learning_rate": 1.8868385699266928e-05, "loss": 0.0328, "step": 194 }, { "epoch": 0.5375834589704932, "grad_norm": 0.12785577774047852, "learning_rate": 1.8854560256532098e-05, "loss": 0.0335, "step": 195 }, { "epoch": 0.5403402972216239, "grad_norm": 0.16961853206157684, "learning_rate": 1.8840655998652324e-05, "loss": 0.0358, "step": 196 }, { "epoch": 0.5430971354727547, "grad_norm": 0.14937855303287506, "learning_rate": 1.8826673049390508e-05, "loss": 0.0336, "step": 197 }, { "epoch": 0.5458539737238854, "grad_norm": 0.14861802756786346, "learning_rate": 1.881261153320999e-05, "loss": 0.0346, "step": 198 }, { "epoch": 0.5486108119750162, "grad_norm": 0.13903887569904327, "learning_rate": 1.8798471575273445e-05, "loss": 0.0319, "step": 199 }, { "epoch": 0.5513676502261469, "grad_norm": 0.5592267513275146, "learning_rate": 1.8784253301441767e-05, "loss": 0.0367, "step": 200 }, { "epoch": 0.5541244884772776, "grad_norm": 0.12278741598129272, "learning_rate": 1.8769956838272937e-05, "loss": 0.0326, "step": 201 }, { "epoch": 0.5568813267284084, "grad_norm": 0.16035325825214386, "learning_rate": 1.8755582313020912e-05, "loss": 0.035, "step": 202 }, { "epoch": 0.5596381649795391, "grad_norm": 0.15898433327674866, "learning_rate": 1.8741129853634483e-05, "loss": 0.0336, "step": 203 }, { "epoch": 0.5623950032306698, "grad_norm": 0.14194312691688538, "learning_rate": 1.8726599588756144e-05, "loss": 0.0327, "step": 204 }, { "epoch": 0.5651518414818005, "grad_norm": 0.14086908102035522, "learning_rate": 1.8711991647720936e-05, "loss": 0.034, "step": 205 }, { "epoch": 0.5679086797329312, "grad_norm": 0.13377539813518524, "learning_rate": 1.8697306160555303e-05, "loss": 0.0337, "step": 206 }, { "epoch": 0.5706655179840621, "grad_norm": 0.13268251717090607, "learning_rate": 1.868254325797594e-05, "loss": 0.0329, "step": 207 }, { "epoch": 0.5734223562351928, "grad_norm": 0.13881011307239532, "learning_rate": 1.8667703071388607e-05, "loss": 0.0339, "step": 208 }, { "epoch": 0.5761791944863235, "grad_norm": 0.15615610778331757, "learning_rate": 1.8652785732886988e-05, "loss": 0.0345, "step": 209 }, { "epoch": 0.5789360327374542, "grad_norm": 0.3299935460090637, "learning_rate": 1.8637791375251505e-05, "loss": 0.0353, "step": 210 }, { "epoch": 0.581692870988585, "grad_norm": 0.13316752016544342, "learning_rate": 1.862272013194812e-05, "loss": 0.0325, "step": 211 }, { "epoch": 0.5844497092397157, "grad_norm": 0.13292838633060455, "learning_rate": 1.8607572137127175e-05, "loss": 0.0342, "step": 212 }, { "epoch": 0.5872065474908464, "grad_norm": 0.1690252423286438, "learning_rate": 1.859234752562217e-05, "loss": 0.0333, "step": 213 }, { "epoch": 0.5899633857419772, "grad_norm": 0.13950853049755096, "learning_rate": 1.8577046432948586e-05, "loss": 0.032, "step": 214 }, { "epoch": 0.5927202239931079, "grad_norm": 0.12849925458431244, "learning_rate": 1.8561668995302668e-05, "loss": 0.0323, "step": 215 }, { "epoch": 0.5954770622442387, "grad_norm": 0.22346273064613342, "learning_rate": 1.8546215349560204e-05, "loss": 0.0346, "step": 216 }, { "epoch": 0.5982339004953694, "grad_norm": 0.12624599039554596, "learning_rate": 1.853068563327533e-05, "loss": 0.0328, "step": 217 }, { "epoch": 0.6009907387465001, "grad_norm": 0.14228811860084534, "learning_rate": 1.851507998467929e-05, "loss": 0.0334, "step": 218 }, { "epoch": 0.6037475769976308, "grad_norm": 0.1500948965549469, "learning_rate": 1.849939854267919e-05, "loss": 0.0346, "step": 219 }, { "epoch": 0.6065044152487615, "grad_norm": 0.17958402633666992, "learning_rate": 1.8483641446856798e-05, "loss": 0.0341, "step": 220 }, { "epoch": 0.6092612534998924, "grad_norm": 0.15875360369682312, "learning_rate": 1.8467808837467277e-05, "loss": 0.0354, "step": 221 }, { "epoch": 0.6120180917510231, "grad_norm": 0.13992802798748016, "learning_rate": 1.845190085543795e-05, "loss": 0.0346, "step": 222 }, { "epoch": 0.6147749300021538, "grad_norm": 0.14051182568073273, "learning_rate": 1.843591764236702e-05, "loss": 0.0325, "step": 223 }, { "epoch": 0.6175317682532845, "grad_norm": 0.13775014877319336, "learning_rate": 1.841985934052234e-05, "loss": 0.0323, "step": 224 }, { "epoch": 0.6202886065044152, "grad_norm": 0.1298038810491562, "learning_rate": 1.840372609284013e-05, "loss": 0.0322, "step": 225 }, { "epoch": 0.623045444755546, "grad_norm": 0.14838284254074097, "learning_rate": 1.8387518042923715e-05, "loss": 0.0334, "step": 226 }, { "epoch": 0.6258022830066767, "grad_norm": 0.1384505033493042, "learning_rate": 1.8371235335042236e-05, "loss": 0.0332, "step": 227 }, { "epoch": 0.6285591212578074, "grad_norm": 0.1435280740261078, "learning_rate": 1.8354878114129368e-05, "loss": 0.0323, "step": 228 }, { "epoch": 0.6313159595089382, "grad_norm": 0.19593602418899536, "learning_rate": 1.833844652578203e-05, "loss": 0.033, "step": 229 }, { "epoch": 0.634072797760069, "grad_norm": 0.11905160546302795, "learning_rate": 1.832194071625911e-05, "loss": 0.0329, "step": 230 }, { "epoch": 0.6368296360111997, "grad_norm": 0.13935434818267822, "learning_rate": 1.8305360832480118e-05, "loss": 0.0326, "step": 231 }, { "epoch": 0.6395864742623304, "grad_norm": 0.13467997312545776, "learning_rate": 1.8288707022023926e-05, "loss": 0.0323, "step": 232 }, { "epoch": 0.6423433125134611, "grad_norm": 0.13442496955394745, "learning_rate": 1.827197943312742e-05, "loss": 0.0326, "step": 233 }, { "epoch": 0.6451001507645918, "grad_norm": 0.13342751562595367, "learning_rate": 1.82551782146842e-05, "loss": 0.0326, "step": 234 }, { "epoch": 0.6478569890157226, "grad_norm": 0.11170811951160431, "learning_rate": 1.8238303516243253e-05, "loss": 0.0329, "step": 235 }, { "epoch": 0.6506138272668534, "grad_norm": 0.109471395611763, "learning_rate": 1.8221355488007606e-05, "loss": 0.0312, "step": 236 }, { "epoch": 0.6533706655179841, "grad_norm": 0.10828150063753128, "learning_rate": 1.8204334280833005e-05, "loss": 0.0317, "step": 237 }, { "epoch": 0.6561275037691148, "grad_norm": 0.1234995648264885, "learning_rate": 1.8187240046226576e-05, "loss": 0.0316, "step": 238 }, { "epoch": 0.6588843420202455, "grad_norm": 0.12001900374889374, "learning_rate": 1.817007293634545e-05, "loss": 0.0318, "step": 239 }, { "epoch": 0.6616411802713763, "grad_norm": 0.12810060381889343, "learning_rate": 1.8152833103995443e-05, "loss": 0.0315, "step": 240 }, { "epoch": 0.664398018522507, "grad_norm": 0.12769843637943268, "learning_rate": 1.8135520702629677e-05, "loss": 0.0324, "step": 241 }, { "epoch": 0.6671548567736377, "grad_norm": 0.12816517055034637, "learning_rate": 1.8118135886347207e-05, "loss": 0.0311, "step": 242 }, { "epoch": 0.6699116950247684, "grad_norm": 0.1479124128818512, "learning_rate": 1.8100678809891668e-05, "loss": 0.0322, "step": 243 }, { "epoch": 0.6726685332758991, "grad_norm": 0.1499953418970108, "learning_rate": 1.8083149628649887e-05, "loss": 0.0322, "step": 244 }, { "epoch": 0.67542537152703, "grad_norm": 0.16143152117729187, "learning_rate": 1.8065548498650495e-05, "loss": 0.0327, "step": 245 }, { "epoch": 0.6781822097781607, "grad_norm": 0.11848675459623337, "learning_rate": 1.8047875576562556e-05, "loss": 0.0323, "step": 246 }, { "epoch": 0.6809390480292914, "grad_norm": 0.14054562151432037, "learning_rate": 1.803013101969415e-05, "loss": 0.0311, "step": 247 }, { "epoch": 0.6836958862804221, "grad_norm": 0.14287616312503815, "learning_rate": 1.801231498599099e-05, "loss": 0.0321, "step": 248 }, { "epoch": 0.6864527245315529, "grad_norm": 0.1472490429878235, "learning_rate": 1.7994427634035016e-05, "loss": 0.0324, "step": 249 }, { "epoch": 0.6892095627826836, "grad_norm": 0.2156575620174408, "learning_rate": 1.7976469123042955e-05, "loss": 0.0331, "step": 250 }, { "epoch": 0.6919664010338143, "grad_norm": 0.15800388157367706, "learning_rate": 1.7958439612864954e-05, "loss": 0.0319, "step": 251 }, { "epoch": 0.6947232392849451, "grad_norm": 0.13451936841011047, "learning_rate": 1.7940339263983112e-05, "loss": 0.032, "step": 252 }, { "epoch": 0.6974800775360758, "grad_norm": 0.12370197474956512, "learning_rate": 1.7922168237510076e-05, "loss": 0.0328, "step": 253 }, { "epoch": 0.7002369157872066, "grad_norm": 0.1429441124200821, "learning_rate": 1.7903926695187595e-05, "loss": 0.0318, "step": 254 }, { "epoch": 0.7029937540383373, "grad_norm": 0.13566642999649048, "learning_rate": 1.7885614799385086e-05, "loss": 0.0322, "step": 255 }, { "epoch": 0.705750592289468, "grad_norm": 0.11789460480213165, "learning_rate": 1.78672327130982e-05, "loss": 0.0316, "step": 256 }, { "epoch": 0.7085074305405987, "grad_norm": 0.14814262092113495, "learning_rate": 1.7848780599947334e-05, "loss": 0.033, "step": 257 }, { "epoch": 0.7112642687917294, "grad_norm": 0.12498864531517029, "learning_rate": 1.7830258624176224e-05, "loss": 0.0321, "step": 258 }, { "epoch": 0.7140211070428603, "grad_norm": 0.12611018121242523, "learning_rate": 1.7811666950650445e-05, "loss": 0.0318, "step": 259 }, { "epoch": 0.716777945293991, "grad_norm": 0.1379752904176712, "learning_rate": 1.7793005744855967e-05, "loss": 0.0343, "step": 260 }, { "epoch": 0.7195347835451217, "grad_norm": 0.12495708465576172, "learning_rate": 1.777427517289766e-05, "loss": 0.0337, "step": 261 }, { "epoch": 0.7222916217962524, "grad_norm": 0.11509250104427338, "learning_rate": 1.775547540149784e-05, "loss": 0.0303, "step": 262 }, { "epoch": 0.7250484600473831, "grad_norm": 0.14822109043598175, "learning_rate": 1.7736606597994763e-05, "loss": 0.0329, "step": 263 }, { "epoch": 0.7278052982985139, "grad_norm": 0.1274510771036148, "learning_rate": 1.7717668930341152e-05, "loss": 0.0321, "step": 264 }, { "epoch": 0.7305621365496446, "grad_norm": 0.13247767090797424, "learning_rate": 1.769866256710269e-05, "loss": 0.0327, "step": 265 }, { "epoch": 0.7333189748007753, "grad_norm": 0.1502748280763626, "learning_rate": 1.767958767745653e-05, "loss": 0.0336, "step": 266 }, { "epoch": 0.7360758130519061, "grad_norm": 0.11968521028757095, "learning_rate": 1.766044443118978e-05, "loss": 0.032, "step": 267 }, { "epoch": 0.7388326513030369, "grad_norm": 0.12810976803302765, "learning_rate": 1.7641232998698e-05, "loss": 0.0317, "step": 268 }, { "epoch": 0.7415894895541676, "grad_norm": 0.13857212662696838, "learning_rate": 1.7621953550983677e-05, "loss": 0.03, "step": 269 }, { "epoch": 0.7443463278052983, "grad_norm": 0.12155446410179138, "learning_rate": 1.7602606259654704e-05, "loss": 0.0317, "step": 270 }, { "epoch": 0.747103166056429, "grad_norm": 0.13759441673755646, "learning_rate": 1.7583191296922866e-05, "loss": 0.0312, "step": 271 }, { "epoch": 0.7498600043075597, "grad_norm": 0.18140603601932526, "learning_rate": 1.7563708835602286e-05, "loss": 0.0316, "step": 272 }, { "epoch": 0.7526168425586905, "grad_norm": 0.16475705802440643, "learning_rate": 1.7544159049107902e-05, "loss": 0.0306, "step": 273 }, { "epoch": 0.7553736808098213, "grad_norm": 0.13975286483764648, "learning_rate": 1.7524542111453923e-05, "loss": 0.0335, "step": 274 }, { "epoch": 0.758130519060952, "grad_norm": 0.14528636634349823, "learning_rate": 1.7504858197252263e-05, "loss": 0.032, "step": 275 }, { "epoch": 0.7608873573120827, "grad_norm": 0.15437592566013336, "learning_rate": 1.7485107481711014e-05, "loss": 0.033, "step": 276 }, { "epoch": 0.7636441955632134, "grad_norm": 0.11566643416881561, "learning_rate": 1.746529014063286e-05, "loss": 0.0305, "step": 277 }, { "epoch": 0.7664010338143442, "grad_norm": 0.1528697907924652, "learning_rate": 1.7445406350413533e-05, "loss": 0.0301, "step": 278 }, { "epoch": 0.7691578720654749, "grad_norm": 0.13452287018299103, "learning_rate": 1.7425456288040236e-05, "loss": 0.0312, "step": 279 }, { "epoch": 0.7719147103166056, "grad_norm": 0.13913044333457947, "learning_rate": 1.740544013109005e-05, "loss": 0.0328, "step": 280 }, { "epoch": 0.7746715485677363, "grad_norm": 0.11884239315986633, "learning_rate": 1.738535805772838e-05, "loss": 0.0299, "step": 281 }, { "epoch": 0.7774283868188671, "grad_norm": 0.13907060027122498, "learning_rate": 1.736521024670737e-05, "loss": 0.0326, "step": 282 }, { "epoch": 0.7801852250699979, "grad_norm": 0.13899970054626465, "learning_rate": 1.7344996877364282e-05, "loss": 0.0329, "step": 283 }, { "epoch": 0.7829420633211286, "grad_norm": 0.13733629882335663, "learning_rate": 1.732471812961992e-05, "loss": 0.0317, "step": 284 }, { "epoch": 0.7856989015722593, "grad_norm": 0.12010809779167175, "learning_rate": 1.7304374183977032e-05, "loss": 0.0315, "step": 285 }, { "epoch": 0.78845573982339, "grad_norm": 0.1484297811985016, "learning_rate": 1.72839652215187e-05, "loss": 0.0321, "step": 286 }, { "epoch": 0.7912125780745208, "grad_norm": 0.12623994052410126, "learning_rate": 1.7263491423906716e-05, "loss": 0.0319, "step": 287 }, { "epoch": 0.7939694163256515, "grad_norm": 0.11906126886606216, "learning_rate": 1.7242952973379983e-05, "loss": 0.0305, "step": 288 }, { "epoch": 0.7967262545767823, "grad_norm": 0.1313122808933258, "learning_rate": 1.7222350052752883e-05, "loss": 0.0319, "step": 289 }, { "epoch": 0.799483092827913, "grad_norm": 0.1219288781285286, "learning_rate": 1.720168284541365e-05, "loss": 0.0297, "step": 290 }, { "epoch": 0.8022399310790437, "grad_norm": 0.16674523055553436, "learning_rate": 1.7180951535322742e-05, "loss": 0.0322, "step": 291 }, { "epoch": 0.8049967693301745, "grad_norm": 0.21712636947631836, "learning_rate": 1.7160156307011197e-05, "loss": 0.0318, "step": 292 }, { "epoch": 0.8077536075813052, "grad_norm": 0.1368701457977295, "learning_rate": 1.7139297345578992e-05, "loss": 0.0302, "step": 293 }, { "epoch": 0.8105104458324359, "grad_norm": 0.1643868386745453, "learning_rate": 1.7118374836693407e-05, "loss": 0.0328, "step": 294 }, { "epoch": 0.8132672840835666, "grad_norm": 0.11599249392747879, "learning_rate": 1.7097388966587355e-05, "loss": 0.0303, "step": 295 }, { "epoch": 0.8160241223346973, "grad_norm": 0.29641032218933105, "learning_rate": 1.7076339922057736e-05, "loss": 0.034, "step": 296 }, { "epoch": 0.8187809605858282, "grad_norm": 0.1458274871110916, "learning_rate": 1.705522789046377e-05, "loss": 0.0318, "step": 297 }, { "epoch": 0.8215377988369589, "grad_norm": 0.12504877150058746, "learning_rate": 1.7034053059725325e-05, "loss": 0.0321, "step": 298 }, { "epoch": 0.8242946370880896, "grad_norm": 0.16158223152160645, "learning_rate": 1.7012815618321256e-05, "loss": 0.0326, "step": 299 }, { "epoch": 0.8270514753392203, "grad_norm": 0.12104715406894684, "learning_rate": 1.6991515755287715e-05, "loss": 0.031, "step": 300 }, { "epoch": 0.8298083135903511, "grad_norm": 0.1327955275774002, "learning_rate": 1.697015366021648e-05, "loss": 0.033, "step": 301 }, { "epoch": 0.8325651518414818, "grad_norm": 0.12627461552619934, "learning_rate": 1.694872952325326e-05, "loss": 0.0305, "step": 302 }, { "epoch": 0.8353219900926125, "grad_norm": 0.13003765046596527, "learning_rate": 1.6927243535095995e-05, "loss": 0.0313, "step": 303 }, { "epoch": 0.8380788283437433, "grad_norm": 0.15389683842658997, "learning_rate": 1.690569588699318e-05, "loss": 0.0314, "step": 304 }, { "epoch": 0.840835666594874, "grad_norm": 0.1295783370733261, "learning_rate": 1.6884086770742138e-05, "loss": 0.0311, "step": 305 }, { "epoch": 0.8435925048460048, "grad_norm": 0.14465399086475372, "learning_rate": 1.686241637868734e-05, "loss": 0.0305, "step": 306 }, { "epoch": 0.8463493430971355, "grad_norm": 0.12934014201164246, "learning_rate": 1.6840684903718658e-05, "loss": 0.0315, "step": 307 }, { "epoch": 0.8491061813482662, "grad_norm": 0.13668671250343323, "learning_rate": 1.681889253926969e-05, "loss": 0.0308, "step": 308 }, { "epoch": 0.8518630195993969, "grad_norm": 0.1645808219909668, "learning_rate": 1.6797039479315994e-05, "loss": 0.0328, "step": 309 }, { "epoch": 0.8546198578505276, "grad_norm": 0.1372358649969101, "learning_rate": 1.67751259183734e-05, "loss": 0.0309, "step": 310 }, { "epoch": 0.8573766961016585, "grad_norm": 0.151007741689682, "learning_rate": 1.675315205149626e-05, "loss": 0.033, "step": 311 }, { "epoch": 0.8601335343527892, "grad_norm": 0.18326100707054138, "learning_rate": 1.67311180742757e-05, "loss": 0.0314, "step": 312 }, { "epoch": 0.8628903726039199, "grad_norm": 0.12580075860023499, "learning_rate": 1.6709024182837917e-05, "loss": 0.0314, "step": 313 }, { "epoch": 0.8656472108550506, "grad_norm": 0.14372077584266663, "learning_rate": 1.6686870573842388e-05, "loss": 0.0298, "step": 314 }, { "epoch": 0.8684040491061813, "grad_norm": 0.1365354061126709, "learning_rate": 1.6664657444480145e-05, "loss": 0.0304, "step": 315 }, { "epoch": 0.8711608873573121, "grad_norm": 0.17227758467197418, "learning_rate": 1.6642384992472026e-05, "loss": 0.0311, "step": 316 }, { "epoch": 0.8739177256084428, "grad_norm": 0.1729755699634552, "learning_rate": 1.6620053416066892e-05, "loss": 0.0315, "step": 317 }, { "epoch": 0.8766745638595735, "grad_norm": 0.1310206949710846, "learning_rate": 1.6597662914039885e-05, "loss": 0.0313, "step": 318 }, { "epoch": 0.8794314021107043, "grad_norm": 0.15664447844028473, "learning_rate": 1.657521368569064e-05, "loss": 0.0304, "step": 319 }, { "epoch": 0.8821882403618351, "grad_norm": 0.16371680796146393, "learning_rate": 1.6552705930841523e-05, "loss": 0.0306, "step": 320 }, { "epoch": 0.8849450786129658, "grad_norm": 0.132271409034729, "learning_rate": 1.653013984983585e-05, "loss": 0.0315, "step": 321 }, { "epoch": 0.8877019168640965, "grad_norm": 0.1789659708738327, "learning_rate": 1.6507515643536113e-05, "loss": 0.0304, "step": 322 }, { "epoch": 0.8904587551152272, "grad_norm": 0.12446186691522598, "learning_rate": 1.6484833513322155e-05, "loss": 0.0297, "step": 323 }, { "epoch": 0.8932155933663579, "grad_norm": 0.13840149343013763, "learning_rate": 1.6462093661089432e-05, "loss": 0.0302, "step": 324 }, { "epoch": 0.8959724316174887, "grad_norm": 0.12458626925945282, "learning_rate": 1.643929628924717e-05, "loss": 0.0321, "step": 325 }, { "epoch": 0.8987292698686195, "grad_norm": 0.13855692744255066, "learning_rate": 1.6416441600716593e-05, "loss": 0.0305, "step": 326 }, { "epoch": 0.9014861081197502, "grad_norm": 0.1461232304573059, "learning_rate": 1.6393529798929103e-05, "loss": 0.03, "step": 327 }, { "epoch": 0.9042429463708809, "grad_norm": 0.11444980651140213, "learning_rate": 1.637056108782446e-05, "loss": 0.0292, "step": 328 }, { "epoch": 0.9069997846220116, "grad_norm": 0.1289425641298294, "learning_rate": 1.6347535671848998e-05, "loss": 0.0305, "step": 329 }, { "epoch": 0.9097566228731424, "grad_norm": 0.12594164907932281, "learning_rate": 1.6324453755953772e-05, "loss": 0.0288, "step": 330 }, { "epoch": 0.9125134611242731, "grad_norm": 0.41084402799606323, "learning_rate": 1.6301315545592753e-05, "loss": 0.0299, "step": 331 }, { "epoch": 0.9152702993754038, "grad_norm": 0.13620588183403015, "learning_rate": 1.627812124672099e-05, "loss": 0.0302, "step": 332 }, { "epoch": 0.9180271376265345, "grad_norm": 0.10916028916835785, "learning_rate": 1.6254871065792776e-05, "loss": 0.031, "step": 333 }, { "epoch": 0.9207839758776653, "grad_norm": 0.1257525086402893, "learning_rate": 1.623156520975983e-05, "loss": 0.0313, "step": 334 }, { "epoch": 0.9235408141287961, "grad_norm": 0.10521648079156876, "learning_rate": 1.620820388606942e-05, "loss": 0.0296, "step": 335 }, { "epoch": 0.9262976523799268, "grad_norm": 0.11178401112556458, "learning_rate": 1.618478730266255e-05, "loss": 0.0307, "step": 336 }, { "epoch": 0.9290544906310575, "grad_norm": 0.11989890038967133, "learning_rate": 1.6161315667972095e-05, "loss": 0.0298, "step": 337 }, { "epoch": 0.9318113288821882, "grad_norm": 0.12103229016065598, "learning_rate": 1.6137789190920938e-05, "loss": 0.0302, "step": 338 }, { "epoch": 0.934568167133319, "grad_norm": 0.10210815072059631, "learning_rate": 1.6114208080920125e-05, "loss": 0.029, "step": 339 }, { "epoch": 0.9373250053844497, "grad_norm": 0.12528973817825317, "learning_rate": 1.6090572547866983e-05, "loss": 0.0303, "step": 340 }, { "epoch": 0.9400818436355804, "grad_norm": 0.12944018840789795, "learning_rate": 1.606688280214328e-05, "loss": 0.031, "step": 341 }, { "epoch": 0.9428386818867112, "grad_norm": 0.1170840710401535, "learning_rate": 1.6043139054613326e-05, "loss": 0.0304, "step": 342 }, { "epoch": 0.9455955201378419, "grad_norm": 0.1317935287952423, "learning_rate": 1.60193415166221e-05, "loss": 0.0309, "step": 343 }, { "epoch": 0.9483523583889727, "grad_norm": 0.10986040532588959, "learning_rate": 1.599549039999338e-05, "loss": 0.0306, "step": 344 }, { "epoch": 0.9511091966401034, "grad_norm": 0.11340153217315674, "learning_rate": 1.5971585917027864e-05, "loss": 0.0293, "step": 345 }, { "epoch": 0.9538660348912341, "grad_norm": 0.1250225007534027, "learning_rate": 1.594762828050124e-05, "loss": 0.029, "step": 346 }, { "epoch": 0.9566228731423648, "grad_norm": 0.1400870531797409, "learning_rate": 1.5923617703662346e-05, "loss": 0.0313, "step": 347 }, { "epoch": 0.9593797113934955, "grad_norm": 0.10701585561037064, "learning_rate": 1.5899554400231233e-05, "loss": 0.0316, "step": 348 }, { "epoch": 0.9621365496446264, "grad_norm": 0.1186312660574913, "learning_rate": 1.587543858439727e-05, "loss": 0.0307, "step": 349 }, { "epoch": 0.9648933878957571, "grad_norm": 0.10630480200052261, "learning_rate": 1.585127047081727e-05, "loss": 0.0297, "step": 350 }, { "epoch": 0.9676502261468878, "grad_norm": 0.11205119639635086, "learning_rate": 1.5827050274613512e-05, "loss": 0.0297, "step": 351 }, { "epoch": 0.9704070643980185, "grad_norm": 0.11059113591909409, "learning_rate": 1.580277821137191e-05, "loss": 0.0296, "step": 352 }, { "epoch": 0.9731639026491492, "grad_norm": 0.13033725321292877, "learning_rate": 1.577845449714001e-05, "loss": 0.0303, "step": 353 }, { "epoch": 0.97592074090028, "grad_norm": 0.10061930865049362, "learning_rate": 1.5754079348425137e-05, "loss": 0.0296, "step": 354 }, { "epoch": 0.9786775791514107, "grad_norm": 0.11028820276260376, "learning_rate": 1.5729652982192428e-05, "loss": 0.0284, "step": 355 }, { "epoch": 0.9814344174025414, "grad_norm": 0.11424390226602554, "learning_rate": 1.5705175615862906e-05, "loss": 0.0293, "step": 356 }, { "epoch": 0.9841912556536722, "grad_norm": 0.14579032361507416, "learning_rate": 1.568064746731156e-05, "loss": 0.0299, "step": 357 }, { "epoch": 0.986948093904803, "grad_norm": 0.11156564950942993, "learning_rate": 1.5656068754865388e-05, "loss": 0.0298, "step": 358 }, { "epoch": 0.9897049321559337, "grad_norm": 0.10791637748479843, "learning_rate": 1.5631439697301464e-05, "loss": 0.03, "step": 359 }, { "epoch": 0.9924617704070644, "grad_norm": 0.11196880787611008, "learning_rate": 1.560676051384499e-05, "loss": 0.0289, "step": 360 }, { "epoch": 0.9952186086581951, "grad_norm": 0.11056679487228394, "learning_rate": 1.558203142416734e-05, "loss": 0.0285, "step": 361 }, { "epoch": 0.9979754469093258, "grad_norm": 0.1029677763581276, "learning_rate": 1.5557252648384103e-05, "loss": 0.0303, "step": 362 }, { "epoch": 1.0007322851604565, "grad_norm": 0.12087027728557587, "learning_rate": 1.553242440705314e-05, "loss": 0.0297, "step": 363 }, { "epoch": 1.0034891234115872, "grad_norm": 0.11372894048690796, "learning_rate": 1.5507546921172595e-05, "loss": 0.0251, "step": 364 }, { "epoch": 1.0062459616627182, "grad_norm": 0.11590228229761124, "learning_rate": 1.548262041217895e-05, "loss": 0.0244, "step": 365 }, { "epoch": 1.009002799913849, "grad_norm": 0.129511296749115, "learning_rate": 1.5457645101945046e-05, "loss": 0.0247, "step": 366 }, { "epoch": 1.0117596381649796, "grad_norm": 0.12303431332111359, "learning_rate": 1.5432621212778105e-05, "loss": 0.0247, "step": 367 }, { "epoch": 1.0145164764161103, "grad_norm": 0.1147053986787796, "learning_rate": 1.5407548967417755e-05, "loss": 0.0241, "step": 368 }, { "epoch": 1.017273314667241, "grad_norm": 0.117189422249794, "learning_rate": 1.538242858903404e-05, "loss": 0.0244, "step": 369 }, { "epoch": 1.0200301529183717, "grad_norm": 0.1118486151099205, "learning_rate": 1.5357260301225448e-05, "loss": 0.0249, "step": 370 }, { "epoch": 1.0227869911695024, "grad_norm": 0.4378014802932739, "learning_rate": 1.5332044328016916e-05, "loss": 0.0262, "step": 371 }, { "epoch": 1.0255438294206332, "grad_norm": 0.12007389217615128, "learning_rate": 1.530678089385782e-05, "loss": 0.0256, "step": 372 }, { "epoch": 1.0283006676717639, "grad_norm": 0.13402129709720612, "learning_rate": 1.5281470223619995e-05, "loss": 0.0254, "step": 373 }, { "epoch": 1.0310575059228946, "grad_norm": 0.1433870643377304, "learning_rate": 1.525611254259574e-05, "loss": 0.0251, "step": 374 }, { "epoch": 1.0338143441740255, "grad_norm": 0.116049624979496, "learning_rate": 1.5230708076495777e-05, "loss": 0.0246, "step": 375 }, { "epoch": 1.0365711824251562, "grad_norm": 0.10888892412185669, "learning_rate": 1.5205257051447291e-05, "loss": 0.0248, "step": 376 }, { "epoch": 1.039328020676287, "grad_norm": 0.1329452395439148, "learning_rate": 1.5179759693991869e-05, "loss": 0.0248, "step": 377 }, { "epoch": 1.0420848589274176, "grad_norm": 0.12392039597034454, "learning_rate": 1.5154216231083522e-05, "loss": 0.0255, "step": 378 }, { "epoch": 1.0448416971785484, "grad_norm": 0.11657357215881348, "learning_rate": 1.5128626890086647e-05, "loss": 0.0247, "step": 379 }, { "epoch": 1.047598535429679, "grad_norm": 0.11587107181549072, "learning_rate": 1.5102991898774e-05, "loss": 0.0256, "step": 380 }, { "epoch": 1.0503553736808098, "grad_norm": 0.5603076219558716, "learning_rate": 1.507731148532468e-05, "loss": 0.0252, "step": 381 }, { "epoch": 1.0531122119319405, "grad_norm": 0.11775552481412888, "learning_rate": 1.505158587832208e-05, "loss": 0.0246, "step": 382 }, { "epoch": 1.0558690501830712, "grad_norm": 0.10402605682611465, "learning_rate": 1.5025815306751888e-05, "loss": 0.0238, "step": 383 }, { "epoch": 1.058625888434202, "grad_norm": 0.1355026811361313, "learning_rate": 1.5000000000000002e-05, "loss": 0.0262, "step": 384 }, { "epoch": 1.0613827266853328, "grad_norm": 0.11940840631723404, "learning_rate": 1.497414018785052e-05, "loss": 0.0246, "step": 385 }, { "epoch": 1.0641395649364636, "grad_norm": 0.12173695117235184, "learning_rate": 1.4948236100483688e-05, "loss": 0.025, "step": 386 }, { "epoch": 1.0668964031875943, "grad_norm": 0.11082287132740021, "learning_rate": 1.492228796847385e-05, "loss": 0.0258, "step": 387 }, { "epoch": 1.069653241438725, "grad_norm": 0.10860753059387207, "learning_rate": 1.4896296022787386e-05, "loss": 0.0244, "step": 388 }, { "epoch": 1.0724100796898557, "grad_norm": 0.10953570157289505, "learning_rate": 1.4870260494780679e-05, "loss": 0.0246, "step": 389 }, { "epoch": 1.0751669179409864, "grad_norm": 0.10230018198490143, "learning_rate": 1.4844181616198028e-05, "loss": 0.0249, "step": 390 }, { "epoch": 1.077923756192117, "grad_norm": 0.10772014409303665, "learning_rate": 1.4818059619169606e-05, "loss": 0.0247, "step": 391 }, { "epoch": 1.0806805944432478, "grad_norm": 0.11978469043970108, "learning_rate": 1.479189473620939e-05, "loss": 0.025, "step": 392 }, { "epoch": 1.0834374326943785, "grad_norm": 0.10893766582012177, "learning_rate": 1.4765687200213079e-05, "loss": 0.0239, "step": 393 }, { "epoch": 1.0861942709455095, "grad_norm": 0.1162242516875267, "learning_rate": 1.4739437244456039e-05, "loss": 0.024, "step": 394 }, { "epoch": 1.0889511091966402, "grad_norm": 0.11603809893131256, "learning_rate": 1.4713145102591209e-05, "loss": 0.0255, "step": 395 }, { "epoch": 1.0917079474477709, "grad_norm": 0.11185269802808762, "learning_rate": 1.4686811008647037e-05, "loss": 0.0244, "step": 396 }, { "epoch": 1.0944647856989016, "grad_norm": 0.11362854391336441, "learning_rate": 1.4660435197025391e-05, "loss": 0.0248, "step": 397 }, { "epoch": 1.0972216239500323, "grad_norm": 0.12096700072288513, "learning_rate": 1.463401790249946e-05, "loss": 0.0247, "step": 398 }, { "epoch": 1.099978462201163, "grad_norm": 0.12074792385101318, "learning_rate": 1.4607559360211688e-05, "loss": 0.025, "step": 399 }, { "epoch": 1.1027353004522937, "grad_norm": 0.13158197700977325, "learning_rate": 1.4581059805671662e-05, "loss": 0.026, "step": 400 }, { "epoch": 1.1054921387034244, "grad_norm": 0.1625628024339676, "learning_rate": 1.4554519474754025e-05, "loss": 0.0249, "step": 401 }, { "epoch": 1.1082489769545552, "grad_norm": 0.11231596022844315, "learning_rate": 1.4527938603696376e-05, "loss": 0.0246, "step": 402 }, { "epoch": 1.111005815205686, "grad_norm": 0.10914017260074615, "learning_rate": 1.4501317429097155e-05, "loss": 0.0247, "step": 403 }, { "epoch": 1.1137626534568168, "grad_norm": 0.11855874955654144, "learning_rate": 1.4474656187913558e-05, "loss": 0.0255, "step": 404 }, { "epoch": 1.1165194917079475, "grad_norm": 0.10382278263568878, "learning_rate": 1.4447955117459414e-05, "loss": 0.0235, "step": 405 }, { "epoch": 1.1192763299590782, "grad_norm": 0.11612187325954437, "learning_rate": 1.4421214455403071e-05, "loss": 0.0253, "step": 406 }, { "epoch": 1.122033168210209, "grad_norm": 0.11434487253427505, "learning_rate": 1.4394434439765295e-05, "loss": 0.0256, "step": 407 }, { "epoch": 1.1247900064613396, "grad_norm": 0.14260295033454895, "learning_rate": 1.436761530891713e-05, "loss": 0.0246, "step": 408 }, { "epoch": 1.1275468447124704, "grad_norm": 0.11852490156888962, "learning_rate": 1.4340757301577787e-05, "loss": 0.0242, "step": 409 }, { "epoch": 1.130303682963601, "grad_norm": 0.19492119550704956, "learning_rate": 1.4313860656812537e-05, "loss": 0.0254, "step": 410 }, { "epoch": 1.1330605212147318, "grad_norm": 0.14018341898918152, "learning_rate": 1.4286925614030542e-05, "loss": 0.0265, "step": 411 }, { "epoch": 1.1358173594658627, "grad_norm": 0.11521740257740021, "learning_rate": 1.425995241298277e-05, "loss": 0.0247, "step": 412 }, { "epoch": 1.1385741977169934, "grad_norm": 0.10554037988185883, "learning_rate": 1.423294129375982e-05, "loss": 0.0251, "step": 413 }, { "epoch": 1.1413310359681241, "grad_norm": 0.12840139865875244, "learning_rate": 1.4205892496789816e-05, "loss": 0.0248, "step": 414 }, { "epoch": 1.1440878742192548, "grad_norm": 0.11721502244472504, "learning_rate": 1.4178806262836252e-05, "loss": 0.0259, "step": 415 }, { "epoch": 1.1468447124703856, "grad_norm": 0.14833590388298035, "learning_rate": 1.4151682832995846e-05, "loss": 0.0238, "step": 416 }, { "epoch": 1.1496015507215163, "grad_norm": 0.10142626613378525, "learning_rate": 1.4124522448696407e-05, "loss": 0.0243, "step": 417 }, { "epoch": 1.152358388972647, "grad_norm": 0.13285695016384125, "learning_rate": 1.4097325351694674e-05, "loss": 0.0248, "step": 418 }, { "epoch": 1.1551152272237777, "grad_norm": 0.10552552342414856, "learning_rate": 1.407009178407417e-05, "loss": 0.0245, "step": 419 }, { "epoch": 1.1578720654749084, "grad_norm": 0.14879612624645233, "learning_rate": 1.404282198824305e-05, "loss": 0.0242, "step": 420 }, { "epoch": 1.160628903726039, "grad_norm": 0.13807976245880127, "learning_rate": 1.4015516206931932e-05, "loss": 0.0241, "step": 421 }, { "epoch": 1.1633857419771698, "grad_norm": 0.12726476788520813, "learning_rate": 1.3988174683191744e-05, "loss": 0.0244, "step": 422 }, { "epoch": 1.1661425802283008, "grad_norm": 0.11876961588859558, "learning_rate": 1.396079766039157e-05, "loss": 0.0241, "step": 423 }, { "epoch": 1.1688994184794315, "grad_norm": 0.13200323283672333, "learning_rate": 1.393338538221646e-05, "loss": 0.0254, "step": 424 }, { "epoch": 1.1716562567305622, "grad_norm": 0.1211104691028595, "learning_rate": 1.3905938092665283e-05, "loss": 0.0253, "step": 425 }, { "epoch": 1.1744130949816929, "grad_norm": 0.13063450157642365, "learning_rate": 1.387845603604855e-05, "loss": 0.0251, "step": 426 }, { "epoch": 1.1771699332328236, "grad_norm": 0.12195156514644623, "learning_rate": 1.385093945698623e-05, "loss": 0.0243, "step": 427 }, { "epoch": 1.1799267714839543, "grad_norm": 0.15483883023262024, "learning_rate": 1.382338860040558e-05, "loss": 0.0245, "step": 428 }, { "epoch": 1.182683609735085, "grad_norm": 0.11185739189386368, "learning_rate": 1.3795803711538966e-05, "loss": 0.0248, "step": 429 }, { "epoch": 1.1854404479862157, "grad_norm": 0.11978735029697418, "learning_rate": 1.3768185035921677e-05, "loss": 0.0242, "step": 430 }, { "epoch": 1.1881972862373464, "grad_norm": 0.11775299161672592, "learning_rate": 1.374053281938974e-05, "loss": 0.0245, "step": 431 }, { "epoch": 1.1909541244884774, "grad_norm": 0.10916128754615784, "learning_rate": 1.3712847308077737e-05, "loss": 0.024, "step": 432 }, { "epoch": 1.193710962739608, "grad_norm": 0.12981654703617096, "learning_rate": 1.3685128748416603e-05, "loss": 0.0251, "step": 433 }, { "epoch": 1.1964678009907388, "grad_norm": 0.1111079528927803, "learning_rate": 1.3657377387131443e-05, "loss": 0.0242, "step": 434 }, { "epoch": 1.1992246392418695, "grad_norm": 0.2875465154647827, "learning_rate": 1.3629593471239328e-05, "loss": 0.0246, "step": 435 }, { "epoch": 1.2019814774930002, "grad_norm": 0.10963843017816544, "learning_rate": 1.3601777248047105e-05, "loss": 0.0245, "step": 436 }, { "epoch": 1.204738315744131, "grad_norm": 0.15255135297775269, "learning_rate": 1.3573928965149188e-05, "loss": 0.0254, "step": 437 }, { "epoch": 1.2074951539952616, "grad_norm": 0.12013466656208038, "learning_rate": 1.3546048870425356e-05, "loss": 0.0252, "step": 438 }, { "epoch": 1.2102519922463923, "grad_norm": 0.11045455187559128, "learning_rate": 1.3518137212038554e-05, "loss": 0.0233, "step": 439 }, { "epoch": 1.213008830497523, "grad_norm": 0.11393214017152786, "learning_rate": 1.3490194238432665e-05, "loss": 0.0253, "step": 440 }, { "epoch": 1.215765668748654, "grad_norm": 0.39521655440330505, "learning_rate": 1.346222019833033e-05, "loss": 0.0243, "step": 441 }, { "epoch": 1.2185225069997847, "grad_norm": 0.11597371101379395, "learning_rate": 1.3434215340730692e-05, "loss": 0.0247, "step": 442 }, { "epoch": 1.2212793452509154, "grad_norm": 0.11905563622713089, "learning_rate": 1.340617991490722e-05, "loss": 0.0246, "step": 443 }, { "epoch": 1.2240361835020461, "grad_norm": 0.11847221851348877, "learning_rate": 1.3378114170405473e-05, "loss": 0.0244, "step": 444 }, { "epoch": 1.2267930217531768, "grad_norm": 0.11068813502788544, "learning_rate": 1.335001835704087e-05, "loss": 0.0252, "step": 445 }, { "epoch": 1.2295498600043075, "grad_norm": 0.1128387600183487, "learning_rate": 1.3321892724896483e-05, "loss": 0.025, "step": 446 }, { "epoch": 1.2323066982554383, "grad_norm": 0.10336292535066605, "learning_rate": 1.3293737524320798e-05, "loss": 0.0245, "step": 447 }, { "epoch": 1.235063536506569, "grad_norm": 0.11005783826112747, "learning_rate": 1.3265553005925494e-05, "loss": 0.0253, "step": 448 }, { "epoch": 1.2378203747576997, "grad_norm": 0.125181645154953, "learning_rate": 1.3237339420583213e-05, "loss": 0.0248, "step": 449 }, { "epoch": 1.2405772130088306, "grad_norm": 0.1266736388206482, "learning_rate": 1.3209097019425317e-05, "loss": 0.0255, "step": 450 }, { "epoch": 1.2433340512599613, "grad_norm": 0.13304491341114044, "learning_rate": 1.3180826053839668e-05, "loss": 0.0249, "step": 451 }, { "epoch": 1.246090889511092, "grad_norm": 0.09845131635665894, "learning_rate": 1.315252677546838e-05, "loss": 0.0242, "step": 452 }, { "epoch": 1.2488477277622227, "grad_norm": 0.11469008028507233, "learning_rate": 1.3124199436205575e-05, "loss": 0.0252, "step": 453 }, { "epoch": 1.2516045660133535, "grad_norm": 0.11134012043476105, "learning_rate": 1.309584428819516e-05, "loss": 0.0235, "step": 454 }, { "epoch": 1.2543614042644842, "grad_norm": 0.10330889374017715, "learning_rate": 1.3067461583828553e-05, "loss": 0.0251, "step": 455 }, { "epoch": 1.2571182425156149, "grad_norm": 0.10626427829265594, "learning_rate": 1.303905157574247e-05, "loss": 0.0232, "step": 456 }, { "epoch": 1.2598750807667456, "grad_norm": 0.10066665709018707, "learning_rate": 1.3010614516816652e-05, "loss": 0.0244, "step": 457 }, { "epoch": 1.2626319190178763, "grad_norm": 0.0952746570110321, "learning_rate": 1.2982150660171613e-05, "loss": 0.0231, "step": 458 }, { "epoch": 1.2653887572690072, "grad_norm": 0.11685788631439209, "learning_rate": 1.2953660259166413e-05, "loss": 0.024, "step": 459 }, { "epoch": 1.2681455955201377, "grad_norm": 0.10656964778900146, "learning_rate": 1.2925143567396374e-05, "loss": 0.0253, "step": 460 }, { "epoch": 1.2709024337712687, "grad_norm": 0.12450357526540756, "learning_rate": 1.2896600838690838e-05, "loss": 0.0241, "step": 461 }, { "epoch": 1.2736592720223994, "grad_norm": 0.10607194900512695, "learning_rate": 1.2868032327110904e-05, "loss": 0.0238, "step": 462 }, { "epoch": 1.27641611027353, "grad_norm": 0.12448973953723907, "learning_rate": 1.2839438286947163e-05, "loss": 0.0245, "step": 463 }, { "epoch": 1.2791729485246608, "grad_norm": 0.11434370279312134, "learning_rate": 1.2810818972717438e-05, "loss": 0.0236, "step": 464 }, { "epoch": 1.2819297867757915, "grad_norm": 0.11489138007164001, "learning_rate": 1.2782174639164528e-05, "loss": 0.0241, "step": 465 }, { "epoch": 1.2846866250269222, "grad_norm": 0.11225125193595886, "learning_rate": 1.2753505541253917e-05, "loss": 0.0238, "step": 466 }, { "epoch": 1.287443463278053, "grad_norm": 0.11676695942878723, "learning_rate": 1.272481193417153e-05, "loss": 0.0251, "step": 467 }, { "epoch": 1.2902003015291836, "grad_norm": 0.10919535160064697, "learning_rate": 1.269609407332144e-05, "loss": 0.0248, "step": 468 }, { "epoch": 1.2929571397803143, "grad_norm": 0.11963889002799988, "learning_rate": 1.2667352214323614e-05, "loss": 0.0253, "step": 469 }, { "epoch": 1.2957139780314453, "grad_norm": 0.11035304516553879, "learning_rate": 1.2638586613011625e-05, "loss": 0.0241, "step": 470 }, { "epoch": 1.298470816282576, "grad_norm": 0.1180749386548996, "learning_rate": 1.2609797525430374e-05, "loss": 0.0257, "step": 471 }, { "epoch": 1.3012276545337067, "grad_norm": 0.11271151155233383, "learning_rate": 1.258098520783382e-05, "loss": 0.024, "step": 472 }, { "epoch": 1.3039844927848374, "grad_norm": 0.10727877169847488, "learning_rate": 1.2552149916682695e-05, "loss": 0.0244, "step": 473 }, { "epoch": 1.3067413310359681, "grad_norm": 0.10205347836017609, "learning_rate": 1.2523291908642219e-05, "loss": 0.0235, "step": 474 }, { "epoch": 1.3094981692870988, "grad_norm": 0.10410701483488083, "learning_rate": 1.2494411440579814e-05, "loss": 0.0245, "step": 475 }, { "epoch": 1.3122550075382295, "grad_norm": 0.10330906510353088, "learning_rate": 1.2465508769562824e-05, "loss": 0.0246, "step": 476 }, { "epoch": 1.3150118457893603, "grad_norm": 0.10153473168611526, "learning_rate": 1.243658415285622e-05, "loss": 0.024, "step": 477 }, { "epoch": 1.317768684040491, "grad_norm": 0.13269540667533875, "learning_rate": 1.240763784792032e-05, "loss": 0.0265, "step": 478 }, { "epoch": 1.320525522291622, "grad_norm": 0.1011882796883583, "learning_rate": 1.2378670112408482e-05, "loss": 0.0244, "step": 479 }, { "epoch": 1.3232823605427526, "grad_norm": 0.10707399994134903, "learning_rate": 1.2349681204164823e-05, "loss": 0.0239, "step": 480 }, { "epoch": 1.3260391987938833, "grad_norm": 0.1126994714140892, "learning_rate": 1.2320671381221923e-05, "loss": 0.0256, "step": 481 }, { "epoch": 1.328796037045014, "grad_norm": 0.10313718020915985, "learning_rate": 1.229164090179852e-05, "loss": 0.0243, "step": 482 }, { "epoch": 1.3315528752961447, "grad_norm": 0.1096411719918251, "learning_rate": 1.2262590024297226e-05, "loss": 0.0242, "step": 483 }, { "epoch": 1.3343097135472755, "grad_norm": 0.11218464374542236, "learning_rate": 1.2233519007302201e-05, "loss": 0.0243, "step": 484 }, { "epoch": 1.3370665517984062, "grad_norm": 0.09545588493347168, "learning_rate": 1.2204428109576888e-05, "loss": 0.0235, "step": 485 }, { "epoch": 1.3398233900495369, "grad_norm": 0.10384158790111542, "learning_rate": 1.2175317590061676e-05, "loss": 0.0239, "step": 486 }, { "epoch": 1.3425802283006676, "grad_norm": 0.10288365185260773, "learning_rate": 1.2146187707871617e-05, "loss": 0.0246, "step": 487 }, { "epoch": 1.3453370665517985, "grad_norm": 0.12427128106355667, "learning_rate": 1.211703872229411e-05, "loss": 0.0247, "step": 488 }, { "epoch": 1.348093904802929, "grad_norm": 0.11715073883533478, "learning_rate": 1.2087870892786588e-05, "loss": 0.0245, "step": 489 }, { "epoch": 1.35085074305406, "grad_norm": 0.10031214356422424, "learning_rate": 1.2058684478974226e-05, "loss": 0.0237, "step": 490 }, { "epoch": 1.3536075813051907, "grad_norm": 0.13713191449642181, "learning_rate": 1.2029479740647613e-05, "loss": 0.0226, "step": 491 }, { "epoch": 1.3563644195563214, "grad_norm": 0.10992252826690674, "learning_rate": 1.2000256937760446e-05, "loss": 0.0254, "step": 492 }, { "epoch": 1.359121257807452, "grad_norm": 0.12964554131031036, "learning_rate": 1.1971016330427217e-05, "loss": 0.0249, "step": 493 }, { "epoch": 1.3618780960585828, "grad_norm": 0.10349434614181519, "learning_rate": 1.1941758178920899e-05, "loss": 0.0243, "step": 494 }, { "epoch": 1.3646349343097135, "grad_norm": 0.13970984518527985, "learning_rate": 1.1912482743670624e-05, "loss": 0.024, "step": 495 }, { "epoch": 1.3673917725608442, "grad_norm": 0.10153508186340332, "learning_rate": 1.188319028525937e-05, "loss": 0.0237, "step": 496 }, { "epoch": 1.3701486108119751, "grad_norm": 0.12652255594730377, "learning_rate": 1.1853881064421634e-05, "loss": 0.0248, "step": 497 }, { "epoch": 1.3729054490631056, "grad_norm": 0.1191074326634407, "learning_rate": 1.1824555342041129e-05, "loss": 0.0244, "step": 498 }, { "epoch": 1.3756622873142366, "grad_norm": 0.11915133893489838, "learning_rate": 1.1795213379148436e-05, "loss": 0.0232, "step": 499 }, { "epoch": 1.3784191255653673, "grad_norm": 0.11027630418539047, "learning_rate": 1.17658554369187e-05, "loss": 0.0255, "step": 500 }, { "epoch": 1.381175963816498, "grad_norm": 0.1299276500940323, "learning_rate": 1.1736481776669307e-05, "loss": 0.0243, "step": 501 }, { "epoch": 1.3839328020676287, "grad_norm": 0.10648605227470398, "learning_rate": 1.1707092659857531e-05, "loss": 0.024, "step": 502 }, { "epoch": 1.3866896403187594, "grad_norm": 0.12794575095176697, "learning_rate": 1.1677688348078244e-05, "loss": 0.0241, "step": 503 }, { "epoch": 1.3894464785698901, "grad_norm": 0.11318343877792358, "learning_rate": 1.1648269103061567e-05, "loss": 0.0239, "step": 504 }, { "epoch": 1.3922033168210208, "grad_norm": 0.10124088078737259, "learning_rate": 1.1618835186670532e-05, "loss": 0.0231, "step": 505 }, { "epoch": 1.3949601550721518, "grad_norm": 0.10141489654779434, "learning_rate": 1.1589386860898773e-05, "loss": 0.0246, "step": 506 }, { "epoch": 1.3977169933232823, "grad_norm": 0.11699099838733673, "learning_rate": 1.155992438786818e-05, "loss": 0.0237, "step": 507 }, { "epoch": 1.4004738315744132, "grad_norm": 0.10175668448209763, "learning_rate": 1.1530448029826566e-05, "loss": 0.023, "step": 508 }, { "epoch": 1.403230669825544, "grad_norm": 0.12864287197589874, "learning_rate": 1.1500958049145342e-05, "loss": 0.0247, "step": 509 }, { "epoch": 1.4059875080766746, "grad_norm": 0.8590685129165649, "learning_rate": 1.1471454708317163e-05, "loss": 0.0278, "step": 510 }, { "epoch": 1.4087443463278053, "grad_norm": 0.11892825365066528, "learning_rate": 1.1441938269953619e-05, "loss": 0.0251, "step": 511 }, { "epoch": 1.411501184578936, "grad_norm": 0.12736280262470245, "learning_rate": 1.1412408996782871e-05, "loss": 0.024, "step": 512 }, { "epoch": 1.4142580228300667, "grad_norm": 0.09979959577322006, "learning_rate": 1.1382867151647333e-05, "loss": 0.0234, "step": 513 }, { "epoch": 1.4170148610811975, "grad_norm": 0.11944937705993652, "learning_rate": 1.1353312997501313e-05, "loss": 0.0241, "step": 514 }, { "epoch": 1.4197716993323282, "grad_norm": 0.09399650245904922, "learning_rate": 1.1323746797408687e-05, "loss": 0.0229, "step": 515 }, { "epoch": 1.4225285375834589, "grad_norm": 0.11699047684669495, "learning_rate": 1.1294168814540554e-05, "loss": 0.025, "step": 516 }, { "epoch": 1.4252853758345898, "grad_norm": 0.137738898396492, "learning_rate": 1.1264579312172896e-05, "loss": 0.0253, "step": 517 }, { "epoch": 1.4280422140857205, "grad_norm": 0.11010394990444183, "learning_rate": 1.123497855368422e-05, "loss": 0.0245, "step": 518 }, { "epoch": 1.4307990523368512, "grad_norm": 0.09499143809080124, "learning_rate": 1.1205366802553231e-05, "loss": 0.0232, "step": 519 }, { "epoch": 1.433555890587982, "grad_norm": 0.11812635511159897, "learning_rate": 1.1175744322356488e-05, "loss": 0.0238, "step": 520 }, { "epoch": 1.4363127288391127, "grad_norm": 0.10630928725004196, "learning_rate": 1.1146111376766033e-05, "loss": 0.0247, "step": 521 }, { "epoch": 1.4390695670902434, "grad_norm": 0.11619951575994492, "learning_rate": 1.1116468229547079e-05, "loss": 0.024, "step": 522 }, { "epoch": 1.441826405341374, "grad_norm": 0.09959083050489426, "learning_rate": 1.1086815144555633e-05, "loss": 0.0244, "step": 523 }, { "epoch": 1.4445832435925048, "grad_norm": 0.10520867258310318, "learning_rate": 1.105715238573616e-05, "loss": 0.0248, "step": 524 }, { "epoch": 1.4473400818436355, "grad_norm": 0.09931975603103638, "learning_rate": 1.1027480217119245e-05, "loss": 0.0237, "step": 525 }, { "epoch": 1.4500969200947664, "grad_norm": 0.10330688953399658, "learning_rate": 1.0997798902819208e-05, "loss": 0.0245, "step": 526 }, { "epoch": 1.452853758345897, "grad_norm": 0.09967660158872604, "learning_rate": 1.0968108707031792e-05, "loss": 0.0245, "step": 527 }, { "epoch": 1.4556105965970279, "grad_norm": 0.11198758333921432, "learning_rate": 1.0938409894031793e-05, "loss": 0.0243, "step": 528 }, { "epoch": 1.4583674348481586, "grad_norm": 0.8267385959625244, "learning_rate": 1.0908702728170706e-05, "loss": 0.0254, "step": 529 }, { "epoch": 1.4611242730992893, "grad_norm": 0.09910894185304642, "learning_rate": 1.0878987473874381e-05, "loss": 0.0238, "step": 530 }, { "epoch": 1.46388111135042, "grad_norm": 0.10041078925132751, "learning_rate": 1.084926439564065e-05, "loss": 0.0246, "step": 531 }, { "epoch": 1.4666379496015507, "grad_norm": 0.1009271889925003, "learning_rate": 1.0819533758037002e-05, "loss": 0.0241, "step": 532 }, { "epoch": 1.4693947878526814, "grad_norm": 0.10631167888641357, "learning_rate": 1.0789795825698206e-05, "loss": 0.0249, "step": 533 }, { "epoch": 1.4721516261038121, "grad_norm": 0.1118827611207962, "learning_rate": 1.0760050863323961e-05, "loss": 0.0244, "step": 534 }, { "epoch": 1.474908464354943, "grad_norm": 0.11578582972288132, "learning_rate": 1.0730299135676545e-05, "loss": 0.0239, "step": 535 }, { "epoch": 1.4776653026060735, "grad_norm": 0.102451853454113, "learning_rate": 1.0700540907578447e-05, "loss": 0.0228, "step": 536 }, { "epoch": 1.4804221408572045, "grad_norm": 0.09739474207162857, "learning_rate": 1.0670776443910024e-05, "loss": 0.0235, "step": 537 }, { "epoch": 1.4831789791083352, "grad_norm": 0.1013341024518013, "learning_rate": 1.0641006009607137e-05, "loss": 0.0241, "step": 538 }, { "epoch": 1.485935817359466, "grad_norm": 0.10632984340190887, "learning_rate": 1.0611229869658785e-05, "loss": 0.0235, "step": 539 }, { "epoch": 1.4886926556105966, "grad_norm": 0.115880087018013, "learning_rate": 1.0581448289104759e-05, "loss": 0.0238, "step": 540 }, { "epoch": 1.4914494938617273, "grad_norm": 0.1438998281955719, "learning_rate": 1.0551661533033274e-05, "loss": 0.0249, "step": 541 }, { "epoch": 1.494206332112858, "grad_norm": 0.10747671127319336, "learning_rate": 1.052186986657862e-05, "loss": 0.0239, "step": 542 }, { "epoch": 1.4969631703639887, "grad_norm": 0.11497830599546432, "learning_rate": 1.0492073554918782e-05, "loss": 0.024, "step": 543 }, { "epoch": 1.4997200086151197, "grad_norm": 0.11187911778688431, "learning_rate": 1.0462272863273105e-05, "loss": 0.0237, "step": 544 }, { "epoch": 1.5024768468662502, "grad_norm": 0.10226496309041977, "learning_rate": 1.0432468056899909e-05, "loss": 0.0226, "step": 545 }, { "epoch": 1.505233685117381, "grad_norm": 0.12337212264537811, "learning_rate": 1.0402659401094154e-05, "loss": 0.0237, "step": 546 }, { "epoch": 1.5079905233685116, "grad_norm": 0.1370334029197693, "learning_rate": 1.0372847161185047e-05, "loss": 0.0246, "step": 547 }, { "epoch": 1.5107473616196425, "grad_norm": 0.1170111671090126, "learning_rate": 1.0343031602533713e-05, "loss": 0.0232, "step": 548 }, { "epoch": 1.5135041998707732, "grad_norm": 0.10803765803575516, "learning_rate": 1.0313212990530804e-05, "loss": 0.0241, "step": 549 }, { "epoch": 1.516261038121904, "grad_norm": 0.19091671705245972, "learning_rate": 1.028339159059416e-05, "loss": 0.025, "step": 550 }, { "epoch": 1.5190178763730346, "grad_norm": 0.11063394695520401, "learning_rate": 1.0253567668166436e-05, "loss": 0.024, "step": 551 }, { "epoch": 1.5217747146241654, "grad_norm": 0.11760086566209793, "learning_rate": 1.0223741488712732e-05, "loss": 0.0239, "step": 552 }, { "epoch": 1.5245315528752963, "grad_norm": 0.10287030041217804, "learning_rate": 1.0193913317718245e-05, "loss": 0.0237, "step": 553 }, { "epoch": 1.5272883911264268, "grad_norm": 0.1050201803445816, "learning_rate": 1.0164083420685898e-05, "loss": 0.0244, "step": 554 }, { "epoch": 1.5300452293775577, "grad_norm": 0.10405981540679932, "learning_rate": 1.0134252063133976e-05, "loss": 0.0231, "step": 555 }, { "epoch": 1.5328020676286882, "grad_norm": 0.09600961208343506, "learning_rate": 1.0104419510593764e-05, "loss": 0.0222, "step": 556 }, { "epoch": 1.5355589058798191, "grad_norm": 0.2610664367675781, "learning_rate": 1.0074586028607184e-05, "loss": 0.0242, "step": 557 }, { "epoch": 1.5383157441309498, "grad_norm": 0.10342807322740555, "learning_rate": 1.0044751882724436e-05, "loss": 0.0226, "step": 558 }, { "epoch": 1.5410725823820806, "grad_norm": 0.11091192066669464, "learning_rate": 1.0014917338501618e-05, "loss": 0.0234, "step": 559 }, { "epoch": 1.5438294206332113, "grad_norm": 0.10212964564561844, "learning_rate": 9.985082661498384e-06, "loss": 0.0241, "step": 560 }, { "epoch": 1.546586258884342, "grad_norm": 0.12537163496017456, "learning_rate": 9.955248117275566e-06, "loss": 0.026, "step": 561 }, { "epoch": 1.549343097135473, "grad_norm": 0.10132145881652832, "learning_rate": 9.925413971392816e-06, "loss": 0.0237, "step": 562 }, { "epoch": 1.5520999353866034, "grad_norm": 0.11508084088563919, "learning_rate": 9.89558048940624e-06, "loss": 0.0235, "step": 563 }, { "epoch": 1.5548567736377343, "grad_norm": 0.11934718489646912, "learning_rate": 9.865747936866027e-06, "loss": 0.0234, "step": 564 }, { "epoch": 1.5576136118888648, "grad_norm": 0.10267031192779541, "learning_rate": 9.835916579314105e-06, "loss": 0.0235, "step": 565 }, { "epoch": 1.5603704501399958, "grad_norm": 0.10713180154561996, "learning_rate": 9.806086682281759e-06, "loss": 0.0235, "step": 566 }, { "epoch": 1.5631272883911265, "grad_norm": 0.11230739206075668, "learning_rate": 9.776258511287271e-06, "loss": 0.0237, "step": 567 }, { "epoch": 1.5658841266422572, "grad_norm": 0.11151719093322754, "learning_rate": 9.746432331833568e-06, "loss": 0.0232, "step": 568 }, { "epoch": 1.568640964893388, "grad_norm": 0.10779338330030441, "learning_rate": 9.716608409405843e-06, "loss": 0.0236, "step": 569 }, { "epoch": 1.5713978031445186, "grad_norm": 0.10462480783462524, "learning_rate": 9.6867870094692e-06, "loss": 0.0232, "step": 570 }, { "epoch": 1.5741546413956493, "grad_norm": 0.11873821914196014, "learning_rate": 9.65696839746629e-06, "loss": 0.0235, "step": 571 }, { "epoch": 1.57691147964678, "grad_norm": 0.10550446808338165, "learning_rate": 9.627152838814954e-06, "loss": 0.0231, "step": 572 }, { "epoch": 1.579668317897911, "grad_norm": 0.10466040670871735, "learning_rate": 9.597340598905851e-06, "loss": 0.0241, "step": 573 }, { "epoch": 1.5824251561490414, "grad_norm": 0.09853997081518173, "learning_rate": 9.567531943100093e-06, "loss": 0.0227, "step": 574 }, { "epoch": 1.5851819944001724, "grad_norm": 0.4732920825481415, "learning_rate": 9.537727136726899e-06, "loss": 0.0275, "step": 575 }, { "epoch": 1.587938832651303, "grad_norm": 0.11485108733177185, "learning_rate": 9.50792644508122e-06, "loss": 0.0229, "step": 576 }, { "epoch": 1.5906956709024338, "grad_norm": 0.10848281532526016, "learning_rate": 9.478130133421381e-06, "loss": 0.0237, "step": 577 }, { "epoch": 1.5934525091535645, "grad_norm": 0.10901174694299698, "learning_rate": 9.448338466966727e-06, "loss": 0.0233, "step": 578 }, { "epoch": 1.5962093474046952, "grad_norm": 0.10002460330724716, "learning_rate": 9.418551710895243e-06, "loss": 0.0227, "step": 579 }, { "epoch": 1.598966185655826, "grad_norm": 0.10787548869848251, "learning_rate": 9.388770130341217e-06, "loss": 0.0235, "step": 580 }, { "epoch": 1.6017230239069566, "grad_norm": 0.10888142138719559, "learning_rate": 9.358993990392864e-06, "loss": 0.0235, "step": 581 }, { "epoch": 1.6044798621580876, "grad_norm": 0.11493578553199768, "learning_rate": 9.329223556089976e-06, "loss": 0.0244, "step": 582 }, { "epoch": 1.607236700409218, "grad_norm": 0.1134362742304802, "learning_rate": 9.299459092421558e-06, "loss": 0.0251, "step": 583 }, { "epoch": 1.609993538660349, "grad_norm": 0.10327161103487015, "learning_rate": 9.269700864323462e-06, "loss": 0.024, "step": 584 }, { "epoch": 1.6127503769114795, "grad_norm": 0.10661885142326355, "learning_rate": 9.239949136676042e-06, "loss": 0.0237, "step": 585 }, { "epoch": 1.6155072151626104, "grad_norm": 0.11634720116853714, "learning_rate": 9.210204174301797e-06, "loss": 0.0239, "step": 586 }, { "epoch": 1.6182640534137411, "grad_norm": 0.10218880325555801, "learning_rate": 9.180466241963e-06, "loss": 0.0231, "step": 587 }, { "epoch": 1.6210208916648718, "grad_norm": 0.10723837465047836, "learning_rate": 9.15073560435935e-06, "loss": 0.0233, "step": 588 }, { "epoch": 1.6237777299160026, "grad_norm": 0.10045973211526871, "learning_rate": 9.121012526125625e-06, "loss": 0.0229, "step": 589 }, { "epoch": 1.6265345681671333, "grad_norm": 0.1160292997956276, "learning_rate": 9.091297271829296e-06, "loss": 0.024, "step": 590 }, { "epoch": 1.6292914064182642, "grad_norm": 0.0929265022277832, "learning_rate": 9.061590105968208e-06, "loss": 0.022, "step": 591 }, { "epoch": 1.6320482446693947, "grad_norm": 0.09764169901609421, "learning_rate": 9.03189129296821e-06, "loss": 0.0237, "step": 592 }, { "epoch": 1.6348050829205256, "grad_norm": 0.10847126692533493, "learning_rate": 9.002201097180796e-06, "loss": 0.0229, "step": 593 }, { "epoch": 1.637561921171656, "grad_norm": 0.10012809187173843, "learning_rate": 8.97251978288076e-06, "loss": 0.0237, "step": 594 }, { "epoch": 1.640318759422787, "grad_norm": 0.10542436689138412, "learning_rate": 8.942847614263842e-06, "loss": 0.0225, "step": 595 }, { "epoch": 1.6430755976739178, "grad_norm": 0.10059002041816711, "learning_rate": 8.91318485544437e-06, "loss": 0.0229, "step": 596 }, { "epoch": 1.6458324359250485, "grad_norm": 0.09918685257434845, "learning_rate": 8.883531770452924e-06, "loss": 0.023, "step": 597 }, { "epoch": 1.6485892741761792, "grad_norm": 0.7748421430587769, "learning_rate": 8.853888623233967e-06, "loss": 0.0313, "step": 598 }, { "epoch": 1.6513461124273099, "grad_norm": 0.1017061397433281, "learning_rate": 8.824255677643518e-06, "loss": 0.0237, "step": 599 }, { "epoch": 1.6541029506784408, "grad_norm": 0.10307373106479645, "learning_rate": 8.79463319744677e-06, "loss": 0.0238, "step": 600 }, { "epoch": 1.6568597889295713, "grad_norm": 0.09575623273849487, "learning_rate": 8.765021446315784e-06, "loss": 0.0227, "step": 601 }, { "epoch": 1.6596166271807022, "grad_norm": 0.10236816108226776, "learning_rate": 8.735420687827107e-06, "loss": 0.0242, "step": 602 }, { "epoch": 1.6623734654318327, "grad_norm": 0.1926388442516327, "learning_rate": 8.705831185459446e-06, "loss": 0.0269, "step": 603 }, { "epoch": 1.6651303036829637, "grad_norm": 0.10154667496681213, "learning_rate": 8.676253202591318e-06, "loss": 0.0238, "step": 604 }, { "epoch": 1.6678871419340944, "grad_norm": 0.10679133236408234, "learning_rate": 8.646687002498692e-06, "loss": 0.0235, "step": 605 }, { "epoch": 1.670643980185225, "grad_norm": 0.1166776716709137, "learning_rate": 8.617132848352672e-06, "loss": 0.0228, "step": 606 }, { "epoch": 1.6734008184363558, "grad_norm": 0.10293897241353989, "learning_rate": 8.58759100321713e-06, "loss": 0.022, "step": 607 }, { "epoch": 1.6761576566874865, "grad_norm": 0.3156561255455017, "learning_rate": 8.558061730046385e-06, "loss": 0.0259, "step": 608 }, { "epoch": 1.6789144949386172, "grad_norm": 0.10844126343727112, "learning_rate": 8.528545291682839e-06, "loss": 0.0227, "step": 609 }, { "epoch": 1.681671333189748, "grad_norm": 0.10488175600767136, "learning_rate": 8.499041950854665e-06, "loss": 0.0239, "step": 610 }, { "epoch": 1.6844281714408789, "grad_norm": 0.12386349588632584, "learning_rate": 8.469551970173437e-06, "loss": 0.0238, "step": 611 }, { "epoch": 1.6871850096920094, "grad_norm": 0.11962109059095383, "learning_rate": 8.440075612131823e-06, "loss": 0.0241, "step": 612 }, { "epoch": 1.6899418479431403, "grad_norm": 0.09642872214317322, "learning_rate": 8.410613139101229e-06, "loss": 0.0227, "step": 613 }, { "epoch": 1.692698686194271, "grad_norm": 0.10245215147733688, "learning_rate": 8.38116481332947e-06, "loss": 0.0224, "step": 614 }, { "epoch": 1.6954555244454017, "grad_norm": 0.10805162042379379, "learning_rate": 8.351730896938438e-06, "loss": 0.024, "step": 615 }, { "epoch": 1.6982123626965324, "grad_norm": 0.10161999613046646, "learning_rate": 8.322311651921759e-06, "loss": 0.0228, "step": 616 }, { "epoch": 1.7009692009476631, "grad_norm": 0.10508878529071808, "learning_rate": 8.292907340142472e-06, "loss": 0.0241, "step": 617 }, { "epoch": 1.7037260391987938, "grad_norm": 0.10359475761651993, "learning_rate": 8.263518223330698e-06, "loss": 0.0241, "step": 618 }, { "epoch": 1.7064828774499246, "grad_norm": 0.11690201610326767, "learning_rate": 8.2341445630813e-06, "loss": 0.0236, "step": 619 }, { "epoch": 1.7092397157010555, "grad_norm": 0.1021987721323967, "learning_rate": 8.204786620851569e-06, "loss": 0.0227, "step": 620 }, { "epoch": 1.711996553952186, "grad_norm": 0.10730181634426117, "learning_rate": 8.175444657958875e-06, "loss": 0.0221, "step": 621 }, { "epoch": 1.714753392203317, "grad_norm": 0.10022248327732086, "learning_rate": 8.146118935578367e-06, "loss": 0.0239, "step": 622 }, { "epoch": 1.7175102304544474, "grad_norm": 0.11165450513362885, "learning_rate": 8.116809714740634e-06, "loss": 0.0229, "step": 623 }, { "epoch": 1.7202670687055783, "grad_norm": 1.2205007076263428, "learning_rate": 8.087517256329376e-06, "loss": 0.024, "step": 624 }, { "epoch": 1.723023906956709, "grad_norm": 0.09469296783208847, "learning_rate": 8.058241821079106e-06, "loss": 0.0234, "step": 625 }, { "epoch": 1.7257807452078398, "grad_norm": 0.09778100997209549, "learning_rate": 8.028983669572786e-06, "loss": 0.0229, "step": 626 }, { "epoch": 1.7285375834589705, "grad_norm": 0.10135500133037567, "learning_rate": 7.999743062239557e-06, "loss": 0.0227, "step": 627 }, { "epoch": 1.7312944217101012, "grad_norm": 0.0966990739107132, "learning_rate": 7.97052025935239e-06, "loss": 0.0232, "step": 628 }, { "epoch": 1.734051259961232, "grad_norm": 0.09644000977277756, "learning_rate": 7.941315521025776e-06, "loss": 0.023, "step": 629 }, { "epoch": 1.7368080982123626, "grad_norm": 0.10050094872713089, "learning_rate": 7.912129107213417e-06, "loss": 0.022, "step": 630 }, { "epoch": 1.7395649364634935, "grad_norm": 0.09904894977807999, "learning_rate": 7.882961277705897e-06, "loss": 0.0224, "step": 631 }, { "epoch": 1.742321774714624, "grad_norm": 0.1084563285112381, "learning_rate": 7.853812292128386e-06, "loss": 0.0228, "step": 632 }, { "epoch": 1.745078612965755, "grad_norm": 0.11320846527814865, "learning_rate": 7.824682409938328e-06, "loss": 0.0235, "step": 633 }, { "epoch": 1.7478354512168857, "grad_norm": 0.1111724004149437, "learning_rate": 7.795571890423116e-06, "loss": 0.023, "step": 634 }, { "epoch": 1.7505922894680164, "grad_norm": 0.09731971472501755, "learning_rate": 7.766480992697802e-06, "loss": 0.0227, "step": 635 }, { "epoch": 1.753349127719147, "grad_norm": 0.11039575189352036, "learning_rate": 7.73740997570278e-06, "loss": 0.0225, "step": 636 }, { "epoch": 1.7561059659702778, "grad_norm": 0.10182961076498032, "learning_rate": 7.708359098201483e-06, "loss": 0.022, "step": 637 }, { "epoch": 1.7588628042214087, "grad_norm": 0.103904128074646, "learning_rate": 7.67932861877808e-06, "loss": 0.0227, "step": 638 }, { "epoch": 1.7616196424725392, "grad_norm": 0.12430708110332489, "learning_rate": 7.650318795835179e-06, "loss": 0.0233, "step": 639 }, { "epoch": 1.7643764807236701, "grad_norm": 0.09785618633031845, "learning_rate": 7.621329887591519e-06, "loss": 0.0236, "step": 640 }, { "epoch": 1.7671333189748006, "grad_norm": 0.10391392558813095, "learning_rate": 7.592362152079684e-06, "loss": 0.0228, "step": 641 }, { "epoch": 1.7698901572259316, "grad_norm": 0.33653295040130615, "learning_rate": 7.563415847143782e-06, "loss": 0.0281, "step": 642 }, { "epoch": 1.7726469954770623, "grad_norm": 0.12001726031303406, "learning_rate": 7.5344912304371785e-06, "loss": 0.0229, "step": 643 }, { "epoch": 1.775403833728193, "grad_norm": 0.12422462552785873, "learning_rate": 7.505588559420188e-06, "loss": 0.0238, "step": 644 }, { "epoch": 1.7781606719793237, "grad_norm": 0.09668347984552383, "learning_rate": 7.476708091357783e-06, "loss": 0.0224, "step": 645 }, { "epoch": 1.7809175102304544, "grad_norm": 0.11820239573717117, "learning_rate": 7.447850083317308e-06, "loss": 0.023, "step": 646 }, { "epoch": 1.7836743484815853, "grad_norm": 0.10456949472427368, "learning_rate": 7.419014792166182e-06, "loss": 0.0239, "step": 647 }, { "epoch": 1.7864311867327158, "grad_norm": 0.11468762159347534, "learning_rate": 7.39020247456963e-06, "loss": 0.0229, "step": 648 }, { "epoch": 1.7891880249838468, "grad_norm": 0.09829288721084595, "learning_rate": 7.361413386988379e-06, "loss": 0.0231, "step": 649 }, { "epoch": 1.7919448632349773, "grad_norm": 0.10345254838466644, "learning_rate": 7.332647785676388e-06, "loss": 0.0231, "step": 650 }, { "epoch": 1.7947017014861082, "grad_norm": 0.09892752021551132, "learning_rate": 7.303905926678565e-06, "loss": 0.0234, "step": 651 }, { "epoch": 1.797458539737239, "grad_norm": 0.09165947884321213, "learning_rate": 7.275188065828476e-06, "loss": 0.022, "step": 652 }, { "epoch": 1.8002153779883696, "grad_norm": 0.09468812495470047, "learning_rate": 7.246494458746085e-06, "loss": 0.0223, "step": 653 }, { "epoch": 1.8029722162395003, "grad_norm": 0.1075858324766159, "learning_rate": 7.217825360835475e-06, "loss": 0.0233, "step": 654 }, { "epoch": 1.805729054490631, "grad_norm": 0.10927974432706833, "learning_rate": 7.189181027282561e-06, "loss": 0.0232, "step": 655 }, { "epoch": 1.8084858927417617, "grad_norm": 0.10256339609622955, "learning_rate": 7.160561713052843e-06, "loss": 0.0237, "step": 656 }, { "epoch": 1.8112427309928925, "grad_norm": 0.11086180061101913, "learning_rate": 7.131967672889101e-06, "loss": 0.0234, "step": 657 }, { "epoch": 1.8139995692440234, "grad_norm": 0.10017542541027069, "learning_rate": 7.103399161309165e-06, "loss": 0.0225, "step": 658 }, { "epoch": 1.8167564074951539, "grad_norm": 0.10286585986614227, "learning_rate": 7.074856432603628e-06, "loss": 0.0225, "step": 659 }, { "epoch": 1.8195132457462848, "grad_norm": 0.10518784821033478, "learning_rate": 7.04633974083359e-06, "loss": 0.0234, "step": 660 }, { "epoch": 1.8222700839974153, "grad_norm": 0.10116968303918839, "learning_rate": 7.017849339828388e-06, "loss": 0.0233, "step": 661 }, { "epoch": 1.8250269222485462, "grad_norm": 0.10069679468870163, "learning_rate": 6.989385483183355e-06, "loss": 0.0234, "step": 662 }, { "epoch": 1.827783760499677, "grad_norm": 0.09656015038490295, "learning_rate": 6.960948424257532e-06, "loss": 0.0225, "step": 663 }, { "epoch": 1.8305405987508077, "grad_norm": 0.10670676827430725, "learning_rate": 6.9325384161714485e-06, "loss": 0.0231, "step": 664 }, { "epoch": 1.8332974370019384, "grad_norm": 0.12347907572984695, "learning_rate": 6.904155711804843e-06, "loss": 0.0235, "step": 665 }, { "epoch": 1.836054275253069, "grad_norm": 0.09916937351226807, "learning_rate": 6.8758005637944245e-06, "loss": 0.0229, "step": 666 }, { "epoch": 1.8388111135042, "grad_norm": 0.09796298295259476, "learning_rate": 6.8474732245316245e-06, "loss": 0.0234, "step": 667 }, { "epoch": 1.8415679517553305, "grad_norm": 0.1063319593667984, "learning_rate": 6.819173946160336e-06, "loss": 0.0232, "step": 668 }, { "epoch": 1.8443247900064614, "grad_norm": 0.09941181540489197, "learning_rate": 6.7909029805746855e-06, "loss": 0.0234, "step": 669 }, { "epoch": 1.847081628257592, "grad_norm": 0.09763844311237335, "learning_rate": 6.762660579416791e-06, "loss": 0.0237, "step": 670 }, { "epoch": 1.8498384665087229, "grad_norm": 0.094894640147686, "learning_rate": 6.734446994074507e-06, "loss": 0.0216, "step": 671 }, { "epoch": 1.8525953047598536, "grad_norm": 0.10713546723127365, "learning_rate": 6.706262475679205e-06, "loss": 0.0232, "step": 672 }, { "epoch": 1.8553521430109843, "grad_norm": 0.09470642358064651, "learning_rate": 6.678107275103519e-06, "loss": 0.0231, "step": 673 }, { "epoch": 1.858108981262115, "grad_norm": 0.09599259495735168, "learning_rate": 6.649981642959133e-06, "loss": 0.023, "step": 674 }, { "epoch": 1.8608658195132457, "grad_norm": 0.09776968508958817, "learning_rate": 6.62188582959453e-06, "loss": 0.0224, "step": 675 }, { "epoch": 1.8636226577643766, "grad_norm": 0.09774177521467209, "learning_rate": 6.593820085092782e-06, "loss": 0.0222, "step": 676 }, { "epoch": 1.8663794960155071, "grad_norm": 0.10560328513383865, "learning_rate": 6.565784659269314e-06, "loss": 0.0233, "step": 677 }, { "epoch": 1.869136334266638, "grad_norm": 0.09616672247648239, "learning_rate": 6.537779801669677e-06, "loss": 0.023, "step": 678 }, { "epoch": 1.8718931725177685, "grad_norm": 0.10657691955566406, "learning_rate": 6.509805761567336e-06, "loss": 0.0232, "step": 679 }, { "epoch": 1.8746500107688995, "grad_norm": 0.09221483767032623, "learning_rate": 6.481862787961448e-06, "loss": 0.0227, "step": 680 }, { "epoch": 1.8774068490200302, "grad_norm": 0.08830998837947845, "learning_rate": 6.453951129574644e-06, "loss": 0.0221, "step": 681 }, { "epoch": 1.880163687271161, "grad_norm": 0.08807796984910965, "learning_rate": 6.4260710348508115e-06, "loss": 0.0224, "step": 682 }, { "epoch": 1.8829205255222916, "grad_norm": 0.08681437373161316, "learning_rate": 6.3982227519528986e-06, "loss": 0.0216, "step": 683 }, { "epoch": 1.8856773637734223, "grad_norm": 0.09876042604446411, "learning_rate": 6.370406528760675e-06, "loss": 0.0228, "step": 684 }, { "epoch": 1.8884342020245533, "grad_norm": 0.10514353215694427, "learning_rate": 6.34262261286856e-06, "loss": 0.0219, "step": 685 }, { "epoch": 1.8911910402756837, "grad_norm": 0.0905836671590805, "learning_rate": 6.3148712515833985e-06, "loss": 0.022, "step": 686 }, { "epoch": 1.8939478785268147, "grad_norm": 0.09052827209234238, "learning_rate": 6.287152691922264e-06, "loss": 0.0216, "step": 687 }, { "epoch": 1.8967047167779452, "grad_norm": 0.101600281894207, "learning_rate": 6.259467180610262e-06, "loss": 0.0233, "step": 688 }, { "epoch": 1.899461555029076, "grad_norm": 0.10384192317724228, "learning_rate": 6.231814964078327e-06, "loss": 0.0233, "step": 689 }, { "epoch": 1.9022183932802068, "grad_norm": 0.1043287143111229, "learning_rate": 6.204196288461037e-06, "loss": 0.0224, "step": 690 }, { "epoch": 1.9049752315313375, "grad_norm": 0.41572827100753784, "learning_rate": 6.176611399594422e-06, "loss": 0.0241, "step": 691 }, { "epoch": 1.9077320697824682, "grad_norm": 0.0979122593998909, "learning_rate": 6.149060543013772e-06, "loss": 0.0222, "step": 692 }, { "epoch": 1.910488908033599, "grad_norm": 0.09667398780584335, "learning_rate": 6.121543963951453e-06, "loss": 0.0232, "step": 693 }, { "epoch": 1.9132457462847297, "grad_norm": 0.10258089005947113, "learning_rate": 6.094061907334718e-06, "loss": 0.023, "step": 694 }, { "epoch": 1.9160025845358604, "grad_norm": 0.09479233622550964, "learning_rate": 6.066614617783542e-06, "loss": 0.0225, "step": 695 }, { "epoch": 1.9187594227869913, "grad_norm": 0.09578879177570343, "learning_rate": 6.039202339608432e-06, "loss": 0.0226, "step": 696 }, { "epoch": 1.9215162610381218, "grad_norm": 0.09669813513755798, "learning_rate": 6.0118253168082555e-06, "loss": 0.0228, "step": 697 }, { "epoch": 1.9242730992892527, "grad_norm": 0.09967345744371414, "learning_rate": 5.984483793068072e-06, "loss": 0.023, "step": 698 }, { "epoch": 1.9270299375403832, "grad_norm": 0.09882698953151703, "learning_rate": 5.957178011756952e-06, "loss": 0.0239, "step": 699 }, { "epoch": 1.9297867757915141, "grad_norm": 0.10672547668218613, "learning_rate": 5.92990821592583e-06, "loss": 0.0235, "step": 700 }, { "epoch": 1.9325436140426449, "grad_norm": 0.10070157796144485, "learning_rate": 5.902674648305329e-06, "loss": 0.0223, "step": 701 }, { "epoch": 1.9353004522937756, "grad_norm": 0.1068674623966217, "learning_rate": 5.875477551303596e-06, "loss": 0.0236, "step": 702 }, { "epoch": 1.9380572905449063, "grad_norm": 0.1479007601737976, "learning_rate": 5.848317167004159e-06, "loss": 0.024, "step": 703 }, { "epoch": 1.940814128796037, "grad_norm": 0.09680446237325668, "learning_rate": 5.8211937371637525e-06, "loss": 0.0224, "step": 704 }, { "epoch": 1.943570967047168, "grad_norm": 0.09958844631910324, "learning_rate": 5.794107503210187e-06, "loss": 0.0228, "step": 705 }, { "epoch": 1.9463278052982984, "grad_norm": 0.10216110944747925, "learning_rate": 5.767058706240183e-06, "loss": 0.0217, "step": 706 }, { "epoch": 1.9490846435494293, "grad_norm": 0.09057790040969849, "learning_rate": 5.740047587017232e-06, "loss": 0.0225, "step": 707 }, { "epoch": 1.9518414818005598, "grad_norm": 0.10457273572683334, "learning_rate": 5.713074385969457e-06, "loss": 0.0222, "step": 708 }, { "epoch": 1.9545983200516908, "grad_norm": 0.1035657674074173, "learning_rate": 5.686139343187468e-06, "loss": 0.0229, "step": 709 }, { "epoch": 1.9573551583028215, "grad_norm": 0.1085587665438652, "learning_rate": 5.659242698422213e-06, "loss": 0.0222, "step": 710 }, { "epoch": 1.9601119965539522, "grad_norm": 0.10512305051088333, "learning_rate": 5.632384691082874e-06, "loss": 0.023, "step": 711 }, { "epoch": 1.962868834805083, "grad_norm": 0.1115739643573761, "learning_rate": 5.605565560234707e-06, "loss": 0.0228, "step": 712 }, { "epoch": 1.9656256730562136, "grad_norm": 0.11410468816757202, "learning_rate": 5.578785544596928e-06, "loss": 0.0234, "step": 713 }, { "epoch": 1.9683825113073445, "grad_norm": 0.10378382354974747, "learning_rate": 5.55204488254059e-06, "loss": 0.0242, "step": 714 }, { "epoch": 1.971139349558475, "grad_norm": 0.11169740557670593, "learning_rate": 5.525343812086445e-06, "loss": 0.0227, "step": 715 }, { "epoch": 1.973896187809606, "grad_norm": 0.11073627322912216, "learning_rate": 5.498682570902849e-06, "loss": 0.0227, "step": 716 }, { "epoch": 1.9766530260607365, "grad_norm": 0.10393860191106796, "learning_rate": 5.47206139630363e-06, "loss": 0.0231, "step": 717 }, { "epoch": 1.9794098643118674, "grad_norm": 0.09541879594326019, "learning_rate": 5.445480525245976e-06, "loss": 0.0235, "step": 718 }, { "epoch": 1.982166702562998, "grad_norm": 0.10563753545284271, "learning_rate": 5.418940194328344e-06, "loss": 0.0232, "step": 719 }, { "epoch": 1.9849235408141288, "grad_norm": 0.09059736877679825, "learning_rate": 5.3924406397883174e-06, "loss": 0.0221, "step": 720 }, { "epoch": 1.9876803790652595, "grad_norm": 0.0970776304602623, "learning_rate": 5.365982097500545e-06, "loss": 0.0219, "step": 721 }, { "epoch": 1.9904372173163902, "grad_norm": 0.10448465496301651, "learning_rate": 5.339564802974615e-06, "loss": 0.0218, "step": 722 }, { "epoch": 1.9931940555675212, "grad_norm": 0.11694996803998947, "learning_rate": 5.313188991352964e-06, "loss": 0.0223, "step": 723 }, { "epoch": 1.9959508938186517, "grad_norm": 0.10104145109653473, "learning_rate": 5.286854897408793e-06, "loss": 0.0229, "step": 724 }, { "epoch": 1.9987077320697826, "grad_norm": 0.10007411241531372, "learning_rate": 5.2605627555439635e-06, "loss": 0.0229, "step": 725 }, { "epoch": 2.001464570320913, "grad_norm": 0.10868417471647263, "learning_rate": 5.234312799786921e-06, "loss": 0.0199, "step": 726 }, { "epoch": 2.004221408572044, "grad_norm": 0.1075412854552269, "learning_rate": 5.208105263790611e-06, "loss": 0.0169, "step": 727 }, { "epoch": 2.0069782468231745, "grad_norm": 0.09774283319711685, "learning_rate": 5.181940380830393e-06, "loss": 0.0166, "step": 728 }, { "epoch": 2.0097350850743054, "grad_norm": 0.09378799796104431, "learning_rate": 5.155818383801976e-06, "loss": 0.0169, "step": 729 }, { "epoch": 2.0124919233254364, "grad_norm": 0.09300612658262253, "learning_rate": 5.129739505219325e-06, "loss": 0.0175, "step": 730 }, { "epoch": 2.015248761576567, "grad_norm": 0.10933335870504379, "learning_rate": 5.103703977212615e-06, "loss": 0.0172, "step": 731 }, { "epoch": 2.018005599827698, "grad_norm": 0.10995355993509293, "learning_rate": 5.077712031526153e-06, "loss": 0.0165, "step": 732 }, { "epoch": 2.0207624380788283, "grad_norm": 0.0987633690237999, "learning_rate": 5.051763899516313e-06, "loss": 0.0169, "step": 733 }, { "epoch": 2.023519276329959, "grad_norm": 0.10309556126594543, "learning_rate": 5.025859812149481e-06, "loss": 0.0166, "step": 734 }, { "epoch": 2.0262761145810897, "grad_norm": 0.10243550688028336, "learning_rate": 5.000000000000003e-06, "loss": 0.0176, "step": 735 }, { "epoch": 2.0290329528322206, "grad_norm": 0.10226277261972427, "learning_rate": 4.9741846932481154e-06, "loss": 0.0163, "step": 736 }, { "epoch": 2.031789791083351, "grad_norm": 0.10129394382238388, "learning_rate": 4.94841412167792e-06, "loss": 0.0167, "step": 737 }, { "epoch": 2.034546629334482, "grad_norm": 0.09485170245170593, "learning_rate": 4.922688514675325e-06, "loss": 0.0167, "step": 738 }, { "epoch": 2.0373034675856125, "grad_norm": 0.10152100771665573, "learning_rate": 4.8970081012260014e-06, "loss": 0.0169, "step": 739 }, { "epoch": 2.0400603058367435, "grad_norm": 0.10325178503990173, "learning_rate": 4.8713731099133576e-06, "loss": 0.0172, "step": 740 }, { "epoch": 2.0428171440878744, "grad_norm": 0.10417655855417252, "learning_rate": 4.845783768916482e-06, "loss": 0.0167, "step": 741 }, { "epoch": 2.045573982339005, "grad_norm": 0.0955713763833046, "learning_rate": 4.820240306008136e-06, "loss": 0.0164, "step": 742 }, { "epoch": 2.048330820590136, "grad_norm": 0.10096914321184158, "learning_rate": 4.794742948552716e-06, "loss": 0.0164, "step": 743 }, { "epoch": 2.0510876588412663, "grad_norm": 0.11180409044027328, "learning_rate": 4.769291923504226e-06, "loss": 0.0169, "step": 744 }, { "epoch": 2.0538444970923972, "grad_norm": 0.10208172351121902, "learning_rate": 4.743887457404268e-06, "loss": 0.0159, "step": 745 }, { "epoch": 2.0566013353435277, "grad_norm": 0.10103499889373779, "learning_rate": 4.718529776380009e-06, "loss": 0.0161, "step": 746 }, { "epoch": 2.0593581735946587, "grad_norm": 0.09129516035318375, "learning_rate": 4.693219106142186e-06, "loss": 0.0164, "step": 747 }, { "epoch": 2.062115011845789, "grad_norm": 0.10983917117118835, "learning_rate": 4.66795567198309e-06, "loss": 0.0165, "step": 748 }, { "epoch": 2.06487185009692, "grad_norm": 0.10270462185144424, "learning_rate": 4.642739698774555e-06, "loss": 0.0171, "step": 749 }, { "epoch": 2.067628688348051, "grad_norm": 0.10552225261926651, "learning_rate": 4.617571410965964e-06, "loss": 0.0167, "step": 750 }, { "epoch": 2.0703855265991815, "grad_norm": 0.10517138242721558, "learning_rate": 4.59245103258225e-06, "loss": 0.0164, "step": 751 }, { "epoch": 2.0731423648503124, "grad_norm": 0.09321381151676178, "learning_rate": 4.567378787221896e-06, "loss": 0.0163, "step": 752 }, { "epoch": 2.075899203101443, "grad_norm": 0.10226123034954071, "learning_rate": 4.542354898054953e-06, "loss": 0.0168, "step": 753 }, { "epoch": 2.078656041352574, "grad_norm": 0.09658444672822952, "learning_rate": 4.517379587821049e-06, "loss": 0.0161, "step": 754 }, { "epoch": 2.0814128796037044, "grad_norm": 0.08875004947185516, "learning_rate": 4.492453078827409e-06, "loss": 0.0158, "step": 755 }, { "epoch": 2.0841697178548353, "grad_norm": 0.10765478014945984, "learning_rate": 4.467575592946865e-06, "loss": 0.016, "step": 756 }, { "epoch": 2.086926556105966, "grad_norm": 0.09269033372402191, "learning_rate": 4.442747351615899e-06, "loss": 0.0156, "step": 757 }, { "epoch": 2.0896833943570967, "grad_norm": 0.099794402718544, "learning_rate": 4.417968575832664e-06, "loss": 0.0155, "step": 758 }, { "epoch": 2.0924402326082276, "grad_norm": 0.10920445621013641, "learning_rate": 4.393239486155011e-06, "loss": 0.0164, "step": 759 }, { "epoch": 2.095197070859358, "grad_norm": 0.10013429075479507, "learning_rate": 4.3685603026985356e-06, "loss": 0.0159, "step": 760 }, { "epoch": 2.097953909110489, "grad_norm": 0.1004144474864006, "learning_rate": 4.343931245134616e-06, "loss": 0.0171, "step": 761 }, { "epoch": 2.1007107473616196, "grad_norm": 0.11513813585042953, "learning_rate": 4.319352532688444e-06, "loss": 0.0167, "step": 762 }, { "epoch": 2.1034675856127505, "grad_norm": 0.10025149583816528, "learning_rate": 4.294824384137096e-06, "loss": 0.0167, "step": 763 }, { "epoch": 2.106224423863881, "grad_norm": 0.13849525153636932, "learning_rate": 4.270347017807575e-06, "loss": 0.0173, "step": 764 }, { "epoch": 2.108981262115012, "grad_norm": 0.10839790850877762, "learning_rate": 4.245920651574864e-06, "loss": 0.0164, "step": 765 }, { "epoch": 2.1117381003661424, "grad_norm": 0.09577582031488419, "learning_rate": 4.221545502859994e-06, "loss": 0.0158, "step": 766 }, { "epoch": 2.1144949386172733, "grad_norm": 0.1021181270480156, "learning_rate": 4.197221788628096e-06, "loss": 0.0163, "step": 767 }, { "epoch": 2.117251776868404, "grad_norm": 0.10877656936645508, "learning_rate": 4.172949725386488e-06, "loss": 0.0159, "step": 768 }, { "epoch": 2.1200086151195348, "grad_norm": 0.09782122820615768, "learning_rate": 4.148729529182736e-06, "loss": 0.0167, "step": 769 }, { "epoch": 2.1227654533706657, "grad_norm": 0.10682158172130585, "learning_rate": 4.124561415602729e-06, "loss": 0.0171, "step": 770 }, { "epoch": 2.125522291621796, "grad_norm": 0.10475680232048035, "learning_rate": 4.100445599768774e-06, "loss": 0.0164, "step": 771 }, { "epoch": 2.128279129872927, "grad_norm": 0.09578834474086761, "learning_rate": 4.0763822963376585e-06, "loss": 0.0158, "step": 772 }, { "epoch": 2.1310359681240576, "grad_norm": 0.09797288477420807, "learning_rate": 4.0523717194987634e-06, "loss": 0.0158, "step": 773 }, { "epoch": 2.1337928063751885, "grad_norm": 0.1016959697008133, "learning_rate": 4.028414082972141e-06, "loss": 0.0168, "step": 774 }, { "epoch": 2.136549644626319, "grad_norm": 0.09790080785751343, "learning_rate": 4.004509600006618e-06, "loss": 0.0155, "step": 775 }, { "epoch": 2.13930648287745, "grad_norm": 0.1030542254447937, "learning_rate": 3.980658483377903e-06, "loss": 0.0166, "step": 776 }, { "epoch": 2.142063321128581, "grad_norm": 0.11205972731113434, "learning_rate": 3.956860945386677e-06, "loss": 0.0158, "step": 777 }, { "epoch": 2.1448201593797114, "grad_norm": 0.09531433135271072, "learning_rate": 3.9331171978567206e-06, "loss": 0.0154, "step": 778 }, { "epoch": 2.1475769976308423, "grad_norm": 0.09850120544433594, "learning_rate": 3.909427452133017e-06, "loss": 0.0162, "step": 779 }, { "epoch": 2.150333835881973, "grad_norm": 0.11114904284477234, "learning_rate": 3.885791919079878e-06, "loss": 0.016, "step": 780 }, { "epoch": 2.1530906741331037, "grad_norm": 0.10965460538864136, "learning_rate": 3.862210809079061e-06, "loss": 0.0156, "step": 781 }, { "epoch": 2.155847512384234, "grad_norm": 0.09533203393220901, "learning_rate": 3.838684332027908e-06, "loss": 0.0163, "step": 782 }, { "epoch": 2.158604350635365, "grad_norm": 0.09494911879301071, "learning_rate": 3.815212697337451e-06, "loss": 0.0158, "step": 783 }, { "epoch": 2.1613611888864956, "grad_norm": 0.10040943324565887, "learning_rate": 3.7917961139305835e-06, "loss": 0.0162, "step": 784 }, { "epoch": 2.1641180271376266, "grad_norm": 0.09985315054655075, "learning_rate": 3.7684347902401753e-06, "loss": 0.0164, "step": 785 }, { "epoch": 2.166874865388757, "grad_norm": 0.09866054356098175, "learning_rate": 3.745128934207225e-06, "loss": 0.0161, "step": 786 }, { "epoch": 2.169631703639888, "grad_norm": 0.09665759652853012, "learning_rate": 3.7218787532790167e-06, "loss": 0.0163, "step": 787 }, { "epoch": 2.172388541891019, "grad_norm": 0.09830871224403381, "learning_rate": 3.6986844544072496e-06, "loss": 0.0165, "step": 788 }, { "epoch": 2.1751453801421494, "grad_norm": 0.10809045284986496, "learning_rate": 3.6755462440462288e-06, "loss": 0.0167, "step": 789 }, { "epoch": 2.1779022183932804, "grad_norm": 0.09549186378717422, "learning_rate": 3.6524643281510018e-06, "loss": 0.0164, "step": 790 }, { "epoch": 2.180659056644411, "grad_norm": 0.09945723414421082, "learning_rate": 3.6294389121755404e-06, "loss": 0.0159, "step": 791 }, { "epoch": 2.1834158948955418, "grad_norm": 0.09856431931257248, "learning_rate": 3.606470201070904e-06, "loss": 0.0164, "step": 792 }, { "epoch": 2.1861727331466723, "grad_norm": 0.09308286756277084, "learning_rate": 3.58355839928341e-06, "loss": 0.0155, "step": 793 }, { "epoch": 2.188929571397803, "grad_norm": 0.10592252761125565, "learning_rate": 3.560703710752833e-06, "loss": 0.0167, "step": 794 }, { "epoch": 2.1916864096489337, "grad_norm": 0.10037653893232346, "learning_rate": 3.5379063389105727e-06, "loss": 0.0167, "step": 795 }, { "epoch": 2.1944432479000646, "grad_norm": 0.1029176115989685, "learning_rate": 3.515166486677848e-06, "loss": 0.0173, "step": 796 }, { "epoch": 2.1972000861511956, "grad_norm": 0.10505726933479309, "learning_rate": 3.4924843564638946e-06, "loss": 0.0164, "step": 797 }, { "epoch": 2.199956924402326, "grad_norm": 0.12057758867740631, "learning_rate": 3.4698601501641517e-06, "loss": 0.0168, "step": 798 }, { "epoch": 2.202713762653457, "grad_norm": 0.09361850470304489, "learning_rate": 3.447294069158481e-06, "loss": 0.0155, "step": 799 }, { "epoch": 2.2054706009045875, "grad_norm": 0.10088173300027847, "learning_rate": 3.424786314309365e-06, "loss": 0.0157, "step": 800 }, { "epoch": 2.2082274391557184, "grad_norm": 0.09545209258794785, "learning_rate": 3.4023370859601192e-06, "loss": 0.016, "step": 801 }, { "epoch": 2.210984277406849, "grad_norm": 0.10133402049541473, "learning_rate": 3.3799465839331103e-06, "loss": 0.0167, "step": 802 }, { "epoch": 2.21374111565798, "grad_norm": 0.10037866234779358, "learning_rate": 3.3576150075279757e-06, "loss": 0.0166, "step": 803 }, { "epoch": 2.2164979539091103, "grad_norm": 0.13703982532024384, "learning_rate": 3.335342555519855e-06, "loss": 0.0167, "step": 804 }, { "epoch": 2.2192547921602412, "grad_norm": 0.10422453284263611, "learning_rate": 3.313129426157613e-06, "loss": 0.016, "step": 805 }, { "epoch": 2.222011630411372, "grad_norm": 0.09957727044820786, "learning_rate": 3.290975817162082e-06, "loss": 0.0156, "step": 806 }, { "epoch": 2.2247684686625027, "grad_norm": 0.0947730764746666, "learning_rate": 3.2688819257242963e-06, "loss": 0.0157, "step": 807 }, { "epoch": 2.2275253069136336, "grad_norm": 0.10239588469266891, "learning_rate": 3.246847948503744e-06, "loss": 0.016, "step": 808 }, { "epoch": 2.230282145164764, "grad_norm": 0.10617274791002274, "learning_rate": 3.2248740816266012e-06, "loss": 0.0158, "step": 809 }, { "epoch": 2.233038983415895, "grad_norm": 0.10229507088661194, "learning_rate": 3.2029605206840088e-06, "loss": 0.0162, "step": 810 }, { "epoch": 2.2357958216670255, "grad_norm": 0.09557145833969116, "learning_rate": 3.181107460730314e-06, "loss": 0.0157, "step": 811 }, { "epoch": 2.2385526599181564, "grad_norm": 0.11345598846673965, "learning_rate": 3.1593150962813425e-06, "loss": 0.0167, "step": 812 }, { "epoch": 2.241309498169287, "grad_norm": 0.11662036180496216, "learning_rate": 3.1375836213126653e-06, "loss": 0.0168, "step": 813 }, { "epoch": 2.244066336420418, "grad_norm": 0.10055527836084366, "learning_rate": 3.115913229257864e-06, "loss": 0.0165, "step": 814 }, { "epoch": 2.2468231746715484, "grad_norm": 0.10372263938188553, "learning_rate": 3.0943041130068243e-06, "loss": 0.0165, "step": 815 }, { "epoch": 2.2495800129226793, "grad_norm": 0.10061642527580261, "learning_rate": 3.0727564649040066e-06, "loss": 0.0157, "step": 816 }, { "epoch": 2.25233685117381, "grad_norm": 0.10070322453975677, "learning_rate": 3.0512704767467417e-06, "loss": 0.016, "step": 817 }, { "epoch": 2.2550936894249407, "grad_norm": 0.09765329211950302, "learning_rate": 3.0298463397835223e-06, "loss": 0.0158, "step": 818 }, { "epoch": 2.2578505276760716, "grad_norm": 0.09247851371765137, "learning_rate": 3.008484244712286e-06, "loss": 0.0161, "step": 819 }, { "epoch": 2.260607365927202, "grad_norm": 0.21034474670886993, "learning_rate": 2.987184381678747e-06, "loss": 0.0165, "step": 820 }, { "epoch": 2.263364204178333, "grad_norm": 0.1013868898153305, "learning_rate": 2.965946940274678e-06, "loss": 0.0164, "step": 821 }, { "epoch": 2.2661210424294636, "grad_norm": 0.10465855151414871, "learning_rate": 2.9447721095362325e-06, "loss": 0.0162, "step": 822 }, { "epoch": 2.2688778806805945, "grad_norm": 0.10128065943717957, "learning_rate": 2.9236600779422674e-06, "loss": 0.0162, "step": 823 }, { "epoch": 2.2716347189317254, "grad_norm": 0.1028231531381607, "learning_rate": 2.902611033412648e-06, "loss": 0.0167, "step": 824 }, { "epoch": 2.274391557182856, "grad_norm": 0.09928223490715027, "learning_rate": 2.8816251633065963e-06, "loss": 0.0163, "step": 825 }, { "epoch": 2.277148395433987, "grad_norm": 0.1016731932759285, "learning_rate": 2.8607026544210115e-06, "loss": 0.0156, "step": 826 }, { "epoch": 2.2799052336851173, "grad_norm": 0.11646661907434464, "learning_rate": 2.8398436929888085e-06, "loss": 0.0165, "step": 827 }, { "epoch": 2.2826620719362483, "grad_norm": 0.1035120040178299, "learning_rate": 2.819048464677261e-06, "loss": 0.0163, "step": 828 }, { "epoch": 2.2854189101873787, "grad_norm": 0.10418447107076645, "learning_rate": 2.798317154586352e-06, "loss": 0.016, "step": 829 }, { "epoch": 2.2881757484385097, "grad_norm": 0.095628522336483, "learning_rate": 2.7776499472471184e-06, "loss": 0.0155, "step": 830 }, { "epoch": 2.29093258668964, "grad_norm": 0.09392069280147552, "learning_rate": 2.7570470266200177e-06, "loss": 0.0155, "step": 831 }, { "epoch": 2.293689424940771, "grad_norm": 0.10431677848100662, "learning_rate": 2.736508576093285e-06, "loss": 0.0163, "step": 832 }, { "epoch": 2.2964462631919016, "grad_norm": 0.1353488713502884, "learning_rate": 2.716034778481301e-06, "loss": 0.0164, "step": 833 }, { "epoch": 2.2992031014430325, "grad_norm": 0.100525863468647, "learning_rate": 2.69562581602297e-06, "loss": 0.0161, "step": 834 }, { "epoch": 2.3019599396941635, "grad_norm": 0.0952887013554573, "learning_rate": 2.675281870380082e-06, "loss": 0.0155, "step": 835 }, { "epoch": 2.304716777945294, "grad_norm": 0.13243414461612701, "learning_rate": 2.65500312263572e-06, "loss": 0.0159, "step": 836 }, { "epoch": 2.307473616196425, "grad_norm": 0.09811341762542725, "learning_rate": 2.6347897532926293e-06, "loss": 0.0157, "step": 837 }, { "epoch": 2.3102304544475554, "grad_norm": 0.10353419929742813, "learning_rate": 2.6146419422716174e-06, "loss": 0.0157, "step": 838 }, { "epoch": 2.3129872926986863, "grad_norm": 0.09999054670333862, "learning_rate": 2.594559868909956e-06, "loss": 0.0157, "step": 839 }, { "epoch": 2.315744130949817, "grad_norm": 0.10014519840478897, "learning_rate": 2.5745437119597704e-06, "loss": 0.0162, "step": 840 }, { "epoch": 2.3185009692009477, "grad_norm": 0.09995284676551819, "learning_rate": 2.554593649586469e-06, "loss": 0.0163, "step": 841 }, { "epoch": 2.321257807452078, "grad_norm": 0.10019443184137344, "learning_rate": 2.5347098593671417e-06, "loss": 0.0161, "step": 842 }, { "epoch": 2.324014645703209, "grad_norm": 0.09603513777256012, "learning_rate": 2.514892518288988e-06, "loss": 0.0162, "step": 843 }, { "epoch": 2.3267714839543396, "grad_norm": 0.09885770082473755, "learning_rate": 2.49514180274774e-06, "loss": 0.0158, "step": 844 }, { "epoch": 2.3295283222054706, "grad_norm": 0.09956613928079605, "learning_rate": 2.4754578885460813e-06, "loss": 0.0156, "step": 845 }, { "epoch": 2.3322851604566015, "grad_norm": 0.09634565562009811, "learning_rate": 2.455840950892099e-06, "loss": 0.0158, "step": 846 }, { "epoch": 2.335041998707732, "grad_norm": 0.09536509960889816, "learning_rate": 2.436291164397715e-06, "loss": 0.0161, "step": 847 }, { "epoch": 2.337798836958863, "grad_norm": 0.09601006656885147, "learning_rate": 2.416808703077135e-06, "loss": 0.0167, "step": 848 }, { "epoch": 2.3405556752099934, "grad_norm": 0.10354162007570267, "learning_rate": 2.3973937403452983e-06, "loss": 0.0159, "step": 849 }, { "epoch": 2.3433125134611243, "grad_norm": 0.10057395696640015, "learning_rate": 2.3780464490163267e-06, "loss": 0.0163, "step": 850 }, { "epoch": 2.346069351712255, "grad_norm": 0.09877093136310577, "learning_rate": 2.3587670013020026e-06, "loss": 0.0159, "step": 851 }, { "epoch": 2.3488261899633858, "grad_norm": 0.11428674310445786, "learning_rate": 2.339555568810221e-06, "loss": 0.0162, "step": 852 }, { "epoch": 2.3515830282145167, "grad_norm": 0.10734842717647552, "learning_rate": 2.3204123225434714e-06, "loss": 0.0158, "step": 853 }, { "epoch": 2.354339866465647, "grad_norm": 0.09699169546365738, "learning_rate": 2.3013374328973113e-06, "loss": 0.0161, "step": 854 }, { "epoch": 2.357096704716778, "grad_norm": 0.09945055842399597, "learning_rate": 2.28233106965885e-06, "loss": 0.0164, "step": 855 }, { "epoch": 2.3598535429679086, "grad_norm": 0.09800931066274643, "learning_rate": 2.2633934020052383e-06, "loss": 0.0155, "step": 856 }, { "epoch": 2.3626103812190395, "grad_norm": 0.09882977604866028, "learning_rate": 2.2445245985021613e-06, "loss": 0.0154, "step": 857 }, { "epoch": 2.36536721947017, "grad_norm": 0.0957275852560997, "learning_rate": 2.2257248271023424e-06, "loss": 0.0164, "step": 858 }, { "epoch": 2.368124057721301, "grad_norm": 0.10268919914960861, "learning_rate": 2.206994255144036e-06, "loss": 0.0164, "step": 859 }, { "epoch": 2.3708808959724315, "grad_norm": 0.10079904645681381, "learning_rate": 2.188333049349556e-06, "loss": 0.0157, "step": 860 }, { "epoch": 2.3736377342235624, "grad_norm": 0.0992964580655098, "learning_rate": 2.1697413758237785e-06, "loss": 0.0158, "step": 861 }, { "epoch": 2.376394572474693, "grad_norm": 0.10243627429008484, "learning_rate": 2.1512194000526676e-06, "loss": 0.0167, "step": 862 }, { "epoch": 2.379151410725824, "grad_norm": 0.0987883061170578, "learning_rate": 2.1327672869018036e-06, "loss": 0.0163, "step": 863 }, { "epoch": 2.3819082489769547, "grad_norm": 0.10566847771406174, "learning_rate": 2.114385200614912e-06, "loss": 0.0157, "step": 864 }, { "epoch": 2.3846650872280852, "grad_norm": 0.09772182255983353, "learning_rate": 2.0960733048124082e-06, "loss": 0.0153, "step": 865 }, { "epoch": 2.387421925479216, "grad_norm": 0.11045119166374207, "learning_rate": 2.077831762489927e-06, "loss": 0.0165, "step": 866 }, { "epoch": 2.3901787637303467, "grad_norm": 0.1000387892127037, "learning_rate": 2.0596607360168897e-06, "loss": 0.0163, "step": 867 }, { "epoch": 2.3929356019814776, "grad_norm": 0.09834710508584976, "learning_rate": 2.0415603871350476e-06, "loss": 0.0162, "step": 868 }, { "epoch": 2.395692440232608, "grad_norm": 0.10450678318738937, "learning_rate": 2.023530876957045e-06, "loss": 0.0155, "step": 869 }, { "epoch": 2.398449278483739, "grad_norm": 0.11825753003358841, "learning_rate": 2.0055723659649907e-06, "loss": 0.0157, "step": 870 }, { "epoch": 2.40120611673487, "grad_norm": 0.09847236424684525, "learning_rate": 1.987685014009011e-06, "loss": 0.0158, "step": 871 }, { "epoch": 2.4039629549860004, "grad_norm": 0.09447720646858215, "learning_rate": 1.9698689803058523e-06, "loss": 0.0159, "step": 872 }, { "epoch": 2.406719793237131, "grad_norm": 0.09634242951869965, "learning_rate": 1.952124423437447e-06, "loss": 0.016, "step": 873 }, { "epoch": 2.409476631488262, "grad_norm": 0.10218273848295212, "learning_rate": 1.934451501349507e-06, "loss": 0.0153, "step": 874 }, { "epoch": 2.412233469739393, "grad_norm": 0.0962023138999939, "learning_rate": 1.9168503713501184e-06, "loss": 0.0156, "step": 875 }, { "epoch": 2.4149903079905233, "grad_norm": 0.09034065902233124, "learning_rate": 1.8993211901083353e-06, "loss": 0.0152, "step": 876 }, { "epoch": 2.417747146241654, "grad_norm": 0.1083788052201271, "learning_rate": 1.8818641136527959e-06, "loss": 0.0165, "step": 877 }, { "epoch": 2.4205039844927847, "grad_norm": 0.09179603308439255, "learning_rate": 1.8644792973703252e-06, "loss": 0.0154, "step": 878 }, { "epoch": 2.4232608227439156, "grad_norm": 0.1054045706987381, "learning_rate": 1.8471668960045575e-06, "loss": 0.0162, "step": 879 }, { "epoch": 2.426017660995046, "grad_norm": 0.10657548904418945, "learning_rate": 1.8299270636545518e-06, "loss": 0.0149, "step": 880 }, { "epoch": 2.428774499246177, "grad_norm": 0.09735960513353348, "learning_rate": 1.8127599537734297e-06, "loss": 0.0161, "step": 881 }, { "epoch": 2.431531337497308, "grad_norm": 0.11152695119380951, "learning_rate": 1.7956657191669969e-06, "loss": 0.0164, "step": 882 }, { "epoch": 2.4342881757484385, "grad_norm": 0.09965367615222931, "learning_rate": 1.7786445119923967e-06, "loss": 0.0154, "step": 883 }, { "epoch": 2.4370450139995694, "grad_norm": 0.09934116899967194, "learning_rate": 1.7616964837567497e-06, "loss": 0.0159, "step": 884 }, { "epoch": 2.4398018522507, "grad_norm": 0.10009805858135223, "learning_rate": 1.7448217853158e-06, "loss": 0.0159, "step": 885 }, { "epoch": 2.442558690501831, "grad_norm": 0.0922827422618866, "learning_rate": 1.7280205668725814e-06, "loss": 0.0156, "step": 886 }, { "epoch": 2.4453155287529613, "grad_norm": 0.1045307070016861, "learning_rate": 1.7112929779760768e-06, "loss": 0.0156, "step": 887 }, { "epoch": 2.4480723670040923, "grad_norm": 0.09331434965133667, "learning_rate": 1.6946391675198838e-06, "loss": 0.0155, "step": 888 }, { "epoch": 2.4508292052552227, "grad_norm": 0.09583646059036255, "learning_rate": 1.6780592837408926e-06, "loss": 0.0156, "step": 889 }, { "epoch": 2.4535860435063537, "grad_norm": 0.0972675010561943, "learning_rate": 1.6615534742179684e-06, "loss": 0.0158, "step": 890 }, { "epoch": 2.456342881757484, "grad_norm": 0.09544920176267624, "learning_rate": 1.6451218858706374e-06, "loss": 0.0155, "step": 891 }, { "epoch": 2.459099720008615, "grad_norm": 0.0940442830324173, "learning_rate": 1.6287646649577672e-06, "loss": 0.0154, "step": 892 }, { "epoch": 2.461856558259746, "grad_norm": 0.09687791019678116, "learning_rate": 1.6124819570762862e-06, "loss": 0.0152, "step": 893 }, { "epoch": 2.4646133965108765, "grad_norm": 0.09892424196004868, "learning_rate": 1.5962739071598709e-06, "loss": 0.0162, "step": 894 }, { "epoch": 2.4673702347620075, "grad_norm": 0.09757326543331146, "learning_rate": 1.5801406594776625e-06, "loss": 0.0154, "step": 895 }, { "epoch": 2.470127073013138, "grad_norm": 0.22701092064380646, "learning_rate": 1.5640823576329844e-06, "loss": 0.017, "step": 896 }, { "epoch": 2.472883911264269, "grad_norm": 0.094330795109272, "learning_rate": 1.5480991445620541e-06, "loss": 0.0151, "step": 897 }, { "epoch": 2.4756407495153994, "grad_norm": 0.09668730944395065, "learning_rate": 1.5321911625327224e-06, "loss": 0.0155, "step": 898 }, { "epoch": 2.4783975877665303, "grad_norm": 0.09876801818609238, "learning_rate": 1.5163585531432046e-06, "loss": 0.0157, "step": 899 }, { "epoch": 2.4811544260176612, "grad_norm": 0.1035366952419281, "learning_rate": 1.500601457320814e-06, "loss": 0.0151, "step": 900 }, { "epoch": 2.4839112642687917, "grad_norm": 0.0975731685757637, "learning_rate": 1.4849200153207176e-06, "loss": 0.0155, "step": 901 }, { "epoch": 2.4866681025199227, "grad_norm": 0.0988369807600975, "learning_rate": 1.4693143667246713e-06, "loss": 0.0157, "step": 902 }, { "epoch": 2.489424940771053, "grad_norm": 0.10393507033586502, "learning_rate": 1.453784650439798e-06, "loss": 0.0159, "step": 903 }, { "epoch": 2.492181779022184, "grad_norm": 0.09848541766405106, "learning_rate": 1.4383310046973365e-06, "loss": 0.0156, "step": 904 }, { "epoch": 2.4949386172733146, "grad_norm": 0.09526592493057251, "learning_rate": 1.4229535670514162e-06, "loss": 0.0159, "step": 905 }, { "epoch": 2.4976954555244455, "grad_norm": 0.256574809551239, "learning_rate": 1.407652474377832e-06, "loss": 0.0174, "step": 906 }, { "epoch": 2.500452293775576, "grad_norm": 0.0996571034193039, "learning_rate": 1.3924278628728305e-06, "loss": 0.0164, "step": 907 }, { "epoch": 2.503209132026707, "grad_norm": 0.10017193108797073, "learning_rate": 1.3772798680518828e-06, "loss": 0.0156, "step": 908 }, { "epoch": 2.5059659702778374, "grad_norm": 0.09400206059217453, "learning_rate": 1.3622086247484989e-06, "loss": 0.0157, "step": 909 }, { "epoch": 2.5087228085289683, "grad_norm": 0.09920066595077515, "learning_rate": 1.3472142671130139e-06, "loss": 0.0156, "step": 910 }, { "epoch": 2.5114796467800993, "grad_norm": 0.09991924464702606, "learning_rate": 1.3322969286113973e-06, "loss": 0.0162, "step": 911 }, { "epoch": 2.5142364850312298, "grad_norm": 0.09943817555904388, "learning_rate": 1.3174567420240647e-06, "loss": 0.0158, "step": 912 }, { "epoch": 2.5169933232823607, "grad_norm": 0.10019614547491074, "learning_rate": 1.3026938394446976e-06, "loss": 0.0153, "step": 913 }, { "epoch": 2.519750161533491, "grad_norm": 0.09503661096096039, "learning_rate": 1.2880083522790654e-06, "loss": 0.0158, "step": 914 }, { "epoch": 2.522506999784622, "grad_norm": 0.09582358598709106, "learning_rate": 1.273400411243857e-06, "loss": 0.0155, "step": 915 }, { "epoch": 2.5252638380357526, "grad_norm": 0.09465383738279343, "learning_rate": 1.2588701463655172e-06, "loss": 0.0154, "step": 916 }, { "epoch": 2.5280206762868835, "grad_norm": 0.10085918009281158, "learning_rate": 1.2444176869790925e-06, "loss": 0.016, "step": 917 }, { "epoch": 2.5307775145380145, "grad_norm": 0.10438365489244461, "learning_rate": 1.2300431617270669e-06, "loss": 0.0153, "step": 918 }, { "epoch": 2.533534352789145, "grad_norm": 0.1272774487733841, "learning_rate": 1.2157466985582367e-06, "loss": 0.016, "step": 919 }, { "epoch": 2.5362911910402755, "grad_norm": 0.10423656553030014, "learning_rate": 1.2015284247265567e-06, "loss": 0.0163, "step": 920 }, { "epoch": 2.5390480292914064, "grad_norm": 0.09617140144109726, "learning_rate": 1.1873884667900125e-06, "loss": 0.0157, "step": 921 }, { "epoch": 2.5418048675425373, "grad_norm": 0.09703914076089859, "learning_rate": 1.1733269506094957e-06, "loss": 0.0162, "step": 922 }, { "epoch": 2.544561705793668, "grad_norm": 0.09608243405818939, "learning_rate": 1.1593440013476775e-06, "loss": 0.0151, "step": 923 }, { "epoch": 2.5473185440447987, "grad_norm": 0.107514888048172, "learning_rate": 1.1454397434679022e-06, "loss": 0.0158, "step": 924 }, { "epoch": 2.5500753822959292, "grad_norm": 0.09657265990972519, "learning_rate": 1.1316143007330739e-06, "loss": 0.0153, "step": 925 }, { "epoch": 2.55283222054706, "grad_norm": 0.08969084173440933, "learning_rate": 1.1178677962045604e-06, "loss": 0.0154, "step": 926 }, { "epoch": 2.5555890587981906, "grad_norm": 0.09439155459403992, "learning_rate": 1.1042003522410882e-06, "loss": 0.0157, "step": 927 }, { "epoch": 2.5583458970493216, "grad_norm": 0.10195891559123993, "learning_rate": 1.090612090497668e-06, "loss": 0.0164, "step": 928 }, { "epoch": 2.5611027353004525, "grad_norm": 0.10047302395105362, "learning_rate": 1.077103131924493e-06, "loss": 0.016, "step": 929 }, { "epoch": 2.563859573551583, "grad_norm": 0.10115232318639755, "learning_rate": 1.0636735967658785e-06, "loss": 0.0158, "step": 930 }, { "epoch": 2.5666164118027135, "grad_norm": 0.09630636125802994, "learning_rate": 1.0503236045591857e-06, "loss": 0.0156, "step": 931 }, { "epoch": 2.5693732500538444, "grad_norm": 0.09693799912929535, "learning_rate": 1.037053274133758e-06, "loss": 0.0153, "step": 932 }, { "epoch": 2.5721300883049754, "grad_norm": 0.09665205329656601, "learning_rate": 1.0238627236098619e-06, "loss": 0.0151, "step": 933 }, { "epoch": 2.574886926556106, "grad_norm": 0.0942736491560936, "learning_rate": 1.0107520703976325e-06, "loss": 0.0151, "step": 934 }, { "epoch": 2.577643764807237, "grad_norm": 0.10749771445989609, "learning_rate": 9.977214311960404e-07, "loss": 0.016, "step": 935 }, { "epoch": 2.5804006030583673, "grad_norm": 0.09226176887750626, "learning_rate": 9.8477092199184e-07, "loss": 0.0152, "step": 936 }, { "epoch": 2.583157441309498, "grad_norm": 0.10179731994867325, "learning_rate": 9.719006580585444e-07, "loss": 0.0158, "step": 937 }, { "epoch": 2.5859142795606287, "grad_norm": 0.09366576373577118, "learning_rate": 9.591107539553945e-07, "loss": 0.0159, "step": 938 }, { "epoch": 2.5886711178117596, "grad_norm": 0.097577303647995, "learning_rate": 9.464013235263458e-07, "loss": 0.0157, "step": 939 }, { "epoch": 2.5914279560628906, "grad_norm": 0.10417082905769348, "learning_rate": 9.337724798990489e-07, "loss": 0.018, "step": 940 }, { "epoch": 2.594184794314021, "grad_norm": 0.1003030315041542, "learning_rate": 9.212243354838435e-07, "loss": 0.016, "step": 941 }, { "epoch": 2.596941632565152, "grad_norm": 0.09599234908819199, "learning_rate": 9.08757001972762e-07, "loss": 0.0154, "step": 942 }, { "epoch": 2.5996984708162825, "grad_norm": 0.09770214557647705, "learning_rate": 8.963705903385344e-07, "loss": 0.016, "step": 943 }, { "epoch": 2.6024553090674134, "grad_norm": 0.09537551552057266, "learning_rate": 8.8406521083359e-07, "loss": 0.0158, "step": 944 }, { "epoch": 2.605212147318544, "grad_norm": 0.09884826838970184, "learning_rate": 8.71840972989092e-07, "loss": 0.0156, "step": 945 }, { "epoch": 2.607968985569675, "grad_norm": 0.09631089121103287, "learning_rate": 8.596979856139553e-07, "loss": 0.0157, "step": 946 }, { "epoch": 2.6107258238208058, "grad_norm": 0.09276318550109863, "learning_rate": 8.476363567938751e-07, "loss": 0.0151, "step": 947 }, { "epoch": 2.6134826620719362, "grad_norm": 0.10186196863651276, "learning_rate": 8.356561938903707e-07, "loss": 0.0151, "step": 948 }, { "epoch": 2.6162395003230667, "grad_norm": 0.0975264310836792, "learning_rate": 8.237576035398198e-07, "loss": 0.0156, "step": 949 }, { "epoch": 2.6189963385741977, "grad_norm": 0.10101190954446793, "learning_rate": 8.119406916525252e-07, "loss": 0.016, "step": 950 }, { "epoch": 2.6217531768253286, "grad_norm": 0.10287696123123169, "learning_rate": 8.002055634117578e-07, "loss": 0.0157, "step": 951 }, { "epoch": 2.624510015076459, "grad_norm": 0.0948198139667511, "learning_rate": 7.885523232728287e-07, "loss": 0.0158, "step": 952 }, { "epoch": 2.62726685332759, "grad_norm": 0.10126490145921707, "learning_rate": 7.769810749621532e-07, "loss": 0.0162, "step": 953 }, { "epoch": 2.6300236915787205, "grad_norm": 0.09267105907201767, "learning_rate": 7.654919214763357e-07, "loss": 0.0155, "step": 954 }, { "epoch": 2.6327805298298514, "grad_norm": 0.10376208275556564, "learning_rate": 7.540849650812409e-07, "loss": 0.0154, "step": 955 }, { "epoch": 2.635537368080982, "grad_norm": 0.09337103366851807, "learning_rate": 7.427603073110967e-07, "loss": 0.0157, "step": 956 }, { "epoch": 2.638294206332113, "grad_norm": 0.08885804563760757, "learning_rate": 7.315180489675822e-07, "loss": 0.0153, "step": 957 }, { "epoch": 2.641051044583244, "grad_norm": 0.09852619469165802, "learning_rate": 7.203582901189332e-07, "loss": 0.0152, "step": 958 }, { "epoch": 2.6438078828343743, "grad_norm": 0.0962531641125679, "learning_rate": 7.092811300990521e-07, "loss": 0.0153, "step": 959 }, { "epoch": 2.6465647210855052, "grad_norm": 0.09466900676488876, "learning_rate": 6.98286667506618e-07, "loss": 0.0152, "step": 960 }, { "epoch": 2.6493215593366357, "grad_norm": 0.09106676280498505, "learning_rate": 6.87375000204219e-07, "loss": 0.0147, "step": 961 }, { "epoch": 2.6520783975877666, "grad_norm": 0.09509000927209854, "learning_rate": 6.765462253174715e-07, "loss": 0.0155, "step": 962 }, { "epoch": 2.654835235838897, "grad_norm": 0.09613344073295593, "learning_rate": 6.658004392341633e-07, "loss": 0.0155, "step": 963 }, { "epoch": 2.657592074090028, "grad_norm": 0.09688057750463486, "learning_rate": 6.551377376033896e-07, "loss": 0.0157, "step": 964 }, { "epoch": 2.660348912341159, "grad_norm": 0.09640353918075562, "learning_rate": 6.445582153347074e-07, "loss": 0.0159, "step": 965 }, { "epoch": 2.6631057505922895, "grad_norm": 0.10333269834518433, "learning_rate": 6.340619665972847e-07, "loss": 0.0158, "step": 966 }, { "epoch": 2.66586258884342, "grad_norm": 0.09676087647676468, "learning_rate": 6.236490848190657e-07, "loss": 0.0156, "step": 967 }, { "epoch": 2.668619427094551, "grad_norm": 0.09745439141988754, "learning_rate": 6.133196626859406e-07, "loss": 0.0163, "step": 968 }, { "epoch": 2.671376265345682, "grad_norm": 0.0986739918589592, "learning_rate": 6.030737921409169e-07, "loss": 0.0163, "step": 969 }, { "epoch": 2.6741331035968123, "grad_norm": 0.09318210184574127, "learning_rate": 5.929115643833005e-07, "loss": 0.0158, "step": 970 }, { "epoch": 2.6768899418479433, "grad_norm": 0.09318020939826965, "learning_rate": 5.828330698678908e-07, "loss": 0.0153, "step": 971 }, { "epoch": 2.6796467800990738, "grad_norm": 0.0958879366517067, "learning_rate": 5.728383983041696e-07, "loss": 0.0159, "step": 972 }, { "epoch": 2.6824036183502047, "grad_norm": 0.10100314021110535, "learning_rate": 5.629276386555016e-07, "loss": 0.0152, "step": 973 }, { "epoch": 2.685160456601335, "grad_norm": 0.0920896902680397, "learning_rate": 5.531008791383485e-07, "loss": 0.0153, "step": 974 }, { "epoch": 2.687917294852466, "grad_norm": 0.09028688073158264, "learning_rate": 5.43358207221476e-07, "loss": 0.0156, "step": 975 }, { "epoch": 2.690674133103597, "grad_norm": 0.09332931786775589, "learning_rate": 5.336997096251816e-07, "loss": 0.0151, "step": 976 }, { "epoch": 2.6934309713547275, "grad_norm": 0.10222768783569336, "learning_rate": 5.241254723205225e-07, "loss": 0.0153, "step": 977 }, { "epoch": 2.696187809605858, "grad_norm": 0.10587936639785767, "learning_rate": 5.146355805285452e-07, "loss": 0.0151, "step": 978 }, { "epoch": 2.698944647856989, "grad_norm": 0.09399544447660446, "learning_rate": 5.052301187195296e-07, "loss": 0.0156, "step": 979 }, { "epoch": 2.70170148610812, "grad_norm": 0.09084748476743698, "learning_rate": 4.959091706122431e-07, "loss": 0.0152, "step": 980 }, { "epoch": 2.7044583243592504, "grad_norm": 0.09691156446933746, "learning_rate": 4.866728191731829e-07, "loss": 0.0156, "step": 981 }, { "epoch": 2.7072151626103813, "grad_norm": 0.09536400437355042, "learning_rate": 4.775211466158469e-07, "loss": 0.0157, "step": 982 }, { "epoch": 2.709972000861512, "grad_norm": 0.0932604968547821, "learning_rate": 4.6845423440000315e-07, "loss": 0.0151, "step": 983 }, { "epoch": 2.7127288391126427, "grad_norm": 0.0943024605512619, "learning_rate": 4.594721632309551e-07, "loss": 0.0157, "step": 984 }, { "epoch": 2.715485677363773, "grad_norm": 0.11024822294712067, "learning_rate": 4.505750130588371e-07, "loss": 0.0159, "step": 985 }, { "epoch": 2.718242515614904, "grad_norm": 0.09148909151554108, "learning_rate": 4.4176286307788475e-07, "loss": 0.0152, "step": 986 }, { "epoch": 2.720999353866035, "grad_norm": 0.10189063847064972, "learning_rate": 4.3303579172574884e-07, "loss": 0.0164, "step": 987 }, { "epoch": 2.7237561921171656, "grad_norm": 0.09687193483114243, "learning_rate": 4.243938766827849e-07, "loss": 0.0156, "step": 988 }, { "epoch": 2.7265130303682965, "grad_norm": 0.0970255509018898, "learning_rate": 4.1583719487136575e-07, "loss": 0.0154, "step": 989 }, { "epoch": 2.729269868619427, "grad_norm": 0.08850499987602234, "learning_rate": 4.0736582245519795e-07, "loss": 0.0151, "step": 990 }, { "epoch": 2.732026706870558, "grad_norm": 0.10004325211048126, "learning_rate": 3.9897983483863866e-07, "loss": 0.0157, "step": 991 }, { "epoch": 2.7347835451216884, "grad_norm": 0.09604191035032272, "learning_rate": 3.9067930666603304e-07, "loss": 0.0154, "step": 992 }, { "epoch": 2.7375403833728194, "grad_norm": 0.096500463783741, "learning_rate": 3.824643118210403e-07, "loss": 0.0159, "step": 993 }, { "epoch": 2.7402972216239503, "grad_norm": 0.08940698951482773, "learning_rate": 3.743349234259841e-07, "loss": 0.0155, "step": 994 }, { "epoch": 2.7430540598750808, "grad_norm": 0.09379703551530838, "learning_rate": 3.662912138411967e-07, "loss": 0.0153, "step": 995 }, { "epoch": 2.7458108981262113, "grad_norm": 0.09963972121477127, "learning_rate": 3.5833325466437697e-07, "loss": 0.0156, "step": 996 }, { "epoch": 2.748567736377342, "grad_norm": 0.09487731754779816, "learning_rate": 3.5046111672995097e-07, "loss": 0.0158, "step": 997 }, { "epoch": 2.751324574628473, "grad_norm": 0.09154865145683289, "learning_rate": 3.426748701084448e-07, "loss": 0.0152, "step": 998 }, { "epoch": 2.7540814128796036, "grad_norm": 0.09428098797798157, "learning_rate": 3.349745841058605e-07, "loss": 0.0155, "step": 999 }, { "epoch": 2.7568382511307346, "grad_norm": 0.09710869193077087, "learning_rate": 3.2736032726305546e-07, "loss": 0.0158, "step": 1000 }, { "epoch": 2.759595089381865, "grad_norm": 0.10234350711107254, "learning_rate": 3.198321673551341e-07, "loss": 0.0159, "step": 1001 }, { "epoch": 2.762351927632996, "grad_norm": 0.09612155705690384, "learning_rate": 3.1239017139084725e-07, "loss": 0.0162, "step": 1002 }, { "epoch": 2.7651087658841265, "grad_norm": 0.10025949776172638, "learning_rate": 3.050344056119925e-07, "loss": 0.0158, "step": 1003 }, { "epoch": 2.7678656041352574, "grad_norm": 0.09752298891544342, "learning_rate": 2.977649354928258e-07, "loss": 0.0153, "step": 1004 }, { "epoch": 2.7706224423863883, "grad_norm": 0.09761834889650345, "learning_rate": 2.905818257394799e-07, "loss": 0.0155, "step": 1005 }, { "epoch": 2.773379280637519, "grad_norm": 0.09161604940891266, "learning_rate": 2.834851402893857e-07, "loss": 0.0148, "step": 1006 }, { "epoch": 2.7761361188886493, "grad_norm": 0.09183601289987564, "learning_rate": 2.764749423107027e-07, "loss": 0.0151, "step": 1007 }, { "epoch": 2.7788929571397802, "grad_norm": 0.09860436618328094, "learning_rate": 2.6955129420176193e-07, "loss": 0.0165, "step": 1008 }, { "epoch": 2.781649795390911, "grad_norm": 0.09372715651988983, "learning_rate": 2.627142575905062e-07, "loss": 0.0154, "step": 1009 }, { "epoch": 2.7844066336420417, "grad_norm": 0.10581711679697037, "learning_rate": 2.559638933339414e-07, "loss": 0.0157, "step": 1010 }, { "epoch": 2.7871634718931726, "grad_norm": 0.09646962583065033, "learning_rate": 2.493002615175977e-07, "loss": 0.0156, "step": 1011 }, { "epoch": 2.7899203101443035, "grad_norm": 0.09317374974489212, "learning_rate": 2.4272342145499006e-07, "loss": 0.0154, "step": 1012 }, { "epoch": 2.792677148395434, "grad_norm": 0.09786811470985413, "learning_rate": 2.3623343168709624e-07, "loss": 0.0157, "step": 1013 }, { "epoch": 2.7954339866465645, "grad_norm": 0.09875071793794632, "learning_rate": 2.2983034998182997e-07, "loss": 0.0157, "step": 1014 }, { "epoch": 2.7981908248976954, "grad_norm": 0.09668000042438507, "learning_rate": 2.235142333335316e-07, "loss": 0.0151, "step": 1015 }, { "epoch": 2.8009476631488264, "grad_norm": 0.09402566403150558, "learning_rate": 2.1728513796245855e-07, "loss": 0.0154, "step": 1016 }, { "epoch": 2.803704501399957, "grad_norm": 0.09588855504989624, "learning_rate": 2.11143119314281e-07, "loss": 0.0157, "step": 1017 }, { "epoch": 2.806461339651088, "grad_norm": 0.09455292671918869, "learning_rate": 2.0508823205959815e-07, "loss": 0.0154, "step": 1018 }, { "epoch": 2.8092181779022183, "grad_norm": 0.09076821804046631, "learning_rate": 1.991205300934429e-07, "loss": 0.0154, "step": 1019 }, { "epoch": 2.811975016153349, "grad_norm": 0.09466226398944855, "learning_rate": 1.9324006653480332e-07, "loss": 0.0152, "step": 1020 }, { "epoch": 2.8147318544044797, "grad_norm": 0.10528804361820221, "learning_rate": 1.874468937261531e-07, "loss": 0.0157, "step": 1021 }, { "epoch": 2.8174886926556106, "grad_norm": 0.09444960951805115, "learning_rate": 1.8174106323298634e-07, "loss": 0.0154, "step": 1022 }, { "epoch": 2.8202455309067416, "grad_norm": 0.09570340067148209, "learning_rate": 1.761226258433524e-07, "loss": 0.0157, "step": 1023 }, { "epoch": 2.823002369157872, "grad_norm": 0.09762073308229446, "learning_rate": 1.7059163156740943e-07, "loss": 0.0155, "step": 1024 }, { "epoch": 2.8257592074090025, "grad_norm": 0.09936973452568054, "learning_rate": 1.6514812963697723e-07, "loss": 0.0156, "step": 1025 }, { "epoch": 2.8285160456601335, "grad_norm": 0.09471126645803452, "learning_rate": 1.5979216850509848e-07, "loss": 0.0151, "step": 1026 }, { "epoch": 2.8312728839112644, "grad_norm": 0.09497305750846863, "learning_rate": 1.545237958456125e-07, "loss": 0.0155, "step": 1027 }, { "epoch": 2.834029722162395, "grad_norm": 0.09496881812810898, "learning_rate": 1.4934305855271892e-07, "loss": 0.0153, "step": 1028 }, { "epoch": 2.836786560413526, "grad_norm": 0.09368608891963959, "learning_rate": 1.4425000274057577e-07, "loss": 0.0157, "step": 1029 }, { "epoch": 2.8395433986646563, "grad_norm": 0.1018013209104538, "learning_rate": 1.3924467374287432e-07, "loss": 0.0157, "step": 1030 }, { "epoch": 2.8423002369157873, "grad_norm": 0.09439851343631744, "learning_rate": 1.343271161124493e-07, "loss": 0.0159, "step": 1031 }, { "epoch": 2.8450570751669177, "grad_norm": 0.10299389809370041, "learning_rate": 1.2949737362087156e-07, "loss": 0.0157, "step": 1032 }, { "epoch": 2.8478139134180487, "grad_norm": 0.10252979397773743, "learning_rate": 1.247554892580616e-07, "loss": 0.0159, "step": 1033 }, { "epoch": 2.8505707516691796, "grad_norm": 0.08925472944974899, "learning_rate": 1.201015052319099e-07, "loss": 0.0155, "step": 1034 }, { "epoch": 2.85332758992031, "grad_norm": 0.09510352462530136, "learning_rate": 1.1553546296789952e-07, "loss": 0.0159, "step": 1035 }, { "epoch": 2.856084428171441, "grad_norm": 0.09258411824703217, "learning_rate": 1.1105740310873414e-07, "loss": 0.0151, "step": 1036 }, { "epoch": 2.8588412664225715, "grad_norm": 0.09796860814094543, "learning_rate": 1.066673655139816e-07, "loss": 0.0153, "step": 1037 }, { "epoch": 2.8615981046737025, "grad_norm": 0.0932827889919281, "learning_rate": 1.0236538925971429e-07, "loss": 0.0155, "step": 1038 }, { "epoch": 2.864354942924833, "grad_norm": 0.09266915917396545, "learning_rate": 9.815151263816714e-08, "loss": 0.0153, "step": 1039 }, { "epoch": 2.867111781175964, "grad_norm": 0.09166253358125687, "learning_rate": 9.402577315738904e-08, "loss": 0.0154, "step": 1040 }, { "epoch": 2.869868619427095, "grad_norm": 0.10095134377479553, "learning_rate": 8.99882075409153e-08, "loss": 0.0161, "step": 1041 }, { "epoch": 2.8726254576782253, "grad_norm": 0.09815669804811478, "learning_rate": 8.603885172744131e-08, "loss": 0.0163, "step": 1042 }, { "epoch": 2.875382295929356, "grad_norm": 0.09382443875074387, "learning_rate": 8.217774087049268e-08, "loss": 0.016, "step": 1043 }, { "epoch": 2.8781391341804867, "grad_norm": 0.09190834313631058, "learning_rate": 7.840490933812783e-08, "loss": 0.0158, "step": 1044 }, { "epoch": 2.8808959724316177, "grad_norm": 0.09133637696504593, "learning_rate": 7.472039071261927e-08, "loss": 0.0149, "step": 1045 }, { "epoch": 2.883652810682748, "grad_norm": 0.10109441727399826, "learning_rate": 7.112421779015944e-08, "loss": 0.0152, "step": 1046 }, { "epoch": 2.886409648933879, "grad_norm": 0.09567693620920181, "learning_rate": 6.761642258056977e-08, "loss": 0.0151, "step": 1047 }, { "epoch": 2.8891664871850096, "grad_norm": 0.17240361869335175, "learning_rate": 6.419703630701546e-08, "loss": 0.0174, "step": 1048 }, { "epoch": 2.8919233254361405, "grad_norm": 0.09366879612207413, "learning_rate": 6.086608940572447e-08, "loss": 0.0157, "step": 1049 }, { "epoch": 2.894680163687271, "grad_norm": 0.09174374490976334, "learning_rate": 5.7623611525721155e-08, "loss": 0.0152, "step": 1050 }, { "epoch": 2.897437001938402, "grad_norm": 0.096317358314991, "learning_rate": 5.446963152855644e-08, "loss": 0.0156, "step": 1051 }, { "epoch": 2.900193840189533, "grad_norm": 0.09576837718486786, "learning_rate": 5.140417748806026e-08, "loss": 0.0152, "step": 1052 }, { "epoch": 2.9029506784406633, "grad_norm": 0.09430639445781708, "learning_rate": 4.8427276690081735e-08, "loss": 0.0152, "step": 1053 }, { "epoch": 2.905707516691794, "grad_norm": 0.10490168631076813, "learning_rate": 4.553895563225053e-08, "loss": 0.015, "step": 1054 }, { "epoch": 2.9084643549429248, "grad_norm": 0.0979146957397461, "learning_rate": 4.2739240023742526e-08, "loss": 0.0154, "step": 1055 }, { "epoch": 2.9112211931940557, "grad_norm": 0.10030056536197662, "learning_rate": 4.002815478505007e-08, "loss": 0.0155, "step": 1056 }, { "epoch": 2.913978031445186, "grad_norm": 0.10702443867921829, "learning_rate": 3.7405724047756554e-08, "loss": 0.0163, "step": 1057 }, { "epoch": 2.916734869696317, "grad_norm": 0.09876976162195206, "learning_rate": 3.487197115432883e-08, "loss": 0.0147, "step": 1058 }, { "epoch": 2.9194917079474476, "grad_norm": 0.09448473155498505, "learning_rate": 3.242691865790071e-08, "loss": 0.0148, "step": 1059 }, { "epoch": 2.9222485461985785, "grad_norm": 0.10175088047981262, "learning_rate": 3.0070588322079765e-08, "loss": 0.0165, "step": 1060 }, { "epoch": 2.925005384449709, "grad_norm": 0.09407947957515717, "learning_rate": 2.780300112074974e-08, "loss": 0.0154, "step": 1061 }, { "epoch": 2.92776222270084, "grad_norm": 0.09297125786542892, "learning_rate": 2.5624177237884017e-08, "loss": 0.0156, "step": 1062 }, { "epoch": 2.930519060951971, "grad_norm": 0.10525539517402649, "learning_rate": 2.3534136067369094e-08, "loss": 0.0165, "step": 1063 }, { "epoch": 2.9332758992031014, "grad_norm": 0.09468672424554825, "learning_rate": 2.1532896212825837e-08, "loss": 0.0158, "step": 1064 }, { "epoch": 2.9360327374542323, "grad_norm": 0.0892932116985321, "learning_rate": 1.962047548744961e-08, "loss": 0.0148, "step": 1065 }, { "epoch": 2.938789575705363, "grad_norm": 0.09384763240814209, "learning_rate": 1.7796890913850395e-08, "loss": 0.0154, "step": 1066 }, { "epoch": 2.9415464139564937, "grad_norm": 0.10021128505468369, "learning_rate": 1.606215872389738e-08, "loss": 0.0154, "step": 1067 }, { "epoch": 2.9443032522076242, "grad_norm": 0.09487558901309967, "learning_rate": 1.4416294358582383e-08, "loss": 0.0151, "step": 1068 }, { "epoch": 2.947060090458755, "grad_norm": 0.09665752202272415, "learning_rate": 1.2859312467872197e-08, "loss": 0.0148, "step": 1069 }, { "epoch": 2.949816928709886, "grad_norm": 0.11037097126245499, "learning_rate": 1.1391226910588693e-08, "loss": 0.0153, "step": 1070 }, { "epoch": 2.9525737669610166, "grad_norm": 0.101137176156044, "learning_rate": 1.0012050754277802e-08, "loss": 0.0153, "step": 1071 }, { "epoch": 2.955330605212147, "grad_norm": 0.09380457550287247, "learning_rate": 8.721796275095173e-09, "loss": 0.0152, "step": 1072 }, { "epoch": 2.958087443463278, "grad_norm": 0.09098217636346817, "learning_rate": 7.520474957699586e-09, "loss": 0.0149, "step": 1073 }, { "epoch": 2.960844281714409, "grad_norm": 0.0910525918006897, "learning_rate": 6.40809749514637e-09, "loss": 0.015, "step": 1074 }, { "epoch": 2.9636011199655394, "grad_norm": 0.09770431369543076, "learning_rate": 5.384673788797479e-09, "loss": 0.0151, "step": 1075 }, { "epoch": 2.9663579582166704, "grad_norm": 0.09908950328826904, "learning_rate": 4.450212948227117e-09, "loss": 0.0161, "step": 1076 }, { "epoch": 2.969114796467801, "grad_norm": 0.09452182054519653, "learning_rate": 3.6047232911462506e-09, "loss": 0.0153, "step": 1077 }, { "epoch": 2.971871634718932, "grad_norm": 0.09382626414299011, "learning_rate": 2.8482123433248853e-09, "loss": 0.0153, "step": 1078 }, { "epoch": 2.9746284729700623, "grad_norm": 0.0985400527715683, "learning_rate": 2.180686838527679e-09, "loss": 0.0158, "step": 1079 }, { "epoch": 2.977385311221193, "grad_norm": 0.09935043007135391, "learning_rate": 1.6021527184528761e-09, "loss": 0.0163, "step": 1080 }, { "epoch": 2.980142149472324, "grad_norm": 0.09754447638988495, "learning_rate": 1.1126151326779077e-09, "loss": 0.0156, "step": 1081 }, { "epoch": 2.9828989877234546, "grad_norm": 0.09899445623159409, "learning_rate": 7.120784386160928e-10, "loss": 0.0162, "step": 1082 }, { "epoch": 2.985655825974585, "grad_norm": 0.09259970486164093, "learning_rate": 4.005462014766703e-10, "loss": 0.0153, "step": 1083 }, { "epoch": 2.988412664225716, "grad_norm": 0.09933052957057953, "learning_rate": 1.7802119423149244e-10, "loss": 0.016, "step": 1084 }, { "epoch": 2.991169502476847, "grad_norm": 0.09650570899248123, "learning_rate": 4.450539759393024e-11, "loss": 0.0155, "step": 1085 }, { "epoch": 2.9939263407279775, "grad_norm": 0.0921320915222168, "learning_rate": 0.0, "loss": 0.0156, "step": 1086 }, { "epoch": 2.9939263407279775, "step": 1086, "total_flos": 5.826670523626553e+18, "train_loss": 0.02890731801551597, "train_runtime": 65667.5663, "train_samples_per_second": 8.484, "train_steps_per_second": 0.017 } ], "logging_steps": 1.0, "max_steps": 1086, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.826670523626553e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }