{ "best_metric": 1.084336519241333, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.057406780861259075, "eval_steps": 25, "global_step": 228, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025178412658446965, "grad_norm": 0.2894246280193329, "learning_rate": 1.1111111111111112e-05, "loss": 1.3358, "step": 1 }, { "epoch": 0.00025178412658446965, "eval_loss": 1.4041720628738403, "eval_runtime": 1.5236, "eval_samples_per_second": 32.818, "eval_steps_per_second": 8.533, "step": 1 }, { "epoch": 0.0005035682531689393, "grad_norm": 0.3336651027202606, "learning_rate": 2.2222222222222223e-05, "loss": 1.4653, "step": 2 }, { "epoch": 0.0007553523797534089, "grad_norm": 0.36238351464271545, "learning_rate": 3.3333333333333335e-05, "loss": 1.4747, "step": 3 }, { "epoch": 0.0010071365063378786, "grad_norm": 0.38116249442100525, "learning_rate": 4.4444444444444447e-05, "loss": 1.4752, "step": 4 }, { "epoch": 0.0012589206329223482, "grad_norm": 0.4064643979072571, "learning_rate": 5.555555555555556e-05, "loss": 1.4008, "step": 5 }, { "epoch": 0.0015107047595068178, "grad_norm": 0.43084007501602173, "learning_rate": 6.666666666666667e-05, "loss": 1.383, "step": 6 }, { "epoch": 0.0017624888860912876, "grad_norm": 0.46177080273628235, "learning_rate": 7.777777777777778e-05, "loss": 1.36, "step": 7 }, { "epoch": 0.002014273012675757, "grad_norm": 0.37692713737487793, "learning_rate": 8.888888888888889e-05, "loss": 1.3537, "step": 8 }, { "epoch": 0.0022660571392602268, "grad_norm": 0.25504958629608154, "learning_rate": 0.0001, "loss": 1.3239, "step": 9 }, { "epoch": 0.0025178412658446963, "grad_norm": 0.23159584403038025, "learning_rate": 9.999536994034917e-05, "loss": 1.3255, "step": 10 }, { "epoch": 0.002769625392429166, "grad_norm": 0.28963929414749146, "learning_rate": 9.99814807141723e-05, "loss": 1.3385, "step": 11 }, { "epoch": 0.0030214095190136355, "grad_norm": 0.31955209374427795, "learning_rate": 9.995833517960035e-05, "loss": 1.3097, "step": 12 }, { "epoch": 0.003273193645598105, "grad_norm": 0.23069429397583008, "learning_rate": 9.992593809953133e-05, "loss": 1.2359, "step": 13 }, { "epoch": 0.003524977772182575, "grad_norm": 0.27882739901542664, "learning_rate": 9.988429614065029e-05, "loss": 1.3039, "step": 14 }, { "epoch": 0.0037767618987670447, "grad_norm": 0.30989697575569153, "learning_rate": 9.983341787205736e-05, "loss": 1.3263, "step": 15 }, { "epoch": 0.004028546025351514, "grad_norm": 0.25735488533973694, "learning_rate": 9.97733137635045e-05, "loss": 1.3418, "step": 16 }, { "epoch": 0.0042803301519359835, "grad_norm": 0.18925218284130096, "learning_rate": 9.970399618324092e-05, "loss": 1.2892, "step": 17 }, { "epoch": 0.0045321142785204535, "grad_norm": 0.15477243065834045, "learning_rate": 9.962547939546805e-05, "loss": 1.2695, "step": 18 }, { "epoch": 0.0047838984051049235, "grad_norm": 0.1201956495642662, "learning_rate": 9.953777955740415e-05, "loss": 1.2515, "step": 19 }, { "epoch": 0.005035682531689393, "grad_norm": 0.08931563794612885, "learning_rate": 9.944091471595951e-05, "loss": 1.2399, "step": 20 }, { "epoch": 0.005287466658273863, "grad_norm": 0.10346703231334686, "learning_rate": 9.933490480402273e-05, "loss": 1.2807, "step": 21 }, { "epoch": 0.005539250784858332, "grad_norm": 0.14112868905067444, "learning_rate": 9.921977163635899e-05, "loss": 1.2333, "step": 22 }, { "epoch": 0.005791034911442802, "grad_norm": 0.1877644807100296, "learning_rate": 9.90955389051209e-05, 
"loss": 1.2533, "step": 23 }, { "epoch": 0.006042819038027271, "grad_norm": 0.215586319565773, "learning_rate": 9.89622321749732e-05, "loss": 1.2433, "step": 24 }, { "epoch": 0.006294603164611741, "grad_norm": 0.2447165846824646, "learning_rate": 9.881987887783194e-05, "loss": 1.1634, "step": 25 }, { "epoch": 0.006294603164611741, "eval_loss": 1.2141509056091309, "eval_runtime": 1.5236, "eval_samples_per_second": 32.817, "eval_steps_per_second": 8.532, "step": 25 }, { "epoch": 0.00654638729119621, "grad_norm": 0.1898128092288971, "learning_rate": 9.866850830721973e-05, "loss": 1.2322, "step": 26 }, { "epoch": 0.00679817141778068, "grad_norm": 0.16772133111953735, "learning_rate": 9.85081516122375e-05, "loss": 1.2542, "step": 27 }, { "epoch": 0.00704995554436515, "grad_norm": 0.11975707113742828, "learning_rate": 9.83388417911547e-05, "loss": 1.2536, "step": 28 }, { "epoch": 0.0073017396709496194, "grad_norm": 0.09228307008743286, "learning_rate": 9.816061368461896e-05, "loss": 1.2566, "step": 29 }, { "epoch": 0.0075535237975340895, "grad_norm": 0.0792098417878151, "learning_rate": 9.79735039684865e-05, "loss": 1.226, "step": 30 }, { "epoch": 0.007805307924118559, "grad_norm": 0.0848434716463089, "learning_rate": 9.777755114627491e-05, "loss": 1.1899, "step": 31 }, { "epoch": 0.008057092050703029, "grad_norm": 0.09312910586595535, "learning_rate": 9.757279554124004e-05, "loss": 1.1568, "step": 32 }, { "epoch": 0.008308876177287499, "grad_norm": 0.11826145648956299, "learning_rate": 9.735927928807813e-05, "loss": 1.2118, "step": 33 }, { "epoch": 0.008560660303871967, "grad_norm": 0.1303245574235916, "learning_rate": 9.713704632425529e-05, "loss": 1.1613, "step": 34 }, { "epoch": 0.008812444430456437, "grad_norm": 0.11749281734228134, "learning_rate": 9.690614238096617e-05, "loss": 1.2169, "step": 35 }, { "epoch": 0.009064228557040907, "grad_norm": 0.11288820207118988, "learning_rate": 9.666661497372324e-05, "loss": 1.1986, "step": 36 }, { "epoch": 0.009316012683625377, "grad_norm": 0.10559144616127014, "learning_rate": 9.641851339257912e-05, "loss": 1.2018, "step": 37 }, { "epoch": 0.009567796810209847, "grad_norm": 0.10261145234107971, "learning_rate": 9.616188869198361e-05, "loss": 1.0372, "step": 38 }, { "epoch": 0.009819580936794315, "grad_norm": 0.09313081204891205, "learning_rate": 9.589679368027765e-05, "loss": 1.2235, "step": 39 }, { "epoch": 0.010071365063378785, "grad_norm": 0.08457491546869278, "learning_rate": 9.562328290882643e-05, "loss": 1.2094, "step": 40 }, { "epoch": 0.010323149189963255, "grad_norm": 0.08044976741075516, "learning_rate": 9.534141266079385e-05, "loss": 1.2016, "step": 41 }, { "epoch": 0.010574933316547725, "grad_norm": 0.08418776094913483, "learning_rate": 9.505124093956045e-05, "loss": 1.2094, "step": 42 }, { "epoch": 0.010826717443132194, "grad_norm": 0.09928078949451447, "learning_rate": 9.475282745678749e-05, "loss": 1.1695, "step": 43 }, { "epoch": 0.011078501569716664, "grad_norm": 0.09286114573478699, "learning_rate": 9.444623362012944e-05, "loss": 1.0966, "step": 44 }, { "epoch": 0.011330285696301134, "grad_norm": 0.09147851914167404, "learning_rate": 9.413152252059749e-05, "loss": 1.1769, "step": 45 }, { "epoch": 0.011582069822885604, "grad_norm": 0.09161302447319031, "learning_rate": 9.380875891957674e-05, "loss": 1.1761, "step": 46 }, { "epoch": 0.011833853949470074, "grad_norm": 0.09320323169231415, "learning_rate": 9.347800923549942e-05, "loss": 1.1795, "step": 47 }, { "epoch": 0.012085638076054542, "grad_norm": 0.08234869688749313, 
"learning_rate": 9.313934153017741e-05, "loss": 1.1745, "step": 48 }, { "epoch": 0.012337422202639012, "grad_norm": 0.08729473501443863, "learning_rate": 9.279282549479634e-05, "loss": 1.1614, "step": 49 }, { "epoch": 0.012589206329223482, "grad_norm": 0.11173596233129501, "learning_rate": 9.243853243557462e-05, "loss": 1.0938, "step": 50 }, { "epoch": 0.012589206329223482, "eval_loss": 1.1531009674072266, "eval_runtime": 1.5229, "eval_samples_per_second": 32.832, "eval_steps_per_second": 8.536, "step": 50 }, { "epoch": 0.012840990455807952, "grad_norm": 0.10851637274026871, "learning_rate": 9.207653525908994e-05, "loss": 1.1919, "step": 51 }, { "epoch": 0.01309277458239242, "grad_norm": 0.12280933558940887, "learning_rate": 9.170690845727655e-05, "loss": 1.2008, "step": 52 }, { "epoch": 0.01334455870897689, "grad_norm": 0.11200592666864395, "learning_rate": 9.132972809209626e-05, "loss": 1.1743, "step": 53 }, { "epoch": 0.01359634283556136, "grad_norm": 0.10003667324781418, "learning_rate": 9.094507177988643e-05, "loss": 1.1631, "step": 54 }, { "epoch": 0.01384812696214583, "grad_norm": 0.09316058456897736, "learning_rate": 9.055301867538794e-05, "loss": 1.2185, "step": 55 }, { "epoch": 0.0140999110887303, "grad_norm": 0.07689037919044495, "learning_rate": 9.01536494554568e-05, "loss": 1.1249, "step": 56 }, { "epoch": 0.014351695215314769, "grad_norm": 0.0907893106341362, "learning_rate": 8.974704630246239e-05, "loss": 1.1772, "step": 57 }, { "epoch": 0.014603479341899239, "grad_norm": 0.09316601604223251, "learning_rate": 8.933329288737597e-05, "loss": 1.1482, "step": 58 }, { "epoch": 0.014855263468483709, "grad_norm": 0.09033600986003876, "learning_rate": 8.89124743525527e-05, "loss": 1.1258, "step": 59 }, { "epoch": 0.015107047595068179, "grad_norm": 0.10970163345336914, "learning_rate": 8.848467729421124e-05, "loss": 1.1349, "step": 60 }, { "epoch": 0.015358831721652647, "grad_norm": 0.10259843617677689, "learning_rate": 8.804998974461371e-05, "loss": 1.1576, "step": 61 }, { "epoch": 0.015610615848237117, "grad_norm": 0.11174706369638443, "learning_rate": 8.760850115395054e-05, "loss": 1.1705, "step": 62 }, { "epoch": 0.015862399974821587, "grad_norm": 0.08973610401153564, "learning_rate": 8.716030237193325e-05, "loss": 1.1515, "step": 63 }, { "epoch": 0.016114184101406057, "grad_norm": 0.08280681818723679, "learning_rate": 8.670548562909947e-05, "loss": 1.1607, "step": 64 }, { "epoch": 0.016365968227990527, "grad_norm": 0.09221348911523819, "learning_rate": 8.624414451783364e-05, "loss": 1.1482, "step": 65 }, { "epoch": 0.016617752354574997, "grad_norm": 0.09574998915195465, "learning_rate": 8.577637397310749e-05, "loss": 1.241, "step": 66 }, { "epoch": 0.016869536481159467, "grad_norm": 0.09795909374952316, "learning_rate": 8.530227025294435e-05, "loss": 1.1739, "step": 67 }, { "epoch": 0.017121320607743934, "grad_norm": 0.09504640102386475, "learning_rate": 8.482193091861112e-05, "loss": 1.1334, "step": 68 }, { "epoch": 0.017373104734328404, "grad_norm": 0.08822384476661682, "learning_rate": 8.433545481454206e-05, "loss": 1.137, "step": 69 }, { "epoch": 0.017624888860912874, "grad_norm": 0.0878458246588707, "learning_rate": 8.384294204799853e-05, "loss": 1.1719, "step": 70 }, { "epoch": 0.017876672987497344, "grad_norm": 0.08970195800065994, "learning_rate": 8.334449396846886e-05, "loss": 1.1298, "step": 71 }, { "epoch": 0.018128457114081814, "grad_norm": 0.0950387567281723, "learning_rate": 8.284021314681265e-05, "loss": 1.1556, "step": 72 }, { "epoch": 0.018380241240666284, 
"grad_norm": 0.09765107929706573, "learning_rate": 8.233020335415371e-05, "loss": 1.1732, "step": 73 }, { "epoch": 0.018632025367250754, "grad_norm": 0.11165706068277359, "learning_rate": 8.18145695405259e-05, "loss": 1.1746, "step": 74 }, { "epoch": 0.018883809493835224, "grad_norm": 0.15581081807613373, "learning_rate": 8.129341781327658e-05, "loss": 1.0963, "step": 75 }, { "epoch": 0.018883809493835224, "eval_loss": 1.122991681098938, "eval_runtime": 1.5233, "eval_samples_per_second": 32.822, "eval_steps_per_second": 8.534, "step": 75 }, { "epoch": 0.019135593620419694, "grad_norm": 0.08741319179534912, "learning_rate": 8.07668554152317e-05, "loss": 1.1688, "step": 76 }, { "epoch": 0.01938737774700416, "grad_norm": 0.08875293284654617, "learning_rate": 8.02349907026274e-05, "loss": 1.1829, "step": 77 }, { "epoch": 0.01963916187358863, "grad_norm": 0.08861377090215683, "learning_rate": 7.969793312281237e-05, "loss": 1.1803, "step": 78 }, { "epoch": 0.0198909460001731, "grad_norm": 0.09791383892297745, "learning_rate": 7.915579319172573e-05, "loss": 1.2001, "step": 79 }, { "epoch": 0.02014273012675757, "grad_norm": 0.0991906151175499, "learning_rate": 7.860868247115505e-05, "loss": 1.1669, "step": 80 }, { "epoch": 0.02039451425334204, "grad_norm": 0.09529578685760498, "learning_rate": 7.805671354577908e-05, "loss": 1.1522, "step": 81 }, { "epoch": 0.02064629837992651, "grad_norm": 0.09793104231357574, "learning_rate": 7.75e-05, "loss": 1.1407, "step": 82 }, { "epoch": 0.02089808250651098, "grad_norm": 0.09607716649770737, "learning_rate": 7.693865639457011e-05, "loss": 1.1291, "step": 83 }, { "epoch": 0.02114986663309545, "grad_norm": 0.10056506842374802, "learning_rate": 7.637279824301728e-05, "loss": 1.1124, "step": 84 }, { "epoch": 0.02140165075967992, "grad_norm": 0.10144215822219849, "learning_rate": 7.580254198787463e-05, "loss": 1.1022, "step": 85 }, { "epoch": 0.021653434886264387, "grad_norm": 0.10326708108186722, "learning_rate": 7.522800497671897e-05, "loss": 1.073, "step": 86 }, { "epoch": 0.021905219012848857, "grad_norm": 0.1260983794927597, "learning_rate": 7.464930543802289e-05, "loss": 1.1218, "step": 87 }, { "epoch": 0.022157003139433327, "grad_norm": 0.10883725434541702, "learning_rate": 7.406656245682565e-05, "loss": 1.1167, "step": 88 }, { "epoch": 0.022408787266017798, "grad_norm": 0.09835278987884521, "learning_rate": 7.34798959502279e-05, "loss": 1.1506, "step": 89 }, { "epoch": 0.022660571392602268, "grad_norm": 0.10083261877298355, "learning_rate": 7.288942664271503e-05, "loss": 1.1792, "step": 90 }, { "epoch": 0.022912355519186738, "grad_norm": 0.10250851511955261, "learning_rate": 7.229527604131436e-05, "loss": 1.1897, "step": 91 }, { "epoch": 0.023164139645771208, "grad_norm": 0.1085507944226265, "learning_rate": 7.16975664105915e-05, "loss": 1.1402, "step": 92 }, { "epoch": 0.023415923772355678, "grad_norm": 0.10734712332487106, "learning_rate": 7.109642074749067e-05, "loss": 1.0878, "step": 93 }, { "epoch": 0.023667707898940148, "grad_norm": 0.10455948859453201, "learning_rate": 7.049196275602421e-05, "loss": 1.116, "step": 94 }, { "epoch": 0.023919492025524614, "grad_norm": 0.10420308262109756, "learning_rate": 6.988431682181693e-05, "loss": 1.1243, "step": 95 }, { "epoch": 0.024171276152109084, "grad_norm": 0.10482881218194962, "learning_rate": 6.927360798650978e-05, "loss": 1.1198, "step": 96 }, { "epoch": 0.024423060278693554, "grad_norm": 0.11121159791946411, "learning_rate": 6.865996192202884e-05, "loss": 1.1097, "step": 97 }, { "epoch": 
0.024674844405278024, "grad_norm": 0.11902576684951782, "learning_rate": 6.804350490472446e-05, "loss": 1.1056, "step": 98 }, { "epoch": 0.024926628531862494, "grad_norm": 0.12510505318641663, "learning_rate": 6.742436378938612e-05, "loss": 1.109, "step": 99 }, { "epoch": 0.025178412658446964, "grad_norm": 0.1589539796113968, "learning_rate": 6.680266598313802e-05, "loss": 1.065, "step": 100 }, { "epoch": 0.025178412658446964, "eval_loss": 1.106641411781311, "eval_runtime": 1.5233, "eval_samples_per_second": 32.824, "eval_steps_per_second": 8.534, "step": 100 }, { "epoch": 0.025430196785031434, "grad_norm": 0.09652504324913025, "learning_rate": 6.617853941922146e-05, "loss": 1.0807, "step": 101 }, { "epoch": 0.025681980911615904, "grad_norm": 0.10196559131145477, "learning_rate": 6.555211253066844e-05, "loss": 1.1715, "step": 102 }, { "epoch": 0.025933765038200374, "grad_norm": 0.10689554363489151, "learning_rate": 6.49235142238728e-05, "loss": 1.1938, "step": 103 }, { "epoch": 0.02618554916478484, "grad_norm": 0.11235436052083969, "learning_rate": 6.429287385206368e-05, "loss": 1.2069, "step": 104 }, { "epoch": 0.02643733329136931, "grad_norm": 0.11319782584905624, "learning_rate": 6.366032118868734e-05, "loss": 1.1022, "step": 105 }, { "epoch": 0.02668911741795378, "grad_norm": 0.12285393476486206, "learning_rate": 6.302598640070218e-05, "loss": 1.131, "step": 106 }, { "epoch": 0.02694090154453825, "grad_norm": 0.11379806697368622, "learning_rate": 6.23900000217929e-05, "loss": 1.0844, "step": 107 }, { "epoch": 0.02719268567112272, "grad_norm": 0.1146794855594635, "learning_rate": 6.175249292550937e-05, "loss": 1.1122, "step": 108 }, { "epoch": 0.02744446979770719, "grad_norm": 0.1138162612915039, "learning_rate": 6.111359629833533e-05, "loss": 1.1059, "step": 109 }, { "epoch": 0.02769625392429166, "grad_norm": 0.13073591887950897, "learning_rate": 6.0473441612692705e-05, "loss": 1.1073, "step": 110 }, { "epoch": 0.02794803805087613, "grad_norm": 0.13190491497516632, "learning_rate": 5.9832160599887344e-05, "loss": 1.1323, "step": 111 }, { "epoch": 0.0281998221774606, "grad_norm": 0.1431579738855362, "learning_rate": 5.9189885223001094e-05, "loss": 1.1094, "step": 112 }, { "epoch": 0.028451606304045068, "grad_norm": 0.11174172163009644, "learning_rate": 5.85467476497365e-05, "loss": 1.0261, "step": 113 }, { "epoch": 0.028703390430629538, "grad_norm": 0.10128598660230637, "learning_rate": 5.790288022521925e-05, "loss": 1.1019, "step": 114 }, { "epoch": 0.028955174557214008, "grad_norm": 0.10618976503610611, "learning_rate": 5.725841544476413e-05, "loss": 1.1591, "step": 115 }, { "epoch": 0.029206958683798478, "grad_norm": 0.11264601349830627, "learning_rate": 5.661348592661009e-05, "loss": 1.1057, "step": 116 }, { "epoch": 0.029458742810382948, "grad_norm": 0.11875821650028229, "learning_rate": 5.596822438463001e-05, "loss": 1.1278, "step": 117 }, { "epoch": 0.029710526936967418, "grad_norm": 0.13417275249958038, "learning_rate": 5.532276360102076e-05, "loss": 1.0758, "step": 118 }, { "epoch": 0.029962311063551888, "grad_norm": 0.12961986660957336, "learning_rate": 5.467723639897926e-05, "loss": 1.0888, "step": 119 }, { "epoch": 0.030214095190136358, "grad_norm": 0.12340384721755981, "learning_rate": 5.4031775615370004e-05, "loss": 1.098, "step": 120 }, { "epoch": 0.030465879316720828, "grad_norm": 0.11747000366449356, "learning_rate": 5.3386514073389936e-05, "loss": 1.1203, "step": 121 }, { "epoch": 0.030717663443305294, "grad_norm": 0.11870179325342178, "learning_rate": 
5.274158455523588e-05, "loss": 1.1258, "step": 122 }, { "epoch": 0.030969447569889764, "grad_norm": 0.12783485651016235, "learning_rate": 5.209711977478078e-05, "loss": 1.097, "step": 123 }, { "epoch": 0.031221231696474235, "grad_norm": 0.14884835481643677, "learning_rate": 5.145325235026351e-05, "loss": 1.103, "step": 124 }, { "epoch": 0.031473015823058705, "grad_norm": 0.17791038751602173, "learning_rate": 5.081011477699893e-05, "loss": 1.0389, "step": 125 }, { "epoch": 0.031473015823058705, "eval_loss": 1.096808671951294, "eval_runtime": 1.5229, "eval_samples_per_second": 32.831, "eval_steps_per_second": 8.536, "step": 125 }, { "epoch": 0.031724799949643175, "grad_norm": 0.11103296279907227, "learning_rate": 5.016783940011267e-05, "loss": 1.1911, "step": 126 }, { "epoch": 0.031976584076227645, "grad_norm": 0.11420492827892303, "learning_rate": 4.952655838730731e-05, "loss": 1.1282, "step": 127 }, { "epoch": 0.032228368202812115, "grad_norm": 0.11212627589702606, "learning_rate": 4.888640370166469e-05, "loss": 1.1621, "step": 128 }, { "epoch": 0.032480152329396585, "grad_norm": 0.11292574554681778, "learning_rate": 4.824750707449064e-05, "loss": 1.1511, "step": 129 }, { "epoch": 0.032731936455981055, "grad_norm": 0.12400747090578079, "learning_rate": 4.760999997820711e-05, "loss": 1.1402, "step": 130 }, { "epoch": 0.032983720582565525, "grad_norm": 0.1370488405227661, "learning_rate": 4.6974013599297837e-05, "loss": 1.0905, "step": 131 }, { "epoch": 0.033235504709149995, "grad_norm": 0.1564428210258484, "learning_rate": 4.633967881131266e-05, "loss": 1.1266, "step": 132 }, { "epoch": 0.033487288835734465, "grad_norm": 0.12354940921068192, "learning_rate": 4.570712614793633e-05, "loss": 1.1013, "step": 133 }, { "epoch": 0.033739072962318935, "grad_norm": 0.12833015620708466, "learning_rate": 4.507648577612722e-05, "loss": 1.1246, "step": 134 }, { "epoch": 0.033990857088903405, "grad_norm": 0.12590007483959198, "learning_rate": 4.4447887469331574e-05, "loss": 1.1417, "step": 135 }, { "epoch": 0.03424264121548787, "grad_norm": 0.13550645112991333, "learning_rate": 4.382146058077855e-05, "loss": 1.13, "step": 136 }, { "epoch": 0.03449442534207234, "grad_norm": 0.14064566791057587, "learning_rate": 4.319733401686199e-05, "loss": 1.0597, "step": 137 }, { "epoch": 0.03474620946865681, "grad_norm": 0.1269257515668869, "learning_rate": 4.25756362106139e-05, "loss": 1.1038, "step": 138 }, { "epoch": 0.03499799359524128, "grad_norm": 0.11279810220003128, "learning_rate": 4.195649509527555e-05, "loss": 1.1551, "step": 139 }, { "epoch": 0.03524977772182575, "grad_norm": 0.1206924095749855, "learning_rate": 4.134003807797116e-05, "loss": 1.1122, "step": 140 }, { "epoch": 0.03550156184841022, "grad_norm": 0.12054243683815002, "learning_rate": 4.0726392013490235e-05, "loss": 1.188, "step": 141 }, { "epoch": 0.03575334597499469, "grad_norm": 0.1204872876405716, "learning_rate": 4.0115683178183084e-05, "loss": 1.0985, "step": 142 }, { "epoch": 0.03600513010157916, "grad_norm": 0.12411932647228241, "learning_rate": 3.95080372439758e-05, "loss": 1.1114, "step": 143 }, { "epoch": 0.03625691422816363, "grad_norm": 0.12376714497804642, "learning_rate": 3.8903579252509345e-05, "loss": 1.0835, "step": 144 }, { "epoch": 0.0365086983547481, "grad_norm": 0.12805478274822235, "learning_rate": 3.8302433589408525e-05, "loss": 1.1225, "step": 145 }, { "epoch": 0.03676048248133257, "grad_norm": 0.12796953320503235, "learning_rate": 3.770472395868566e-05, "loss": 1.1206, "step": 146 }, { "epoch": 0.03701226660791704, 
"grad_norm": 0.12267071008682251, "learning_rate": 3.711057335728499e-05, "loss": 1.1043, "step": 147 }, { "epoch": 0.03726405073450151, "grad_norm": 0.12217283993959427, "learning_rate": 3.65201040497721e-05, "loss": 1.0964, "step": 148 }, { "epoch": 0.03751583486108598, "grad_norm": 0.1348910927772522, "learning_rate": 3.5933437543174363e-05, "loss": 1.1044, "step": 149 }, { "epoch": 0.03776761898767045, "grad_norm": 0.16145659983158112, "learning_rate": 3.5350694561977125e-05, "loss": 1.0322, "step": 150 }, { "epoch": 0.03776761898767045, "eval_loss": 1.090155839920044, "eval_runtime": 1.5223, "eval_samples_per_second": 32.844, "eval_steps_per_second": 8.539, "step": 150 }, { "epoch": 0.03801940311425492, "grad_norm": 0.11269905418157578, "learning_rate": 3.4771995023281044e-05, "loss": 1.1413, "step": 151 }, { "epoch": 0.03827118724083939, "grad_norm": 0.11654523015022278, "learning_rate": 3.419745801212538e-05, "loss": 1.1552, "step": 152 }, { "epoch": 0.03852297136742386, "grad_norm": 0.12446989119052887, "learning_rate": 3.362720175698275e-05, "loss": 1.1491, "step": 153 }, { "epoch": 0.03877475549400832, "grad_norm": 0.1188817173242569, "learning_rate": 3.30613436054299e-05, "loss": 1.1733, "step": 154 }, { "epoch": 0.03902653962059279, "grad_norm": 0.12001082301139832, "learning_rate": 3.250000000000001e-05, "loss": 1.1019, "step": 155 }, { "epoch": 0.03927832374717726, "grad_norm": 0.12011527270078659, "learning_rate": 3.194328645422094e-05, "loss": 1.1118, "step": 156 }, { "epoch": 0.03953010787376173, "grad_norm": 0.1201213002204895, "learning_rate": 3.1391317528844965e-05, "loss": 1.1333, "step": 157 }, { "epoch": 0.0397818920003462, "grad_norm": 0.1220758855342865, "learning_rate": 3.0844206808274287e-05, "loss": 1.096, "step": 158 }, { "epoch": 0.04003367612693067, "grad_norm": 0.12169603258371353, "learning_rate": 3.030206687718765e-05, "loss": 1.1063, "step": 159 }, { "epoch": 0.04028546025351514, "grad_norm": 0.1275063306093216, "learning_rate": 2.9765009297372602e-05, "loss": 1.1171, "step": 160 }, { "epoch": 0.04053724438009961, "grad_norm": 0.13592584431171417, "learning_rate": 2.92331445847683e-05, "loss": 1.1148, "step": 161 }, { "epoch": 0.04078902850668408, "grad_norm": 0.1522008776664734, "learning_rate": 2.8706582186723417e-05, "loss": 1.0848, "step": 162 }, { "epoch": 0.04104081263326855, "grad_norm": 0.12438967078924179, "learning_rate": 2.8185430459474105e-05, "loss": 1.1159, "step": 163 }, { "epoch": 0.04129259675985302, "grad_norm": 0.11593794822692871, "learning_rate": 2.76697966458463e-05, "loss": 1.1523, "step": 164 }, { "epoch": 0.04154438088643749, "grad_norm": 0.1169944554567337, "learning_rate": 2.7159786853187362e-05, "loss": 1.1226, "step": 165 }, { "epoch": 0.04179616501302196, "grad_norm": 0.1212480217218399, "learning_rate": 2.6655506031531153e-05, "loss": 1.1441, "step": 166 }, { "epoch": 0.04204794913960643, "grad_norm": 0.12019576132297516, "learning_rate": 2.6157057952001485e-05, "loss": 1.1018, "step": 167 }, { "epoch": 0.0422997332661909, "grad_norm": 0.12491545081138611, "learning_rate": 2.5664545185457933e-05, "loss": 1.1019, "step": 168 }, { "epoch": 0.04255151739277537, "grad_norm": 0.13169804215431213, "learning_rate": 2.517806908138888e-05, "loss": 1.0623, "step": 169 }, { "epoch": 0.04280330151935984, "grad_norm": 0.13421769440174103, "learning_rate": 2.469772974705565e-05, "loss": 1.0871, "step": 170 }, { "epoch": 0.04305508564594431, "grad_norm": 0.12427503615617752, "learning_rate": 2.422362602689254e-05, "loss": 1.0794, "step": 
171 }, { "epoch": 0.043306869772528775, "grad_norm": 0.12563304603099823, "learning_rate": 2.3755855482166378e-05, "loss": 1.1098, "step": 172 }, { "epoch": 0.043558653899113245, "grad_norm": 0.13072235882282257, "learning_rate": 2.3294514370900542e-05, "loss": 1.0911, "step": 173 }, { "epoch": 0.043810438025697715, "grad_norm": 0.14273561537265778, "learning_rate": 2.283969762806676e-05, "loss": 1.109, "step": 174 }, { "epoch": 0.044062222152282185, "grad_norm": 0.1899888515472412, "learning_rate": 2.239149884604948e-05, "loss": 0.9888, "step": 175 }, { "epoch": 0.044062222152282185, "eval_loss": 1.0866016149520874, "eval_runtime": 1.5236, "eval_samples_per_second": 32.816, "eval_steps_per_second": 8.532, "step": 175 }, { "epoch": 0.044314006278866655, "grad_norm": 0.10499967634677887, "learning_rate": 2.1950010255386297e-05, "loss": 1.0887, "step": 176 }, { "epoch": 0.044565790405451125, "grad_norm": 0.11740684509277344, "learning_rate": 2.1515322705788788e-05, "loss": 1.1305, "step": 177 }, { "epoch": 0.044817574532035595, "grad_norm": 0.12669669091701508, "learning_rate": 2.108752564744731e-05, "loss": 1.1639, "step": 178 }, { "epoch": 0.045069358658620065, "grad_norm": 0.12538516521453857, "learning_rate": 2.0666707112624058e-05, "loss": 1.1097, "step": 179 }, { "epoch": 0.045321142785204535, "grad_norm": 0.12294058501720428, "learning_rate": 2.025295369753761e-05, "loss": 1.122, "step": 180 }, { "epoch": 0.045572926911789005, "grad_norm": 0.12443839758634567, "learning_rate": 1.9846350544543215e-05, "loss": 1.0877, "step": 181 }, { "epoch": 0.045824711038373475, "grad_norm": 0.12381776422262192, "learning_rate": 1.944698132461207e-05, "loss": 1.0927, "step": 182 }, { "epoch": 0.046076495164957945, "grad_norm": 0.12740595638751984, "learning_rate": 1.905492822011359e-05, "loss": 1.1031, "step": 183 }, { "epoch": 0.046328279291542415, "grad_norm": 0.12863051891326904, "learning_rate": 1.8670271907903737e-05, "loss": 1.1065, "step": 184 }, { "epoch": 0.046580063418126885, "grad_norm": 0.1300704926252365, "learning_rate": 1.829309154272346e-05, "loss": 1.1169, "step": 185 }, { "epoch": 0.046831847544711355, "grad_norm": 0.13567733764648438, "learning_rate": 1.7923464740910063e-05, "loss": 1.1257, "step": 186 }, { "epoch": 0.047083631671295825, "grad_norm": 0.145668625831604, "learning_rate": 1.756146756442539e-05, "loss": 1.0824, "step": 187 }, { "epoch": 0.047335415797880295, "grad_norm": 0.12303412705659866, "learning_rate": 1.720717450520366e-05, "loss": 1.0278, "step": 188 }, { "epoch": 0.047587199924464765, "grad_norm": 0.11618991941213608, "learning_rate": 1.686065846982261e-05, "loss": 1.151, "step": 189 }, { "epoch": 0.04783898405104923, "grad_norm": 0.11975349485874176, "learning_rate": 1.6521990764500582e-05, "loss": 1.1182, "step": 190 }, { "epoch": 0.0480907681776337, "grad_norm": 0.12053832411766052, "learning_rate": 1.619124108042327e-05, "loss": 1.1471, "step": 191 }, { "epoch": 0.04834255230421817, "grad_norm": 0.12356948852539062, "learning_rate": 1.5868477479402504e-05, "loss": 1.1284, "step": 192 }, { "epoch": 0.04859433643080264, "grad_norm": 0.12398120015859604, "learning_rate": 1.5553766379870584e-05, "loss": 1.1457, "step": 193 }, { "epoch": 0.04884612055738711, "grad_norm": 0.12426292151212692, "learning_rate": 1.5247172543212521e-05, "loss": 1.0898, "step": 194 }, { "epoch": 0.04909790468397158, "grad_norm": 0.12630872428417206, "learning_rate": 1.4948759060439551e-05, "loss": 1.0753, "step": 195 }, { "epoch": 0.04934968881055605, "grad_norm": 0.1257810890674591, 
"learning_rate": 1.4658587339206153e-05, "loss": 1.1022, "step": 196 }, { "epoch": 0.04960147293714052, "grad_norm": 0.12942546606063843, "learning_rate": 1.4376717091173584e-05, "loss": 1.0655, "step": 197 }, { "epoch": 0.04985325706372499, "grad_norm": 0.13958528637886047, "learning_rate": 1.410320631972237e-05, "loss": 1.0952, "step": 198 }, { "epoch": 0.05010504119030946, "grad_norm": 0.14069540798664093, "learning_rate": 1.38381113080164e-05, "loss": 1.0969, "step": 199 }, { "epoch": 0.05035682531689393, "grad_norm": 0.17611843347549438, "learning_rate": 1.3581486607420874e-05, "loss": 0.9935, "step": 200 }, { "epoch": 0.05035682531689393, "eval_loss": 1.084336519241333, "eval_runtime": 1.5232, "eval_samples_per_second": 32.826, "eval_steps_per_second": 8.535, "step": 200 }, { "epoch": 0.0506086094434784, "grad_norm": 0.11260085552930832, "learning_rate": 1.333338502627676e-05, "loss": 1.139, "step": 201 }, { "epoch": 0.05086039357006287, "grad_norm": 0.11994125694036484, "learning_rate": 1.3093857619033844e-05, "loss": 1.1337, "step": 202 }, { "epoch": 0.05111217769664734, "grad_norm": 0.11982115358114243, "learning_rate": 1.2862953675744722e-05, "loss": 1.1353, "step": 203 }, { "epoch": 0.05136396182323181, "grad_norm": 0.12615817785263062, "learning_rate": 1.2640720711921882e-05, "loss": 1.1105, "step": 204 }, { "epoch": 0.05161574594981628, "grad_norm": 0.12592007219791412, "learning_rate": 1.2427204458759955e-05, "loss": 1.0824, "step": 205 }, { "epoch": 0.05186753007640075, "grad_norm": 0.1267462968826294, "learning_rate": 1.2222448853725088e-05, "loss": 1.0621, "step": 206 }, { "epoch": 0.05211931420298522, "grad_norm": 0.13077248632907867, "learning_rate": 1.2026496031513518e-05, "loss": 1.062, "step": 207 }, { "epoch": 0.05237109832956968, "grad_norm": 0.12852057814598083, "learning_rate": 1.1839386315381043e-05, "loss": 1.0745, "step": 208 }, { "epoch": 0.05262288245615415, "grad_norm": 0.12591220438480377, "learning_rate": 1.1661158208845307e-05, "loss": 1.072, "step": 209 }, { "epoch": 0.05287466658273862, "grad_norm": 0.12850439548492432, "learning_rate": 1.1491848387762514e-05, "loss": 1.1041, "step": 210 }, { "epoch": 0.05312645070932309, "grad_norm": 0.13350600004196167, "learning_rate": 1.1331491692780279e-05, "loss": 1.096, "step": 211 }, { "epoch": 0.05337823483590756, "grad_norm": 0.14876709878444672, "learning_rate": 1.1180121122168064e-05, "loss": 1.0571, "step": 212 }, { "epoch": 0.05363001896249203, "grad_norm": 0.12453413009643555, "learning_rate": 1.1037767825026826e-05, "loss": 1.0797, "step": 213 }, { "epoch": 0.0538818030890765, "grad_norm": 0.1117461547255516, "learning_rate": 1.0904461094879107e-05, "loss": 1.0913, "step": 214 }, { "epoch": 0.05413358721566097, "grad_norm": 0.11662810295820236, "learning_rate": 1.0780228363641018e-05, "loss": 1.1452, "step": 215 }, { "epoch": 0.05438537134224544, "grad_norm": 0.12274463474750519, "learning_rate": 1.0665095195977271e-05, "loss": 1.1162, "step": 216 }, { "epoch": 0.05463715546882991, "grad_norm": 0.12894800305366516, "learning_rate": 1.0559085284040506e-05, "loss": 1.0798, "step": 217 }, { "epoch": 0.05488893959541438, "grad_norm": 0.12095481902360916, "learning_rate": 1.0462220442595853e-05, "loss": 1.0852, "step": 218 }, { "epoch": 0.05514072372199885, "grad_norm": 0.12701094150543213, "learning_rate": 1.0374520604531953e-05, "loss": 1.0814, "step": 219 }, { "epoch": 0.05539250784858332, "grad_norm": 0.12494029849767685, "learning_rate": 1.0296003816759086e-05, "loss": 1.086, "step": 220 }, { "epoch": 
0.05564429197516779, "grad_norm": 0.1267097145318985, "learning_rate": 1.0226686236495517e-05, "loss": 1.1057, "step": 221 }, { "epoch": 0.05589607610175226, "grad_norm": 0.13075008988380432, "learning_rate": 1.0166582127942649e-05, "loss": 1.1317, "step": 222 }, { "epoch": 0.05614786022833673, "grad_norm": 0.13261082768440247, "learning_rate": 1.0115703859349725e-05, "loss": 1.1039, "step": 223 }, { "epoch": 0.0563996443549212, "grad_norm": 0.14049747586250305, "learning_rate": 1.0074061900468672e-05, "loss": 1.116, "step": 224 }, { "epoch": 0.05665142848150567, "grad_norm": 0.19462983310222626, "learning_rate": 1.0041664820399652e-05, "loss": 0.9881, "step": 225 }, { "epoch": 0.05665142848150567, "eval_loss": 1.0821517705917358, "eval_runtime": 1.523, "eval_samples_per_second": 32.83, "eval_steps_per_second": 8.536, "step": 225 }, { "epoch": 0.056903212608090135, "grad_norm": 0.10417667776346207, "learning_rate": 1.0018519285827698e-05, "loss": 1.0784, "step": 226 }, { "epoch": 0.057154996734674605, "grad_norm": 0.1178843155503273, "learning_rate": 1.000463005965084e-05, "loss": 1.1522, "step": 227 }, { "epoch": 0.057406780861259075, "grad_norm": 0.12229274213314056, "learning_rate": 1e-05, "loss": 1.1221, "step": 228 } ], "logging_steps": 1, "max_steps": 228, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 30, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4583097096570143e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }
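
The state above follows the Hugging Face `transformers` Trainer `trainer_state.json` layout (`best_metric`, `best_model_checkpoint`, `log_history` with per-step `loss` and per-eval `eval_loss` entries). Below is a minimal, hedged sketch of how such a file could be inspected offline; the file path is illustrative only, and the script assumes the JSON is saved verbatim to disk rather than being part of any official tooling for this run.

```python
import json

# Illustrative path: adjust to wherever this trainer_state.json is stored.
STATE_PATH = "miner_id_24/checkpoint-228/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# log_history mixes training entries (with "loss") and evaluation
# entries (with "eval_loss"); split them for separate inspection.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval loss : {state['best_metric']:.4f} "
      f"({state['best_model_checkpoint']})")
print(f"final train loss at step {train_log[-1]['step']}: "
      f"{train_log[-1]['loss']:.4f}")

# Print the evaluation-loss trajectory (every eval_steps=25 steps).
for e in eval_log:
    print(f"step {e['step']:>4}: eval_loss={e['eval_loss']:.4f}")
```

As a usage note under the same assumptions, the printed trajectory should show `eval_loss` falling from roughly 1.404 at step 1 to about 1.082 at step 225, consistent with the `best_metric` of ~1.0843 recorded for `checkpoint-200`.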