diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,18170 +10,18170 @@ "log_history": [ { "epoch": 0.002676032781401572, - "grad_norm": 6.179981970234677, - "learning_rate": 2.6737967914438506e-08, - "logits/chosen": -0.07354718446731567, - "logits/rejected": 0.1362501084804535, - "logps/chosen": -1.7156760692596436, - "logps/rejected": -1.8900222778320312, - "loss": 1.1357, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.7156760692596436, - "rewards/margins": 0.17434628307819366, - "rewards/rejected": -1.8900222778320312, - "sft_loss": 1.468214750289917, + "grad_norm": 6.162224436844012, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.07354654371738434, + "logits/rejected": 0.1361573040485382, + "logps/chosen": -1.7158677577972412, + "logps/rejected": -1.8894357681274414, + "loss": 1.1358, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.7158677577972412, + "rewards/margins": 0.17356786131858826, + "rewards/rejected": -1.8894357681274414, + "sft_loss": 1.4684427976608276, "step": 5 }, { "epoch": 0.005352065562803144, - "grad_norm": 10.909175691763762, - "learning_rate": 5.347593582887701e-08, - "logits/chosen": -0.002189463470131159, - "logits/rejected": 0.12079276889562607, - "logps/chosen": -1.7996399402618408, - "logps/rejected": -1.8447366952896118, - "loss": 1.2279, + "grad_norm": 10.941948565599809, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": -0.0025766133330762386, + "logits/rejected": 0.11985313892364502, + "logps/chosen": -1.8013126850128174, + "logps/rejected": -1.8446658849716187, + "loss": 1.2299, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.7996399402618408, - "rewards/margins": 0.04509688913822174, - "rewards/rejected": -1.8447366952896118, - "sft_loss": 1.507448673248291, + "rewards/chosen": -1.8013126850128174, + "rewards/margins": 0.04335314407944679, + "rewards/rejected": -1.8446658849716187, + "sft_loss": 1.5082662105560303, "step": 10 }, { "epoch": 0.008028098344204716, - "grad_norm": 13.61904960367422, - "learning_rate": 8.021390374331551e-08, - "logits/chosen": -0.049028005450963974, - "logits/rejected": 0.049201685935258865, - "logps/chosen": -1.6352916955947876, - "logps/rejected": -1.76531982421875, - "loss": 1.2021, + "grad_norm": 13.573973853756051, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.045625053346157074, + "logits/rejected": 0.053189463913440704, + "logps/chosen": -1.6336523294448853, + "logps/rejected": -1.7631381750106812, + "loss": 1.2018, "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.6352916955947876, - "rewards/margins": 0.13002797961235046, - "rewards/rejected": -1.76531982421875, - "sft_loss": 1.5003212690353394, + "rewards/chosen": -1.6336523294448853, + "rewards/margins": 0.1294858753681183, + "rewards/rejected": -1.7631381750106812, + "sft_loss": 1.4996378421783447, "step": 15 }, { "epoch": 0.010704131125606288, - "grad_norm": 5.956213295300661, - "learning_rate": 1.0695187165775402e-07, - "logits/chosen": -0.05677938461303711, - "logits/rejected": 0.029643535614013672, - "logps/chosen": -1.7252578735351562, - "logps/rejected": -1.805281639099121, - "loss": 1.2265, + "grad_norm": 5.965322949752156, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.03252996876835823, + "logits/rejected": 0.056474365293979645, + "logps/chosen": -1.7247947454452515, + "logps/rejected": -1.8047034740447998, + "loss": 1.2262, "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.7252578735351562, - "rewards/margins": 0.0800238624215126, - "rewards/rejected": -1.805281639099121, - "sft_loss": 1.5005255937576294, + "rewards/chosen": -1.7247947454452515, + "rewards/margins": 0.0799088105559349, + "rewards/rejected": -1.8047034740447998, + "sft_loss": 1.5000253915786743, "step": 20 }, { "epoch": 0.013380163907007862, - "grad_norm": 17.76695461787378, - "learning_rate": 1.3368983957219251e-07, - "logits/chosen": -0.06971491873264313, - "logits/rejected": 0.015968088060617447, - "logps/chosen": -1.866454839706421, - "logps/rejected": -1.7767322063446045, - "loss": 1.3448, - "rewards/accuracies": 0.375, - "rewards/chosen": -1.866454839706421, - "rewards/margins": -0.0897226631641388, - "rewards/rejected": -1.7767322063446045, - "sft_loss": 1.5450823307037354, + "grad_norm": 17.583923004034727, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.06310281157493591, + "logits/rejected": 0.024211319163441658, + "logps/chosen": -1.8702064752578735, + "logps/rejected": -1.7776330709457397, + "loss": 1.3473, + "rewards/accuracies": 0.3812499940395355, + "rewards/chosen": -1.8702064752578735, + "rewards/margins": -0.09257296472787857, + "rewards/rejected": -1.7776330709457397, + "sft_loss": 1.5455201864242554, "step": 25 }, { "epoch": 0.016056196688409432, - "grad_norm": 12.909544852261007, - "learning_rate": 1.6042780748663102e-07, - "logits/chosen": -0.10329775512218475, - "logits/rejected": -0.007711836602538824, - "logps/chosen": -1.9049831628799438, - "logps/rejected": -1.8297088146209717, - "loss": 1.3272, + "grad_norm": 12.928955930161317, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.09293361008167267, + "logits/rejected": 0.002235558582469821, + "logps/chosen": -1.9093812704086304, + "logps/rejected": -1.8332099914550781, + "loss": 1.3291, "rewards/accuracies": 0.4437499940395355, - "rewards/chosen": -1.9049831628799438, - "rewards/margins": -0.0752745047211647, - "rewards/rejected": -1.8297088146209717, - "sft_loss": 1.6447674036026, + "rewards/chosen": -1.9093812704086304, + "rewards/margins": -0.07617148756980896, + "rewards/rejected": -1.8332099914550781, + "sft_loss": 1.6468474864959717, "step": 30 }, { "epoch": 0.018732229469811006, - "grad_norm": 11.82840491928946, - "learning_rate": 1.8716577540106952e-07, - "logits/chosen": -0.04202842339873314, - "logits/rejected": 0.12342722713947296, - "logps/chosen": -1.8364969491958618, - "logps/rejected": -1.9843240976333618, - "loss": 1.2642, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.8364969491958618, - "rewards/margins": 0.14782710373401642, - "rewards/rejected": -1.9843240976333618, - "sft_loss": 1.5586276054382324, + "grad_norm": 11.844224730796958, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.04166014865040779, + "logits/rejected": 0.12396843731403351, + "logps/chosen": -1.8452978134155273, + "logps/rejected": -1.9958372116088867, + "loss": 1.2679, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8452978134155273, + "rewards/margins": 0.15053938329219818, + "rewards/rejected": -1.9958372116088867, + "sft_loss": 1.5612391233444214, "step": 35 }, { "epoch": 0.021408262251212576, - "grad_norm": 10.875725329247906, - "learning_rate": 2.1390374331550805e-07, - "logits/chosen": 0.02725973166525364, - "logits/rejected": 0.20334240794181824, - "logps/chosen": -1.8649237155914307, - "logps/rejected": -1.7300840616226196, - "loss": 1.3038, + "grad_norm": 11.104553717008118, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.04377365857362747, + "logits/rejected": 0.22366110980510712, + "logps/chosen": -1.880755066871643, + "logps/rejected": -1.7429109811782837, + "loss": 1.3102, "rewards/accuracies": 0.45625001192092896, - "rewards/chosen": -1.8649237155914307, - "rewards/margins": -0.1348399817943573, - "rewards/rejected": -1.7300840616226196, - "sft_loss": 1.5143520832061768, + "rewards/chosen": -1.880755066871643, + "rewards/margins": -0.13784421980381012, + "rewards/rejected": -1.7429109811782837, + "sft_loss": 1.519200325012207, "step": 40 }, { "epoch": 0.02408429503261415, - "grad_norm": 16.550871534498324, - "learning_rate": 2.4064171122994655e-07, - "logits/chosen": 0.031885191798210144, - "logits/rejected": 0.23356428742408752, - "logps/chosen": -1.8056827783584595, - "logps/rejected": -1.8406091928482056, - "loss": 1.268, + "grad_norm": 16.41709353765331, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.012500310316681862, + "logits/rejected": 0.21139463782310486, + "logps/chosen": -1.8342450857162476, + "logps/rejected": -1.8698947429656982, + "loss": 1.2791, "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": -1.8056827783584595, - "rewards/margins": 0.034926436841487885, - "rewards/rejected": -1.8406091928482056, - "sft_loss": 1.5227737426757812, + "rewards/chosen": -1.8342450857162476, + "rewards/margins": 0.03564963862299919, + "rewards/rejected": -1.8698947429656982, + "sft_loss": 1.5351488590240479, "step": 45 }, { "epoch": 0.026760327814015723, - "grad_norm": 13.08952392847594, - "learning_rate": 2.6737967914438503e-07, - "logits/chosen": -0.0523727647960186, - "logits/rejected": 0.10018514096736908, - "logps/chosen": -1.8462203741073608, - "logps/rejected": -1.7395761013031006, - "loss": 1.3203, + "grad_norm": 13.19811950294341, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.06285648047924042, + "logits/rejected": 0.09227000921964645, + "logps/chosen": -1.8954331874847412, + "logps/rejected": -1.7754628658294678, + "loss": 1.3432, "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.8462203741073608, - "rewards/margins": -0.10664422810077667, - "rewards/rejected": -1.7395761013031006, - "sft_loss": 1.5634331703186035, + "rewards/chosen": -1.8954331874847412, + "rewards/margins": -0.11997010558843613, + "rewards/rejected": -1.7754628658294678, + "sft_loss": 1.5819414854049683, "step": 50 }, { "epoch": 0.029436360595417294, - "grad_norm": 9.035596704121557, - "learning_rate": 2.9411764705882356e-07, - "logits/chosen": -0.09299639612436295, - "logits/rejected": 0.1341559737920761, - "logps/chosen": -1.7790874242782593, - "logps/rejected": -1.8184455633163452, - "loss": 1.2866, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.7790874242782593, - "rewards/margins": 0.03935818746685982, - "rewards/rejected": -1.8184455633163452, - "sft_loss": 1.5595557689666748, + "grad_norm": 9.01823552109749, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.10592956840991974, + "logits/rejected": 0.1197512298822403, + "logps/chosen": -1.8273578882217407, + "logps/rejected": -1.8612210750579834, + "loss": 1.3116, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8273578882217407, + "rewards/margins": 0.03386329859495163, + "rewards/rejected": -1.8612210750579834, + "sft_loss": 1.5808497667312622, "step": 55 }, { "epoch": 0.032112393376818864, - "grad_norm": 8.403697285946595, - "learning_rate": 3.2085561497326203e-07, - "logits/chosen": -0.10554766654968262, - "logits/rejected": 0.08509379625320435, - "logps/chosen": -1.7057393789291382, - "logps/rejected": -1.8046671152114868, - "loss": 1.2018, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.7057393789291382, - "rewards/margins": 0.0989275649189949, - "rewards/rejected": -1.8046671152114868, - "sft_loss": 1.5179922580718994, + "grad_norm": 8.760806112736027, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.07141174376010895, + "logits/rejected": 0.12480834871530533, + "logps/chosen": -1.7802963256835938, + "logps/rejected": -1.8852354288101196, + "loss": 1.2193, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7802963256835938, + "rewards/margins": 0.1049388200044632, + "rewards/rejected": -1.8852354288101196, + "sft_loss": 1.5415582656860352, "step": 60 }, { "epoch": 0.03478842615822044, - "grad_norm": 6.4387299531804745, - "learning_rate": 3.475935828877005e-07, - "logits/chosen": -0.03802342340350151, - "logits/rejected": 0.1122186928987503, - "logps/chosen": -1.5271183252334595, - "logps/rejected": -1.637882947921753, - "loss": 1.136, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.5271183252334595, - "rewards/margins": 0.11076472699642181, - "rewards/rejected": -1.637882947921753, - "sft_loss": 1.426003336906433, + "grad_norm": 7.306903729984867, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.03792408108711243, + "logits/rejected": 0.11073604971170425, + "logps/chosen": -1.631109595298767, + "logps/rejected": -1.7599260807037354, + "loss": 1.1604, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.631109595298767, + "rewards/margins": 0.12881648540496826, + "rewards/rejected": -1.7599260807037354, + "sft_loss": 1.4718269109725952, "step": 65 }, { "epoch": 0.03746445893962201, - "grad_norm": 13.955625675437666, - "learning_rate": 3.7433155080213904e-07, - "logits/chosen": -0.08897098153829575, - "logits/rejected": 0.059094082564115524, - "logps/chosen": -1.6416652202606201, - "logps/rejected": -1.677323579788208, - "loss": 1.2583, - "rewards/accuracies": 0.4000000059604645, - "rewards/chosen": -1.6416652202606201, - "rewards/margins": 0.03565821796655655, - "rewards/rejected": -1.677323579788208, - "sft_loss": 1.5583785772323608, + "grad_norm": 14.757953721252662, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.06770111620426178, + "logits/rejected": 0.08704431354999542, + "logps/chosen": -1.759752869606018, + "logps/rejected": -1.8056182861328125, + "loss": 1.3103, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -1.759752869606018, + "rewards/margins": 0.045865464955568314, + "rewards/rejected": -1.8056182861328125, + "sft_loss": 1.6259733438491821, "step": 70 }, { "epoch": 0.04014049172102358, - "grad_norm": 13.363727924448506, - "learning_rate": 4.0106951871657757e-07, - "logits/chosen": -0.08906193822622299, - "logits/rejected": 0.08749326318502426, - "logps/chosen": -1.6009581089019775, - "logps/rejected": -1.8365551233291626, - "loss": 1.1338, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.6009581089019775, - "rewards/margins": 0.2355968952178955, - "rewards/rejected": -1.8365551233291626, - "sft_loss": 1.497521996498108, + "grad_norm": 14.591213515171626, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.03633163869380951, + "logits/rejected": 0.15258662402629852, + "logps/chosen": -1.7644485235214233, + "logps/rejected": -2.022709369659424, + "loss": 1.1785, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7644485235214233, + "rewards/margins": 0.258261114358902, + "rewards/rejected": -2.022709369659424, + "sft_loss": 1.5605233907699585, "step": 75 }, { "epoch": 0.04281652450242515, - "grad_norm": 8.865202503300885, - "learning_rate": 4.278074866310161e-07, - "logits/chosen": -0.0019889636896550655, - "logits/rejected": 0.10428965091705322, - "logps/chosen": -1.5218764543533325, - "logps/rejected": -1.5615147352218628, - "loss": 1.1697, + "grad_norm": 9.952040314815008, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": 0.004873444326221943, + "logits/rejected": 0.1104719489812851, + "logps/chosen": -1.6954625844955444, + "logps/rejected": -1.7293781042099, + "loss": 1.2311, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.5218764543533325, - "rewards/margins": 0.03963825851678848, - "rewards/rejected": -1.5615147352218628, - "sft_loss": 1.4340198040008545, + "rewards/chosen": -1.6954625844955444, + "rewards/margins": 0.03391539677977562, + "rewards/rejected": -1.7293781042099, + "sft_loss": 1.5150405168533325, "step": 80 }, { "epoch": 0.04549255728382673, - "grad_norm": 6.571045133474957, - "learning_rate": 4.5454545454545457e-07, - "logits/chosen": -0.16559067368507385, - "logits/rejected": 0.0771712213754654, - "logps/chosen": -1.5611470937728882, - "logps/rejected": -1.6897590160369873, - "loss": 1.1362, + "grad_norm": 5.982856438454467, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.15907660126686096, + "logits/rejected": 0.0876961499452591, + "logps/chosen": -1.761853814125061, + "logps/rejected": -1.9326856136322021, + "loss": 1.1865, "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.5611470937728882, - "rewards/margins": 0.12861183285713196, - "rewards/rejected": -1.6897590160369873, - "sft_loss": 1.4134607315063477, + "rewards/chosen": -1.761853814125061, + "rewards/margins": 0.17083203792572021, + "rewards/rejected": -1.9326856136322021, + "sft_loss": 1.4846560955047607, "step": 85 }, { "epoch": 0.0481685900652283, - "grad_norm": 11.981701286510356, - "learning_rate": 4.812834224598931e-07, - "logits/chosen": 0.03413419798016548, - "logits/rejected": -0.004057231359183788, - "logps/chosen": -1.436505913734436, - "logps/rejected": -1.5269982814788818, - "loss": 1.1281, - "rewards/accuracies": 0.46875, - "rewards/chosen": -1.436505913734436, - "rewards/margins": 0.09049233794212341, - "rewards/rejected": -1.5269982814788818, - "sft_loss": 1.3672358989715576, + "grad_norm": 16.704401798380253, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.08973591774702072, + "logits/rejected": 0.05205491930246353, + "logps/chosen": -1.712908387184143, + "logps/rejected": -1.7490822076797485, + "loss": 1.2372, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.712908387184143, + "rewards/margins": 0.03617396205663681, + "rewards/rejected": -1.7490822076797485, + "sft_loss": 1.4473885297775269, "step": 90 }, { "epoch": 0.05084462284662987, - "grad_norm": 5.582546128335958, - "learning_rate": 5.080213903743315e-07, - "logits/chosen": -0.10642153024673462, - "logits/rejected": 0.03692169114947319, - "logps/chosen": -1.3211487531661987, - "logps/rejected": -1.5329933166503906, - "loss": 1.0679, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3211487531661987, - "rewards/margins": 0.21184447407722473, - "rewards/rejected": -1.5329933166503906, - "sft_loss": 1.3490239381790161, + "grad_norm": 9.048528442375913, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.0784987211227417, + "logits/rejected": 0.07455357909202576, + "logps/chosen": -1.7556579113006592, + "logps/rejected": -1.8748867511749268, + "loss": 1.216, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.7556579113006592, + "rewards/margins": 0.11922872066497803, + "rewards/rejected": -1.8748867511749268, + "sft_loss": 1.5081722736358643, "step": 95 }, { "epoch": 0.05352065562803145, - "grad_norm": 5.6861974383348155, - "learning_rate": 5.347593582887701e-07, - "logits/chosen": -0.10629527270793915, - "logits/rejected": -0.044294875115156174, - "logps/chosen": -1.3627598285675049, - "logps/rejected": -1.4516956806182861, - "loss": 1.1149, + "grad_norm": 5.527964399235599, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.0294797420501709, + "logits/rejected": 0.03756122291088104, + "logps/chosen": -1.661158561706543, + "logps/rejected": -1.766716718673706, + "loss": 1.1814, "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3627598285675049, - "rewards/margins": 0.08893603086471558, - "rewards/rejected": -1.4516956806182861, - "sft_loss": 1.3573975563049316, + "rewards/chosen": -1.661158561706543, + "rewards/margins": 0.10555823147296906, + "rewards/rejected": -1.766716718673706, + "sft_loss": 1.478492259979248, "step": 100 }, { "epoch": 0.05619668840943302, - "grad_norm": 4.705345534762539, - "learning_rate": 5.614973262032086e-07, - "logits/chosen": -0.0295580867677927, - "logits/rejected": -0.0050818738527596, - "logps/chosen": -1.3181979656219482, - "logps/rejected": -1.4915450811386108, - "loss": 1.059, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3181979656219482, - "rewards/margins": 0.17334721982479095, - "rewards/rejected": -1.4915450811386108, - "sft_loss": 1.2995798587799072, + "grad_norm": 11.601096875377186, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.046459540724754333, + "logits/rejected": 0.07478093355894089, + "logps/chosen": -1.6010570526123047, + "logps/rejected": -1.7681080102920532, + "loss": 1.139, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6010570526123047, + "rewards/margins": 0.16705112159252167, + "rewards/rejected": -1.7681080102920532, + "sft_loss": 1.4180337190628052, "step": 105 }, { "epoch": 0.05887272119083459, - "grad_norm": 7.966487192278806, - "learning_rate": 5.882352941176471e-07, - "logits/chosen": -0.08093095570802689, - "logits/rejected": 0.013903314247727394, - "logps/chosen": -1.2849345207214355, - "logps/rejected": -1.348474383354187, - "loss": 1.104, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.2849345207214355, - "rewards/margins": 0.06353993713855743, - "rewards/rejected": -1.348474383354187, - "sft_loss": 1.2881290912628174, + "grad_norm": 8.400580715452332, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": 0.004502465482801199, + "logits/rejected": 0.10074075311422348, + "logps/chosen": -1.5968029499053955, + "logps/rejected": -1.654841661453247, + "loss": 1.1857, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5968029499053955, + "rewards/margins": 0.058038532733917236, + "rewards/rejected": -1.654841661453247, + "sft_loss": 1.4301693439483643, "step": 110 }, { "epoch": 0.06154875397223616, - "grad_norm": 7.641023591192343, - "learning_rate": 6.149732620320855e-07, - "logits/chosen": -0.047448135912418365, - "logits/rejected": 0.14301401376724243, - "logps/chosen": -1.323791742324829, - "logps/rejected": -1.5212907791137695, - "loss": 1.0755, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.323791742324829, - "rewards/margins": 0.19749906659126282, - "rewards/rejected": -1.5212907791137695, - "sft_loss": 1.3769772052764893, + "grad_norm": 12.483377529373273, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.01744399033486843, + "logits/rejected": 0.22708511352539062, + "logps/chosen": -1.5779647827148438, + "logps/rejected": -1.8342781066894531, + "loss": 1.1262, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5779647827148438, + "rewards/margins": 0.2563134729862213, + "rewards/rejected": -1.8342781066894531, + "sft_loss": 1.518568515777588, "step": 115 }, { "epoch": 0.06422478675363773, - "grad_norm": 6.708387519634524, - "learning_rate": 6.417112299465241e-07, - "logits/chosen": -0.14979350566864014, - "logits/rejected": 0.002688088919967413, - "logps/chosen": -1.3275854587554932, - "logps/rejected": -1.408825159072876, - "loss": 1.1135, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3275854587554932, - "rewards/margins": 0.0812397450208664, - "rewards/rejected": -1.408825159072876, - "sft_loss": 1.3396663665771484, + "grad_norm": 8.546077002576027, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.11730382591485977, + "logits/rejected": 0.050408393144607544, + "logps/chosen": -1.6292957067489624, + "logps/rejected": -1.7397758960723877, + "loss": 1.1787, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6292957067489624, + "rewards/margins": 0.11048026382923126, + "rewards/rejected": -1.7397758960723877, + "sft_loss": 1.5028191804885864, "step": 120 }, { "epoch": 0.0669008195350393, - "grad_norm": 4.9165529393450536, - "learning_rate": 6.684491978609626e-07, - "logits/chosen": -0.15790846943855286, - "logits/rejected": -0.03613553196191788, - "logps/chosen": -1.3417673110961914, - "logps/rejected": -1.3494497537612915, - "loss": 1.1568, - "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": -1.3417673110961914, - "rewards/margins": 0.007682465016841888, - "rewards/rejected": -1.3494497537612915, - "sft_loss": 1.3967574834823608, + "grad_norm": 5.241431853062498, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.11590234935283661, + "logits/rejected": 0.013391993939876556, + "logps/chosen": -1.569298505783081, + "logps/rejected": -1.5349996089935303, + "loss": 1.224, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.569298505783081, + "rewards/margins": -0.03429893031716347, + "rewards/rejected": -1.5349996089935303, + "sft_loss": 1.4846540689468384, "step": 125 }, { "epoch": 0.06957685231644088, - "grad_norm": 6.427234971687256, - "learning_rate": 6.95187165775401e-07, - "logits/chosen": -0.04104025289416313, - "logits/rejected": 0.08000461757183075, - "logps/chosen": -1.376473069190979, - "logps/rejected": -1.488599181175232, - "loss": 1.1242, + "grad_norm": 9.842421905850513, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.01973932608962059, + "logits/rejected": 0.15391430258750916, + "logps/chosen": -1.6046987771987915, + "logps/rejected": -1.721130132675171, + "loss": 1.1556, "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -1.376473069190979, - "rewards/margins": 0.11212635040283203, - "rewards/rejected": -1.488599181175232, - "sft_loss": 1.444916844367981, + "rewards/chosen": -1.6046987771987915, + "rewards/margins": 0.11643137037754059, + "rewards/rejected": -1.721130132675171, + "sft_loss": 1.540232539176941, "step": 130 }, { "epoch": 0.07225288509784245, - "grad_norm": 8.054691538356435, - "learning_rate": 7.219251336898395e-07, - "logits/chosen": -0.10705189406871796, - "logits/rejected": 0.0006207168335095048, - "logps/chosen": -1.4154436588287354, - "logps/rejected": -1.5005041360855103, - "loss": 1.1338, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.4154436588287354, - "rewards/margins": 0.0850602462887764, - "rewards/rejected": -1.5005041360855103, - "sft_loss": 1.3668407201766968, + "grad_norm": 17.94063478027767, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.06300957500934601, + "logits/rejected": 0.05799748748540878, + "logps/chosen": -1.6499484777450562, + "logps/rejected": -1.6832249164581299, + "loss": 1.2084, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.6499484777450562, + "rewards/margins": 0.033276233822107315, + "rewards/rejected": -1.6832249164581299, + "sft_loss": 1.4744528532028198, "step": 135 }, { "epoch": 0.07492891787924402, - "grad_norm": 7.859083459156864, - "learning_rate": 7.486631016042781e-07, - "logits/chosen": -0.10198960453271866, - "logits/rejected": 0.04557307809591293, - "logps/chosen": -1.4215986728668213, - "logps/rejected": -1.4720596075057983, - "loss": 1.1592, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.4215986728668213, - "rewards/margins": 0.05046095699071884, - "rewards/rejected": -1.4720596075057983, - "sft_loss": 1.4362990856170654, + "grad_norm": 8.654963108118846, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.0444134920835495, + "logits/rejected": 0.12819430232048035, + "logps/chosen": -1.613511323928833, + "logps/rejected": -1.7291886806488037, + "loss": 1.1738, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.613511323928833, + "rewards/margins": 0.11567743122577667, + "rewards/rejected": -1.7291886806488037, + "sft_loss": 1.522329330444336, "step": 140 }, { "epoch": 0.0776049506606456, - "grad_norm": 8.85741719438616, - "learning_rate": 7.754010695187166e-07, - "logits/chosen": -0.061334170401096344, - "logits/rejected": 0.0797557458281517, - "logps/chosen": -1.3318393230438232, - "logps/rejected": -1.427393913269043, - "loss": 1.1288, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3318393230438232, - "rewards/margins": 0.09555456787347794, - "rewards/rejected": -1.427393913269043, - "sft_loss": 1.4047821760177612, + "grad_norm": 9.908182221301256, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": -0.029214048758149147, + "logits/rejected": 0.12418278306722641, + "logps/chosen": -1.5189011096954346, + "logps/rejected": -1.6256484985351562, + "loss": 1.1541, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.5189011096954346, + "rewards/margins": 0.10674738883972168, + "rewards/rejected": -1.6256484985351562, + "sft_loss": 1.4731454849243164, "step": 145 }, { "epoch": 0.08028098344204716, - "grad_norm": 8.202696544058703, - "learning_rate": 8.021390374331551e-07, - "logits/chosen": -0.10961981862783432, - "logits/rejected": 0.033692121505737305, - "logps/chosen": -1.2658131122589111, - "logps/rejected": -1.2866407632827759, - "loss": 1.116, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.2658131122589111, - "rewards/margins": 0.02082763984799385, - "rewards/rejected": -1.2866407632827759, - "sft_loss": 1.2677156925201416, + "grad_norm": 13.151478113506652, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.07878760993480682, + "logits/rejected": 0.08296145498752594, + "logps/chosen": -1.4598569869995117, + "logps/rejected": -1.4593470096588135, + "loss": 1.1583, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4598569869995117, + "rewards/margins": -0.0005100608104839921, + "rewards/rejected": -1.4593470096588135, + "sft_loss": 1.329883337020874, "step": 150 }, { "epoch": 0.08295701622344874, - "grad_norm": 5.762281777012094, - "learning_rate": 8.288770053475937e-07, - "logits/chosen": -0.11363419145345688, - "logits/rejected": -0.06518497318029404, - "logps/chosen": -1.2847747802734375, - "logps/rejected": -1.4079326391220093, - "loss": 1.0887, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2847747802734375, - "rewards/margins": 0.12315795570611954, - "rewards/rejected": -1.4079326391220093, - "sft_loss": 1.3240041732788086, + "grad_norm": 9.598508318421777, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.10372467339038849, + "logits/rejected": -0.05565663054585457, + "logps/chosen": -1.4395086765289307, + "logps/rejected": -1.5460284948349, + "loss": 1.1221, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4395086765289307, + "rewards/margins": 0.10651954263448715, + "rewards/rejected": -1.5460284948349, + "sft_loss": 1.3945090770721436, "step": 155 }, { "epoch": 0.0856330490048503, - "grad_norm": 5.517470300559481, - "learning_rate": 8.556149732620322e-07, - "logits/chosen": -0.19057968258857727, - "logits/rejected": -0.059998493641614914, - "logps/chosen": -1.3801919221878052, - "logps/rejected": -1.3602113723754883, - "loss": 1.1731, - "rewards/accuracies": 0.4625000059604645, - "rewards/chosen": -1.3801919221878052, - "rewards/margins": -0.01998048648238182, - "rewards/rejected": -1.3602113723754883, - "sft_loss": 1.3693006038665771, + "grad_norm": 10.148738813875053, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.1604335755109787, + "logits/rejected": -0.01720331236720085, + "logps/chosen": -1.5059787034988403, + "logps/rejected": -1.4849656820297241, + "loss": 1.2026, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.5059787034988403, + "rewards/margins": -0.021012943238019943, + "rewards/rejected": -1.4849656820297241, + "sft_loss": 1.424424171447754, "step": 160 }, { "epoch": 0.08830908178625188, - "grad_norm": 7.395292190886674, - "learning_rate": 8.823529411764706e-07, - "logits/chosen": -0.09310317784547806, - "logits/rejected": 0.06683714687824249, - "logps/chosen": -1.2894189357757568, - "logps/rejected": -1.3747179508209229, - "loss": 1.1075, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.2894189357757568, - "rewards/margins": 0.08529897779226303, - "rewards/rejected": -1.3747179508209229, - "sft_loss": 1.288217306137085, + "grad_norm": 8.444078924667641, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.09711726009845734, + "logits/rejected": 0.07096539437770844, + "logps/chosen": -1.3629688024520874, + "logps/rejected": -1.449608564376831, + "loss": 1.1205, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3629688024520874, + "rewards/margins": 0.0866396352648735, + "rewards/rejected": -1.449608564376831, + "sft_loss": 1.3194470405578613, "step": 165 }, { "epoch": 0.09098511456765346, - "grad_norm": 6.2980422888580305, - "learning_rate": 9.090909090909091e-07, - "logits/chosen": -0.1289089173078537, - "logits/rejected": -0.08186782896518707, - "logps/chosen": -1.4113397598266602, - "logps/rejected": -1.489976406097412, - "loss": 1.1464, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.4113397598266602, - "rewards/margins": 0.0786367803812027, - "rewards/rejected": -1.489976406097412, - "sft_loss": 1.4135427474975586, + "grad_norm": 13.236412367686684, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.145632803440094, + "logits/rejected": -0.09582562744617462, + "logps/chosen": -1.4657261371612549, + "logps/rejected": -1.5351206064224243, + "loss": 1.1642, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4657261371612549, + "rewards/margins": 0.06939435005187988, + "rewards/rejected": -1.5351206064224243, + "sft_loss": 1.438075304031372, "step": 170 }, { "epoch": 0.09366114734905502, - "grad_norm": 7.44909076238542, - "learning_rate": 9.358288770053477e-07, - "logits/chosen": 0.010211547836661339, - "logits/rejected": 0.011260807514190674, - "logps/chosen": -1.3008840084075928, - "logps/rejected": -1.3928817510604858, - "loss": 1.1112, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3008840084075928, - "rewards/margins": 0.09199782460927963, - "rewards/rejected": -1.3928817510604858, - "sft_loss": 1.3410255908966064, + "grad_norm": 7.787774859149804, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": -0.02539270557463169, + "logits/rejected": -0.026614580303430557, + "logps/chosen": -1.3474395275115967, + "logps/rejected": -1.4411218166351318, + "loss": 1.1216, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3474395275115967, + "rewards/margins": 0.09368231147527695, + "rewards/rejected": -1.4411218166351318, + "sft_loss": 1.3676502704620361, "step": 175 }, { "epoch": 0.0963371801304566, - "grad_norm": 6.526310508545539, - "learning_rate": 9.625668449197862e-07, - "logits/chosen": -0.024570604786276817, - "logits/rejected": -0.02446991577744484, - "logps/chosen": -1.3202580213546753, - "logps/rejected": -1.527092456817627, - "loss": 1.1042, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3202580213546753, - "rewards/margins": 0.20683428645133972, - "rewards/rejected": -1.527092456817627, - "sft_loss": 1.360899567604065, + "grad_norm": 6.812116118866956, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": -0.06027358025312424, + "logits/rejected": -0.05768832564353943, + "logps/chosen": -1.3618038892745972, + "logps/rejected": -1.560143232345581, + "loss": 1.1173, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3618038892745972, + "rewards/margins": 0.1983393430709839, + "rewards/rejected": -1.560143232345581, + "sft_loss": 1.3774890899658203, "step": 180 }, { "epoch": 0.09901321291185818, - "grad_norm": 7.077223356554608, - "learning_rate": 9.893048128342246e-07, - "logits/chosen": -0.135633185505867, - "logits/rejected": -0.04922838136553764, - "logps/chosen": -1.3211921453475952, - "logps/rejected": -1.3769458532333374, - "loss": 1.132, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.3211921453475952, - "rewards/margins": 0.05575376749038696, - "rewards/rejected": -1.3769458532333374, - "sft_loss": 1.3447620868682861, + "grad_norm": 7.994529020372144, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.20169305801391602, + "logits/rejected": -0.11706896126270294, + "logps/chosen": -1.3508808612823486, + "logps/rejected": -1.39878249168396, + "loss": 1.1426, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.3508808612823486, + "rewards/margins": 0.04790160059928894, + "rewards/rejected": -1.39878249168396, + "sft_loss": 1.3566535711288452, "step": 185 }, { "epoch": 0.10168924569325974, - "grad_norm": 7.121196316462924, - "learning_rate": 1.016042780748663e-06, - "logits/chosen": -0.07600800693035126, - "logits/rejected": 0.0359211228787899, - "logps/chosen": -1.2341994047164917, - "logps/rejected": -1.3743019104003906, - "loss": 1.0644, + "grad_norm": 7.132584817326348, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.1200329065322876, + "logits/rejected": -0.011905002407729626, + "logps/chosen": -1.2903319597244263, + "logps/rejected": -1.417377233505249, + "loss": 1.0801, "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.2341994047164917, - "rewards/margins": 0.14010249078273773, - "rewards/rejected": -1.3743019104003906, - "sft_loss": 1.2724642753601074, + "rewards/chosen": -1.2903319597244263, + "rewards/margins": 0.12704533338546753, + "rewards/rejected": -1.417377233505249, + "sft_loss": 1.3000907897949219, "step": 190 }, { "epoch": 0.10436527847466132, - "grad_norm": 5.035842365815689, - "learning_rate": 1.0427807486631017e-06, - "logits/chosen": 0.018078740686178207, - "logits/rejected": 0.16420726478099823, - "logps/chosen": -1.229421854019165, - "logps/rejected": -1.388828992843628, - "loss": 1.052, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.229421854019165, - "rewards/margins": 0.15940706431865692, - "rewards/rejected": -1.388828992843628, - "sft_loss": 1.2792654037475586, + "grad_norm": 5.467117063832052, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": -0.04079444706439972, + "logits/rejected": 0.10692572593688965, + "logps/chosen": -1.2521560192108154, + "logps/rejected": -1.4163955450057983, + "loss": 1.0558, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2521560192108154, + "rewards/margins": 0.16423942148685455, + "rewards/rejected": -1.4163955450057983, + "sft_loss": 1.2910324335098267, "step": 195 }, { "epoch": 0.1070413112560629, - "grad_norm": 15.548912349802093, - "learning_rate": 1.0695187165775401e-06, - "logits/chosen": -0.07511644065380096, - "logits/rejected": 0.052346598356962204, - "logps/chosen": -1.3431646823883057, - "logps/rejected": -1.3926780223846436, - "loss": 1.1349, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3431646823883057, - "rewards/margins": 0.04951336607336998, - "rewards/rejected": -1.3926780223846436, - "sft_loss": 1.3756563663482666, + "grad_norm": 16.412975138686853, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.1568303108215332, + "logits/rejected": -0.029046082869172096, + "logps/chosen": -1.3794472217559814, + "logps/rejected": -1.4154322147369385, + "loss": 1.1498, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3794472217559814, + "rewards/margins": 0.03598495572805405, + "rewards/rejected": -1.4154322147369385, + "sft_loss": 1.3919644355773926, "step": 200 }, { "epoch": 0.10971734403746446, - "grad_norm": 8.522924881240787, - "learning_rate": 1.0962566844919785e-06, - "logits/chosen": -0.06255607306957245, - "logits/rejected": 0.07017968595027924, - "logps/chosen": -1.2415074110031128, - "logps/rejected": -1.3377946615219116, - "loss": 1.0811, + "grad_norm": 11.53610009270648, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": -0.094792440533638, + "logits/rejected": 0.04317759722471237, + "logps/chosen": -1.2949237823486328, + "logps/rejected": -1.359531044960022, + "loss": 1.1074, "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2415074110031128, - "rewards/margins": 0.09628725051879883, - "rewards/rejected": -1.3377946615219116, - "sft_loss": 1.2635818719863892, + "rewards/chosen": -1.2949237823486328, + "rewards/margins": 0.0646071583032608, + "rewards/rejected": -1.359531044960022, + "sft_loss": 1.2842977046966553, "step": 205 }, { "epoch": 0.11239337681886603, - "grad_norm": 7.031854777872283, - "learning_rate": 1.1229946524064172e-06, - "logits/chosen": -0.12123604863882065, - "logits/rejected": 0.0530204176902771, - "logps/chosen": -1.3264672756195068, - "logps/rejected": -1.447780966758728, - "loss": 1.1011, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3264672756195068, - "rewards/margins": 0.12131373584270477, - "rewards/rejected": -1.447780966758728, - "sft_loss": 1.3216352462768555, + "grad_norm": 7.887283586640461, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.19363883137702942, + "logits/rejected": -0.016594117507338524, + "logps/chosen": -1.3659045696258545, + "logps/rejected": -1.4803597927093506, + "loss": 1.1136, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3659045696258545, + "rewards/margins": 0.11445526778697968, + "rewards/rejected": -1.4803597927093506, + "sft_loss": 1.338024377822876, "step": 210 }, { "epoch": 0.1150694096002676, - "grad_norm": 6.236274148718416, - "learning_rate": 1.1497326203208556e-06, - "logits/chosen": -0.15441572666168213, - "logits/rejected": 0.07962942123413086, - "logps/chosen": -1.3619040250778198, - "logps/rejected": -1.4223322868347168, - "loss": 1.1135, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3619040250778198, - "rewards/margins": 0.06042822077870369, - "rewards/rejected": -1.4223322868347168, - "sft_loss": 1.3458638191223145, + "grad_norm": 6.085333351790094, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.2194538116455078, + "logits/rejected": 0.015209652483463287, + "logps/chosen": -1.3826278448104858, + "logps/rejected": -1.4374979734420776, + "loss": 1.1212, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3826278448104858, + "rewards/margins": 0.054870136082172394, + "rewards/rejected": -1.4374979734420776, + "sft_loss": 1.3614327907562256, "step": 215 }, { "epoch": 0.11774544238166917, - "grad_norm": 12.60517336729982, - "learning_rate": 1.1764705882352942e-06, - "logits/chosen": 0.07616592943668365, - "logits/rejected": 0.1707703024148941, - "logps/chosen": -1.2778079509735107, - "logps/rejected": -1.4280025959014893, - "loss": 1.0803, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2778079509735107, - "rewards/margins": 0.1501947045326233, - "rewards/rejected": -1.4280025959014893, - "sft_loss": 1.3137943744659424, + "grad_norm": 12.841488245600196, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": -0.021501736715435982, + "logits/rejected": 0.066695936024189, + "logps/chosen": -1.3111062049865723, + "logps/rejected": -1.4572151899337769, + "loss": 1.0912, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3111062049865723, + "rewards/margins": 0.14610889554023743, + "rewards/rejected": -1.4572151899337769, + "sft_loss": 1.3320391178131104, "step": 220 }, { "epoch": 0.12042147516307075, - "grad_norm": 5.318042115628666, - "learning_rate": 1.2032085561497326e-06, - "logits/chosen": -0.0796518325805664, - "logits/rejected": 0.08061401546001434, - "logps/chosen": -1.2935796976089478, - "logps/rejected": -1.4250218868255615, - "loss": 1.068, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2935796976089478, - "rewards/margins": 0.13144224882125854, - "rewards/rejected": -1.4250218868255615, - "sft_loss": 1.304241418838501, + "grad_norm": 5.5322486055819775, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.13724544644355774, + "logits/rejected": 0.023908359929919243, + "logps/chosen": -1.3111350536346436, + "logps/rejected": -1.4400126934051514, + "loss": 1.0738, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3111350536346436, + "rewards/margins": 0.1288774460554123, + "rewards/rejected": -1.4400126934051514, + "sft_loss": 1.310573935508728, "step": 225 }, { "epoch": 0.12309750794447231, - "grad_norm": 5.299354677635938, - "learning_rate": 1.229946524064171e-06, - "logits/chosen": 0.005102366209030151, - "logits/rejected": 0.07590552419424057, - "logps/chosen": -1.2844891548156738, - "logps/rejected": -1.4591628313064575, - "loss": 1.0508, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.2844891548156738, - "rewards/margins": 0.17467369139194489, - "rewards/rejected": -1.4591628313064575, - "sft_loss": 1.2599900960922241, + "grad_norm": 5.527098274388295, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": -0.03876190260052681, + "logits/rejected": 0.03524729236960411, + "logps/chosen": -1.3190863132476807, + "logps/rejected": -1.4766333103179932, + "loss": 1.0667, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3190863132476807, + "rewards/margins": 0.15754687786102295, + "rewards/rejected": -1.4766333103179932, + "sft_loss": 1.2724026441574097, "step": 230 }, { "epoch": 0.1257735407258739, - "grad_norm": 6.761097593434722, - "learning_rate": 1.2566844919786097e-06, - "logits/chosen": 0.02427624724805355, - "logits/rejected": 0.1473507583141327, - "logps/chosen": -1.2645740509033203, - "logps/rejected": -1.4370596408843994, - "loss": 1.0477, + "grad_norm": 7.683355642835208, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": -0.027026275172829628, + "logits/rejected": 0.10092300176620483, + "logps/chosen": -1.2846142053604126, + "logps/rejected": -1.4518486261367798, + "loss": 1.0556, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2645740509033203, - "rewards/margins": 0.17248567938804626, - "rewards/rejected": -1.4370596408843994, - "sft_loss": 1.2697627544403076, + "rewards/chosen": -1.2846142053604126, + "rewards/margins": 0.16723443567752838, + "rewards/rejected": -1.4518486261367798, + "sft_loss": 1.2829954624176025, "step": 235 }, { "epoch": 0.12844957350727546, - "grad_norm": 4.531213132964872, - "learning_rate": 1.2834224598930481e-06, - "logits/chosen": 0.005139252170920372, - "logits/rejected": 0.1295509934425354, - "logps/chosen": -1.2738149166107178, - "logps/rejected": -1.4708013534545898, - "loss": 1.0667, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2738149166107178, - "rewards/margins": 0.19698651134967804, - "rewards/rejected": -1.4708013534545898, - "sft_loss": 1.325798511505127, + "grad_norm": 4.770573773954613, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": -0.06706438958644867, + "logits/rejected": 0.053351886570453644, + "logps/chosen": -1.3005207777023315, + "logps/rejected": -1.4855378866195679, + "loss": 1.0783, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3005207777023315, + "rewards/margins": 0.18501710891723633, + "rewards/rejected": -1.4855378866195679, + "sft_loss": 1.3445043563842773, "step": 240 }, { "epoch": 0.13112560628867703, - "grad_norm": 6.486508247077267, - "learning_rate": 1.3101604278074866e-06, - "logits/chosen": 0.03270702809095383, - "logits/rejected": 0.1433630883693695, - "logps/chosen": -1.3901463747024536, - "logps/rejected": -1.435687780380249, - "loss": 1.1513, + "grad_norm": 6.871730260983015, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": -0.004986020736396313, + "logits/rejected": 0.10724584758281708, + "logps/chosen": -1.418555498123169, + "logps/rejected": -1.4448583126068115, + "loss": 1.1693, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3901463747024536, - "rewards/margins": 0.045541636645793915, - "rewards/rejected": -1.435687780380249, - "sft_loss": 1.4123432636260986, + "rewards/chosen": -1.418555498123169, + "rewards/margins": 0.026302779093384743, + "rewards/rejected": -1.4448583126068115, + "sft_loss": 1.4301879405975342, "step": 245 }, { "epoch": 0.1338016390700786, - "grad_norm": 7.36897429327943, - "learning_rate": 1.3368983957219252e-06, - "logits/chosen": -0.04421486333012581, - "logits/rejected": 0.11287758499383926, - "logps/chosen": -1.2664507627487183, - "logps/rejected": -1.3541992902755737, - "loss": 1.1097, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.2664507627487183, - "rewards/margins": 0.08774860203266144, - "rewards/rejected": -1.3541992902755737, - "sft_loss": 1.3012326955795288, + "grad_norm": 10.203244665026718, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": -0.10470624268054962, + "logits/rejected": 0.051918040961027145, + "logps/chosen": -1.3056375980377197, + "logps/rejected": -1.3619807958602905, + "loss": 1.1313, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.3056375980377197, + "rewards/margins": 0.05634317919611931, + "rewards/rejected": -1.3619807958602905, + "sft_loss": 1.3133130073547363, "step": 250 }, { "epoch": 0.1364776718514802, - "grad_norm": 6.40818524595936, - "learning_rate": 1.3636363636363636e-06, - "logits/chosen": -0.004007840063422918, - "logits/rejected": 0.13231723010540009, - "logps/chosen": -1.2495529651641846, - "logps/rejected": -1.3691155910491943, - "loss": 1.0523, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.2495529651641846, - "rewards/margins": 0.11956258118152618, - "rewards/rejected": -1.3691155910491943, - "sft_loss": 1.2333502769470215, + "grad_norm": 6.790912025671572, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -0.055276162922382355, + "logits/rejected": 0.0787787213921547, + "logps/chosen": -1.2640888690948486, + "logps/rejected": -1.3716880083084106, + "loss": 1.0607, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2640888690948486, + "rewards/margins": 0.1075989231467247, + "rewards/rejected": -1.3716880083084106, + "sft_loss": 1.2419153451919556, "step": 255 }, { "epoch": 0.13915370463288176, - "grad_norm": 4.889337882490812, - "learning_rate": 1.390374331550802e-06, - "logits/chosen": -0.2317703664302826, - "logits/rejected": -0.12833379209041595, - "logps/chosen": -1.3310482501983643, - "logps/rejected": -1.5018017292022705, - "loss": 1.0713, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3310482501983643, - "rewards/margins": 0.1707535982131958, - "rewards/rejected": -1.5018017292022705, - "sft_loss": 1.3612117767333984, + "grad_norm": 5.636934333298813, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.2526181638240814, + "logits/rejected": -0.15046949684619904, + "logps/chosen": -1.3521802425384521, + "logps/rejected": -1.507084846496582, + "loss": 1.0814, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3521802425384521, + "rewards/margins": 0.1549045741558075, + "rewards/rejected": -1.507084846496582, + "sft_loss": 1.3677871227264404, "step": 260 }, { "epoch": 0.1418297374142833, - "grad_norm": 7.23659379779864, - "learning_rate": 1.4171122994652407e-06, - "logits/chosen": -0.11722488701343536, - "logits/rejected": -0.03951631858944893, - "logps/chosen": -1.3124138116836548, - "logps/rejected": -1.504041075706482, - "loss": 1.0828, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3124138116836548, - "rewards/margins": 0.1916274130344391, - "rewards/rejected": -1.504041075706482, - "sft_loss": 1.3750369548797607, + "grad_norm": 7.491534287261313, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": -0.13241180777549744, + "logits/rejected": -0.05078262835741043, + "logps/chosen": -1.3444370031356812, + "logps/rejected": -1.508912444114685, + "loss": 1.1041, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3444370031356812, + "rewards/margins": 0.16447539627552032, + "rewards/rejected": -1.508912444114685, + "sft_loss": 1.3960318565368652, "step": 265 }, { "epoch": 0.1445057701956849, - "grad_norm": 4.501741960021518, - "learning_rate": 1.443850267379679e-06, - "logits/chosen": -0.07672002166509628, - "logits/rejected": 0.042726390063762665, - "logps/chosen": -1.294614553451538, - "logps/rejected": -1.4077153205871582, - "loss": 1.0837, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.294614553451538, - "rewards/margins": 0.11310066282749176, - "rewards/rejected": -1.4077153205871582, - "sft_loss": 1.3220535516738892, + "grad_norm": 4.784128370751162, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": -0.12042725086212158, + "logits/rejected": 0.003054526401683688, + "logps/chosen": -1.316843032836914, + "logps/rejected": -1.417441964149475, + "loss": 1.0933, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.316843032836914, + "rewards/margins": 0.10059895366430283, + "rewards/rejected": -1.417441964149475, + "sft_loss": 1.3325092792510986, "step": 270 }, { "epoch": 0.14718180297708647, - "grad_norm": 5.164827833039285, - "learning_rate": 1.4705882352941175e-06, - "logits/chosen": -0.037459634244441986, - "logits/rejected": 0.05704887583851814, - "logps/chosen": -1.2455511093139648, - "logps/rejected": -1.4279637336730957, - "loss": 1.0358, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2455511093139648, - "rewards/margins": 0.18241265416145325, - "rewards/rejected": -1.4279637336730957, - "sft_loss": 1.2343533039093018, + "grad_norm": 5.799474251053073, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -0.07320351898670197, + "logits/rejected": 0.021167168393731117, + "logps/chosen": -1.2687400579452515, + "logps/rejected": -1.4257270097732544, + "loss": 1.0499, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2687400579452515, + "rewards/margins": 0.15698681771755219, + "rewards/rejected": -1.4257270097732544, + "sft_loss": 1.2448979616165161, "step": 275 }, { "epoch": 0.14985783575848804, - "grad_norm": 8.367094770624977, - "learning_rate": 1.4973262032085562e-06, - "logits/chosen": -0.08834396302700043, - "logits/rejected": 0.05895150825381279, - "logps/chosen": -1.295008659362793, - "logps/rejected": -1.4184646606445312, - "loss": 1.0715, + "grad_norm": 7.0387417725027275, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": -0.13529178500175476, + "logits/rejected": 0.012803696095943451, + "logps/chosen": -1.3235130310058594, + "logps/rejected": -1.4172152280807495, + "loss": 1.0932, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.295008659362793, - "rewards/margins": 0.12345600128173828, - "rewards/rejected": -1.4184646606445312, - "sft_loss": 1.2937798500061035, + "rewards/chosen": -1.3235130310058594, + "rewards/margins": 0.09370215237140656, + "rewards/rejected": -1.4172152280807495, + "sft_loss": 1.3104783296585083, "step": 280 }, { "epoch": 0.15253386853988962, - "grad_norm": 6.628270520705701, - "learning_rate": 1.5240641711229948e-06, - "logits/chosen": -0.05230358988046646, - "logits/rejected": 0.08594690263271332, - "logps/chosen": -1.3371986150741577, - "logps/rejected": -1.4414142370224, - "loss": 1.1202, + "grad_norm": 6.581168345074518, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.12943314015865326, + "logits/rejected": -0.0024739429354667664, + "logps/chosen": -1.345080852508545, + "logps/rejected": -1.4286965131759644, + "loss": 1.1339, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3371986150741577, - "rewards/margins": 0.10421568155288696, - "rewards/rejected": -1.4414142370224, - "sft_loss": 1.3829238414764404, + "rewards/chosen": -1.345080852508545, + "rewards/margins": 0.0836157277226448, + "rewards/rejected": -1.4286965131759644, + "sft_loss": 1.3899028301239014, "step": 285 }, { "epoch": 0.1552099013212912, - "grad_norm": 6.450117105702508, - "learning_rate": 1.5508021390374332e-06, - "logits/chosen": -0.11853925883769989, - "logits/rejected": 0.16345106065273285, - "logps/chosen": -1.3565114736557007, - "logps/rejected": -1.495757818222046, - "loss": 1.0876, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3565114736557007, - "rewards/margins": 0.13924629986286163, - "rewards/rejected": -1.495757818222046, - "sft_loss": 1.3547443151474, + "grad_norm": 6.873481271303318, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": -0.1648433953523636, + "logits/rejected": 0.11345580965280533, + "logps/chosen": -1.3644860982894897, + "logps/rejected": -1.484805703163147, + "loss": 1.0947, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3644860982894897, + "rewards/margins": 0.1203194409608841, + "rewards/rejected": -1.484805703163147, + "sft_loss": 1.357634425163269, "step": 290 }, { "epoch": 0.15788593410269275, - "grad_norm": 6.722988345540176, - "learning_rate": 1.5775401069518718e-06, - "logits/chosen": -0.05183395743370056, - "logits/rejected": 0.0007757678395137191, - "logps/chosen": -1.2508453130722046, - "logps/rejected": -1.4139963388442993, - "loss": 1.0458, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2508453130722046, - "rewards/margins": 0.16315098106861115, - "rewards/rejected": -1.4139963388442993, - "sft_loss": 1.265608310699463, + "grad_norm": 7.067404946544144, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": -0.10759624093770981, + "logits/rejected": -0.05163930729031563, + "logps/chosen": -1.26571524143219, + "logps/rejected": -1.4039279222488403, + "loss": 1.0593, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.26571524143219, + "rewards/margins": 0.13821277022361755, + "rewards/rejected": -1.4039279222488403, + "sft_loss": 1.271865963935852, "step": 295 }, { "epoch": 0.16056196688409433, - "grad_norm": 6.04567526816521, - "learning_rate": 1.6042780748663103e-06, - "logits/chosen": -0.08818133920431137, - "logits/rejected": 0.06808780133724213, - "logps/chosen": -1.2863214015960693, - "logps/rejected": -1.410569429397583, - "loss": 1.0916, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2863214015960693, - "rewards/margins": 0.12424807250499725, - "rewards/rejected": -1.410569429397583, - "sft_loss": 1.3566596508026123, + "grad_norm": 6.4408716649587525, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.113519586622715, + "logits/rejected": 0.045286424458026886, + "logps/chosen": -1.308065414428711, + "logps/rejected": -1.3825656175613403, + "loss": 1.1161, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.308065414428711, + "rewards/margins": 0.0745001882314682, + "rewards/rejected": -1.3825656175613403, + "sft_loss": 1.3652435541152954, "step": 300 }, { "epoch": 0.1632379996654959, - "grad_norm": 5.0046549507985745, - "learning_rate": 1.6310160427807487e-06, - "logits/chosen": -0.028544578701257706, - "logits/rejected": 0.041024815291166306, - "logps/chosen": -1.397444486618042, - "logps/rejected": -1.4231725931167603, - "loss": 1.1662, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.397444486618042, - "rewards/margins": 0.02572813630104065, - "rewards/rejected": -1.4231725931167603, - "sft_loss": 1.4036086797714233, + "grad_norm": 5.194545703203444, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": -0.07074855268001556, + "logits/rejected": -0.0032803595531731844, + "logps/chosen": -1.4063204526901245, + "logps/rejected": -1.417278528213501, + "loss": 1.1752, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4063204526901245, + "rewards/margins": 0.010958048515021801, + "rewards/rejected": -1.417278528213501, + "sft_loss": 1.4083219766616821, "step": 305 }, { "epoch": 0.16591403244689748, - "grad_norm": 6.8799911344678675, - "learning_rate": 1.6577540106951873e-06, - "logits/chosen": -0.22853362560272217, - "logits/rejected": -0.1439324915409088, - "logps/chosen": -1.362104892730713, - "logps/rejected": -1.472100019454956, - "loss": 1.1235, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.362104892730713, - "rewards/margins": 0.10999520123004913, - "rewards/rejected": -1.472100019454956, - "sft_loss": 1.3633081912994385, + "grad_norm": 7.158406715301559, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.2518317699432373, + "logits/rejected": -0.16715845465660095, + "logps/chosen": -1.3734468221664429, + "logps/rejected": -1.459455132484436, + "loss": 1.1388, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3734468221664429, + "rewards/margins": 0.08600833266973495, + "rewards/rejected": -1.459455132484436, + "sft_loss": 1.3688867092132568, "step": 310 }, { "epoch": 0.16859006522829906, - "grad_norm": 7.173718775073233, - "learning_rate": 1.6844919786096258e-06, - "logits/chosen": -0.02005874551832676, - "logits/rejected": 0.13060171902179718, - "logps/chosen": -1.3620105981826782, - "logps/rejected": -1.5362706184387207, - "loss": 1.1099, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3620105981826782, - "rewards/margins": 0.17425988614559174, - "rewards/rejected": -1.5362706184387207, - "sft_loss": 1.3757470846176147, + "grad_norm": 7.47312214042103, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.04104981943964958, + "logits/rejected": 0.11383312940597534, + "logps/chosen": -1.3609014749526978, + "logps/rejected": -1.509178876876831, + "loss": 1.1223, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3609014749526978, + "rewards/margins": 0.1482773870229721, + "rewards/rejected": -1.509178876876831, + "sft_loss": 1.3762632608413696, "step": 315 }, { "epoch": 0.1712660980097006, - "grad_norm": 6.003421500347755, - "learning_rate": 1.7112299465240644e-06, - "logits/chosen": -0.09477636963129044, - "logits/rejected": 0.03201219066977501, - "logps/chosen": -1.3208723068237305, - "logps/rejected": -1.380985975265503, - "loss": 1.1145, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3208723068237305, - "rewards/margins": 0.06011378765106201, - "rewards/rejected": -1.380985975265503, - "sft_loss": 1.3422465324401855, + "grad_norm": 5.142394838688744, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": -0.09145097434520721, + "logits/rejected": 0.036288149654865265, + "logps/chosen": -1.3178232908248901, + "logps/rejected": -1.370639443397522, + "loss": 1.1185, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3178232908248901, + "rewards/margins": 0.05281621962785721, + "rewards/rejected": -1.370639443397522, + "sft_loss": 1.3386495113372803, "step": 320 }, { "epoch": 0.17394213079110218, - "grad_norm": 5.681080389428846, - "learning_rate": 1.7379679144385028e-06, - "logits/chosen": -0.158976748585701, - "logits/rejected": -0.050321273505687714, - "logps/chosen": -1.3078538179397583, - "logps/rejected": -1.6020857095718384, - "loss": 1.0574, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3078538179397583, - "rewards/margins": 0.29423192143440247, - "rewards/rejected": -1.6020857095718384, - "sft_loss": 1.400783896446228, + "grad_norm": 6.1215749137539035, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.16615521907806396, + "logits/rejected": -0.05705835670232773, + "logps/chosen": -1.320780634880066, + "logps/rejected": -1.5688358545303345, + "loss": 1.0807, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.320780634880066, + "rewards/margins": 0.24805521965026855, + "rewards/rejected": -1.5688358545303345, + "sft_loss": 1.4028794765472412, "step": 325 }, { "epoch": 0.17661816357250376, - "grad_norm": 7.299626652835666, - "learning_rate": 1.7647058823529412e-06, - "logits/chosen": -0.05776820331811905, - "logits/rejected": 0.07368157058954239, - "logps/chosen": -1.3429508209228516, - "logps/rejected": -1.5490708351135254, - "loss": 1.0678, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3429508209228516, - "rewards/margins": 0.2061198502779007, - "rewards/rejected": -1.5490708351135254, - "sft_loss": 1.3533952236175537, + "grad_norm": 9.554114693660987, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": -0.06171814724802971, + "logits/rejected": 0.0778273418545723, + "logps/chosen": -1.3327248096466064, + "logps/rejected": -1.511650800704956, + "loss": 1.081, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3327248096466064, + "rewards/margins": 0.17892588675022125, + "rewards/rejected": -1.511650800704956, + "sft_loss": 1.3499819040298462, "step": 330 }, { "epoch": 0.17929419635390534, - "grad_norm": 8.171051466670482, - "learning_rate": 1.7914438502673799e-06, - "logits/chosen": -0.014549818821251392, - "logits/rejected": 0.07486838847398758, - "logps/chosen": -1.3574202060699463, - "logps/rejected": -1.4231749773025513, - "loss": 1.1095, + "grad_norm": 8.691211468546095, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": -0.010685861110687256, + "logits/rejected": 0.0859212800860405, + "logps/chosen": -1.3480260372161865, + "logps/rejected": -1.3847862482070923, + "loss": 1.1218, "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3574202060699463, - "rewards/margins": 0.06575469672679901, - "rewards/rejected": -1.4231749773025513, - "sft_loss": 1.3472013473510742, + "rewards/chosen": -1.3480260372161865, + "rewards/margins": 0.036760084331035614, + "rewards/rejected": -1.3847862482070923, + "sft_loss": 1.3458187580108643, "step": 335 }, { "epoch": 0.18197022913530692, - "grad_norm": 15.238594270083702, - "learning_rate": 1.8181818181818183e-06, - "logits/chosen": -0.0781969353556633, - "logits/rejected": 0.05372166633605957, - "logps/chosen": -1.4278004169464111, - "logps/rejected": -1.567711591720581, - "loss": 1.131, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.4278004169464111, - "rewards/margins": 0.13991113007068634, - "rewards/rejected": -1.567711591720581, - "sft_loss": 1.407225489616394, + "grad_norm": 8.682402624050773, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": -0.08280332386493683, + "logits/rejected": 0.05182039737701416, + "logps/chosen": -1.4081138372421265, + "logps/rejected": -1.4955675601959229, + "loss": 1.1435, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4081138372421265, + "rewards/margins": 0.08745387196540833, + "rewards/rejected": -1.4955675601959229, + "sft_loss": 1.395704746246338, "step": 340 }, { "epoch": 0.1846462619167085, - "grad_norm": 6.818258628989823, - "learning_rate": 1.8449197860962567e-06, - "logits/chosen": -0.009267864748835564, - "logits/rejected": 0.019993681460618973, - "logps/chosen": -1.33181893825531, - "logps/rejected": -1.5204057693481445, - "loss": 1.0749, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.33181893825531, - "rewards/margins": 0.18858689069747925, - "rewards/rejected": -1.5204057693481445, - "sft_loss": 1.3686116933822632, + "grad_norm": 10.09059167148764, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": -0.0009210974094457924, + "logits/rejected": 0.026766661554574966, + "logps/chosen": -1.3219306468963623, + "logps/rejected": -1.4634308815002441, + "loss": 1.0916, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3219306468963623, + "rewards/margins": 0.14150023460388184, + "rewards/rejected": -1.4634308815002441, + "sft_loss": 1.3607900142669678, "step": 345 }, { "epoch": 0.18732229469811004, - "grad_norm": 6.747081545944619, - "learning_rate": 1.8716577540106954e-06, - "logits/chosen": -0.02299405448138714, - "logits/rejected": 0.0645705908536911, - "logps/chosen": -1.2961095571517944, - "logps/rejected": -1.446307897567749, - "loss": 1.0961, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2961095571517944, - "rewards/margins": 0.15019826591014862, - "rewards/rejected": -1.446307897567749, - "sft_loss": 1.3511130809783936, + "grad_norm": 7.445334085344142, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": -0.010182015597820282, + "logits/rejected": 0.08374631404876709, + "logps/chosen": -1.2952816486358643, + "logps/rejected": -1.414016604423523, + "loss": 1.1085, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2952816486358643, + "rewards/margins": 0.11873485893011093, + "rewards/rejected": -1.414016604423523, + "sft_loss": 1.3489550352096558, "step": 350 }, { "epoch": 0.18999832747951162, - "grad_norm": 6.665538710803582, - "learning_rate": 1.8983957219251338e-06, - "logits/chosen": -0.092487633228302, - "logits/rejected": 0.1206965297460556, - "logps/chosen": -1.3767292499542236, - "logps/rejected": -1.4593746662139893, - "loss": 1.1374, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3767292499542236, - "rewards/margins": 0.0826452448964119, - "rewards/rejected": -1.4593746662139893, - "sft_loss": 1.4032217264175415, + "grad_norm": 6.852093118669004, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": -0.12995010614395142, + "logits/rejected": 0.08240822702646255, + "logps/chosen": -1.396977186203003, + "logps/rejected": -1.427973747253418, + "loss": 1.1597, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.396977186203003, + "rewards/margins": 0.03099655732512474, + "rewards/rejected": -1.427973747253418, + "sft_loss": 1.405177354812622, "step": 355 }, { "epoch": 0.1926743602609132, - "grad_norm": 6.1643171333571, - "learning_rate": 1.9251336898395724e-06, - "logits/chosen": -0.12042136490345001, - "logits/rejected": -0.049273934215307236, - "logps/chosen": -1.314653992652893, - "logps/rejected": -1.4797592163085938, - "loss": 1.0733, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.314653992652893, - "rewards/margins": 0.16510513424873352, - "rewards/rejected": -1.4797592163085938, - "sft_loss": 1.2989557981491089, + "grad_norm": 6.5468973260672305, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.10669644922018051, + "logits/rejected": -0.03266788646578789, + "logps/chosen": -1.3259212970733643, + "logps/rejected": -1.4534105062484741, + "loss": 1.0918, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3259212970733643, + "rewards/margins": 0.12748919427394867, + "rewards/rejected": -1.4534105062484741, + "sft_loss": 1.3055691719055176, "step": 360 }, { "epoch": 0.19535039304231477, - "grad_norm": 7.255399369010597, - "learning_rate": 1.951871657754011e-06, - "logits/chosen": 0.021989356726408005, - "logits/rejected": 0.09734012186527252, - "logps/chosen": -1.2959258556365967, - "logps/rejected": -1.4293915033340454, - "loss": 1.08, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2959258556365967, - "rewards/margins": 0.13346561789512634, - "rewards/rejected": -1.4293915033340454, - "sft_loss": 1.280139446258545, + "grad_norm": 7.65512352583562, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": -0.0375606045126915, + "logits/rejected": 0.038245074450969696, + "logps/chosen": -1.3043148517608643, + "logps/rejected": -1.4049714803695679, + "loss": 1.092, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3043148517608643, + "rewards/margins": 0.10065661370754242, + "rewards/rejected": -1.4049714803695679, + "sft_loss": 1.2806155681610107, "step": 365 }, { "epoch": 0.19802642582371635, - "grad_norm": 5.332067731065435, - "learning_rate": 1.9786096256684493e-06, - "logits/chosen": -0.028383517637848854, - "logits/rejected": 0.058793745934963226, - "logps/chosen": -1.282463788986206, - "logps/rejected": -1.3635833263397217, - "loss": 1.0841, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.282463788986206, - "rewards/margins": 0.08111962676048279, - "rewards/rejected": -1.3635833263397217, - "sft_loss": 1.2678134441375732, + "grad_norm": 6.490861236478968, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": -0.06274469941854477, + "logits/rejected": 0.02214394509792328, + "logps/chosen": -1.2985416650772095, + "logps/rejected": -1.3463687896728516, + "loss": 1.102, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2985416650772095, + "rewards/margins": 0.0478271022439003, + "rewards/rejected": -1.3463687896728516, + "sft_loss": 1.2697174549102783, "step": 370 }, { "epoch": 0.2007024586051179, - "grad_norm": 6.569832791382864, - "learning_rate": 2.0053475935828877e-06, - "logits/chosen": -0.07550617307424545, - "logits/rejected": 0.07492499053478241, - "logps/chosen": -1.2536544799804688, - "logps/rejected": -1.449466586112976, - "loss": 1.0603, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2536544799804688, - "rewards/margins": 0.19581225514411926, - "rewards/rejected": -1.449466586112976, - "sft_loss": 1.3272215127944946, + "grad_norm": 6.908038561608901, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.1052565723657608, + "logits/rejected": 0.04586447775363922, + "logps/chosen": -1.2791210412979126, + "logps/rejected": -1.426588535308838, + "loss": 1.0847, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2791210412979126, + "rewards/margins": 0.1474677473306656, + "rewards/rejected": -1.426588535308838, + "sft_loss": 1.3325226306915283, "step": 375 }, { "epoch": 0.20337849138651948, - "grad_norm": 5.57444327708168, - "learning_rate": 2.032085561497326e-06, - "logits/chosen": -0.05334942415356636, - "logits/rejected": 0.025158772245049477, - "logps/chosen": -1.2964035272598267, - "logps/rejected": -1.471839189529419, - "loss": 1.0621, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2964035272598267, - "rewards/margins": 0.17543578147888184, - "rewards/rejected": -1.471839189529419, - "sft_loss": 1.308597207069397, + "grad_norm": 6.034487961079161, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": -0.06363788992166519, + "logits/rejected": 0.01771625317633152, + "logps/chosen": -1.2925516366958618, + "logps/rejected": -1.4579870700836182, + "loss": 1.0643, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2925516366958618, + "rewards/margins": 0.16543535888195038, + "rewards/rejected": -1.4579870700836182, + "sft_loss": 1.303420066833496, "step": 380 }, { "epoch": 0.20605452416792105, - "grad_norm": 4.900223930336491, - "learning_rate": 2.058823529411765e-06, - "logits/chosen": -0.019271325320005417, - "logits/rejected": 0.05648297816514969, - "logps/chosen": -1.3950426578521729, - "logps/rejected": -1.4196631908416748, - "loss": 1.172, + "grad_norm": 4.691163180630138, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": -0.007407332770526409, + "logits/rejected": 0.07189084589481354, + "logps/chosen": -1.391524314880371, + "logps/rejected": -1.3882184028625488, + "loss": 1.1825, "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3950426578521729, - "rewards/margins": 0.024620627984404564, - "rewards/rejected": -1.4196631908416748, - "sft_loss": 1.4064253568649292, + "rewards/chosen": -1.391524314880371, + "rewards/margins": -0.0033058510161936283, + "rewards/rejected": -1.3882184028625488, + "sft_loss": 1.4002737998962402, "step": 385 }, { "epoch": 0.20873055694932263, - "grad_norm": 7.6217444720228045, - "learning_rate": 2.0855614973262034e-06, - "logits/chosen": 0.040967244654893875, - "logits/rejected": 0.19986467063426971, - "logps/chosen": -1.3848092555999756, - "logps/rejected": -1.4894533157348633, - "loss": 1.1273, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.3848092555999756, - "rewards/margins": 0.10464394092559814, - "rewards/rejected": -1.4894533157348633, - "sft_loss": 1.3908588886260986, + "grad_norm": 8.263656354996776, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": 0.04927052557468414, + "logits/rejected": 0.21582689881324768, + "logps/chosen": -1.387182593345642, + "logps/rejected": -1.4510324001312256, + "loss": 1.1495, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.387182593345642, + "rewards/margins": 0.06384972482919693, + "rewards/rejected": -1.4510324001312256, + "sft_loss": 1.3905909061431885, "step": 390 }, { "epoch": 0.2114065897307242, - "grad_norm": 5.889142181465144, - "learning_rate": 2.112299465240642e-06, - "logits/chosen": -0.0939127653837204, - "logits/rejected": 0.05425548553466797, - "logps/chosen": -1.337721347808838, - "logps/rejected": -1.4008605480194092, - "loss": 1.1072, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.337721347808838, - "rewards/margins": 0.06313915550708771, - "rewards/rejected": -1.4008605480194092, - "sft_loss": 1.3472613096237183, + "grad_norm": 6.478916151612064, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": -0.07751376926898956, + "logits/rejected": 0.0774260088801384, + "logps/chosen": -1.335992693901062, + "logps/rejected": -1.3548707962036133, + "loss": 1.1256, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.335992693901062, + "rewards/margins": 0.01887820102274418, + "rewards/rejected": -1.3548707962036133, + "sft_loss": 1.342871904373169, "step": 395 }, { "epoch": 0.2140826225121258, - "grad_norm": 5.840741949185277, - "learning_rate": 2.1390374331550802e-06, - "logits/chosen": 0.047120727598667145, - "logits/rejected": 0.13390924036502838, - "logps/chosen": -1.3149826526641846, - "logps/rejected": -1.4476349353790283, - "loss": 1.0742, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3149826526641846, - "rewards/margins": 0.13265222311019897, - "rewards/rejected": -1.4476349353790283, - "sft_loss": 1.3131649494171143, + "grad_norm": 5.185324220042295, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": 0.06050665304064751, + "logits/rejected": 0.153707355260849, + "logps/chosen": -1.3152250051498413, + "logps/rejected": -1.4181197881698608, + "loss": 1.09, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3152250051498413, + "rewards/margins": 0.10289473831653595, + "rewards/rejected": -1.4181197881698608, + "sft_loss": 1.3137023448944092, "step": 400 }, { "epoch": 0.2140826225121258, - "eval_logits/chosen": 0.2665702998638153, - "eval_logits/rejected": 0.3536551594734192, - "eval_logps/chosen": -1.3461052179336548, - "eval_logps/rejected": -1.5309417247772217, - "eval_loss": 1.0823242664337158, - "eval_rewards/accuracies": 0.5778931975364685, - "eval_rewards/chosen": -1.3461052179336548, - "eval_rewards/margins": 0.18483661115169525, - "eval_rewards/rejected": -1.5309417247772217, - "eval_runtime": 49.4628, - "eval_samples_per_second": 27.192, - "eval_sft_loss": 1.3695601224899292, - "eval_steps_per_second": 6.813, + "eval_logits/chosen": 0.23501165211200714, + "eval_logits/rejected": 0.32066836953163147, + "eval_logps/chosen": -1.3477087020874023, + "eval_logps/rejected": -1.4854915142059326, + "eval_loss": 1.1010371446609497, + "eval_rewards/accuracies": 0.5586053133010864, + "eval_rewards/chosen": -1.3477087020874023, + "eval_rewards/margins": 0.137783020734787, + "eval_rewards/rejected": -1.4854915142059326, + "eval_runtime": 44.0326, + "eval_samples_per_second": 30.546, + "eval_sft_loss": 1.3681285381317139, + "eval_steps_per_second": 7.653, "step": 400 }, { "epoch": 0.21675865529352734, - "grad_norm": 7.697700600455325, - "learning_rate": 2.1657754010695186e-06, - "logits/chosen": -0.018059352412819862, - "logits/rejected": 0.07431250065565109, - "logps/chosen": -1.3391938209533691, - "logps/rejected": -1.477910041809082, - "loss": 1.0935, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3391938209533691, - "rewards/margins": 0.13871631026268005, - "rewards/rejected": -1.477910041809082, - "sft_loss": 1.3267302513122559, + "grad_norm": 7.899141152347657, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": -0.014751395210623741, + "logits/rejected": 0.08054462820291519, + "logps/chosen": -1.3261982202529907, + "logps/rejected": -1.3998126983642578, + "loss": 1.1073, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3261982202529907, + "rewards/margins": 0.0736144408583641, + "rewards/rejected": -1.3998126983642578, + "sft_loss": 1.3163540363311768, "step": 405 }, { "epoch": 0.2194346880749289, - "grad_norm": 6.022975123352896, - "learning_rate": 2.192513368983957e-06, - "logits/chosen": -0.0013650401961058378, - "logits/rejected": 0.12200506776571274, - "logps/chosen": -1.2950294017791748, - "logps/rejected": -1.4449807405471802, - "loss": 1.081, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.2950294017791748, - "rewards/margins": 0.1499512493610382, - "rewards/rejected": -1.4449807405471802, - "sft_loss": 1.3392434120178223, + "grad_norm": 6.426580024778647, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": 0.010929781012237072, + "logits/rejected": 0.1424231231212616, + "logps/chosen": -1.2975648641586304, + "logps/rejected": -1.3927663564682007, + "loss": 1.1032, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2975648641586304, + "rewards/margins": 0.09520147740840912, + "rewards/rejected": -1.3927663564682007, + "sft_loss": 1.332929015159607, "step": 410 }, { "epoch": 0.2221107208563305, - "grad_norm": 4.99124806968459, - "learning_rate": 2.219251336898396e-06, - "logits/chosen": -0.009035291150212288, - "logits/rejected": 0.028119832277297974, - "logps/chosen": -1.3074760437011719, - "logps/rejected": -1.5078462362289429, - "loss": 1.0654, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3074760437011719, - "rewards/margins": 0.200370192527771, - "rewards/rejected": -1.5078462362289429, - "sft_loss": 1.312064528465271, + "grad_norm": 5.321301590410997, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": -0.05153341218829155, + "logits/rejected": -0.017596019431948662, + "logps/chosen": -1.293229341506958, + "logps/rejected": -1.460153341293335, + "loss": 1.0731, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.293229341506958, + "rewards/margins": 0.16692404448986053, + "rewards/rejected": -1.460153341293335, + "sft_loss": 1.3046073913574219, "step": 415 }, { "epoch": 0.22478675363773207, - "grad_norm": 5.36991559487342, - "learning_rate": 2.2459893048128343e-06, - "logits/chosen": -0.029295751824975014, - "logits/rejected": 0.15687605738639832, - "logps/chosen": -1.273036241531372, - "logps/rejected": -1.439571738243103, - "loss": 1.0761, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.273036241531372, - "rewards/margins": 0.16653569042682648, - "rewards/rejected": -1.439571738243103, - "sft_loss": 1.3252718448638916, + "grad_norm": 6.338567776813103, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": -0.022293150424957275, + "logits/rejected": 0.16873207688331604, + "logps/chosen": -1.2768056392669678, + "logps/rejected": -1.3799188137054443, + "loss": 1.1008, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.2768056392669678, + "rewards/margins": 0.10311311483383179, + "rewards/rejected": -1.3799188137054443, + "sft_loss": 1.3268121480941772, "step": 420 }, { "epoch": 0.22746278641913364, - "grad_norm": 6.070579691423694, - "learning_rate": 2.2727272727272728e-06, - "logits/chosen": -0.05621107667684555, - "logits/rejected": 0.13872070610523224, - "logps/chosen": -1.3344993591308594, - "logps/rejected": -1.5649782419204712, - "loss": 1.0722, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3344993591308594, - "rewards/margins": 0.23047864437103271, - "rewards/rejected": -1.5649782419204712, - "sft_loss": 1.4042408466339111, + "grad_norm": 6.0883866845079675, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": -0.07399999350309372, + "logits/rejected": 0.12590864300727844, + "logps/chosen": -1.3194478750228882, + "logps/rejected": -1.4870588779449463, + "loss": 1.0876, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3194478750228882, + "rewards/margins": 0.1676110327243805, + "rewards/rejected": -1.4870588779449463, + "sft_loss": 1.3896230459213257, "step": 425 }, { "epoch": 0.2301388192005352, - "grad_norm": 7.067992114737753, - "learning_rate": 2.299465240641711e-06, - "logits/chosen": -0.0927404910326004, - "logits/rejected": 0.10216756165027618, - "logps/chosen": -1.3587287664413452, - "logps/rejected": -1.5804948806762695, - "loss": 1.08, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3587287664413452, - "rewards/margins": 0.22176587581634521, - "rewards/rejected": -1.5804948806762695, - "sft_loss": 1.3935238122940063, + "grad_norm": 6.718130456854809, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": -0.08521527796983719, + "logits/rejected": 0.11168261617422104, + "logps/chosen": -1.3500198125839233, + "logps/rejected": -1.4981739521026611, + "loss": 1.0938, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3500198125839233, + "rewards/margins": 0.14815422892570496, + "rewards/rejected": -1.4981739521026611, + "sft_loss": 1.3889285326004028, "step": 430 }, { "epoch": 0.23281485198193677, - "grad_norm": 8.14751313426182, - "learning_rate": 2.3262032085561496e-06, - "logits/chosen": -0.02563117817044258, - "logits/rejected": 0.05861321836709976, - "logps/chosen": -1.2418124675750732, - "logps/rejected": -1.4311327934265137, - "loss": 1.0511, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2418124675750732, - "rewards/margins": 0.18932026624679565, - "rewards/rejected": -1.4311327934265137, - "sft_loss": 1.3078510761260986, + "grad_norm": 7.101305216195323, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": -0.017105095088481903, + "logits/rejected": 0.07132132351398468, + "logps/chosen": -1.2278684377670288, + "logps/rejected": -1.3633949756622314, + "loss": 1.067, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2278684377670288, + "rewards/margins": 0.13552668690681458, + "rewards/rejected": -1.3633949756622314, + "sft_loss": 1.2899867296218872, "step": 435 }, { "epoch": 0.23549088476333835, - "grad_norm": 6.318596335672091, - "learning_rate": 2.3529411764705885e-06, - "logits/chosen": -0.010552659630775452, - "logits/rejected": 0.08236038684844971, - "logps/chosen": -1.3421692848205566, - "logps/rejected": -1.4967700242996216, - "loss": 1.0794, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3421692848205566, - "rewards/margins": 0.15460090339183807, - "rewards/rejected": -1.4967700242996216, - "sft_loss": 1.345083236694336, + "grad_norm": 5.380683221006391, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": -0.015782993286848068, + "logits/rejected": 0.07443811744451523, + "logps/chosen": -1.2975280284881592, + "logps/rejected": -1.398431658744812, + "loss": 1.0852, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2975280284881592, + "rewards/margins": 0.10090353339910507, + "rewards/rejected": -1.398431658744812, + "sft_loss": 1.3160045146942139, "step": 440 }, { "epoch": 0.23816691754473993, - "grad_norm": 5.876139249639594, - "learning_rate": 2.379679144385027e-06, - "logits/chosen": -0.024869054555892944, - "logits/rejected": 0.09330997616052628, - "logps/chosen": -1.3537369966506958, - "logps/rejected": -1.5938447713851929, - "loss": 1.0821, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3537369966506958, - "rewards/margins": 0.2401077002286911, - "rewards/rejected": -1.5938447713851929, - "sft_loss": 1.3780953884124756, + "grad_norm": 5.636935957579122, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": -0.083624929189682, + "logits/rejected": 0.023563571274280548, + "logps/chosen": -1.3276898860931396, + "logps/rejected": -1.4673435688018799, + "loss": 1.1097, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3276898860931396, + "rewards/margins": 0.13965357840061188, + "rewards/rejected": -1.4673435688018799, + "sft_loss": 1.3639371395111084, "step": 445 }, { "epoch": 0.2408429503261415, - "grad_norm": 8.493429935039602, - "learning_rate": 2.4064171122994653e-06, - "logits/chosen": 0.026296118274331093, - "logits/rejected": 0.15405510365962982, - "logps/chosen": -1.351786494255066, - "logps/rejected": -1.5464465618133545, - "loss": 1.0431, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.351786494255066, - "rewards/margins": 0.19466015696525574, - "rewards/rejected": -1.5464465618133545, - "sft_loss": 1.3259353637695312, + "grad_norm": 10.335663880408164, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": -0.0073973932303488255, + "logits/rejected": 0.11964131891727448, + "logps/chosen": -1.3213504552841187, + "logps/rejected": -1.461319088935852, + "loss": 1.0537, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3213504552841187, + "rewards/margins": 0.13996846973896027, + "rewards/rejected": -1.461319088935852, + "sft_loss": 1.3012510538101196, "step": 450 }, { "epoch": 0.24351898310754308, - "grad_norm": 6.197921325965198, - "learning_rate": 2.4331550802139037e-06, - "logits/chosen": 0.012350971810519695, - "logits/rejected": 0.1070106253027916, - "logps/chosen": -1.2704529762268066, - "logps/rejected": -1.5693228244781494, - "loss": 1.0265, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2704529762268066, - "rewards/margins": 0.2988698482513428, - "rewards/rejected": -1.5693228244781494, - "sft_loss": 1.2969739437103271, + "grad_norm": 6.305064923156825, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": -0.008607283234596252, + "logits/rejected": 0.08251720666885376, + "logps/chosen": -1.2554385662078857, + "logps/rejected": -1.4584252834320068, + "loss": 1.0472, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2554385662078857, + "rewards/margins": 0.20298662781715393, + "rewards/rejected": -1.4584252834320068, + "sft_loss": 1.2757747173309326, "step": 455 }, { "epoch": 0.24619501588894463, - "grad_norm": 6.124793908509888, - "learning_rate": 2.459893048128342e-06, - "logits/chosen": -0.11806248128414154, - "logits/rejected": 0.005137929227203131, - "logps/chosen": -1.403852105140686, - "logps/rejected": -1.512068510055542, - "loss": 1.1314, + "grad_norm": 7.5266659029364975, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.1394522488117218, + "logits/rejected": -0.015918530523777008, + "logps/chosen": -1.3858789205551147, + "logps/rejected": -1.4430443048477173, + "loss": 1.1494, "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -1.403852105140686, - "rewards/margins": 0.1082165464758873, - "rewards/rejected": -1.512068510055542, - "sft_loss": 1.4219634532928467, + "rewards/chosen": -1.3858789205551147, + "rewards/margins": 0.057165395468473434, + "rewards/rejected": -1.4430443048477173, + "sft_loss": 1.4158474206924438, "step": 460 }, { "epoch": 0.2488710486703462, - "grad_norm": 6.465729425548841, - "learning_rate": 2.4866310160427806e-06, - "logits/chosen": 0.16115108132362366, - "logits/rejected": 0.18483372032642365, - "logps/chosen": -1.3084897994995117, - "logps/rejected": -1.5060487985610962, - "loss": 1.0691, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3084897994995117, - "rewards/margins": 0.19755904376506805, - "rewards/rejected": -1.5060487985610962, - "sft_loss": 1.3134046792984009, + "grad_norm": 6.809377978897288, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": 0.1192874163389206, + "logits/rejected": 0.14119035005569458, + "logps/chosen": -1.2931272983551025, + "logps/rejected": -1.4714761972427368, + "loss": 1.0697, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2931272983551025, + "rewards/margins": 0.17834879457950592, + "rewards/rejected": -1.4714761972427368, + "sft_loss": 1.297696828842163, "step": 465 }, { "epoch": 0.2515470814517478, - "grad_norm": 6.861663361054339, - "learning_rate": 2.5133689839572194e-06, - "logits/chosen": 0.16848713159561157, - "logits/rejected": 0.12264200299978256, - "logps/chosen": -1.2748721837997437, - "logps/rejected": -1.502882957458496, - "loss": 1.0426, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2748721837997437, - "rewards/margins": 0.22801072895526886, - "rewards/rejected": -1.502882957458496, - "sft_loss": 1.303835153579712, + "grad_norm": 6.671818465646382, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": 0.1525087058544159, + "logits/rejected": 0.1035248264670372, + "logps/chosen": -1.2508265972137451, + "logps/rejected": -1.4483669996261597, + "loss": 1.0476, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2508265972137451, + "rewards/margins": 0.19754032790660858, + "rewards/rejected": -1.4483669996261597, + "sft_loss": 1.2858855724334717, "step": 470 }, { "epoch": 0.25422311423314936, - "grad_norm": 5.417790338120884, - "learning_rate": 2.540106951871658e-06, - "logits/chosen": -0.02455167844891548, - "logits/rejected": 0.11675968021154404, - "logps/chosen": -1.3103151321411133, - "logps/rejected": -1.6406517028808594, - "loss": 1.0362, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3103151321411133, - "rewards/margins": 0.33033671975135803, - "rewards/rejected": -1.6406517028808594, - "sft_loss": 1.3476392030715942, + "grad_norm": 6.6227441453418665, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": -0.05956697463989258, + "logits/rejected": 0.08246360719203949, + "logps/chosen": -1.3117125034332275, + "logps/rejected": -1.5495645999908447, + "loss": 1.0497, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3117125034332275, + "rewards/margins": 0.23785214126110077, + "rewards/rejected": -1.5495645999908447, + "sft_loss": 1.341576337814331, "step": 475 }, { "epoch": 0.2568991470145509, - "grad_norm": 6.187708853753433, - "learning_rate": 2.5668449197860963e-06, - "logits/chosen": -0.023285821080207825, - "logits/rejected": 0.1779555380344391, - "logps/chosen": -1.3003066778182983, - "logps/rejected": -1.4369903802871704, - "loss": 1.0748, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3003066778182983, - "rewards/margins": 0.13668350875377655, - "rewards/rejected": -1.4369903802871704, - "sft_loss": 1.315993070602417, + "grad_norm": 6.317720549483203, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": -0.044476814568042755, + "logits/rejected": 0.15844932198524475, + "logps/chosen": -1.2904717922210693, + "logps/rejected": -1.3672595024108887, + "loss": 1.0942, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2904717922210693, + "rewards/margins": 0.07678767293691635, + "rewards/rejected": -1.3672595024108887, + "sft_loss": 1.3166091442108154, "step": 480 }, { "epoch": 0.2595751797959525, - "grad_norm": 6.949952876484532, - "learning_rate": 2.5935828877005347e-06, - "logits/chosen": 0.018455123528838158, - "logits/rejected": 0.061692021787166595, - "logps/chosen": -1.3897532224655151, - "logps/rejected": -1.5637627840042114, - "loss": 1.0926, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3897532224655151, - "rewards/margins": 0.1740095317363739, - "rewards/rejected": -1.5637627840042114, - "sft_loss": 1.3837809562683105, + "grad_norm": 8.218639401359006, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": -0.012549695558845997, + "logits/rejected": 0.025943463668227196, + "logps/chosen": -1.3858129978179932, + "logps/rejected": -1.479968786239624, + "loss": 1.123, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3858129978179932, + "rewards/margins": 0.09415578842163086, + "rewards/rejected": -1.479968786239624, + "sft_loss": 1.3809711933135986, "step": 485 }, { "epoch": 0.26225121257735406, - "grad_norm": 8.424582028696578, - "learning_rate": 2.620320855614973e-06, - "logits/chosen": 0.016538361087441444, - "logits/rejected": 0.08852274715900421, - "logps/chosen": -1.3464523553848267, - "logps/rejected": -1.488745093345642, - "loss": 1.1035, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3464523553848267, - "rewards/margins": 0.14229276776313782, - "rewards/rejected": -1.488745093345642, - "sft_loss": 1.3327219486236572, + "grad_norm": 5.9336956749373675, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": 0.02575538493692875, + "logits/rejected": 0.0991683155298233, + "logps/chosen": -1.3275907039642334, + "logps/rejected": -1.414987325668335, + "loss": 1.1151, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3275907039642334, + "rewards/margins": 0.08739662170410156, + "rewards/rejected": -1.414987325668335, + "sft_loss": 1.3196409940719604, "step": 490 }, { "epoch": 0.26492724535875567, - "grad_norm": 6.887461643958575, - "learning_rate": 2.647058823529412e-06, - "logits/chosen": -0.03392522409558296, - "logits/rejected": -0.011492741294205189, - "logps/chosen": -1.3476107120513916, - "logps/rejected": -1.490720272064209, - "loss": 1.1142, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3476107120513916, - "rewards/margins": 0.1431095004081726, - "rewards/rejected": -1.490720272064209, - "sft_loss": 1.41537344455719, + "grad_norm": 7.283706408339409, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": -0.060841191560029984, + "logits/rejected": -0.03938784822821617, + "logps/chosen": -1.3379141092300415, + "logps/rejected": -1.456209421157837, + "loss": 1.1165, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3379141092300415, + "rewards/margins": 0.11829522997140884, + "rewards/rejected": -1.456209421157837, + "sft_loss": 1.3973592519760132, "step": 495 }, { "epoch": 0.2676032781401572, - "grad_norm": 5.493118625889312, - "learning_rate": 2.6737967914438504e-06, - "logits/chosen": -0.05503328517079353, - "logits/rejected": 0.04382333159446716, - "logps/chosen": -1.243357539176941, - "logps/rejected": -1.4519065618515015, - "loss": 1.0576, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.243357539176941, - "rewards/margins": 0.2085491418838501, - "rewards/rejected": -1.4519065618515015, - "sft_loss": 1.2865723371505737, + "grad_norm": 6.058178591419342, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": -0.053350646048784256, + "logits/rejected": 0.0480460450053215, + "logps/chosen": -1.2455228567123413, + "logps/rejected": -1.4103162288665771, + "loss": 1.0704, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2455228567123413, + "rewards/margins": 0.16479340195655823, + "rewards/rejected": -1.4103162288665771, + "sft_loss": 1.2850632667541504, "step": 500 }, { "epoch": 0.27027931092155877, - "grad_norm": 7.436163935133731, - "learning_rate": 2.700534759358289e-06, - "logits/chosen": -0.06544395536184311, - "logits/rejected": 0.07414183020591736, - "logps/chosen": -1.3723136186599731, - "logps/rejected": -1.4571316242218018, - "loss": 1.1255, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3723136186599731, - "rewards/margins": 0.08481813967227936, - "rewards/rejected": -1.4571316242218018, - "sft_loss": 1.387662410736084, + "grad_norm": 8.871998038934212, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": -0.03907988965511322, + "logits/rejected": 0.11153552681207657, + "logps/chosen": -1.3700611591339111, + "logps/rejected": -1.4140938520431519, + "loss": 1.1365, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3700611591339111, + "rewards/margins": 0.044032808393239975, + "rewards/rejected": -1.4140938520431519, + "sft_loss": 1.3838356733322144, "step": 505 }, { "epoch": 0.2729553437029604, - "grad_norm": 5.367166647355689, - "learning_rate": 2.7272727272727272e-06, - "logits/chosen": 0.09557502716779709, - "logits/rejected": 0.1608404666185379, - "logps/chosen": -1.3068989515304565, - "logps/rejected": -1.560426115989685, - "loss": 1.0323, + "grad_norm": 8.170312428775517, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": 0.09565381705760956, + "logits/rejected": 0.15702112019062042, + "logps/chosen": -1.3171789646148682, + "logps/rejected": -1.503444790840149, + "loss": 1.0555, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3068989515304565, - "rewards/margins": 0.25352734327316284, - "rewards/rejected": -1.560426115989685, - "sft_loss": 1.2871196269989014, + "rewards/chosen": -1.3171789646148682, + "rewards/margins": 0.18626593053340912, + "rewards/rejected": -1.503444790840149, + "sft_loss": 1.2960052490234375, "step": 510 }, { "epoch": 0.2756313764843619, - "grad_norm": 4.8720988314465155, - "learning_rate": 2.7540106951871656e-06, - "logits/chosen": 0.06531797349452972, - "logits/rejected": 0.1588929146528244, - "logps/chosen": -1.2504026889801025, - "logps/rejected": -1.4396889209747314, - "loss": 1.0612, + "grad_norm": 4.850778687239608, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": 0.0379464253783226, + "logits/rejected": 0.13338619470596313, + "logps/chosen": -1.260594129562378, + "logps/rejected": -1.420748233795166, + "loss": 1.0678, "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2504026889801025, - "rewards/margins": 0.18928605318069458, - "rewards/rejected": -1.4396889209747314, - "sft_loss": 1.297341227531433, + "rewards/chosen": -1.260594129562378, + "rewards/margins": 0.16015416383743286, + "rewards/rejected": -1.420748233795166, + "sft_loss": 1.2914608716964722, "step": 515 }, { "epoch": 0.27830740926576353, - "grad_norm": 5.480711455776227, - "learning_rate": 2.780748663101604e-06, - "logits/chosen": -0.04449567198753357, - "logits/rejected": 0.10000330209732056, - "logps/chosen": -1.316209316253662, - "logps/rejected": -1.4778051376342773, - "loss": 1.1114, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.316209316253662, - "rewards/margins": 0.16159582138061523, - "rewards/rejected": -1.4778051376342773, - "sft_loss": 1.4180432558059692, + "grad_norm": 4.985466695799678, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": -0.09366512298583984, + "logits/rejected": 0.05123235657811165, + "logps/chosen": -1.307841181755066, + "logps/rejected": -1.4251517057418823, + "loss": 1.1218, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.307841181755066, + "rewards/margins": 0.11731058359146118, + "rewards/rejected": -1.4251517057418823, + "sft_loss": 1.4095752239227295, "step": 520 }, { "epoch": 0.2809834420471651, - "grad_norm": 11.096748515913996, - "learning_rate": 2.807486631016043e-06, - "logits/chosen": 0.13434985280036926, - "logits/rejected": 0.20518159866333008, - "logps/chosen": -1.3031270503997803, - "logps/rejected": -1.5300318002700806, - "loss": 1.0819, + "grad_norm": 11.716219695853741, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": 0.10370634496212006, + "logits/rejected": 0.1765981912612915, + "logps/chosen": -1.2913461923599243, + "logps/rejected": -1.4900809526443481, + "loss": 1.0875, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3031270503997803, - "rewards/margins": 0.22690463066101074, - "rewards/rejected": -1.5300318002700806, - "sft_loss": 1.3854058980941772, + "rewards/chosen": -1.2913461923599243, + "rewards/margins": 0.19873474538326263, + "rewards/rejected": -1.4900809526443481, + "sft_loss": 1.3755576610565186, "step": 525 }, { "epoch": 0.2836594748285666, - "grad_norm": 4.995978658701775, - "learning_rate": 2.8342245989304813e-06, - "logits/chosen": 0.08955325186252594, - "logits/rejected": 0.17318478226661682, - "logps/chosen": -1.2554371356964111, - "logps/rejected": -1.4383143186569214, - "loss": 1.043, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2554371356964111, - "rewards/margins": 0.182877317070961, - "rewards/rejected": -1.4383143186569214, - "sft_loss": 1.2290910482406616, + "grad_norm": 5.036271191010346, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": 0.06814324855804443, + "logits/rejected": 0.15410225093364716, + "logps/chosen": -1.259528398513794, + "logps/rejected": -1.3755052089691162, + "loss": 1.0642, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.259528398513794, + "rewards/margins": 0.1159767359495163, + "rewards/rejected": -1.3755052089691162, + "sft_loss": 1.227862000465393, "step": 530 }, { "epoch": 0.28633550760996823, - "grad_norm": 4.997013561916889, - "learning_rate": 2.8609625668449198e-06, - "logits/chosen": -0.07346369326114655, - "logits/rejected": 0.19550183415412903, - "logps/chosen": -1.2598817348480225, - "logps/rejected": -1.3762149810791016, - "loss": 1.0623, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.2598817348480225, - "rewards/margins": 0.11633334308862686, - "rewards/rejected": -1.3762149810791016, - "sft_loss": 1.2499682903289795, + "grad_norm": 5.019426570113224, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": -0.10004373639822006, + "logits/rejected": 0.16594137251377106, + "logps/chosen": -1.270310640335083, + "logps/rejected": -1.3529855012893677, + "loss": 1.0724, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.270310640335083, + "rewards/margins": 0.08267480880022049, + "rewards/rejected": -1.3529855012893677, + "sft_loss": 1.2502983808517456, "step": 535 }, { "epoch": 0.2890115403913698, - "grad_norm": 5.079883060726165, - "learning_rate": 2.887700534759358e-06, - "logits/chosen": 0.020293405279517174, - "logits/rejected": 0.09537501633167267, - "logps/chosen": -1.3982828855514526, - "logps/rejected": -1.543436050415039, - "loss": 1.1171, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3982828855514526, - "rewards/margins": 0.14515307545661926, - "rewards/rejected": -1.543436050415039, - "sft_loss": 1.4294731616973877, + "grad_norm": 5.393200355549483, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": 0.009550745598971844, + "logits/rejected": 0.08285556733608246, + "logps/chosen": -1.3991400003433228, + "logps/rejected": -1.479514479637146, + "loss": 1.1429, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3991400003433228, + "rewards/margins": 0.08037451654672623, + "rewards/rejected": -1.479514479637146, + "sft_loss": 1.4356224536895752, "step": 540 }, { "epoch": 0.2916875731727714, - "grad_norm": 5.485233483620466, - "learning_rate": 2.9144385026737966e-06, - "logits/chosen": -0.1151435375213623, - "logits/rejected": 0.08566157519817352, - "logps/chosen": -1.3151395320892334, - "logps/rejected": -1.521807312965393, - "loss": 1.0565, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3151395320892334, - "rewards/margins": 0.2066677063703537, - "rewards/rejected": -1.521807312965393, - "sft_loss": 1.3310606479644775, + "grad_norm": 5.7140958550538, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": -0.08716743439435959, + "logits/rejected": 0.11977878957986832, + "logps/chosen": -1.311036229133606, + "logps/rejected": -1.4368469715118408, + "loss": 1.0752, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.311036229133606, + "rewards/margins": 0.12581077218055725, + "rewards/rejected": -1.4368469715118408, + "sft_loss": 1.3234432935714722, "step": 545 }, { "epoch": 0.29436360595417294, - "grad_norm": 5.4571151840945085, - "learning_rate": 2.941176470588235e-06, - "logits/chosen": -0.010904309339821339, - "logits/rejected": 0.054650772362947464, - "logps/chosen": -1.3527603149414062, - "logps/rejected": -1.5563316345214844, - "loss": 1.055, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3527603149414062, - "rewards/margins": 0.20357127487659454, - "rewards/rejected": -1.5563316345214844, - "sft_loss": 1.315617561340332, + "grad_norm": 6.076491006812562, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": 0.0319821834564209, + "logits/rejected": 0.10077917575836182, + "logps/chosen": -1.3090689182281494, + "logps/rejected": -1.4595156908035278, + "loss": 1.06, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3090689182281494, + "rewards/margins": 0.150446817278862, + "rewards/rejected": -1.4595156908035278, + "sft_loss": 1.2954964637756348, "step": 550 }, { "epoch": 0.2970396387355745, - "grad_norm": 9.253522422552011, - "learning_rate": 2.967914438502674e-06, - "logits/chosen": -0.08174613863229752, - "logits/rejected": 0.04986164718866348, - "logps/chosen": -1.4038634300231934, - "logps/rejected": -1.5366909503936768, - "loss": 1.1312, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.4038634300231934, - "rewards/margins": 0.13282766938209534, - "rewards/rejected": -1.5366909503936768, - "sft_loss": 1.402178168296814, + "grad_norm": 8.604477479782899, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": -0.07530542463064194, + "logits/rejected": 0.05456411838531494, + "logps/chosen": -1.385756492614746, + "logps/rejected": -1.464805245399475, + "loss": 1.1424, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.385756492614746, + "rewards/margins": 0.07904873043298721, + "rewards/rejected": -1.464805245399475, + "sft_loss": 1.3918272256851196, "step": 555 }, { "epoch": 0.2997156715169761, - "grad_norm": 4.965375183552373, - "learning_rate": 2.9946524064171123e-06, - "logits/chosen": 0.052502263337373734, - "logits/rejected": 0.06982048600912094, - "logps/chosen": -1.250318169593811, - "logps/rejected": -1.4745540618896484, - "loss": 1.0707, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.250318169593811, - "rewards/margins": 0.2242358922958374, - "rewards/rejected": -1.4745540618896484, - "sft_loss": 1.3915551900863647, + "grad_norm": 6.859470538857442, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": 0.042590927332639694, + "logits/rejected": 0.05760595202445984, + "logps/chosen": -1.2400459051132202, + "logps/rejected": -1.393143653869629, + "loss": 1.0823, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2400459051132202, + "rewards/margins": 0.15309767425060272, + "rewards/rejected": -1.393143653869629, + "sft_loss": 1.3680957555770874, "step": 560 }, { "epoch": 0.30239170429837764, - "grad_norm": 4.351113289208154, - "learning_rate": 2.999995343036539e-06, - "logits/chosen": 0.03482988476753235, - "logits/rejected": 0.09091867506504059, - "logps/chosen": -1.328087568283081, - "logps/rejected": -1.521519422531128, - "loss": 1.0806, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.328087568283081, - "rewards/margins": 0.1934318095445633, - "rewards/rejected": -1.521519422531128, - "sft_loss": 1.3730275630950928, + "grad_norm": 5.010480197078524, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": -0.0016426980728283525, + "logits/rejected": 0.0538158118724823, + "logps/chosen": -1.3501454591751099, + "logps/rejected": -1.484586238861084, + "loss": 1.1061, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3501454591751099, + "rewards/margins": 0.13444092869758606, + "rewards/rejected": -1.484586238861084, + "sft_loss": 1.3812997341156006, "step": 565 }, { "epoch": 0.30506773707977924, - "grad_norm": 6.469054086250016, - "learning_rate": 2.9999764241720397e-06, - "logits/chosen": -0.03793569654226303, - "logits/rejected": 0.19039729237556458, - "logps/chosen": -1.3180876970291138, - "logps/rejected": -1.466033697128296, - "loss": 1.1138, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3180876970291138, - "rewards/margins": 0.14794600009918213, - "rewards/rejected": -1.466033697128296, - "sft_loss": 1.4070537090301514, + "grad_norm": 7.232570424800202, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": -0.06226827949285507, + "logits/rejected": 0.1579519659280777, + "logps/chosen": -1.319981336593628, + "logps/rejected": -1.4229294061660767, + "loss": 1.1295, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.319981336593628, + "rewards/margins": 0.10294802486896515, + "rewards/rejected": -1.4229294061660767, + "sft_loss": 1.4071719646453857, "step": 570 }, { "epoch": 0.3077437698611808, - "grad_norm": 5.026428226754346, - "learning_rate": 2.9999429525296936e-06, - "logits/chosen": -0.01073513738811016, - "logits/rejected": 0.049251411110162735, - "logps/chosen": -1.2521588802337646, - "logps/rejected": -1.4337341785430908, - "loss": 1.0529, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2521588802337646, - "rewards/margins": 0.18157517910003662, - "rewards/rejected": -1.4337341785430908, - "sft_loss": 1.27958083152771, + "grad_norm": 5.655540568810285, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": -0.046928636729717255, + "logits/rejected": 0.013672498986124992, + "logps/chosen": -1.2534117698669434, + "logps/rejected": -1.3882801532745361, + "loss": 1.063, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2534117698669434, + "rewards/margins": 0.13486838340759277, + "rewards/rejected": -1.3882801532745361, + "sft_loss": 1.2757145166397095, "step": 575 }, { "epoch": 0.3104198026425824, - "grad_norm": 4.571129756220789, - "learning_rate": 2.9998949284342434e-06, - "logits/chosen": -0.06468029320240021, - "logits/rejected": 0.08895576000213623, - "logps/chosen": -1.2564995288848877, - "logps/rejected": -1.5854532718658447, - "loss": 1.0069, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2564995288848877, - "rewards/margins": 0.32895392179489136, - "rewards/rejected": -1.5854532718658447, - "sft_loss": 1.2941913604736328, + "grad_norm": 5.707833022797435, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": -0.049519095569849014, + "logits/rejected": 0.10778944194316864, + "logps/chosen": -1.2623019218444824, + "logps/rejected": -1.459825038909912, + "loss": 1.0382, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2623019218444824, + "rewards/margins": 0.19752296805381775, + "rewards/rejected": -1.459825038909912, + "sft_loss": 1.285144567489624, "step": 580 }, { "epoch": 0.31309583542398395, - "grad_norm": 6.512823438890916, - "learning_rate": 2.99983235235162e-06, - "logits/chosen": -0.13302397727966309, - "logits/rejected": -0.03166192024946213, - "logps/chosen": -1.4814156293869019, - "logps/rejected": -1.6018117666244507, - "loss": 1.1668, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.4814156293869019, - "rewards/margins": 0.12039615958929062, - "rewards/rejected": -1.6018117666244507, - "sft_loss": 1.4760197401046753, + "grad_norm": 6.503149678617331, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": -0.10968382656574249, + "logits/rejected": -0.006436157040297985, + "logps/chosen": -1.4161120653152466, + "logps/rejected": -1.4973254203796387, + "loss": 1.1564, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4161120653152466, + "rewards/margins": 0.08121319115161896, + "rewards/rejected": -1.4973254203796387, + "sft_loss": 1.4389655590057373, "step": 585 }, { "epoch": 0.3157718682053855, - "grad_norm": 7.017628351517967, - "learning_rate": 2.999755224888935e-06, - "logits/chosen": -0.0977492704987526, - "logits/rejected": 0.015863103792071342, - "logps/chosen": -1.3981412649154663, - "logps/rejected": -1.4746181964874268, - "loss": 1.1421, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3981412649154663, - "rewards/margins": 0.07647692412137985, - "rewards/rejected": -1.4746181964874268, - "sft_loss": 1.4174917936325073, + "grad_norm": 8.15081653817286, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": -0.049354761838912964, + "logits/rejected": 0.0784081220626831, + "logps/chosen": -1.3722199201583862, + "logps/rejected": -1.419933557510376, + "loss": 1.1471, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3722199201583862, + "rewards/margins": 0.04771358519792557, + "rewards/rejected": -1.419933557510376, + "sft_loss": 1.4077204465866089, "step": 590 }, { "epoch": 0.3184479009867871, - "grad_norm": 5.609570856110609, - "learning_rate": 2.9996635467944813e-06, - "logits/chosen": -0.032395754009485245, - "logits/rejected": 0.09316650032997131, - "logps/chosen": -1.2944605350494385, - "logps/rejected": -1.4776843786239624, - "loss": 1.0656, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.2944605350494385, - "rewards/margins": 0.18322405219078064, - "rewards/rejected": -1.4776843786239624, - "sft_loss": 1.3147443532943726, + "grad_norm": 6.3894639571813, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": 0.009565544314682484, + "logits/rejected": 0.14058226346969604, + "logps/chosen": -1.2977478504180908, + "logps/rejected": -1.4063743352890015, + "loss": 1.0868, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2977478504180908, + "rewards/margins": 0.108626589179039, + "rewards/rejected": -1.4063743352890015, + "sft_loss": 1.3116247653961182, "step": 595 }, { "epoch": 0.32112393376818865, - "grad_norm": 4.871214261969739, - "learning_rate": 2.999557318957719e-06, - "logits/chosen": -0.11450288444757462, - "logits/rejected": 0.02715505287051201, - "logps/chosen": -1.3029778003692627, - "logps/rejected": -1.3981813192367554, - "loss": 1.1033, - "rewards/accuracies": 0.5, - "rewards/chosen": -1.3029778003692627, - "rewards/margins": 0.09520343691110611, - "rewards/rejected": -1.3981813192367554, - "sft_loss": 1.3368868827819824, + "grad_norm": 5.1902933563226625, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": -0.07538153231143951, + "logits/rejected": 0.0692978948354721, + "logps/chosen": -1.2987381219863892, + "logps/rejected": -1.34641432762146, + "loss": 1.1182, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.2987381219863892, + "rewards/margins": 0.0476762130856514, + "rewards/rejected": -1.34641432762146, + "sft_loss": 1.3346712589263916, "step": 600 }, { "epoch": 0.32379996654959026, - "grad_norm": 7.595463039232994, - "learning_rate": 2.9994365424092717e-06, - "logits/chosen": -0.15079785883426666, - "logits/rejected": -0.06511592119932175, - "logps/chosen": -1.382546067237854, - "logps/rejected": -1.610282301902771, - "loss": 1.094, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.382546067237854, - "rewards/margins": 0.22773627936840057, - "rewards/rejected": -1.610282301902771, - "sft_loss": 1.4123454093933105, + "grad_norm": 6.027709998924295, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": -0.08905048668384552, + "logits/rejected": 0.002031295094639063, + "logps/chosen": -1.366342306137085, + "logps/rejected": -1.5508638620376587, + "loss": 1.101, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.366342306137085, + "rewards/margins": 0.1845216453075409, + "rewards/rejected": -1.5508638620376587, + "sft_loss": 1.396206021308899, "step": 605 }, { "epoch": 0.3264759993309918, - "grad_norm": 12.103020392515973, - "learning_rate": 2.9993012183209135e-06, - "logits/chosen": -0.005148774944245815, - "logits/rejected": 0.14879630506038666, - "logps/chosen": -1.3624019622802734, - "logps/rejected": -1.522040843963623, - "loss": 1.1099, + "grad_norm": 15.537170016358042, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": 0.04228251427412033, + "logits/rejected": 0.19599834084510803, + "logps/chosen": -1.3465120792388916, + "logps/rejected": -1.4623966217041016, + "loss": 1.1151, "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3624019622802734, - "rewards/margins": 0.15963879227638245, - "rewards/rejected": -1.522040843963623, - "sft_loss": 1.362701177597046, + "rewards/chosen": -1.3465120792388916, + "rewards/margins": 0.1158844456076622, + "rewards/rejected": -1.4623966217041016, + "sft_loss": 1.3602538108825684, "step": 610 }, { "epoch": 0.32915203211239336, - "grad_norm": 6.393010819793559, - "learning_rate": 2.9991513480055592e-06, - "logits/chosen": -0.11131584644317627, - "logits/rejected": -0.004262803588062525, - "logps/chosen": -1.351694107055664, - "logps/rejected": -1.6122684478759766, - "loss": 1.0688, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.351694107055664, - "rewards/margins": 0.2605743110179901, - "rewards/rejected": -1.6122684478759766, - "sft_loss": 1.374330997467041, + "grad_norm": 5.428017296314078, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": -0.07138704508543015, + "logits/rejected": 0.03556728735566139, + "logps/chosen": -1.3303929567337036, + "logps/rejected": -1.5150206089019775, + "loss": 1.0923, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3303929567337036, + "rewards/margins": 0.1846274882555008, + "rewards/rejected": -1.5150206089019775, + "sft_loss": 1.3598021268844604, "step": 615 }, { "epoch": 0.33182806489379496, - "grad_norm": 5.351828483818866, - "learning_rate": 2.998986932917252e-06, - "logits/chosen": 0.040585193783044815, - "logits/rejected": 0.10566142946481705, - "logps/chosen": -1.4071991443634033, - "logps/rejected": -1.579245924949646, - "loss": 1.1081, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.4071991443634033, - "rewards/margins": 0.1720467507839203, - "rewards/rejected": -1.579245924949646, - "sft_loss": 1.405289888381958, + "grad_norm": 5.632474227462753, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": 0.028153136372566223, + "logits/rejected": 0.09476649016141891, + "logps/chosen": -1.401899814605713, + "logps/rejected": -1.5244743824005127, + "loss": 1.1213, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.401899814605713, + "rewards/margins": 0.12257473170757294, + "rewards/rejected": -1.5244743824005127, + "sft_loss": 1.391305685043335, "step": 620 }, { "epoch": 0.3345040976751965, - "grad_norm": 6.112200213434097, - "learning_rate": 2.998807974651147e-06, - "logits/chosen": 0.009924227371811867, - "logits/rejected": 0.11972503364086151, - "logps/chosen": -1.3159351348876953, - "logps/rejected": -1.59053635597229, - "loss": 1.0489, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3159351348876953, - "rewards/margins": 0.27460095286369324, - "rewards/rejected": -1.59053635597229, - "sft_loss": 1.3495194911956787, + "grad_norm": 7.736028855117305, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": 0.03588408976793289, + "logits/rejected": 0.15140631794929504, + "logps/chosen": -1.3081789016723633, + "logps/rejected": -1.4813811779022217, + "loss": 1.0652, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3081789016723633, + "rewards/margins": 0.17320233583450317, + "rewards/rejected": -1.4813811779022217, + "sft_loss": 1.338536024093628, "step": 625 }, { "epoch": 0.3371801304565981, - "grad_norm": 14.44182735717814, - "learning_rate": 2.9986144749434987e-06, - "logits/chosen": -0.06408433616161346, - "logits/rejected": 0.04238145425915718, - "logps/chosen": -1.369593858718872, - "logps/rejected": -1.6089508533477783, - "loss": 1.0433, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.369593858718872, - "rewards/margins": 0.23935675621032715, - "rewards/rejected": -1.6089508533477783, - "sft_loss": 1.3346786499023438, + "grad_norm": 6.92638354225366, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": -0.039926640689373016, + "logits/rejected": 0.0706775039434433, + "logps/chosen": -1.3458665609359741, + "logps/rejected": -1.5222132205963135, + "loss": 1.0572, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3458665609359741, + "rewards/margins": 0.17634673416614532, + "rewards/rejected": -1.5222132205963135, + "sft_loss": 1.3203465938568115, "step": 630 }, { "epoch": 0.33985616323799966, - "grad_norm": 6.096249152789308, - "learning_rate": 2.9984064356716413e-06, - "logits/chosen": -0.08199284970760345, - "logits/rejected": 0.1549263298511505, - "logps/chosen": -1.434735894203186, - "logps/rejected": -1.5937846899032593, - "loss": 1.1334, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.434735894203186, - "rewards/margins": 0.15904894471168518, - "rewards/rejected": -1.5937846899032593, - "sft_loss": 1.4382418394088745, + "grad_norm": 4.937948526823435, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": -0.02016545459628105, + "logits/rejected": 0.22592997550964355, + "logps/chosen": -1.4141805171966553, + "logps/rejected": -1.503955602645874, + "loss": 1.1504, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.4141805171966553, + "rewards/margins": 0.08977502584457397, + "rewards/rejected": -1.503955602645874, + "sft_loss": 1.4209810495376587, "step": 635 }, { "epoch": 0.3425321960194012, - "grad_norm": 9.288480097861305, - "learning_rate": 2.998183858853974e-06, - "logits/chosen": -0.17539802193641663, - "logits/rejected": 0.019877593964338303, - "logps/chosen": -1.3679062128067017, - "logps/rejected": -1.5570322275161743, - "loss": 1.1068, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3679062128067017, - "rewards/margins": 0.1891259253025055, - "rewards/rejected": -1.5570322275161743, - "sft_loss": 1.4440988302230835, + "grad_norm": 21.163919428320348, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": -0.11903375387191772, + "logits/rejected": 0.08460259437561035, + "logps/chosen": -1.3530133962631226, + "logps/rejected": -1.4804702997207642, + "loss": 1.1239, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3530133962631226, + "rewards/margins": 0.12745679914951324, + "rewards/rejected": -1.4804702997207642, + "sft_loss": 1.4217400550842285, "step": 640 }, { "epoch": 0.3452082288008028, - "grad_norm": 6.120276977609556, - "learning_rate": 2.997946746649937e-06, - "logits/chosen": -0.17709307372570038, - "logits/rejected": -0.09089900553226471, - "logps/chosen": -1.2848079204559326, - "logps/rejected": -1.5811681747436523, - "loss": 1.0051, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2848079204559326, - "rewards/margins": 0.29636019468307495, - "rewards/rejected": -1.5811681747436523, - "sft_loss": 1.2749412059783936, + "grad_norm": 6.355009326849635, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": -0.09540441632270813, + "logits/rejected": -0.0039695026353001595, + "logps/chosen": -1.2626725435256958, + "logps/rejected": -1.4676183462142944, + "loss": 1.0236, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2626725435256958, + "rewards/margins": 0.20494571328163147, + "rewards/rejected": -1.4676183462142944, + "sft_loss": 1.256805658340454, "step": 645 }, { "epoch": 0.34788426158220437, - "grad_norm": 8.751065928216027, - "learning_rate": 2.997695101359994e-06, - "logits/chosen": -0.12560425698757172, - "logits/rejected": 0.016330739483237267, - "logps/chosen": -1.4288761615753174, - "logps/rejected": -1.6702779531478882, - "loss": 1.0873, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.4288761615753174, - "rewards/margins": 0.24140167236328125, - "rewards/rejected": -1.6702779531478882, - "sft_loss": 1.4446970224380493, + "grad_norm": 8.182729549160133, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": -0.03559732064604759, + "logits/rejected": 0.10734357684850693, + "logps/chosen": -1.4046621322631836, + "logps/rejected": -1.5828628540039062, + "loss": 1.0986, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4046621322631836, + "rewards/margins": 0.17820079624652863, + "rewards/rejected": -1.5828628540039062, + "sft_loss": 1.4164342880249023, "step": 650 }, { "epoch": 0.350560294363606, - "grad_norm": 7.261081306992401, - "learning_rate": 2.997428925425609e-06, - "logits/chosen": -0.03356175869703293, - "logits/rejected": -0.024207763373851776, - "logps/chosen": -1.3508546352386475, - "logps/rejected": -1.6246044635772705, - "loss": 1.076, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3508546352386475, - "rewards/margins": 0.2737496495246887, - "rewards/rejected": -1.6246044635772705, - "sft_loss": 1.3727697134017944, + "grad_norm": 7.017317393413723, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": 0.0322679728269577, + "logits/rejected": 0.0426609069108963, + "logps/chosen": -1.3416144847869873, + "logps/rejected": -1.5490596294403076, + "loss": 1.0927, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3416144847869873, + "rewards/margins": 0.2074451446533203, + "rewards/rejected": -1.5490596294403076, + "sft_loss": 1.353048324584961, "step": 655 }, { "epoch": 0.3532363271450075, - "grad_norm": 6.460018434735591, - "learning_rate": 2.997148221429223e-06, - "logits/chosen": -0.09538416564464569, - "logits/rejected": 0.02857859805226326, - "logps/chosen": -1.3230699300765991, - "logps/rejected": -1.446380615234375, - "loss": 1.103, + "grad_norm": 5.997866167672772, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": 0.010479466058313847, + "logits/rejected": 0.14654429256916046, + "logps/chosen": -1.3053934574127197, + "logps/rejected": -1.3830502033233643, + "loss": 1.1136, "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3230699300765991, - "rewards/margins": 0.12331060320138931, - "rewards/rejected": -1.446380615234375, - "sft_loss": 1.3620294332504272, + "rewards/chosen": -1.3053934574127197, + "rewards/margins": 0.0776568129658699, + "rewards/rejected": -1.3830502033233643, + "sft_loss": 1.3474743366241455, "step": 660 }, { "epoch": 0.35591235992640907, - "grad_norm": 4.745852200616508, - "learning_rate": 2.996852992094225e-06, - "logits/chosen": -0.1095251813530922, - "logits/rejected": 0.02362101897597313, - "logps/chosen": -1.2727789878845215, - "logps/rejected": -1.4820854663848877, - "loss": 1.063, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2727789878845215, - "rewards/margins": 0.20930643379688263, - "rewards/rejected": -1.4820854663848877, - "sft_loss": 1.3236764669418335, + "grad_norm": 7.708771512424861, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": -0.005743196699768305, + "logits/rejected": 0.13655301928520203, + "logps/chosen": -1.2623566389083862, + "logps/rejected": -1.422377347946167, + "loss": 1.0684, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2623566389083862, + "rewards/margins": 0.1600208282470703, + "rewards/rejected": -1.422377347946167, + "sft_loss": 1.3066545724868774, "step": 665 }, { "epoch": 0.3585883927078107, - "grad_norm": 4.751418020702247, - "learning_rate": 2.9965432402849336e-06, - "logits/chosen": -0.10184980928897858, - "logits/rejected": 0.11156318336725235, - "logps/chosen": -1.302096962928772, - "logps/rejected": -1.4565203189849854, - "loss": 1.0951, + "grad_norm": 5.59439241677133, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": -0.028394797816872597, + "logits/rejected": 0.18746501207351685, + "logps/chosen": -1.2996008396148682, + "logps/rejected": -1.3868398666381836, + "loss": 1.1132, "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": -1.302096962928772, - "rewards/margins": 0.154423326253891, - "rewards/rejected": -1.4565203189849854, - "sft_loss": 1.4031466245651245, + "rewards/chosen": -1.2996008396148682, + "rewards/margins": 0.08723914623260498, + "rewards/rejected": -1.3868398666381836, + "sft_loss": 1.3947397470474243, "step": 670 }, { "epoch": 0.3612644254892122, - "grad_norm": 5.594116499670167, - "learning_rate": 2.9962189690065614e-06, - "logits/chosen": -0.15128448605537415, - "logits/rejected": -0.07776673883199692, - "logps/chosen": -1.3057724237442017, - "logps/rejected": -1.6141293048858643, - "loss": 1.0376, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3057724237442017, - "rewards/margins": 0.3083568811416626, - "rewards/rejected": -1.6141293048858643, - "sft_loss": 1.3583399057388306, + "grad_norm": 6.069672629787288, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": -0.04292871430516243, + "logits/rejected": 0.040585361421108246, + "logps/chosen": -1.2948095798492432, + "logps/rejected": -1.5414516925811768, + "loss": 1.0479, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2948095798492432, + "rewards/margins": 0.2466420829296112, + "rewards/rejected": -1.5414516925811768, + "sft_loss": 1.345273733139038, "step": 675 }, { "epoch": 0.36394045827061383, - "grad_norm": 5.211897854634603, - "learning_rate": 2.99588018140519e-06, - "logits/chosen": -0.07172641158103943, - "logits/rejected": 0.09021764993667603, - "logps/chosen": -1.3617281913757324, - "logps/rejected": -1.54132080078125, - "loss": 1.1241, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.3617281913757324, - "rewards/margins": 0.17959263920783997, - "rewards/rejected": -1.54132080078125, - "sft_loss": 1.3477673530578613, + "grad_norm": 5.583661151396963, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": 0.08234535157680511, + "logits/rejected": 0.25308576226234436, + "logps/chosen": -1.3450154066085815, + "logps/rejected": -1.4483263492584229, + "loss": 1.1309, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3450154066085815, + "rewards/margins": 0.10331089794635773, + "rewards/rejected": -1.4483263492584229, + "sft_loss": 1.327911138534546, "step": 680 }, { "epoch": 0.3666164910520154, - "grad_norm": 15.889378620354021, - "learning_rate": 2.995526880767737e-06, - "logits/chosen": -0.11468782275915146, - "logits/rejected": 0.03871222585439682, - "logps/chosen": -1.3583406209945679, - "logps/rejected": -1.5525429248809814, - "loss": 1.0944, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3583406209945679, - "rewards/margins": 0.19420206546783447, - "rewards/rejected": -1.5525429248809814, - "sft_loss": 1.3450286388397217, + "grad_norm": 9.253153511063843, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": 0.03376932814717293, + "logits/rejected": 0.2015124261379242, + "logps/chosen": -1.3188982009887695, + "logps/rejected": -1.4139798879623413, + "loss": 1.1083, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3188982009887695, + "rewards/margins": 0.0950816422700882, + "rewards/rejected": -1.4139798879623413, + "sft_loss": 1.329420804977417, "step": 685 }, { "epoch": 0.369292523833417, - "grad_norm": 6.6170605417450705, - "learning_rate": 2.9951590705219287e-06, - "logits/chosen": -0.1409483253955841, - "logits/rejected": -0.10424485057592392, - "logps/chosen": -1.3454536199569702, - "logps/rejected": -1.5386629104614258, - "loss": 1.1092, + "grad_norm": 7.192508641833526, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": 0.05541741102933884, + "logits/rejected": 0.09754420816898346, + "logps/chosen": -1.3204059600830078, + "logps/rejected": -1.4669349193572998, + "loss": 1.1117, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3454536199569702, - "rewards/margins": 0.1932092308998108, - "rewards/rejected": -1.5386629104614258, - "sft_loss": 1.4022139310836792, + "rewards/chosen": -1.3204059600830078, + "rewards/margins": 0.14652886986732483, + "rewards/rejected": -1.4669349193572998, + "sft_loss": 1.3720275163650513, "step": 690 }, { "epoch": 0.37196855661481854, - "grad_norm": 5.267932830709378, - "learning_rate": 2.99477675423626e-06, - "logits/chosen": -0.1686527580022812, - "logits/rejected": -0.08229938894510269, - "logps/chosen": -1.252075433731079, - "logps/rejected": -1.4953110218048096, - "loss": 1.0305, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.252075433731079, - "rewards/margins": 0.2432354986667633, - "rewards/rejected": -1.4953110218048096, - "sft_loss": 1.2790673971176147, + "grad_norm": 5.811098505616518, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": 0.01835859753191471, + "logits/rejected": 0.11538205295801163, + "logps/chosen": -1.2177239656448364, + "logps/rejected": -1.4119118452072144, + "loss": 1.037, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2177239656448364, + "rewards/margins": 0.19418802857398987, + "rewards/rejected": -1.4119118452072144, + "sft_loss": 1.260658860206604, "step": 695 }, { "epoch": 0.3746445893962201, - "grad_norm": 16.544037674536877, - "learning_rate": 2.994379935619966e-06, - "logits/chosen": -0.301688551902771, - "logits/rejected": -0.17635759711265564, - "logps/chosen": -1.4327962398529053, - "logps/rejected": -1.5365126132965088, - "loss": 1.1198, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.4327962398529053, - "rewards/margins": 0.1037164181470871, - "rewards/rejected": -1.5365126132965088, - "sft_loss": 1.404392123222351, + "grad_norm": 7.543525850507948, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": -0.09179548919200897, + "logits/rejected": 0.05748724937438965, + "logps/chosen": -1.38742995262146, + "logps/rejected": -1.4838206768035889, + "loss": 1.1139, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.38742995262146, + "rewards/margins": 0.0963907390832901, + "rewards/rejected": -1.4838206768035889, + "sft_loss": 1.3663232326507568, "step": 700 }, { "epoch": 0.3773206221776217, - "grad_norm": 5.581296428398838, - "learning_rate": 2.9939686185229826e-06, - "logits/chosen": -0.26242876052856445, - "logits/rejected": -0.0957266241312027, - "logps/chosen": -1.3624424934387207, - "logps/rejected": -1.6302438974380493, - "loss": 1.0601, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3624424934387207, - "rewards/margins": 0.2678012251853943, - "rewards/rejected": -1.6302438974380493, - "sft_loss": 1.373811960220337, + "grad_norm": 5.602787814154604, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": -0.08347173035144806, + "logits/rejected": 0.10157792270183563, + "logps/chosen": -1.3246452808380127, + "logps/rejected": -1.5341843366622925, + "loss": 1.0643, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3246452808380127, + "rewards/margins": 0.2095392644405365, + "rewards/rejected": -1.5341843366622925, + "sft_loss": 1.3606746196746826, "step": 705 }, { "epoch": 0.37999665495902324, - "grad_norm": 6.0713875990807, - "learning_rate": 2.9935428069359103e-06, - "logits/chosen": -0.16007235646247864, - "logits/rejected": -0.07162095606327057, - "logps/chosen": -1.3129180669784546, - "logps/rejected": -1.5489208698272705, - "loss": 1.0377, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3129180669784546, - "rewards/margins": 0.23600268363952637, - "rewards/rejected": -1.5489208698272705, - "sft_loss": 1.3114233016967773, + "grad_norm": 5.9317855501058885, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": 0.04531757906079292, + "logits/rejected": 0.1470317542552948, + "logps/chosen": -1.28559148311615, + "logps/rejected": -1.4513322114944458, + "loss": 1.0489, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.28559148311615, + "rewards/margins": 0.16574081778526306, + "rewards/rejected": -1.4513322114944458, + "sft_loss": 1.2866220474243164, "step": 710 }, { "epoch": 0.38267268774042484, - "grad_norm": 6.726419075457986, - "learning_rate": 2.9931025049899744e-06, - "logits/chosen": -0.22567526996135712, - "logits/rejected": -0.07417559623718262, - "logps/chosen": -1.3759498596191406, - "logps/rejected": -1.5786223411560059, - "loss": 1.0625, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3759498596191406, - "rewards/margins": 0.20267245173454285, - "rewards/rejected": -1.5786223411560059, - "sft_loss": 1.3487598896026611, + "grad_norm": 7.107217318726965, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": -0.02601141855120659, + "logits/rejected": 0.15081417560577393, + "logps/chosen": -1.3638850450515747, + "logps/rejected": -1.5114787817001343, + "loss": 1.0756, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3638850450515747, + "rewards/margins": 0.1475939303636551, + "rewards/rejected": -1.5114787817001343, + "sft_loss": 1.3407633304595947, "step": 715 }, { "epoch": 0.3853487205218264, - "grad_norm": 6.154001565581968, - "learning_rate": 2.9926477169569865e-06, - "logits/chosen": -0.1562787890434265, - "logits/rejected": 0.011226480826735497, - "logps/chosen": -1.475956678390503, - "logps/rejected": -1.6560490131378174, - "loss": 1.1454, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.475956678390503, - "rewards/margins": 0.18009230494499207, - "rewards/rejected": -1.6560490131378174, - "sft_loss": 1.4573657512664795, + "grad_norm": 6.8293223560449015, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": 0.04921600967645645, + "logits/rejected": 0.2499714344739914, + "logps/chosen": -1.4132165908813477, + "logps/rejected": -1.546648621559143, + "loss": 1.1354, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4132165908813477, + "rewards/margins": 0.13343189656734467, + "rewards/rejected": -1.546648621559143, + "sft_loss": 1.4309947490692139, "step": 720 }, { "epoch": 0.38802475330322794, - "grad_norm": 6.469702586737282, - "learning_rate": 2.9921784472493023e-06, - "logits/chosen": -0.2580137848854065, - "logits/rejected": -0.13596820831298828, - "logps/chosen": -1.2518982887268066, - "logps/rejected": -1.564278244972229, - "loss": 1.0125, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2518982887268066, - "rewards/margins": 0.3123798966407776, - "rewards/rejected": -1.564278244972229, - "sft_loss": 1.3133872747421265, + "grad_norm": 6.7983001447550375, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": -0.11556123197078705, + "logits/rejected": 0.027849048376083374, + "logps/chosen": -1.2401540279388428, + "logps/rejected": -1.535406470298767, + "loss": 1.0106, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2401540279388428, + "rewards/margins": 0.29525232315063477, + "rewards/rejected": -1.535406470298767, + "sft_loss": 1.3017752170562744, "step": 725 }, { "epoch": 0.39070078608462955, - "grad_norm": 5.696707257736793, - "learning_rate": 2.9916947004197784e-06, - "logits/chosen": -0.2973279356956482, - "logits/rejected": -0.14339035749435425, - "logps/chosen": -1.3605453968048096, - "logps/rejected": -1.5471065044403076, - "loss": 1.0808, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3605453968048096, - "rewards/margins": 0.18656139075756073, - "rewards/rejected": -1.5471065044403076, - "sft_loss": 1.3702105283737183, + "grad_norm": 6.005120314203222, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": -0.13792164623737335, + "logits/rejected": 0.031013095751404762, + "logps/chosen": -1.3492605686187744, + "logps/rejected": -1.5073153972625732, + "loss": 1.0958, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3492605686187744, + "rewards/margins": 0.1580551117658615, + "rewards/rejected": -1.5073153972625732, + "sft_loss": 1.3596335649490356, "step": 730 }, { "epoch": 0.3933768188660311, - "grad_norm": 5.510506460182416, - "learning_rate": 2.9911964811617288e-06, - "logits/chosen": -0.2812764644622803, - "logits/rejected": -0.19277769327163696, - "logps/chosen": -1.3888843059539795, - "logps/rejected": -1.5353819131851196, - "loss": 1.1113, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3888843059539795, - "rewards/margins": 0.14649756252765656, - "rewards/rejected": -1.5353819131851196, - "sft_loss": 1.4330034255981445, + "grad_norm": 5.887621096403035, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": -0.11696688830852509, + "logits/rejected": -0.009747383184731007, + "logps/chosen": -1.3687269687652588, + "logps/rejected": -1.4918612241744995, + "loss": 1.1015, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3687269687652588, + "rewards/margins": 0.12313439697027206, + "rewards/rejected": -1.4918612241744995, + "sft_loss": 1.4068576097488403, "step": 735 }, { "epoch": 0.3960528516474327, - "grad_norm": 6.748687357461805, - "learning_rate": 2.990683794308879e-06, - "logits/chosen": -0.2419544905424118, - "logits/rejected": -0.08293026685714722, - "logps/chosen": -1.46367347240448, - "logps/rejected": -1.6116256713867188, - "loss": 1.1492, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": -1.46367347240448, - "rewards/margins": 0.1479521244764328, - "rewards/rejected": -1.6116256713867188, - "sft_loss": 1.4724233150482178, + "grad_norm": 10.029945413063814, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": -0.0755590945482254, + "logits/rejected": 0.10666684806346893, + "logps/chosen": -1.448311686515808, + "logps/rejected": -1.524222493171692, + "loss": 1.1654, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.448311686515808, + "rewards/margins": 0.07591084390878677, + "rewards/rejected": -1.524222493171692, + "sft_loss": 1.454717993736267, "step": 740 }, { "epoch": 0.39872888442883425, - "grad_norm": 4.83181808694649, - "learning_rate": 2.990156644835318e-06, - "logits/chosen": -0.14327023923397064, - "logits/rejected": -0.08006924390792847, - "logps/chosen": -1.4113633632659912, - "logps/rejected": -1.674155831336975, - "loss": 1.1005, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4113633632659912, - "rewards/margins": 0.2627924978733063, - "rewards/rejected": -1.674155831336975, - "sft_loss": 1.3983561992645264, + "grad_norm": 5.7304804725359375, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": 0.06337063759565353, + "logits/rejected": 0.1517437994480133, + "logps/chosen": -1.4033968448638916, + "logps/rejected": -1.6506102085113525, + "loss": 1.1044, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4033968448638916, + "rewards/margins": 0.24721336364746094, + "rewards/rejected": -1.6506102085113525, + "sft_loss": 1.3806891441345215, "step": 745 }, { "epoch": 0.4014049172102358, - "grad_norm": 4.0797099615299, - "learning_rate": 2.989615037855454e-06, - "logits/chosen": -0.19163314998149872, - "logits/rejected": -0.045805417001247406, - "logps/chosen": -1.326471209526062, - "logps/rejected": -1.604789137840271, - "loss": 1.0458, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.326471209526062, - "rewards/margins": 0.2783178389072418, - "rewards/rejected": -1.604789137840271, - "sft_loss": 1.353487253189087, + "grad_norm": 6.054151935024956, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": -0.052485160529613495, + "logits/rejected": 0.11813749372959137, + "logps/chosen": -1.3426921367645264, + "logps/rejected": -1.5462762117385864, + "loss": 1.063, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3426921367645264, + "rewards/margins": 0.20358403027057648, + "rewards/rejected": -1.5462762117385864, + "sft_loss": 1.3526374101638794, "step": 750 }, { "epoch": 0.4040809499916374, - "grad_norm": 5.719836220088525, - "learning_rate": 2.98905897862396e-06, - "logits/chosen": -0.1349993646144867, - "logits/rejected": -0.021838178858160973, - "logps/chosen": -1.3588494062423706, - "logps/rejected": -1.4981552362442017, - "loss": 1.1149, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3588494062423706, - "rewards/margins": 0.13930585980415344, - "rewards/rejected": -1.4981552362442017, - "sft_loss": 1.3990111351013184, + "grad_norm": 5.912686110630131, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": 0.0018018543487414718, + "logits/rejected": 0.14155825972557068, + "logps/chosen": -1.3522013425827026, + "logps/rejected": -1.491640329360962, + "loss": 1.114, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3522013425827026, + "rewards/margins": 0.1394389122724533, + "rewards/rejected": -1.491640329360962, + "sft_loss": 1.3876965045928955, "step": 755 }, { "epoch": 0.40675698277303896, - "grad_norm": 4.736699163937537, - "learning_rate": 2.9884884725357237e-06, - "logits/chosen": -0.22783195972442627, - "logits/rejected": -0.16126468777656555, - "logps/chosen": -1.3684322834014893, - "logps/rejected": -1.5770494937896729, - "loss": 1.0834, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3684322834014893, - "rewards/margins": 0.20861713588237762, - "rewards/rejected": -1.5770494937896729, - "sft_loss": 1.4068214893341064, + "grad_norm": 4.741798302717173, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": -0.0866033211350441, + "logits/rejected": -0.01305533666163683, + "logps/chosen": -1.3660171031951904, + "logps/rejected": -1.5640350580215454, + "loss": 1.0843, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3660171031951904, + "rewards/margins": 0.1980178952217102, + "rewards/rejected": -1.5640350580215454, + "sft_loss": 1.3981744050979614, "step": 760 }, { "epoch": 0.40943301555444056, - "grad_norm": 5.749657872809939, - "learning_rate": 2.9879035251257994e-06, - "logits/chosen": -0.19161829352378845, - "logits/rejected": -0.1089678555727005, - "logps/chosen": -1.3445179462432861, - "logps/rejected": -1.4991188049316406, - "loss": 1.075, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3445179462432861, - "rewards/margins": 0.15460094809532166, - "rewards/rejected": -1.4991188049316406, - "sft_loss": 1.337806224822998, + "grad_norm": 6.469291565478886, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": -0.03893253952264786, + "logits/rejected": 0.05785505101084709, + "logps/chosen": -1.3518002033233643, + "logps/rejected": -1.482414960861206, + "loss": 1.0818, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3518002033233643, + "rewards/margins": 0.13061493635177612, + "rewards/rejected": -1.482414960861206, + "sft_loss": 1.3298957347869873, "step": 765 }, { "epoch": 0.4121090483358421, - "grad_norm": 6.624603107735152, - "learning_rate": 2.9873041420693485e-06, - "logits/chosen": -0.08060692250728607, - "logits/rejected": 0.04494641348719597, - "logps/chosen": -1.3140990734100342, - "logps/rejected": -1.6481949090957642, - "loss": 1.0355, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3140990734100342, - "rewards/margins": 0.33409592509269714, - "rewards/rejected": -1.6481949090957642, - "sft_loss": 1.3196513652801514, + "grad_norm": 6.762984889125461, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": 0.033855509012937546, + "logits/rejected": 0.16177025437355042, + "logps/chosen": -1.291409969329834, + "logps/rejected": -1.590272068977356, + "loss": 1.0386, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.291409969329834, + "rewards/margins": 0.2988620400428772, + "rewards/rejected": -1.590272068977356, + "sft_loss": 1.3071105480194092, "step": 770 }, { "epoch": 0.41478508111724366, - "grad_norm": 6.240389256604791, - "learning_rate": 2.9866903291815874e-06, - "logits/chosen": -0.24704810976982117, - "logits/rejected": -0.08406410366296768, - "logps/chosen": -1.3743051290512085, - "logps/rejected": -1.620736837387085, - "loss": 1.0634, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3743051290512085, - "rewards/margins": 0.24643178284168243, - "rewards/rejected": -1.620736837387085, - "sft_loss": 1.3263603448867798, + "grad_norm": 6.759201707442236, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": -0.12731292843818665, + "logits/rejected": 0.047011490911245346, + "logps/chosen": -1.3670367002487183, + "logps/rejected": -1.5798918008804321, + "loss": 1.077, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3670367002487183, + "rewards/margins": 0.21285513043403625, + "rewards/rejected": -1.5798918008804321, + "sft_loss": 1.3249647617340088, "step": 775 }, { "epoch": 0.41746111389864526, - "grad_norm": 4.075127017126476, - "learning_rate": 2.986062092417733e-06, - "logits/chosen": -0.29847627878189087, - "logits/rejected": -0.15218086540699005, - "logps/chosen": -1.3139820098876953, - "logps/rejected": -1.5687217712402344, - "loss": 1.0493, + "grad_norm": 4.474840266700944, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": -0.17214761674404144, + "logits/rejected": -0.01971476711332798, + "logps/chosen": -1.2922381162643433, + "logps/rejected": -1.5108747482299805, + "loss": 1.0514, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3139820098876953, - "rewards/margins": 0.254739910364151, - "rewards/rejected": -1.5687217712402344, - "sft_loss": 1.3553965091705322, + "rewards/chosen": -1.2922381162643433, + "rewards/margins": 0.21863672137260437, + "rewards/rejected": -1.5108747482299805, + "sft_loss": 1.3367199897766113, "step": 780 }, { "epoch": 0.4201371466800468, - "grad_norm": 5.403533929290899, - "learning_rate": 2.9854194378729402e-06, - "logits/chosen": -0.1648341715335846, - "logits/rejected": -0.03919944912195206, - "logps/chosen": -1.320778250694275, - "logps/rejected": -1.6576478481292725, - "loss": 1.0246, + "grad_norm": 5.417824675948416, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": -0.0019948245026171207, + "logits/rejected": 0.12875740230083466, + "logps/chosen": -1.2788978815078735, + "logps/rejected": -1.5393446683883667, + "loss": 1.0361, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.320778250694275, - "rewards/margins": 0.3368696868419647, - "rewards/rejected": -1.6576478481292725, - "sft_loss": 1.328687310218811, + "rewards/chosen": -1.2788978815078735, + "rewards/margins": 0.2604469656944275, + "rewards/rejected": -1.5393446683883667, + "sft_loss": 1.2987463474273682, "step": 785 }, { "epoch": 0.4228131794614484, - "grad_norm": 5.640372007436496, - "learning_rate": 2.984762371782246e-06, - "logits/chosen": -0.22557714581489563, - "logits/rejected": -0.09820972383022308, - "logps/chosen": -1.3596031665802002, - "logps/rejected": -1.6158663034439087, - "loss": 1.0578, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3596031665802002, - "rewards/margins": 0.2562631368637085, - "rewards/rejected": -1.6158663034439087, - "sft_loss": 1.3415796756744385, + "grad_norm": 6.207791305444593, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": -0.09008261561393738, + "logits/rejected": 0.040010981261730194, + "logps/chosen": -1.337257981300354, + "logps/rejected": -1.4963468313217163, + "loss": 1.0769, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.337257981300354, + "rewards/margins": 0.15908858180046082, + "rewards/rejected": -1.4963468313217163, + "sft_loss": 1.3328725099563599, "step": 790 }, { "epoch": 0.42548921224284997, - "grad_norm": 5.045707472029078, - "learning_rate": 2.9840909005205093e-06, - "logits/chosen": -0.21985206007957458, - "logits/rejected": -0.0283858273178339, - "logps/chosen": -1.3438150882720947, - "logps/rejected": -1.74857497215271, - "loss": 1.0471, + "grad_norm": 7.907547690864367, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": -0.08714650571346283, + "logits/rejected": 0.11193932592868805, + "logps/chosen": -1.3179038763046265, + "logps/rejected": -1.5680185556411743, + "loss": 1.0667, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3438150882720947, - "rewards/margins": 0.4047599732875824, - "rewards/rejected": -1.74857497215271, - "sft_loss": 1.3679378032684326, + "rewards/chosen": -1.3179038763046265, + "rewards/margins": 0.25011464953422546, + "rewards/rejected": -1.5680185556411743, + "sft_loss": 1.342376470565796, "step": 795 }, { "epoch": 0.4281652450242516, - "grad_norm": 6.148936941607535, - "learning_rate": 2.9834050306023467e-06, - "logits/chosen": -0.18061117827892303, - "logits/rejected": -0.09824011474847794, - "logps/chosen": -1.3768579959869385, - "logps/rejected": -1.6590843200683594, - "loss": 1.0513, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3768579959869385, - "rewards/margins": 0.28222647309303284, - "rewards/rejected": -1.6590843200683594, - "sft_loss": 1.3391609191894531, + "grad_norm": 5.856203248101176, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": -0.06098737567663193, + "logits/rejected": 0.02430593967437744, + "logps/chosen": -1.322656273841858, + "logps/rejected": -1.4689748287200928, + "loss": 1.0764, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.322656273841858, + "rewards/margins": 0.14631858468055725, + "rewards/rejected": -1.4689748287200928, + "sft_loss": 1.310194730758667, "step": 800 }, { "epoch": 0.4281652450242516, - "eval_logits/chosen": 0.17743077874183655, - "eval_logits/rejected": 0.2683868110179901, - "eval_logps/chosen": -1.4223155975341797, - "eval_logps/rejected": -1.7731083631515503, - "eval_loss": 1.0583148002624512, - "eval_rewards/accuracies": 0.6038575768470764, - "eval_rewards/chosen": -1.4223155975341797, - "eval_rewards/margins": 0.35079291462898254, - "eval_rewards/rejected": -1.7731083631515503, - "eval_runtime": 48.9553, - "eval_samples_per_second": 27.474, - "eval_sft_loss": 1.4055895805358887, - "eval_steps_per_second": 6.884, + "eval_logits/chosen": 0.28842443227767944, + "eval_logits/rejected": 0.38061627745628357, + "eval_logps/chosen": -1.3602670431137085, + "eval_logps/rejected": -1.5872876644134521, + "eval_loss": 1.0738680362701416, + "eval_rewards/accuracies": 0.5823442339897156, + "eval_rewards/chosen": -1.3602670431137085, + "eval_rewards/margins": 0.22702065110206604, + "eval_rewards/rejected": -1.5872876644134521, + "eval_runtime": 43.4088, + "eval_samples_per_second": 30.985, + "eval_sft_loss": 1.3759210109710693, + "eval_steps_per_second": 7.763, "step": 800 }, { "epoch": 0.4308412778056531, - "grad_norm": 8.529398356886302, - "learning_rate": 2.9827047686820714e-06, - "logits/chosen": -0.21581999957561493, - "logits/rejected": -0.050138603895902634, - "logps/chosen": -1.3935011625289917, - "logps/rejected": -1.8084440231323242, - "loss": 1.0447, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3935011625289917, - "rewards/margins": 0.41494283080101013, - "rewards/rejected": -1.8084440231323242, - "sft_loss": 1.4068191051483154, + "grad_norm": 7.53475500870321, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": -0.11175141483545303, + "logits/rejected": 0.053176987916231155, + "logps/chosen": -1.361185073852539, + "logps/rejected": -1.6331669092178345, + "loss": 1.0675, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.361185073852539, + "rewards/margins": 0.2719815969467163, + "rewards/rejected": -1.6331669092178345, + "sft_loss": 1.3797080516815186, "step": 805 }, { "epoch": 0.43351731058705467, - "grad_norm": 6.776422278764382, - "learning_rate": 2.981990121553627e-06, - "logits/chosen": -0.10206502676010132, - "logits/rejected": -0.02784993313252926, - "logps/chosen": -1.3431257009506226, - "logps/rejected": -1.7025333642959595, - "loss": 1.0352, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3431257009506226, - "rewards/margins": 0.35940781235694885, - "rewards/rejected": -1.7025333642959595, - "sft_loss": 1.349726676940918, + "grad_norm": 6.7075427581976514, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": 0.023669257760047913, + "logits/rejected": 0.10120322555303574, + "logps/chosen": -1.2819058895111084, + "logps/rejected": -1.5174692869186401, + "loss": 1.0449, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2819058895111084, + "rewards/margins": 0.23556344211101532, + "rewards/rejected": -1.5174692869186401, + "sft_loss": 1.2974836826324463, "step": 810 }, { "epoch": 0.4361933433684563, - "grad_norm": 9.001427842804857, - "learning_rate": 2.9812610961505237e-06, - "logits/chosen": -0.10953329503536224, - "logits/rejected": 0.024411043152213097, - "logps/chosen": -1.3474403619766235, - "logps/rejected": -1.7991764545440674, - "loss": 1.0292, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3474403619766235, - "rewards/margins": 0.45173612236976624, - "rewards/rejected": -1.7991764545440674, - "sft_loss": 1.387012243270874, + "grad_norm": 10.800344000340381, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": 0.029402485117316246, + "logits/rejected": 0.17515210807323456, + "logps/chosen": -1.2987323999404907, + "logps/rejected": -1.6028416156768799, + "loss": 1.0363, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2987323999404907, + "rewards/margins": 0.30410921573638916, + "rewards/rejected": -1.6028416156768799, + "sft_loss": 1.3477541208267212, "step": 815 }, { "epoch": 0.4388693761498578, - "grad_norm": 7.739663573480024, - "learning_rate": 2.980517699545769e-06, - "logits/chosen": -0.08769674599170685, - "logits/rejected": -0.04560214281082153, - "logps/chosen": -1.3872299194335938, - "logps/rejected": -1.7312628030776978, - "loss": 1.0722, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3872299194335938, - "rewards/margins": 0.34403276443481445, - "rewards/rejected": -1.7312628030776978, - "sft_loss": 1.3998725414276123, + "grad_norm": 6.970876705465844, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": 0.017118550837039948, + "logits/rejected": 0.06861446797847748, + "logps/chosen": -1.3582419157028198, + "logps/rejected": -1.6129653453826904, + "loss": 1.0714, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3582419157028198, + "rewards/margins": 0.25472337007522583, + "rewards/rejected": -1.6129653453826904, + "sft_loss": 1.3721500635147095, "step": 820 }, { "epoch": 0.44154540893125943, - "grad_norm": 6.01912269730338, - "learning_rate": 2.9797599389518003e-06, - "logits/chosen": -0.14980466663837433, - "logits/rejected": -0.01854553446173668, - "logps/chosen": -1.2706798315048218, - "logps/rejected": -1.5689154863357544, - "loss": 1.064, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.2706798315048218, - "rewards/margins": 0.29823535680770874, - "rewards/rejected": -1.5689154863357544, - "sft_loss": 1.3684272766113281, + "grad_norm": 12.073160531115478, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": -0.054477252066135406, + "logits/rejected": 0.07593884319067001, + "logps/chosen": -1.2791856527328491, + "logps/rejected": -1.4791104793548584, + "loss": 1.093, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2791856527328491, + "rewards/margins": 0.19992482662200928, + "rewards/rejected": -1.4791104793548584, + "sft_loss": 1.3659038543701172, "step": 825 }, { "epoch": 0.444221441712661, - "grad_norm": 9.561618737261167, - "learning_rate": 2.9789878217204138e-06, - "logits/chosen": -0.043389469385147095, - "logits/rejected": 0.12749220430850983, - "logps/chosen": -1.3664634227752686, - "logps/rejected": -1.5824370384216309, - "loss": 1.0751, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3664634227752686, - "rewards/margins": 0.21597354114055634, - "rewards/rejected": -1.5824370384216309, - "sft_loss": 1.3450241088867188, + "grad_norm": 6.829016103118746, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": 0.017300017178058624, + "logits/rejected": 0.18187452852725983, + "logps/chosen": -1.365114450454712, + "logps/rejected": -1.502744197845459, + "loss": 1.0911, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.365114450454712, + "rewards/margins": 0.13762979209423065, + "rewards/rejected": -1.502744197845459, + "sft_loss": 1.3418179750442505, "step": 830 }, { "epoch": 0.44689747449406253, - "grad_norm": 6.2442349386788125, - "learning_rate": 2.9782013553426944e-06, - "logits/chosen": -0.09964965283870697, - "logits/rejected": 0.03761683404445648, - "logps/chosen": -1.302367091178894, - "logps/rejected": -1.6130015850067139, - "loss": 1.0555, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.302367091178894, - "rewards/margins": 0.31063464283943176, - "rewards/rejected": -1.6130015850067139, - "sft_loss": 1.3702852725982666, + "grad_norm": 6.897677799362671, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": -0.013731849379837513, + "logits/rejected": 0.12393651157617569, + "logps/chosen": -1.2885291576385498, + "logps/rejected": -1.485176682472229, + "loss": 1.0742, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2885291576385498, + "rewards/margins": 0.19664745032787323, + "rewards/rejected": -1.485176682472229, + "sft_loss": 1.3521944284439087, "step": 835 }, { "epoch": 0.44957350727546413, - "grad_norm": 7.289038637772866, - "learning_rate": 2.977400547448942e-06, - "logits/chosen": -0.0977490097284317, - "logits/rejected": 0.06860803067684174, - "logps/chosen": -1.3717734813690186, - "logps/rejected": -1.6933990716934204, - "loss": 1.077, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3717734813690186, - "rewards/margins": 0.3216255307197571, - "rewards/rejected": -1.6933990716934204, - "sft_loss": 1.409711480140686, + "grad_norm": 5.820938820543634, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": -0.013512268662452698, + "logits/rejected": 0.14876945316791534, + "logps/chosen": -1.3282172679901123, + "logps/rejected": -1.5874078273773193, + "loss": 1.0753, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3282172679901123, + "rewards/margins": 0.25919073820114136, + "rewards/rejected": -1.5874078273773193, + "sft_loss": 1.3661084175109863, "step": 840 }, { "epoch": 0.4522495400568657, - "grad_norm": 4.052377699298299, - "learning_rate": 2.976585405808599e-06, - "logits/chosen": -0.06485694646835327, - "logits/rejected": 0.009555049240589142, - "logps/chosen": -1.3541524410247803, - "logps/rejected": -1.6053316593170166, - "loss": 1.0928, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3541524410247803, - "rewards/margins": 0.2511790692806244, - "rewards/rejected": -1.6053316593170166, - "sft_loss": 1.405263900756836, + "grad_norm": 3.816211993237183, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": 0.07689845561981201, + "logits/rejected": 0.15110646188259125, + "logps/chosen": -1.3248929977416992, + "logps/rejected": -1.463343620300293, + "loss": 1.1157, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3248929977416992, + "rewards/margins": 0.13845068216323853, + "rewards/rejected": -1.463343620300293, + "sft_loss": 1.388550043106079, "step": 845 }, { "epoch": 0.4549255728382673, - "grad_norm": 7.249188220946621, - "learning_rate": 2.9757559383301726e-06, - "logits/chosen": -0.10097716003656387, - "logits/rejected": -0.024070020765066147, - "logps/chosen": -1.398870587348938, - "logps/rejected": -1.6465431451797485, - "loss": 1.0542, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.398870587348938, - "rewards/margins": 0.24767239391803741, - "rewards/rejected": -1.6465431451797485, - "sft_loss": 1.3686355352401733, + "grad_norm": 6.669844224252639, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": 0.0023181817959994078, + "logits/rejected": 0.07454365491867065, + "logps/chosen": -1.2944786548614502, + "logps/rejected": -1.496414303779602, + "loss": 1.0347, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2944786548614502, + "rewards/margins": 0.20193564891815186, + "rewards/rejected": -1.496414303779602, + "sft_loss": 1.311168909072876, "step": 850 }, { "epoch": 0.45760160561966884, - "grad_norm": 9.224325589545659, - "learning_rate": 2.9749121530611605e-06, - "logits/chosen": -0.13083770871162415, - "logits/rejected": 0.026032855734229088, - "logps/chosen": -1.4113142490386963, - "logps/rejected": -1.7621266841888428, - "loss": 1.0958, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.4113142490386963, - "rewards/margins": 0.3508125841617584, - "rewards/rejected": -1.7621266841888428, - "sft_loss": 1.3893181085586548, + "grad_norm": 8.563766361912304, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": -0.014415117911994457, + "logits/rejected": 0.1365610510110855, + "logps/chosen": -1.3581111431121826, + "logps/rejected": -1.6337801218032837, + "loss": 1.0798, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3581111431121826, + "rewards/margins": 0.2756689190864563, + "rewards/rejected": -1.6337801218032837, + "sft_loss": 1.3649697303771973, "step": 855 }, { "epoch": 0.4602776384010704, - "grad_norm": 5.128594397616122, - "learning_rate": 2.97405405818797e-06, - "logits/chosen": -0.19242218136787415, - "logits/rejected": -0.022298630326986313, - "logps/chosen": -1.4043214321136475, - "logps/rejected": -1.729936957359314, - "loss": 1.0579, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4043214321136475, - "rewards/margins": 0.32561546564102173, - "rewards/rejected": -1.729936957359314, - "sft_loss": 1.4137206077575684, + "grad_norm": 5.764935710539851, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": -0.06975705921649933, + "logits/rejected": 0.10214301198720932, + "logps/chosen": -1.3912808895111084, + "logps/rejected": -1.6792808771133423, + "loss": 1.0662, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3912808895111084, + "rewards/margins": 0.28800004720687866, + "rewards/rejected": -1.6792808771133423, + "sft_loss": 1.4005149602890015, "step": 860 }, { "epoch": 0.462953671182472, - "grad_norm": 21.840546326122123, - "learning_rate": 2.9731816620358426e-06, - "logits/chosen": -0.10733000189065933, - "logits/rejected": 0.005105187650769949, - "logps/chosen": -1.3446229696273804, - "logps/rejected": -1.6905028820037842, - "loss": 1.0707, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3446229696273804, - "rewards/margins": 0.3458799123764038, - "rewards/rejected": -1.6905028820037842, - "sft_loss": 1.3272907733917236, + "grad_norm": 9.952313300460148, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": -0.00964144803583622, + "logits/rejected": 0.09255403280258179, + "logps/chosen": -1.3124510049819946, + "logps/rejected": -1.6167314052581787, + "loss": 1.0525, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3124510049819946, + "rewards/margins": 0.30428043007850647, + "rewards/rejected": -1.6167314052581787, + "sft_loss": 1.3130879402160645, "step": 865 }, { "epoch": 0.46562970396387354, - "grad_norm": 4.725786898369011, - "learning_rate": 2.9722949730687687e-06, - "logits/chosen": -0.22538034617900848, - "logits/rejected": 0.0403381884098053, - "logps/chosen": -1.3605945110321045, - "logps/rejected": -1.602994680404663, - "loss": 1.1037, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.3605945110321045, - "rewards/margins": 0.24240007996559143, - "rewards/rejected": -1.602994680404663, - "sft_loss": 1.4284287691116333, + "grad_norm": 9.218895866731891, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": -0.10176321119070053, + "logits/rejected": 0.16877254843711853, + "logps/chosen": -1.362540602684021, + "logps/rejected": -1.5649629831314087, + "loss": 1.1078, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.362540602684021, + "rewards/margins": 0.2024223804473877, + "rewards/rejected": -1.5649629831314087, + "sft_loss": 1.4236619472503662, "step": 870 }, { "epoch": 0.46830573674527515, - "grad_norm": 6.615571050104055, - "learning_rate": 2.9713939998894087e-06, - "logits/chosen": -0.1339145302772522, - "logits/rejected": -0.061110563576221466, - "logps/chosen": -1.3974251747131348, - "logps/rejected": -1.5865617990493774, - "loss": 1.1303, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.3974251747131348, - "rewards/margins": 0.18913669884204865, - "rewards/rejected": -1.5865617990493774, - "sft_loss": 1.3784010410308838, + "grad_norm": 6.9159090268667995, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": 0.005976744927465916, + "logits/rejected": 0.08559007197618484, + "logps/chosen": -1.3926869630813599, + "logps/rejected": -1.5787603855133057, + "loss": 1.1216, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3926869630813599, + "rewards/margins": 0.1860734224319458, + "rewards/rejected": -1.5787603855133057, + "sft_loss": 1.3701099157333374, "step": 875 }, { "epoch": 0.4709817695266767, - "grad_norm": 5.315430604545462, - "learning_rate": 2.970478751239009e-06, - "logits/chosen": -0.1227155476808548, - "logits/rejected": 0.04234758019447327, - "logps/chosen": -1.4257047176361084, - "logps/rejected": -1.658288598060608, - "loss": 1.072, + "grad_norm": 5.733344853923304, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": 0.00022365078621078283, + "logits/rejected": 0.16461063921451569, + "logps/chosen": -1.4307525157928467, + "logps/rejected": -1.6447960138320923, + "loss": 1.088, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4257047176361084, - "rewards/margins": 0.23258371651172638, - "rewards/rejected": -1.658288598060608, - "sft_loss": 1.3530858755111694, + "rewards/chosen": -1.4307525157928467, + "rewards/margins": 0.21404337882995605, + "rewards/rejected": -1.6447960138320923, + "sft_loss": 1.3551019430160522, "step": 880 }, { "epoch": 0.47365780230807825, - "grad_norm": 6.141860733800608, - "learning_rate": 2.9695492359973153e-06, - "logits/chosen": -0.18545794486999512, - "logits/rejected": -0.10255476087331772, - "logps/chosen": -1.3349025249481201, - "logps/rejected": -1.6219761371612549, - "loss": 1.0275, + "grad_norm": 7.0965498514729575, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": -0.0713481456041336, + "logits/rejected": 0.010779242031276226, + "logps/chosen": -1.3277537822723389, + "logps/rejected": -1.5847175121307373, + "loss": 1.0414, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3349025249481201, - "rewards/margins": 0.2870733141899109, - "rewards/rejected": -1.6219761371612549, - "sft_loss": 1.3557451963424683, + "rewards/chosen": -1.3277537822723389, + "rewards/margins": 0.2569636106491089, + "rewards/rejected": -1.5847175121307373, + "sft_loss": 1.3591192960739136, "step": 885 }, { "epoch": 0.47633383508947985, - "grad_norm": 4.785365239112048, - "learning_rate": 2.9686054631824884e-06, - "logits/chosen": -0.3034934103488922, - "logits/rejected": -0.17880980670452118, - "logps/chosen": -1.3840830326080322, - "logps/rejected": -1.6017013788223267, - "loss": 1.0908, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3840830326080322, - "rewards/margins": 0.21761831641197205, - "rewards/rejected": -1.6017013788223267, - "sft_loss": 1.4323046207427979, + "grad_norm": 6.339401667531269, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": -0.17879191040992737, + "logits/rejected": -0.05101003497838974, + "logps/chosen": -1.3813947439193726, + "logps/rejected": -1.5883454084396362, + "loss": 1.095, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3813947439193726, + "rewards/margins": 0.20695054531097412, + "rewards/rejected": -1.5883454084396362, + "sft_loss": 1.425832748413086, "step": 890 }, { "epoch": 0.4790098678708814, - "grad_norm": 4.901710099079587, - "learning_rate": 2.9676474419510175e-06, - "logits/chosen": -0.08637824654579163, - "logits/rejected": 0.02645046077668667, - "logps/chosen": -1.264495849609375, - "logps/rejected": -1.4407931566238403, - "loss": 1.058, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.264495849609375, - "rewards/margins": 0.17629732191562653, - "rewards/rejected": -1.4407931566238403, - "sft_loss": 1.3106262683868408, + "grad_norm": 5.747980078764873, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": 0.030855897814035416, + "logits/rejected": 0.14914752542972565, + "logps/chosen": -1.258709192276001, + "logps/rejected": -1.4399124383926392, + "loss": 1.0615, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.258709192276001, + "rewards/margins": 0.1812031865119934, + "rewards/rejected": -1.4399124383926392, + "sft_loss": 1.3080114126205444, "step": 895 }, { "epoch": 0.481685900652283, - "grad_norm": 5.044502800081685, - "learning_rate": 2.966675181597627e-06, - "logits/chosen": -0.22000615298748016, - "logits/rejected": -0.15294332802295685, - "logps/chosen": -1.2635812759399414, - "logps/rejected": -1.5805613994598389, - "loss": 1.0245, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2635812759399414, - "rewards/margins": 0.3169800937175751, - "rewards/rejected": -1.5805613994598389, - "sft_loss": 1.3097639083862305, + "grad_norm": 5.999983841636828, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": -0.09650512784719467, + "logits/rejected": -0.025805041193962097, + "logps/chosen": -1.2652348279953003, + "logps/rejected": -1.5569730997085571, + "loss": 1.0381, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2652348279953003, + "rewards/margins": 0.2917383015155792, + "rewards/rejected": -1.5569730997085571, + "sft_loss": 1.3040887117385864, "step": 900 }, { "epoch": 0.48436193343368455, - "grad_norm": 5.220203526780124, - "learning_rate": 2.965688691555193e-06, - "logits/chosen": -0.1628127098083496, - "logits/rejected": 0.016382919624447823, - "logps/chosen": -1.3505499362945557, - "logps/rejected": -1.6661789417266846, - "loss": 1.0762, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3505499362945557, - "rewards/margins": 0.3156289756298065, - "rewards/rejected": -1.6661789417266846, - "sft_loss": 1.4187357425689697, + "grad_norm": 5.729597922053128, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": -0.006761780474334955, + "logits/rejected": 0.1825835257768631, + "logps/chosen": -1.3510396480560303, + "logps/rejected": -1.6165297031402588, + "loss": 1.0879, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3510396480560303, + "rewards/margins": 0.2654899060726166, + "rewards/rejected": -1.6165297031402588, + "sft_loss": 1.396750569343567, "step": 905 }, { "epoch": 0.48703796621508616, - "grad_norm": 4.052014511902803, - "learning_rate": 2.964687981394644e-06, - "logits/chosen": -0.21893298625946045, - "logits/rejected": -0.11495008319616318, - "logps/chosen": -1.3846489191055298, - "logps/rejected": -1.548292875289917, - "loss": 1.1082, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3846489191055298, - "rewards/margins": 0.16364414989948273, - "rewards/rejected": -1.548292875289917, - "sft_loss": 1.3700069189071655, + "grad_norm": 3.9699737151417662, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": -0.04263655096292496, + "logits/rejected": 0.07031328976154327, + "logps/chosen": -1.3665261268615723, + "logps/rejected": -1.5466525554656982, + "loss": 1.1073, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3665261268615723, + "rewards/margins": 0.18012654781341553, + "rewards/rejected": -1.5466525554656982, + "sft_loss": 1.358182668685913, "step": 910 }, { "epoch": 0.4897139989964877, - "grad_norm": 5.4281835015917155, - "learning_rate": 2.963673060824877e-06, - "logits/chosen": -0.21939484775066376, - "logits/rejected": -0.04351986199617386, - "logps/chosen": -1.3538082838058472, - "logps/rejected": -1.595873236656189, - "loss": 1.0634, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3538082838058472, - "rewards/margins": 0.24206483364105225, - "rewards/rejected": -1.595873236656189, - "sft_loss": 1.3432961702346802, + "grad_norm": 5.605907446139715, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": 0.010189378634095192, + "logits/rejected": 0.20491977035999298, + "logps/chosen": -1.3190621137619019, + "logps/rejected": -1.5155293941497803, + "loss": 1.0695, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3190621137619019, + "rewards/margins": 0.196467325091362, + "rewards/rejected": -1.5155293941497803, + "sft_loss": 1.3298662900924683, "step": 915 }, { "epoch": 0.49239003177788926, - "grad_norm": 5.4281666785371865, - "learning_rate": 2.9626439396926536e-06, - "logits/chosen": -0.10007022321224213, - "logits/rejected": 0.029562795534729958, - "logps/chosen": -1.2651135921478271, - "logps/rejected": -1.620726227760315, - "loss": 1.0511, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2651135921478271, - "rewards/margins": 0.35561278462409973, - "rewards/rejected": -1.620726227760315, - "sft_loss": 1.3307000398635864, + "grad_norm": 6.713530471211944, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": 0.11538930237293243, + "logits/rejected": 0.26346588134765625, + "logps/chosen": -1.2698981761932373, + "logps/rejected": -1.5307300090789795, + "loss": 1.0583, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2698981761932373, + "rewards/margins": 0.26083195209503174, + "rewards/rejected": -1.5307300090789795, + "sft_loss": 1.3178085088729858, "step": 920 }, { "epoch": 0.49506606455929086, - "grad_norm": 6.509746165340049, - "learning_rate": 2.9616006279825125e-06, - "logits/chosen": -0.2518424689769745, - "logits/rejected": -0.09067486226558685, - "logps/chosen": -1.4045971632003784, - "logps/rejected": -1.63739013671875, - "loss": 1.0792, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.4045971632003784, - "rewards/margins": 0.23279277980327606, - "rewards/rejected": -1.63739013671875, - "sft_loss": 1.3810770511627197, + "grad_norm": 5.811831680741933, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": -0.04077720642089844, + "logits/rejected": 0.13811177015304565, + "logps/chosen": -1.375215768814087, + "logps/rejected": -1.5146197080612183, + "loss": 1.1019, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.375215768814087, + "rewards/margins": 0.1394037902355194, + "rewards/rejected": -1.5146197080612183, + "sft_loss": 1.364015817642212, "step": 925 }, { "epoch": 0.4977420973406924, - "grad_norm": 8.013188896663015, - "learning_rate": 2.9605431358166687e-06, - "logits/chosen": -0.24829821288585663, - "logits/rejected": -0.13628308475017548, - "logps/chosen": -1.3312690258026123, - "logps/rejected": -1.717877745628357, - "loss": 1.0382, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3312690258026123, - "rewards/margins": 0.38660869002342224, - "rewards/rejected": -1.717877745628357, - "sft_loss": 1.3500382900238037, + "grad_norm": 8.295351988413582, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": -0.054763637483119965, + "logits/rejected": 0.06431882083415985, + "logps/chosen": -1.3140287399291992, + "logps/rejected": -1.6464738845825195, + "loss": 1.0531, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3140287399291992, + "rewards/margins": 0.33244508504867554, + "rewards/rejected": -1.6464738845825195, + "sft_loss": 1.3359577655792236, "step": 930 }, { "epoch": 0.500418130122094, - "grad_norm": 5.294809343057805, - "learning_rate": 2.959471473454915e-06, - "logits/chosen": -0.16869133710861206, - "logits/rejected": -0.12402909994125366, - "logps/chosen": -1.326379656791687, - "logps/rejected": -1.610535979270935, - "loss": 1.0652, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.326379656791687, - "rewards/margins": 0.28415626287460327, - "rewards/rejected": -1.610535979270935, - "sft_loss": 1.3545112609863281, + "grad_norm": 5.337603380866405, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": 0.0186604093760252, + "logits/rejected": 0.06460899114608765, + "logps/chosen": -1.2734229564666748, + "logps/rejected": -1.526149868965149, + "loss": 1.0612, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2734229564666748, + "rewards/margins": 0.2527269423007965, + "rewards/rejected": -1.526149868965149, + "sft_loss": 1.3169612884521484, "step": 935 }, { "epoch": 0.5030941629034956, - "grad_norm": 7.067492338783618, - "learning_rate": 2.9583856512945257e-06, - "logits/chosen": -0.2058066874742508, - "logits/rejected": -0.09429244697093964, - "logps/chosen": -1.3667848110198975, - "logps/rejected": -1.6059554815292358, - "loss": 1.0784, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3667848110198975, - "rewards/margins": 0.23917081952095032, - "rewards/rejected": -1.6059554815292358, - "sft_loss": 1.3848192691802979, + "grad_norm": 7.876389679343498, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": -0.033787619322538376, + "logits/rejected": 0.08481265604496002, + "logps/chosen": -1.3385286331176758, + "logps/rejected": -1.5050402879714966, + "loss": 1.089, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3385286331176758, + "rewards/margins": 0.1665116548538208, + "rewards/rejected": -1.5050402879714966, + "sft_loss": 1.3689563274383545, "step": 940 }, { "epoch": 0.5057701956848971, - "grad_norm": 7.861411871397975, - "learning_rate": 2.957285679870151e-06, - "logits/chosen": -0.23982122540473938, - "logits/rejected": -0.10797332227230072, - "logps/chosen": -1.3651232719421387, - "logps/rejected": -1.7063182592391968, - "loss": 1.017, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3651232719421387, - "rewards/margins": 0.3411949872970581, - "rewards/rejected": -1.7063182592391968, - "sft_loss": 1.3415919542312622, + "grad_norm": 9.469117695278278, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": -0.06023671478033066, + "logits/rejected": 0.07909716665744781, + "logps/chosen": -1.3198426961898804, + "logps/rejected": -1.6279243230819702, + "loss": 1.0234, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3198426961898804, + "rewards/margins": 0.30808156728744507, + "rewards/rejected": -1.6279243230819702, + "sft_loss": 1.31898832321167, "step": 945 }, { "epoch": 0.5084462284662987, - "grad_norm": 4.724553950873356, - "learning_rate": 2.9561715698537184e-06, - "logits/chosen": -0.2262788712978363, - "logits/rejected": -0.05607231333851814, - "logps/chosen": -1.4432750940322876, - "logps/rejected": -1.6429545879364014, - "loss": 1.1349, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.4432750940322876, - "rewards/margins": 0.1996796727180481, - "rewards/rejected": -1.6429545879364014, - "sft_loss": 1.4282090663909912, + "grad_norm": 6.068755565762643, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": -0.02013743296265602, + "logits/rejected": 0.16080233454704285, + "logps/chosen": -1.425286054611206, + "logps/rejected": -1.582793951034546, + "loss": 1.1457, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.425286054611206, + "rewards/margins": 0.157507985830307, + "rewards/rejected": -1.582793951034546, + "sft_loss": 1.4227519035339355, "step": 950 }, { "epoch": 0.5111222612477003, - "grad_norm": 5.446634366003163, - "learning_rate": 2.955043332054329e-06, - "logits/chosen": -0.15960340201854706, - "logits/rejected": 0.07019929587841034, - "logps/chosen": -1.4225099086761475, - "logps/rejected": -1.663869857788086, - "loss": 1.1221, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.4225099086761475, - "rewards/margins": 0.24135980010032654, - "rewards/rejected": -1.663869857788086, - "sft_loss": 1.474645972251892, + "grad_norm": 9.116465562050807, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": 0.022849196568131447, + "logits/rejected": 0.258659303188324, + "logps/chosen": -1.4186164140701294, + "logps/rejected": -1.5973553657531738, + "loss": 1.1386, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4186164140701294, + "rewards/margins": 0.17873908579349518, + "rewards/rejected": -1.5973553657531738, + "sft_loss": 1.4624502658843994, "step": 955 }, { "epoch": 0.5137982940291018, - "grad_norm": 5.827944497702404, - "learning_rate": 2.95390097741815e-06, - "logits/chosen": -0.18014737963676453, - "logits/rejected": -0.029367715120315552, - "logps/chosen": -1.3832664489746094, - "logps/rejected": -1.5638530254364014, - "loss": 1.1006, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3832664489746094, - "rewards/margins": 0.1805865466594696, - "rewards/rejected": -1.5638530254364014, - "sft_loss": 1.3904502391815186, + "grad_norm": 6.918046448941572, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": -0.0351165309548378, + "logits/rejected": 0.11834853887557983, + "logps/chosen": -1.3530397415161133, + "logps/rejected": -1.4933912754058838, + "loss": 1.1083, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3530397415161133, + "rewards/margins": 0.14035165309906006, + "rewards/rejected": -1.4933912754058838, + "sft_loss": 1.3699219226837158, "step": 960 }, { "epoch": 0.5164743268105034, - "grad_norm": 6.324392438431142, - "learning_rate": 2.952744517028312e-06, - "logits/chosen": -0.08480402082204819, - "logits/rejected": -0.08991466462612152, - "logps/chosen": -1.4066896438598633, - "logps/rejected": -1.710906982421875, - "loss": 1.0802, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.4066896438598633, - "rewards/margins": 0.3042174279689789, - "rewards/rejected": -1.710906982421875, - "sft_loss": 1.4253562688827515, + "grad_norm": 6.1912600134012505, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": 0.05374523252248764, + "logits/rejected": 0.05349243804812431, + "logps/chosen": -1.3894104957580566, + "logps/rejected": -1.6738580465316772, + "loss": 1.0849, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3894104957580566, + "rewards/margins": 0.28444749116897583, + "rewards/rejected": -1.6738580465316772, + "sft_loss": 1.4124128818511963, "step": 965 }, { "epoch": 0.519150359591905, - "grad_norm": 4.919009362060755, - "learning_rate": 2.951573962104798e-06, - "logits/chosen": -0.07150016725063324, - "logits/rejected": -0.06851210445165634, - "logps/chosen": -1.288236379623413, - "logps/rejected": -1.532168984413147, - "loss": 1.0465, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.288236379623413, - "rewards/margins": 0.24393276870250702, - "rewards/rejected": -1.532168984413147, - "sft_loss": 1.2897863388061523, + "grad_norm": 5.065747116617297, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": 0.06455997377634048, + "logits/rejected": 0.07006511837244034, + "logps/chosen": -1.2654345035552979, + "logps/rejected": -1.4739532470703125, + "loss": 1.0503, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2654345035552979, + "rewards/margins": 0.20851869881153107, + "rewards/rejected": -1.4739532470703125, + "sft_loss": 1.2775202989578247, "step": 970 }, { "epoch": 0.5218263923733065, - "grad_norm": 5.147263046472833, - "learning_rate": 2.950389324004337e-06, - "logits/chosen": -0.2402597963809967, - "logits/rejected": -0.05485190078616142, - "logps/chosen": -1.3670083284378052, - "logps/rejected": -1.5608259439468384, - "loss": 1.0778, + "grad_norm": 5.327061763368941, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": -0.10575266182422638, + "logits/rejected": 0.08543635904788971, + "logps/chosen": -1.3447576761245728, + "logps/rejected": -1.5180432796478271, + "loss": 1.0823, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3670083284378052, - "rewards/margins": 0.19381758570671082, - "rewards/rejected": -1.5608259439468384, - "sft_loss": 1.4064772129058838, + "rewards/chosen": -1.3447576761245728, + "rewards/margins": 0.17328575253486633, + "rewards/rejected": -1.5180432796478271, + "sft_loss": 1.3948814868927002, "step": 975 }, { "epoch": 0.5245024251547081, - "grad_norm": 8.649844009938525, - "learning_rate": 2.949190614220294e-06, - "logits/chosen": -0.2297898232936859, - "logits/rejected": -0.028187647461891174, - "logps/chosen": -1.4226592779159546, - "logps/rejected": -1.664825677871704, - "loss": 1.0959, + "grad_norm": 8.020182464396328, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": -0.08101774752140045, + "logits/rejected": 0.1236882358789444, + "logps/chosen": -1.3992125988006592, + "logps/rejected": -1.600408911705017, + "loss": 1.1059, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4226592779159546, - "rewards/margins": 0.2421664446592331, - "rewards/rejected": -1.664825677871704, - "sft_loss": 1.4162763357162476, + "rewards/chosen": -1.3992125988006592, + "rewards/margins": 0.20119652152061462, + "rewards/rejected": -1.600408911705017, + "sft_loss": 1.4059171676635742, "step": 980 }, { "epoch": 0.5271784579361097, - "grad_norm": 6.148260341059483, - "learning_rate": 2.9479778443825553e-06, - "logits/chosen": -0.1363251805305481, - "logits/rejected": 0.060792457312345505, - "logps/chosen": -1.3869860172271729, - "logps/rejected": -1.6044954061508179, - "loss": 1.0984, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3869860172271729, - "rewards/margins": 0.21750938892364502, - "rewards/rejected": -1.6044954061508179, - "sft_loss": 1.4573136568069458, + "grad_norm": 6.744511276591418, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": 0.012089039199054241, + "logits/rejected": 0.20911423861980438, + "logps/chosen": -1.3623487949371338, + "logps/rejected": -1.5705373287200928, + "loss": 1.1032, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3623487949371338, + "rewards/margins": 0.20818853378295898, + "rewards/rejected": -1.5705373287200928, + "sft_loss": 1.4302294254302979, "step": 985 }, { "epoch": 0.5298544907175113, - "grad_norm": 4.843430622851707, - "learning_rate": 2.9467510262574204e-06, - "logits/chosen": -0.0660191997885704, - "logits/rejected": -0.03288702294230461, - "logps/chosen": -1.2430740594863892, - "logps/rejected": -1.5976035594940186, - "loss": 0.9959, + "grad_norm": 5.064980584747938, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": 0.06235666945576668, + "logits/rejected": 0.09905209392309189, + "logps/chosen": -1.208686351776123, + "logps/rejected": -1.4888397455215454, + "loss": 1.0117, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.2430740594863892, - "rewards/margins": 0.3545294404029846, - "rewards/rejected": -1.5976035594940186, - "sft_loss": 1.3150891065597534, + "rewards/chosen": -1.208686351776123, + "rewards/margins": 0.28015339374542236, + "rewards/rejected": -1.4888397455215454, + "sft_loss": 1.2937543392181396, "step": 990 }, { "epoch": 0.5325305234989128, - "grad_norm": 7.473104680416714, - "learning_rate": 2.9455101717474834e-06, - "logits/chosen": -0.06151014566421509, - "logits/rejected": 0.01840631663799286, - "logps/chosen": -1.3655961751937866, - "logps/rejected": -1.5589427947998047, - "loss": 1.1384, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": -1.3655961751937866, - "rewards/margins": 0.19334658980369568, - "rewards/rejected": -1.5589427947998047, - "sft_loss": 1.4315303564071655, + "grad_norm": 6.429721404080052, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": 0.08417926728725433, + "logits/rejected": 0.16567587852478027, + "logps/chosen": -1.326179027557373, + "logps/rejected": -1.454859972000122, + "loss": 1.1338, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.326179027557373, + "rewards/margins": 0.1286809742450714, + "rewards/rejected": -1.454859972000122, + "sft_loss": 1.3937551975250244, "step": 995 }, { "epoch": 0.5352065562803144, - "grad_norm": 5.904135631586209, - "learning_rate": 2.9442552928915203e-06, - "logits/chosen": -0.04618370905518532, - "logits/rejected": 0.0838208794593811, - "logps/chosen": -1.3669238090515137, - "logps/rejected": -1.6801611185073853, - "loss": 1.0887, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3669238090515137, - "rewards/margins": 0.3132372200489044, - "rewards/rejected": -1.6801611185073853, - "sft_loss": 1.396959900856018, + "grad_norm": 5.9936758517936415, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": 0.06984042376279831, + "logits/rejected": 0.19560235738754272, + "logps/chosen": -1.3520762920379639, + "logps/rejected": -1.6111743450164795, + "loss": 1.0875, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3520762920379639, + "rewards/margins": 0.2590981125831604, + "rewards/rejected": -1.6111743450164795, + "sft_loss": 1.3810752630233765, "step": 1000 }, { "epoch": 0.537882589061716, - "grad_norm": 6.878902971181851, - "learning_rate": 2.942986401864371e-06, - "logits/chosen": -0.09690725803375244, - "logits/rejected": 0.07714128494262695, - "logps/chosen": -1.4062764644622803, - "logps/rejected": -1.6796951293945312, - "loss": 1.1054, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.4062764644622803, - "rewards/margins": 0.2734185457229614, - "rewards/rejected": -1.6796951293945312, - "sft_loss": 1.445420265197754, + "grad_norm": 9.248342880601742, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": 0.041423261165618896, + "logits/rejected": 0.2167244404554367, + "logps/chosen": -1.368880271911621, + "logps/rejected": -1.5877668857574463, + "loss": 1.1082, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.368880271911621, + "rewards/margins": 0.21888649463653564, + "rewards/rejected": -1.5877668857574463, + "sft_loss": 1.413527250289917, "step": 1005 }, { "epoch": 0.5405586218431175, - "grad_norm": 6.274384213051333, - "learning_rate": 2.9417035109768225e-06, - "logits/chosen": -0.1101980209350586, - "logits/rejected": 0.08307985216379166, - "logps/chosen": -1.2378486394882202, - "logps/rejected": -1.6158936023712158, - "loss": 1.0127, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.2378486394882202, - "rewards/margins": 0.3780447542667389, - "rewards/rejected": -1.6158936023712158, - "sft_loss": 1.2778068780899048, + "grad_norm": 8.22317017700777, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": 0.007480998523533344, + "logits/rejected": 0.2067534625530243, + "logps/chosen": -1.225550889968872, + "logps/rejected": -1.5645802021026611, + "loss": 1.0161, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.225550889968872, + "rewards/margins": 0.33902937173843384, + "rewards/rejected": -1.5645802021026611, + "sft_loss": 1.2699626684188843, "step": 1010 }, { "epoch": 0.5432346546245191, - "grad_norm": 5.674580639618625, - "learning_rate": 2.9404066326754874e-06, - "logits/chosen": -0.12508752942085266, - "logits/rejected": 0.05659140273928642, - "logps/chosen": -1.3017909526824951, - "logps/rejected": -1.5709102153778076, - "loss": 1.0562, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3017909526824951, - "rewards/margins": 0.2691193222999573, - "rewards/rejected": -1.5709102153778076, - "sft_loss": 1.3575425148010254, + "grad_norm": 6.876544893331995, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": -0.02531070075929165, + "logits/rejected": 0.15708212554454803, + "logps/chosen": -1.2901298999786377, + "logps/rejected": -1.5163029432296753, + "loss": 1.06, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2901298999786377, + "rewards/margins": 0.22617287933826447, + "rewards/rejected": -1.5163029432296753, + "sft_loss": 1.3433908224105835, "step": 1015 }, { "epoch": 0.5459106874059207, - "grad_norm": 6.827706232097836, - "learning_rate": 2.9390957795426847e-06, - "logits/chosen": -0.11372099071741104, - "logits/rejected": 0.03485158085823059, - "logps/chosen": -1.365986943244934, - "logps/rejected": -1.6966253519058228, - "loss": 1.0481, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.365986943244934, - "rewards/margins": 0.33063825964927673, - "rewards/rejected": -1.6966253519058228, - "sft_loss": 1.4129970073699951, + "grad_norm": 6.719145985760094, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": 0.0003074899432249367, + "logits/rejected": 0.15064993500709534, + "logps/chosen": -1.3295269012451172, + "logps/rejected": -1.5829570293426514, + "loss": 1.0556, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3295269012451172, + "rewards/margins": 0.2534298896789551, + "rewards/rejected": -1.5829570293426514, + "sft_loss": 1.3862606287002563, "step": 1020 }, { "epoch": 0.5485867201873222, - "grad_norm": 6.158787026981841, - "learning_rate": 2.9377709642963177e-06, - "logits/chosen": -0.15712231397628784, - "logits/rejected": -0.03692762926220894, - "logps/chosen": -1.3038196563720703, - "logps/rejected": -1.7374274730682373, - "loss": 1.0094, + "grad_norm": 8.751983877055366, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": -0.02134351059794426, + "logits/rejected": 0.10547629743814468, + "logps/chosen": -1.2631934881210327, + "logps/rejected": -1.5912244319915771, + "loss": 1.0187, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3038196563720703, - "rewards/margins": 0.43360796570777893, - "rewards/rejected": -1.7374274730682373, - "sft_loss": 1.3382656574249268, + "rewards/chosen": -1.2631934881210327, + "rewards/margins": 0.3280307948589325, + "rewards/rejected": -1.5912244319915771, + "sft_loss": 1.288779616355896, "step": 1025 }, { "epoch": 0.5512627529687238, - "grad_norm": 5.364208643132879, - "learning_rate": 2.9364321997897485e-06, - "logits/chosen": -0.14669297635555267, - "logits/rejected": -0.059150196611881256, - "logps/chosen": -1.4097700119018555, - "logps/rejected": -1.7058780193328857, - "loss": 1.0981, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4097700119018555, - "rewards/margins": 0.2961081862449646, - "rewards/rejected": -1.7058780193328857, - "sft_loss": 1.441806435585022, + "grad_norm": 8.947801392685536, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": 0.0026383884251117706, + "logits/rejected": 0.0938570499420166, + "logps/chosen": -1.3454915285110474, + "logps/rejected": -1.5478971004486084, + "loss": 1.0977, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3454915285110474, + "rewards/margins": 0.2024056613445282, + "rewards/rejected": -1.5478971004486084, + "sft_loss": 1.3874528408050537, "step": 1030 }, { "epoch": 0.5539387857501255, - "grad_norm": 6.597472204582022, - "learning_rate": 2.935079499011677e-06, - "logits/chosen": -0.16338512301445007, - "logits/rejected": -0.04339775815606117, - "logps/chosen": -1.431691288948059, - "logps/rejected": -1.5908737182617188, - "loss": 1.1178, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.431691288948059, - "rewards/margins": 0.1591825932264328, - "rewards/rejected": -1.5908737182617188, - "sft_loss": 1.4271605014801025, + "grad_norm": 8.030719702681695, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": -0.02961505576968193, + "logits/rejected": 0.0943761020898819, + "logps/chosen": -1.4095746278762817, + "logps/rejected": -1.5417417287826538, + "loss": 1.1212, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4095746278762817, + "rewards/margins": 0.13216717541217804, + "rewards/rejected": -1.5417417287826538, + "sft_loss": 1.4097391366958618, "step": 1035 }, { "epoch": 0.5566148185315271, - "grad_norm": 8.155965406182842, - "learning_rate": 2.9337128750860126e-06, - "logits/chosen": -0.10452456772327423, - "logits/rejected": 0.04732733964920044, - "logps/chosen": -1.3343483209609985, - "logps/rejected": -1.635978102684021, - "loss": 1.068, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3343483209609985, - "rewards/margins": 0.3016297221183777, - "rewards/rejected": -1.635978102684021, - "sft_loss": 1.3880765438079834, + "grad_norm": 13.073240298154332, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": 0.013182336464524269, + "logits/rejected": 0.17865802347660065, + "logps/chosen": -1.330731987953186, + "logps/rejected": -1.6062644720077515, + "loss": 1.0829, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.330731987953186, + "rewards/margins": 0.2755325734615326, + "rewards/rejected": -1.6062644720077515, + "sft_loss": 1.3806637525558472, "step": 1040 }, { "epoch": 0.5592908513129285, - "grad_norm": 4.07354796456379, - "learning_rate": 2.932332341271746e-06, - "logits/chosen": -0.1672249734401703, - "logits/rejected": -0.03276212140917778, - "logps/chosen": -1.306597113609314, - "logps/rejected": -1.6181869506835938, - "loss": 1.0614, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.306597113609314, - "rewards/margins": 0.3115897476673126, - "rewards/rejected": -1.6181869506835938, - "sft_loss": 1.3978981971740723, + "grad_norm": 3.859480468158749, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": -0.0784904956817627, + "logits/rejected": 0.066562220454216, + "logps/chosen": -1.3023416996002197, + "logps/rejected": -1.6012264490127563, + "loss": 1.0615, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3023416996002197, + "rewards/margins": 0.2988850474357605, + "rewards/rejected": -1.6012264490127563, + "sft_loss": 1.3957256078720093, "step": 1045 }, { "epoch": 0.5619668840943302, - "grad_norm": 5.813364166003015, - "learning_rate": 2.930937910962822e-06, - "logits/chosen": -0.19471415877342224, - "logits/rejected": -0.09367385506629944, - "logps/chosen": -1.37113618850708, - "logps/rejected": -1.7180440425872803, - "loss": 1.0714, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.37113618850708, - "rewards/margins": 0.34690791368484497, - "rewards/rejected": -1.7180440425872803, - "sft_loss": 1.4200074672698975, + "grad_norm": 7.389526699160587, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": -0.08934468030929565, + "logits/rejected": 0.01706491783261299, + "logps/chosen": -1.3709447383880615, + "logps/rejected": -1.700160026550293, + "loss": 1.0676, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3709447383880615, + "rewards/margins": 0.3292153477668762, + "rewards/rejected": -1.700160026550293, + "sft_loss": 1.4127471446990967, "step": 1050 }, { "epoch": 0.5646429168757318, - "grad_norm": 7.793661568914185, - "learning_rate": 2.9295295976880107e-06, - "logits/chosen": -0.1325044184923172, - "logits/rejected": -0.058493874967098236, - "logps/chosen": -1.397251009941101, - "logps/rejected": -1.69857656955719, - "loss": 1.0641, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.397251009941101, - "rewards/margins": 0.3013255000114441, - "rewards/rejected": -1.69857656955719, - "sft_loss": 1.4198894500732422, + "grad_norm": 8.982720974634528, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": -0.013047914020717144, + "logits/rejected": 0.0696062445640564, + "logps/chosen": -1.3615453243255615, + "logps/rejected": -1.6033151149749756, + "loss": 1.0755, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3615453243255615, + "rewards/margins": 0.24176998436450958, + "rewards/rejected": -1.6033151149749756, + "sft_loss": 1.4036492109298706, "step": 1055 }, { "epoch": 0.5673189496571333, - "grad_norm": 7.6046364035152365, - "learning_rate": 2.9281074151107727e-06, - "logits/chosen": -0.12792737782001495, - "logits/rejected": 0.04613568261265755, - "logps/chosen": -1.4523102045059204, - "logps/rejected": -1.7039387226104736, - "loss": 1.0922, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.4523102045059204, - "rewards/margins": 0.2516286075115204, - "rewards/rejected": -1.7039387226104736, - "sft_loss": 1.4452135562896729, + "grad_norm": 7.6494199427540135, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": 0.025501590222120285, + "logits/rejected": 0.20580200850963593, + "logps/chosen": -1.4298983812332153, + "logps/rejected": -1.6337827444076538, + "loss": 1.1003, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4298983812332153, + "rewards/margins": 0.20388436317443848, + "rewards/rejected": -1.6337827444076538, + "sft_loss": 1.4225795269012451, "step": 1060 }, { "epoch": 0.5699949824385349, - "grad_norm": 4.692723677939538, - "learning_rate": 2.926671377029129e-06, - "logits/chosen": -0.12723150849342346, - "logits/rejected": -0.001778355217538774, - "logps/chosen": -1.3645659685134888, - "logps/rejected": -1.7801551818847656, - "loss": 1.0433, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3645659685134888, - "rewards/margins": 0.41558918356895447, - "rewards/rejected": -1.7801551818847656, - "sft_loss": 1.4643785953521729, + "grad_norm": 4.614253761473676, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": 0.032683633267879486, + "logits/rejected": 0.16721466183662415, + "logps/chosen": -1.3229620456695557, + "logps/rejected": -1.6731964349746704, + "loss": 1.0445, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3229620456695557, + "rewards/margins": 0.3502345085144043, + "rewards/rejected": -1.6731964349746704, + "sft_loss": 1.41048264503479, "step": 1065 }, { "epoch": 0.5726710152199365, - "grad_norm": 5.786753397518219, - "learning_rate": 2.9252214973755294e-06, - "logits/chosen": -0.25807952880859375, - "logits/rejected": -0.01430868823081255, - "logps/chosen": -1.399417519569397, - "logps/rejected": -1.7631251811981201, - "loss": 1.0388, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.399417519569397, - "rewards/margins": 0.36370766162872314, - "rewards/rejected": -1.7631251811981201, - "sft_loss": 1.4122426509857178, + "grad_norm": 6.6237622453727605, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": -0.1174614205956459, + "logits/rejected": 0.1363617330789566, + "logps/chosen": -1.3533555269241333, + "logps/rejected": -1.6572144031524658, + "loss": 1.0431, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3533555269241333, + "rewards/margins": 0.3038588762283325, + "rewards/rejected": -1.6572144031524658, + "sft_loss": 1.357505202293396, "step": 1070 }, { "epoch": 0.5753470480013381, - "grad_norm": 4.920914855578094, - "learning_rate": 2.923757790216711e-06, - "logits/chosen": -0.1378275752067566, - "logits/rejected": 6.657242920482531e-05, - "logps/chosen": -1.3422327041625977, - "logps/rejected": -1.7522796392440796, - "loss": 1.0385, + "grad_norm": 5.882814130729912, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": -0.021414924412965775, + "logits/rejected": 0.11573544889688492, + "logps/chosen": -1.308184266090393, + "logps/rejected": -1.6624805927276611, + "loss": 1.0415, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3422327041625977, - "rewards/margins": 0.41004714369773865, - "rewards/rejected": -1.7522796392440796, - "sft_loss": 1.3954424858093262, + "rewards/chosen": -1.308184266090393, + "rewards/margins": 0.3542962670326233, + "rewards/rejected": -1.6624805927276611, + "sft_loss": 1.358493447303772, "step": 1075 }, { "epoch": 0.5780230807827396, - "grad_norm": 6.888088507193936, - "learning_rate": 2.922280269753568e-06, - "logits/chosen": -0.19910190999507904, - "logits/rejected": -0.09001894295215607, - "logps/chosen": -1.4434906244277954, - "logps/rejected": -1.6869752407073975, - "loss": 1.1025, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4434906244277954, - "rewards/margins": 0.24348478019237518, - "rewards/rejected": -1.6869752407073975, - "sft_loss": 1.4737919569015503, + "grad_norm": 7.510823956083402, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": -0.08719275146722794, + "logits/rejected": 0.02298247441649437, + "logps/chosen": -1.438455581665039, + "logps/rejected": -1.5987141132354736, + "loss": 1.1326, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.438455581665039, + "rewards/margins": 0.1602584421634674, + "rewards/rejected": -1.5987141132354736, + "sft_loss": 1.4535404443740845, "step": 1080 }, { "epoch": 0.5806991135641412, - "grad_norm": 8.187201984819211, - "learning_rate": 2.9207889503210094e-06, - "logits/chosen": -0.08224952965974808, - "logits/rejected": 0.09409169852733612, - "logps/chosen": -1.3726158142089844, - "logps/rejected": -1.4843034744262695, - "loss": 1.1351, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3726158142089844, - "rewards/margins": 0.11168781667947769, - "rewards/rejected": -1.4843034744262695, - "sft_loss": 1.3914871215820312, + "grad_norm": 8.493464333567369, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": -0.022415516898036003, + "logits/rejected": 0.1457872837781906, + "logps/chosen": -1.3727210760116577, + "logps/rejected": -1.4455827474594116, + "loss": 1.1389, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.3727210760116577, + "rewards/margins": 0.0728617012500763, + "rewards/rejected": -1.4455827474594116, + "sft_loss": 1.382156491279602, "step": 1085 }, { "epoch": 0.5833751463455428, - "grad_norm": 5.930900410234613, - "learning_rate": 2.9192838463878236e-06, - "logits/chosen": -0.09559588134288788, - "logits/rejected": 0.014999288134276867, - "logps/chosen": -1.3439271450042725, - "logps/rejected": -1.467355728149414, - "loss": 1.1066, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3439271450042725, - "rewards/margins": 0.12342876195907593, - "rewards/rejected": -1.467355728149414, - "sft_loss": 1.340254783630371, + "grad_norm": 7.6187882316184705, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": -0.0488225594162941, + "logits/rejected": 0.05780552700161934, + "logps/chosen": -1.3416324853897095, + "logps/rejected": -1.4273046255111694, + "loss": 1.1184, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3416324853897095, + "rewards/margins": 0.08567220717668533, + "rewards/rejected": -1.4273046255111694, + "sft_loss": 1.3312969207763672, "step": 1090 }, { "epoch": 0.5860511791269443, - "grad_norm": 5.875408013228155, - "learning_rate": 2.917764972556535e-06, - "logits/chosen": -0.21042868494987488, - "logits/rejected": -0.06721127033233643, - "logps/chosen": -1.313696026802063, - "logps/rejected": -1.569189190864563, - "loss": 1.0512, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.313696026802063, - "rewards/margins": 0.2554931044578552, - "rewards/rejected": -1.569189190864563, - "sft_loss": 1.3528454303741455, + "grad_norm": 6.128001263278665, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": -0.1998719424009323, + "logits/rejected": -0.07255266606807709, + "logps/chosen": -1.2856649160385132, + "logps/rejected": -1.519370675086975, + "loss": 1.0475, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2856649160385132, + "rewards/margins": 0.23370572924613953, + "rewards/rejected": -1.519370675086975, + "sft_loss": 1.3287267684936523, "step": 1095 }, { "epoch": 0.5887272119083459, - "grad_norm": 6.238798929829675, - "learning_rate": 2.9162323435632657e-06, - "logits/chosen": -0.09522127360105515, - "logits/rejected": 0.020591190084815025, - "logps/chosen": -1.2226125001907349, - "logps/rejected": -1.6992833614349365, - "loss": 0.9786, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2226125001907349, - "rewards/margins": 0.4766710698604584, - "rewards/rejected": -1.6992833614349365, - "sft_loss": 1.265278935432434, + "grad_norm": 7.2254979243325606, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": -0.06452328711748123, + "logits/rejected": 0.03451596572995186, + "logps/chosen": -1.197929859161377, + "logps/rejected": -1.6241276264190674, + "loss": 0.9772, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.197929859161377, + "rewards/margins": 0.42619770765304565, + "rewards/rejected": -1.6241276264190674, + "sft_loss": 1.2388383150100708, "step": 1100 }, { "epoch": 0.5914032446897475, - "grad_norm": 5.288216235681103, - "learning_rate": 2.914685974277587e-06, - "logits/chosen": -0.16337811946868896, - "logits/rejected": -0.08522491157054901, - "logps/chosen": -1.3429590463638306, - "logps/rejected": -1.5685818195343018, - "loss": 1.0812, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3429590463638306, - "rewards/margins": 0.2256227433681488, - "rewards/rejected": -1.5685818195343018, - "sft_loss": 1.33394455909729, + "grad_norm": 7.583297166453061, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": -0.09063072502613068, + "logits/rejected": -0.011754634790122509, + "logps/chosen": -1.3256213665008545, + "logps/rejected": -1.5247026681900024, + "loss": 1.0775, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3256213665008545, + "rewards/margins": 0.19908128678798676, + "rewards/rejected": -1.5247026681900024, + "sft_loss": 1.3233109712600708, "step": 1105 }, { "epoch": 0.594079277471149, - "grad_norm": 6.9294869208445, - "learning_rate": 2.9131258797023814e-06, - "logits/chosen": -0.1615566909313202, - "logits/rejected": -0.027435744181275368, - "logps/chosen": -1.305321455001831, - "logps/rejected": -1.5158964395523071, - "loss": 1.0534, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": -1.305321455001831, - "rewards/margins": 0.21057498455047607, - "rewards/rejected": -1.5158964395523071, - "sft_loss": 1.3161168098449707, + "grad_norm": 7.966476893906217, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": -0.07386825978755951, + "logits/rejected": 0.05149867385625839, + "logps/chosen": -1.3055206537246704, + "logps/rejected": -1.4467628002166748, + "loss": 1.0725, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.3055206537246704, + "rewards/margins": 0.14124202728271484, + "rewards/rejected": -1.4467628002166748, + "sft_loss": 1.3087950944900513, "step": 1110 }, { "epoch": 0.5967553102525506, - "grad_norm": 5.726396935468426, - "learning_rate": 2.9115520749736934e-06, - "logits/chosen": -0.05702148750424385, - "logits/rejected": 0.07917685061693192, - "logps/chosen": -1.2768105268478394, - "logps/rejected": -1.6988168954849243, - "loss": 0.9809, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2768105268478394, - "rewards/margins": 0.4220063090324402, - "rewards/rejected": -1.6988168954849243, - "sft_loss": 1.2530466318130493, + "grad_norm": 11.249291657242251, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": -0.02822117879986763, + "logits/rejected": 0.08976884186267853, + "logps/chosen": -1.2478373050689697, + "logps/rejected": -1.5560951232910156, + "loss": 0.9948, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2478373050689697, + "rewards/margins": 0.3082577586174011, + "rewards/rejected": -1.5560951232910156, + "sft_loss": 1.2365633249282837, "step": 1115 }, { "epoch": 0.5994313430339522, - "grad_norm": 5.465210594232592, - "learning_rate": 2.909964575360583e-06, - "logits/chosen": -0.25398239493370056, - "logits/rejected": -0.14137795567512512, - "logps/chosen": -1.319097876548767, - "logps/rejected": -1.7301028966903687, - "loss": 1.0282, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.319097876548767, - "rewards/margins": 0.41100484132766724, - "rewards/rejected": -1.7301028966903687, - "sft_loss": 1.355691909790039, + "grad_norm": 6.592433420949461, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": -0.18813884258270264, + "logits/rejected": -0.08145233243703842, + "logps/chosen": -1.292750358581543, + "logps/rejected": -1.5763019323349, + "loss": 1.0412, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.292750358581543, + "rewards/margins": 0.28355178236961365, + "rewards/rejected": -1.5763019323349, + "sft_loss": 1.325889229774475, "step": 1120 }, { "epoch": 0.6021073758153538, - "grad_norm": 10.103974202551575, - "learning_rate": 2.9083633962649783e-06, - "logits/chosen": -0.2338935136795044, - "logits/rejected": -0.03408069536089897, - "logps/chosen": -1.4205596446990967, - "logps/rejected": -1.866235375404358, - "loss": 1.0359, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.4205596446990967, - "rewards/margins": 0.44567546248435974, - "rewards/rejected": -1.866235375404358, - "sft_loss": 1.405672311782837, + "grad_norm": 9.103225273951479, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": -0.1960795670747757, + "logits/rejected": -0.01546870730817318, + "logps/chosen": -1.3597862720489502, + "logps/rejected": -1.7043052911758423, + "loss": 1.0409, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3597862720489502, + "rewards/margins": 0.34451884031295776, + "rewards/rejected": -1.7043052911758423, + "sft_loss": 1.3517060279846191, "step": 1125 }, { "epoch": 0.6047834085967553, - "grad_norm": 7.660448870032001, - "learning_rate": 2.906748553221527e-06, - "logits/chosen": -0.007981347851455212, - "logits/rejected": 0.062486834824085236, - "logps/chosen": -1.3957369327545166, - "logps/rejected": -1.7972627878189087, - "loss": 1.025, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3957369327545166, - "rewards/margins": 0.4015257954597473, - "rewards/rejected": -1.7972627878189087, - "sft_loss": 1.319511890411377, + "grad_norm": 8.953833006284222, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": -0.015761854127049446, + "logits/rejected": 0.04325942322611809, + "logps/chosen": -1.3681681156158447, + "logps/rejected": -1.6375095844268799, + "loss": 1.0446, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3681681156158447, + "rewards/margins": 0.26934143900871277, + "rewards/rejected": -1.6375095844268799, + "sft_loss": 1.3063232898712158, "step": 1130 }, { "epoch": 0.6074594413781569, - "grad_norm": 7.685179844824281, - "learning_rate": 2.9051200618974418e-06, - "logits/chosen": -0.10257701575756073, - "logits/rejected": 0.08493608981370926, - "logps/chosen": -1.4827502965927124, - "logps/rejected": -1.8549566268920898, - "loss": 1.0427, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.4827502965927124, - "rewards/margins": 0.37220636010169983, - "rewards/rejected": -1.8549566268920898, - "sft_loss": 1.343749761581421, + "grad_norm": 7.912221491269787, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": -0.09429174661636353, + "logits/rejected": 0.06249885633587837, + "logps/chosen": -1.4401443004608154, + "logps/rejected": -1.7265408039093018, + "loss": 1.0669, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4401443004608154, + "rewards/margins": 0.286396324634552, + "rewards/rejected": -1.7265408039093018, + "sft_loss": 1.3381919860839844, "step": 1135 }, { "epoch": 0.6101354741595585, - "grad_norm": 6.3001894697285605, - "learning_rate": 2.903477938092354e-06, - "logits/chosen": -0.08838716894388199, - "logits/rejected": -0.047435659915208817, - "logps/chosen": -1.39447021484375, - "logps/rejected": -1.5364607572555542, - "loss": 1.1398, + "grad_norm": 7.290932210175596, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": -0.09101923555135727, + "logits/rejected": -0.04793179780244827, + "logps/chosen": -1.3850233554840088, + "logps/rejected": -1.5009348392486572, + "loss": 1.137, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.39447021484375, - "rewards/margins": 0.14199037849903107, - "rewards/rejected": -1.5364607572555542, - "sft_loss": 1.4440193176269531, + "rewards/chosen": -1.3850233554840088, + "rewards/margins": 0.115911565721035, + "rewards/rejected": -1.5009348392486572, + "sft_loss": 1.4382996559143066, "step": 1140 }, { "epoch": 0.61281150694096, - "grad_norm": 5.5920734867489434, - "learning_rate": 2.901822197738155e-06, - "logits/chosen": -0.1857428401708603, - "logits/rejected": -0.049183569848537445, - "logps/chosen": -1.3664112091064453, - "logps/rejected": -1.703650712966919, - "loss": 1.0848, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3664112091064453, - "rewards/margins": 0.33723941445350647, - "rewards/rejected": -1.703650712966919, - "sft_loss": 1.4322891235351562, + "grad_norm": 5.994763978269897, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": -0.21217043697834015, + "logits/rejected": -0.09275046736001968, + "logps/chosen": -1.3658145666122437, + "logps/rejected": -1.6350940465927124, + "loss": 1.0997, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3658145666122437, + "rewards/margins": 0.2692795395851135, + "rewards/rejected": -1.6350940465927124, + "sft_loss": 1.430191993713379, "step": 1145 }, { "epoch": 0.6154875397223616, - "grad_norm": 5.623834598583619, - "learning_rate": 2.9001528568988454e-06, - "logits/chosen": -0.16399559378623962, - "logits/rejected": -0.016664093360304832, - "logps/chosen": -1.2457985877990723, - "logps/rejected": -1.6100772619247437, - "loss": 0.9906, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2457985877990723, - "rewards/margins": 0.36427873373031616, - "rewards/rejected": -1.6100772619247437, - "sft_loss": 1.2594926357269287, + "grad_norm": 6.2537328293323515, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": -0.18638640642166138, + "logits/rejected": -0.04663598909974098, + "logps/chosen": -1.2614182233810425, + "logps/rejected": -1.5357428789138794, + "loss": 1.0133, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2614182233810425, + "rewards/margins": 0.27432459592819214, + "rewards/rejected": -1.5357428789138794, + "sft_loss": 1.2610071897506714, "step": 1150 }, { "epoch": 0.6181635725037632, - "grad_norm": 8.323688037510497, - "learning_rate": 2.898469931770378e-06, - "logits/chosen": -0.023495309054851532, - "logits/rejected": 0.08119723200798035, - "logps/chosen": -1.3699777126312256, - "logps/rejected": -1.5569205284118652, - "loss": 1.0993, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3699777126312256, - "rewards/margins": 0.1869426965713501, - "rewards/rejected": -1.5569205284118652, - "sft_loss": 1.3976179361343384, + "grad_norm": 6.153085584027617, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": -0.0817769393324852, + "logits/rejected": 0.008442547172307968, + "logps/chosen": -1.3463274240493774, + "logps/rejected": -1.5101841688156128, + "loss": 1.0983, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3463274240493774, + "rewards/margins": 0.16385677456855774, + "rewards/rejected": -1.5101841688156128, + "sft_loss": 1.383152723312378, "step": 1155 }, { "epoch": 0.6208396052851648, - "grad_norm": 7.410365546672956, - "learning_rate": 2.896773438680498e-06, - "logits/chosen": -0.008547200821340084, - "logits/rejected": 0.09500636160373688, - "logps/chosen": -1.342989206314087, - "logps/rejected": -1.706742525100708, - "loss": 1.0439, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.342989206314087, - "rewards/margins": 0.3637532591819763, - "rewards/rejected": -1.706742525100708, - "sft_loss": 1.3892873525619507, + "grad_norm": 14.169619640940896, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": -0.02574927732348442, + "logits/rejected": 0.06538109481334686, + "logps/chosen": -1.30341637134552, + "logps/rejected": -1.5070956945419312, + "loss": 1.07, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.30341637134552, + "rewards/margins": 0.20367932319641113, + "rewards/rejected": -1.5070956945419312, + "sft_loss": 1.3562206029891968, "step": 1160 }, { "epoch": 0.6235156380665663, - "grad_norm": 8.28756211424691, - "learning_rate": 2.8950633940885908e-06, - "logits/chosen": -0.09383663535118103, - "logits/rejected": 0.0011787057155743241, - "logps/chosen": -1.3285152912139893, - "logps/rejected": -1.649229645729065, - "loss": 1.0439, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3285152912139893, - "rewards/margins": 0.32071438431739807, - "rewards/rejected": -1.649229645729065, - "sft_loss": 1.336268663406372, + "grad_norm": 5.081540117717634, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": -0.07648883759975433, + "logits/rejected": 0.006164224352687597, + "logps/chosen": -1.256722092628479, + "logps/rejected": -1.4922142028808594, + "loss": 1.0285, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.256722092628479, + "rewards/margins": 0.23549184203147888, + "rewards/rejected": -1.4922142028808594, + "sft_loss": 1.2796533107757568, "step": 1165 }, { "epoch": 0.6261916708479679, - "grad_norm": 4.7219237142016075, - "learning_rate": 2.893339814585516e-06, - "logits/chosen": -0.14430832862854004, - "logits/rejected": 0.03167320415377617, - "logps/chosen": -1.5874072313308716, - "logps/rejected": -1.8974529504776, - "loss": 1.1458, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.5874072313308716, - "rewards/margins": 0.3100458085536957, - "rewards/rejected": -1.8974529504776, - "sft_loss": 1.509259819984436, + "grad_norm": 5.366403260527972, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": -0.10264239460229874, + "logits/rejected": 0.056674420833587646, + "logps/chosen": -1.4898585081100464, + "logps/rejected": -1.6733713150024414, + "loss": 1.1404, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4898585081100464, + "rewards/margins": 0.18351267278194427, + "rewards/rejected": -1.6733713150024414, + "sft_loss": 1.449183702468872, "step": 1170 }, { "epoch": 0.6288677036293695, - "grad_norm": 4.943932933930905, - "learning_rate": 2.8916027168934483e-06, - "logits/chosen": -0.0802348330616951, - "logits/rejected": 0.12030297517776489, - "logps/chosen": -1.3435288667678833, - "logps/rejected": -1.6719791889190674, - "loss": 1.0802, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3435288667678833, - "rewards/margins": 0.32845038175582886, - "rewards/rejected": -1.6719791889190674, - "sft_loss": 1.3819727897644043, + "grad_norm": 4.08768961851226, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": -0.08438001573085785, + "logits/rejected": 0.0963730663061142, + "logps/chosen": -1.2894665002822876, + "logps/rejected": -1.5089442729949951, + "loss": 1.0759, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2894665002822876, + "rewards/margins": 0.21947786211967468, + "rewards/rejected": -1.5089442729949951, + "sft_loss": 1.3434022665023804, "step": 1175 }, { "epoch": 0.631543736410771, - "grad_norm": 5.123270327438244, - "learning_rate": 2.889852117865718e-06, - "logits/chosen": -0.07914379984140396, - "logits/rejected": 0.08770457655191422, - "logps/chosen": -1.4189870357513428, - "logps/rejected": -1.7545255422592163, - "loss": 1.0451, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.4189870357513428, - "rewards/margins": 0.33553850650787354, - "rewards/rejected": -1.7545255422592163, - "sft_loss": 1.4011993408203125, + "grad_norm": 5.640060679407475, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": -0.08749563992023468, + "logits/rejected": 0.05261549353599548, + "logps/chosen": -1.391072392463684, + "logps/rejected": -1.680537462234497, + "loss": 1.0486, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.391072392463684, + "rewards/margins": 0.2894650399684906, + "rewards/rejected": -1.680537462234497, + "sft_loss": 1.3792742490768433, "step": 1180 }, { "epoch": 0.6342197691921726, - "grad_norm": 6.8755673937049195, - "learning_rate": 2.888088034486645e-06, - "logits/chosen": -0.008513232693076134, - "logits/rejected": 0.15203821659088135, - "logps/chosen": -1.4755704402923584, - "logps/rejected": -1.7755672931671143, - "loss": 1.0964, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.4755704402923584, - "rewards/margins": 0.29999667406082153, - "rewards/rejected": -1.7755672931671143, - "sft_loss": 1.426283836364746, + "grad_norm": 8.121299748032293, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": -0.031509868800640106, + "logits/rejected": 0.10663716495037079, + "logps/chosen": -1.4060813188552856, + "logps/rejected": -1.670353651046753, + "loss": 1.0896, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4060813188552856, + "rewards/margins": 0.26427239179611206, + "rewards/rejected": -1.670353651046753, + "sft_loss": 1.3924411535263062, "step": 1185 }, { "epoch": 0.6368958019735742, - "grad_norm": 7.966400753891832, - "learning_rate": 2.886310483871373e-06, - "logits/chosen": -0.08631936460733414, - "logits/rejected": 0.07498349249362946, - "logps/chosen": -1.4149017333984375, - "logps/rejected": -1.783022165298462, - "loss": 1.0399, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.4149017333984375, - "rewards/margins": 0.36812031269073486, - "rewards/rejected": -1.783022165298462, - "sft_loss": 1.4194279909133911, + "grad_norm": 7.942419990123221, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": -0.09847302734851837, + "logits/rejected": 0.049504801630973816, + "logps/chosen": -1.392059564590454, + "logps/rejected": -1.6771005392074585, + "loss": 1.0557, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.392059564590454, + "rewards/margins": 0.2850412428379059, + "rewards/rejected": -1.6771005392074585, + "sft_loss": 1.4009144306182861, "step": 1190 }, { "epoch": 0.6395718347549757, - "grad_norm": 5.166588529451397, - "learning_rate": 2.8845194832657067e-06, - "logits/chosen": -0.026654431596398354, - "logits/rejected": 0.11714208126068115, - "logps/chosen": -1.261995553970337, - "logps/rejected": -1.717275857925415, - "loss": 0.9894, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.261995553970337, - "rewards/margins": 0.45528024435043335, - "rewards/rejected": -1.717275857925415, - "sft_loss": 1.3523094654083252, + "grad_norm": 5.441289218712803, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": -0.04815680533647537, + "logits/rejected": 0.0761982649564743, + "logps/chosen": -1.2798337936401367, + "logps/rejected": -1.6139675378799438, + "loss": 1.0236, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2798337936401367, + "rewards/margins": 0.3341337740421295, + "rewards/rejected": -1.6139675378799438, + "sft_loss": 1.3444349765777588, "step": 1195 }, { "epoch": 0.6422478675363773, - "grad_norm": 8.259488569986726, - "learning_rate": 2.882715050045941e-06, - "logits/chosen": -0.09320361912250519, - "logits/rejected": -0.018301691859960556, - "logps/chosen": -1.3758822679519653, - "logps/rejected": -1.6776307821273804, - "loss": 1.0763, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3758822679519653, - "rewards/margins": 0.30174845457077026, - "rewards/rejected": -1.6776307821273804, - "sft_loss": 1.3632720708847046, + "grad_norm": 11.168127119077242, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": -0.07909546047449112, + "logits/rejected": -0.0054620252922177315, + "logps/chosen": -1.3398972749710083, + "logps/rejected": -1.5796291828155518, + "loss": 1.077, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3398972749710083, + "rewards/margins": 0.239732027053833, + "rewards/rejected": -1.5796291828155518, + "sft_loss": 1.3416160345077515, "step": 1200 }, { "epoch": 0.6422478675363773, - "eval_logits/chosen": 0.33133628964424133, - "eval_logits/rejected": 0.43879061937332153, - "eval_logps/chosen": -1.3875893354415894, - "eval_logps/rejected": -1.74982488155365, - "eval_loss": 1.0494909286499023, - "eval_rewards/accuracies": 0.6045994162559509, - "eval_rewards/chosen": -1.3875893354415894, - "eval_rewards/margins": 0.3622351884841919, - "eval_rewards/rejected": -1.74982488155365, - "eval_runtime": 43.0416, - "eval_samples_per_second": 31.249, - "eval_sft_loss": 1.3954273462295532, - "eval_steps_per_second": 7.83, + "eval_logits/chosen": 0.264872670173645, + "eval_logits/rejected": 0.35892850160598755, + "eval_logps/chosen": -1.3685014247894287, + "eval_logps/rejected": -1.6704237461090088, + "eval_loss": 1.0590972900390625, + "eval_rewards/accuracies": 0.5934718251228333, + "eval_rewards/chosen": -1.3685014247894287, + "eval_rewards/margins": 0.30192235112190247, + "eval_rewards/rejected": -1.6704237461090088, + "eval_runtime": 43.4948, + "eval_samples_per_second": 30.923, + "eval_sft_loss": 1.382156491279602, + "eval_steps_per_second": 7.748, "step": 1200 }, { "epoch": 0.6449239003177789, - "grad_norm": 8.512204625035347, - "learning_rate": 2.8808972017186957e-06, - "logits/chosen": -0.20027823746204376, - "logits/rejected": -0.0024368553422391415, - "logps/chosen": -1.3654619455337524, - "logps/rejected": -1.6480159759521484, - "loss": 1.0671, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3654619455337524, - "rewards/margins": 0.28255385160446167, - "rewards/rejected": -1.6480159759521484, - "sft_loss": 1.3939441442489624, + "grad_norm": 8.162444465412007, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": -0.19529876112937927, + "logits/rejected": -0.012327780947089195, + "logps/chosen": -1.3344361782073975, + "logps/rejected": -1.5679818391799927, + "loss": 1.071, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3344361782073975, + "rewards/margins": 0.23354558646678925, + "rewards/rejected": -1.5679818391799927, + "sft_loss": 1.3671363592147827, "step": 1205 }, { "epoch": 0.6475999330991805, - "grad_norm": 5.435589749333542, - "learning_rate": 2.8790659559207434e-06, - "logits/chosen": -0.10933800041675568, - "logits/rejected": 0.12187595665454865, - "logps/chosen": -1.339516282081604, - "logps/rejected": -1.6303303241729736, - "loss": 1.0521, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.339516282081604, - "rewards/margins": 0.2908141016960144, - "rewards/rejected": -1.6303303241729736, - "sft_loss": 1.3552604913711548, + "grad_norm": 8.134798688280451, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": -0.09892721474170685, + "logits/rejected": 0.10894634574651718, + "logps/chosen": -1.3433425426483154, + "logps/rejected": -1.5716124773025513, + "loss": 1.0678, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3433425426483154, + "rewards/margins": 0.22826996445655823, + "rewards/rejected": -1.5716124773025513, + "sft_loss": 1.3458704948425293, "step": 1210 }, { "epoch": 0.650275965880582, - "grad_norm": 5.825372619643404, - "learning_rate": 2.877221330418838e-06, - "logits/chosen": -0.15005064010620117, - "logits/rejected": -0.004594183061271906, - "logps/chosen": -1.3774538040161133, - "logps/rejected": -1.5936380624771118, - "loss": 1.1074, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3774538040161133, - "rewards/margins": 0.21618422865867615, - "rewards/rejected": -1.5936380624771118, - "sft_loss": 1.3924789428710938, + "grad_norm": 5.685370068762198, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": -0.13253986835479736, + "logits/rejected": -0.004379653837531805, + "logps/chosen": -1.3532987833023071, + "logps/rejected": -1.5336767435073853, + "loss": 1.1017, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3532987833023071, + "rewards/margins": 0.18037788569927216, + "rewards/rejected": -1.5336767435073853, + "sft_loss": 1.3750277757644653, "step": 1215 }, { "epoch": 0.6529519986619836, - "grad_norm": 6.1466877107192355, - "learning_rate": 2.875363343109545e-06, - "logits/chosen": 0.013457834720611572, - "logits/rejected": 0.14119693636894226, - "logps/chosen": -1.28562331199646, - "logps/rejected": -1.5532904863357544, - "loss": 1.0477, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.28562331199646, - "rewards/margins": 0.26766690611839294, - "rewards/rejected": -1.5532904863357544, - "sft_loss": 1.2760810852050781, + "grad_norm": 7.141213912440162, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": 0.045246172696352005, + "logits/rejected": 0.16152077913284302, + "logps/chosen": -1.2738714218139648, + "logps/rejected": -1.4765288829803467, + "loss": 1.06, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2738714218139648, + "rewards/margins": 0.2026575803756714, + "rewards/rejected": -1.4765288829803467, + "sft_loss": 1.2613481283187866, "step": 1220 }, { "epoch": 0.6556280314433852, - "grad_norm": 6.2074220518550804, - "learning_rate": 2.8734920120190645e-06, - "logits/chosen": -0.20628468692302704, - "logits/rejected": 0.02951905131340027, - "logps/chosen": -1.4024266004562378, - "logps/rejected": -1.6120758056640625, - "loss": 1.1006, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.4024266004562378, - "rewards/margins": 0.20964908599853516, - "rewards/rejected": -1.6120758056640625, - "sft_loss": 1.4208050966262817, + "grad_norm": 6.258863579208841, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": -0.18121086061000824, + "logits/rejected": 0.03602520748972893, + "logps/chosen": -1.3640462160110474, + "logps/rejected": -1.5456712245941162, + "loss": 1.0971, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3640462160110474, + "rewards/margins": 0.18162491917610168, + "rewards/rejected": -1.5456712245941162, + "sft_loss": 1.3939239978790283, "step": 1225 }, { "epoch": 0.6583040642247867, - "grad_norm": 8.262618460377562, - "learning_rate": 2.8716073553030593e-06, - "logits/chosen": -0.10507309436798096, - "logits/rejected": 0.011842099949717522, - "logps/chosen": -1.334250569343567, - "logps/rejected": -1.6416661739349365, - "loss": 1.0449, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.334250569343567, - "rewards/margins": 0.3074159026145935, - "rewards/rejected": -1.6416661739349365, - "sft_loss": 1.308292031288147, + "grad_norm": 9.745810185971298, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": -0.03982186317443848, + "logits/rejected": 0.07728839665651321, + "logps/chosen": -1.3038945198059082, + "logps/rejected": -1.5501224994659424, + "loss": 1.0451, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3038945198059082, + "rewards/margins": 0.24622802436351776, + "rewards/rejected": -1.5501224994659424, + "sft_loss": 1.2884595394134521, "step": 1230 }, { "epoch": 0.6609800970061883, - "grad_norm": 5.237995416480608, - "learning_rate": 2.8697093912464782e-06, - "logits/chosen": -0.08195515722036362, - "logits/rejected": 0.07722845673561096, - "logps/chosen": -1.3526867628097534, - "logps/rejected": -1.5650540590286255, - "loss": 1.0897, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3526867628097534, - "rewards/margins": 0.21236717700958252, - "rewards/rejected": -1.5650540590286255, - "sft_loss": 1.4327796697616577, + "grad_norm": 5.964030136663616, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": -0.04003802686929703, + "logits/rejected": 0.10245601087808609, + "logps/chosen": -1.343867301940918, + "logps/rejected": -1.4991952180862427, + "loss": 1.1028, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.343867301940918, + "rewards/margins": 0.15532787144184113, + "rewards/rejected": -1.4991952180862427, + "sft_loss": 1.419036626815796, "step": 1235 }, { "epoch": 0.6636561297875899, - "grad_norm": 4.975635376617012, - "learning_rate": 2.8677981382633753e-06, - "logits/chosen": -0.2311730831861496, - "logits/rejected": -0.0849604532122612, - "logps/chosen": -1.3269524574279785, - "logps/rejected": -1.647106409072876, - "loss": 1.041, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3269524574279785, - "rewards/margins": 0.32015395164489746, - "rewards/rejected": -1.647106409072876, - "sft_loss": 1.3890091180801392, + "grad_norm": 6.519019052078934, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": -0.16727600991725922, + "logits/rejected": -0.029840881004929543, + "logps/chosen": -1.3048359155654907, + "logps/rejected": -1.5225237607955933, + "loss": 1.066, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3048359155654907, + "rewards/margins": 0.21768799424171448, + "rewards/rejected": -1.5225237607955933, + "sft_loss": 1.3605889081954956, "step": 1240 }, { "epoch": 0.6663321625689914, - "grad_norm": 5.458841354429057, - "learning_rate": 2.8658736148967366e-06, - "logits/chosen": -0.1458047479391098, - "logits/rejected": 0.0449729859828949, - "logps/chosen": -1.4105819463729858, - "logps/rejected": -1.6483221054077148, - "loss": 1.1175, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.4105819463729858, - "rewards/margins": 0.23774006962776184, - "rewards/rejected": -1.6483221054077148, - "sft_loss": 1.4562586545944214, + "grad_norm": 6.917448069209358, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": -0.06976354122161865, + "logits/rejected": 0.11428710073232651, + "logps/chosen": -1.3906006813049316, + "logps/rejected": -1.538474678993225, + "loss": 1.1394, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3906006813049316, + "rewards/margins": 0.14787396788597107, + "rewards/rejected": -1.538474678993225, + "sft_loss": 1.437873125076294, "step": 1245 }, { "epoch": 0.669008195350393, - "grad_norm": 7.300740100285893, - "learning_rate": 2.8639358398182947e-06, - "logits/chosen": -0.14232835173606873, - "logits/rejected": 0.07038958370685577, - "logps/chosen": -1.452538013458252, - "logps/rejected": -1.6742738485336304, - "loss": 1.118, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -1.452538013458252, - "rewards/margins": 0.22173579037189484, - "rewards/rejected": -1.6742738485336304, - "sft_loss": 1.4558159112930298, + "grad_norm": 7.9944608337121705, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": -0.04944934695959091, + "logits/rejected": 0.16209319233894348, + "logps/chosen": -1.4066098928451538, + "logps/rejected": -1.5687237977981567, + "loss": 1.1164, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4066098928451538, + "rewards/margins": 0.1621139943599701, + "rewards/rejected": -1.5687237977981567, + "sft_loss": 1.4220317602157593, "step": 1250 }, { "epoch": 0.6716842281317946, - "grad_norm": 6.969694171415584, - "learning_rate": 2.8619848318283538e-06, - "logits/chosen": -0.1824021339416504, - "logits/rejected": -0.051718395203351974, - "logps/chosen": -1.3421350717544556, - "logps/rejected": -1.6273431777954102, - "loss": 1.0732, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3421350717544556, - "rewards/margins": 0.285208135843277, - "rewards/rejected": -1.6273431777954102, - "sft_loss": 1.430902123451233, + "grad_norm": 7.613567393058519, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": -0.06999413669109344, + "logits/rejected": 0.06490659713745117, + "logps/chosen": -1.3217823505401611, + "logps/rejected": -1.558611273765564, + "loss": 1.0764, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3217823505401611, + "rewards/margins": 0.23682892322540283, + "rewards/rejected": -1.558611273765564, + "sft_loss": 1.393017053604126, "step": 1255 }, { "epoch": 0.6743602609131962, - "grad_norm": 6.732449093746142, - "learning_rate": 2.860020609855601e-06, - "logits/chosen": -0.2673589587211609, - "logits/rejected": -0.1350955367088318, - "logps/chosen": -1.3457567691802979, - "logps/rejected": -1.7736858129501343, - "loss": 1.0376, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3457567691802979, - "rewards/margins": 0.42792901396751404, - "rewards/rejected": -1.7736858129501343, - "sft_loss": 1.3949072360992432, + "grad_norm": 5.503462436845649, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": -0.07714545726776123, + "logits/rejected": 0.06864900887012482, + "logps/chosen": -1.2933156490325928, + "logps/rejected": -1.6474330425262451, + "loss": 1.0371, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2933156490325928, + "rewards/margins": 0.3541174829006195, + "rewards/rejected": -1.6474330425262451, + "sft_loss": 1.3451446294784546, "step": 1260 }, { "epoch": 0.6770362936945977, - "grad_norm": 5.744150499257258, - "learning_rate": 2.858043192956926e-06, - "logits/chosen": -0.13780884444713593, - "logits/rejected": 0.010678360238671303, - "logps/chosen": -1.3624342679977417, - "logps/rejected": -1.707216501235962, - "loss": 1.0493, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3624342679977417, - "rewards/margins": 0.34478217363357544, - "rewards/rejected": -1.707216501235962, - "sft_loss": 1.3893402814865112, + "grad_norm": 6.962243867292986, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": -0.0016354650724679232, + "logits/rejected": 0.16194012761116028, + "logps/chosen": -1.3304073810577393, + "logps/rejected": -1.604431390762329, + "loss": 1.0525, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3304073810577393, + "rewards/margins": 0.2740240693092346, + "rewards/rejected": -1.604431390762329, + "sft_loss": 1.3525432348251343, "step": 1265 }, { "epoch": 0.6797123264759993, - "grad_norm": 6.619509558086586, - "learning_rate": 2.856052600317237e-06, - "logits/chosen": -0.21283817291259766, - "logits/rejected": -0.10551418364048004, - "logps/chosen": -1.3334189653396606, - "logps/rejected": -1.694124460220337, - "loss": 1.0483, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3334189653396606, - "rewards/margins": 0.36070531606674194, - "rewards/rejected": -1.694124460220337, - "sft_loss": 1.3852914571762085, + "grad_norm": 7.674531775502987, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": -0.09173519909381866, + "logits/rejected": 0.021121881902217865, + "logps/chosen": -1.3317499160766602, + "logps/rejected": -1.6477981805801392, + "loss": 1.0598, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3317499160766602, + "rewards/margins": 0.31604841351509094, + "rewards/rejected": -1.6477981805801392, + "sft_loss": 1.3782113790512085, "step": 1270 }, { "epoch": 0.6823883592574009, - "grad_norm": 5.5743901442295565, - "learning_rate": 2.8540488512492725e-06, - "logits/chosen": -0.16439157724380493, - "logits/rejected": -0.053611718118190765, - "logps/chosen": -1.3701202869415283, - "logps/rejected": -1.631082534790039, - "loss": 1.0734, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3701202869415283, - "rewards/margins": 0.2609623372554779, - "rewards/rejected": -1.631082534790039, - "sft_loss": 1.3603696823120117, + "grad_norm": 6.056954538944757, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": -0.07323311269283295, + "logits/rejected": 0.039187707006931305, + "logps/chosen": -1.3623502254486084, + "logps/rejected": -1.5875145196914673, + "loss": 1.0739, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3623502254486084, + "rewards/margins": 0.22516405582427979, + "rewards/rejected": -1.5875145196914673, + "sft_loss": 1.3416621685028076, "step": 1275 }, { "epoch": 0.6850643920388024, - "grad_norm": 7.116426977910912, - "learning_rate": 2.8520319651934147e-06, - "logits/chosen": -0.1693895161151886, - "logits/rejected": -0.007607897277921438, - "logps/chosen": -1.4261596202850342, - "logps/rejected": -1.6268680095672607, - "loss": 1.1275, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4261596202850342, - "rewards/margins": 0.2007085531949997, - "rewards/rejected": -1.6268680095672607, - "sft_loss": 1.4610538482666016, + "grad_norm": 8.226179672171405, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": -0.056537926197052, + "logits/rejected": 0.11088557541370392, + "logps/chosen": -1.4244935512542725, + "logps/rejected": -1.5980981588363647, + "loss": 1.1354, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4244935512542725, + "rewards/margins": 0.17360465228557587, + "rewards/rejected": -1.5980981588363647, + "sft_loss": 1.4504040479660034, "step": 1280 }, { "epoch": 0.687740424820204, - "grad_norm": 6.482471516998786, - "learning_rate": 2.8500019617175005e-06, - "logits/chosen": -0.1612972617149353, - "logits/rejected": -0.014109318144619465, - "logps/chosen": -1.288216233253479, - "logps/rejected": -1.5245250463485718, - "loss": 1.0442, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.288216233253479, - "rewards/margins": 0.2363087385892868, - "rewards/rejected": -1.5245250463485718, - "sft_loss": 1.318396806716919, + "grad_norm": 7.010250854431581, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": -0.054350245743989944, + "logits/rejected": 0.09495130926370621, + "logps/chosen": -1.2873237133026123, + "logps/rejected": -1.4944299459457397, + "loss": 1.0511, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2873237133026123, + "rewards/margins": 0.20710627734661102, + "rewards/rejected": -1.4944299459457397, + "sft_loss": 1.3140829801559448, "step": 1285 }, { "epoch": 0.6904164576016056, - "grad_norm": 7.346320744981688, - "learning_rate": 2.847958860516633e-06, - "logits/chosen": -0.31444284319877625, - "logits/rejected": -0.1650298833847046, - "logps/chosen": -1.3861689567565918, - "logps/rejected": -1.5643253326416016, - "loss": 1.1179, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.3861689567565918, - "rewards/margins": 0.17815645039081573, - "rewards/rejected": -1.5643253326416016, - "sft_loss": 1.380351185798645, + "grad_norm": 6.737089823542583, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": -0.19877436757087708, + "logits/rejected": -0.042660392820835114, + "logps/chosen": -1.3604657649993896, + "logps/rejected": -1.480218529701233, + "loss": 1.1203, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.3604657649993896, + "rewards/margins": 0.11975283920764923, + "rewards/rejected": -1.480218529701233, + "sft_loss": 1.3572709560394287, "step": 1290 }, { "epoch": 0.6930924903830072, - "grad_norm": 4.507333897677239, - "learning_rate": 2.8459026814129887e-06, - "logits/chosen": -0.25474271178245544, - "logits/rejected": -0.2486313134431839, - "logps/chosen": -1.4083340167999268, - "logps/rejected": -1.7456175088882446, - "loss": 1.0595, + "grad_norm": 5.140746161365096, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": -0.08950953930616379, + "logits/rejected": -0.07718075811862946, + "logps/chosen": -1.3389971256256104, + "logps/rejected": -1.635023832321167, + "loss": 1.0446, "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.4083340167999268, - "rewards/margins": 0.33728331327438354, - "rewards/rejected": -1.7456175088882446, - "sft_loss": 1.4073834419250488, + "rewards/chosen": -1.3389971256256104, + "rewards/margins": 0.2960268557071686, + "rewards/rejected": -1.635023832321167, + "sft_loss": 1.3482633829116821, "step": 1295 }, { "epoch": 0.6957685231644087, - "grad_norm": 5.9865959007720235, - "learning_rate": 2.8438334443556268e-06, - "logits/chosen": -0.2621445059776306, - "logits/rejected": -0.019263360649347305, - "logps/chosen": -1.3438646793365479, - "logps/rejected": -1.7832008600234985, - "loss": 1.0314, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3438646793365479, - "rewards/margins": 0.43933621048927307, - "rewards/rejected": -1.7832008600234985, - "sft_loss": 1.3932850360870361, + "grad_norm": 6.4103278006865585, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": -0.09214359521865845, + "logits/rejected": 0.17356061935424805, + "logps/chosen": -1.3123610019683838, + "logps/rejected": -1.6263983249664307, + "loss": 1.0509, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3123610019683838, + "rewards/margins": 0.31403714418411255, + "rewards/rejected": -1.6263983249664307, + "sft_loss": 1.3706796169281006, "step": 1300 }, { "epoch": 0.6984445559458103, - "grad_norm": 5.275881261428366, - "learning_rate": 2.8417511694202938e-06, - "logits/chosen": -0.1755446493625641, - "logits/rejected": -0.1253184974193573, - "logps/chosen": -1.370633602142334, - "logps/rejected": -1.6897408962249756, - "loss": 1.0674, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.370633602142334, - "rewards/margins": 0.3191072940826416, - "rewards/rejected": -1.6897408962249756, - "sft_loss": 1.3716998100280762, + "grad_norm": 6.504675097223832, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": 0.002763524651527405, + "logits/rejected": 0.05632271245121956, + "logps/chosen": -1.3333485126495361, + "logps/rejected": -1.5826590061187744, + "loss": 1.0595, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3333485126495361, + "rewards/margins": 0.24931053817272186, + "rewards/rejected": -1.5826590061187744, + "sft_loss": 1.338838815689087, "step": 1305 }, { "epoch": 0.701120588727212, - "grad_norm": 4.190044503398931, - "learning_rate": 2.83965587680923e-06, - "logits/chosen": -0.13763971626758575, - "logits/rejected": -0.06209099292755127, - "logps/chosen": -1.3407888412475586, - "logps/rejected": -1.6444101333618164, - "loss": 1.0604, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3407888412475586, - "rewards/margins": 0.30362120270729065, - "rewards/rejected": -1.6444101333618164, - "sft_loss": 1.3559894561767578, + "grad_norm": 4.090952103579744, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": -0.008335872553288937, + "logits/rejected": 0.07565923780202866, + "logps/chosen": -1.3219220638275146, + "logps/rejected": -1.5549265146255493, + "loss": 1.071, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3219220638275146, + "rewards/margins": 0.2330043613910675, + "rewards/rejected": -1.5549265146255493, + "sft_loss": 1.3386104106903076, "step": 1310 }, { "epoch": 0.7037966215086134, - "grad_norm": 8.589029882972284, - "learning_rate": 2.837547586850974e-06, - "logits/chosen": -0.2269708216190338, - "logits/rejected": -0.051694173365831375, - "logps/chosen": -1.2944409847259521, - "logps/rejected": -1.626725435256958, - "loss": 1.0151, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2944409847259521, - "rewards/margins": 0.33228427171707153, - "rewards/rejected": -1.626725435256958, - "sft_loss": 1.2930586338043213, + "grad_norm": 6.926733832435039, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": -0.11416655778884888, + "logits/rejected": 0.07172179222106934, + "logps/chosen": -1.2963732481002808, + "logps/rejected": -1.5955102443695068, + "loss": 1.0273, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2963732481002808, + "rewards/margins": 0.2991369962692261, + "rewards/rejected": -1.5955102443695068, + "sft_loss": 1.2894750833511353, "step": 1315 }, { "epoch": 0.706472654290015, - "grad_norm": 5.081003427653556, - "learning_rate": 2.8354263200001645e-06, - "logits/chosen": -0.3047958016395569, - "logits/rejected": -0.10471458733081818, - "logps/chosen": -1.274651288986206, - "logps/rejected": -1.5908291339874268, - "loss": 1.0108, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.274651288986206, - "rewards/margins": 0.3161778748035431, - "rewards/rejected": -1.5908291339874268, - "sft_loss": 1.3193323612213135, + "grad_norm": 4.802406989227033, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": -0.1949145644903183, + "logits/rejected": 0.018199989572167397, + "logps/chosen": -1.267913579940796, + "logps/rejected": -1.5648223161697388, + "loss": 1.0175, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.267913579940796, + "rewards/margins": 0.29690876603126526, + "rewards/rejected": -1.5648223161697388, + "sft_loss": 1.2981455326080322, "step": 1320 }, { "epoch": 0.7091486870714167, - "grad_norm": 6.115327160037618, - "learning_rate": 2.8332920968373414e-06, - "logits/chosen": -0.12503978610038757, - "logits/rejected": -0.01366239320486784, - "logps/chosen": -1.3743711709976196, - "logps/rejected": -1.6227270364761353, - "loss": 1.1023, - "rewards/accuracies": 0.53125, - "rewards/chosen": -1.3743711709976196, - "rewards/margins": 0.24835577607154846, - "rewards/rejected": -1.6227270364761353, - "sft_loss": 1.3661524057388306, + "grad_norm": 5.30569400241199, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": -0.03965713828802109, + "logits/rejected": 0.07506690919399261, + "logps/chosen": -1.3903119564056396, + "logps/rejected": -1.5926454067230225, + "loss": 1.1198, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3903119564056396, + "rewards/margins": 0.2023334801197052, + "rewards/rejected": -1.5926454067230225, + "sft_loss": 1.3613229990005493, "step": 1325 }, { "epoch": 0.7118247198528181, - "grad_norm": 5.621581268348691, - "learning_rate": 2.831144938068747e-06, - "logits/chosen": -0.15741266310214996, - "logits/rejected": -0.04052863270044327, - "logps/chosen": -1.3196581602096558, - "logps/rejected": -1.546939492225647, - "loss": 1.0601, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3196581602096558, - "rewards/margins": 0.22728124260902405, - "rewards/rejected": -1.546939492225647, - "sft_loss": 1.326621413230896, + "grad_norm": 6.066655619066335, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": -0.06363092362880707, + "logits/rejected": 0.05990830063819885, + "logps/chosen": -1.3233710527420044, + "logps/rejected": -1.4781570434570312, + "loss": 1.0817, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3233710527420044, + "rewards/margins": 0.15478602051734924, + "rewards/rejected": -1.4781570434570312, + "sft_loss": 1.3233697414398193, "step": 1330 }, { "epoch": 0.7145007526342197, - "grad_norm": 8.501981777373967, - "learning_rate": 2.8289848645261253e-06, - "logits/chosen": -0.1369256228208542, - "logits/rejected": -0.05569840595126152, - "logps/chosen": -1.416253685951233, - "logps/rejected": -1.6683292388916016, - "loss": 1.0777, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.416253685951233, - "rewards/margins": 0.25207552313804626, - "rewards/rejected": -1.6683292388916016, - "sft_loss": 1.4455569982528687, + "grad_norm": 5.762243489235919, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": -0.028935562819242477, + "logits/rejected": 0.05837502330541611, + "logps/chosen": -1.395919919013977, + "logps/rejected": -1.6004736423492432, + "loss": 1.0824, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.395919919013977, + "rewards/margins": 0.2045535296201706, + "rewards/rejected": -1.6004736423492432, + "sft_loss": 1.4127261638641357, "step": 1335 }, { "epoch": 0.7171767854156214, - "grad_norm": 6.8237644835522975, - "learning_rate": 2.826811897166519e-06, - "logits/chosen": -0.16232003271579742, - "logits/rejected": -0.1322738230228424, - "logps/chosen": -1.3589154481887817, - "logps/rejected": -1.6097590923309326, - "loss": 1.0632, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3589154481887817, - "rewards/margins": 0.25084370374679565, - "rewards/rejected": -1.6097590923309326, - "sft_loss": 1.3681776523590088, + "grad_norm": 8.214963066893405, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": -0.03690364211797714, + "logits/rejected": -0.0002823077084030956, + "logps/chosen": -1.346161127090454, + "logps/rejected": -1.5365341901779175, + "loss": 1.0805, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.346161127090454, + "rewards/margins": 0.19037306308746338, + "rewards/rejected": -1.5365341901779175, + "sft_loss": 1.3572635650634766, "step": 1340 }, { "epoch": 0.719852818197023, - "grad_norm": 5.273153198621067, - "learning_rate": 2.8246260570720673e-06, - "logits/chosen": -0.1638951450586319, - "logits/rejected": 0.008594045415520668, - "logps/chosen": -1.3590314388275146, - "logps/rejected": -1.7121204137802124, - "loss": 1.0402, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3590314388275146, - "rewards/margins": 0.3530888259410858, - "rewards/rejected": -1.7121204137802124, - "sft_loss": 1.3907363414764404, + "grad_norm": 4.476913094970793, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": 0.01798745058476925, + "logits/rejected": 0.21475832164287567, + "logps/chosen": -1.3314244747161865, + "logps/rejected": -1.5622254610061646, + "loss": 1.0663, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3314244747161865, + "rewards/margins": 0.2308010309934616, + "rewards/rejected": -1.5622254610061646, + "sft_loss": 1.367760419845581, "step": 1345 }, { "epoch": 0.7225288509784245, - "grad_norm": 7.631563301834823, - "learning_rate": 2.8224273654498007e-06, - "logits/chosen": -0.16318397223949432, - "logits/rejected": -0.11493877321481705, - "logps/chosen": -1.3974087238311768, - "logps/rejected": -1.540160894393921, - "loss": 1.1194, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3974087238311768, - "rewards/margins": 0.14275220036506653, - "rewards/rejected": -1.540160894393921, - "sft_loss": 1.4131476879119873, + "grad_norm": 7.3383665115717776, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": 0.0036095590330660343, + "logits/rejected": 0.06418739259243011, + "logps/chosen": -1.3319792747497559, + "logps/rejected": -1.4216053485870361, + "loss": 1.1099, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3319792747497559, + "rewards/margins": 0.0896257609128952, + "rewards/rejected": -1.4216053485870361, + "sft_loss": 1.3568004369735718, "step": 1350 }, { "epoch": 0.7252048837598261, - "grad_norm": 5.247204698791591, - "learning_rate": 2.8202158436314348e-06, - "logits/chosen": -0.20993058383464813, - "logits/rejected": 0.0656299963593483, - "logps/chosen": -1.4481570720672607, - "logps/rejected": -1.7513744831085205, - "loss": 1.0944, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.4481570720672607, - "rewards/margins": 0.3032172918319702, - "rewards/rejected": -1.7513744831085205, - "sft_loss": 1.4534757137298584, + "grad_norm": 5.1576036315994385, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": -0.034273095428943634, + "logits/rejected": 0.2666153609752655, + "logps/chosen": -1.3899375200271606, + "logps/rejected": -1.5520015954971313, + "loss": 1.1164, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3899375200271606, + "rewards/margins": 0.16206394135951996, + "rewards/rejected": -1.5520015954971313, + "sft_loss": 1.4064255952835083, "step": 1355 }, { "epoch": 0.7278809165412277, - "grad_norm": 6.147247511492117, - "learning_rate": 2.817991513073163e-06, - "logits/chosen": -0.3009326756000519, - "logits/rejected": -0.1586245447397232, - "logps/chosen": -1.480035424232483, - "logps/rejected": -1.8455111980438232, - "loss": 1.0894, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.480035424232483, - "rewards/margins": 0.3654758334159851, - "rewards/rejected": -1.8455111980438232, - "sft_loss": 1.506760835647583, + "grad_norm": 6.155451220270838, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": -0.141189306974411, + "logits/rejected": 0.014295866712927818, + "logps/chosen": -1.4161455631256104, + "logps/rejected": -1.6726045608520508, + "loss": 1.0969, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4161455631256104, + "rewards/margins": 0.2564590871334076, + "rewards/rejected": -1.6726045608520508, + "sft_loss": 1.4574393033981323, "step": 1360 }, { "epoch": 0.7305569493226292, - "grad_norm": 5.125439426438534, - "learning_rate": 2.8157543953554515e-06, - "logits/chosen": -0.16923581063747406, - "logits/rejected": -0.04237973317503929, - "logps/chosen": -1.3896102905273438, - "logps/rejected": -1.6864780187606812, - "loss": 1.0593, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3896102905273438, - "rewards/margins": 0.2968676686286926, - "rewards/rejected": -1.6864780187606812, - "sft_loss": 1.4087154865264893, + "grad_norm": 5.507852253151474, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": -0.013143645599484444, + "logits/rejected": 0.13177946209907532, + "logps/chosen": -1.343443512916565, + "logps/rejected": -1.5579149723052979, + "loss": 1.0692, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.343443512916565, + "rewards/margins": 0.2144714891910553, + "rewards/rejected": -1.5579149723052979, + "sft_loss": 1.3730356693267822, "step": 1365 }, { "epoch": 0.7332329821040308, - "grad_norm": 7.886149599248979, - "learning_rate": 2.813504512182825e-06, - "logits/chosen": -0.14230379462242126, - "logits/rejected": -0.021787326782941818, - "logps/chosen": -1.4374529123306274, - "logps/rejected": -1.8932501077651978, - "loss": 1.0296, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.4374529123306274, - "rewards/margins": 0.45579713582992554, - "rewards/rejected": -1.8932501077651978, - "sft_loss": 1.4520795345306396, + "grad_norm": 16.701879236246025, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": -0.0122370021417737, + "logits/rejected": 0.11792914569377899, + "logps/chosen": -1.3964431285858154, + "logps/rejected": -1.7372715473175049, + "loss": 1.0529, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3964431285858154, + "rewards/margins": 0.3408281207084656, + "rewards/rejected": -1.7372715473175049, + "sft_loss": 1.4153270721435547, "step": 1370 }, { "epoch": 0.7359090148854324, - "grad_norm": 4.451903315499975, - "learning_rate": 2.811241885383661e-06, - "logits/chosen": -0.15278813242912292, - "logits/rejected": -0.0027175932191312313, - "logps/chosen": -1.363525152206421, - "logps/rejected": -1.8220170736312866, - "loss": 1.0206, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.363525152206421, - "rewards/margins": 0.45849180221557617, - "rewards/rejected": -1.8220170736312866, - "sft_loss": 1.4273641109466553, + "grad_norm": 4.722020047322209, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": -0.07226811349391937, + "logits/rejected": 0.08238498121500015, + "logps/chosen": -1.3164997100830078, + "logps/rejected": -1.6869707107543945, + "loss": 1.0365, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3164997100830078, + "rewards/margins": 0.37047088146209717, + "rewards/rejected": -1.6869707107543945, + "sft_loss": 1.3838088512420654, "step": 1375 }, { "epoch": 0.738585047666834, - "grad_norm": 5.185954540567181, - "learning_rate": 2.8089665369099737e-06, - "logits/chosen": -0.19583071768283844, - "logits/rejected": -0.06919754296541214, - "logps/chosen": -1.4165420532226562, - "logps/rejected": -1.6947746276855469, - "loss": 1.0948, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.4165420532226562, - "rewards/margins": 0.2782325744628906, - "rewards/rejected": -1.6947746276855469, - "sft_loss": 1.3944652080535889, + "grad_norm": 5.955961811337593, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": -0.11595847457647324, + "logits/rejected": 0.016401495784521103, + "logps/chosen": -1.367996096611023, + "logps/rejected": -1.5372674465179443, + "loss": 1.1093, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.367996096611023, + "rewards/margins": 0.16927149891853333, + "rewards/rejected": -1.5372674465179443, + "sft_loss": 1.3482203483581543, "step": 1380 }, { "epoch": 0.7412610804482355, - "grad_norm": 11.778771528863663, - "learning_rate": 2.806678488837205e-06, - "logits/chosen": -0.1822900027036667, - "logits/rejected": -0.052754055708646774, - "logps/chosen": -1.361941933631897, - "logps/rejected": -1.6913766860961914, - "loss": 1.0664, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.361941933631897, - "rewards/margins": 0.3294347822666168, - "rewards/rejected": -1.6913766860961914, - "sft_loss": 1.3936665058135986, + "grad_norm": 11.099017032697603, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": -0.08770633488893509, + "logits/rejected": 0.05239544063806534, + "logps/chosen": -1.2994236946105957, + "logps/rejected": -1.551729440689087, + "loss": 1.0551, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2994236946105957, + "rewards/margins": 0.25230568647384644, + "rewards/rejected": -1.551729440689087, + "sft_loss": 1.331479549407959, "step": 1385 }, { "epoch": 0.7439371132296371, - "grad_norm": 8.142489210482202, - "learning_rate": 2.804377763364006e-06, - "logits/chosen": -0.05914510414004326, - "logits/rejected": 0.06488485634326935, - "logps/chosen": -1.4206465482711792, - "logps/rejected": -1.731772780418396, - "loss": 1.0692, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.4206465482711792, - "rewards/margins": 0.31112608313560486, - "rewards/rejected": -1.731772780418396, - "sft_loss": 1.4565393924713135, + "grad_norm": 9.492347311893651, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": 0.004612951073795557, + "logits/rejected": 0.12525108456611633, + "logps/chosen": -1.3718888759613037, + "logps/rejected": -1.640920877456665, + "loss": 1.0616, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3718888759613037, + "rewards/margins": 0.2690318822860718, + "rewards/rejected": -1.640920877456665, + "sft_loss": 1.4074128866195679, "step": 1390 }, { "epoch": 0.7466131460110387, - "grad_norm": 5.424351957136408, - "learning_rate": 2.8020643828120263e-06, - "logits/chosen": -0.016803156584501266, - "logits/rejected": 0.07223924249410629, - "logps/chosen": -1.3424150943756104, - "logps/rejected": -1.546939492225647, - "loss": 1.0573, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3424150943756104, - "rewards/margins": 0.204524427652359, - "rewards/rejected": -1.546939492225647, - "sft_loss": 1.3497629165649414, + "grad_norm": 5.333530619271008, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": 0.009407530538737774, + "logits/rejected": 0.10629584640264511, + "logps/chosen": -1.3189841508865356, + "logps/rejected": -1.4889742136001587, + "loss": 1.0625, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3189841508865356, + "rewards/margins": 0.1699899584054947, + "rewards/rejected": -1.4889742136001587, + "sft_loss": 1.3322023153305054, "step": 1395 }, { "epoch": 0.7492891787924402, - "grad_norm": 5.95213805104973, - "learning_rate": 2.799738369625694e-06, - "logits/chosen": -0.23150594532489777, - "logits/rejected": -0.09028539806604385, - "logps/chosen": -1.4120293855667114, - "logps/rejected": -1.6809848546981812, - "loss": 1.0749, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.4120293855667114, - "rewards/margins": 0.2689554989337921, - "rewards/rejected": -1.6809848546981812, - "sft_loss": 1.419755220413208, + "grad_norm": 6.1020804158599855, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": -0.17140880227088928, + "logits/rejected": -0.028482386842370033, + "logps/chosen": -1.4129021167755127, + "logps/rejected": -1.6202716827392578, + "loss": 1.1006, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4129021167755127, + "rewards/margins": 0.2073693722486496, + "rewards/rejected": -1.6202716827392578, + "sft_loss": 1.4178996086120605, "step": 1400 }, { "epoch": 0.7519652115738418, - "grad_norm": 4.5709113857061885, - "learning_rate": 2.7973997463719993e-06, - "logits/chosen": -0.13744693994522095, - "logits/rejected": 0.05361655354499817, - "logps/chosen": -1.2519391775131226, - "logps/rejected": -1.7877897024154663, - "loss": 1.0051, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2519391775131226, - "rewards/margins": 0.5358504056930542, - "rewards/rejected": -1.7877897024154663, - "sft_loss": 1.3105920553207397, + "grad_norm": 4.848542855963102, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": -0.056819796562194824, + "logits/rejected": 0.1313043087720871, + "logps/chosen": -1.2349143028259277, + "logps/rejected": -1.6562589406967163, + "loss": 1.0076, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2349143028259277, + "rewards/margins": 0.4213446080684662, + "rewards/rejected": -1.6562589406967163, + "sft_loss": 1.2959566116333008, "step": 1405 }, { "epoch": 0.7546412443552434, - "grad_norm": 7.597294311838919, - "learning_rate": 2.7950485357402754e-06, - "logits/chosen": -0.154087632894516, - "logits/rejected": 0.03357213735580444, - "logps/chosen": -1.4047690629959106, - "logps/rejected": -1.7237621545791626, - "loss": 1.0731, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.4047690629959106, - "rewards/margins": 0.31899309158325195, - "rewards/rejected": -1.7237621545791626, - "sft_loss": 1.451963186264038, + "grad_norm": 7.14445775993856, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": -0.08984025567770004, + "logits/rejected": 0.10239746421575546, + "logps/chosen": -1.3794463872909546, + "logps/rejected": -1.649232268333435, + "loss": 1.0763, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3794463872909546, + "rewards/margins": 0.26978588104248047, + "rewards/rejected": -1.649232268333435, + "sft_loss": 1.4267511367797852, "step": 1410 }, { "epoch": 0.7573172771366449, - "grad_norm": 7.598509225801618, - "learning_rate": 2.7926847605419776e-06, - "logits/chosen": -0.05727584287524223, - "logits/rejected": 0.09257154166698456, - "logps/chosen": -1.4217592477798462, - "logps/rejected": -1.512450933456421, + "grad_norm": 8.302373732710077, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": -0.01051993016153574, + "logits/rejected": 0.13935205340385437, + "logps/chosen": -1.4041858911514282, + "logps/rejected": -1.4739868640899658, "loss": 1.1334, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.4217592477798462, - "rewards/margins": 0.09069158881902695, - "rewards/rejected": -1.512450933456421, - "sft_loss": 1.388801097869873, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4041858911514282, + "rewards/margins": 0.06980089098215103, + "rewards/rejected": -1.4739868640899658, + "sft_loss": 1.379386067390442, "step": 1415 }, { "epoch": 0.7599933099180465, - "grad_norm": 4.161958163004443, - "learning_rate": 2.7903084437104633e-06, - "logits/chosen": -0.06450797617435455, - "logits/rejected": 0.07489770650863647, - "logps/chosen": -1.302138328552246, - "logps/rejected": -1.842264175415039, - "loss": 1.0035, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.302138328552246, - "rewards/margins": 0.5401259660720825, - "rewards/rejected": -1.842264175415039, - "sft_loss": 1.3500391244888306, + "grad_norm": 6.116793410992373, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": 0.02948124334216118, + "logits/rejected": 0.17535671591758728, + "logps/chosen": -1.2956939935684204, + "logps/rejected": -1.7455459833145142, + "loss": 1.0261, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2956939935684204, + "rewards/margins": 0.4498518407344818, + "rewards/rejected": -1.7455459833145142, + "sft_loss": 1.3457810878753662, "step": 1420 }, { "epoch": 0.7626693426994481, - "grad_norm": 8.043149328035888, - "learning_rate": 2.787919608300769e-06, - "logits/chosen": -0.03159303590655327, - "logits/rejected": 0.0678340420126915, - "logps/chosen": -1.3615281581878662, - "logps/rejected": -1.7922245264053345, - "loss": 1.0269, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3615281581878662, - "rewards/margins": 0.43069639801979065, - "rewards/rejected": -1.7922245264053345, - "sft_loss": 1.3711903095245361, + "grad_norm": 5.8029302256255395, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": 0.05574868991971016, + "logits/rejected": 0.1520715355873108, + "logps/chosen": -1.3259742259979248, + "logps/rejected": -1.7329938411712646, + "loss": 1.0178, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3259742259979248, + "rewards/margins": 0.4070195257663727, + "rewards/rejected": -1.7329938411712646, + "sft_loss": 1.3466819524765015, "step": 1425 }, { "epoch": 0.7653453754808497, - "grad_norm": 6.851486546482081, - "learning_rate": 2.785518277489387e-06, - "logits/chosen": -0.1362898051738739, - "logits/rejected": 0.004780137445777655, - "logps/chosen": -1.4045703411102295, - "logps/rejected": -1.6434154510498047, - "loss": 1.0694, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4045703411102295, - "rewards/margins": 0.23884525895118713, - "rewards/rejected": -1.6434154510498047, - "sft_loss": 1.3844901323318481, + "grad_norm": 10.907972977051783, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": -0.061895061284303665, + "logits/rejected": 0.0811760351061821, + "logps/chosen": -1.3956642150878906, + "logps/rejected": -1.5960217714309692, + "loss": 1.0789, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3956642150878906, + "rewards/margins": 0.2003575563430786, + "rewards/rejected": -1.5960217714309692, + "sft_loss": 1.374837875366211, "step": 1430 }, { "epoch": 0.7680214082622512, - "grad_norm": 4.307609136140785, - "learning_rate": 2.783104474574038e-06, - "logits/chosen": 0.05091600492596626, - "logits/rejected": 0.10818967968225479, - "logps/chosen": -1.3143080472946167, - "logps/rejected": -1.7300665378570557, - "loss": 1.0169, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3143080472946167, - "rewards/margins": 0.4157584309577942, - "rewards/rejected": -1.7300665378570557, - "sft_loss": 1.3309390544891357, + "grad_norm": 4.703252203250831, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": 0.0737505629658699, + "logits/rejected": 0.13021281361579895, + "logps/chosen": -1.2985069751739502, + "logps/rejected": -1.6690765619277954, + "loss": 1.0218, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2985069751739502, + "rewards/margins": 0.3705694377422333, + "rewards/rejected": -1.6690765619277954, + "sft_loss": 1.3173308372497559, "step": 1435 }, { "epoch": 0.7706974410436528, - "grad_norm": 4.361601400253105, - "learning_rate": 2.7806782229734495e-06, - "logits/chosen": -0.050930093973875046, - "logits/rejected": 0.06570479273796082, - "logps/chosen": -1.3809306621551514, - "logps/rejected": -1.5890841484069824, - "loss": 1.0979, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.3809306621551514, - "rewards/margins": 0.20815351605415344, - "rewards/rejected": -1.5890841484069824, - "sft_loss": 1.41715407371521, + "grad_norm": 4.46831297658942, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": -0.04367053136229515, + "logits/rejected": 0.06593044847249985, + "logps/chosen": -1.3501296043395996, + "logps/rejected": -1.5123838186264038, + "loss": 1.1018, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3501296043395996, + "rewards/margins": 0.1622542440891266, + "rewards/rejected": -1.5123838186264038, + "sft_loss": 1.399683952331543, "step": 1440 }, { "epoch": 0.7733734738250544, - "grad_norm": 6.9265210413386775, - "learning_rate": 2.7782395462271247e-06, - "logits/chosen": -0.12000956386327744, - "logits/rejected": 0.11605888605117798, - "logps/chosen": -1.4133331775665283, - "logps/rejected": -1.6733297109603882, - "loss": 1.1133, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4133331775665283, - "rewards/margins": 0.2599967122077942, - "rewards/rejected": -1.6733297109603882, - "sft_loss": 1.4759104251861572, + "grad_norm": 7.067902265570916, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": -0.11400493234395981, + "logits/rejected": 0.10712490230798721, + "logps/chosen": -1.3892908096313477, + "logps/rejected": -1.571804165840149, + "loss": 1.1281, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3892908096313477, + "rewards/margins": 0.1825132519006729, + "rewards/rejected": -1.571804165840149, + "sft_loss": 1.450918197631836, "step": 1445 }, { "epoch": 0.7760495066064559, - "grad_norm": 8.684710551750364, - "learning_rate": 2.7757884679951167e-06, - "logits/chosen": -0.0006413728115148842, - "logits/rejected": 0.09884602576494217, - "logps/chosen": -1.339545488357544, - "logps/rejected": -1.605812430381775, - "loss": 1.07, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.339545488357544, - "rewards/margins": 0.26626691222190857, - "rewards/rejected": -1.605812430381775, - "sft_loss": 1.3702061176300049, + "grad_norm": 8.924429900342966, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": 0.027986442670226097, + "logits/rejected": 0.12609614431858063, + "logps/chosen": -1.3055108785629272, + "logps/rejected": -1.4768112897872925, + "loss": 1.081, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3055108785629272, + "rewards/margins": 0.1713004857301712, + "rewards/rejected": -1.4768112897872925, + "sft_loss": 1.3337790966033936, "step": 1450 }, { "epoch": 0.7787255393878575, - "grad_norm": 6.184752771063054, - "learning_rate": 2.7733250120577967e-06, - "logits/chosen": -0.062450431287288666, - "logits/rejected": 0.118833526968956, - "logps/chosen": -1.3291722536087036, - "logps/rejected": -1.7080695629119873, - "loss": 1.0305, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3291722536087036, - "rewards/margins": 0.3788975179195404, - "rewards/rejected": -1.7080695629119873, - "sft_loss": 1.3770744800567627, + "grad_norm": 9.113430126638274, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": -0.0451820082962513, + "logits/rejected": 0.1254032701253891, + "logps/chosen": -1.3234608173370361, + "logps/rejected": -1.6025323867797852, + "loss": 1.0535, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3234608173370361, + "rewards/margins": 0.27907148003578186, + "rewards/rejected": -1.6025323867797852, + "sft_loss": 1.3641140460968018, "step": 1455 }, { "epoch": 0.7814015721692591, - "grad_norm": 5.404834701352218, - "learning_rate": 2.770849202315625e-06, - "logits/chosen": -0.03863369673490524, - "logits/rejected": 0.1544675976037979, - "logps/chosen": -1.306546688079834, - "logps/rejected": -1.64643132686615, - "loss": 1.0246, + "grad_norm": 5.923575127090414, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": -0.027543995529413223, + "logits/rejected": 0.14627543091773987, + "logps/chosen": -1.2854974269866943, + "logps/rejected": -1.573866844177246, + "loss": 1.0222, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.306546688079834, - "rewards/margins": 0.33988457918167114, - "rewards/rejected": -1.64643132686615, - "sft_loss": 1.3078439235687256, + "rewards/chosen": -1.2854974269866943, + "rewards/margins": 0.2883693277835846, + "rewards/rejected": -1.573866844177246, + "sft_loss": 1.2862838506698608, "step": 1460 }, { "epoch": 0.7840776049506607, - "grad_norm": 5.904208941026598, - "learning_rate": 2.768361062788919e-06, - "logits/chosen": -0.020415937528014183, - "logits/rejected": 0.10936160385608673, - "logps/chosen": -1.4067522287368774, - "logps/rejected": -1.695853590965271, - "loss": 1.0899, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.4067522287368774, - "rewards/margins": 0.28910142183303833, - "rewards/rejected": -1.695853590965271, - "sft_loss": 1.4484775066375732, + "grad_norm": 5.1028035186060094, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": -0.012474549934267998, + "logits/rejected": 0.09835416078567505, + "logps/chosen": -1.3849401473999023, + "logps/rejected": -1.6243913173675537, + "loss": 1.0999, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3849401473999023, + "rewards/margins": 0.23945105075836182, + "rewards/rejected": -1.6243913173675537, + "sft_loss": 1.4353958368301392, "step": 1465 }, { "epoch": 0.7867536377320622, - "grad_norm": 5.874991355168797, - "learning_rate": 2.7658606176176186e-06, - "logits/chosen": -0.09968818724155426, - "logits/rejected": -0.0667973980307579, - "logps/chosen": -1.365729570388794, - "logps/rejected": -1.6799007654190063, - "loss": 1.0703, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.365729570388794, - "rewards/margins": 0.31417128443717957, - "rewards/rejected": -1.6799007654190063, - "sft_loss": 1.4080615043640137, + "grad_norm": 6.4051747902711265, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": -0.07133413851261139, + "logits/rejected": -0.03863609582185745, + "logps/chosen": -1.3444093465805054, + "logps/rejected": -1.5882580280303955, + "loss": 1.0804, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3444093465805054, + "rewards/margins": 0.2438487708568573, + "rewards/rejected": -1.5882580280303955, + "sft_loss": 1.387082815170288, "step": 1470 }, { "epoch": 0.7894296705134638, - "grad_norm": 5.676763085762059, - "learning_rate": 2.763347891061054e-06, - "logits/chosen": -0.15920597314834595, - "logits/rejected": 0.025278815999627113, - "logps/chosen": -1.3561172485351562, - "logps/rejected": -1.7594220638275146, - "loss": 1.0372, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3561172485351562, - "rewards/margins": 0.4033048748970032, - "rewards/rejected": -1.7594220638275146, - "sft_loss": 1.4026418924331665, + "grad_norm": 6.734701551616222, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": -0.11850512027740479, + "logits/rejected": 0.05692233517765999, + "logps/chosen": -1.3440381288528442, + "logps/rejected": -1.6370413303375244, + "loss": 1.056, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3440381288528442, + "rewards/margins": 0.29300326108932495, + "rewards/rejected": -1.6370413303375244, + "sft_loss": 1.3866641521453857, "step": 1475 }, { "epoch": 0.7921057032948654, - "grad_norm": 5.215987898101865, - "learning_rate": 2.7608229074977103e-06, - "logits/chosen": -0.06529693305492401, - "logits/rejected": 0.04853939637541771, - "logps/chosen": -1.3548333644866943, - "logps/rejected": -1.8335111141204834, - "loss": 1.0368, + "grad_norm": 5.882501221972273, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": -0.021683160215616226, + "logits/rejected": 0.09188304096460342, + "logps/chosen": -1.312713861465454, + "logps/rejected": -1.6861861944198608, + "loss": 1.0403, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3548333644866943, - "rewards/margins": 0.4786776602268219, - "rewards/rejected": -1.8335111141204834, - "sft_loss": 1.3940856456756592, + "rewards/chosen": -1.312713861465454, + "rewards/margins": 0.37347230315208435, + "rewards/rejected": -1.6861861944198608, + "sft_loss": 1.3612488508224487, "step": 1480 }, { "epoch": 0.7947817360762669, - "grad_norm": 9.073385343283363, - "learning_rate": 2.758285691424988e-06, - "logits/chosen": -0.07635178416967392, - "logits/rejected": 0.0887339860200882, - "logps/chosen": -1.4306265115737915, - "logps/rejected": -1.8580020666122437, - "loss": 1.0567, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.4306265115737915, - "rewards/margins": 0.4273756146430969, - "rewards/rejected": -1.8580020666122437, - "sft_loss": 1.3975563049316406, + "grad_norm": 6.189785230569557, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": -0.003997665364295244, + "logits/rejected": 0.16212308406829834, + "logps/chosen": -1.393507957458496, + "logps/rejected": -1.743019461631775, + "loss": 1.0658, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.393507957458496, + "rewards/margins": 0.34951135516166687, + "rewards/rejected": -1.743019461631775, + "sft_loss": 1.3766828775405884, "step": 1485 }, { "epoch": 0.7974577688576685, - "grad_norm": 8.815064683541053, - "learning_rate": 2.7557362674589687e-06, - "logits/chosen": -0.1560938060283661, - "logits/rejected": -0.04363911226391792, - "logps/chosen": -1.3959918022155762, - "logps/rejected": -1.690941572189331, - "loss": 1.0874, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3959918022155762, - "rewards/margins": 0.2949499785900116, - "rewards/rejected": -1.690941572189331, - "sft_loss": 1.3853015899658203, + "grad_norm": 8.010598484403992, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": -0.05653812736272812, + "logits/rejected": 0.06350454688072205, + "logps/chosen": -1.360586166381836, + "logps/rejected": -1.5594004392623901, + "loss": 1.0994, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.360586166381836, + "rewards/margins": 0.1988140493631363, + "rewards/rejected": -1.5594004392623901, + "sft_loss": 1.3619134426116943, "step": 1490 }, { "epoch": 0.8001338016390701, - "grad_norm": 7.199444095750238, - "learning_rate": 2.753174660334175e-06, - "logits/chosen": -0.14701077342033386, - "logits/rejected": -0.039738696068525314, - "logps/chosen": -1.575615644454956, - "logps/rejected": -1.7450742721557617, - "loss": 1.1747, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.575615644454956, - "rewards/margins": 0.16945865750312805, - "rewards/rejected": -1.7450742721557617, - "sft_loss": 1.5748717784881592, + "grad_norm": 7.260851698872519, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": -0.044975005090236664, + "logits/rejected": 0.06740692257881165, + "logps/chosen": -1.508986473083496, + "logps/rejected": -1.6397491693496704, + "loss": 1.1746, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.508986473083496, + "rewards/margins": 0.1307627260684967, + "rewards/rejected": -1.6397491693496704, + "sft_loss": 1.5416109561920166, "step": 1495 }, { "epoch": 0.8028098344204716, - "grad_norm": 9.70254139336959, - "learning_rate": 2.750600894903331e-06, - "logits/chosen": -0.1704801619052887, - "logits/rejected": -0.06618437170982361, - "logps/chosen": -1.4099934101104736, - "logps/rejected": -1.7492620944976807, - "loss": 1.0938, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.4099934101104736, - "rewards/margins": 0.3392687439918518, - "rewards/rejected": -1.7492620944976807, - "sft_loss": 1.4731342792510986, + "grad_norm": 16.709625608251123, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": -0.0836515948176384, + "logits/rejected": 0.021665522828698158, + "logps/chosen": -1.392195463180542, + "logps/rejected": -1.7002432346343994, + "loss": 1.082, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.392195463180542, + "rewards/margins": 0.30804774165153503, + "rewards/rejected": -1.7002432346343994, + "sft_loss": 1.4488755464553833, "step": 1500 }, { "epoch": 0.8054858672018732, - "grad_norm": 6.201274593098627, - "learning_rate": 2.7480149961371194e-06, - "logits/chosen": -0.07009023427963257, - "logits/rejected": -0.005963495466858149, - "logps/chosen": -1.3297359943389893, - "logps/rejected": -1.8149020671844482, - "loss": 0.9912, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3297359943389893, - "rewards/margins": 0.4851660132408142, - "rewards/rejected": -1.8149020671844482, - "sft_loss": 1.332242727279663, + "grad_norm": 10.151517285966094, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": 0.017516251653432846, + "logits/rejected": 0.08750508725643158, + "logps/chosen": -1.324232816696167, + "logps/rejected": -1.7349544763565063, + "loss": 0.9967, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.324232816696167, + "rewards/margins": 0.4107215404510498, + "rewards/rejected": -1.7349544763565063, + "sft_loss": 1.313400149345398, "step": 1505 }, { "epoch": 0.8081618999832748, - "grad_norm": 4.584004786925617, - "learning_rate": 2.745416989123942e-06, - "logits/chosen": -0.16404542326927185, - "logits/rejected": 0.11613837629556656, - "logps/chosen": -1.3978588581085205, - "logps/rejected": -1.738348364830017, - "loss": 1.0535, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3978588581085205, - "rewards/margins": 0.34048959612846375, - "rewards/rejected": -1.738348364830017, - "sft_loss": 1.4199836254119873, + "grad_norm": 5.47793827688658, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": -0.08696796745061874, + "logits/rejected": 0.20715276896953583, + "logps/chosen": -1.4028065204620361, + "logps/rejected": -1.7313998937606812, + "loss": 1.0586, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4028065204620361, + "rewards/margins": 0.32859352231025696, + "rewards/rejected": -1.7313998937606812, + "sft_loss": 1.4195729494094849, "step": 1510 }, { "epoch": 0.8108379327646764, - "grad_norm": 4.7711233664827475, - "learning_rate": 2.7428068990696735e-06, - "logits/chosen": -0.08198316395282745, - "logits/rejected": -0.01538595836609602, - "logps/chosen": -1.3322317600250244, - "logps/rejected": -1.6438226699829102, - "loss": 1.0403, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3322317600250244, - "rewards/margins": 0.3115909695625305, - "rewards/rejected": -1.6438226699829102, - "sft_loss": 1.3519912958145142, + "grad_norm": 5.933438633720938, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": -0.024196814745664597, + "logits/rejected": 0.04886165261268616, + "logps/chosen": -1.329463243484497, + "logps/rejected": -1.601779580116272, + "loss": 1.0526, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.329463243484497, + "rewards/margins": 0.2723161578178406, + "rewards/rejected": -1.601779580116272, + "sft_loss": 1.3440353870391846, "step": 1515 }, { "epoch": 0.8135139655460779, - "grad_norm": 7.822975676442135, - "learning_rate": 2.7401847512974194e-06, - "logits/chosen": -0.10810144990682602, - "logits/rejected": -0.025854378938674927, - "logps/chosen": -1.3840287923812866, - "logps/rejected": -1.6693832874298096, - "loss": 1.0843, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3840287923812866, - "rewards/margins": 0.2853543758392334, - "rewards/rejected": -1.6693832874298096, - "sft_loss": 1.4771511554718018, + "grad_norm": 13.207225278484374, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": -0.026406964287161827, + "logits/rejected": 0.06156226992607117, + "logps/chosen": -1.371176838874817, + "logps/rejected": -1.6098140478134155, + "loss": 1.0931, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.371176838874817, + "rewards/margins": 0.23863713443279266, + "rewards/rejected": -1.6098140478134155, + "sft_loss": 1.4641458988189697, "step": 1520 }, { "epoch": 0.8161899983274795, - "grad_norm": 4.848775932175174, - "learning_rate": 2.7375505712472695e-06, - "logits/chosen": -0.10527946054935455, - "logits/rejected": 0.08367304503917694, - "logps/chosen": -1.3714563846588135, - "logps/rejected": -1.6404037475585938, - "loss": 1.0966, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3714563846588135, - "rewards/margins": 0.26894742250442505, - "rewards/rejected": -1.6404037475585938, - "sft_loss": 1.3577983379364014, + "grad_norm": 5.810313113640756, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": -0.058356620371341705, + "logits/rejected": 0.13299037516117096, + "logps/chosen": -1.3551055192947388, + "logps/rejected": -1.5649089813232422, + "loss": 1.0987, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3551055192947388, + "rewards/margins": 0.20980365574359894, + "rewards/rejected": -1.5649089813232422, + "sft_loss": 1.3459253311157227, "step": 1525 }, { "epoch": 0.8188660311088811, - "grad_norm": 8.808636218080713, - "learning_rate": 2.734904384476049e-06, - "logits/chosen": -0.09855546057224274, - "logits/rejected": 0.023541796952486038, - "logps/chosen": -1.404067039489746, - "logps/rejected": -1.6858993768692017, - "loss": 1.0677, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.404067039489746, - "rewards/margins": 0.28183233737945557, - "rewards/rejected": -1.6858993768692017, - "sft_loss": 1.3637378215789795, + "grad_norm": 8.495492846008881, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": -0.07601507008075714, + "logits/rejected": 0.03884817287325859, + "logps/chosen": -1.3821711540222168, + "logps/rejected": -1.6108427047729492, + "loss": 1.0717, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3821711540222168, + "rewards/margins": 0.22867155075073242, + "rewards/rejected": -1.6108427047729492, + "sft_loss": 1.3557847738265991, "step": 1530 }, { "epoch": 0.8215420638902826, - "grad_norm": 7.6000091929001465, - "learning_rate": 2.732246216657075e-06, - "logits/chosen": -0.0942855104804039, - "logits/rejected": 0.09655526280403137, - "logps/chosen": -1.3157068490982056, - "logps/rejected": -1.6163606643676758, - "loss": 1.0324, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.3157068490982056, - "rewards/margins": 0.300653874874115, - "rewards/rejected": -1.6163606643676758, - "sft_loss": 1.3629335165023804, + "grad_norm": 9.245630706009585, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": -0.09205770492553711, + "logits/rejected": 0.08740700036287308, + "logps/chosen": -1.3065885305404663, + "logps/rejected": -1.5755001306533813, + "loss": 1.037, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3065885305404663, + "rewards/margins": 0.2689115107059479, + "rewards/rejected": -1.5755001306533813, + "sft_loss": 1.3555333614349365, "step": 1535 }, { "epoch": 0.8242180966716842, - "grad_norm": 11.762348997233314, - "learning_rate": 2.729576093579902e-06, - "logits/chosen": -0.033582575619220734, - "logits/rejected": 0.1363482028245926, - "logps/chosen": -1.342462420463562, - "logps/rejected": -1.8016321659088135, - "loss": 1.0026, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.342462420463562, - "rewards/margins": 0.4591697156429291, - "rewards/rejected": -1.8016321659088135, - "sft_loss": 1.3745462894439697, + "grad_norm": 9.24164649587981, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": -0.037065595388412476, + "logits/rejected": 0.12826170027256012, + "logps/chosen": -1.3346545696258545, + "logps/rejected": -1.7283185720443726, + "loss": 1.0112, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3346545696258545, + "rewards/margins": 0.3936638832092285, + "rewards/rejected": -1.7283185720443726, + "sft_loss": 1.3487634658813477, "step": 1540 }, { "epoch": 0.8268941294530858, - "grad_norm": 4.7769659985032265, - "learning_rate": 2.726894041150077e-06, - "logits/chosen": -0.08443383872509003, - "logits/rejected": 0.08157948404550552, - "logps/chosen": -1.358519196510315, - "logps/rejected": -1.6386398077011108, - "loss": 1.077, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.358519196510315, - "rewards/margins": 0.2801207900047302, - "rewards/rejected": -1.6386398077011108, - "sft_loss": 1.3989120721817017, + "grad_norm": 5.61559058889292, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": 0.0015423030126839876, + "logits/rejected": 0.17759868502616882, + "logps/chosen": -1.3313993215560913, + "logps/rejected": -1.568703293800354, + "loss": 1.082, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3313993215560913, + "rewards/margins": 0.23730382323265076, + "rewards/rejected": -1.568703293800354, + "sft_loss": 1.3770296573638916, "step": 1545 }, { "epoch": 0.8295701622344873, - "grad_norm": 6.547673950226795, - "learning_rate": 2.7242000853888833e-06, - "logits/chosen": -0.2460707128047943, - "logits/rejected": 0.01976541243493557, - "logps/chosen": -1.4292163848876953, - "logps/rejected": -1.8006445169448853, - "loss": 1.0714, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.4292163848876953, - "rewards/margins": 0.37142807245254517, - "rewards/rejected": -1.8006445169448853, - "sft_loss": 1.4601786136627197, + "grad_norm": 7.34705254818011, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": -0.17329472303390503, + "logits/rejected": 0.10016246140003204, + "logps/chosen": -1.3927847146987915, + "logps/rejected": -1.686934232711792, + "loss": 1.0866, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3927847146987915, + "rewards/margins": 0.2941496670246124, + "rewards/rejected": -1.686934232711792, + "sft_loss": 1.4320820569992065, "step": 1550 }, { "epoch": 0.8322461950158889, - "grad_norm": 4.736053048394042, - "learning_rate": 2.7214942524330918e-06, - "logits/chosen": -0.2725904583930969, - "logits/rejected": -0.03147698566317558, - "logps/chosen": -1.4534674882888794, - "logps/rejected": -1.9418662786483765, - "loss": 1.0458, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.4534674882888794, - "rewards/margins": 0.488398939371109, - "rewards/rejected": -1.9418662786483765, - "sft_loss": 1.4068353176116943, + "grad_norm": 4.983173298606035, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": -0.1340692788362503, + "logits/rejected": 0.1313193291425705, + "logps/chosen": -1.4077918529510498, + "logps/rejected": -1.7832787036895752, + "loss": 1.0661, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4077918529510498, + "rewards/margins": 0.3754867911338806, + "rewards/rejected": -1.7832787036895752, + "sft_loss": 1.374353289604187, "step": 1555 }, { "epoch": 0.8349222277972905, - "grad_norm": 6.440375726068714, - "learning_rate": 2.7187765685347063e-06, - "logits/chosen": -0.2839708924293518, - "logits/rejected": -0.22860319912433624, - "logps/chosen": -1.52508544921875, - "logps/rejected": -1.84702467918396, - "loss": 1.1148, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.52508544921875, - "rewards/margins": 0.32193922996520996, - "rewards/rejected": -1.84702467918396, - "sft_loss": 1.5309407711029053, + "grad_norm": 7.815769239132045, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": -0.014563229866325855, + "logits/rejected": 0.053143925964832306, + "logps/chosen": -1.4143667221069336, + "logps/rejected": -1.6859022378921509, + "loss": 1.0941, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4143667221069336, + "rewards/margins": 0.27153539657592773, + "rewards/rejected": -1.6859022378921509, + "sft_loss": 1.4450477361679077, "step": 1560 }, { "epoch": 0.8375982605786921, - "grad_norm": 6.367797012460332, - "learning_rate": 2.7160470600607076e-06, - "logits/chosen": -0.20195667445659637, - "logits/rejected": -0.12136946618556976, - "logps/chosen": -1.4493902921676636, - "logps/rejected": -1.7250474691390991, - "loss": 1.1045, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.4493902921676636, - "rewards/margins": 0.27565696835517883, - "rewards/rejected": -1.7250474691390991, - "sft_loss": 1.472017526626587, + "grad_norm": 5.775345808606975, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": -0.015811914578080177, + "logits/rejected": 0.08232811838388443, + "logps/chosen": -1.4326014518737793, + "logps/rejected": -1.661134958267212, + "loss": 1.1093, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4326014518737793, + "rewards/margins": 0.2285335510969162, + "rewards/rejected": -1.661134958267212, + "sft_loss": 1.456074595451355, "step": 1565 }, { "epoch": 0.8402742933600936, - "grad_norm": 11.041022879549015, - "learning_rate": 2.7133057534927986e-06, - "logits/chosen": -0.034810569137334824, - "logits/rejected": -0.012720304541289806, - "logps/chosen": -1.2938734292984009, - "logps/rejected": -1.5503425598144531, - "loss": 1.0604, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.2938734292984009, - "rewards/margins": 0.256469190120697, - "rewards/rejected": -1.5503425598144531, - "sft_loss": 1.345172643661499, + "grad_norm": 10.144975569245078, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": 0.018102655187249184, + "logits/rejected": 0.03663431853055954, + "logps/chosen": -1.3111892938613892, + "logps/rejected": -1.5250718593597412, + "loss": 1.0807, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3111892938613892, + "rewards/margins": 0.21388253569602966, + "rewards/rejected": -1.5250718593597412, + "sft_loss": 1.3417662382125854, "step": 1570 }, { "epoch": 0.8429503261414952, - "grad_norm": 5.873930927000431, - "learning_rate": 2.710552675427148e-06, - "logits/chosen": 0.010924572125077248, - "logits/rejected": 0.12592843174934387, - "logps/chosen": -1.3183715343475342, - "logps/rejected": -1.4716814756393433, - "loss": 1.0897, + "grad_norm": 5.937690748680565, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": -0.030872244387865067, + "logits/rejected": 0.08035401999950409, + "logps/chosen": -1.3295609951019287, + "logps/rejected": -1.500370740890503, + "loss": 1.0938, "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3183715343475342, - "rewards/margins": 0.15330982208251953, - "rewards/rejected": -1.4716814756393433, - "sft_loss": 1.35067617893219, + "rewards/chosen": -1.3295609951019287, + "rewards/margins": 0.1708097904920578, + "rewards/rejected": -1.500370740890503, + "sft_loss": 1.3526275157928467, "step": 1575 }, { "epoch": 0.8456263589228968, - "grad_norm": 5.425517400664494, - "learning_rate": 2.707787852574131e-06, - "logits/chosen": 0.09026463329792023, - "logits/rejected": 0.3710160255432129, - "logps/chosen": -1.3301565647125244, - "logps/rejected": -1.5640554428100586, - "loss": 1.0565, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3301565647125244, - "rewards/margins": 0.23389868438243866, - "rewards/rejected": -1.5640554428100586, - "sft_loss": 1.3714720010757446, + "grad_norm": 7.6310828764205985, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": 0.02246050536632538, + "logits/rejected": 0.27992087602615356, + "logps/chosen": -1.341638207435608, + "logps/rejected": -1.6132581233978271, + "loss": 1.0564, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.341638207435608, + "rewards/margins": 0.2716197371482849, + "rewards/rejected": -1.6132581233978271, + "sft_loss": 1.3668334484100342, "step": 1580 }, { "epoch": 0.8483023917042983, - "grad_norm": 4.013920767960187, - "learning_rate": 2.7050113117580716e-06, - "logits/chosen": 0.05647750943899155, - "logits/rejected": 0.2564930021762848, - "logps/chosen": -1.267033576965332, - "logps/rejected": -1.6264817714691162, - "loss": 0.9891, + "grad_norm": 4.038509966910056, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": -0.022608067840337753, + "logits/rejected": 0.14372625946998596, + "logps/chosen": -1.2611603736877441, + "logps/rejected": -1.6419155597686768, + "loss": 0.99, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.267033576965332, - "rewards/margins": 0.3594485819339752, - "rewards/rejected": -1.6264817714691162, - "sft_loss": 1.2766499519348145, + "rewards/chosen": -1.2611603736877441, + "rewards/margins": 0.38075512647628784, + "rewards/rejected": -1.6419155597686768, + "sft_loss": 1.2734200954437256, "step": 1585 }, { "epoch": 0.8509784244856999, - "grad_norm": 5.962891588311921, - "learning_rate": 2.70222307991698e-06, - "logits/chosen": -0.01841115951538086, - "logits/rejected": 0.08693607151508331, - "logps/chosen": -1.290045976638794, - "logps/rejected": -1.4861773252487183, - "loss": 1.0702, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.290045976638794, - "rewards/margins": 0.19613142311573029, - "rewards/rejected": -1.4861773252487183, - "sft_loss": 1.3594423532485962, + "grad_norm": 6.569454422734971, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": -0.08741948008537292, + "logits/rejected": 0.009515544399619102, + "logps/chosen": -1.2867238521575928, + "logps/rejected": -1.4618428945541382, + "loss": 1.0808, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2867238521575928, + "rewards/margins": 0.1751190721988678, + "rewards/rejected": -1.4618428945541382, + "sft_loss": 1.3573739528656006, "step": 1590 }, { "epoch": 0.8536544572671015, - "grad_norm": 7.335803525766391, - "learning_rate": 2.6994231841022947e-06, - "logits/chosen": 0.0794888362288475, - "logits/rejected": 0.16885769367218018, - "logps/chosen": -1.4231336116790771, - "logps/rejected": -1.5418658256530762, - "loss": 1.1379, - "rewards/accuracies": 0.48124998807907104, - "rewards/chosen": -1.4231336116790771, - "rewards/margins": 0.11873219162225723, - "rewards/rejected": -1.5418658256530762, - "sft_loss": 1.4285242557525635, + "grad_norm": 9.212700327011175, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": -0.0024367093574255705, + "logits/rejected": 0.08118894696235657, + "logps/chosen": -1.4251039028167725, + "logps/rejected": -1.5358526706695557, + "loss": 1.1365, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4251039028167725, + "rewards/margins": 0.11074896156787872, + "rewards/rejected": -1.5358526706695557, + "sft_loss": 1.4242948293685913, "step": 1595 }, { "epoch": 0.8563304900485031, - "grad_norm": 4.23742331064105, - "learning_rate": 2.6966116514786166e-06, - "logits/chosen": -0.0561373308300972, - "logits/rejected": 0.19555945694446564, - "logps/chosen": -1.3804311752319336, - "logps/rejected": -1.74152410030365, - "loss": 1.0436, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3804311752319336, - "rewards/margins": 0.36109280586242676, - "rewards/rejected": -1.74152410030365, - "sft_loss": 1.3977186679840088, + "grad_norm": 4.203891282882451, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": -0.0969192385673523, + "logits/rejected": 0.11570189148187637, + "logps/chosen": -1.3715673685073853, + "logps/rejected": -1.714408278465271, + "loss": 1.0489, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3715673685073853, + "rewards/margins": 0.3428409993648529, + "rewards/rejected": -1.714408278465271, + "sft_loss": 1.3946869373321533, "step": 1600 }, { "epoch": 0.8563304900485031, - "eval_logits/chosen": 0.44763803482055664, - "eval_logits/rejected": 0.5672866702079773, - "eval_logps/chosen": -1.3712090253829956, - "eval_logps/rejected": -1.6780245304107666, - "eval_loss": 1.0524135828018188, - "eval_rewards/accuracies": 0.609050452709198, - "eval_rewards/chosen": -1.3712090253829956, - "eval_rewards/margins": 0.3068154454231262, - "eval_rewards/rejected": -1.6780245304107666, - "eval_runtime": 43.1222, - "eval_samples_per_second": 31.19, - "eval_sft_loss": 1.3880013227462769, - "eval_steps_per_second": 7.815, + "eval_logits/chosen": 0.329307496547699, + "eval_logits/rejected": 0.429740846157074, + "eval_logps/chosen": -1.351759910583496, + "eval_logps/rejected": -1.6476715803146362, + "eval_loss": 1.0554943084716797, + "eval_rewards/accuracies": 0.5905044674873352, + "eval_rewards/chosen": -1.351759910583496, + "eval_rewards/margins": 0.295911580324173, + "eval_rewards/rejected": -1.6476715803146362, + "eval_runtime": 43.4771, + "eval_samples_per_second": 30.936, + "eval_sft_loss": 1.3766752481460571, + "eval_steps_per_second": 7.751, "step": 1600 }, { "epoch": 0.8590065228299046, - "grad_norm": 5.002638375551188, - "learning_rate": 2.6937885093234477e-06, - "logits/chosen": -0.06392344832420349, - "logits/rejected": 0.22160764038562775, - "logps/chosen": -1.3333699703216553, - "logps/rejected": -1.6846908330917358, - "loss": 1.0288, + "grad_norm": 5.530882088518943, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": -0.12066706269979477, + "logits/rejected": 0.10889676958322525, + "logps/chosen": -1.3122395277023315, + "logps/rejected": -1.6446495056152344, + "loss": 1.0359, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3333699703216553, - "rewards/margins": 0.35132086277008057, - "rewards/rejected": -1.6846908330917358, - "sft_loss": 1.3701868057250977, + "rewards/chosen": -1.3122395277023315, + "rewards/margins": 0.33240991830825806, + "rewards/rejected": -1.6446495056152344, + "sft_loss": 1.3601765632629395, "step": 1605 }, { "epoch": 0.8616825556113062, - "grad_norm": 4.64471612658279, - "learning_rate": 2.6909537850269256e-06, - "logits/chosen": -0.09533826261758804, - "logits/rejected": 0.13083715736865997, - "logps/chosen": -1.3075568675994873, - "logps/rejected": -1.6741119623184204, - "loss": 1.0239, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3075568675994873, - "rewards/margins": 0.36655518412590027, - "rewards/rejected": -1.6741119623184204, - "sft_loss": 1.33218514919281, + "grad_norm": 5.129344765895249, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": -0.15109679102897644, + "logits/rejected": 0.033114198595285416, + "logps/chosen": -1.2982641458511353, + "logps/rejected": -1.697427749633789, + "loss": 1.0177, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2982641458511353, + "rewards/margins": 0.3991636335849762, + "rewards/rejected": -1.697427749633789, + "sft_loss": 1.3306095600128174, "step": 1610 }, { "epoch": 0.8643585883927078, - "grad_norm": 6.172356811989879, - "learning_rate": 2.688107506091558e-06, - "logits/chosen": -0.015719827264547348, - "logits/rejected": 0.12321136146783829, - "logps/chosen": -1.4253634214401245, - "logps/rejected": -1.7837772369384766, - "loss": 1.0835, + "grad_norm": 6.609932526409894, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": -0.0883597657084465, + "logits/rejected": 0.02439655363559723, + "logps/chosen": -1.4022852182388306, + "logps/rejected": -1.7770626544952393, + "loss": 1.0731, "rewards/accuracies": 0.59375, - "rewards/chosen": -1.4253634214401245, - "rewards/margins": 0.3584136366844177, - "rewards/rejected": -1.7837772369384766, - "sft_loss": 1.4478522539138794, + "rewards/chosen": -1.4022852182388306, + "rewards/margins": 0.3747774064540863, + "rewards/rejected": -1.7770626544952393, + "sft_loss": 1.4265952110290527, "step": 1615 }, { "epoch": 0.8670346211741093, - "grad_norm": 5.227709282984366, - "learning_rate": 2.6852497001319555e-06, - "logits/chosen": 0.04353545233607292, - "logits/rejected": 0.23053112626075745, - "logps/chosen": -1.2532846927642822, - "logps/rejected": -1.6680285930633545, - "loss": 0.9879, + "grad_norm": 6.147812997346706, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": -0.04175793007016182, + "logits/rejected": 0.10314915329217911, + "logps/chosen": -1.27086341381073, + "logps/rejected": -1.6761993169784546, + "loss": 1.0046, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2532846927642822, - "rewards/margins": 0.41474419832229614, - "rewards/rejected": -1.6680285930633545, - "sft_loss": 1.2794431447982788, + "rewards/chosen": -1.27086341381073, + "rewards/margins": 0.4053359627723694, + "rewards/rejected": -1.6761993169784546, + "sft_loss": 1.283146619796753, "step": 1620 }, { "epoch": 0.869710653955511, - "grad_norm": 5.939591442905057, - "learning_rate": 2.682380394874564e-06, - "logits/chosen": 0.11653991788625717, - "logits/rejected": 0.17456406354904175, - "logps/chosen": -1.4384989738464355, - "logps/rejected": -1.6639492511749268, - "loss": 1.1006, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4384989738464355, - "rewards/margins": 0.22545020282268524, - "rewards/rejected": -1.6639492511749268, - "sft_loss": 1.3895422220230103, + "grad_norm": 7.6358092026569375, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": 0.01520055253058672, + "logits/rejected": 0.06274382770061493, + "logps/chosen": -1.4354435205459595, + "logps/rejected": -1.6466639041900635, + "loss": 1.099, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4354435205459595, + "rewards/margins": 0.21122050285339355, + "rewards/rejected": -1.6466639041900635, + "sft_loss": 1.3758140802383423, "step": 1625 }, { "epoch": 0.8723866867369126, - "grad_norm": 5.613994005388873, - "learning_rate": 2.6794996181573953e-06, - "logits/chosen": 0.019490620121359825, - "logits/rejected": 0.20958340167999268, - "logps/chosen": -1.3698194026947021, - "logps/rejected": -1.6351855993270874, - "loss": 1.0802, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3698194026947021, - "rewards/margins": 0.26536640524864197, - "rewards/rejected": -1.6351855993270874, - "sft_loss": 1.382866621017456, + "grad_norm": 7.221964111745881, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": -0.07788058370351791, + "logits/rejected": 0.07453887164592743, + "logps/chosen": -1.3857173919677734, + "logps/rejected": -1.6590982675552368, + "loss": 1.0866, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3857173919677734, + "rewards/margins": 0.27338093519210815, + "rewards/rejected": -1.6590982675552368, + "sft_loss": 1.3786849975585938, "step": 1630 }, { "epoch": 0.875062719518314, - "grad_norm": 5.922049659999224, - "learning_rate": 2.6766073979297584e-06, - "logits/chosen": -0.06902351975440979, - "logits/rejected": 0.08735234290361404, - "logps/chosen": -1.2911508083343506, - "logps/rejected": -1.6872625350952148, - "loss": 1.0277, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.2911508083343506, - "rewards/margins": 0.3961116671562195, - "rewards/rejected": -1.6872625350952148, - "sft_loss": 1.337499737739563, + "grad_norm": 6.640721192897655, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": -0.14042620360851288, + "logits/rejected": -0.017853520810604095, + "logps/chosen": -1.2752376794815063, + "logps/rejected": -1.6433387994766235, + "loss": 1.0192, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2752376794815063, + "rewards/margins": 0.3681010603904724, + "rewards/rejected": -1.6433387994766235, + "sft_loss": 1.320291519165039, "step": 1635 }, { "epoch": 0.8777387522997157, - "grad_norm": 6.2625347475235635, - "learning_rate": 2.6737037622519866e-06, - "logits/chosen": -0.06433682888746262, - "logits/rejected": 0.0965544730424881, - "logps/chosen": -1.304503321647644, - "logps/rejected": -1.7035449743270874, - "loss": 1.0314, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.304503321647644, - "rewards/margins": 0.39904141426086426, - "rewards/rejected": -1.7035449743270874, - "sft_loss": 1.3322417736053467, + "grad_norm": 9.724216699659413, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": -0.1258830577135086, + "logits/rejected": -0.0012362360721454024, + "logps/chosen": -1.2984638214111328, + "logps/rejected": -1.6937593221664429, + "loss": 1.029, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2984638214111328, + "rewards/margins": 0.39529532194137573, + "rewards/rejected": -1.6937593221664429, + "sft_loss": 1.322912335395813, "step": 1640 }, { "epoch": 0.8804147850811173, - "grad_norm": 6.559343546184692, - "learning_rate": 2.670788739295166e-06, - "logits/chosen": -0.011899260804057121, - "logits/rejected": 0.06090838462114334, - "logps/chosen": -1.3005620241165161, - "logps/rejected": -1.567338466644287, - "loss": 1.032, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3005620241165161, - "rewards/margins": 0.2667763829231262, - "rewards/rejected": -1.567338466644287, - "sft_loss": 1.3188759088516235, + "grad_norm": 7.377517402484911, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": -0.07669184356927872, + "logits/rejected": -0.016088414937257767, + "logps/chosen": -1.2998298406600952, + "logps/rejected": -1.537240982055664, + "loss": 1.0434, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2998298406600952, + "rewards/margins": 0.2374109923839569, + "rewards/rejected": -1.537240982055664, + "sft_loss": 1.3199104070663452, "step": 1645 }, { "epoch": 0.8830908178625189, - "grad_norm": 7.289341250127611, - "learning_rate": 2.6678623573408613e-06, - "logits/chosen": 0.03133546561002731, - "logits/rejected": 0.10692791640758514, - "logps/chosen": -1.3453984260559082, - "logps/rejected": -1.6661173105239868, - "loss": 1.0295, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3453984260559082, - "rewards/margins": 0.32071900367736816, - "rewards/rejected": -1.6661173105239868, - "sft_loss": 1.3463585376739502, + "grad_norm": 8.256103406403513, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": 0.020514000207185745, + "logits/rejected": 0.09050575643777847, + "logps/chosen": -1.3168718814849854, + "logps/rejected": -1.610409140586853, + "loss": 1.0248, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3168718814849854, + "rewards/margins": 0.2935372591018677, + "rewards/rejected": -1.610409140586853, + "sft_loss": 1.3246101140975952, "step": 1650 }, { "epoch": 0.8857668506439204, - "grad_norm": 6.203168074083552, - "learning_rate": 2.664924644780844e-06, - "logits/chosen": -0.1072305217385292, - "logits/rejected": 0.02232867106795311, - "logps/chosen": -1.4158356189727783, - "logps/rejected": -1.749995470046997, - "loss": 1.064, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4158356189727783, - "rewards/margins": 0.33415982127189636, - "rewards/rejected": -1.749995470046997, - "sft_loss": 1.4020464420318604, + "grad_norm": 6.034861988633968, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": -0.08695786446332932, + "logits/rejected": 0.02888108417391777, + "logps/chosen": -1.3646390438079834, + "logps/rejected": -1.6218284368515015, + "loss": 1.0699, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3646390438079834, + "rewards/margins": 0.2571892738342285, + "rewards/rejected": -1.6218284368515015, + "sft_loss": 1.3555657863616943, "step": 1655 }, { "epoch": 0.888442883425322, - "grad_norm": 7.027198385868576, - "learning_rate": 2.661975630116813e-06, - "logits/chosen": -0.030960649251937866, - "logits/rejected": -0.007368740625679493, - "logps/chosen": -1.3272325992584229, - "logps/rejected": -1.690601110458374, - "loss": 1.0076, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3272325992584229, - "rewards/margins": 0.3633684515953064, - "rewards/rejected": -1.690601110458374, - "sft_loss": 1.2698527574539185, + "grad_norm": 5.876243524685892, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": -0.023615438491106033, + "logits/rejected": -0.002601534128189087, + "logps/chosen": -1.2742602825164795, + "logps/rejected": -1.5528959035873413, + "loss": 1.0223, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2742602825164795, + "rewards/margins": 0.2786356806755066, + "rewards/rejected": -1.5528959035873413, + "sft_loss": 1.2252178192138672, "step": 1660 }, { "epoch": 0.8911189162067236, - "grad_norm": 5.9083328170756655, - "learning_rate": 2.6590153419601236e-06, - "logits/chosen": -0.06882845610380173, - "logits/rejected": 0.008100676350295544, - "logps/chosen": -1.4700895547866821, - "logps/rejected": -1.7158946990966797, - "loss": 1.1199, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4700895547866821, - "rewards/margins": 0.2458050698041916, - "rewards/rejected": -1.7158946990966797, - "sft_loss": 1.4389417171478271, + "grad_norm": 4.336071573444084, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": -0.02723364531993866, + "logits/rejected": 0.04549012333154678, + "logps/chosen": -1.3814964294433594, + "logps/rejected": -1.5734784603118896, + "loss": 1.1007, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3814964294433594, + "rewards/margins": 0.19198183715343475, + "rewards/rejected": -1.5734784603118896, + "sft_loss": 1.3701436519622803, "step": 1665 }, { "epoch": 0.8937949489881251, - "grad_norm": 5.138637229143034, - "learning_rate": 2.656043809031503e-06, - "logits/chosen": -0.02306452952325344, - "logits/rejected": 0.1670946180820465, - "logps/chosen": -1.4814660549163818, - "logps/rejected": -1.7213853597640991, - "loss": 1.1272, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.4814660549163818, - "rewards/margins": 0.23991911113262177, - "rewards/rejected": -1.7213853597640991, - "sft_loss": 1.4016886949539185, + "grad_norm": 5.367175487319009, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": 0.01956319436430931, + "logits/rejected": 0.20048554241657257, + "logps/chosen": -1.4105165004730225, + "logps/rejected": -1.5900005102157593, + "loss": 1.1235, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4105165004730225, + "rewards/margins": 0.17948417365550995, + "rewards/rejected": -1.5900005102157593, + "sft_loss": 1.3565194606781006, "step": 1670 }, { "epoch": 0.8964709817695267, - "grad_norm": 4.711908559504893, - "learning_rate": 2.6530610601607764e-06, - "logits/chosen": -0.025666356086730957, - "logits/rejected": 0.20217475295066833, - "logps/chosen": -1.398291826248169, - "logps/rejected": -1.856555700302124, - "loss": 1.0498, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.398291826248169, - "rewards/margins": 0.45826393365859985, - "rewards/rejected": -1.856555700302124, - "sft_loss": 1.4265127182006836, + "grad_norm": 5.722057114528683, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": 0.008368739858269691, + "logits/rejected": 0.22673054039478302, + "logps/chosen": -1.3704248666763306, + "logps/rejected": -1.751349687576294, + "loss": 1.0563, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3704248666763306, + "rewards/margins": 0.38092485070228577, + "rewards/rejected": -1.751349687576294, + "sft_loss": 1.4024537801742554, "step": 1675 }, { "epoch": 0.8991470145509283, - "grad_norm": 5.912792494626306, - "learning_rate": 2.6500671242865877e-06, - "logits/chosen": -0.11349250376224518, - "logits/rejected": 0.018579021096229553, - "logps/chosen": -1.4116361141204834, - "logps/rejected": -1.6884177923202515, - "loss": 1.0678, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.4116361141204834, - "rewards/margins": 0.27678191661834717, - "rewards/rejected": -1.6884177923202515, - "sft_loss": 1.4139634370803833, + "grad_norm": 7.455299392555562, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": -0.07408158481121063, + "logits/rejected": 0.05479319021105766, + "logps/chosen": -1.388621211051941, + "logps/rejected": -1.6247622966766357, + "loss": 1.0734, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.388621211051941, + "rewards/margins": 0.23614096641540527, + "rewards/rejected": -1.6247622966766357, + "sft_loss": 1.3891102075576782, "step": 1680 }, { "epoch": 0.9018230473323299, - "grad_norm": 9.673599748893212, - "learning_rate": 2.6470620304561147e-06, - "logits/chosen": -0.09010537713766098, - "logits/rejected": 0.16394095122814178, - "logps/chosen": -1.3627606630325317, - "logps/rejected": -1.6861966848373413, - "loss": 1.0783, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3627606630325317, - "rewards/margins": 0.32343605160713196, - "rewards/rejected": -1.6861966848373413, - "sft_loss": 1.3936148881912231, + "grad_norm": 8.678480463656214, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": -0.06411705911159515, + "logits/rejected": 0.18151408433914185, + "logps/chosen": -1.3492683172225952, + "logps/rejected": -1.6280282735824585, + "loss": 1.0863, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3492683172225952, + "rewards/margins": 0.27876001596450806, + "rewards/rejected": -1.6280282735824585, + "sft_loss": 1.3718981742858887, "step": 1685 }, { "epoch": 0.9044990801137314, - "grad_norm": 5.433251100646687, - "learning_rate": 2.6440458078247914e-06, - "logits/chosen": -0.08464756608009338, - "logits/rejected": 0.13104847073554993, - "logps/chosen": -1.3060964345932007, - "logps/rejected": -1.687644362449646, - "loss": 1.0294, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3060964345932007, - "rewards/margins": 0.38154786825180054, - "rewards/rejected": -1.687644362449646, - "sft_loss": 1.3979648351669312, + "grad_norm": 5.53415486038033, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": -0.0433022603392601, + "logits/rejected": 0.164881631731987, + "logps/chosen": -1.2780497074127197, + "logps/rejected": -1.584289789199829, + "loss": 1.0377, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2780497074127197, + "rewards/margins": 0.30624014139175415, + "rewards/rejected": -1.584289789199829, + "sft_loss": 1.3719570636749268, "step": 1690 }, { "epoch": 0.907175112895133, - "grad_norm": 10.324471284438198, - "learning_rate": 2.641018485656023e-06, - "logits/chosen": -0.260097473859787, - "logits/rejected": -0.1005074754357338, - "logps/chosen": -1.4122055768966675, - "logps/rejected": -1.6907997131347656, - "loss": 1.1087, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.4122055768966675, - "rewards/margins": 0.27859410643577576, - "rewards/rejected": -1.6907997131347656, - "sft_loss": 1.4807606935501099, + "grad_norm": 11.890457950281, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": -0.1934044361114502, + "logits/rejected": -0.035475775599479675, + "logps/chosen": -1.385246992111206, + "logps/rejected": -1.6005245447158813, + "loss": 1.1158, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.385246992111206, + "rewards/margins": 0.21527759730815887, + "rewards/rejected": -1.6005245447158813, + "sft_loss": 1.4564180374145508, "step": 1695 }, { "epoch": 0.9098511456765346, - "grad_norm": 6.001739367878445, - "learning_rate": 2.6379800933209028e-06, - "logits/chosen": -0.06367066502571106, - "logits/rejected": -0.12096105515956879, - "logps/chosen": -1.4194618463516235, - "logps/rejected": -1.5650646686553955, - "loss": 1.125, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.4194618463516235, - "rewards/margins": 0.14560283720493317, - "rewards/rejected": -1.5650646686553955, - "sft_loss": 1.4329822063446045, + "grad_norm": 7.191009316057824, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": 0.0019111812580376863, + "logits/rejected": -0.04607079178094864, + "logps/chosen": -1.3749603033065796, + "logps/rejected": -1.485594630241394, + "loss": 1.1236, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.3749603033065796, + "rewards/margins": 0.11063437163829803, + "rewards/rejected": -1.485594630241394, + "sft_loss": 1.397199273109436, "step": 1700 }, { "epoch": 0.9125271784579361, - "grad_norm": 7.867099607944738, - "learning_rate": 2.634930660297926e-06, - "logits/chosen": -0.06066015362739563, - "logits/rejected": 0.0960855782032013, - "logps/chosen": -1.3637042045593262, - "logps/rejected": -1.5945680141448975, - "loss": 1.0748, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3637042045593262, - "rewards/margins": 0.23086369037628174, - "rewards/rejected": -1.5945680141448975, - "sft_loss": 1.394171953201294, + "grad_norm": 8.97809173868377, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": -0.006315511651337147, + "logits/rejected": 0.14795732498168945, + "logps/chosen": -1.3420408964157104, + "logps/rejected": -1.511926293373108, + "loss": 1.0932, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3420408964157104, + "rewards/margins": 0.16988542675971985, + "rewards/rejected": -1.511926293373108, + "sft_loss": 1.3750202655792236, "step": 1705 }, { "epoch": 0.9152032112393377, - "grad_norm": 5.64421578862294, - "learning_rate": 2.631870216172705e-06, - "logits/chosen": -0.11932742595672607, - "logits/rejected": -0.026896988973021507, - "logps/chosen": -1.3475847244262695, - "logps/rejected": -1.6134121417999268, - "loss": 1.0596, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -1.3475847244262695, - "rewards/margins": 0.2658274471759796, - "rewards/rejected": -1.6134121417999268, - "sft_loss": 1.3892669677734375, + "grad_norm": 5.8200911103522115, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": -0.030334660783410072, + "logits/rejected": 0.06693456321954727, + "logps/chosen": -1.3011609315872192, + "logps/rejected": -1.5238155126571655, + "loss": 1.0611, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3011609315872192, + "rewards/margins": 0.22265465557575226, + "rewards/rejected": -1.5238155126571655, + "sft_loss": 1.3497133255004883, "step": 1710 }, { "epoch": 0.9178792440207393, - "grad_norm": 7.7643387899472565, - "learning_rate": 2.6287987906376834e-06, - "logits/chosen": -0.11442971229553223, - "logits/rejected": 0.10210961103439331, - "logps/chosen": -1.4364932775497437, - "logps/rejected": -1.6449638605117798, - "loss": 1.1332, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -1.4364932775497437, - "rewards/margins": 0.20847049355506897, - "rewards/rejected": -1.6449638605117798, - "sft_loss": 1.4206666946411133, + "grad_norm": 9.054622362148715, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": -0.02639349177479744, + "logits/rejected": 0.19740860164165497, + "logps/chosen": -1.40165114402771, + "logps/rejected": -1.5750657320022583, + "loss": 1.1327, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.40165114402771, + "rewards/margins": 0.17341431975364685, + "rewards/rejected": -1.5750657320022583, + "sft_loss": 1.38827383518219, "step": 1715 }, { "epoch": 0.9205552768021408, - "grad_norm": 10.238029687623929, - "learning_rate": 2.6257164134918435e-06, - "logits/chosen": -0.11684347689151764, - "logits/rejected": -0.044993676245212555, - "logps/chosen": -1.2513480186462402, - "logps/rejected": -1.7037394046783447, - "loss": 0.9972, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2513480186462402, - "rewards/margins": 0.45239129662513733, - "rewards/rejected": -1.7037394046783447, - "sft_loss": 1.2875103950500488, + "grad_norm": 12.320116491659183, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": -0.03263465687632561, + "logits/rejected": 0.03770837560296059, + "logps/chosen": -1.2219219207763672, + "logps/rejected": -1.6054325103759766, + "loss": 1.0056, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2219219207763672, + "rewards/margins": 0.3835105299949646, + "rewards/rejected": -1.6054325103759766, + "sft_loss": 1.2506580352783203, "step": 1720 }, { "epoch": 0.9232313095835424, - "grad_norm": 4.716017354618029, - "learning_rate": 2.622623114640423e-06, - "logits/chosen": -0.10425164550542831, - "logits/rejected": 0.011850642040371895, - "logps/chosen": -1.3868951797485352, - "logps/rejected": -1.8481849431991577, - "loss": 1.0351, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.3868951797485352, - "rewards/margins": 0.4612897038459778, - "rewards/rejected": -1.8481849431991577, - "sft_loss": 1.4478013515472412, + "grad_norm": 5.4753550125906445, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": -0.03400716930627823, + "logits/rejected": 0.08444056659936905, + "logps/chosen": -1.3580182790756226, + "logps/rejected": -1.7344977855682373, + "loss": 1.0558, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3580182790756226, + "rewards/margins": 0.3764795660972595, + "rewards/rejected": -1.7344977855682373, + "sft_loss": 1.4085122346878052, "step": 1725 }, { "epoch": 0.925907342364944, - "grad_norm": 5.886652291268732, - "learning_rate": 2.6195189240946205e-06, - "logits/chosen": -0.04155025631189346, - "logits/rejected": 0.011087211780250072, - "logps/chosen": -1.3720247745513916, - "logps/rejected": -1.573352575302124, - "loss": 1.0922, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3720247745513916, - "rewards/margins": 0.20132780075073242, - "rewards/rejected": -1.573352575302124, - "sft_loss": 1.3772026300430298, + "grad_norm": 6.076706383744978, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": 0.06157577782869339, + "logits/rejected": 0.11648639291524887, + "logps/chosen": -1.3367359638214111, + "logps/rejected": -1.5042455196380615, + "loss": 1.0917, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3367359638214111, + "rewards/margins": 0.1675097644329071, + "rewards/rejected": -1.5042455196380615, + "sft_loss": 1.34554922580719, "step": 1730 }, { "epoch": 0.9285833751463456, - "grad_norm": 5.855119768331059, - "learning_rate": 2.6164038719713065e-06, - "logits/chosen": -0.21831436455249786, - "logits/rejected": -0.02849755249917507, - "logps/chosen": -1.2959465980529785, - "logps/rejected": -1.9212411642074585, - "loss": 0.9482, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2959465980529785, - "rewards/margins": 0.6252948641777039, - "rewards/rejected": -1.9212411642074585, - "sft_loss": 1.2650933265686035, + "grad_norm": 6.843318001545579, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": -0.1242741122841835, + "logits/rejected": 0.05887087434530258, + "logps/chosen": -1.2805627584457397, + "logps/rejected": -1.8210529088974, + "loss": 0.9578, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2805627584457397, + "rewards/margins": 0.5404902696609497, + "rewards/rejected": -1.8210529088974, + "sft_loss": 1.2435736656188965, "step": 1735 }, { "epoch": 0.9312594079277471, - "grad_norm": 8.313999999494468, - "learning_rate": 2.6132779884927303e-06, - "logits/chosen": -0.1863516867160797, - "logits/rejected": -0.03137914463877678, - "logps/chosen": -1.3538908958435059, - "logps/rejected": -1.6903021335601807, - "loss": 1.0279, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3538908958435059, - "rewards/margins": 0.3364112973213196, - "rewards/rejected": -1.6903021335601807, - "sft_loss": 1.2951140403747559, + "grad_norm": 7.6052098654695035, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": -0.05907027795910835, + "logits/rejected": 0.09855295717716217, + "logps/chosen": -1.3115413188934326, + "logps/rejected": -1.587019920349121, + "loss": 1.0278, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3115413188934326, + "rewards/margins": 0.2754787802696228, + "rewards/rejected": -1.587019920349121, + "sft_loss": 1.2654893398284912, "step": 1740 }, { "epoch": 0.9339354407091487, - "grad_norm": 4.376347329320102, - "learning_rate": 2.6101413039862274e-06, - "logits/chosen": -0.09724593162536621, - "logits/rejected": -0.07287286221981049, - "logps/chosen": -1.3620411157608032, - "logps/rejected": -1.6702470779418945, - "loss": 1.0637, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.3620411157608032, - "rewards/margins": 0.30820587277412415, - "rewards/rejected": -1.6702470779418945, - "sft_loss": 1.403433084487915, + "grad_norm": 4.790584119290133, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": 0.022471506148576736, + "logits/rejected": 0.05765972658991814, + "logps/chosen": -1.3353490829467773, + "logps/rejected": -1.5706102848052979, + "loss": 1.0756, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3353490829467773, + "rewards/margins": 0.23526112735271454, + "rewards/rejected": -1.5706102848052979, + "sft_loss": 1.3784520626068115, "step": 1745 }, { "epoch": 0.9366114734905503, - "grad_norm": 10.776839930667265, - "learning_rate": 2.606993848883924e-06, - "logits/chosen": -0.14971527457237244, - "logits/rejected": -0.08246836811304092, - "logps/chosen": -1.3855582475662231, - "logps/rejected": -1.7720504999160767, - "loss": 1.0406, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3855582475662231, - "rewards/margins": 0.38649213314056396, - "rewards/rejected": -1.7720504999160767, - "sft_loss": 1.3715816736221313, + "grad_norm": 13.045781580257584, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": -0.02450602874159813, + "logits/rejected": 0.044948406517505646, + "logps/chosen": -1.360719084739685, + "logps/rejected": -1.6785328388214111, + "loss": 1.0467, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.360719084739685, + "rewards/margins": 0.31781378388404846, + "rewards/rejected": -1.6785328388214111, + "sft_loss": 1.3535664081573486, "step": 1750 }, { "epoch": 0.9392875062719518, - "grad_norm": 6.5716180189865785, - "learning_rate": 2.6038356537224433e-06, - "logits/chosen": -0.1572066694498062, - "logits/rejected": -0.04727526754140854, - "logps/chosen": -1.3189653158187866, - "logps/rejected": -1.6002721786499023, - "loss": 1.0475, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3189653158187866, - "rewards/margins": 0.2813068926334381, - "rewards/rejected": -1.6002721786499023, - "sft_loss": 1.330338478088379, + "grad_norm": 5.408447209248505, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": -0.031846821308135986, + "logits/rejected": 0.07990224659442902, + "logps/chosen": -1.2945154905319214, + "logps/rejected": -1.5294058322906494, + "loss": 1.0546, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2945154905319214, + "rewards/margins": 0.23489025235176086, + "rewards/rejected": -1.5294058322906494, + "sft_loss": 1.3085170984268188, "step": 1755 }, { "epoch": 0.9419635390533534, - "grad_norm": 5.7150594115528435, - "learning_rate": 2.6006667491426098e-06, - "logits/chosen": -0.10597167909145355, - "logits/rejected": 0.03009122982621193, - "logps/chosen": -1.2991611957550049, - "logps/rejected": -1.6237561702728271, - "loss": 1.0552, + "grad_norm": 6.636071157387697, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": -0.030917812138795853, + "logits/rejected": 0.10335756838321686, + "logps/chosen": -1.2730026245117188, + "logps/rejected": -1.5577722787857056, + "loss": 1.0495, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.2991611957550049, - "rewards/margins": 0.32459500432014465, - "rewards/rejected": -1.6237561702728271, - "sft_loss": 1.388869285583496, + "rewards/chosen": -1.2730026245117188, + "rewards/margins": 0.2847694754600525, + "rewards/rejected": -1.5577722787857056, + "sft_loss": 1.360364556312561, "step": 1760 }, { "epoch": 0.944639571834755, - "grad_norm": 6.6110723627273025, - "learning_rate": 2.5974871658891483e-06, - "logits/chosen": -0.04896597936749458, - "logits/rejected": -0.018855730071663857, - "logps/chosen": -1.3407930135726929, - "logps/rejected": -1.709975004196167, - "loss": 1.0233, + "grad_norm": 7.336694066056145, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": 0.005440413951873779, + "logits/rejected": 0.030295390635728836, + "logps/chosen": -1.3073769807815552, + "logps/rejected": -1.6339219808578491, + "loss": 1.0344, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3407930135726929, - "rewards/margins": 0.369181752204895, - "rewards/rejected": -1.709975004196167, - "sft_loss": 1.3349924087524414, + "rewards/chosen": -1.3073769807815552, + "rewards/margins": 0.3265449106693268, + "rewards/rejected": -1.6339219808578491, + "sft_loss": 1.3142335414886475, "step": 1765 }, { "epoch": 0.9473156046161565, - "grad_norm": 5.332542486808135, - "learning_rate": 2.59429693481039e-06, - "logits/chosen": -0.08531305938959122, - "logits/rejected": 0.08129794895648956, - "logps/chosen": -1.3761022090911865, - "logps/rejected": -1.6028330326080322, - "loss": 1.0791, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3761022090911865, - "rewards/margins": 0.226730614900589, - "rewards/rejected": -1.6028330326080322, - "sft_loss": 1.4245785474777222, + "grad_norm": 5.013746797596467, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": -0.0021407068707048893, + "logits/rejected": 0.15958009660243988, + "logps/chosen": -1.3515747785568237, + "logps/rejected": -1.513629674911499, + "loss": 1.0904, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3515747785568237, + "rewards/margins": 0.1620550900697708, + "rewards/rejected": -1.513629674911499, + "sft_loss": 1.400733232498169, "step": 1770 }, { "epoch": 0.9499916373975581, - "grad_norm": 6.260488330364942, - "learning_rate": 2.5910960868579707e-06, - "logits/chosen": -0.17833852767944336, - "logits/rejected": -0.0832989290356636, - "logps/chosen": -1.3513391017913818, - "logps/rejected": -1.6686162948608398, - "loss": 1.0535, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3513391017913818, - "rewards/margins": 0.31727713346481323, - "rewards/rejected": -1.6686162948608398, - "sft_loss": 1.3597242832183838, + "grad_norm": 7.874936122712779, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": -0.059729017317295074, + "logits/rejected": 0.04206446558237076, + "logps/chosen": -1.2914938926696777, + "logps/rejected": -1.564706563949585, + "loss": 1.0404, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2914938926696777, + "rewards/margins": 0.2732127010822296, + "rewards/rejected": -1.564706563949585, + "sft_loss": 1.3349140882492065, "step": 1775 }, { "epoch": 0.9526676701789597, - "grad_norm": 7.700677741681279, - "learning_rate": 2.5878846530865316e-06, - "logits/chosen": -0.1629796177148819, - "logits/rejected": -0.034121911972761154, - "logps/chosen": -1.3352044820785522, - "logps/rejected": -1.6965748071670532, - "loss": 1.022, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3352044820785522, - "rewards/margins": 0.3613702952861786, - "rewards/rejected": -1.6965748071670532, - "sft_loss": 1.2992217540740967, + "grad_norm": 12.882113345243505, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": -0.06150873750448227, + "logits/rejected": 0.07276951521635056, + "logps/chosen": -1.3248833417892456, + "logps/rejected": -1.6360422372817993, + "loss": 1.0316, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3248833417892456, + "rewards/margins": 0.3111588656902313, + "rewards/rejected": -1.6360422372817993, + "sft_loss": 1.2923238277435303, "step": 1780 }, { "epoch": 0.9553437029603613, - "grad_norm": 6.446010485829404, - "learning_rate": 2.584662664653417e-06, - "logits/chosen": -0.07534436881542206, - "logits/rejected": 0.0013519420754164457, - "logps/chosen": -1.2989352941513062, - "logps/rejected": -1.492027997970581, - "loss": 1.0538, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.2989352941513062, - "rewards/margins": 0.19309253990650177, - "rewards/rejected": -1.492027997970581, - "sft_loss": 1.2925944328308105, + "grad_norm": 8.01846677038553, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": 0.030636975541710854, + "logits/rejected": 0.10803890228271484, + "logps/chosen": -1.280662178993225, + "logps/rejected": -1.4428179264068604, + "loss": 1.057, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.280662178993225, + "rewards/margins": 0.1621556282043457, + "rewards/rejected": -1.4428179264068604, + "sft_loss": 1.281178593635559, "step": 1785 }, { "epoch": 0.9580197357417628, - "grad_norm": 6.389180090896336, - "learning_rate": 2.5814301528183724e-06, - "logits/chosen": -0.06336723268032074, - "logits/rejected": -0.013006513938307762, - "logps/chosen": -1.3607739210128784, - "logps/rejected": -1.599548578262329, - "loss": 1.0743, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3607739210128784, - "rewards/margins": 0.23877449333667755, - "rewards/rejected": -1.599548578262329, - "sft_loss": 1.3988316059112549, + "grad_norm": 6.808143996882543, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": 0.0560939684510231, + "logits/rejected": 0.10724347829818726, + "logps/chosen": -1.343372106552124, + "logps/rejected": -1.5355207920074463, + "loss": 1.0812, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.343372106552124, + "rewards/margins": 0.19214865565299988, + "rewards/rejected": -1.5355207920074463, + "sft_loss": 1.3805100917816162, "step": 1790 }, { "epoch": 0.9606957685231644, - "grad_norm": 4.7739184001154396, - "learning_rate": 2.5781871489432425e-06, - "logits/chosen": -0.19657868146896362, - "logits/rejected": -0.05092762038111687, - "logps/chosen": -1.3206299543380737, - "logps/rejected": -1.6578598022460938, - "loss": 1.0403, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3206299543380737, - "rewards/margins": 0.33722978830337524, - "rewards/rejected": -1.6578598022460938, - "sft_loss": 1.367488980293274, + "grad_norm": 5.031018144550673, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": -0.10228991508483887, + "logits/rejected": 0.04363972693681717, + "logps/chosen": -1.2929900884628296, + "logps/rejected": -1.5646179914474487, + "loss": 1.0514, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2929900884628296, + "rewards/margins": 0.2716279923915863, + "rewards/rejected": -1.5646179914474487, + "sft_loss": 1.345216989517212, "step": 1795 }, { "epoch": 0.963371801304566, - "grad_norm": 6.31162142812504, - "learning_rate": 2.5749336844916644e-06, - "logits/chosen": -0.14416718482971191, - "logits/rejected": -0.0685230940580368, - "logps/chosen": -1.343637228012085, - "logps/rejected": -1.5947457551956177, - "loss": 1.0791, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.343637228012085, - "rewards/margins": 0.25110840797424316, - "rewards/rejected": -1.5947457551956177, - "sft_loss": 1.412238359451294, + "grad_norm": 5.282991735343984, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": -0.04971680790185928, + "logits/rejected": 0.031308285892009735, + "logps/chosen": -1.2953236103057861, + "logps/rejected": -1.4893214702606201, + "loss": 1.0824, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2953236103057861, + "rewards/margins": 0.19399778544902802, + "rewards/rejected": -1.4893214702606201, + "sft_loss": 1.3761422634124756, "step": 1800 }, { "epoch": 0.9660478340859675, - "grad_norm": 5.616802509695661, - "learning_rate": 2.5716697910287653e-06, - "logits/chosen": -0.2638893723487854, - "logits/rejected": -0.12773671746253967, - "logps/chosen": -1.296654462814331, - "logps/rejected": -1.7552036046981812, - "loss": 0.9963, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.296654462814331, - "rewards/margins": 0.4585490822792053, - "rewards/rejected": -1.7552036046981812, - "sft_loss": 1.3569939136505127, + "grad_norm": 5.142929280960286, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": -0.18090695142745972, + "logits/rejected": -0.04280409589409828, + "logps/chosen": -1.2623097896575928, + "logps/rejected": -1.640331506729126, + "loss": 1.0059, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2623097896575928, + "rewards/margins": 0.3780217170715332, + "rewards/rejected": -1.640331506729126, + "sft_loss": 1.3180025815963745, "step": 1805 }, { "epoch": 0.9687238668673691, - "grad_norm": 5.274540470269256, - "learning_rate": 2.5683955002208533e-06, - "logits/chosen": -0.14453324675559998, - "logits/rejected": 0.0004860505578108132, - "logps/chosen": -1.3231637477874756, - "logps/rejected": -1.6313807964324951, - "loss": 1.0423, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3231637477874756, - "rewards/margins": 0.30821719765663147, - "rewards/rejected": -1.6313807964324951, - "sft_loss": 1.3522526025772095, + "grad_norm": 6.015219080547802, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": -0.04094986245036125, + "logits/rejected": 0.10819850116968155, + "logps/chosen": -1.3020027875900269, + "logps/rejected": -1.5312139987945557, + "loss": 1.0636, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3020027875900269, + "rewards/margins": 0.22921118140220642, + "rewards/rejected": -1.5312139987945557, + "sft_loss": 1.3365328311920166, "step": 1810 }, { "epoch": 0.9713998996487707, - "grad_norm": 4.9948455748446134, - "learning_rate": 2.5651108438351125e-06, - "logits/chosen": -0.17525389790534973, - "logits/rejected": -0.03190991282463074, - "logps/chosen": -1.3508412837982178, - "logps/rejected": -1.6075427532196045, - "loss": 1.0749, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3508412837982178, - "rewards/margins": 0.256701797246933, - "rewards/rejected": -1.6075427532196045, - "sft_loss": 1.4097492694854736, + "grad_norm": 6.2344472094409635, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": -0.07465235143899918, + "logits/rejected": 0.07446275651454926, + "logps/chosen": -1.3151795864105225, + "logps/rejected": -1.5252140760421753, + "loss": 1.0723, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3151795864105225, + "rewards/margins": 0.21003445982933044, + "rewards/rejected": -1.5252140760421753, + "sft_loss": 1.3637964725494385, "step": 1815 }, { "epoch": 0.9740759324301723, - "grad_norm": 6.723527857237567, - "learning_rate": 2.5618158537392933e-06, - "logits/chosen": -0.17094561457633972, - "logits/rejected": -0.0991709753870964, - "logps/chosen": -1.3676115274429321, - "logps/rejected": -1.6341607570648193, - "loss": 1.039, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.3676115274429321, - "rewards/margins": 0.2665492594242096, - "rewards/rejected": -1.6341607570648193, - "sft_loss": 1.3335355520248413, + "grad_norm": 9.360317087259874, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": -0.03616784140467644, + "logits/rejected": 0.03897800296545029, + "logps/chosen": -1.3293553590774536, + "logps/rejected": -1.5339654684066772, + "loss": 1.0521, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3293553590774536, + "rewards/margins": 0.20461025834083557, + "rewards/rejected": -1.5339654684066772, + "sft_loss": 1.312124252319336, "step": 1820 }, { "epoch": 0.9767519652115738, - "grad_norm": 6.955491736729167, - "learning_rate": 2.5585105619014042e-06, - "logits/chosen": -0.2364232838153839, - "logits/rejected": -0.08536256849765778, - "logps/chosen": -1.3084943294525146, - "logps/rejected": -1.698646903038025, - "loss": 1.0407, - "rewards/accuracies": 0.5625, - "rewards/chosen": -1.3084943294525146, - "rewards/margins": 0.39015254378318787, - "rewards/rejected": -1.698646903038025, - "sft_loss": 1.3571867942810059, + "grad_norm": 8.443872063306976, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": -0.1401108205318451, + "logits/rejected": 0.015832537785172462, + "logps/chosen": -1.2916826009750366, + "logps/rejected": -1.6708862781524658, + "loss": 1.0449, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2916826009750366, + "rewards/margins": 0.3792034685611725, + "rewards/rejected": -1.6708862781524658, + "sft_loss": 1.3466923236846924, "step": 1825 }, { "epoch": 0.9794279979929754, - "grad_norm": 4.935043830160641, - "learning_rate": 2.555195000389401e-06, - "logits/chosen": -0.09421467036008835, - "logits/rejected": -0.06247818470001221, - "logps/chosen": -1.398600697517395, - "logps/rejected": -1.5775256156921387, - "loss": 1.0923, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.398600697517395, - "rewards/margins": 0.1789250671863556, - "rewards/rejected": -1.5775256156921387, - "sft_loss": 1.4048207998275757, + "grad_norm": 5.552897718500467, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": 0.07638445496559143, + "logits/rejected": 0.1197354644536972, + "logps/chosen": -1.3811604976654053, + "logps/rejected": -1.518565058708191, + "loss": 1.102, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3811604976654053, + "rewards/margins": 0.13740459084510803, + "rewards/rejected": -1.518565058708191, + "sft_loss": 1.3802618980407715, "step": 1830 }, { "epoch": 0.982104030774377, - "grad_norm": 5.488009318290176, - "learning_rate": 2.5518692013708764e-06, - "logits/chosen": -0.19971255958080292, - "logits/rejected": -0.13240863382816315, - "logps/chosen": -1.3496644496917725, - "logps/rejected": -1.4494019746780396, - "loss": 1.117, + "grad_norm": 6.599402994951173, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": -0.060695432126522064, + "logits/rejected": 0.014427835121750832, + "logps/chosen": -1.3435875177383423, + "logps/rejected": -1.4622777700424194, + "loss": 1.1121, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -1.3496644496917725, - "rewards/margins": 0.09973742812871933, - "rewards/rejected": -1.4494019746780396, - "sft_loss": 1.3982349634170532, + "rewards/chosen": -1.3435875177383423, + "rewards/margins": 0.11869029700756073, + "rewards/rejected": -1.4622777700424194, + "sft_loss": 1.3856557607650757, "step": 1835 }, { "epoch": 0.9847800635557785, - "grad_norm": 5.861461820294152, - "learning_rate": 2.5485331971127467e-06, - "logits/chosen": -0.17495878040790558, - "logits/rejected": -0.061311207711696625, - "logps/chosen": -1.3403594493865967, - "logps/rejected": -1.7291936874389648, - "loss": 1.0203, + "grad_norm": 6.3743422415046345, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": 0.006036204285919666, + "logits/rejected": 0.12875476479530334, + "logps/chosen": -1.3412091732025146, + "logps/rejected": -1.6945394277572632, + "loss": 1.025, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3403594493865967, - "rewards/margins": 0.3888341784477234, - "rewards/rejected": -1.7291936874389648, - "sft_loss": 1.3728001117706299, + "rewards/chosen": -1.3412091732025146, + "rewards/margins": 0.3533302843570709, + "rewards/rejected": -1.6945394277572632, + "sft_loss": 1.362762212753296, "step": 1840 }, { "epoch": 0.9874560963371801, - "grad_norm": 8.398297831143816, - "learning_rate": 2.5451870199809398e-06, - "logits/chosen": -0.2426266372203827, - "logits/rejected": -0.15641649067401886, - "logps/chosen": -1.3813214302062988, - "logps/rejected": -1.6519542932510376, - "loss": 1.1025, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -1.3813214302062988, - "rewards/margins": 0.27063268423080444, - "rewards/rejected": -1.6519542932510376, - "sft_loss": 1.407288908958435, + "grad_norm": 8.105021438155644, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": -0.05386769026517868, + "logits/rejected": 0.04598368704319, + "logps/chosen": -1.3784972429275513, + "logps/rejected": -1.6016931533813477, + "loss": 1.1197, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3784972429275513, + "rewards/margins": 0.22319582104682922, + "rewards/rejected": -1.6016931533813477, + "sft_loss": 1.4001914262771606, "step": 1845 }, { "epoch": 0.9901321291185817, - "grad_norm": 10.421658236786511, - "learning_rate": 2.5418307024400808e-06, - "logits/chosen": -0.3867731988430023, - "logits/rejected": -0.25579994916915894, - "logps/chosen": -1.4403059482574463, - "logps/rejected": -1.6446120738983154, - "loss": 1.1147, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": -1.4403059482574463, - "rewards/margins": 0.20430617034435272, - "rewards/rejected": -1.6446120738983154, - "sft_loss": 1.4047480821609497, + "grad_norm": 10.38510756238131, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": -0.20087404549121857, + "logits/rejected": -0.05395135283470154, + "logps/chosen": -1.402502417564392, + "logps/rejected": -1.540789008140564, + "loss": 1.1224, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.402502417564392, + "rewards/margins": 0.13828660547733307, + "rewards/rejected": -1.540789008140564, + "sft_loss": 1.3710534572601318, "step": 1850 }, { "epoch": 0.9928081618999832, - "grad_norm": 8.945771287029995, - "learning_rate": 2.538464277053178e-06, - "logits/chosen": -0.3046836256980896, - "logits/rejected": -0.20668058097362518, - "logps/chosen": -1.3661428689956665, - "logps/rejected": -1.698788046836853, - "loss": 1.0565, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3661428689956665, - "rewards/margins": 0.332645058631897, - "rewards/rejected": -1.698788046836853, - "sft_loss": 1.3933660984039307, + "grad_norm": 9.39666350747347, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": -0.07947036623954773, + "logits/rejected": 0.03734710440039635, + "logps/chosen": -1.3037769794464111, + "logps/rejected": -1.5832937955856323, + "loss": 1.0464, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3037769794464111, + "rewards/margins": 0.2795167565345764, + "rewards/rejected": -1.5832937955856323, + "sft_loss": 1.3439761400222778, "step": 1855 }, { "epoch": 0.9954841946813848, - "grad_norm": 6.4600258169068105, - "learning_rate": 2.5350877764813042e-06, - "logits/chosen": -0.24842870235443115, - "logits/rejected": -0.17129719257354736, - "logps/chosen": -1.4513623714447021, - "logps/rejected": -1.7643362283706665, - "loss": 1.0665, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.4513623714447021, - "rewards/margins": 0.3129737079143524, - "rewards/rejected": -1.7643362283706665, - "sft_loss": 1.4227665662765503, + "grad_norm": 6.725630926196394, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": -0.04184072092175484, + "logits/rejected": 0.049746450036764145, + "logps/chosen": -1.4166171550750732, + "logps/rejected": -1.637763261795044, + "loss": 1.0816, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4166171550750732, + "rewards/margins": 0.22114595770835876, + "rewards/rejected": -1.637763261795044, + "sft_loss": 1.3935834169387817, "step": 1860 }, { "epoch": 0.9981602274627864, - "grad_norm": 5.726169321686694, - "learning_rate": 2.531701233483284e-06, - "logits/chosen": -0.2357204407453537, - "logits/rejected": -0.1661442667245865, - "logps/chosen": -1.3102283477783203, - "logps/rejected": -1.7116100788116455, - "loss": 1.0178, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3102283477783203, - "rewards/margins": 0.40138188004493713, - "rewards/rejected": -1.7116100788116455, - "sft_loss": 1.3458659648895264, + "grad_norm": 6.224624107660037, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": -0.07246112823486328, + "logits/rejected": 0.014533983543515205, + "logps/chosen": -1.2703287601470947, + "logps/rejected": -1.5969499349594116, + "loss": 1.021, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2703287601470947, + "rewards/margins": 0.3266211450099945, + "rewards/rejected": -1.5969499349594116, + "sft_loss": 1.3040375709533691, "step": 1865 }, { "epoch": 1.000836260244188, - "grad_norm": 6.694603142400152, - "learning_rate": 2.5283046809153708e-06, - "logits/chosen": -0.28832611441612244, - "logits/rejected": -0.1387026458978653, - "logps/chosen": -1.4116193056106567, - "logps/rejected": -1.744187593460083, - "loss": 1.0715, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.4116193056106567, - "rewards/margins": 0.3325682580471039, - "rewards/rejected": -1.744187593460083, - "sft_loss": 1.4343914985656738, + "grad_norm": 8.211500198907471, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": -0.14373692870140076, + "logits/rejected": 0.017202334478497505, + "logps/chosen": -1.3954145908355713, + "logps/rejected": -1.6238315105438232, + "loss": 1.0964, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3954145908355713, + "rewards/margins": 0.22841675579547882, + "rewards/rejected": -1.6238315105438232, + "sft_loss": 1.4205118417739868, "step": 1870 }, { "epoch": 1.0035122930255895, - "grad_norm": 8.177964909816092, - "learning_rate": 2.524898151730934e-06, - "logits/chosen": -0.3231280446052551, - "logits/rejected": -0.19551904499530792, - "logps/chosen": -1.3622983694076538, - "logps/rejected": -1.7303298711776733, - "loss": 1.0075, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3622983694076538, - "rewards/margins": 0.36803165078163147, - "rewards/rejected": -1.7303298711776733, - "sft_loss": 1.3062353134155273, + "grad_norm": 7.784393754102794, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": -0.1777457594871521, + "logits/rejected": -0.03787863254547119, + "logps/chosen": -1.380475640296936, + "logps/rejected": -1.5722134113311768, + "loss": 1.0659, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.380475640296936, + "rewards/margins": 0.1917375922203064, + "rewards/rejected": -1.5722134113311768, + "sft_loss": 1.3126857280731201, "step": 1875 }, { "epoch": 1.0061883258069912, - "grad_norm": 7.594474341201221, - "learning_rate": 2.5214816789801337e-06, - "logits/chosen": -0.2040158212184906, - "logits/rejected": -0.044173695147037506, - "logps/chosen": -1.3152166604995728, - "logps/rejected": -1.9164692163467407, - "loss": 0.9674, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3152166604995728, - "rewards/margins": 0.6012526154518127, - "rewards/rejected": -1.9164692163467407, - "sft_loss": 1.3626232147216797, + "grad_norm": 9.483278737279292, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": -0.04560353606939316, + "logits/rejected": 0.127095028758049, + "logps/chosen": -1.3196014165878296, + "logps/rejected": -1.7367639541625977, + "loss": 1.0283, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3196014165878296, + "rewards/margins": 0.4171624183654785, + "rewards/rejected": -1.7367639541625977, + "sft_loss": 1.3578262329101562, "step": 1880 }, { "epoch": 1.0088643585883927, - "grad_norm": 5.9740793893869935, - "learning_rate": 2.518055295809604e-06, - "logits/chosen": -0.2451355904340744, - "logits/rejected": -0.16891005635261536, - "logps/chosen": -1.2620337009429932, - "logps/rejected": -1.720070242881775, - "loss": 0.9786, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2620337009429932, - "rewards/margins": 0.4580365717411041, - "rewards/rejected": -1.720070242881775, - "sft_loss": 1.2582600116729736, + "grad_norm": 6.936539864471406, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": -0.10250772535800934, + "logits/rejected": -0.022584009915590286, + "logps/chosen": -1.2747838497161865, + "logps/rejected": -1.587401032447815, + "loss": 1.0299, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2747838497161865, + "rewards/margins": 0.31261715292930603, + "rewards/rejected": -1.587401032447815, + "sft_loss": 1.2519252300262451, "step": 1885 }, { "epoch": 1.0115403913697942, - "grad_norm": 5.63311132788852, - "learning_rate": 2.5146190354621295e-06, - "logits/chosen": -0.29360517859458923, - "logits/rejected": -0.09512095153331757, - "logps/chosen": -1.2986427545547485, - "logps/rejected": -1.7647788524627686, - "loss": 0.9846, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2986427545547485, - "rewards/margins": 0.4661361575126648, - "rewards/rejected": -1.7647788524627686, - "sft_loss": 1.3902837038040161, + "grad_norm": 5.619838750688994, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": -0.11007682234048843, + "logits/rejected": 0.08966507017612457, + "logps/chosen": -1.3074545860290527, + "logps/rejected": -1.5856674909591675, + "loss": 1.05, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3074545860290527, + "rewards/margins": 0.2782130241394043, + "rewards/rejected": -1.5856674909591675, + "sft_loss": 1.386627197265625, "step": 1890 }, { "epoch": 1.014216424151196, - "grad_norm": 6.071013462973501, - "learning_rate": 2.511172931276323e-06, - "logits/chosen": -0.23120129108428955, - "logits/rejected": -0.1789826899766922, - "logps/chosen": -1.3082417249679565, - "logps/rejected": -1.6579334735870361, - "loss": 0.9905, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3082417249679565, - "rewards/margins": 0.34969156980514526, - "rewards/rejected": -1.6579334735870361, - "sft_loss": 1.325251817703247, + "grad_norm": 5.740860945772739, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": -0.042826805263757706, + "logits/rejected": 0.01792163774371147, + "logps/chosen": -1.3253368139266968, + "logps/rejected": -1.5498539209365845, + "loss": 1.0328, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3253368139266968, + "rewards/margins": 0.2245168685913086, + "rewards/rejected": -1.5498539209365845, + "sft_loss": 1.3255261182785034, "step": 1895 }, { "epoch": 1.0168924569325974, - "grad_norm": 4.573280999864407, - "learning_rate": 2.5077170166863026e-06, - "logits/chosen": -0.3544650971889496, - "logits/rejected": -0.10121510177850723, - "logps/chosen": -1.328932762145996, - "logps/rejected": -1.7810074090957642, - "loss": 0.9955, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.328932762145996, - "rewards/margins": 0.4520746171474457, - "rewards/rejected": -1.7810074090957642, - "sft_loss": 1.377091646194458, + "grad_norm": 5.0922788595642, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": -0.1649695336818695, + "logits/rejected": 0.1055021733045578, + "logps/chosen": -1.3374435901641846, + "logps/rejected": -1.5897200107574463, + "loss": 1.0576, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3374435901641846, + "rewards/margins": 0.25227636098861694, + "rewards/rejected": -1.5897200107574463, + "sft_loss": 1.3770906925201416, "step": 1900 }, { "epoch": 1.019568489713999, - "grad_norm": 4.361287268957535, - "learning_rate": 2.504251325221366e-06, - "logits/chosen": -0.28322911262512207, - "logits/rejected": -0.14230282604694366, - "logps/chosen": -1.3653513193130493, - "logps/rejected": -1.7526214122772217, - "loss": 1.0219, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3653513193130493, - "rewards/margins": 0.38727012276649475, - "rewards/rejected": -1.7526214122772217, - "sft_loss": 1.3574550151824951, + "grad_norm": 3.926305418162864, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": -0.08125348389148712, + "logits/rejected": 0.07250069081783295, + "logps/chosen": -1.3621914386749268, + "logps/rejected": -1.594728708267212, + "loss": 1.0676, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3621914386749268, + "rewards/margins": 0.2325374186038971, + "rewards/rejected": -1.594728708267212, + "sft_loss": 1.352673888206482, "step": 1905 }, { "epoch": 1.0222445224954007, - "grad_norm": 6.172709458505564, - "learning_rate": 2.500775890505668e-06, - "logits/chosen": -0.3746119737625122, - "logits/rejected": -0.2532230019569397, - "logps/chosen": -1.307799220085144, - "logps/rejected": -1.6917346715927124, - "loss": 0.9872, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.307799220085144, - "rewards/margins": 0.38393548130989075, - "rewards/rejected": -1.6917346715927124, - "sft_loss": 1.3196412324905396, + "grad_norm": 6.780850327708814, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": -0.17982670664787292, + "logits/rejected": -0.05172845721244812, + "logps/chosen": -1.3353796005249023, + "logps/rejected": -1.570927381515503, + "loss": 1.0505, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3353796005249023, + "rewards/margins": 0.23554787039756775, + "rewards/rejected": -1.570927381515503, + "sft_loss": 1.3366186618804932, "step": 1910 }, { "epoch": 1.0249205552768021, - "grad_norm": 4.0633181580152735, - "learning_rate": 2.497290746257891e-06, - "logits/chosen": -0.29822617769241333, - "logits/rejected": -0.22360272705554962, - "logps/chosen": -1.2829499244689941, - "logps/rejected": -1.6371288299560547, - "loss": 1.0297, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2829499244689941, - "rewards/margins": 0.35417887568473816, - "rewards/rejected": -1.6371288299560547, - "sft_loss": 1.359552025794983, + "grad_norm": 4.888021467437801, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": -0.10884324461221695, + "logits/rejected": -0.02582775428891182, + "logps/chosen": -1.3080412149429321, + "logps/rejected": -1.5268137454986572, + "loss": 1.076, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3080412149429321, + "rewards/margins": 0.2187725007534027, + "rewards/rejected": -1.5268137454986572, + "sft_loss": 1.3675590753555298, "step": 1915 }, { "epoch": 1.0275965880582036, - "grad_norm": 7.153620462384848, - "learning_rate": 2.49379592629092e-06, - "logits/chosen": -0.34407928586006165, - "logits/rejected": -0.27675971388816833, - "logps/chosen": -1.1805469989776611, - "logps/rejected": -1.652948021888733, - "loss": 0.9315, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.1805469989776611, - "rewards/margins": 0.47240084409713745, - "rewards/rejected": -1.652948021888733, - "sft_loss": 1.2449908256530762, + "grad_norm": 5.630214545738198, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": -0.12240447849035263, + "logits/rejected": -0.039300721138715744, + "logps/chosen": -1.2059236764907837, + "logps/rejected": -1.545480728149414, + "loss": 0.9855, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2059236764907837, + "rewards/margins": 0.3395571708679199, + "rewards/rejected": -1.545480728149414, + "sft_loss": 1.258929967880249, "step": 1920 }, { "epoch": 1.0302726208396054, - "grad_norm": 6.279139416421326, - "learning_rate": 2.4902914645115135e-06, - "logits/chosen": -0.46754807233810425, - "logits/rejected": -0.27566924691200256, - "logps/chosen": -1.3483153581619263, - "logps/rejected": -1.741081953048706, - "loss": 1.0151, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3483153581619263, - "rewards/margins": 0.39276665449142456, - "rewards/rejected": -1.741081953048706, - "sft_loss": 1.4039819240570068, + "grad_norm": 7.493618244933485, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": -0.2451147735118866, + "logits/rejected": -0.020291466265916824, + "logps/chosen": -1.375449299812317, + "logps/rejected": -1.613504409790039, + "loss": 1.0801, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.375449299812317, + "rewards/margins": 0.23805518448352814, + "rewards/rejected": -1.613504409790039, + "sft_loss": 1.4104481935501099, "step": 1925 }, { "epoch": 1.0329486536210069, - "grad_norm": 15.078125841372746, - "learning_rate": 2.4867773949199748e-06, - "logits/chosen": -0.4051434397697449, - "logits/rejected": -0.27849093079566956, - "logps/chosen": -1.2485005855560303, - "logps/rejected": -1.7951538562774658, - "loss": 0.9279, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2485005855560303, - "rewards/margins": 0.5466530919075012, - "rewards/rejected": -1.7951538562774658, - "sft_loss": 1.3159576654434204, + "grad_norm": 7.361962075932273, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": -0.1329573094844818, + "logits/rejected": 0.01702733151614666, + "logps/chosen": -1.2550948858261108, + "logps/rejected": -1.5974655151367188, + "loss": 0.987, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2550948858261108, + "rewards/margins": 0.3423704504966736, + "rewards/rejected": -1.5974655151367188, + "sft_loss": 1.2982847690582275, "step": 1930 }, { "epoch": 1.0356246864024083, - "grad_norm": 7.358861910072948, - "learning_rate": 2.483253751609823e-06, - "logits/chosen": -0.3944740891456604, - "logits/rejected": -0.21546992659568787, - "logps/chosen": -1.341235637664795, - "logps/rejected": -1.9729608297348022, - "loss": 0.9468, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.341235637664795, - "rewards/margins": 0.6317251920700073, - "rewards/rejected": -1.9729608297348022, - "sft_loss": 1.3817254304885864, + "grad_norm": 8.331882985469626, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": -0.17175039649009705, + "logits/rejected": 0.02810470387339592, + "logps/chosen": -1.3440721035003662, + "logps/rejected": -1.7161515951156616, + "loss": 1.0069, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3440721035003662, + "rewards/margins": 0.3720795810222626, + "rewards/rejected": -1.7161515951156616, + "sft_loss": 1.3601570129394531, "step": 1935 }, { "epoch": 1.03830071918381, - "grad_norm": 6.991473226065031, - "learning_rate": 2.4797205687674608e-06, - "logits/chosen": -0.31449756026268005, - "logits/rejected": -0.21377721428871155, - "logps/chosen": -1.345902681350708, - "logps/rejected": -1.9185478687286377, - "loss": 0.9752, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.345902681350708, - "rewards/margins": 0.5726450085639954, - "rewards/rejected": -1.9185478687286377, - "sft_loss": 1.3627817630767822, + "grad_norm": 6.441709503588563, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": -0.08347249776124954, + "logits/rejected": 0.03723754733800888, + "logps/chosen": -1.3489030599594116, + "logps/rejected": -1.716583013534546, + "loss": 1.0284, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3489030599594116, + "rewards/margins": 0.36768001317977905, + "rewards/rejected": -1.716583013534546, + "sft_loss": 1.363987684249878, "step": 1940 }, { "epoch": 1.0409767519652116, - "grad_norm": 11.559584684546387, - "learning_rate": 2.476177880671843e-06, - "logits/chosen": -0.43540072441101074, - "logits/rejected": -0.27904751896858215, - "logps/chosen": -1.3725993633270264, - "logps/rejected": -2.1540937423706055, - "loss": 0.9587, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3725993633270264, - "rewards/margins": 0.7814942598342896, - "rewards/rejected": -2.1540937423706055, - "sft_loss": 1.4134228229522705, + "grad_norm": 4.881873706559672, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": -0.18445467948913574, + "logits/rejected": -0.005650246050208807, + "logps/chosen": -1.3698749542236328, + "logps/rejected": -1.8684381246566772, + "loss": 1.0048, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3698749542236328, + "rewards/margins": 0.498563289642334, + "rewards/rejected": -1.8684381246566772, + "sft_loss": 1.3930516242980957, "step": 1945 }, { "epoch": 1.043652784746613, - "grad_norm": 7.589617000083219, - "learning_rate": 2.4726257216941463e-06, - "logits/chosen": -0.34726226329803467, - "logits/rejected": -0.1460946649312973, - "logps/chosen": -1.3964488506317139, - "logps/rejected": -1.9331518411636353, - "loss": 1.0112, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3964488506317139, - "rewards/margins": 0.5367029905319214, - "rewards/rejected": -1.9331518411636353, - "sft_loss": 1.4542505741119385, + "grad_norm": 6.394072611705559, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": -0.08021242916584015, + "logits/rejected": 0.1560056209564209, + "logps/chosen": -1.4112555980682373, + "logps/rejected": -1.7629226446151733, + "loss": 1.0666, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4112555980682373, + "rewards/margins": 0.35166722536087036, + "rewards/rejected": -1.7629226446151733, + "sft_loss": 1.4502036571502686, "step": 1950 }, { "epoch": 1.0463288175280148, - "grad_norm": 5.53217072199513, - "learning_rate": 2.4690641262974317e-06, - "logits/chosen": -0.3106427490711212, - "logits/rejected": -0.24054069817066193, - "logps/chosen": -1.254062294960022, - "logps/rejected": -1.7604010105133057, - "loss": 0.9492, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.254062294960022, - "rewards/margins": 0.5063384771347046, - "rewards/rejected": -1.7604010105133057, - "sft_loss": 1.2828395366668701, + "grad_norm": 5.268966441319738, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": -0.09227783232927322, + "logits/rejected": -0.02726762369275093, + "logps/chosen": -1.2625749111175537, + "logps/rejected": -1.6419893503189087, + "loss": 0.98, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2625749111175537, + "rewards/margins": 0.3794143795967102, + "rewards/rejected": -1.6419893503189087, + "sft_loss": 1.28301203250885, "step": 1955 }, { "epoch": 1.0490048503094163, - "grad_norm": 5.789997895299221, - "learning_rate": 2.4654931290363135e-06, - "logits/chosen": -0.39274802803993225, - "logits/rejected": -0.3678857684135437, - "logps/chosen": -1.3077292442321777, - "logps/rejected": -1.8038746118545532, - "loss": 0.9884, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3077292442321777, - "rewards/margins": 0.4961455464363098, - "rewards/rejected": -1.8038746118545532, - "sft_loss": 1.3811976909637451, + "grad_norm": 7.312813219261093, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": -0.14062745869159698, + "logits/rejected": -0.10793910175561905, + "logps/chosen": -1.3452221155166626, + "logps/rejected": -1.6348785161972046, + "loss": 1.0561, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3452221155166626, + "rewards/margins": 0.28965651988983154, + "rewards/rejected": -1.6348785161972046, + "sft_loss": 1.4023061990737915, "step": 1960 }, { "epoch": 1.051680883090818, - "grad_norm": 7.167359990925718, - "learning_rate": 2.461912764556623e-06, - "logits/chosen": -0.32825708389282227, - "logits/rejected": -0.2701486349105835, - "logps/chosen": -1.2460230588912964, - "logps/rejected": -1.9635534286499023, - "loss": 0.9254, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2460230588912964, - "rewards/margins": 0.717530369758606, - "rewards/rejected": -1.9635534286499023, - "sft_loss": 1.3071447610855103, + "grad_norm": 6.6596167436143565, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": -0.057622771710157394, + "logits/rejected": 0.018117045983672142, + "logps/chosen": -1.2645435333251953, + "logps/rejected": -1.7956523895263672, + "loss": 0.9724, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2645435333251953, + "rewards/margins": 0.5311091542243958, + "rewards/rejected": -1.7956523895263672, + "sft_loss": 1.31093430519104, "step": 1965 }, { "epoch": 1.0543569158722195, - "grad_norm": 4.728180593919103, - "learning_rate": 2.4583230675950717e-06, - "logits/chosen": -0.3871462643146515, - "logits/rejected": -0.2666874825954437, - "logps/chosen": -1.287161111831665, - "logps/rejected": -1.8006842136383057, - "loss": 0.9718, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.287161111831665, - "rewards/margins": 0.5135231614112854, - "rewards/rejected": -1.8006842136383057, - "sft_loss": 1.3299510478973389, + "grad_norm": 5.363156650986811, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": -0.1434890478849411, + "logits/rejected": 0.008483712561428547, + "logps/chosen": -1.3194637298583984, + "logps/rejected": -1.646680235862732, + "loss": 1.034, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3194637298583984, + "rewards/margins": 0.3272164762020111, + "rewards/rejected": -1.646680235862732, + "sft_loss": 1.3527896404266357, "step": 1970 }, { "epoch": 1.057032948653621, - "grad_norm": 6.5082169516852355, - "learning_rate": 2.4547240729789156e-06, - "logits/chosen": -0.3427007794380188, - "logits/rejected": -0.26755183935165405, - "logps/chosen": -1.2617965936660767, - "logps/rejected": -1.7926756143569946, - "loss": 0.9507, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2617965936660767, - "rewards/margins": 0.5308788418769836, - "rewards/rejected": -1.7926756143569946, - "sft_loss": 1.3087375164031982, + "grad_norm": 7.53618794053218, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": -0.06295988708734512, + "logits/rejected": 0.03186292201280594, + "logps/chosen": -1.2528502941131592, + "logps/rejected": -1.591352939605713, + "loss": 1.0003, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2528502941131592, + "rewards/margins": 0.3385026752948761, + "rewards/rejected": -1.591352939605713, + "sft_loss": 1.3003718852996826, "step": 1975 }, { "epoch": 1.0597089814350227, - "grad_norm": 6.383588115894568, - "learning_rate": 2.451115815625617e-06, - "logits/chosen": -0.2760846018791199, - "logits/rejected": -0.16218586266040802, - "logps/chosen": -1.3795157670974731, - "logps/rejected": -1.9243261814117432, - "loss": 0.9986, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3795157670974731, - "rewards/margins": 0.5448102951049805, - "rewards/rejected": -1.9243261814117432, - "sft_loss": 1.3830487728118896, + "grad_norm": 6.419840363438428, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": 0.008228405378758907, + "logits/rejected": 0.13828298449516296, + "logps/chosen": -1.3528460264205933, + "logps/rejected": -1.6920034885406494, + "loss": 1.0396, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3528460264205933, + "rewards/margins": 0.3391575813293457, + "rewards/rejected": -1.6920034885406494, + "sft_loss": 1.3606065511703491, "step": 1980 }, { "epoch": 1.0623850142164242, - "grad_norm": 5.9339767920468, - "learning_rate": 2.4474983305425025e-06, - "logits/chosen": -0.35163047909736633, - "logits/rejected": -0.20219776034355164, - "logps/chosen": -1.3649961948394775, - "logps/rejected": -1.8313382863998413, - "loss": 1.0088, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3649961948394775, - "rewards/margins": 0.46634215116500854, - "rewards/rejected": -1.8313382863998413, - "sft_loss": 1.356627345085144, + "grad_norm": 7.530584399794697, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": -0.08242753148078918, + "logits/rejected": 0.08598792552947998, + "logps/chosen": -1.3864794969558716, + "logps/rejected": -1.6212832927703857, + "loss": 1.0795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3864794969558716, + "rewards/margins": 0.23480382561683655, + "rewards/rejected": -1.6212832927703857, + "sft_loss": 1.3649652004241943, "step": 1985 }, { "epoch": 1.0650610469978257, - "grad_norm": 9.750541232945201, - "learning_rate": 2.4438716528264307e-06, - "logits/chosen": -0.39386358857154846, - "logits/rejected": -0.3099510073661804, - "logps/chosen": -1.3966448307037354, - "logps/rejected": -1.8608152866363525, - "loss": 0.989, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3966448307037354, - "rewards/margins": 0.4641706049442291, - "rewards/rejected": -1.8608152866363525, - "sft_loss": 1.3813846111297607, + "grad_norm": 7.789921643519763, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": -0.1279408484697342, + "logits/rejected": -0.02753205969929695, + "logps/chosen": -1.3840112686157227, + "logps/rejected": -1.6482646465301514, + "loss": 1.0611, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3840112686157227, + "rewards/margins": 0.26425355672836304, + "rewards/rejected": -1.6482646465301514, + "sft_loss": 1.3844091892242432, "step": 1990 }, { "epoch": 1.0677370797792274, - "grad_norm": 6.0436821093239175, - "learning_rate": 2.440235817663443e-06, - "logits/chosen": -0.2763604521751404, - "logits/rejected": -0.14427319169044495, - "logps/chosen": -1.2740201950073242, - "logps/rejected": -1.9402685165405273, - "loss": 0.941, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2740201950073242, - "rewards/margins": 0.6662485003471375, - "rewards/rejected": -1.9402685165405273, - "sft_loss": 1.3213818073272705, + "grad_norm": 5.410895902437541, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": -0.020565593615174294, + "logits/rejected": 0.135699063539505, + "logps/chosen": -1.2775145769119263, + "logps/rejected": -1.7521684169769287, + "loss": 0.9907, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2775145769119263, + "rewards/margins": 0.47465381026268005, + "rewards/rejected": -1.7521684169769287, + "sft_loss": 1.3165042400360107, "step": 1995 }, { "epoch": 1.0704131125606289, - "grad_norm": 13.243435815567677, - "learning_rate": 2.4365908603284285e-06, - "logits/chosen": -0.3816941976547241, - "logits/rejected": -0.23525352776050568, - "logps/chosen": -1.4328410625457764, - "logps/rejected": -2.0395724773406982, - "loss": 1.0569, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -1.4328410625457764, - "rewards/margins": 0.6067315340042114, - "rewards/rejected": -2.0395724773406982, - "sft_loss": 1.42452871799469, + "grad_norm": 10.314479472868056, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": -0.09743030369281769, + "logits/rejected": 0.06601885706186295, + "logps/chosen": -1.433811068534851, + "logps/rejected": -1.760123610496521, + "loss": 1.1366, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.433811068534851, + "rewards/margins": 0.32631251215934753, + "rewards/rejected": -1.760123610496521, + "sft_loss": 1.414137601852417, "step": 2000 }, { "epoch": 1.0704131125606289, - "eval_logits/chosen": -0.06950338929891586, - "eval_logits/rejected": 0.009176608175039291, - "eval_logps/chosen": -1.4133427143096924, - "eval_logps/rejected": -1.8531452417373657, - "eval_loss": 1.0426981449127197, - "eval_rewards/accuracies": 0.6298219561576843, - "eval_rewards/chosen": -1.4133427143096924, - "eval_rewards/margins": 0.43980276584625244, - "eval_rewards/rejected": -1.8531452417373657, - "eval_runtime": 43.1749, - "eval_samples_per_second": 31.152, - "eval_sft_loss": 1.416014313697815, - "eval_steps_per_second": 7.805, + "eval_logits/chosen": 0.24525417387485504, + "eval_logits/rejected": 0.34158626198768616, + "eval_logps/chosen": -1.3555330038070679, + "eval_logps/rejected": -1.7039648294448853, + "eval_loss": 1.049640417098999, + "eval_rewards/accuracies": 0.5986647009849548, + "eval_rewards/chosen": -1.3555330038070679, + "eval_rewards/margins": 0.3484318256378174, + "eval_rewards/rejected": -1.7039648294448853, + "eval_runtime": 43.3282, + "eval_samples_per_second": 31.042, + "eval_sft_loss": 1.3798303604125977, + "eval_steps_per_second": 7.778, "step": 2000 }, { "epoch": 1.0730891453420304, - "grad_norm": 6.708271089434237, - "learning_rate": 2.4329368161847796e-06, - "logits/chosen": -0.34353774785995483, - "logits/rejected": -0.27555742859840393, - "logps/chosen": -1.352007269859314, - "logps/rejected": -1.7914186716079712, - "loss": 1.0402, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.352007269859314, - "rewards/margins": 0.4394114911556244, - "rewards/rejected": -1.7914186716079712, - "sft_loss": 1.4041087627410889, + "grad_norm": 7.411979260880644, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": -0.051903557032346725, + "logits/rejected": 0.03727956861257553, + "logps/chosen": -1.3741095066070557, + "logps/rejected": -1.613830804824829, + "loss": 1.0907, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3741095066070557, + "rewards/margins": 0.23972125351428986, + "rewards/rejected": -1.613830804824829, + "sft_loss": 1.402032494544983, "step": 2005 }, { "epoch": 1.075765178123432, - "grad_norm": 8.254107888627756, - "learning_rate": 2.4292737206840483e-06, - "logits/chosen": -0.2618446946144104, - "logits/rejected": -0.16235823929309845, - "logps/chosen": -1.2731168270111084, - "logps/rejected": -1.714038610458374, - "loss": 0.9913, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2731168270111084, - "rewards/margins": 0.4409221112728119, - "rewards/rejected": -1.714038610458374, - "sft_loss": 1.3453394174575806, + "grad_norm": 8.070388340931352, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": -0.002574050333350897, + "logits/rejected": 0.11509355157613754, + "logps/chosen": -1.2788218259811401, + "logps/rejected": -1.5739259719848633, + "loss": 1.037, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2788218259811401, + "rewards/margins": 0.29510411620140076, + "rewards/rejected": -1.5739259719848633, + "sft_loss": 1.3477932214736938, "step": 2010 }, { "epoch": 1.0784412109048336, - "grad_norm": 7.6854724761378135, - "learning_rate": 2.4256016093656035e-06, - "logits/chosen": -0.3184880018234253, - "logits/rejected": -0.17739680409431458, - "logps/chosen": -1.2959625720977783, - "logps/rejected": -1.75029718875885, - "loss": 0.9699, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2959625720977783, - "rewards/margins": 0.454334557056427, - "rewards/rejected": -1.75029718875885, - "sft_loss": 1.289369821548462, + "grad_norm": 6.782166764980172, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": -0.03715590387582779, + "logits/rejected": 0.12355498969554901, + "logps/chosen": -1.2872711420059204, + "logps/rejected": -1.597229242324829, + "loss": 1.0119, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2872711420059204, + "rewards/margins": 0.30995815992355347, + "rewards/rejected": -1.597229242324829, + "sft_loss": 1.2761762142181396, "step": 2015 }, { "epoch": 1.081117243686235, - "grad_norm": 6.3231906945046825, - "learning_rate": 2.421920517856285e-06, - "logits/chosen": -0.38904905319213867, - "logits/rejected": -0.21985527873039246, - "logps/chosen": -1.3628504276275635, - "logps/rejected": -1.8863815069198608, - "loss": 0.977, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3628504276275635, - "rewards/margins": 0.5235310196876526, - "rewards/rejected": -1.8863815069198608, - "sft_loss": 1.369199514389038, + "grad_norm": 6.363280249836633, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": -0.11921674013137817, + "logits/rejected": 0.08086469024419785, + "logps/chosen": -1.3570458889007568, + "logps/rejected": -1.6779505014419556, + "loss": 1.0451, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3570458889007568, + "rewards/margins": 0.32090455293655396, + "rewards/rejected": -1.6779505014419556, + "sft_loss": 1.3510466814041138, "step": 2020 }, { "epoch": 1.0837932764676368, - "grad_norm": 7.658576283536331, - "learning_rate": 2.418230481870058e-06, - "logits/chosen": -0.300046443939209, - "logits/rejected": -0.17431317269802094, - "logps/chosen": -1.3725075721740723, - "logps/rejected": -1.9859756231307983, - "loss": 0.9873, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3725075721740723, - "rewards/margins": 0.6134681701660156, - "rewards/rejected": -1.9859756231307983, - "sft_loss": 1.4489480257034302, + "grad_norm": 5.666752186518074, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": -0.019246716052293777, + "logits/rejected": 0.12416522204875946, + "logps/chosen": -1.3521109819412231, + "logps/rejected": -1.7478277683258057, + "loss": 1.0479, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3521109819412231, + "rewards/margins": 0.39571696519851685, + "rewards/rejected": -1.7478277683258057, + "sft_loss": 1.402245283126831, "step": 2025 }, { "epoch": 1.0864693092490383, - "grad_norm": 5.871835625416842, - "learning_rate": 2.41453153720767e-06, - "logits/chosen": -0.3782210946083069, - "logits/rejected": -0.36119428277015686, - "logps/chosen": -1.2919793128967285, - "logps/rejected": -1.691510796546936, - "loss": 1.0062, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2919793128967285, - "rewards/margins": 0.39953145384788513, - "rewards/rejected": -1.691510796546936, - "sft_loss": 1.3468921184539795, + "grad_norm": 6.035031708124639, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": -0.08087825030088425, + "logits/rejected": -0.054591964930295944, + "logps/chosen": -1.294004201889038, + "logps/rejected": -1.542240858078003, + "loss": 1.0639, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.294004201889038, + "rewards/margins": 0.24823662638664246, + "rewards/rejected": -1.542240858078003, + "sft_loss": 1.3424246311187744, "step": 2030 }, { "epoch": 1.0891453420304398, - "grad_norm": 6.901454702299402, - "learning_rate": 2.4108237197562963e-06, - "logits/chosen": -0.4194413721561432, - "logits/rejected": -0.2624618411064148, - "logps/chosen": -1.327225685119629, - "logps/rejected": -1.884225606918335, - "loss": 0.9807, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.327225685119629, - "rewards/margins": 0.5570000410079956, - "rewards/rejected": -1.884225606918335, - "sft_loss": 1.346602201461792, + "grad_norm": 10.063671915106058, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": -0.11445492506027222, + "logits/rejected": 0.06764774024486542, + "logps/chosen": -1.3052616119384766, + "logps/rejected": -1.6242589950561523, + "loss": 1.05, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3052616119384766, + "rewards/margins": 0.31899750232696533, + "rewards/rejected": -1.6242589950561523, + "sft_loss": 1.3409321308135986, "step": 2035 }, { "epoch": 1.0918213748118415, - "grad_norm": 29.751282062885846, - "learning_rate": 2.407107065489199e-06, - "logits/chosen": -0.471548855304718, - "logits/rejected": -0.42120179533958435, - "logps/chosen": -1.386148452758789, - "logps/rejected": -1.854129433631897, - "loss": 1.0606, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.386148452758789, - "rewards/margins": 0.46798110008239746, - "rewards/rejected": -1.854129433631897, - "sft_loss": 1.4089126586914062, + "grad_norm": 30.912182108309786, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": -0.16330935060977936, + "logits/rejected": -0.08893314749002457, + "logps/chosen": -1.3520915508270264, + "logps/rejected": -1.588045597076416, + "loss": 1.1011, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3520915508270264, + "rewards/margins": 0.23595380783081055, + "rewards/rejected": -1.588045597076416, + "sft_loss": 1.399583101272583, "step": 2040 }, { "epoch": 1.094497407593243, - "grad_norm": 6.698173603667034, - "learning_rate": 2.403381610465374e-06, - "logits/chosen": -0.32573026418685913, - "logits/rejected": -0.289516419172287, - "logps/chosen": -1.35390305519104, - "logps/rejected": -1.8425319194793701, - "loss": 0.9591, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.35390305519104, - "rewards/margins": 0.488629013299942, - "rewards/rejected": -1.8425319194793701, - "sft_loss": 1.3057984113693237, + "grad_norm": 8.051710900963842, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": -0.028850510716438293, + "logits/rejected": 0.020016059279441833, + "logps/chosen": -1.3128397464752197, + "logps/rejected": -1.6394424438476562, + "loss": 1.0122, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3128397464752197, + "rewards/margins": 0.3266026973724365, + "rewards/rejected": -1.6394424438476562, + "sft_loss": 1.2767457962036133, "step": 2045 }, { "epoch": 1.0971734403746445, - "grad_norm": 6.012288384070778, - "learning_rate": 2.3996473908292017e-06, - "logits/chosen": -0.4606549143791199, - "logits/rejected": -0.3668895959854126, - "logps/chosen": -1.3278155326843262, - "logps/rejected": -1.7195066213607788, - "loss": 1.0423, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3278155326843262, - "rewards/margins": 0.39169105887413025, - "rewards/rejected": -1.7195066213607788, - "sft_loss": 1.4088014364242554, + "grad_norm": 6.283046847276603, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": -0.1665606051683426, + "logits/rejected": -0.031640782952308655, + "logps/chosen": -1.3250031471252441, + "logps/rejected": -1.5790154933929443, + "loss": 1.0946, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3250031471252441, + "rewards/margins": 0.25401216745376587, + "rewards/rejected": -1.5790154933929443, + "sft_loss": 1.4127658605575562, "step": 2050 }, { "epoch": 1.0998494731560462, - "grad_norm": 7.264707775672911, - "learning_rate": 2.3959044428100985e-06, - "logits/chosen": -0.3544057011604309, - "logits/rejected": -0.25213176012039185, - "logps/chosen": -1.293614149093628, - "logps/rejected": -1.742283821105957, - "loss": 1.0046, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.293614149093628, - "rewards/margins": 0.4486696124076843, - "rewards/rejected": -1.742283821105957, - "sft_loss": 1.35196852684021, + "grad_norm": 7.269065772821926, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": -0.11072566360235214, + "logits/rejected": 0.017708975821733475, + "logps/chosen": -1.2886245250701904, + "logps/rejected": -1.5549981594085693, + "loss": 1.0582, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2886245250701904, + "rewards/margins": 0.26637381315231323, + "rewards/rejected": -1.5549981594085693, + "sft_loss": 1.3413861989974976, "step": 2055 }, { "epoch": 1.1025255059374477, - "grad_norm": 6.904375922387609, - "learning_rate": 2.392152802722162e-06, - "logits/chosen": -0.29229849576950073, - "logits/rejected": -0.25606706738471985, - "logps/chosen": -1.3440862894058228, - "logps/rejected": -1.8628699779510498, - "loss": 1.0064, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3440862894058228, - "rewards/margins": 0.5187836289405823, - "rewards/rejected": -1.8628699779510498, - "sft_loss": 1.4032114744186401, + "grad_norm": 6.24349177180485, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": -0.0787181407213211, + "logits/rejected": -0.022912293672561646, + "logps/chosen": -1.3490071296691895, + "logps/rejected": -1.7265007495880127, + "loss": 1.049, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3490071296691895, + "rewards/margins": 0.37749359011650085, + "rewards/rejected": -1.7265007495880127, + "sft_loss": 1.4076695442199707, "step": 2060 }, { "epoch": 1.1052015387188494, - "grad_norm": 6.313972841462663, - "learning_rate": 2.38839250696382e-06, - "logits/chosen": -0.3327978551387787, - "logits/rejected": -0.22336368262767792, - "logps/chosen": -1.2951395511627197, - "logps/rejected": -1.7230771780014038, - "loss": 0.9956, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -1.2951395511627197, - "rewards/margins": 0.42793768644332886, - "rewards/rejected": -1.7230771780014038, - "sft_loss": 1.300781488418579, + "grad_norm": 6.539411262922854, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": -0.10219565778970718, + "logits/rejected": 0.02910270355641842, + "logps/chosen": -1.3191393613815308, + "logps/rejected": -1.5854027271270752, + "loss": 1.0652, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3191393613815308, + "rewards/margins": 0.26626336574554443, + "rewards/rejected": -1.5854027271270752, + "sft_loss": 1.3123081922531128, "step": 2065 }, { "epoch": 1.107877571500251, - "grad_norm": 6.052818712794828, - "learning_rate": 2.3846235920174794e-06, - "logits/chosen": -0.3500472903251648, - "logits/rejected": -0.21844089031219482, - "logps/chosen": -1.2430912256240845, - "logps/rejected": -1.7730716466903687, - "loss": 0.934, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2430912256240845, - "rewards/margins": 0.529980480670929, - "rewards/rejected": -1.7730716466903687, - "sft_loss": 1.2939088344573975, + "grad_norm": 6.305327686728363, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": -0.13566602766513824, + "logits/rejected": 0.016496330499649048, + "logps/chosen": -1.2557474374771118, + "logps/rejected": -1.6477458477020264, + "loss": 0.9846, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2557474374771118, + "rewards/margins": 0.3919984698295593, + "rewards/rejected": -1.6477458477020264, + "sft_loss": 1.2934027910232544, "step": 2070 }, { "epoch": 1.1105536042816524, - "grad_norm": 10.532652555200189, - "learning_rate": 2.380846094449169e-06, - "logits/chosen": -0.3928828835487366, - "logits/rejected": -0.3052484393119812, - "logps/chosen": -1.3178021907806396, - "logps/rejected": -1.8242807388305664, - "loss": 0.9956, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3178021907806396, - "rewards/margins": 0.5064784288406372, - "rewards/rejected": -1.8242807388305664, - "sft_loss": 1.3954349756240845, + "grad_norm": 13.694157376075486, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": -0.1433573067188263, + "logits/rejected": -0.038366906344890594, + "logps/chosen": -1.3339672088623047, + "logps/rejected": -1.6908581256866455, + "loss": 1.0569, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3339672088623047, + "rewards/margins": 0.3568907380104065, + "rewards/rejected": -1.6908581256866455, + "sft_loss": 1.4096620082855225, "step": 2075 }, { "epoch": 1.1132296370630541, - "grad_norm": 5.505296514008136, - "learning_rate": 2.3770600509081872e-06, - "logits/chosen": -0.448671817779541, - "logits/rejected": -0.29919299483299255, - "logps/chosen": -1.252861738204956, - "logps/rejected": -1.7711899280548096, - "loss": 0.9439, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.252861738204956, - "rewards/margins": 0.5183283090591431, - "rewards/rejected": -1.7711899280548096, - "sft_loss": 1.30695378780365, + "grad_norm": 5.398515345858442, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": -0.19307354092597961, + "logits/rejected": -0.03149569779634476, + "logps/chosen": -1.2381755113601685, + "logps/rejected": -1.6369307041168213, + "loss": 0.9736, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2381755113601685, + "rewards/margins": 0.3987550735473633, + "rewards/rejected": -1.6369307041168213, + "sft_loss": 1.2880828380584717, "step": 2080 }, { "epoch": 1.1159056698444556, - "grad_norm": 9.690465905387198, - "learning_rate": 2.373265498126745e-06, - "logits/chosen": -0.4107128083705902, - "logits/rejected": -0.31409990787506104, - "logps/chosen": -1.3148083686828613, - "logps/rejected": -1.8912432193756104, - "loss": 0.974, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3148083686828613, - "rewards/margins": 0.5764346122741699, - "rewards/rejected": -1.8912432193756104, - "sft_loss": 1.3471014499664307, + "grad_norm": 12.257195395456149, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": -0.1564318835735321, + "logits/rejected": -0.039251018315553665, + "logps/chosen": -1.3086373805999756, + "logps/rejected": -1.677406668663025, + "loss": 1.0341, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3086373805999756, + "rewards/margins": 0.3687690198421478, + "rewards/rejected": -1.677406668663025, + "sft_loss": 1.3267216682434082, "step": 2085 }, { "epoch": 1.118581702625857, - "grad_norm": 8.136623207062934, - "learning_rate": 2.36946247291961e-06, - "logits/chosen": -0.4802681803703308, - "logits/rejected": -0.4794479012489319, - "logps/chosen": -1.3315585851669312, - "logps/rejected": -1.769972801208496, - "loss": 1.0317, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3315585851669312, - "rewards/margins": 0.4384143352508545, - "rewards/rejected": -1.769972801208496, - "sft_loss": 1.4288564920425415, + "grad_norm": 6.909176677099874, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": -0.22223138809204102, + "logits/rejected": -0.2099127471446991, + "logps/chosen": -1.2886847257614136, + "logps/rejected": -1.5642931461334229, + "loss": 1.0756, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2886847257614136, + "rewards/margins": 0.2756083607673645, + "rewards/rejected": -1.5642931461334229, + "sft_loss": 1.3992526531219482, "step": 2090 }, { "epoch": 1.1212577354072588, - "grad_norm": 6.752891233419036, - "learning_rate": 2.3656510121837492e-06, - "logits/chosen": -0.4110310971736908, - "logits/rejected": -0.27823466062545776, - "logps/chosen": -1.4545396566390991, - "logps/rejected": -1.8475834131240845, - "loss": 1.0645, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.4545396566390991, - "rewards/margins": 0.3930436670780182, - "rewards/rejected": -1.8475834131240845, - "sft_loss": 1.485935091972351, + "grad_norm": 7.316699419141525, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": -0.15762130916118622, + "logits/rejected": -0.011286157183349133, + "logps/chosen": -1.4465506076812744, + "logps/rejected": -1.7105118036270142, + "loss": 1.1048, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4465506076812744, + "rewards/margins": 0.26396113634109497, + "rewards/rejected": -1.7105118036270142, + "sft_loss": 1.474279761314392, "step": 2095 }, { "epoch": 1.1239337681886603, - "grad_norm": 8.288303344067318, - "learning_rate": 2.3618311528979717e-06, - "logits/chosen": -0.2994609475135803, - "logits/rejected": -0.2679150700569153, - "logps/chosen": -1.418294906616211, - "logps/rejected": -1.8473249673843384, - "loss": 1.0135, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.418294906616211, - "rewards/margins": 0.4290298521518707, - "rewards/rejected": -1.8473249673843384, - "sft_loss": 1.4293949604034424, + "grad_norm": 7.1151059070629, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": -0.04906468465924263, + "logits/rejected": 0.007820269092917442, + "logps/chosen": -1.450408935546875, + "logps/rejected": -1.6943660974502563, + "loss": 1.0934, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.450408935546875, + "rewards/margins": 0.24395708739757538, + "rewards/rejected": -1.6943660974502563, + "sft_loss": 1.4425398111343384, "step": 2100 }, { "epoch": 1.1266098009700618, - "grad_norm": 8.36414752887087, - "learning_rate": 2.3580029321225692e-06, - "logits/chosen": -0.30061233043670654, - "logits/rejected": -0.19322152435779572, - "logps/chosen": -1.3510148525238037, - "logps/rejected": -1.9490478038787842, - "loss": 0.9667, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3510148525238037, - "rewards/margins": 0.5980329513549805, - "rewards/rejected": -1.9490478038787842, - "sft_loss": 1.3427172899246216, + "grad_norm": 7.058544700250131, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": -0.027908477932214737, + "logits/rejected": 0.09882084280252457, + "logps/chosen": -1.3739533424377441, + "logps/rejected": -1.7385390996932983, + "loss": 1.0386, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3739533424377441, + "rewards/margins": 0.3645857274532318, + "rewards/rejected": -1.7385390996932983, + "sft_loss": 1.357513189315796, "step": 2105 }, { "epoch": 1.1292858337514635, - "grad_norm": 5.440835529900344, - "learning_rate": 2.354166386998956e-06, - "logits/chosen": -0.3956003189086914, - "logits/rejected": -0.24811363220214844, - "logps/chosen": -1.2949475049972534, - "logps/rejected": -2.068253993988037, - "loss": 0.9611, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2949475049972534, - "rewards/margins": 0.7733063101768494, - "rewards/rejected": -2.068253993988037, - "sft_loss": 1.3592411279678345, + "grad_norm": 7.232777343396231, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": -0.15299741923809052, + "logits/rejected": 0.01500866748392582, + "logps/chosen": -1.3116304874420166, + "logps/rejected": -1.8638451099395752, + "loss": 1.0158, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3116304874420166, + "rewards/margins": 0.552214503288269, + "rewards/rejected": -1.8638451099395752, + "sft_loss": 1.368384599685669, "step": 2110 }, { "epoch": 1.131961866532865, - "grad_norm": 7.4646506111629565, - "learning_rate": 2.3503215547493097e-06, - "logits/chosen": -0.24873106181621552, - "logits/rejected": -0.20497290790081024, - "logps/chosen": -1.326761245727539, - "logps/rejected": -1.8516361713409424, - "loss": 1.0211, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.326761245727539, - "rewards/margins": 0.524874746799469, - "rewards/rejected": -1.8516361713409424, - "sft_loss": 1.3926951885223389, + "grad_norm": 8.938851736706948, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": -0.006882402114570141, + "logits/rejected": 0.05765901878476143, + "logps/chosen": -1.3367273807525635, + "logps/rejected": -1.6397292613983154, + "loss": 1.0679, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3367273807525635, + "rewards/margins": 0.3030018210411072, + "rewards/rejected": -1.6397292613983154, + "sft_loss": 1.3929353952407837, "step": 2115 }, { "epoch": 1.1346378993142665, - "grad_norm": 6.778428238441266, - "learning_rate": 2.3464684726762104e-06, - "logits/chosen": -0.370392769575119, - "logits/rejected": -0.3410020172595978, - "logps/chosen": -1.3269846439361572, - "logps/rejected": -1.7383434772491455, - "loss": 1.0277, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3269846439361572, - "rewards/margins": 0.41135889291763306, - "rewards/rejected": -1.7383434772491455, - "sft_loss": 1.4061932563781738, + "grad_norm": 6.702017921722495, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": -0.12569338083267212, + "logits/rejected": -0.0814434066414833, + "logps/chosen": -1.3406602144241333, + "logps/rejected": -1.5882494449615479, + "loss": 1.0758, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3406602144241333, + "rewards/margins": 0.24758926033973694, + "rewards/rejected": -1.5882494449615479, + "sft_loss": 1.417973518371582, "step": 2120 }, { "epoch": 1.1373139320956682, - "grad_norm": 5.1636028770142826, - "learning_rate": 2.342607178162276e-06, - "logits/chosen": -0.2839960753917694, - "logits/rejected": -0.22470524907112122, - "logps/chosen": -1.260023832321167, - "logps/rejected": -1.901210069656372, - "loss": 0.9193, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.260023832321167, - "rewards/margins": 0.6411863565444946, - "rewards/rejected": -1.901210069656372, - "sft_loss": 1.2899072170257568, + "grad_norm": 4.990351697136921, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": -0.08439499139785767, + "logits/rejected": -0.01835174486041069, + "logps/chosen": -1.2689683437347412, + "logps/rejected": -1.693606972694397, + "loss": 0.9787, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2689683437347412, + "rewards/margins": 0.4246388375759125, + "rewards/rejected": -1.693606972694397, + "sft_loss": 1.3011971712112427, "step": 2125 }, { "epoch": 1.1399899648770697, - "grad_norm": 14.15197203351975, - "learning_rate": 2.338737708669804e-06, - "logits/chosen": -0.2899111211299896, - "logits/rejected": -0.061303604394197464, - "logps/chosen": -1.3471908569335938, - "logps/rejected": -1.8845421075820923, - "loss": 1.0016, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3471908569335938, - "rewards/margins": 0.5373513698577881, - "rewards/rejected": -1.8845421075820923, - "sft_loss": 1.4021618366241455, + "grad_norm": 8.595033598654478, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": -0.04252857714891434, + "logits/rejected": 0.2256098985671997, + "logps/chosen": -1.352736473083496, + "logps/rejected": -1.7007734775543213, + "loss": 1.0446, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.352736473083496, + "rewards/margins": 0.34803682565689087, + "rewards/rejected": -1.7007734775543213, + "sft_loss": 1.407582402229309, "step": 2130 }, { "epoch": 1.1426659976584714, - "grad_norm": 7.043226372929394, - "learning_rate": 2.334860101740404e-06, - "logits/chosen": -0.3275575637817383, - "logits/rejected": -0.17974238097667694, - "logps/chosen": -1.3362411260604858, - "logps/rejected": -1.9289495944976807, - "loss": 0.9747, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3362411260604858, - "rewards/margins": 0.5927082896232605, - "rewards/rejected": -1.9289495944976807, - "sft_loss": 1.362756609916687, + "grad_norm": 5.344545744263389, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": -0.030758550390601158, + "logits/rejected": 0.15121528506278992, + "logps/chosen": -1.3427588939666748, + "logps/rejected": -1.7324472665786743, + "loss": 1.037, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3427588939666748, + "rewards/margins": 0.3896884322166443, + "rewards/rejected": -1.7324472665786743, + "sft_loss": 1.370418906211853, "step": 2135 }, { "epoch": 1.145342030439873, - "grad_norm": 15.077464220247649, - "learning_rate": 2.330974394994635e-06, - "logits/chosen": -0.371978759765625, - "logits/rejected": -0.24568262696266174, - "logps/chosen": -1.3593480587005615, - "logps/rejected": -1.8906453847885132, - "loss": 0.999, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3593480587005615, - "rewards/margins": 0.5312973856925964, - "rewards/rejected": -1.8906453847885132, - "sft_loss": 1.373910665512085, + "grad_norm": 9.515661126009993, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": -0.0861203521490097, + "logits/rejected": 0.0647224485874176, + "logps/chosen": -1.3469445705413818, + "logps/rejected": -1.6751673221588135, + "loss": 1.0474, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3469445705413818, + "rewards/margins": 0.32822278141975403, + "rewards/rejected": -1.6751673221588135, + "sft_loss": 1.3514187335968018, "step": 2140 }, { "epoch": 1.1480180632212744, - "grad_norm": 8.061203841557813, - "learning_rate": 2.327080626131641e-06, - "logits/chosen": -0.34131118655204773, - "logits/rejected": -0.2771221995353699, - "logps/chosen": -1.2546392679214478, - "logps/rejected": -1.965981125831604, - "loss": 0.9339, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2546392679214478, - "rewards/margins": 0.7113418579101562, - "rewards/rejected": -1.965981125831604, - "sft_loss": 1.3372882604599, + "grad_norm": 9.819615223092732, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": -0.04387445002794266, + "logits/rejected": 0.052936069667339325, + "logps/chosen": -1.2406890392303467, + "logps/rejected": -1.6722383499145508, + "loss": 1.003, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2406890392303467, + "rewards/margins": 0.4315493106842041, + "rewards/rejected": -1.6722383499145508, + "sft_loss": 1.3141372203826904, "step": 2145 }, { "epoch": 1.1506940960026761, - "grad_norm": 6.198725779486274, - "learning_rate": 2.3231788329287855e-06, - "logits/chosen": -0.3761574625968933, - "logits/rejected": -0.33477336168289185, - "logps/chosen": -1.4132637977600098, - "logps/rejected": -1.9284454584121704, - "loss": 1.0295, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.4132637977600098, - "rewards/margins": 0.5151815414428711, - "rewards/rejected": -1.9284454584121704, - "sft_loss": 1.4557913541793823, + "grad_norm": 6.81998773618303, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": -0.10533297061920166, + "logits/rejected": -0.043056420981884, + "logps/chosen": -1.3972469568252563, + "logps/rejected": -1.7338556051254272, + "loss": 1.0677, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3972469568252563, + "rewards/margins": 0.3366088569164276, + "rewards/rejected": -1.7338556051254272, + "sft_loss": 1.4280717372894287, "step": 2150 }, { "epoch": 1.1533701287840776, - "grad_norm": 8.643689993037764, - "learning_rate": 2.3192690532412827e-06, - "logits/chosen": -0.3037932217121124, - "logits/rejected": -0.24522796273231506, - "logps/chosen": -1.3926770687103271, - "logps/rejected": -1.7917263507843018, - "loss": 1.0373, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3926770687103271, - "rewards/margins": 0.3990491032600403, - "rewards/rejected": -1.7917263507843018, - "sft_loss": 1.4622339010238647, + "grad_norm": 6.608290407818981, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": -0.007891124114394188, + "logits/rejected": 0.06537625938653946, + "logps/chosen": -1.4082162380218506, + "logps/rejected": -1.709684133529663, + "loss": 1.083, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4082162380218506, + "rewards/margins": 0.30146756768226624, + "rewards/rejected": -1.709684133529663, + "sft_loss": 1.4514955282211304, "step": 2155 }, { "epoch": 1.1560461615654791, - "grad_norm": 8.998457327542715, - "learning_rate": 2.315351325001832e-06, - "logits/chosen": -0.3872067928314209, - "logits/rejected": -0.2904983162879944, - "logps/chosen": -1.3250735998153687, - "logps/rejected": -1.9157615900039673, - "loss": 0.9745, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3250735998153687, - "rewards/margins": 0.5906879305839539, - "rewards/rejected": -1.9157615900039673, - "sft_loss": 1.3803297281265259, + "grad_norm": 7.508397978132879, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": -0.11488986015319824, + "logits/rejected": 0.0039465115405619144, + "logps/chosen": -1.3215010166168213, + "logps/rejected": -1.7341926097869873, + "loss": 1.0216, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3215010166168213, + "rewards/margins": 0.4126916825771332, + "rewards/rejected": -1.7341926097869873, + "sft_loss": 1.3616695404052734, "step": 2160 }, { "epoch": 1.1587221943468808, - "grad_norm": 6.358258923592004, - "learning_rate": 2.3114256862202495e-06, - "logits/chosen": -0.3770531713962555, - "logits/rejected": -0.21621887385845184, - "logps/chosen": -1.3132208585739136, - "logps/rejected": -1.9442708492279053, - "loss": 0.9482, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3132208585739136, - "rewards/margins": 0.6310499906539917, - "rewards/rejected": -1.9442708492279053, - "sft_loss": 1.3465019464492798, + "grad_norm": 7.207242047125425, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": -0.1129886656999588, + "logits/rejected": 0.06885222345590591, + "logps/chosen": -1.346785545349121, + "logps/rejected": -1.8081716299057007, + "loss": 1.0042, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.346785545349121, + "rewards/margins": 0.4613862633705139, + "rewards/rejected": -1.8081716299057007, + "sft_loss": 1.3442741632461548, "step": 2165 }, { "epoch": 1.1613982271282823, - "grad_norm": 4.281762607044848, - "learning_rate": 2.3074921749831013e-06, - "logits/chosen": -0.33440592885017395, - "logits/rejected": -0.17447985708713531, - "logps/chosen": -1.3043618202209473, - "logps/rejected": -1.869027853012085, - "loss": 0.97, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3043618202209473, - "rewards/margins": 0.5646663904190063, - "rewards/rejected": -1.869027853012085, - "sft_loss": 1.3304054737091064, + "grad_norm": 14.002661900284268, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": -0.08279160410165787, + "logits/rejected": 0.1021057590842247, + "logps/chosen": -1.321763515472412, + "logps/rejected": -1.7964341640472412, + "loss": 1.0098, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.321763515472412, + "rewards/margins": 0.47467073798179626, + "rewards/rejected": -1.7964341640472412, + "sft_loss": 1.3471057415008545, "step": 2170 }, { "epoch": 1.1640742599096838, - "grad_norm": 5.573774423455921, - "learning_rate": 2.30355082945333e-06, - "logits/chosen": -0.3903493881225586, - "logits/rejected": -0.23366177082061768, - "logps/chosen": -1.3278374671936035, - "logps/rejected": -1.7130565643310547, - "loss": 1.0106, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3278374671936035, - "rewards/margins": 0.3852190375328064, - "rewards/rejected": -1.7130565643310547, - "sft_loss": 1.3740935325622559, + "grad_norm": 7.03794605195869, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": -0.11434582620859146, + "logits/rejected": 0.06666027009487152, + "logps/chosen": -1.3573967218399048, + "logps/rejected": -1.6420046091079712, + "loss": 1.0553, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3573967218399048, + "rewards/margins": 0.2846079468727112, + "rewards/rejected": -1.6420046091079712, + "sft_loss": 1.386859655380249, "step": 2175 }, { "epoch": 1.1667502926910855, - "grad_norm": 5.561648952860178, - "learning_rate": 2.2996016878698866e-06, - "logits/chosen": -0.41646987199783325, - "logits/rejected": -0.36212459206581116, - "logps/chosen": -1.2699323892593384, - "logps/rejected": -1.782680869102478, - "loss": 0.978, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2699323892593384, - "rewards/margins": 0.5127487182617188, - "rewards/rejected": -1.782680869102478, - "sft_loss": 1.3473727703094482, + "grad_norm": 5.144226817708508, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": -0.10920850187540054, + "logits/rejected": -0.023228798061609268, + "logps/chosen": -1.3085116147994995, + "logps/rejected": -1.6747932434082031, + "loss": 1.0385, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3085116147994995, + "rewards/margins": 0.3662816882133484, + "rewards/rejected": -1.6747932434082031, + "sft_loss": 1.3602230548858643, "step": 2180 }, { "epoch": 1.169426325472487, - "grad_norm": 7.516882134975139, - "learning_rate": 2.2956447885473607e-06, - "logits/chosen": -0.32953667640686035, - "logits/rejected": -0.20225711166858673, - "logps/chosen": -1.3292700052261353, - "logps/rejected": -1.7645435333251953, - "loss": 0.9786, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3292700052261353, - "rewards/margins": 0.4352734088897705, - "rewards/rejected": -1.7645435333251953, - "sft_loss": 1.3366014957427979, + "grad_norm": 8.1907000790852, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": -0.01839374378323555, + "logits/rejected": 0.13123062252998352, + "logps/chosen": -1.340759515762329, + "logps/rejected": -1.6185781955718994, + "loss": 1.0455, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.340759515762329, + "rewards/margins": 0.277818500995636, + "rewards/rejected": -1.6185781955718994, + "sft_loss": 1.3402440547943115, "step": 2185 }, { "epoch": 1.1721023582538885, - "grad_norm": 5.618775063863522, - "learning_rate": 2.2916801698756063e-06, - "logits/chosen": -0.2886897027492523, - "logits/rejected": -0.2478192150592804, - "logps/chosen": -1.3457328081130981, - "logps/rejected": -1.8688604831695557, - "loss": 1.0074, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3457328081130981, - "rewards/margins": 0.5231277346611023, - "rewards/rejected": -1.8688604831695557, - "sft_loss": 1.4351282119750977, + "grad_norm": 6.992143239090281, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": 0.09558597952127457, + "logits/rejected": 0.15679627656936646, + "logps/chosen": -1.324134111404419, + "logps/rejected": -1.6583322286605835, + "loss": 1.0549, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.324134111404419, + "rewards/margins": 0.3341982960700989, + "rewards/rejected": -1.6583322286605835, + "sft_loss": 1.4005115032196045, "step": 2190 }, { "epoch": 1.1747783910352902, - "grad_norm": 7.458063698457432, - "learning_rate": 2.287707870319372e-06, - "logits/chosen": -0.41209912300109863, - "logits/rejected": -0.32010191679000854, - "logps/chosen": -1.3309061527252197, - "logps/rejected": -1.969655990600586, - "loss": 0.98, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3309061527252197, - "rewards/margins": 0.6387497782707214, - "rewards/rejected": -1.969655990600586, - "sft_loss": 1.379540205001831, + "grad_norm": 10.815572949506729, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": 0.027740132063627243, + "logits/rejected": 0.13389061391353607, + "logps/chosen": -1.307284951210022, + "logps/rejected": -1.735878348350525, + "loss": 1.0106, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.307284951210022, + "rewards/margins": 0.428593248128891, + "rewards/rejected": -1.735878348350525, + "sft_loss": 1.3475149869918823, "step": 2195 }, { "epoch": 1.1774544238166917, - "grad_norm": 7.046611503518798, - "learning_rate": 2.283727928417925e-06, - "logits/chosen": -0.48315876722335815, - "logits/rejected": -0.49728766083717346, - "logps/chosen": -1.3308050632476807, - "logps/rejected": -1.9045906066894531, - "loss": 0.9829, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3308050632476807, - "rewards/margins": 0.573785662651062, - "rewards/rejected": -1.9045906066894531, - "sft_loss": 1.400823950767517, + "grad_norm": 6.395374339569294, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": -0.07172278314828873, + "logits/rejected": -0.0779951885342598, + "logps/chosen": -1.342049241065979, + "logps/rejected": -1.6955223083496094, + "loss": 1.0557, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.342049241065979, + "rewards/margins": 0.3534731864929199, + "rewards/rejected": -1.6955223083496094, + "sft_loss": 1.4067165851593018, "step": 2200 }, { "epoch": 1.1801304565980932, - "grad_norm": 6.089821566872165, - "learning_rate": 2.27974038278468e-06, - "logits/chosen": -0.5165932774543762, - "logits/rejected": -0.3608459532260895, - "logps/chosen": -1.2554948329925537, - "logps/rejected": -1.802454948425293, - "loss": 0.9347, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2554948329925537, - "rewards/margins": 0.5469598770141602, - "rewards/rejected": -1.802454948425293, - "sft_loss": 1.268705129623413, + "grad_norm": 4.956191208540565, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": -0.12661337852478027, + "logits/rejected": 0.07305797189474106, + "logps/chosen": -1.253143548965454, + "logps/rejected": -1.6444408893585205, + "loss": 0.9738, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.253143548965454, + "rewards/margins": 0.39129742980003357, + "rewards/rejected": -1.6444408893585205, + "sft_loss": 1.2474725246429443, "step": 2205 }, { "epoch": 1.182806489379495, - "grad_norm": 7.429965290613201, - "learning_rate": 2.2757452721068206e-06, - "logits/chosen": -0.5314079523086548, - "logits/rejected": -0.44425535202026367, - "logps/chosen": -1.1852877140045166, - "logps/rejected": -1.7990539073944092, - "loss": 0.9296, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1852877140045166, - "rewards/margins": 0.6137663125991821, - "rewards/rejected": -1.7990539073944092, - "sft_loss": 1.2573305368423462, + "grad_norm": 7.880389938787795, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": -0.16296562552452087, + "logits/rejected": -0.0367860347032547, + "logps/chosen": -1.2162091732025146, + "logps/rejected": -1.6427171230316162, + "loss": 0.9846, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2162091732025146, + "rewards/margins": 0.42650800943374634, + "rewards/rejected": -1.6427171230316162, + "sft_loss": 1.2575920820236206, "step": 2210 }, { "epoch": 1.1854825221608964, - "grad_norm": 8.428947513370273, - "learning_rate": 2.2717426351449294e-06, - "logits/chosen": -0.5099958181381226, - "logits/rejected": -0.453339159488678, - "logps/chosen": -1.4342055320739746, - "logps/rejected": -2.0443735122680664, - "loss": 0.9893, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4342055320739746, - "rewards/margins": 0.6101681590080261, - "rewards/rejected": -2.0443735122680664, - "sft_loss": 1.3941960334777832, + "grad_norm": 9.198438837473466, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": -0.14288155734539032, + "logits/rejected": -0.05155428498983383, + "logps/chosen": -1.4426788091659546, + "logps/rejected": -1.8797391653060913, + "loss": 1.0364, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4426788091659546, + "rewards/margins": 0.4370604455471039, + "rewards/rejected": -1.8797391653060913, + "sft_loss": 1.391655683517456, "step": 2215 }, { "epoch": 1.188158554942298, - "grad_norm": 8.649507351765893, - "learning_rate": 2.2677325107326067e-06, - "logits/chosen": -0.5668259859085083, - "logits/rejected": -0.46763554215431213, - "logps/chosen": -1.2631756067276, - "logps/rejected": -1.745678186416626, - "loss": 0.9944, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2631756067276, - "rewards/margins": 0.4825025200843811, - "rewards/rejected": -1.745678186416626, - "sft_loss": 1.325333833694458, + "grad_norm": 7.850915911352129, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": -0.18914887309074402, + "logits/rejected": -0.05741081386804581, + "logps/chosen": -1.2726414203643799, + "logps/rejected": -1.6055552959442139, + "loss": 1.0441, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2726414203643799, + "rewards/margins": 0.33291396498680115, + "rewards/rejected": -1.6055552959442139, + "sft_loss": 1.3326237201690674, "step": 2220 }, { "epoch": 1.1908345877236997, - "grad_norm": 6.626725196731764, - "learning_rate": 2.2637149377760985e-06, - "logits/chosen": -0.5213819742202759, - "logits/rejected": -0.3564545512199402, - "logps/chosen": -1.239008903503418, - "logps/rejected": -1.8543846607208252, - "loss": 0.9431, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.239008903503418, - "rewards/margins": 0.6153759360313416, - "rewards/rejected": -1.8543846607208252, - "sft_loss": 1.3210550546646118, + "grad_norm": 7.153901114036553, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": -0.17279231548309326, + "logits/rejected": 0.04552667587995529, + "logps/chosen": -1.2659995555877686, + "logps/rejected": -1.68410325050354, + "loss": 1.0001, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2659995555877686, + "rewards/margins": 0.41810378432273865, + "rewards/rejected": -1.68410325050354, + "sft_loss": 1.323227047920227, "step": 2225 }, { "epoch": 1.1935106205051011, - "grad_norm": 10.636372148532034, - "learning_rate": 2.2596899552539136e-06, - "logits/chosen": -0.5316981077194214, - "logits/rejected": -0.4064369201660156, - "logps/chosen": -1.340803861618042, - "logps/rejected": -2.076864004135132, - "loss": 0.9649, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.340803861618042, - "rewards/margins": 0.7360602021217346, - "rewards/rejected": -2.076864004135132, - "sft_loss": 1.355507731437683, + "grad_norm": 10.417386352993438, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": -0.1916198432445526, + "logits/rejected": -0.04756034165620804, + "logps/chosen": -1.3450605869293213, + "logps/rejected": -1.8921089172363281, + "loss": 1.0104, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3450605869293213, + "rewards/margins": 0.5470482110977173, + "rewards/rejected": -1.8921089172363281, + "sft_loss": 1.349582552909851, "step": 2230 }, { "epoch": 1.1961866532865026, - "grad_norm": 6.704793830964508, - "learning_rate": 2.2556576022164516e-06, - "logits/chosen": -0.4898918569087982, - "logits/rejected": -0.3309337794780731, - "logps/chosen": -1.28102445602417, - "logps/rejected": -1.877788782119751, - "loss": 0.9655, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.28102445602417, - "rewards/margins": 0.5967644453048706, - "rewards/rejected": -1.877788782119751, - "sft_loss": 1.312239408493042, + "grad_norm": 6.542569036259526, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": -0.16324032843112946, + "logits/rejected": 0.029138848185539246, + "logps/chosen": -1.2792186737060547, + "logps/rejected": -1.7051481008529663, + "loss": 1.0097, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2792186737060547, + "rewards/margins": 0.42592939734458923, + "rewards/rejected": -1.7051481008529663, + "sft_loss": 1.2989497184753418, "step": 2235 }, { "epoch": 1.1988626860679044, - "grad_norm": 6.073604347367257, - "learning_rate": 2.2516179177856182e-06, - "logits/chosen": -0.48533496260643005, - "logits/rejected": -0.35858696699142456, - "logps/chosen": -1.2944753170013428, - "logps/rejected": -1.8752460479736328, - "loss": 0.9308, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2944753170013428, - "rewards/margins": 0.5807708501815796, - "rewards/rejected": -1.8752460479736328, - "sft_loss": 1.343420386314392, + "grad_norm": 5.6759487246974185, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": -0.13519462943077087, + "logits/rejected": 0.031025957316160202, + "logps/chosen": -1.2971508502960205, + "logps/rejected": -1.6557174921035767, + "loss": 1.0111, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2971508502960205, + "rewards/margins": 0.3585665225982666, + "rewards/rejected": -1.6557174921035767, + "sft_loss": 1.3277194499969482, "step": 2240 }, { "epoch": 1.2015387188493059, - "grad_norm": 6.93473369461339, - "learning_rate": 2.2475709411544503e-06, - "logits/chosen": -0.4326443672180176, - "logits/rejected": -0.39417821168899536, - "logps/chosen": -1.2652297019958496, - "logps/rejected": -1.755052924156189, - "loss": 0.9683, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2652297019958496, - "rewards/margins": 0.48982328176498413, - "rewards/rejected": -1.755052924156189, - "sft_loss": 1.3312979936599731, + "grad_norm": 7.694915042722196, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": -0.10741202533245087, + "logits/rejected": -0.057648736983537674, + "logps/chosen": -1.2721848487854004, + "logps/rejected": -1.5831544399261475, + "loss": 1.031, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2721848487854004, + "rewards/margins": 0.3109695315361023, + "rewards/rejected": -1.5831544399261475, + "sft_loss": 1.3286257982254028, "step": 2245 }, { "epoch": 1.2042147516307076, - "grad_norm": 7.19328412489189, - "learning_rate": 2.2435167115867325e-06, - "logits/chosen": -0.4243658185005188, - "logits/rejected": -0.41740983724594116, - "logps/chosen": -1.27482008934021, - "logps/rejected": -1.8900315761566162, - "loss": 0.9154, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.27482008934021, - "rewards/margins": 0.6152116060256958, - "rewards/rejected": -1.8900315761566162, - "sft_loss": 1.3052546977996826, + "grad_norm": 6.7435684660765185, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": -0.02682996354997158, + "logits/rejected": -0.0047081769444048405, + "logps/chosen": -1.2474782466888428, + "logps/rejected": -1.6963155269622803, + "loss": 0.9555, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2474782466888428, + "rewards/margins": 0.44883736968040466, + "rewards/rejected": -1.6963155269622803, + "sft_loss": 1.2800469398498535, "step": 2250 }, { "epoch": 1.206890784412109, - "grad_norm": 8.271740290521542, - "learning_rate": 2.239455268416618e-06, - "logits/chosen": -0.49116426706314087, - "logits/rejected": -0.404990017414093, - "logps/chosen": -1.3746150732040405, - "logps/rejected": -1.832044005393982, - "loss": 1.0338, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.3746150732040405, - "rewards/margins": 0.45742878317832947, - "rewards/rejected": -1.832044005393982, - "sft_loss": 1.3913629055023193, + "grad_norm": 8.174106215852202, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": -0.11709091812372208, + "logits/rejected": -0.0057418374344706535, + "logps/chosen": -1.3883098363876343, + "logps/rejected": -1.6163625717163086, + "loss": 1.1113, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3883098363876343, + "rewards/margins": 0.22805269062519073, + "rewards/rejected": -1.6163625717163086, + "sft_loss": 1.3883463144302368, "step": 2255 }, { "epoch": 1.2095668171935106, - "grad_norm": 6.574147946513313, - "learning_rate": 2.2353866510482463e-06, - "logits/chosen": -0.43215814232826233, - "logits/rejected": -0.44993463158607483, - "logps/chosen": -1.3542969226837158, - "logps/rejected": -1.8027689456939697, - "loss": 0.9946, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3542969226837158, - "rewards/margins": 0.4484720230102539, - "rewards/rejected": -1.8027689456939697, - "sft_loss": 1.3709371089935303, + "grad_norm": 7.346237599422651, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": -0.06870261579751968, + "logits/rejected": -0.08070772886276245, + "logps/chosen": -1.3321738243103027, + "logps/rejected": -1.5784136056900024, + "loss": 1.0576, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3321738243103027, + "rewards/margins": 0.24623961746692657, + "rewards/rejected": -1.5784136056900024, + "sft_loss": 1.3463549613952637, "step": 2260 }, { "epoch": 1.2122428499749123, - "grad_norm": 5.6040914299629145, - "learning_rate": 2.231310898955361e-06, - "logits/chosen": -0.5037276744842529, - "logits/rejected": -0.4357093870639801, - "logps/chosen": -1.37501060962677, - "logps/rejected": -1.9487228393554688, - "loss": 1.0059, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.37501060962677, - "rewards/margins": 0.5737122893333435, - "rewards/rejected": -1.9487228393554688, - "sft_loss": 1.4598820209503174, + "grad_norm": 10.244188574833444, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": -0.17542986571788788, + "logits/rejected": -0.07479909807443619, + "logps/chosen": -1.381115436553955, + "logps/rejected": -1.6750051975250244, + "loss": 1.076, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.381115436553955, + "rewards/margins": 0.29388970136642456, + "rewards/rejected": -1.6750051975250244, + "sft_loss": 1.4426143169403076, "step": 2265 }, { "epoch": 1.2149188827563138, - "grad_norm": 10.060128681878934, - "learning_rate": 2.2272280516809262e-06, - "logits/chosen": -0.5884903073310852, - "logits/rejected": -0.45150431990623474, - "logps/chosen": -1.3184568881988525, - "logps/rejected": -1.9314219951629639, - "loss": 0.9558, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3184568881988525, - "rewards/margins": 0.6129651069641113, - "rewards/rejected": -1.9314219951629639, - "sft_loss": 1.326110601425171, + "grad_norm": 8.758781027902057, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": -0.2611498534679413, + "logits/rejected": -0.08540613949298859, + "logps/chosen": -1.3195956945419312, + "logps/rejected": -1.7180702686309814, + "loss": 1.0156, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3195956945419312, + "rewards/margins": 0.39847445487976074, + "rewards/rejected": -1.7180702686309814, + "sft_loss": 1.3189489841461182, "step": 2270 }, { "epoch": 1.2175949155377153, - "grad_norm": 9.465708541220504, - "learning_rate": 2.2231381488367447e-06, - "logits/chosen": -0.46365708112716675, - "logits/rejected": -0.3804323077201843, - "logps/chosen": -1.2983791828155518, - "logps/rejected": -1.9503008127212524, - "loss": 0.9392, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2983791828155518, - "rewards/margins": 0.6519216299057007, - "rewards/rejected": -1.9503008127212524, - "sft_loss": 1.317442536354065, + "grad_norm": 7.234792016644986, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": -0.13181889057159424, + "logits/rejected": -0.005660903174430132, + "logps/chosen": -1.2871992588043213, + "logps/rejected": -1.7353289127349854, + "loss": 0.997, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2871992588043213, + "rewards/margins": 0.448129802942276, + "rewards/rejected": -1.7353289127349854, + "sft_loss": 1.313063621520996, "step": 2275 }, { "epoch": 1.220270948319117, - "grad_norm": 7.15563960123028, - "learning_rate": 2.2190412301030717e-06, - "logits/chosen": -0.5409069061279297, - "logits/rejected": -0.41943830251693726, - "logps/chosen": -1.2157642841339111, - "logps/rejected": -1.7413638830184937, - "loss": 0.9514, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2157642841339111, - "rewards/margins": 0.5255998373031616, - "rewards/rejected": -1.7413638830184937, - "sft_loss": 1.2732056379318237, + "grad_norm": 7.931257532736728, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": -0.22026875615119934, + "logits/rejected": -0.05739130824804306, + "logps/chosen": -1.2202383279800415, + "logps/rejected": -1.5603992938995361, + "loss": 1.0085, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2202383279800415, + "rewards/margins": 0.34016093611717224, + "rewards/rejected": -1.5603992938995361, + "sft_loss": 1.2787656784057617, "step": 2280 }, { "epoch": 1.2229469811005185, - "grad_norm": 9.345389829897462, - "learning_rate": 2.2149373352282307e-06, - "logits/chosen": -0.49041399359703064, - "logits/rejected": -0.34547704458236694, - "logps/chosen": -1.3812358379364014, - "logps/rejected": -2.0083115100860596, - "loss": 0.9721, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3812358379364014, - "rewards/margins": 0.6270755529403687, - "rewards/rejected": -2.0083115100860596, - "sft_loss": 1.380582571029663, + "grad_norm": 6.290230551872891, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": -0.1508043259382248, + "logits/rejected": 0.047995325177907944, + "logps/chosen": -1.366620659828186, + "logps/rejected": -1.7611169815063477, + "loss": 1.029, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.366620659828186, + "rewards/margins": 0.39449644088745117, + "rewards/rejected": -1.7611169815063477, + "sft_loss": 1.3677552938461304, "step": 2285 }, { "epoch": 1.22562301388192, - "grad_norm": 4.908793417677732, - "learning_rate": 2.2108265040282275e-06, - "logits/chosen": -0.6118310689926147, - "logits/rejected": -0.5083206295967102, - "logps/chosen": -1.2254685163497925, - "logps/rejected": -1.7813608646392822, - "loss": 0.9601, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2254685163497925, - "rewards/margins": 0.555892288684845, - "rewards/rejected": -1.7813608646392822, - "sft_loss": 1.2873799800872803, + "grad_norm": 5.940764257248773, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": -0.21726492047309875, + "logits/rejected": -0.052529554814100266, + "logps/chosen": -1.2305254936218262, + "logps/rejected": -1.570896029472351, + "loss": 1.0202, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2305254936218262, + "rewards/margins": 0.3403705954551697, + "rewards/rejected": -1.570896029472351, + "sft_loss": 1.2762455940246582, "step": 2290 }, { "epoch": 1.2282990466633217, - "grad_norm": 12.854062830258666, - "learning_rate": 2.2067087763863644e-06, - "logits/chosen": -0.5858707427978516, - "logits/rejected": -0.5209950804710388, - "logps/chosen": -1.3518887758255005, - "logps/rejected": -1.9931846857070923, - "loss": 1.0183, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3518887758255005, - "rewards/margins": 0.6412959098815918, - "rewards/rejected": -1.9931846857070923, - "sft_loss": 1.4554845094680786, + "grad_norm": 5.630415390245314, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": -0.17441818118095398, + "logits/rejected": -0.08711175620555878, + "logps/chosen": -1.306196928024292, + "logps/rejected": -1.6996771097183228, + "loss": 1.0499, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.306196928024292, + "rewards/margins": 0.393480122089386, + "rewards/rejected": -1.6996771097183228, + "sft_loss": 1.4036033153533936, "step": 2295 }, { "epoch": 1.2309750794447232, - "grad_norm": 11.230806427426023, - "learning_rate": 2.202584192252854e-06, - "logits/chosen": -0.5063179731369019, - "logits/rejected": -0.41649264097213745, - "logps/chosen": -1.3351062536239624, - "logps/rejected": -1.8526620864868164, - "loss": 1.0234, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3351062536239624, - "rewards/margins": 0.517555832862854, - "rewards/rejected": -1.8526620864868164, - "sft_loss": 1.392976999282837, + "grad_norm": 11.230895712558812, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": -0.09944915771484375, + "logits/rejected": 0.03277222439646721, + "logps/chosen": -1.3344061374664307, + "logps/rejected": -1.6166597604751587, + "loss": 1.0808, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3344061374664307, + "rewards/margins": 0.2822534739971161, + "rewards/rejected": -1.6166597604751587, + "sft_loss": 1.3840522766113281, "step": 2300 }, { "epoch": 1.233651112226125, - "grad_norm": 8.630130641238273, - "learning_rate": 2.1984527916444283e-06, - "logits/chosen": -0.5501508712768555, - "logits/rejected": -0.44187331199645996, - "logps/chosen": -1.4538100957870483, - "logps/rejected": -2.0695865154266357, - "loss": 1.0058, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.4538100957870483, - "rewards/margins": 0.6157761812210083, - "rewards/rejected": -2.0695865154266357, - "sft_loss": 1.4128062725067139, + "grad_norm": 7.693466549251898, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": -0.14864280819892883, + "logits/rejected": 0.0014330834383144975, + "logps/chosen": -1.4466898441314697, + "logps/rejected": -1.8557466268539429, + "loss": 1.0487, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4466898441314697, + "rewards/margins": 0.40905675292015076, + "rewards/rejected": -1.8557466268539429, + "sft_loss": 1.406456708908081, "step": 2305 }, { "epoch": 1.2363271450075264, - "grad_norm": 8.889623842267012, - "learning_rate": 2.1943146146439557e-06, - "logits/chosen": -0.49043694138526917, - "logits/rejected": -0.28054124116897583, - "logps/chosen": -1.3581206798553467, - "logps/rejected": -2.006803274154663, - "loss": 0.9749, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3581206798553467, - "rewards/margins": 0.648682713508606, - "rewards/rejected": -2.006803274154663, - "sft_loss": 1.3612374067306519, + "grad_norm": 8.179927899991316, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": -0.0914086252450943, + "logits/rejected": 0.19686779379844666, + "logps/chosen": -1.3624199628829956, + "logps/rejected": -1.8249889612197876, + "loss": 1.0177, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3624199628829956, + "rewards/margins": 0.46256914734840393, + "rewards/rejected": -1.8249889612197876, + "sft_loss": 1.3630585670471191, "step": 2310 }, { "epoch": 1.2390031777889279, - "grad_norm": 8.460384516617, - "learning_rate": 2.190169701400046e-06, - "logits/chosen": -0.5204485654830933, - "logits/rejected": -0.38551202416419983, - "logps/chosen": -1.3627592325210571, - "logps/rejected": -2.024308681488037, - "loss": 0.9908, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3627592325210571, - "rewards/margins": 0.6615496277809143, - "rewards/rejected": -2.024308681488037, - "sft_loss": 1.4165655374526978, + "grad_norm": 8.392424794281267, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": -0.06835584342479706, + "logits/rejected": 0.12229911237955093, + "logps/chosen": -1.379624605178833, + "logps/rejected": -1.802242636680603, + "loss": 1.0556, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.379624605178833, + "rewards/margins": 0.42261797189712524, + "rewards/rejected": -1.802242636680603, + "sft_loss": 1.4104360342025757, "step": 2315 }, { "epoch": 1.2416792105703296, - "grad_norm": 6.849685883510946, - "learning_rate": 2.186018092126666e-06, - "logits/chosen": -0.4214208722114563, - "logits/rejected": -0.4097623825073242, - "logps/chosen": -1.3190131187438965, - "logps/rejected": -1.9012644290924072, - "loss": 0.9591, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3190131187438965, - "rewards/margins": 0.582251250743866, - "rewards/rejected": -1.9012644290924072, - "sft_loss": 1.3551105260849, + "grad_norm": 7.796472695999387, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": 0.007735180668532848, + "logits/rejected": 0.037343163043260574, + "logps/chosen": -1.3578131198883057, + "logps/rejected": -1.7218258380889893, + "loss": 1.0405, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3578131198883057, + "rewards/margins": 0.3640126585960388, + "rewards/rejected": -1.7218258380889893, + "sft_loss": 1.3624298572540283, "step": 2320 }, { "epoch": 1.244355243351731, - "grad_norm": 8.481789607899168, - "learning_rate": 2.181859827102748e-06, - "logits/chosen": -0.3990851044654846, - "logits/rejected": -0.3555835783481598, - "logps/chosen": -1.3547569513320923, - "logps/rejected": -2.081740140914917, - "loss": 0.9382, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3547569513320923, - "rewards/margins": 0.726983368396759, - "rewards/rejected": -2.081740140914917, - "sft_loss": 1.3477305173873901, + "grad_norm": 7.3567407076824205, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": 0.05104445293545723, + "logits/rejected": 0.1391843557357788, + "logps/chosen": -1.3850175142288208, + "logps/rejected": -1.7918422222137451, + "loss": 1.0238, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3850175142288208, + "rewards/margins": 0.40682488679885864, + "rewards/rejected": -1.7918422222137451, + "sft_loss": 1.3535354137420654, "step": 2325 }, { "epoch": 1.2470312761331326, - "grad_norm": 9.195165027494054, - "learning_rate": 2.1776949466717967e-06, - "logits/chosen": -0.5580836534500122, - "logits/rejected": -0.4793972074985504, - "logps/chosen": -1.365241289138794, - "logps/rejected": -1.9775069952011108, - "loss": 0.9959, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.365241289138794, - "rewards/margins": 0.6122655868530273, - "rewards/rejected": -1.9775069952011108, - "sft_loss": 1.4184669256210327, + "grad_norm": 6.877849690366655, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": -0.13437165319919586, + "logits/rejected": -0.012639102526009083, + "logps/chosen": -1.3359447717666626, + "logps/rejected": -1.663922667503357, + "loss": 1.0615, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3359447717666626, + "rewards/margins": 0.32797807455062866, + "rewards/rejected": -1.663922667503357, + "sft_loss": 1.3866608142852783, "step": 2330 }, { "epoch": 1.2497073089145343, - "grad_norm": 8.394492326756822, - "learning_rate": 2.1735234912415007e-06, - "logits/chosen": -0.4259399473667145, - "logits/rejected": -0.3873990774154663, - "logps/chosen": -1.3863012790679932, - "logps/rejected": -1.9576469659805298, - "loss": 0.9876, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3863012790679932, - "rewards/margins": 0.5713458061218262, - "rewards/rejected": -1.9576469659805298, - "sft_loss": 1.404350996017456, + "grad_norm": 5.778524261215048, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": 0.009187871590256691, + "logits/rejected": 0.07843149453401566, + "logps/chosen": -1.3242933750152588, + "logps/rejected": -1.7084728479385376, + "loss": 1.0208, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3242933750152588, + "rewards/margins": 0.38417941331863403, + "rewards/rejected": -1.7084728479385376, + "sft_loss": 1.3560478687286377, "step": 2335 }, { "epoch": 1.2523833416959358, - "grad_norm": 7.4852815755175035, - "learning_rate": 2.1693455012833388e-06, - "logits/chosen": -0.5679572224617004, - "logits/rejected": -0.4183397889137268, - "logps/chosen": -1.3491175174713135, - "logps/rejected": -1.9551103115081787, - "loss": 0.9983, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3491175174713135, - "rewards/margins": 0.6059929132461548, - "rewards/rejected": -1.9551103115081787, - "sft_loss": 1.366546630859375, + "grad_norm": 5.6294753375067295, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": -0.17960448563098907, + "logits/rejected": 0.014394590631127357, + "logps/chosen": -1.339212417602539, + "logps/rejected": -1.6445707082748413, + "loss": 1.0601, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.339212417602539, + "rewards/margins": 0.3053584098815918, + "rewards/rejected": -1.6445707082748413, + "sft_loss": 1.3502283096313477, "step": 2340 }, { "epoch": 1.2550593744773373, - "grad_norm": 7.392603378684189, - "learning_rate": 2.1651610173321877e-06, - "logits/chosen": -0.4930770993232727, - "logits/rejected": -0.3636007010936737, - "logps/chosen": -1.330756425857544, - "logps/rejected": -1.9209697246551514, - "loss": 0.9733, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.330756425857544, - "rewards/margins": 0.5902132391929626, - "rewards/rejected": -1.9209697246551514, - "sft_loss": 1.3710582256317139, + "grad_norm": 6.783265605580876, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": -0.08887577801942825, + "logits/rejected": 0.08649233728647232, + "logps/chosen": -1.3186254501342773, + "logps/rejected": -1.6747783422470093, + "loss": 1.0501, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3186254501342773, + "rewards/margins": 0.3561529517173767, + "rewards/rejected": -1.6747783422470093, + "sft_loss": 1.3644529581069946, "step": 2345 }, { "epoch": 1.257735407258739, - "grad_norm": 5.9058043996703145, - "learning_rate": 2.1609700799859287e-06, - "logits/chosen": -0.5141445994377136, - "logits/rejected": -0.4121854305267334, - "logps/chosen": -1.35433030128479, - "logps/rejected": -1.8730039596557617, - "loss": 1.0054, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.35433030128479, - "rewards/margins": 0.5186737179756165, - "rewards/rejected": -1.8730039596557617, - "sft_loss": 1.381361961364746, + "grad_norm": 6.405760070279392, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": -0.08319975435733795, + "logits/rejected": 0.0747746080160141, + "logps/chosen": -1.3581855297088623, + "logps/rejected": -1.646794080734253, + "loss": 1.06, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3581855297088623, + "rewards/margins": 0.2886084318161011, + "rewards/rejected": -1.646794080734253, + "sft_loss": 1.3697419166564941, "step": 2350 }, { "epoch": 1.2604114400401405, - "grad_norm": 8.438160978332148, - "learning_rate": 2.1567727299050555e-06, - "logits/chosen": -0.497615247964859, - "logits/rejected": -0.3885241150856018, - "logps/chosen": -1.2426456212997437, - "logps/rejected": -2.0065460205078125, - "loss": 0.9398, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2426456212997437, - "rewards/margins": 0.7639003396034241, - "rewards/rejected": -2.0065460205078125, - "sft_loss": 1.3120468854904175, + "grad_norm": 7.571472104965407, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": -0.04250494763255119, + "logits/rejected": 0.10361369699239731, + "logps/chosen": -1.2371678352355957, + "logps/rejected": -1.7140600681304932, + "loss": 1.0039, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2371678352355957, + "rewards/margins": 0.4768921732902527, + "rewards/rejected": -1.7140600681304932, + "sft_loss": 1.3077231645584106, "step": 2355 }, { "epoch": 1.263087472821542, - "grad_norm": 11.62262649985683, - "learning_rate": 2.152569007812276e-06, - "logits/chosen": -0.525715708732605, - "logits/rejected": -0.4441584646701813, - "logps/chosen": -1.3071136474609375, - "logps/rejected": -2.1241071224212646, - "loss": 0.9343, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3071136474609375, - "rewards/margins": 0.8169934153556824, - "rewards/rejected": -2.1241071224212646, - "sft_loss": 1.3898365497589111, + "grad_norm": 10.78619174464002, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": -0.07091349363327026, + "logits/rejected": 0.06878503412008286, + "logps/chosen": -1.305056095123291, + "logps/rejected": -1.8479982614517212, + "loss": 0.9951, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.305056095123291, + "rewards/margins": 0.5429421663284302, + "rewards/rejected": -1.8479982614517212, + "sft_loss": 1.3788334131240845, "step": 2360 }, { "epoch": 1.2657635056029437, - "grad_norm": 5.788344945163352, - "learning_rate": 2.1483589544921202e-06, - "logits/chosen": -0.5040058493614197, - "logits/rejected": -0.42346611618995667, - "logps/chosen": -1.361579179763794, - "logps/rejected": -1.98703932762146, - "loss": 1.0044, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.361579179763794, - "rewards/margins": 0.6254600882530212, - "rewards/rejected": -1.98703932762146, - "sft_loss": 1.4275305271148682, + "grad_norm": 5.651471963037841, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": -0.05510222911834717, + "logits/rejected": 0.0910344272851944, + "logps/chosen": -1.3466378450393677, + "logps/rejected": -1.7891864776611328, + "loss": 1.0345, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3466378450393677, + "rewards/margins": 0.4425484538078308, + "rewards/rejected": -1.7891864776611328, + "sft_loss": 1.4098457098007202, "step": 2365 }, { "epoch": 1.2684395383843452, - "grad_norm": 7.765495051030483, - "learning_rate": 2.144142610790545e-06, - "logits/chosen": -0.4902319014072418, - "logits/rejected": -0.40892449021339417, - "logps/chosen": -1.3193453550338745, - "logps/rejected": -1.8786367177963257, - "loss": 0.9648, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3193453550338745, - "rewards/margins": 0.5592910647392273, - "rewards/rejected": -1.8786367177963257, - "sft_loss": 1.4025046825408936, + "grad_norm": 7.355148492814228, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": -0.018819155171513557, + "logits/rejected": 0.11880254745483398, + "logps/chosen": -1.3327100276947021, + "logps/rejected": -1.7054532766342163, + "loss": 1.0319, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3327100276947021, + "rewards/margins": 0.3727432191371918, + "rewards/rejected": -1.7054532766342163, + "sft_loss": 1.4049968719482422, "step": 2370 }, { "epoch": 1.2711155711657467, - "grad_norm": 9.18984071370844, - "learning_rate": 2.1399200176145344e-06, - "logits/chosen": -0.6545987129211426, - "logits/rejected": -0.5351656079292297, - "logps/chosen": -1.2366050481796265, - "logps/rejected": -1.7836157083511353, - "loss": 0.9548, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2366050481796265, - "rewards/margins": 0.5470104217529297, - "rewards/rejected": -1.7836157083511353, - "sft_loss": 1.2792375087738037, + "grad_norm": 10.943024429606822, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": -0.1734219342470169, + "logits/rejected": 0.013913175091147423, + "logps/chosen": -1.247011423110962, + "logps/rejected": -1.558885931968689, + "loss": 1.0241, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.247011423110962, + "rewards/margins": 0.3118746280670166, + "rewards/rejected": -1.558885931968689, + "sft_loss": 1.2846260070800781, "step": 2375 }, { "epoch": 1.2737916039471484, - "grad_norm": 7.955905733638929, - "learning_rate": 2.1356912159317067e-06, - "logits/chosen": -0.6400793790817261, - "logits/rejected": -0.48976173996925354, - "logps/chosen": -1.4008651971817017, - "logps/rejected": -2.1842517852783203, - "loss": 0.9835, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.4008651971817017, - "rewards/margins": 0.7833863496780396, - "rewards/rejected": -2.1842517852783203, - "sft_loss": 1.4612462520599365, + "grad_norm": 11.851152770608321, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": -0.13905705511569977, + "logits/rejected": 0.09097929298877716, + "logps/chosen": -1.3812119960784912, + "logps/rejected": -1.8251721858978271, + "loss": 1.0541, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3812119960784912, + "rewards/margins": 0.4439600110054016, + "rewards/rejected": -1.8251721858978271, + "sft_loss": 1.4391671419143677, "step": 2380 }, { "epoch": 1.27646763672855, - "grad_norm": 6.902439419571325, - "learning_rate": 2.1314562467699133e-06, - "logits/chosen": -0.5332831740379333, - "logits/rejected": -0.4664790630340576, - "logps/chosen": -1.3477102518081665, - "logps/rejected": -1.8963968753814697, - "loss": 0.9786, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3477102518081665, - "rewards/margins": 0.5486865043640137, - "rewards/rejected": -1.8963968753814697, - "sft_loss": 1.3262543678283691, + "grad_norm": 5.63380268619042, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": -0.034522850066423416, + "logits/rejected": 0.0760420560836792, + "logps/chosen": -1.3381052017211914, + "logps/rejected": -1.6845115423202515, + "loss": 1.0354, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3381052017211914, + "rewards/margins": 0.34640640020370483, + "rewards/rejected": -1.6845115423202515, + "sft_loss": 1.3170645236968994, "step": 2385 }, { "epoch": 1.2791436695099514, - "grad_norm": 9.388764886978345, - "learning_rate": 2.1272151512168453e-06, - "logits/chosen": -0.5033223628997803, - "logits/rejected": -0.46903854608535767, - "logps/chosen": -1.2872415781021118, - "logps/rejected": -2.043466567993164, - "loss": 0.9396, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2872415781021118, - "rewards/margins": 0.7562249898910522, - "rewards/rejected": -2.043466567993164, - "sft_loss": 1.3627054691314697, + "grad_norm": 5.798808205023328, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": -0.017745880410075188, + "logits/rejected": 0.04981974512338638, + "logps/chosen": -1.2860960960388184, + "logps/rejected": -1.7253410816192627, + "loss": 0.9965, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2860960960388184, + "rewards/margins": 0.43924492597579956, + "rewards/rejected": -1.7253410816192627, + "sft_loss": 1.348103642463684, "step": 2390 }, { "epoch": 1.2818197022913531, - "grad_norm": 6.369306347442532, - "learning_rate": 2.122967970419629e-06, - "logits/chosen": -0.6639467477798462, - "logits/rejected": -0.5871747732162476, - "logps/chosen": -1.262899398803711, - "logps/rejected": -1.8460830450057983, - "loss": 0.9408, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.262899398803711, - "rewards/margins": 0.5831834077835083, - "rewards/rejected": -1.8460830450057983, - "sft_loss": 1.3119322061538696, + "grad_norm": 6.97187103687865, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": -0.24344687163829803, + "logits/rejected": -0.10979632288217545, + "logps/chosen": -1.232178807258606, + "logps/rejected": -1.6103588342666626, + "loss": 0.9885, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.232178807258606, + "rewards/margins": 0.37818005681037903, + "rewards/rejected": -1.6103588342666626, + "sft_loss": 1.2975971698760986, "step": 2395 }, { "epoch": 1.2844957350727546, - "grad_norm": 6.598712576180703, - "learning_rate": 2.118714745584431e-06, - "logits/chosen": -0.5417832732200623, - "logits/rejected": -0.46218061447143555, - "logps/chosen": -1.2741425037384033, - "logps/rejected": -1.827161431312561, - "loss": 0.9655, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2741425037384033, - "rewards/margins": 0.5530189275741577, - "rewards/rejected": -1.827161431312561, - "sft_loss": 1.3322670459747314, + "grad_norm": 7.840210741237986, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": -0.07378415018320084, + "logits/rejected": 0.05714235454797745, + "logps/chosen": -1.2715994119644165, + "logps/rejected": -1.623050332069397, + "loss": 1.0133, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2715994119644165, + "rewards/margins": 0.3514510989189148, + "rewards/rejected": -1.623050332069397, + "sft_loss": 1.3193587064743042, "step": 2400 }, { "epoch": 1.2844957350727546, - "eval_logits/chosen": -0.32509803771972656, - "eval_logits/rejected": -0.2668491303920746, - "eval_logps/chosen": -1.4205245971679688, - "eval_logps/rejected": -1.9133427143096924, - "eval_loss": 1.0376254320144653, - "eval_rewards/accuracies": 0.6357566714286804, - "eval_rewards/chosen": -1.4205245971679688, - "eval_rewards/margins": 0.4928181767463684, - "eval_rewards/rejected": -1.9133427143096924, - "eval_runtime": 43.0353, - "eval_samples_per_second": 31.253, - "eval_sft_loss": 1.416913628578186, - "eval_steps_per_second": 7.831, + "eval_logits/chosen": 0.2707947790622711, + "eval_logits/rejected": 0.3701152205467224, + "eval_logps/chosen": -1.3638824224472046, + "eval_logps/rejected": -1.7321114540100098, + "eval_loss": 1.046066164970398, + "eval_rewards/accuracies": 0.6053412556648254, + "eval_rewards/chosen": -1.3638824224472046, + "eval_rewards/margins": 0.36822912096977234, + "eval_rewards/rejected": -1.7321114540100098, + "eval_runtime": 43.3451, + "eval_samples_per_second": 31.03, + "eval_sft_loss": 1.3863850831985474, + "eval_steps_per_second": 7.775, "step": 2400 }, { "epoch": 1.287171767854156, - "grad_norm": 5.32756149749768, - "learning_rate": 2.1144555179760582e-06, - "logits/chosen": -0.5373546481132507, - "logits/rejected": -0.42289772629737854, - "logps/chosen": -1.3479527235031128, - "logps/rejected": -2.0310306549072266, - "loss": 0.977, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3479527235031128, - "rewards/margins": 0.6830779314041138, - "rewards/rejected": -2.0310306549072266, - "sft_loss": 1.384868860244751, + "grad_norm": 6.179847425136886, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": -0.056697629392147064, + "logits/rejected": 0.10876087844371796, + "logps/chosen": -1.3541631698608398, + "logps/rejected": -1.806817650794983, + "loss": 1.0321, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3541631698608398, + "rewards/margins": 0.4526546001434326, + "rewards/rejected": -1.806817650794983, + "sft_loss": 1.3751753568649292, "step": 2405 }, { "epoch": 1.2898478006355578, - "grad_norm": 7.929697943613389, - "learning_rate": 2.110190328917555e-06, - "logits/chosen": -0.6266440153121948, - "logits/rejected": -0.46866053342819214, - "logps/chosen": -1.3129150867462158, - "logps/rejected": -1.685058832168579, - "loss": 1.0223, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.3129150867462158, - "rewards/margins": 0.3721437156200409, - "rewards/rejected": -1.685058832168579, - "sft_loss": 1.3628969192504883, + "grad_norm": 6.282016623035913, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": -0.18400783836841583, + "logits/rejected": 0.03770359605550766, + "logps/chosen": -1.3120901584625244, + "logps/rejected": -1.5234274864196777, + "loss": 1.0655, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3120901584625244, + "rewards/margins": 0.2113373726606369, + "rewards/rejected": -1.5234274864196777, + "sft_loss": 1.3530489206314087, "step": 2410 }, { "epoch": 1.2925238334169593, - "grad_norm": 8.923537045074404, - "learning_rate": 2.1059192197898044e-06, - "logits/chosen": -0.4667263925075531, - "logits/rejected": -0.41617077589035034, - "logps/chosen": -1.2272765636444092, - "logps/rejected": -1.9868072271347046, - "loss": 0.909, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2272765636444092, - "rewards/margins": 0.7595307230949402, - "rewards/rejected": -1.9868072271347046, - "sft_loss": 1.2580385208129883, + "grad_norm": 6.879817145949223, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": -0.04526624456048012, + "logits/rejected": 0.03693125396966934, + "logps/chosen": -1.24125337600708, + "logps/rejected": -1.7764074802398682, + "loss": 0.9613, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.24125337600708, + "rewards/margins": 0.5351541042327881, + "rewards/rejected": -1.7764074802398682, + "sft_loss": 1.2563873529434204, "step": 2415 }, { "epoch": 1.2951998661983608, - "grad_norm": 8.038623786392707, - "learning_rate": 2.1016422320311257e-06, - "logits/chosen": -0.563264787197113, - "logits/rejected": -0.4620184302330017, - "logps/chosen": -1.3580414056777954, - "logps/rejected": -1.9529234170913696, - "loss": 0.9653, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3580414056777954, - "rewards/margins": 0.5948818325996399, - "rewards/rejected": -1.9529234170913696, - "sft_loss": 1.433401107788086, + "grad_norm": 5.706686252449379, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": -0.17454300820827484, + "logits/rejected": -0.03778272494673729, + "logps/chosen": -1.3651527166366577, + "logps/rejected": -1.8023183345794678, + "loss": 1.0069, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3651527166366577, + "rewards/margins": 0.43716558814048767, + "rewards/rejected": -1.8023183345794678, + "sft_loss": 1.4338901042938232, "step": 2420 }, { "epoch": 1.2978758989797625, - "grad_norm": 6.234418417515403, - "learning_rate": 2.097359407136873e-06, - "logits/chosen": -0.44907650351524353, - "logits/rejected": -0.38264599442481995, - "logps/chosen": -1.258331537246704, - "logps/rejected": -1.6858599185943604, - "loss": 0.987, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.258331537246704, - "rewards/margins": 0.4275285601615906, - "rewards/rejected": -1.6858599185943604, - "sft_loss": 1.351828932762146, + "grad_norm": 8.818330042647473, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": -0.03695257008075714, + "logits/rejected": 0.04921901971101761, + "logps/chosen": -1.2691466808319092, + "logps/rejected": -1.5711511373519897, + "loss": 1.0228, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2691466808319092, + "rewards/margins": 0.30200451612472534, + "rewards/rejected": -1.5711511373519897, + "sft_loss": 1.3589909076690674, "step": 2425 }, { "epoch": 1.300551931761164, - "grad_norm": 8.813954012728816, - "learning_rate": 2.093070786659033e-06, - "logits/chosen": -0.4727330207824707, - "logits/rejected": -0.4371541142463684, - "logps/chosen": -1.3942360877990723, - "logps/rejected": -1.9270381927490234, - "loss": 1.0123, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3942360877990723, - "rewards/margins": 0.5328022241592407, - "rewards/rejected": -1.9270381927490234, - "sft_loss": 1.434128999710083, + "grad_norm": 8.297898842674472, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": -0.029715800657868385, + "logits/rejected": 0.031632810831069946, + "logps/chosen": -1.4271819591522217, + "logps/rejected": -1.768367052078247, + "loss": 1.0625, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4271819591522217, + "rewards/margins": 0.3411853313446045, + "rewards/rejected": -1.768367052078247, + "sft_loss": 1.4373775720596313, "step": 2430 }, { "epoch": 1.3032279645425655, - "grad_norm": 5.8086980544229245, - "learning_rate": 2.0887764122058195e-06, - "logits/chosen": -0.45605263113975525, - "logits/rejected": -0.348542183637619, - "logps/chosen": -1.3246591091156006, - "logps/rejected": -1.7781784534454346, - "loss": 0.994, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3246591091156006, - "rewards/margins": 0.4535194933414459, - "rewards/rejected": -1.7781784534454346, - "sft_loss": 1.3365366458892822, + "grad_norm": 5.81668195731196, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": -0.02307548001408577, + "logits/rejected": 0.10794766992330551, + "logps/chosen": -1.303928256034851, + "logps/rejected": -1.580761194229126, + "loss": 1.0365, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.303928256034851, + "rewards/margins": 0.2768331468105316, + "rewards/rejected": -1.580761194229126, + "sft_loss": 1.3223628997802734, "step": 2435 }, { "epoch": 1.3059039973239672, - "grad_norm": 10.34553529868521, - "learning_rate": 2.084476325441272e-06, - "logits/chosen": -0.5627564191818237, - "logits/rejected": -0.4698103368282318, - "logps/chosen": -1.291358232498169, - "logps/rejected": -1.9262027740478516, - "loss": 0.9394, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.291358232498169, - "rewards/margins": 0.6348446607589722, - "rewards/rejected": -1.9262027740478516, - "sft_loss": 1.2897425889968872, + "grad_norm": 7.254516734316676, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": -0.15354299545288086, + "logits/rejected": -0.027570974081754684, + "logps/chosen": -1.2981069087982178, + "logps/rejected": -1.7389848232269287, + "loss": 0.9941, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2981069087982178, + "rewards/margins": 0.44087809324264526, + "rewards/rejected": -1.7389848232269287, + "sft_loss": 1.2723134756088257, "step": 2440 }, { "epoch": 1.3085800301053687, - "grad_norm": 30.91851751488722, - "learning_rate": 2.0801705680848523e-06, - "logits/chosen": -0.5131195187568665, - "logits/rejected": -0.3856009542942047, - "logps/chosen": -1.389692783355713, - "logps/rejected": -1.9225718975067139, - "loss": 1.0114, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.389692783355713, - "rewards/margins": 0.5328791737556458, - "rewards/rejected": -1.9225718975067139, - "sft_loss": 1.35300612449646, + "grad_norm": 6.238727861139878, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": -0.14542250335216522, + "logits/rejected": 0.005346921272575855, + "logps/chosen": -1.3731145858764648, + "logps/rejected": -1.6807750463485718, + "loss": 1.062, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3731145858764648, + "rewards/margins": 0.3076605498790741, + "rewards/rejected": -1.6807750463485718, + "sft_loss": 1.3621718883514404, "step": 2445 }, { "epoch": 1.3112560628867704, - "grad_norm": 9.622022723192302, - "learning_rate": 2.0758591819110364e-06, - "logits/chosen": -0.5288017988204956, - "logits/rejected": -0.40660151839256287, - "logps/chosen": -1.2950856685638428, - "logps/rejected": -1.991323709487915, - "loss": 0.949, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2950856685638428, - "rewards/margins": 0.6962381601333618, - "rewards/rejected": -1.991323709487915, - "sft_loss": 1.302475929260254, + "grad_norm": 7.541602340260906, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": -0.11518025398254395, + "logits/rejected": 0.05123286694288254, + "logps/chosen": -1.3132606744766235, + "logps/rejected": -1.8348419666290283, + "loss": 0.9956, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3132606744766235, + "rewards/margins": 0.5215811133384705, + "rewards/rejected": -1.8348419666290283, + "sft_loss": 1.3019697666168213, "step": 2450 }, { "epoch": 1.313932095668172, - "grad_norm": 4.744900707874614, - "learning_rate": 2.071542208748912e-06, - "logits/chosen": -0.5513706207275391, - "logits/rejected": -0.3701619803905487, - "logps/chosen": -1.3338464498519897, - "logps/rejected": -1.896228551864624, - "loss": 0.9767, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3338464498519897, - "rewards/margins": 0.5623821020126343, - "rewards/rejected": -1.896228551864624, - "sft_loss": 1.3844144344329834, + "grad_norm": 4.756591565678028, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": -0.13995513319969177, + "logits/rejected": 0.10645530372858047, + "logps/chosen": -1.382127046585083, + "logps/rejected": -1.7566617727279663, + "loss": 1.0435, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.382127046585083, + "rewards/margins": 0.37453463673591614, + "rewards/rejected": -1.7566617727279663, + "sft_loss": 1.4149354696273804, "step": 2455 }, { "epoch": 1.3166081284495736, - "grad_norm": 8.34519438573912, - "learning_rate": 2.0672196904817715e-06, - "logits/chosen": -0.5042263269424438, - "logits/rejected": -0.4224318563938141, - "logps/chosen": -1.3436133861541748, - "logps/rejected": -1.7886947393417358, - "loss": 1.0281, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3436133861541748, - "rewards/margins": 0.44508129358291626, - "rewards/rejected": -1.7886947393417358, - "sft_loss": 1.3645684719085693, + "grad_norm": 11.56479469442216, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": -0.10803036391735077, + "logits/rejected": 0.005168232135474682, + "logps/chosen": -1.3827905654907227, + "logps/rejected": -1.649247169494629, + "loss": 1.0953, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3827905654907227, + "rewards/margins": 0.26645663380622864, + "rewards/rejected": -1.649247169494629, + "sft_loss": 1.3851019144058228, "step": 2460 }, { "epoch": 1.3192841612309751, - "grad_norm": 5.267127509259156, - "learning_rate": 2.0628916690467066e-06, - "logits/chosen": -0.47091466188430786, - "logits/rejected": -0.422675222158432, - "logps/chosen": -1.286413311958313, - "logps/rejected": -1.9102262258529663, - "loss": 0.9609, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.286413311958313, - "rewards/margins": 0.6238128542900085, - "rewards/rejected": -1.9102262258529663, - "sft_loss": 1.3008480072021484, + "grad_norm": 7.275890781310672, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": -0.07685400545597076, + "logits/rejected": 0.005793456919491291, + "logps/chosen": -1.3236695528030396, + "logps/rejected": -1.7799656391143799, + "loss": 1.018, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3236695528030396, + "rewards/margins": 0.45629605650901794, + "rewards/rejected": -1.7799656391143799, + "sft_loss": 1.3315627574920654, "step": 2465 }, { "epoch": 1.3219601940123766, - "grad_norm": 7.586235743737926, - "learning_rate": 2.0585581864341995e-06, - "logits/chosen": -0.6090031862258911, - "logits/rejected": -0.508503258228302, - "logps/chosen": -1.2876276969909668, - "logps/rejected": -1.6935985088348389, - "loss": 1.0107, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2876276969909668, - "rewards/margins": 0.405970960855484, - "rewards/rejected": -1.6935985088348389, - "sft_loss": 1.3544785976409912, + "grad_norm": 7.208833715018191, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": -0.23564176261425018, + "logits/rejected": -0.09703844785690308, + "logps/chosen": -1.2878791093826294, + "logps/rejected": -1.5357494354248047, + "loss": 1.0621, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2878791093826294, + "rewards/margins": 0.2478702962398529, + "rewards/rejected": -1.5357494354248047, + "sft_loss": 1.35406494140625, "step": 2470 }, { "epoch": 1.3246362267937783, - "grad_norm": 6.45092653818614, - "learning_rate": 2.0542192846877177e-06, - "logits/chosen": -0.5176225304603577, - "logits/rejected": -0.47460445761680603, - "logps/chosen": -1.3008874654769897, - "logps/rejected": -1.8060070276260376, - "loss": 0.9677, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3008874654769897, - "rewards/margins": 0.5051193237304688, - "rewards/rejected": -1.8060070276260376, - "sft_loss": 1.3537688255310059, + "grad_norm": 6.192328808121661, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": -0.12468357384204865, + "logits/rejected": -0.04723244532942772, + "logps/chosen": -1.3072025775909424, + "logps/rejected": -1.6626510620117188, + "loss": 1.0168, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3072025775909424, + "rewards/margins": 0.35544854402542114, + "rewards/rejected": -1.6626510620117188, + "sft_loss": 1.3535258769989014, "step": 2475 }, { "epoch": 1.3273122595751798, - "grad_norm": 6.57852999670518, - "learning_rate": 2.049875005903305e-06, - "logits/chosen": -0.6487798690795898, - "logits/rejected": -0.5197803378105164, - "logps/chosen": -1.3330776691436768, - "logps/rejected": -2.0792503356933594, - "loss": 0.9503, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3330776691436768, - "rewards/margins": 0.7461727857589722, - "rewards/rejected": -2.0792503356933594, - "sft_loss": 1.449393391609192, + "grad_norm": 5.514122164787027, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": -0.25773757696151733, + "logits/rejected": -0.0840383991599083, + "logps/chosen": -1.3327374458312988, + "logps/rejected": -1.7903152704238892, + "loss": 1.0202, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3327374458312988, + "rewards/margins": 0.4575781226158142, + "rewards/rejected": -1.7903152704238892, + "sft_loss": 1.4295130968093872, "step": 2480 }, { "epoch": 1.3299882923565813, - "grad_norm": 10.072456903626007, - "learning_rate": 2.045525392229174e-06, - "logits/chosen": -0.48566898703575134, - "logits/rejected": -0.33911052346229553, - "logps/chosen": -1.39583158493042, - "logps/rejected": -2.1033527851104736, - "loss": 1.0366, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.39583158493042, - "rewards/margins": 0.7075213193893433, - "rewards/rejected": -2.1033527851104736, - "sft_loss": 1.4875379800796509, + "grad_norm": 6.430291188228354, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": -0.07060518860816956, + "logits/rejected": 0.11927783489227295, + "logps/chosen": -1.3509200811386108, + "logps/rejected": -1.8332151174545288, + "loss": 1.0684, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3509200811386108, + "rewards/margins": 0.48229488730430603, + "rewards/rejected": -1.8332151174545288, + "sft_loss": 1.4322224855422974, "step": 2485 }, { "epoch": 1.332664325137983, - "grad_norm": 15.057610984252282, - "learning_rate": 2.0411704858652946e-06, - "logits/chosen": -0.542778491973877, - "logits/rejected": -0.5049811601638794, - "logps/chosen": -1.3699305057525635, - "logps/rejected": -2.0171689987182617, - "loss": 0.9714, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3699305057525635, - "rewards/margins": 0.6472384929656982, - "rewards/rejected": -2.0171689987182617, - "sft_loss": 1.43272864818573, + "grad_norm": 9.207303108778117, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": -0.19833900034427643, + "logits/rejected": -0.13946710526943207, + "logps/chosen": -1.3640668392181396, + "logps/rejected": -1.7328455448150635, + "loss": 1.0381, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3640668392181396, + "rewards/margins": 0.368778795003891, + "rewards/rejected": -1.7328455448150635, + "sft_loss": 1.4077961444854736, "step": 2490 }, { "epoch": 1.3353403579193845, - "grad_norm": 5.7032019325446, - "learning_rate": 2.0368103290629877e-06, - "logits/chosen": -0.42945393919944763, - "logits/rejected": -0.4224371314048767, - "logps/chosen": -1.3021575212478638, - "logps/rejected": -1.8281514644622803, - "loss": 0.9871, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3021575212478638, - "rewards/margins": 0.5259938836097717, - "rewards/rejected": -1.8281514644622803, - "sft_loss": 1.344936490058899, + "grad_norm": 6.748889211680902, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": -0.08480075001716614, + "logits/rejected": -0.07643101364374161, + "logps/chosen": -1.3230348825454712, + "logps/rejected": -1.7031409740447998, + "loss": 1.0347, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3230348825454712, + "rewards/margins": 0.3801063001155853, + "rewards/rejected": -1.7031409740447998, + "sft_loss": 1.3512378931045532, "step": 2495 }, { "epoch": 1.338016390700786, - "grad_norm": 7.021802073199058, - "learning_rate": 2.0324449641245145e-06, - "logits/chosen": -0.4047786295413971, - "logits/rejected": -0.2578263282775879, - "logps/chosen": -1.2514671087265015, - "logps/rejected": -1.685306191444397, - "loss": 0.9715, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2514671087265015, - "rewards/margins": 0.4338390827178955, - "rewards/rejected": -1.685306191444397, - "sft_loss": 1.324374794960022, + "grad_norm": 7.100440960952916, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": -0.05479288101196289, + "logits/rejected": 0.11367790400981903, + "logps/chosen": -1.273905634880066, + "logps/rejected": -1.6257766485214233, + "loss": 1.0125, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.273905634880066, + "rewards/margins": 0.3518711030483246, + "rewards/rejected": -1.6257766485214233, + "sft_loss": 1.3452768325805664, "step": 2500 }, { "epoch": 1.3406924234821878, - "grad_norm": 6.0144852456574185, - "learning_rate": 2.028074433402664e-06, - "logits/chosen": -0.4007970690727234, - "logits/rejected": -0.2498869001865387, - "logps/chosen": -1.2454150915145874, - "logps/rejected": -1.7854740619659424, - "loss": 0.9722, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2454150915145874, - "rewards/margins": 0.5400589108467102, - "rewards/rejected": -1.7854740619659424, - "sft_loss": 1.2837181091308594, + "grad_norm": 6.950889135224402, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": -0.0914456844329834, + "logits/rejected": 0.08299463987350464, + "logps/chosen": -1.2730414867401123, + "logps/rejected": -1.665330171585083, + "loss": 1.0166, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2730414867401123, + "rewards/margins": 0.3922889232635498, + "rewards/rejected": -1.665330171585083, + "sft_loss": 1.2936381101608276, "step": 2505 }, { "epoch": 1.3433684562635893, - "grad_norm": 9.461615744398255, - "learning_rate": 2.023698779300344e-06, - "logits/chosen": -0.48871421813964844, - "logits/rejected": -0.38032227754592896, - "logps/chosen": -1.2504608631134033, - "logps/rejected": -1.7310020923614502, - "loss": 0.9611, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2504608631134033, - "rewards/margins": 0.48054131865501404, - "rewards/rejected": -1.7310020923614502, - "sft_loss": 1.3119781017303467, + "grad_norm": 9.560143778048294, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": -0.15366467833518982, + "logits/rejected": -0.012053056620061398, + "logps/chosen": -1.2696418762207031, + "logps/rejected": -1.6315727233886719, + "loss": 1.0137, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2696418762207031, + "rewards/margins": 0.36193081736564636, + "rewards/rejected": -1.6315727233886719, + "sft_loss": 1.322128176689148, "step": 2510 }, { "epoch": 1.3460444890449907, - "grad_norm": 6.2563350888606815, - "learning_rate": 2.019318044270171e-06, - "logits/chosen": -0.42136192321777344, - "logits/rejected": -0.3354993462562561, - "logps/chosen": -1.3155518770217896, - "logps/rejected": -1.7504370212554932, - "loss": 1.0227, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3155518770217896, - "rewards/margins": 0.43488508462905884, - "rewards/rejected": -1.7504370212554932, - "sft_loss": 1.3969285488128662, + "grad_norm": 7.348808577763548, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": -0.07675446569919586, + "logits/rejected": 0.04603142291307449, + "logps/chosen": -1.337294578552246, + "logps/rejected": -1.6499254703521729, + "loss": 1.0765, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.337294578552246, + "rewards/margins": 0.312630832195282, + "rewards/rejected": -1.6499254703521729, + "sft_loss": 1.409766435623169, "step": 2515 }, { "epoch": 1.3487205218263925, - "grad_norm": 6.955482075506187, - "learning_rate": 2.0149322708140545e-06, - "logits/chosen": -0.5253806710243225, - "logits/rejected": -0.4579823613166809, - "logps/chosen": -1.3426711559295654, - "logps/rejected": -1.7703368663787842, - "loss": 0.9858, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3426711559295654, - "rewards/margins": 0.42766571044921875, - "rewards/rejected": -1.7703368663787842, - "sft_loss": 1.3179153203964233, + "grad_norm": 6.2283781946087915, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": -0.19296064972877502, + "logits/rejected": -0.1111554503440857, + "logps/chosen": -1.3319435119628906, + "logps/rejected": -1.645559310913086, + "loss": 1.0162, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3319435119628906, + "rewards/margins": 0.3136158883571625, + "rewards/rejected": -1.645559310913086, + "sft_loss": 1.3112457990646362, "step": 2520 }, { "epoch": 1.351396554607794, - "grad_norm": 8.697109095245661, - "learning_rate": 2.0105415014827886e-06, - "logits/chosen": -0.5423754453659058, - "logits/rejected": -0.4885048270225525, - "logps/chosen": -1.3919349908828735, - "logps/rejected": -2.001356601715088, - "loss": 1.0147, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3919349908828735, - "rewards/margins": 0.6094216704368591, - "rewards/rejected": -2.001356601715088, - "sft_loss": 1.4702781438827515, + "grad_norm": 7.404843081341424, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": -0.1959419697523117, + "logits/rejected": -0.1169554591178894, + "logps/chosen": -1.3728597164154053, + "logps/rejected": -1.762760877609253, + "loss": 1.0579, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3728597164154053, + "rewards/margins": 0.38990116119384766, + "rewards/rejected": -1.762760877609253, + "sft_loss": 1.4591501951217651, "step": 2525 }, { "epoch": 1.3540725873891954, - "grad_norm": 6.615814137244643, - "learning_rate": 2.006145778875636e-06, - "logits/chosen": -0.5715299844741821, - "logits/rejected": -0.534439206123352, - "logps/chosen": -1.3156145811080933, - "logps/rejected": -1.8228168487548828, - "loss": 1.0074, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3156145811080933, - "rewards/margins": 0.5072023272514343, - "rewards/rejected": -1.8228168487548828, - "sft_loss": 1.3530737161636353, + "grad_norm": 8.809403280161979, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": -0.2071094512939453, + "logits/rejected": -0.15805336833000183, + "logps/chosen": -1.3241145610809326, + "logps/rejected": -1.590745210647583, + "loss": 1.0759, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3241145610809326, + "rewards/margins": 0.2666308283805847, + "rewards/rejected": -1.590745210647583, + "sft_loss": 1.3455761671066284, "step": 2530 }, { "epoch": 1.3567486201705972, - "grad_norm": 6.026078647434834, - "learning_rate": 2.0017451456399165e-06, - "logits/chosen": -0.587544322013855, - "logits/rejected": -0.4726153314113617, - "logps/chosen": -1.3580278158187866, - "logps/rejected": -1.9934993982315063, - "loss": 0.9682, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3580278158187866, - "rewards/margins": 0.6354714035987854, - "rewards/rejected": -1.9934993982315063, - "sft_loss": 1.3702924251556396, + "grad_norm": 4.819707847104657, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": -0.2637235224246979, + "logits/rejected": -0.1225053071975708, + "logps/chosen": -1.3090099096298218, + "logps/rejected": -1.7086610794067383, + "loss": 1.0081, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3090099096298218, + "rewards/margins": 0.39965111017227173, + "rewards/rejected": -1.7086610794067383, + "sft_loss": 1.3366161584854126, "step": 2535 }, { "epoch": 1.3594246529519987, - "grad_norm": 7.655586715147393, - "learning_rate": 1.9973396444705934e-06, - "logits/chosen": -0.5170526504516602, - "logits/rejected": -0.38636916875839233, - "logps/chosen": -1.4224025011062622, - "logps/rejected": -1.9178695678710938, - "loss": 1.0333, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.4224025011062622, - "rewards/margins": 0.4954671859741211, - "rewards/rejected": -1.9178695678710938, - "sft_loss": 1.4758238792419434, + "grad_norm": 6.503670880002155, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": -0.1527249813079834, + "logits/rejected": 0.020453324541449547, + "logps/chosen": -1.418816089630127, + "logps/rejected": -1.6873157024383545, + "loss": 1.0984, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.418816089630127, + "rewards/margins": 0.2684997320175171, + "rewards/rejected": -1.6873157024383545, + "sft_loss": 1.475125789642334, "step": 2540 }, { "epoch": 1.3621006857334002, - "grad_norm": 9.384836133788575, - "learning_rate": 1.9929293181098588e-06, - "logits/chosen": -0.48921626806259155, - "logits/rejected": -0.35572922229766846, - "logps/chosen": -1.3469831943511963, - "logps/rejected": -2.0367984771728516, - "loss": 0.9761, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3469831943511963, - "rewards/margins": 0.6898151636123657, - "rewards/rejected": -2.0367984771728516, - "sft_loss": 1.4044857025146484, + "grad_norm": 5.701889647109998, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": -0.1425601989030838, + "logits/rejected": 0.027763869613409042, + "logps/chosen": -1.3176789283752441, + "logps/rejected": -1.7970898151397705, + "loss": 1.0188, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3176789283752441, + "rewards/margins": 0.4794110357761383, + "rewards/rejected": -1.7970898151397705, + "sft_loss": 1.373015284538269, "step": 2545 }, { "epoch": 1.3647767185148019, - "grad_norm": 6.119379297533443, - "learning_rate": 1.988514209346718e-06, - "logits/chosen": -0.5070446729660034, - "logits/rejected": -0.3788461983203888, - "logps/chosen": -1.3717761039733887, - "logps/rejected": -1.871145248413086, - "loss": 1.0153, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3717761039733887, - "rewards/margins": 0.4993690848350525, - "rewards/rejected": -1.871145248413086, - "sft_loss": 1.4144275188446045, + "grad_norm": 7.455068759266927, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": -0.1664392203092575, + "logits/rejected": 0.0011624842882156372, + "logps/chosen": -1.3408262729644775, + "logps/rejected": -1.6157557964324951, + "loss": 1.0608, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3408262729644775, + "rewards/margins": 0.274929404258728, + "rewards/rejected": -1.6157557964324951, + "sft_loss": 1.3625861406326294, "step": 2550 }, { "epoch": 1.3674527512962034, - "grad_norm": 15.852340801316688, - "learning_rate": 1.984094361016575e-06, - "logits/chosen": -0.4378221929073334, - "logits/rejected": -0.36867469549179077, - "logps/chosen": -1.2725999355316162, - "logps/rejected": -1.9760814905166626, - "loss": 0.9695, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2725999355316162, - "rewards/margins": 0.7034815549850464, - "rewards/rejected": -1.9760814905166626, - "sft_loss": 1.3369901180267334, + "grad_norm": 11.204779201931467, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": -0.12884119153022766, + "logits/rejected": -0.03482980281114578, + "logps/chosen": -1.2732433080673218, + "logps/rejected": -1.7895492315292358, + "loss": 1.0184, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2732433080673218, + "rewards/margins": 0.5163058638572693, + "rewards/rejected": -1.7895492315292358, + "sft_loss": 1.3326706886291504, "step": 2555 }, { "epoch": 1.3701287840776049, - "grad_norm": 10.310229691402508, - "learning_rate": 1.9796698160008187e-06, - "logits/chosen": -0.41603922843933105, - "logits/rejected": -0.30848008394241333, - "logps/chosen": -1.322511076927185, - "logps/rejected": -1.90826416015625, - "loss": 0.9483, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.322511076927185, - "rewards/margins": 0.5857528448104858, - "rewards/rejected": -1.90826416015625, - "sft_loss": 1.3534080982208252, + "grad_norm": 10.327413438865861, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": -0.12610045075416565, + "logits/rejected": -0.00018233135051559657, + "logps/chosen": -1.331601858139038, + "logps/rejected": -1.7389627695083618, + "loss": 1.0157, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.331601858139038, + "rewards/margins": 0.4073609709739685, + "rewards/rejected": -1.7389627695083618, + "sft_loss": 1.3417237997055054, "step": 2560 }, { "epoch": 1.3728048168590066, - "grad_norm": 7.437344286420298, - "learning_rate": 1.975240617226404e-06, - "logits/chosen": -0.3992313742637634, - "logits/rejected": -0.28970590233802795, - "logps/chosen": -1.2926113605499268, - "logps/rejected": -1.9368501901626587, - "loss": 0.958, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2926113605499268, - "rewards/margins": 0.6442388296127319, - "rewards/rejected": -1.9368501901626587, - "sft_loss": 1.3517730236053467, + "grad_norm": 12.337276545842748, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": -0.13059648871421814, + "logits/rejected": -0.0020403326489031315, + "logps/chosen": -1.3097715377807617, + "logps/rejected": -1.7724525928497314, + "loss": 1.019, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3097715377807617, + "rewards/margins": 0.4626809060573578, + "rewards/rejected": -1.7724525928497314, + "sft_loss": 1.346893072128296, "step": 2565 }, { "epoch": 1.375480849640408, - "grad_norm": 5.739484924634926, - "learning_rate": 1.9708068076654364e-06, - "logits/chosen": -0.3324227035045624, - "logits/rejected": -0.27540525794029236, - "logps/chosen": -1.2775561809539795, - "logps/rejected": -1.87930166721344, - "loss": 0.9508, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2775561809539795, - "rewards/margins": 0.6017455458641052, - "rewards/rejected": -1.87930166721344, - "sft_loss": 1.326965093612671, + "grad_norm": 6.504970472580104, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": -0.03162531182169914, + "logits/rejected": 0.038547057658433914, + "logps/chosen": -1.3186380863189697, + "logps/rejected": -1.7203162908554077, + "loss": 1.0231, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3186380863189697, + "rewards/margins": 0.4016784727573395, + "rewards/rejected": -1.7203162908554077, + "sft_loss": 1.3438231945037842, "step": 2570 }, { "epoch": 1.3781568824218096, - "grad_norm": 8.181676957715014, - "learning_rate": 1.966368430334756e-06, - "logits/chosen": -0.4677346348762512, - "logits/rejected": -0.3302808701992035, - "logps/chosen": -1.2933305501937866, - "logps/rejected": -1.8723223209381104, - "loss": 0.9541, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2933305501937866, - "rewards/margins": 0.5789917707443237, - "rewards/rejected": -1.8723223209381104, - "sft_loss": 1.3387445211410522, + "grad_norm": 6.770026584337929, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": -0.20331497490406036, + "logits/rejected": -0.05360947176814079, + "logps/chosen": -1.2827414274215698, + "logps/rejected": -1.6763532161712646, + "loss": 1.0076, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2827414274215698, + "rewards/margins": 0.393611878156662, + "rewards/rejected": -1.6763532161712646, + "sft_loss": 1.3209357261657715, "step": 2575 }, { "epoch": 1.3808329152032113, - "grad_norm": 10.306750512008252, - "learning_rate": 1.961925528295519e-06, - "logits/chosen": -0.41305112838745117, - "logits/rejected": -0.34868156909942627, - "logps/chosen": -1.3654184341430664, - "logps/rejected": -1.779240608215332, - "loss": 1.0189, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3654184341430664, - "rewards/margins": 0.4138219356536865, - "rewards/rejected": -1.779240608215332, - "sft_loss": 1.4515666961669922, + "grad_norm": 6.561231636640119, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": -0.13221685588359833, + "logits/rejected": -0.05130600929260254, + "logps/chosen": -1.366477131843567, + "logps/rejected": -1.6261088848114014, + "loss": 1.0784, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.366477131843567, + "rewards/margins": 0.2596319019794464, + "rewards/rejected": -1.6261088848114014, + "sft_loss": 1.4306046962738037, "step": 2580 }, { "epoch": 1.3835089479846128, - "grad_norm": 8.789090186645119, - "learning_rate": 1.9574781446527806e-06, - "logits/chosen": -0.3169155716896057, - "logits/rejected": -0.17200681567192078, - "logps/chosen": -1.277912974357605, - "logps/rejected": -1.8929014205932617, - "loss": 0.9165, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.277912974357605, - "rewards/margins": 0.6149882078170776, - "rewards/rejected": -1.8929014205932617, - "sft_loss": 1.3085170984268188, + "grad_norm": 8.893681756902707, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": -0.035487428307533264, + "logits/rejected": 0.130559042096138, + "logps/chosen": -1.2470409870147705, + "logps/rejected": -1.6920645236968994, + "loss": 0.9589, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2470409870147705, + "rewards/margins": 0.4450235962867737, + "rewards/rejected": -1.6920645236968994, + "sft_loss": 1.2662039995193481, "step": 2585 }, { "epoch": 1.3861849807660143, - "grad_norm": 12.34770292195211, - "learning_rate": 1.9530263225550765e-06, - "logits/chosen": -0.43816858530044556, - "logits/rejected": -0.31687816977500916, - "logps/chosen": -1.2946866750717163, - "logps/rejected": -1.8282054662704468, - "loss": 0.9975, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.2946866750717163, - "rewards/margins": 0.5335186719894409, - "rewards/rejected": -1.8282054662704468, - "sft_loss": 1.4057872295379639, + "grad_norm": 8.02611117109066, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": -0.15100671350955963, + "logits/rejected": -0.009929725900292397, + "logps/chosen": -1.280668020248413, + "logps/rejected": -1.6584885120391846, + "loss": 1.0392, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.280668020248413, + "rewards/margins": 0.37782055139541626, + "rewards/rejected": -1.6584885120391846, + "sft_loss": 1.3776158094406128, "step": 2590 }, { "epoch": 1.388861013547416, - "grad_norm": 8.534590340266245, - "learning_rate": 1.9485701051940037e-06, - "logits/chosen": -0.4214317202568054, - "logits/rejected": -0.3847965598106384, - "logps/chosen": -1.329071283340454, - "logps/rejected": -1.8042049407958984, - "loss": 0.9922, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.329071283340454, - "rewards/margins": 0.4751337468624115, - "rewards/rejected": -1.8042049407958984, - "sft_loss": 1.3613306283950806, + "grad_norm": 6.564305665115609, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": -0.08805367350578308, + "logits/rejected": -0.04162890464067459, + "logps/chosen": -1.300323486328125, + "logps/rejected": -1.6258924007415771, + "loss": 1.0244, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.300323486328125, + "rewards/margins": 0.325569212436676, + "rewards/rejected": -1.6258924007415771, + "sft_loss": 1.3119922876358032, "step": 2595 }, { "epoch": 1.3915370463288175, - "grad_norm": 8.168860666279095, - "learning_rate": 1.9441095358038035e-06, - "logits/chosen": -0.3514612019062042, - "logits/rejected": -0.2528729736804962, - "logps/chosen": -1.3265708684921265, - "logps/rejected": -1.7354872226715088, - "loss": 1.0077, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3265708684921265, - "rewards/margins": 0.40891632437705994, - "rewards/rejected": -1.7354872226715088, - "sft_loss": 1.3667696714401245, + "grad_norm": 8.780559757186948, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": -0.02691541239619255, + "logits/rejected": 0.10557906329631805, + "logps/chosen": -1.322824239730835, + "logps/rejected": -1.562229871749878, + "loss": 1.062, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.322824239730835, + "rewards/margins": 0.2394055873155594, + "rewards/rejected": -1.562229871749878, + "sft_loss": 1.3450852632522583, "step": 2600 }, { "epoch": 1.394213079110219, - "grad_norm": 12.852456295482222, - "learning_rate": 1.9396446576609387e-06, - "logits/chosen": -0.36365336179733276, - "logits/rejected": -0.33231550455093384, - "logps/chosen": -1.2981659173965454, - "logps/rejected": -1.7442443370819092, - "loss": 0.9804, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2981659173965454, - "rewards/margins": 0.4460783898830414, - "rewards/rejected": -1.7442443370819092, - "sft_loss": 1.3654030561447144, + "grad_norm": 9.368043742615306, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": -0.050204623490571976, + "logits/rejected": -0.002213549567386508, + "logps/chosen": -1.3146531581878662, + "logps/rejected": -1.6026958227157593, + "loss": 1.0455, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3146531581878662, + "rewards/margins": 0.28804266452789307, + "rewards/rejected": -1.6026958227157593, + "sft_loss": 1.3670458793640137, "step": 2605 }, { "epoch": 1.3968891118916207, - "grad_norm": 9.120917157459237, - "learning_rate": 1.935175514083677e-06, - "logits/chosen": -0.3523769676685333, - "logits/rejected": -0.3050524890422821, - "logps/chosen": -1.3515031337738037, - "logps/rejected": -1.8546243906021118, - "loss": 1.0265, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -1.3515031337738037, - "rewards/margins": 0.503121018409729, - "rewards/rejected": -1.8546243906021118, - "sft_loss": 1.399251937866211, + "grad_norm": 10.781070285392934, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": 0.007836557924747467, + "logits/rejected": 0.09162791818380356, + "logps/chosen": -1.3540400266647339, + "logps/rejected": -1.6941732168197632, + "loss": 1.084, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3540400266647339, + "rewards/margins": 0.3401332199573517, + "rewards/rejected": -1.6941732168197632, + "sft_loss": 1.3959096670150757, "step": 2610 }, { "epoch": 1.3995651446730222, - "grad_norm": 13.956277485278768, - "learning_rate": 1.9307021484316693e-06, - "logits/chosen": -0.43114447593688965, - "logits/rejected": -0.3138170838356018, - "logps/chosen": -1.2429790496826172, - "logps/rejected": -1.8580067157745361, - "loss": 0.9582, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2429790496826172, - "rewards/margins": 0.615027666091919, - "rewards/rejected": -1.8580067157745361, - "sft_loss": 1.309478998184204, + "grad_norm": 18.03243421842275, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": -0.14814485609531403, + "logits/rejected": -0.009653128683567047, + "logps/chosen": -1.2639660835266113, + "logps/rejected": -1.7054649591445923, + "loss": 1.0177, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2639660835266113, + "rewards/margins": 0.4414988160133362, + "rewards/rejected": -1.7054649591445923, + "sft_loss": 1.331084132194519, "step": 2615 }, { "epoch": 1.402241177454424, - "grad_norm": 6.04682907749214, - "learning_rate": 1.926224604105529e-06, - "logits/chosen": -0.46197718381881714, - "logits/rejected": -0.48134127259254456, - "logps/chosen": -1.3629053831100464, - "logps/rejected": -1.7455739974975586, - "loss": 1.0485, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -1.3629053831100464, - "rewards/margins": 0.38266849517822266, - "rewards/rejected": -1.7455739974975586, - "sft_loss": 1.4094994068145752, + "grad_norm": 8.824553369123494, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": -0.13758358359336853, + "logits/rejected": -0.13457614183425903, + "logps/chosen": -1.3871793746948242, + "logps/rejected": -1.588173270225525, + "loss": 1.1179, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.3871793746948242, + "rewards/margins": 0.20099392533302307, + "rewards/rejected": -1.588173270225525, + "sft_loss": 1.4249435663223267, "step": 2620 }, { "epoch": 1.4049172102358254, - "grad_norm": 15.757296282890119, - "learning_rate": 1.92174292454641e-06, - "logits/chosen": -0.4499499201774597, - "logits/rejected": -0.3384786546230316, - "logps/chosen": -1.3081551790237427, - "logps/rejected": -1.918835997581482, - "loss": 0.9591, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3081551790237427, - "rewards/margins": 0.6106808185577393, - "rewards/rejected": -1.918835997581482, - "sft_loss": 1.3155691623687744, + "grad_norm": 9.777034551382114, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": -0.0834670439362526, + "logits/rejected": 0.0681837722659111, + "logps/chosen": -1.333077073097229, + "logps/rejected": -1.6984277963638306, + "loss": 1.027, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.333077073097229, + "rewards/margins": 0.3653508424758911, + "rewards/rejected": -1.6984277963638306, + "sft_loss": 1.3320882320404053, "step": 2625 }, { "epoch": 1.4075932430172269, - "grad_norm": 6.048744754749274, - "learning_rate": 1.917257153235587e-06, - "logits/chosen": -0.6204741597175598, - "logits/rejected": -0.45593032240867615, - "logps/chosen": -1.3463962078094482, - "logps/rejected": -1.8253018856048584, - "loss": 1.0093, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3463962078094482, - "rewards/margins": 0.4789056181907654, - "rewards/rejected": -1.8253018856048584, - "sft_loss": 1.3738082647323608, + "grad_norm": 9.18561160006191, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": -0.24002377688884735, + "logits/rejected": -0.04056672379374504, + "logps/chosen": -1.3651529550552368, + "logps/rejected": -1.670501708984375, + "loss": 1.0717, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3651529550552368, + "rewards/margins": 0.30534881353378296, + "rewards/rejected": -1.670501708984375, + "sft_loss": 1.3859248161315918, "step": 2630 }, { "epoch": 1.4102692757986286, - "grad_norm": 9.04564590360611, - "learning_rate": 1.9127673336940335e-06, - "logits/chosen": -0.48332133889198303, - "logits/rejected": -0.42709770798683167, - "logps/chosen": -1.305422067642212, - "logps/rejected": -1.817214012145996, - "loss": 0.995, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.305422067642212, - "rewards/margins": 0.5117920637130737, - "rewards/rejected": -1.817214012145996, - "sft_loss": 1.3502801656723022, + "grad_norm": 18.481416157033973, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": -0.1223798543214798, + "logits/rejected": -0.030973097309470177, + "logps/chosen": -1.3061634302139282, + "logps/rejected": -1.631188988685608, + "loss": 1.038, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3061634302139282, + "rewards/margins": 0.325025349855423, + "rewards/rejected": -1.631188988685608, + "sft_loss": 1.3667783737182617, "step": 2635 }, { "epoch": 1.41294530858003, - "grad_norm": 5.615734265148532, - "learning_rate": 1.908273509481998e-06, - "logits/chosen": -0.4106292724609375, - "logits/rejected": -0.3586362898349762, - "logps/chosen": -1.3568456172943115, - "logps/rejected": -1.8621975183486938, - "loss": 0.9954, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3568456172943115, - "rewards/margins": 0.5053519010543823, - "rewards/rejected": -1.8621975183486938, - "sft_loss": 1.3644628524780273, + "grad_norm": 5.841523887577847, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": -0.08212809264659882, + "logits/rejected": -0.0031019255984574556, + "logps/chosen": -1.3627724647521973, + "logps/rejected": -1.6993802785873413, + "loss": 1.0494, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3627724647521973, + "rewards/margins": 0.3366078734397888, + "rewards/rejected": -1.6993802785873413, + "sft_loss": 1.36763596534729, "step": 2640 }, { "epoch": 1.4156213413614318, - "grad_norm": 7.975761092606305, - "learning_rate": 1.9037757241985832e-06, - "logits/chosen": -0.44296973943710327, - "logits/rejected": -0.38010460138320923, - "logps/chosen": -1.293700098991394, - "logps/rejected": -1.8513076305389404, - "loss": 0.9578, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.293700098991394, - "rewards/margins": 0.5576077699661255, - "rewards/rejected": -1.8513076305389404, - "sft_loss": 1.314415693283081, + "grad_norm": 9.005289663929863, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": -0.12843796610832214, + "logits/rejected": -0.047466933727264404, + "logps/chosen": -1.3065561056137085, + "logps/rejected": -1.6848160028457642, + "loss": 1.0111, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3065561056137085, + "rewards/margins": 0.37825995683670044, + "rewards/rejected": -1.6848160028457642, + "sft_loss": 1.3081876039505005, "step": 2645 }, { "epoch": 1.4182973741428333, - "grad_norm": 9.470303618697493, - "learning_rate": 1.899274021481321e-06, - "logits/chosen": -0.5351869463920593, - "logits/rejected": -0.39628365635871887, - "logps/chosen": -1.3325697183609009, - "logps/rejected": -2.1645429134368896, - "loss": 0.9519, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3325697183609009, - "rewards/margins": 0.8319734334945679, - "rewards/rejected": -2.1645429134368896, - "sft_loss": 1.3672149181365967, + "grad_norm": 5.470887175371584, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": -0.18649035692214966, + "logits/rejected": -0.03194127231836319, + "logps/chosen": -1.3147566318511963, + "logps/rejected": -1.8917913436889648, + "loss": 0.9953, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3147566318511963, + "rewards/margins": 0.5770348310470581, + "rewards/rejected": -1.8917913436889648, + "sft_loss": 1.352008581161499, "step": 2650 }, { "epoch": 1.4209734069242348, - "grad_norm": 8.118243747697562, - "learning_rate": 1.8947684450057516e-06, - "logits/chosen": -0.4836540222167969, - "logits/rejected": -0.3647536039352417, - "logps/chosen": -1.2214539051055908, - "logps/rejected": -1.8250150680541992, - "loss": 0.9029, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2214539051055908, - "rewards/margins": 0.6035611629486084, - "rewards/rejected": -1.8250150680541992, - "sft_loss": 1.2677587270736694, + "grad_norm": 9.071697896178694, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": -0.10353100299835205, + "logits/rejected": 0.05252040550112724, + "logps/chosen": -1.2148979902267456, + "logps/rejected": -1.5812180042266846, + "loss": 0.9736, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2148979902267456, + "rewards/margins": 0.3663199543952942, + "rewards/rejected": -1.5812180042266846, + "sft_loss": 1.262518286705017, "step": 2655 }, { "epoch": 1.4236494397056365, - "grad_norm": 8.409738270885674, - "learning_rate": 1.890259038484997e-06, - "logits/chosen": -0.4491191804409027, - "logits/rejected": -0.431225061416626, - "logps/chosen": -1.2447589635849, - "logps/rejected": -1.9232664108276367, - "loss": 0.927, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2447589635849, - "rewards/margins": 0.678507387638092, - "rewards/rejected": -1.9232664108276367, - "sft_loss": 1.2517122030258179, + "grad_norm": 12.04479228633977, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": -0.03726924955844879, + "logits/rejected": 0.014096438884735107, + "logps/chosen": -1.2449657917022705, + "logps/rejected": -1.7064529657363892, + "loss": 0.9874, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2449657917022705, + "rewards/margins": 0.4614872336387634, + "rewards/rejected": -1.7064529657363892, + "sft_loss": 1.2565490007400513, "step": 2660 }, { "epoch": 1.426325472487038, - "grad_norm": 6.0457132942542335, - "learning_rate": 1.8857458456693398e-06, - "logits/chosen": -0.5236254334449768, - "logits/rejected": -0.4272289276123047, - "logps/chosen": -1.3596875667572021, - "logps/rejected": -1.9714336395263672, - "loss": 0.9883, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3596875667572021, - "rewards/margins": 0.6117460131645203, - "rewards/rejected": -1.9714336395263672, - "sft_loss": 1.4770348072052002, + "grad_norm": 7.666946249067871, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": -0.18409328162670135, + "logits/rejected": -0.044957343488931656, + "logps/chosen": -1.3527686595916748, + "logps/rejected": -1.743014931678772, + "loss": 1.0477, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3527686595916748, + "rewards/margins": 0.39024627208709717, + "rewards/rejected": -1.743014931678772, + "sft_loss": 1.468995213508606, "step": 2665 }, { "epoch": 1.4290015052684395, - "grad_norm": 11.462157523408354, - "learning_rate": 1.881228910345796e-06, - "logits/chosen": -0.4460994601249695, - "logits/rejected": -0.3811623454093933, - "logps/chosen": -1.4243602752685547, - "logps/rejected": -1.946854591369629, - "loss": 1.0049, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4243602752685547, - "rewards/margins": 0.5224944949150085, - "rewards/rejected": -1.946854591369629, - "sft_loss": 1.4408633708953857, + "grad_norm": 10.556427345967442, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": -0.06500004231929779, + "logits/rejected": 0.026881689205765724, + "logps/chosen": -1.4370088577270508, + "logps/rejected": -1.703037977218628, + "loss": 1.091, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4370088577270508, + "rewards/margins": 0.2660290002822876, + "rewards/rejected": -1.703037977218628, + "sft_loss": 1.4438323974609375, "step": 2670 }, { "epoch": 1.4316775380498412, - "grad_norm": 12.819377325527697, - "learning_rate": 1.8767082763376916e-06, - "logits/chosen": -0.49858832359313965, - "logits/rejected": -0.377076655626297, - "logps/chosen": -1.3824807405471802, - "logps/rejected": -1.892433762550354, - "loss": 0.9976, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3824807405471802, - "rewards/margins": 0.509952962398529, - "rewards/rejected": -1.892433762550354, - "sft_loss": 1.3168888092041016, + "grad_norm": 12.558739727033354, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": -0.13178256154060364, + "logits/rejected": 0.031609587371349335, + "logps/chosen": -1.3780395984649658, + "logps/rejected": -1.6814756393432617, + "loss": 1.0411, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3780395984649658, + "rewards/margins": 0.3034361004829407, + "rewards/rejected": -1.6814756393432617, + "sft_loss": 1.2921960353851318, "step": 2675 }, { "epoch": 1.4343535708312427, - "grad_norm": 9.435587493834031, - "learning_rate": 1.8721839875042386e-06, - "logits/chosen": -0.5917202830314636, - "logits/rejected": -0.4736374318599701, - "logps/chosen": -1.347184419631958, - "logps/rejected": -1.8652584552764893, - "loss": 1.002, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.347184419631958, - "rewards/margins": 0.5180739164352417, - "rewards/rejected": -1.8652584552764893, - "sft_loss": 1.4115911722183228, + "grad_norm": 10.25144106346559, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": -0.17347553372383118, + "logits/rejected": -0.004042397253215313, + "logps/chosen": -1.3607873916625977, + "logps/rejected": -1.6869118213653564, + "loss": 1.0642, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3607873916625977, + "rewards/margins": 0.3261243999004364, + "rewards/rejected": -1.6869118213653564, + "sft_loss": 1.4142537117004395, "step": 2680 }, { "epoch": 1.4370296036126442, - "grad_norm": 7.706508747987979, - "learning_rate": 1.8676560877401062e-06, - "logits/chosen": -0.5961927175521851, - "logits/rejected": -0.465129554271698, - "logps/chosen": -1.323976755142212, - "logps/rejected": -1.871092438697815, - "loss": 0.9549, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.323976755142212, - "rewards/margins": 0.5471157431602478, - "rewards/rejected": -1.871092438697815, - "sft_loss": 1.3681257963180542, + "grad_norm": 8.068433911299971, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": -0.1763458549976349, + "logits/rejected": 0.03705815225839615, + "logps/chosen": -1.3179851770401, + "logps/rejected": -1.624053716659546, + "loss": 1.0269, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3179851770401, + "rewards/margins": 0.3060687780380249, + "rewards/rejected": -1.624053716659546, + "sft_loss": 1.3464586734771729, "step": 2685 }, { "epoch": 1.439705636394046, - "grad_norm": 63.92300829705635, - "learning_rate": 1.8631246209749982e-06, - "logits/chosen": -0.7190247774124146, - "logits/rejected": -0.5581813454627991, - "logps/chosen": -1.3239647150039673, - "logps/rejected": -2.063500165939331, - "loss": 0.9355, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3239647150039673, - "rewards/margins": 0.7395354509353638, - "rewards/rejected": -2.063500165939331, - "sft_loss": 1.3667714595794678, + "grad_norm": 21.85675698669309, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": -0.30096036195755005, + "logits/rejected": -0.06160721182823181, + "logps/chosen": -1.345220685005188, + "logps/rejected": -1.830958604812622, + "loss": 1.0149, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.345220685005188, + "rewards/margins": 0.48573771119117737, + "rewards/rejected": -1.830958604812622, + "sft_loss": 1.3653134107589722, "step": 2690 }, { "epoch": 1.4423816691754474, - "grad_norm": 7.07593824509397, - "learning_rate": 1.8585896311732247e-06, - "logits/chosen": -0.5935165882110596, - "logits/rejected": -0.5888763070106506, - "logps/chosen": -1.3361377716064453, - "logps/rejected": -1.9687163829803467, - "loss": 0.9731, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3361377716064453, - "rewards/margins": 0.6325784921646118, - "rewards/rejected": -1.9687163829803467, - "sft_loss": 1.3577778339385986, + "grad_norm": 12.176114443779284, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": -0.0862717255949974, + "logits/rejected": -0.054527319967746735, + "logps/chosen": -1.33372962474823, + "logps/rejected": -1.7393748760223389, + "loss": 1.0235, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.33372962474823, + "rewards/margins": 0.4056454300880432, + "rewards/rejected": -1.7393748760223389, + "sft_loss": 1.3457729816436768, "step": 2695 }, { "epoch": 1.445057701956849, - "grad_norm": 8.352588214209106, - "learning_rate": 1.854051162333277e-06, - "logits/chosen": -0.5560085773468018, - "logits/rejected": -0.4076048731803894, - "logps/chosen": -1.3173125982284546, - "logps/rejected": -1.8631242513656616, - "loss": 0.9914, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3173125982284546, - "rewards/margins": 0.5458115935325623, - "rewards/rejected": -1.8631242513656616, - "sft_loss": 1.4011826515197754, + "grad_norm": 9.070582268020733, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": -0.16579201817512512, + "logits/rejected": 0.02601494826376438, + "logps/chosen": -1.3615131378173828, + "logps/rejected": -1.7076470851898193, + "loss": 1.0667, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3615131378173828, + "rewards/margins": 0.34613385796546936, + "rewards/rejected": -1.7076470851898193, + "sft_loss": 1.4235488176345825, "step": 2700 }, { "epoch": 1.4477337347382506, - "grad_norm": 6.70826330315372, - "learning_rate": 1.8495092584873992e-06, - "logits/chosen": -0.571670413017273, - "logits/rejected": -0.4206954538822174, - "logps/chosen": -1.2080904245376587, - "logps/rejected": -1.8981767892837524, - "loss": 0.8717, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2080904245376587, - "rewards/margins": 0.6900863647460938, - "rewards/rejected": -1.8981767892837524, - "sft_loss": 1.2135100364685059, + "grad_norm": 5.839713039735426, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": -0.1995636522769928, + "logits/rejected": 0.01864362321794033, + "logps/chosen": -1.218213677406311, + "logps/rejected": -1.7912061214447021, + "loss": 0.918, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.218213677406311, + "rewards/margins": 0.5729925036430359, + "rewards/rejected": -1.7912061214447021, + "sft_loss": 1.2121398448944092, "step": 2705 }, { "epoch": 1.4504097675196521, - "grad_norm": 6.540141215860256, - "learning_rate": 1.844963963701163e-06, - "logits/chosen": -0.49980098009109497, - "logits/rejected": -0.4846612811088562, - "logps/chosen": -1.3230160474777222, - "logps/rejected": -1.87786066532135, - "loss": 0.9543, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3230160474777222, - "rewards/margins": 0.5548445582389832, - "rewards/rejected": -1.87786066532135, - "sft_loss": 1.3326829671859741, + "grad_norm": 9.179052304791725, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": -0.09858128428459167, + "logits/rejected": -0.066671222448349, + "logps/chosen": -1.3320286273956299, + "logps/rejected": -1.705482840538025, + "loss": 1.0185, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3320286273956299, + "rewards/margins": 0.37345418334007263, + "rewards/rejected": -1.705482840538025, + "sft_loss": 1.3238738775253296, "step": 2710 }, { "epoch": 1.4530858003010536, - "grad_norm": 8.881381528164832, - "learning_rate": 1.8404153220730383e-06, - "logits/chosen": -0.6166124939918518, - "logits/rejected": -0.5608782172203064, - "logps/chosen": -1.2661406993865967, - "logps/rejected": -1.8450348377227783, - "loss": 0.9832, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2661406993865967, - "rewards/margins": 0.5788939595222473, - "rewards/rejected": -1.8450348377227783, - "sft_loss": 1.3628873825073242, + "grad_norm": 5.483634729918384, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": -0.2557533383369446, + "logits/rejected": -0.15631580352783203, + "logps/chosen": -1.2451080083847046, + "logps/rejected": -1.5999696254730225, + "loss": 1.0356, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2451080083847046, + "rewards/margins": 0.35486167669296265, + "rewards/rejected": -1.5999696254730225, + "sft_loss": 1.3500633239746094, "step": 2715 }, { "epoch": 1.4557618330824553, - "grad_norm": 6.028525689985978, - "learning_rate": 1.8358633777339654e-06, - "logits/chosen": -0.5961281061172485, - "logits/rejected": -0.5205580592155457, - "logps/chosen": -1.3284294605255127, - "logps/rejected": -1.8279927968978882, - "loss": 0.9702, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3284294605255127, - "rewards/margins": 0.4995633065700531, - "rewards/rejected": -1.8279927968978882, - "sft_loss": 1.330628752708435, + "grad_norm": 6.63302785728638, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": -0.18139012157917023, + "logits/rejected": -0.0644666850566864, + "logps/chosen": -1.298688530921936, + "logps/rejected": -1.5520808696746826, + "loss": 1.0355, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.298688530921936, + "rewards/margins": 0.2533922791481018, + "rewards/rejected": -1.5520808696746826, + "sft_loss": 1.297526478767395, "step": 2720 }, { "epoch": 1.4584378658638568, - "grad_norm": 10.231848785799206, - "learning_rate": 1.831308174846929e-06, - "logits/chosen": -0.5107966065406799, - "logits/rejected": -0.42333516478538513, - "logps/chosen": -1.3227458000183105, - "logps/rejected": -1.9830515384674072, - "loss": 0.9385, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3227458000183105, - "rewards/margins": 0.6603055000305176, - "rewards/rejected": -1.9830515384674072, - "sft_loss": 1.3296210765838623, + "grad_norm": 7.301750134692686, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": -0.1751355677843094, + "logits/rejected": -0.06084384769201279, + "logps/chosen": -1.2951709032058716, + "logps/rejected": -1.6943944692611694, + "loss": 1.004, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2951709032058716, + "rewards/margins": 0.399223655462265, + "rewards/rejected": -1.6943944692611694, + "sft_loss": 1.303188443183899, "step": 2725 }, { "epoch": 1.4611138986452583, - "grad_norm": 8.00344492894106, - "learning_rate": 1.826749757606527e-06, - "logits/chosen": -0.5675755739212036, - "logits/rejected": -0.4411475658416748, - "logps/chosen": -1.3341108560562134, - "logps/rejected": -2.089054822921753, - "loss": 0.9605, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3341108560562134, - "rewards/margins": 0.7549439668655396, - "rewards/rejected": -2.089054822921753, - "sft_loss": 1.3772116899490356, + "grad_norm": 7.705834050978145, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": -0.21830907464027405, + "logits/rejected": -0.04081263020634651, + "logps/chosen": -1.3108503818511963, + "logps/rejected": -1.7535419464111328, + "loss": 1.024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3108503818511963, + "rewards/margins": 0.44269147515296936, + "rewards/rejected": -1.7535419464111328, + "sft_loss": 1.3522942066192627, "step": 2730 }, { "epoch": 1.46378993142666, - "grad_norm": 6.2724946757219175, - "learning_rate": 1.8221881702385435e-06, - "logits/chosen": -0.5259631872177124, - "logits/rejected": -0.35373836755752563, - "logps/chosen": -1.2499545812606812, - "logps/rejected": -2.0189995765686035, - "loss": 0.9046, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2499545812606812, - "rewards/margins": 0.7690447568893433, - "rewards/rejected": -2.0189995765686035, - "sft_loss": 1.3529435396194458, + "grad_norm": 7.187849820236266, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": -0.19904953241348267, + "logits/rejected": -0.0006094604614190757, + "logps/chosen": -1.2227904796600342, + "logps/rejected": -1.7524545192718506, + "loss": 0.9557, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2227904796600342, + "rewards/margins": 0.5296639204025269, + "rewards/rejected": -1.7524545192718506, + "sft_loss": 1.3017102479934692, "step": 2735 }, { "epoch": 1.4664659642080615, - "grad_norm": 11.644048954737464, - "learning_rate": 1.8176234569995196e-06, - "logits/chosen": -0.5477747917175293, - "logits/rejected": -0.47055092453956604, - "logps/chosen": -1.346920371055603, - "logps/rejected": -2.2004103660583496, - "loss": 0.9552, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.346920371055603, - "rewards/margins": 0.8534899950027466, - "rewards/rejected": -2.2004103660583496, - "sft_loss": 1.3843533992767334, + "grad_norm": 10.480615331176718, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": -0.1879226267337799, + "logits/rejected": -0.06761939823627472, + "logps/chosen": -1.293309211730957, + "logps/rejected": -1.8878666162490845, + "loss": 0.984, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.293309211730957, + "rewards/margins": 0.5945574045181274, + "rewards/rejected": -1.8878666162490845, + "sft_loss": 1.3281826972961426, "step": 2740 }, { "epoch": 1.469141996989463, - "grad_norm": 7.688148681172665, - "learning_rate": 1.8130556621763223e-06, - "logits/chosen": -0.525569498538971, - "logits/rejected": -0.44252434372901917, - "logps/chosen": -1.305481195449829, - "logps/rejected": -1.9038594961166382, - "loss": 0.9807, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.305481195449829, - "rewards/margins": 0.5983783006668091, - "rewards/rejected": -1.9038594961166382, - "sft_loss": 1.3511950969696045, + "grad_norm": 6.39305835700618, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": -0.17131337523460388, + "logits/rejected": -0.03572530299425125, + "logps/chosen": -1.2976057529449463, + "logps/rejected": -1.6221458911895752, + "loss": 1.0387, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2976057529449463, + "rewards/margins": 0.32454022765159607, + "rewards/rejected": -1.6221458911895752, + "sft_loss": 1.3275409936904907, "step": 2745 }, { "epoch": 1.4718180297708647, - "grad_norm": 7.370258227670556, - "learning_rate": 1.808484830085718e-06, - "logits/chosen": -0.514094889163971, - "logits/rejected": -0.4376614987850189, - "logps/chosen": -1.4168999195098877, - "logps/rejected": -2.2200722694396973, - "loss": 0.9447, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.4168999195098877, - "rewards/margins": 0.8031722903251648, - "rewards/rejected": -2.2200722694396973, - "sft_loss": 1.4548296928405762, + "grad_norm": 6.729328891716704, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": -0.15203949809074402, + "logits/rejected": -0.045757196843624115, + "logps/chosen": -1.3872594833374023, + "logps/rejected": -1.879111886024475, + "loss": 1.0008, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3872594833374023, + "rewards/margins": 0.49185243248939514, + "rewards/rejected": -1.879111886024475, + "sft_loss": 1.4086813926696777, "step": 2750 }, { "epoch": 1.4744940625522662, - "grad_norm": 11.663556866456318, - "learning_rate": 1.8039110050739394e-06, - "logits/chosen": -0.46310463547706604, - "logits/rejected": -0.35408297181129456, - "logps/chosen": -1.3439658880233765, - "logps/rejected": -2.0157570838928223, - "loss": 0.9598, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3439658880233765, - "rewards/margins": 0.6717912554740906, - "rewards/rejected": -2.0157570838928223, - "sft_loss": 1.4041773080825806, + "grad_norm": 15.87398183068991, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": -0.11463092267513275, + "logits/rejected": 0.03227987512946129, + "logps/chosen": -1.310810923576355, + "logps/rejected": -1.6783673763275146, + "loss": 1.0297, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.310810923576355, + "rewards/margins": 0.36755651235580444, + "rewards/rejected": -1.6783673763275146, + "sft_loss": 1.366980791091919, "step": 2755 }, { "epoch": 1.4771700953336677, - "grad_norm": 7.113265265380697, - "learning_rate": 1.7993342315162563e-06, - "logits/chosen": -0.534980297088623, - "logits/rejected": -0.3727056384086609, - "logps/chosen": -1.3619086742401123, - "logps/rejected": -2.215651035308838, - "loss": 0.9169, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3619086742401123, - "rewards/margins": 0.8537423014640808, - "rewards/rejected": -2.215651035308838, - "sft_loss": 1.3776787519454956, + "grad_norm": 7.751583510401781, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": -0.2286381721496582, + "logits/rejected": -0.05059467628598213, + "logps/chosen": -1.3588345050811768, + "logps/rejected": -1.907141923904419, + "loss": 0.9799, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3588345050811768, + "rewards/margins": 0.5483072996139526, + "rewards/rejected": -1.907141923904419, + "sft_loss": 1.3552730083465576, "step": 2760 }, { "epoch": 1.4798461281150694, - "grad_norm": 8.036413528086007, - "learning_rate": 1.794754553816546e-06, - "logits/chosen": -0.43422263860702515, - "logits/rejected": -0.3120557367801666, - "logps/chosen": -1.3172380924224854, - "logps/rejected": -1.9721095561981201, - "loss": 0.9269, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3172380924224854, - "rewards/margins": 0.6548714637756348, - "rewards/rejected": -1.9721095561981201, - "sft_loss": 1.378082036972046, + "grad_norm": 15.233468290648068, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": -0.14470013976097107, + "logits/rejected": -0.002064927713945508, + "logps/chosen": -1.3128535747528076, + "logps/rejected": -1.6920738220214844, + "loss": 1.004, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3128535747528076, + "rewards/margins": 0.37922030687332153, + "rewards/rejected": -1.6920738220214844, + "sft_loss": 1.3641343116760254, "step": 2765 }, { "epoch": 1.482522160896471, - "grad_norm": 7.781275539577071, - "learning_rate": 1.7901720164068623e-06, - "logits/chosen": -0.5075265169143677, - "logits/rejected": -0.4491181969642639, - "logps/chosen": -1.2856837511062622, - "logps/rejected": -1.723960518836975, - "loss": 1.0079, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2856837511062622, - "rewards/margins": 0.43827691674232483, - "rewards/rejected": -1.723960518836975, - "sft_loss": 1.348190188407898, + "grad_norm": 8.056494737856939, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": -0.19823172688484192, + "logits/rejected": -0.12065769731998444, + "logps/chosen": -1.2947113513946533, + "logps/rejected": -1.5259299278259277, + "loss": 1.068, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2947113513946533, + "rewards/margins": 0.23121850192546844, + "rewards/rejected": -1.5259299278259277, + "sft_loss": 1.3387796878814697, "step": 2770 }, { "epoch": 1.4851981936778724, - "grad_norm": 13.606802860849392, - "learning_rate": 1.7855866637470027e-06, - "logits/chosen": -0.4231862425804138, - "logits/rejected": -0.39519035816192627, - "logps/chosen": -1.2925583124160767, - "logps/rejected": -1.953325629234314, - "loss": 0.963, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2925583124160767, - "rewards/margins": 0.6607673764228821, - "rewards/rejected": -1.953325629234314, - "sft_loss": 1.3326704502105713, + "grad_norm": 7.755205256691052, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": -0.15351326763629913, + "logits/rejected": -0.10742366313934326, + "logps/chosen": -1.2829701900482178, + "logps/rejected": -1.7134536504745483, + "loss": 1.0134, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2829701900482178, + "rewards/margins": 0.4304834008216858, + "rewards/rejected": -1.7134536504745483, + "sft_loss": 1.3123705387115479, "step": 2775 }, { "epoch": 1.4878742264592741, - "grad_norm": 7.471840777920316, - "learning_rate": 1.780998540324079e-06, - "logits/chosen": -0.41194334626197815, - "logits/rejected": -0.31532105803489685, - "logps/chosen": -1.4386813640594482, - "logps/rejected": -1.9639618396759033, - "loss": 1.0387, - "rewards/accuracies": 0.59375, - "rewards/chosen": -1.4386813640594482, - "rewards/margins": 0.5252804756164551, - "rewards/rejected": -1.9639618396759033, - "sft_loss": 1.4364547729492188, + "grad_norm": 6.88612596291791, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": -0.14032743871212006, + "logits/rejected": -0.03774075582623482, + "logps/chosen": -1.4290128946304321, + "logps/rejected": -1.7514575719833374, + "loss": 1.0777, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4290128946304321, + "rewards/margins": 0.3224448561668396, + "rewards/rejected": -1.7514575719833374, + "sft_loss": 1.4194713830947876, "step": 2780 }, { "epoch": 1.4905502592406756, - "grad_norm": 8.230403778332743, - "learning_rate": 1.776407690652084e-06, - "logits/chosen": -0.4215714931488037, - "logits/rejected": -0.2935768961906433, - "logps/chosen": -1.4041473865509033, - "logps/rejected": -2.048205852508545, - "loss": 1.0061, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.4041473865509033, - "rewards/margins": 0.6440584063529968, - "rewards/rejected": -2.048205852508545, - "sft_loss": 1.4206860065460205, + "grad_norm": 13.416853169884268, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": -0.226902037858963, + "logits/rejected": -0.09653354436159134, + "logps/chosen": -1.400376558303833, + "logps/rejected": -1.8006101846694946, + "loss": 1.0668, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.400376558303833, + "rewards/margins": 0.40023356676101685, + "rewards/rejected": -1.8006101846694946, + "sft_loss": 1.423305869102478, "step": 2785 }, { "epoch": 1.4932262920220774, - "grad_norm": 9.35742961252276, - "learning_rate": 1.7718141592714628e-06, - "logits/chosen": -0.33215445280075073, - "logits/rejected": -0.3602014482021332, - "logps/chosen": -1.2948368787765503, - "logps/rejected": -1.8967326879501343, - "loss": 1.0093, + "grad_norm": 8.481357085434302, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": -0.13754570484161377, + "logits/rejected": -0.15755626559257507, + "logps/chosen": -1.2808914184570312, + "logps/rejected": -1.6580407619476318, + "loss": 1.0402, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2948368787765503, - "rewards/margins": 0.6018956899642944, - "rewards/rejected": -1.8967326879501343, - "sft_loss": 1.389012098312378, + "rewards/chosen": -1.2808914184570312, + "rewards/margins": 0.37714946269989014, + "rewards/rejected": -1.6580407619476318, + "sft_loss": 1.3839619159698486, "step": 2790 }, { "epoch": 1.4959023248034788, - "grad_norm": 6.029347217384856, - "learning_rate": 1.7672179907486757e-06, - "logits/chosen": -0.24997854232788086, - "logits/rejected": -0.2451924830675125, - "logps/chosen": -1.2664806842803955, - "logps/rejected": -1.734628438949585, - "loss": 1.0014, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.2664806842803955, - "rewards/margins": 0.4681479036808014, - "rewards/rejected": -1.734628438949585, - "sft_loss": 1.297235369682312, + "grad_norm": 6.387669914543872, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": -0.027116578072309494, + "logits/rejected": -0.008474569767713547, + "logps/chosen": -1.242453932762146, + "logps/rejected": -1.5739244222640991, + "loss": 1.0197, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.242453932762146, + "rewards/margins": 0.3314705193042755, + "rewards/rejected": -1.5739244222640991, + "sft_loss": 1.29339599609375, "step": 2795 }, { "epoch": 1.4985783575848803, - "grad_norm": 7.690103127341859, - "learning_rate": 1.7626192296757708e-06, - "logits/chosen": -0.4005914628505707, - "logits/rejected": -0.3310944139957428, - "logps/chosen": -1.3489172458648682, - "logps/rejected": -1.814736008644104, - "loss": 1.0333, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3489172458648682, - "rewards/margins": 0.46581873297691345, - "rewards/rejected": -1.814736008644104, - "sft_loss": 1.426548719406128, + "grad_norm": 19.1852011695506, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": -0.21790878474712372, + "logits/rejected": -0.13022013008594513, + "logps/chosen": -1.415160894393921, + "logps/rejected": -1.7492616176605225, + "loss": 1.1144, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.415160894393921, + "rewards/margins": 0.334100604057312, + "rewards/rejected": -1.7492616176605225, + "sft_loss": 1.4745256900787354, "step": 2800 }, { "epoch": 1.4985783575848803, - "eval_logits/chosen": -0.08405511826276779, - "eval_logits/rejected": -0.004597527906298637, - "eval_logps/chosen": -1.3793412446975708, - "eval_logps/rejected": -1.7730538845062256, - "eval_loss": 1.04581880569458, - "eval_rewards/accuracies": 0.6127596497535706, - "eval_rewards/chosen": -1.3793412446975708, - "eval_rewards/margins": 0.3937126696109772, - "eval_rewards/rejected": -1.7730538845062256, - "eval_runtime": 42.9279, - "eval_samples_per_second": 31.332, - "eval_sft_loss": 1.3972686529159546, - "eval_steps_per_second": 7.85, + "eval_logits/chosen": 0.1278160661458969, + "eval_logits/rejected": 0.21500340104103088, + "eval_logps/chosen": -1.3652279376983643, + "eval_logps/rejected": -1.7446658611297607, + "eval_loss": 1.04434072971344, + "eval_rewards/accuracies": 0.610534131526947, + "eval_rewards/chosen": -1.3652279376983643, + "eval_rewards/margins": 0.37943780422210693, + "eval_rewards/rejected": -1.7446658611297607, + "eval_runtime": 43.4312, + "eval_samples_per_second": 30.969, + "eval_sft_loss": 1.3886722326278687, + "eval_steps_per_second": 7.759, "step": 2800 }, { "epoch": 1.5012543903662818, - "grad_norm": 5.2232884918186375, - "learning_rate": 1.7580179206699475e-06, - "logits/chosen": -0.4917038381099701, - "logits/rejected": -0.3383350968360901, - "logps/chosen": -1.1643739938735962, - "logps/rejected": -1.6991310119628906, - "loss": 0.932, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.1643739938735962, - "rewards/margins": 0.5347572565078735, - "rewards/rejected": -1.6991310119628906, - "sft_loss": 1.247200846672058, + "grad_norm": 6.648291877984254, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": -0.3065665364265442, + "logits/rejected": -0.15820252895355225, + "logps/chosen": -1.1917800903320312, + "logps/rejected": -1.6429237127304077, + "loss": 0.9691, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1917800903320312, + "rewards/margins": 0.4511435925960541, + "rewards/rejected": -1.6429237127304077, + "sft_loss": 1.2733690738677979, "step": 2805 }, { "epoch": 1.5039304231476835, - "grad_norm": 9.802894598065583, - "learning_rate": 1.7534141083731262e-06, - "logits/chosen": -0.3784298896789551, - "logits/rejected": -0.3399508595466614, - "logps/chosen": -1.3304203748703003, - "logps/rejected": -1.8287245035171509, - "loss": 0.9993, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3304203748703003, - "rewards/margins": 0.49830397963523865, - "rewards/rejected": -1.8287245035171509, - "sft_loss": 1.4084551334381104, + "grad_norm": 11.874270050708041, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": -0.184895321726799, + "logits/rejected": -0.14258211851119995, + "logps/chosen": -1.3601287603378296, + "logps/rejected": -1.7205755710601807, + "loss": 1.0659, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3601287603378296, + "rewards/margins": 0.3604467213153839, + "rewards/rejected": -1.7205755710601807, + "sft_loss": 1.4537384510040283, "step": 2810 }, { "epoch": 1.5066064559290853, - "grad_norm": 7.710213953120398, - "learning_rate": 1.7488078374515143e-06, - "logits/chosen": -0.36453741788864136, - "logits/rejected": -0.25610360503196716, - "logps/chosen": -1.3093892335891724, - "logps/rejected": -1.9001718759536743, - "loss": 0.9638, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3093892335891724, - "rewards/margins": 0.5907825231552124, - "rewards/rejected": -1.9001718759536743, - "sft_loss": 1.3398463726043701, + "grad_norm": 6.962778367607981, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": -0.09512137621641159, + "logits/rejected": 0.032009050250053406, + "logps/chosen": -1.3327128887176514, + "logps/rejected": -1.7304967641830444, + "loss": 1.0174, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3327128887176514, + "rewards/margins": 0.397784024477005, + "rewards/rejected": -1.7304967641830444, + "sft_loss": 1.3504347801208496, "step": 2815 }, { "epoch": 1.5092824887104868, - "grad_norm": 6.664089191337728, - "learning_rate": 1.7441991525951722e-06, - "logits/chosen": -0.4268978238105774, - "logits/rejected": -0.2702500820159912, - "logps/chosen": -1.282658576965332, - "logps/rejected": -1.73626708984375, - "loss": 0.9957, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.282658576965332, - "rewards/margins": 0.45360851287841797, - "rewards/rejected": -1.73626708984375, - "sft_loss": 1.3267815113067627, + "grad_norm": 6.566323185465078, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": -0.13119642436504364, + "logits/rejected": 0.04365091398358345, + "logps/chosen": -1.2991206645965576, + "logps/rejected": -1.5871660709381104, + "loss": 1.0471, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2991206645965576, + "rewards/margins": 0.2880452871322632, + "rewards/rejected": -1.5871660709381104, + "sft_loss": 1.3335323333740234, "step": 2820 }, { "epoch": 1.5119585214918883, - "grad_norm": 10.434741433534874, - "learning_rate": 1.7395880985175808e-06, - "logits/chosen": -0.5320969223976135, - "logits/rejected": -0.37741774320602417, - "logps/chosen": -1.3708407878875732, - "logps/rejected": -2.0818610191345215, - "loss": 0.952, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3708407878875732, - "rewards/margins": 0.7110201716423035, - "rewards/rejected": -2.0818610191345215, - "sft_loss": 1.3801392316818237, + "grad_norm": 9.310666619697258, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": -0.20145635306835175, + "logits/rejected": -0.02964537963271141, + "logps/chosen": -1.3808945417404175, + "logps/rejected": -1.8684800863265991, + "loss": 1.0127, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3808945417404175, + "rewards/margins": 0.4875854551792145, + "rewards/rejected": -1.8684800863265991, + "sft_loss": 1.3790462017059326, "step": 2825 }, { "epoch": 1.51463455427329, - "grad_norm": 7.419515705303618, - "learning_rate": 1.7349747199552063e-06, - "logits/chosen": -0.47449636459350586, - "logits/rejected": -0.35492879152297974, - "logps/chosen": -1.3637349605560303, - "logps/rejected": -1.8692903518676758, - "loss": 1.0242, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3637349605560303, - "rewards/margins": 0.5055556297302246, - "rewards/rejected": -1.8692903518676758, - "sft_loss": 1.4562511444091797, + "grad_norm": 7.469500192905114, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": -0.16935193538665771, + "logits/rejected": -0.0209256112575531, + "logps/chosen": -1.390914797782898, + "logps/rejected": -1.6186745166778564, + "loss": 1.108, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.390914797782898, + "rewards/margins": 0.22775951027870178, + "rewards/rejected": -1.6186745166778564, + "sft_loss": 1.451371431350708, "step": 2830 }, { "epoch": 1.5173105870546915, - "grad_norm": 10.065096311367334, - "learning_rate": 1.7303590616670683e-06, - "logits/chosen": -0.4715927243232727, - "logits/rejected": -0.3220441937446594, - "logps/chosen": -1.3339498043060303, - "logps/rejected": -1.9941116571426392, - "loss": 0.9525, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3339498043060303, - "rewards/margins": 0.6601617336273193, - "rewards/rejected": -1.9941116571426392, - "sft_loss": 1.3572701215744019, + "grad_norm": 7.823612798328366, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": -0.15821883082389832, + "logits/rejected": 0.004854840226471424, + "logps/chosen": -1.3072322607040405, + "logps/rejected": -1.6984754800796509, + "loss": 1.0165, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3072322607040405, + "rewards/margins": 0.3912431597709656, + "rewards/rejected": -1.6984754800796509, + "sft_loss": 1.3407630920410156, "step": 2835 }, { "epoch": 1.519986619836093, - "grad_norm": 7.483523830892962, - "learning_rate": 1.7257411684343042e-06, - "logits/chosen": -0.44677048921585083, - "logits/rejected": -0.3579447865486145, - "logps/chosen": -1.330339789390564, - "logps/rejected": -1.7873833179473877, - "loss": 1.0151, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.330339789390564, - "rewards/margins": 0.45704346895217896, - "rewards/rejected": -1.7873833179473877, - "sft_loss": 1.3790075778961182, + "grad_norm": 7.2601639097119275, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": -0.15000715851783752, + "logits/rejected": -0.042465973645448685, + "logps/chosen": -1.3483084440231323, + "logps/rejected": -1.6190989017486572, + "loss": 1.0708, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3483084440231323, + "rewards/margins": 0.27079030871391296, + "rewards/rejected": -1.6190989017486572, + "sft_loss": 1.3885855674743652, "step": 2840 }, { "epoch": 1.5226626526174947, - "grad_norm": 10.065579953099064, - "learning_rate": 1.7211210850597333e-06, - "logits/chosen": -0.42220425605773926, - "logits/rejected": -0.34389373660087585, - "logps/chosen": -1.3688390254974365, - "logps/rejected": -2.004911422729492, - "loss": 0.9812, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3688390254974365, - "rewards/margins": 0.6360724568367004, - "rewards/rejected": -2.004911422729492, - "sft_loss": 1.3310651779174805, + "grad_norm": 8.863189403136145, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": -0.13697829842567444, + "logits/rejected": -0.04790828004479408, + "logps/chosen": -1.3052732944488525, + "logps/rejected": -1.7480891942977905, + "loss": 1.0019, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3052732944488525, + "rewards/margins": 0.44281578063964844, + "rewards/rejected": -1.7480891942977905, + "sft_loss": 1.292776346206665, "step": 2845 }, { "epoch": 1.5253386853988962, - "grad_norm": 7.369649452814721, - "learning_rate": 1.7164988563674256e-06, - "logits/chosen": -0.4650971293449402, - "logits/rejected": -0.38579609990119934, - "logps/chosen": -1.3725297451019287, - "logps/rejected": -2.0686371326446533, - "loss": 1.0069, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.3725297451019287, - "rewards/margins": 0.6961073279380798, - "rewards/rejected": -2.0686371326446533, - "sft_loss": 1.3935930728912354, + "grad_norm": 8.138743922757627, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": -0.179933100938797, + "logits/rejected": -0.09679999947547913, + "logps/chosen": -1.3556548357009888, + "logps/rejected": -1.730666160583496, + "loss": 1.0524, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3556548357009888, + "rewards/margins": 0.3750113844871521, + "rewards/rejected": -1.730666160583496, + "sft_loss": 1.372294545173645, "step": 2850 }, { "epoch": 1.5280147181802977, - "grad_norm": 6.300272482544147, - "learning_rate": 1.7118745272022635e-06, - "logits/chosen": -0.48021286725997925, - "logits/rejected": -0.32814162969589233, - "logps/chosen": -1.402618169784546, - "logps/rejected": -1.969491958618164, - "loss": 0.9914, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.402618169784546, - "rewards/margins": 0.5668739080429077, - "rewards/rejected": -1.969491958618164, - "sft_loss": 1.4442012310028076, + "grad_norm": 6.969945640232902, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": -0.1762910932302475, + "logits/rejected": -0.007267421577125788, + "logps/chosen": -1.3951175212860107, + "logps/rejected": -1.775962471961975, + "loss": 1.0331, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3951175212860107, + "rewards/margins": 0.3808448910713196, + "rewards/rejected": -1.775962471961975, + "sft_loss": 1.431128978729248, "step": 2855 }, { "epoch": 1.5306907509616994, - "grad_norm": 8.625096368723874, - "learning_rate": 1.7072481424295097e-06, - "logits/chosen": -0.5285122990608215, - "logits/rejected": -0.36223360896110535, - "logps/chosen": -1.307751178741455, - "logps/rejected": -1.7706029415130615, - "loss": 0.9759, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.307751178741455, - "rewards/margins": 0.4628518521785736, - "rewards/rejected": -1.7706029415130615, - "sft_loss": 1.3280938863754272, + "grad_norm": 8.452216105259412, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": -0.25379595160484314, + "logits/rejected": -0.07199688255786896, + "logps/chosen": -1.2877904176712036, + "logps/rejected": -1.5933940410614014, + "loss": 1.0154, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2877904176712036, + "rewards/margins": 0.30560365319252014, + "rewards/rejected": -1.5933940410614014, + "sft_loss": 1.3085378408432007, "step": 2860 }, { "epoch": 1.5333667837431009, - "grad_norm": 5.997814680553211, - "learning_rate": 1.702619746934369e-06, - "logits/chosen": -0.5822547674179077, - "logits/rejected": -0.4525377154350281, - "logps/chosen": -1.3115359544754028, - "logps/rejected": -1.9264564514160156, - "loss": 0.957, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3115359544754028, - "rewards/margins": 0.6149204969406128, - "rewards/rejected": -1.9264564514160156, - "sft_loss": 1.3683974742889404, + "grad_norm": 6.1146357749691225, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": -0.27868279814720154, + "logits/rejected": -0.142146036028862, + "logps/chosen": -1.2972919940948486, + "logps/rejected": -1.712339162826538, + "loss": 1.0044, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2972919940948486, + "rewards/margins": 0.41504722833633423, + "rewards/rejected": -1.712339162826538, + "sft_loss": 1.356175422668457, "step": 2865 }, { "epoch": 1.5360428165245024, - "grad_norm": 9.018405181315453, - "learning_rate": 1.6979893856215547e-06, - "logits/chosen": -0.5128965973854065, - "logits/rejected": -0.4116179943084717, - "logps/chosen": -1.3670125007629395, - "logps/rejected": -1.7760088443756104, - "loss": 1.0132, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3670125007629395, - "rewards/margins": 0.4089964032173157, - "rewards/rejected": -1.7760088443756104, - "sft_loss": 1.3387202024459839, + "grad_norm": 8.680293978801568, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": -0.19600138068199158, + "logits/rejected": -0.07686041295528412, + "logps/chosen": -1.3629693984985352, + "logps/rejected": -1.5587340593338013, + "loss": 1.0804, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3629693984985352, + "rewards/margins": 0.19576458632946014, + "rewards/rejected": -1.5587340593338013, + "sft_loss": 1.344761610031128, "step": 2870 }, { "epoch": 1.538718849305904, - "grad_norm": 10.277223558286527, - "learning_rate": 1.6933571034148531e-06, - "logits/chosen": -0.48255014419555664, - "logits/rejected": -0.40862828493118286, - "logps/chosen": -1.4015209674835205, - "logps/rejected": -1.928873062133789, - "loss": 0.9631, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.4015209674835205, - "rewards/margins": 0.5273522138595581, - "rewards/rejected": -1.928873062133789, - "sft_loss": 1.3639947175979614, + "grad_norm": 7.528570843500533, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": -0.17637869715690613, + "logits/rejected": -0.07687224447727203, + "logps/chosen": -1.3604662418365479, + "logps/rejected": -1.6803786754608154, + "loss": 1.016, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3604662418365479, + "rewards/margins": 0.3199126124382019, + "rewards/rejected": -1.6803786754608154, + "sft_loss": 1.347644567489624, "step": 2875 }, { "epoch": 1.5413948820873056, - "grad_norm": 10.625700942112903, - "learning_rate": 1.6887229452566859e-06, - "logits/chosen": -0.42231544852256775, - "logits/rejected": -0.31882724165916443, - "logps/chosen": -1.3298161029815674, - "logps/rejected": -2.002760410308838, - "loss": 0.9609, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3298161029815674, - "rewards/margins": 0.6729440689086914, - "rewards/rejected": -2.002760410308838, - "sft_loss": 1.3508632183074951, + "grad_norm": 10.17870051576461, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": -0.04938278719782829, + "logits/rejected": 0.0634743794798851, + "logps/chosen": -1.3148540258407593, + "logps/rejected": -1.758033037185669, + "loss": 1.0271, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3148540258407593, + "rewards/margins": 0.4431789815425873, + "rewards/rejected": -1.758033037185669, + "sft_loss": 1.3470075130462646, "step": 2880 }, { "epoch": 1.544070914868707, - "grad_norm": 9.865396438185735, - "learning_rate": 1.6840869561076761e-06, - "logits/chosen": -0.5014291405677795, - "logits/rejected": -0.3962041735649109, - "logps/chosen": -1.3711020946502686, - "logps/rejected": -1.9670244455337524, - "loss": 1.0002, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3711020946502686, - "rewards/margins": 0.5959222912788391, - "rewards/rejected": -1.9670244455337524, - "sft_loss": 1.4149680137634277, + "grad_norm": 8.577794751942648, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": -0.14258165657520294, + "logits/rejected": -0.015600791200995445, + "logps/chosen": -1.3516905307769775, + "logps/rejected": -1.742180585861206, + "loss": 1.0458, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3516905307769775, + "rewards/margins": 0.39049032330513, + "rewards/rejected": -1.742180585861206, + "sft_loss": 1.3893133401870728, "step": 2885 }, { "epoch": 1.5467469476501088, - "grad_norm": 6.339272952448974, - "learning_rate": 1.6794491809462108e-06, - "logits/chosen": -0.5733720660209656, - "logits/rejected": -0.4118824899196625, - "logps/chosen": -1.3491941690444946, - "logps/rejected": -1.9526889324188232, - "loss": 0.9637, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3491941690444946, - "rewards/margins": 0.6034947037696838, - "rewards/rejected": -1.9526889324188232, - "sft_loss": 1.367173194885254, + "grad_norm": 8.570238281275206, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": -0.21197304129600525, + "logits/rejected": -0.009961167350411415, + "logps/chosen": -1.3131115436553955, + "logps/rejected": -1.6712608337402344, + "loss": 1.017, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3131115436553955, + "rewards/margins": 0.3581491708755493, + "rewards/rejected": -1.6712608337402344, + "sft_loss": 1.3298722505569458, "step": 2890 }, { "epoch": 1.5494229804315103, - "grad_norm": 6.440190526652441, - "learning_rate": 1.674809664768005e-06, - "logits/chosen": -0.5244798064231873, - "logits/rejected": -0.3966430127620697, - "logps/chosen": -1.3039839267730713, - "logps/rejected": -1.9712779521942139, - "loss": 0.9312, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3039839267730713, - "rewards/margins": 0.6672938466072083, - "rewards/rejected": -1.9712779521942139, - "sft_loss": 1.3040238618850708, + "grad_norm": 10.572661224304408, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": -0.2016395777463913, + "logits/rejected": -0.055265843868255615, + "logps/chosen": -1.2552399635314941, + "logps/rejected": -1.6915838718414307, + "loss": 0.9776, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2552399635314941, + "rewards/margins": 0.43634381890296936, + "rewards/rejected": -1.6915838718414307, + "sft_loss": 1.2686001062393188, "step": 2895 }, { "epoch": 1.5520990132129118, - "grad_norm": 7.879057907006199, - "learning_rate": 1.6701684525856647e-06, - "logits/chosen": -0.44239291548728943, - "logits/rejected": -0.3611551821231842, - "logps/chosen": -1.3233540058135986, - "logps/rejected": -1.9400726556777954, - "loss": 0.9645, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3233540058135986, - "rewards/margins": 0.616718590259552, - "rewards/rejected": -1.9400726556777954, - "sft_loss": 1.385852336883545, + "grad_norm": 6.548478183198705, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": -0.10608162730932236, + "logits/rejected": -0.001629498554393649, + "logps/chosen": -1.31504225730896, + "logps/rejected": -1.6856321096420288, + "loss": 1.0273, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.31504225730896, + "rewards/margins": 0.37058982253074646, + "rewards/rejected": -1.6856321096420288, + "sft_loss": 1.368192434310913, "step": 2900 }, { "epoch": 1.5547750459943135, - "grad_norm": 11.832512783512026, - "learning_rate": 1.6655255894282515e-06, - "logits/chosen": -0.36237016320228577, - "logits/rejected": -0.35094302892684937, - "logps/chosen": -1.322729468345642, - "logps/rejected": -1.9308160543441772, - "loss": 0.9677, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.322729468345642, - "rewards/margins": 0.6080866456031799, - "rewards/rejected": -1.9308160543441772, - "sft_loss": 1.3535382747650146, + "grad_norm": 10.097095692347898, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": -0.051770079880952835, + "logits/rejected": -0.024975869804620743, + "logps/chosen": -1.3028028011322021, + "logps/rejected": -1.6529500484466553, + "loss": 1.0398, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3028028011322021, + "rewards/margins": 0.3501472473144531, + "rewards/rejected": -1.6529500484466553, + "sft_loss": 1.3389087915420532, "step": 2905 }, { "epoch": 1.557451078775715, - "grad_norm": 11.201933335039847, - "learning_rate": 1.6608811203408437e-06, - "logits/chosen": -0.42994600534439087, - "logits/rejected": -0.3571633994579315, - "logps/chosen": -1.2974226474761963, - "logps/rejected": -1.7543586492538452, - "loss": 0.9976, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2974226474761963, - "rewards/margins": 0.45693597197532654, - "rewards/rejected": -1.7543586492538452, - "sft_loss": 1.3785746097564697, + "grad_norm": 6.663043928077902, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": -0.1299368143081665, + "logits/rejected": -0.03927867114543915, + "logps/chosen": -1.254652976989746, + "logps/rejected": -1.5375810861587524, + "loss": 1.0272, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.254652976989746, + "rewards/margins": 0.2829279899597168, + "rewards/rejected": -1.5375810861587524, + "sft_loss": 1.3334118127822876, "step": 2910 }, { "epoch": 1.5601271115571165, - "grad_norm": 11.915398676209564, - "learning_rate": 1.6562350903841002e-06, - "logits/chosen": -0.4060365557670593, - "logits/rejected": -0.24389150738716125, - "logps/chosen": -1.388873815536499, - "logps/rejected": -2.0121002197265625, - "loss": 0.9886, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.388873815536499, - "rewards/margins": 0.6232262849807739, - "rewards/rejected": -2.0121002197265625, - "sft_loss": 1.415649175643921, + "grad_norm": 13.30160450099127, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": -0.0906791165471077, + "logits/rejected": 0.07906799018383026, + "logps/chosen": -1.3736572265625, + "logps/rejected": -1.728371024131775, + "loss": 1.0719, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3736572265625, + "rewards/margins": 0.3547138273715973, + "rewards/rejected": -1.728371024131775, + "sft_loss": 1.4097964763641357, "step": 2915 }, { "epoch": 1.5628031443385182, - "grad_norm": 13.960085221092982, - "learning_rate": 1.651587544633825e-06, - "logits/chosen": -0.4067690968513489, - "logits/rejected": -0.292506605386734, - "logps/chosen": -1.359520435333252, - "logps/rejected": -2.041755199432373, - "loss": 0.9575, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.359520435333252, - "rewards/margins": 0.6822346448898315, - "rewards/rejected": -2.041755199432373, - "sft_loss": 1.3790920972824097, + "grad_norm": 10.224493828222354, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": -0.06458568572998047, + "logits/rejected": 0.05855490639805794, + "logps/chosen": -1.3306993246078491, + "logps/rejected": -1.7111177444458008, + "loss": 1.0252, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3306993246078491, + "rewards/margins": 0.38041844964027405, + "rewards/rejected": -1.7111177444458008, + "sft_loss": 1.3561182022094727, "step": 2920 }, { "epoch": 1.5654791771199197, - "grad_norm": 7.559174143764919, - "learning_rate": 1.6469385281805267e-06, - "logits/chosen": -0.3783648908138275, - "logits/rejected": -0.3329547047615051, - "logps/chosen": -1.2987470626831055, - "logps/rejected": -1.9435436725616455, - "loss": 0.9662, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2987470626831055, - "rewards/margins": 0.6447966694831848, - "rewards/rejected": -1.9435436725616455, - "sft_loss": 1.3145548105239868, + "grad_norm": 10.674837251777372, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": -0.09088797867298126, + "logits/rejected": -0.03129405155777931, + "logps/chosen": -1.2738713026046753, + "logps/rejected": -1.697519302368164, + "loss": 1.015, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2738713026046753, + "rewards/margins": 0.4236481785774231, + "rewards/rejected": -1.697519302368164, + "sft_loss": 1.2819215059280396, "step": 2925 }, { "epoch": 1.5681552099013212, - "grad_norm": 9.139123936589463, - "learning_rate": 1.642288086128984e-06, - "logits/chosen": -0.530595600605011, - "logits/rejected": -0.38953492045402527, - "logps/chosen": -1.3028291463851929, - "logps/rejected": -2.1624374389648438, - "loss": 0.9542, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3028291463851929, - "rewards/margins": 0.8596083521842957, - "rewards/rejected": -2.1624374389648438, - "sft_loss": 1.3950421810150146, + "grad_norm": 7.967714066643323, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": -0.2462097853422165, + "logits/rejected": -0.07248953729867935, + "logps/chosen": -1.2905464172363281, + "logps/rejected": -1.9068615436553955, + "loss": 0.9933, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2905464172363281, + "rewards/margins": 0.6163150072097778, + "rewards/rejected": -1.9068615436553955, + "sft_loss": 1.3654937744140625, "step": 2930 }, { "epoch": 1.570831242682723, - "grad_norm": 8.34429297717093, - "learning_rate": 1.6376362635978055e-06, - "logits/chosen": -0.5230890512466431, - "logits/rejected": -0.40567511320114136, - "logps/chosen": -1.3589205741882324, - "logps/rejected": -1.9703487157821655, - "loss": 0.9887, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3589205741882324, - "rewards/margins": 0.6114282011985779, - "rewards/rejected": -1.9703487157821655, - "sft_loss": 1.40828537940979, + "grad_norm": 8.802652583026893, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": -0.20253901183605194, + "logits/rejected": -0.05758960172533989, + "logps/chosen": -1.345850944519043, + "logps/rejected": -1.7187354564666748, + "loss": 1.0459, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.345850944519043, + "rewards/margins": 0.3728848099708557, + "rewards/rejected": -1.7187354564666748, + "sft_loss": 1.3998357057571411, "step": 2935 }, { "epoch": 1.5735072754641244, - "grad_norm": 8.114057802851098, - "learning_rate": 1.6329831057189936e-06, - "logits/chosen": -0.5228386521339417, - "logits/rejected": -0.37643635272979736, - "logps/chosen": -1.3287737369537354, - "logps/rejected": -2.1009817123413086, - "loss": 0.9644, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3287737369537354, - "rewards/margins": 0.7722080945968628, - "rewards/rejected": -2.1009817123413086, - "sft_loss": 1.4050620794296265, + "grad_norm": 5.930700000256952, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": -0.19098657369613647, + "logits/rejected": -0.015033292584121227, + "logps/chosen": -1.3140078783035278, + "logps/rejected": -1.8780615329742432, + "loss": 1.0028, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3140078783035278, + "rewards/margins": 0.5640536546707153, + "rewards/rejected": -1.8780615329742432, + "sft_loss": 1.3845045566558838, "step": 2940 }, { "epoch": 1.5761833082455259, - "grad_norm": 6.84292995539461, - "learning_rate": 1.6283286576375069e-06, - "logits/chosen": -0.5087316036224365, - "logits/rejected": -0.39270779490470886, - "logps/chosen": -1.321316123008728, - "logps/rejected": -1.7746942043304443, - "loss": 1.0044, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.321316123008728, - "rewards/margins": 0.453377902507782, - "rewards/rejected": -1.7746942043304443, - "sft_loss": 1.3574235439300537, + "grad_norm": 9.34078243263626, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": -0.17489251494407654, + "logits/rejected": -0.025254786014556885, + "logps/chosen": -1.3132505416870117, + "logps/rejected": -1.6130483150482178, + "loss": 1.0468, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3132505416870117, + "rewards/margins": 0.2997978925704956, + "rewards/rejected": -1.6130483150482178, + "sft_loss": 1.3403208255767822, "step": 2945 }, { "epoch": 1.5788593410269276, - "grad_norm": 11.548431373015145, - "learning_rate": 1.623672964510821e-06, - "logits/chosen": -0.37392085790634155, - "logits/rejected": -0.14829064905643463, - "logps/chosen": -1.2613165378570557, - "logps/rejected": -2.001952648162842, - "loss": 0.9193, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2613165378570557, - "rewards/margins": 0.7406360507011414, - "rewards/rejected": -2.001952648162842, - "sft_loss": 1.2964996099472046, + "grad_norm": 33.24780731727735, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": -0.07287373393774033, + "logits/rejected": 0.18627944588661194, + "logps/chosen": -1.2778263092041016, + "logps/rejected": -1.8012363910675049, + "loss": 0.9706, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2778263092041016, + "rewards/margins": 0.5234102010726929, + "rewards/rejected": -1.8012363910675049, + "sft_loss": 1.2985767126083374, "step": 2950 }, { "epoch": 1.5815353738083293, - "grad_norm": 10.972399432410526, - "learning_rate": 1.6190160715084909e-06, - "logits/chosen": -0.3920244872570038, - "logits/rejected": -0.30911877751350403, - "logps/chosen": -1.3052040338516235, - "logps/rejected": -1.8989613056182861, - "loss": 0.9648, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3052040338516235, - "rewards/margins": 0.5937572717666626, - "rewards/rejected": -1.8989613056182861, - "sft_loss": 1.3513920307159424, + "grad_norm": 11.258095186099865, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": -0.07001444697380066, + "logits/rejected": 0.03678502142429352, + "logps/chosen": -1.2881648540496826, + "logps/rejected": -1.7102916240692139, + "loss": 0.9926, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2881648540496826, + "rewards/margins": 0.42212677001953125, + "rewards/rejected": -1.7102916240692139, + "sft_loss": 1.3350517749786377, "step": 2955 }, { "epoch": 1.5842114065897306, - "grad_norm": 7.320339128263725, - "learning_rate": 1.6143580238117132e-06, - "logits/chosen": -0.4998478889465332, - "logits/rejected": -0.39607498049736023, - "logps/chosen": -1.288106083869934, - "logps/rejected": -1.835472822189331, - "loss": 0.9586, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.288106083869934, - "rewards/margins": 0.5473669767379761, - "rewards/rejected": -1.835472822189331, - "sft_loss": 1.330539584159851, + "grad_norm": 8.822122841064651, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": -0.1789834201335907, + "logits/rejected": -0.04704167693853378, + "logps/chosen": -1.3048163652420044, + "logps/rejected": -1.6448017358779907, + "loss": 1.0211, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3048163652420044, + "rewards/margins": 0.339985191822052, + "rewards/rejected": -1.6448017358779907, + "sft_loss": 1.323728322982788, "step": 2960 }, { "epoch": 1.5868874393711323, - "grad_norm": 5.9223720788792376, - "learning_rate": 1.6096988666128867e-06, - "logits/chosen": -0.48560982942581177, - "logits/rejected": -0.4185088276863098, - "logps/chosen": -1.2920416593551636, - "logps/rejected": -1.901266098022461, - "loss": 0.9729, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2920416593551636, - "rewards/margins": 0.6092244386672974, - "rewards/rejected": -1.901266098022461, - "sft_loss": 1.301276445388794, + "grad_norm": 7.59416415046014, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": -0.15828174352645874, + "logits/rejected": -0.06027153134346008, + "logps/chosen": -1.2840261459350586, + "logps/rejected": -1.6559422016143799, + "loss": 1.0335, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2840261459350586, + "rewards/margins": 0.3719159960746765, + "rewards/rejected": -1.6559422016143799, + "sft_loss": 1.2959760427474976, "step": 2965 }, { "epoch": 1.589563472152534, - "grad_norm": 7.885940091214718, - "learning_rate": 1.6050386451151753e-06, - "logits/chosen": -0.5195499658584595, - "logits/rejected": -0.38451433181762695, - "logps/chosen": -1.3761128187179565, - "logps/rejected": -1.9619756937026978, - "loss": 1.0324, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3761128187179565, - "rewards/margins": 0.5858628749847412, - "rewards/rejected": -1.9619756937026978, - "sft_loss": 1.4462617635726929, + "grad_norm": 12.418799474714286, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": -0.19060225784778595, + "logits/rejected": -0.008369709365069866, + "logps/chosen": -1.3549988269805908, + "logps/rejected": -1.6975902318954468, + "loss": 1.0852, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3549988269805908, + "rewards/margins": 0.34259122610092163, + "rewards/rejected": -1.6975902318954468, + "sft_loss": 1.4190717935562134, "step": 2970 }, { "epoch": 1.5922395049339353, - "grad_norm": 12.299188719276014, - "learning_rate": 1.6003774045320686e-06, - "logits/chosen": -0.5009174942970276, - "logits/rejected": -0.3847171664237976, - "logps/chosen": -1.3710649013519287, - "logps/rejected": -2.1126813888549805, - "loss": 0.9785, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3710649013519287, - "rewards/margins": 0.7416165471076965, - "rewards/rejected": -2.1126813888549805, - "sft_loss": 1.4547905921936035, + "grad_norm": 7.139956178679369, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": -0.17740900814533234, + "logits/rejected": -0.034846335649490356, + "logps/chosen": -1.3455655574798584, + "logps/rejected": -1.7951133251190186, + "loss": 1.0253, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3455655574798584, + "rewards/margins": 0.4495477080345154, + "rewards/rejected": -1.7951133251190186, + "sft_loss": 1.4229328632354736, "step": 2975 }, { "epoch": 1.594915537715337, - "grad_norm": 5.547444314574255, - "learning_rate": 1.5957151900869425e-06, - "logits/chosen": -0.612585723400116, - "logits/rejected": -0.4938820004463196, - "logps/chosen": -1.439699649810791, - "logps/rejected": -2.009437322616577, - "loss": 0.9814, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.439699649810791, - "rewards/margins": 0.5697377920150757, - "rewards/rejected": -2.009437322616577, - "sft_loss": 1.4339112043380737, + "grad_norm": 5.886677056137338, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": -0.27010422945022583, + "logits/rejected": -0.11675111204385757, + "logps/chosen": -1.4220235347747803, + "logps/rejected": -1.7936725616455078, + "loss": 1.0341, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4220235347747803, + "rewards/margins": 0.37164920568466187, + "rewards/rejected": -1.7936725616455078, + "sft_loss": 1.4209567308425903, "step": 2980 }, { "epoch": 1.5975915704967387, - "grad_norm": 7.988338377974674, - "learning_rate": 1.5910520470126228e-06, - "logits/chosen": -0.5794527530670166, - "logits/rejected": -0.44258028268814087, - "logps/chosen": -1.4420968294143677, - "logps/rejected": -2.096639394760132, - "loss": 0.9941, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.4420968294143677, - "rewards/margins": 0.6545425057411194, - "rewards/rejected": -2.096639394760132, - "sft_loss": 1.3904750347137451, + "grad_norm": 7.56605036688191, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": -0.2440626621246338, + "logits/rejected": -0.06764684617519379, + "logps/chosen": -1.4485827684402466, + "logps/rejected": -1.7837321758270264, + "loss": 1.0784, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4485827684402466, + "rewards/margins": 0.3351495563983917, + "rewards/rejected": -1.7837321758270264, + "sft_loss": 1.3873783349990845, "step": 2985 }, { "epoch": 1.60026760327814, - "grad_norm": 11.050093340691287, - "learning_rate": 1.5863880205509432e-06, - "logits/chosen": -0.5791524052619934, - "logits/rejected": -0.4395558834075928, - "logps/chosen": -1.3059974908828735, - "logps/rejected": -2.073613166809082, - "loss": 0.9213, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3059974908828735, - "rewards/margins": 0.7676156759262085, - "rewards/rejected": -2.073613166809082, - "sft_loss": 1.3330674171447754, + "grad_norm": 19.071359847499636, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": -0.2108345329761505, + "logits/rejected": -0.03887351602315903, + "logps/chosen": -1.3028171062469482, + "logps/rejected": -1.8426485061645508, + "loss": 0.9855, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3028171062469482, + "rewards/margins": 0.5398311614990234, + "rewards/rejected": -1.8426485061645508, + "sft_loss": 1.3312140703201294, "step": 2990 }, { "epoch": 1.6029436360595417, - "grad_norm": 8.206254056781196, - "learning_rate": 1.5817231559523097e-06, - "logits/chosen": -0.5781939029693604, - "logits/rejected": -0.5216881036758423, - "logps/chosen": -1.4066269397735596, - "logps/rejected": -2.1981959342956543, - "loss": 0.9927, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4066269397735596, - "rewards/margins": 0.7915690541267395, - "rewards/rejected": -2.1981959342956543, - "sft_loss": 1.4781930446624756, + "grad_norm": 10.569481667131303, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": -0.22008387744426727, + "logits/rejected": -0.12119672447443008, + "logps/chosen": -1.3868043422698975, + "logps/rejected": -1.9178674221038818, + "loss": 1.0373, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3868043422698975, + "rewards/margins": 0.5310630798339844, + "rewards/rejected": -1.9178674221038818, + "sft_loss": 1.4592076539993286, "step": 2995 }, { "epoch": 1.6056196688409434, - "grad_norm": 6.359668377120754, - "learning_rate": 1.5770574984752582e-06, - "logits/chosen": -0.6331890821456909, - "logits/rejected": -0.5329810976982117, - "logps/chosen": -1.3997949361801147, - "logps/rejected": -2.0195674896240234, - "loss": 1.0077, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3997949361801147, - "rewards/margins": 0.6197725534439087, - "rewards/rejected": -2.0195674896240234, - "sft_loss": 1.3704310655593872, + "grad_norm": 5.29555931503069, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": -0.2550900876522064, + "logits/rejected": -0.11833520233631134, + "logps/chosen": -1.4027044773101807, + "logps/rejected": -1.7607784271240234, + "loss": 1.0605, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4027044773101807, + "rewards/margins": 0.3580739498138428, + "rewards/rejected": -1.7607784271240234, + "sft_loss": 1.3712348937988281, "step": 3000 }, { "epoch": 1.608295701622345, - "grad_norm": 12.987698998099232, - "learning_rate": 1.5723910933860191e-06, - "logits/chosen": -0.6868072748184204, - "logits/rejected": -0.5752248167991638, - "logps/chosen": -1.3194029331207275, - "logps/rejected": -1.893593192100525, - "loss": 0.9706, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3194029331207275, - "rewards/margins": 0.5741902589797974, - "rewards/rejected": -1.893593192100525, - "sft_loss": 1.3269562721252441, + "grad_norm": 11.190167224051493, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": -0.26853471994400024, + "logits/rejected": -0.12684592604637146, + "logps/chosen": -1.2774595022201538, + "logps/rejected": -1.6234922409057617, + "loss": 1.0173, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2774595022201538, + "rewards/margins": 0.3460327684879303, + "rewards/rejected": -1.6234922409057617, + "sft_loss": 1.2910670042037964, "step": 3005 }, { "epoch": 1.6109717344037464, - "grad_norm": 8.335772019808212, - "learning_rate": 1.5677239859580742e-06, - "logits/chosen": -0.6515632271766663, - "logits/rejected": -0.5666736364364624, - "logps/chosen": -1.3261549472808838, - "logps/rejected": -1.8968639373779297, - "loss": 0.999, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3261549472808838, - "rewards/margins": 0.5707091689109802, - "rewards/rejected": -1.8968639373779297, - "sft_loss": 1.349196434020996, + "grad_norm": 9.278021212318102, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": -0.2627517282962799, + "logits/rejected": -0.13891619443893433, + "logps/chosen": -1.315260887145996, + "logps/rejected": -1.6367619037628174, + "loss": 1.0562, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.315260887145996, + "rewards/margins": 0.3215009570121765, + "rewards/rejected": -1.6367619037628174, + "sft_loss": 1.3411436080932617, "step": 3010 }, { "epoch": 1.6136477671851481, - "grad_norm": 12.432038308281975, - "learning_rate": 1.5630562214717205e-06, - "logits/chosen": -0.5145977139472961, - "logits/rejected": -0.4578397274017334, - "logps/chosen": -1.4018304347991943, - "logps/rejected": -1.8993148803710938, - "loss": 1.0053, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4018304347991943, - "rewards/margins": 0.4974845051765442, - "rewards/rejected": -1.8993148803710938, - "sft_loss": 1.391309380531311, + "grad_norm": 10.036290188631336, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": -0.07215414196252823, + "logits/rejected": 0.001811787486076355, + "logps/chosen": -1.358033537864685, + "logps/rejected": -1.692010521888733, + "loss": 1.0435, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.358033537864685, + "rewards/margins": 0.3339768946170807, + "rewards/rejected": -1.692010521888733, + "sft_loss": 1.3851827383041382, "step": 3015 }, { "epoch": 1.6163237999665496, - "grad_norm": 6.843658695419179, - "learning_rate": 1.5583878452136296e-06, - "logits/chosen": -0.6365788578987122, - "logits/rejected": -0.5374706387519836, - "logps/chosen": -1.3000893592834473, - "logps/rejected": -1.7917066812515259, - "loss": 0.9793, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3000893592834473, - "rewards/margins": 0.4916171133518219, - "rewards/rejected": -1.7917066812515259, - "sft_loss": 1.3843553066253662, + "grad_norm": 8.720723133324936, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": -0.2639918327331543, + "logits/rejected": -0.11108319461345673, + "logps/chosen": -1.295902967453003, + "logps/rejected": -1.6099973917007446, + "loss": 1.0365, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.295902967453003, + "rewards/margins": 0.3140943646430969, + "rewards/rejected": -1.6099973917007446, + "sft_loss": 1.3766124248504639, "step": 3020 }, { "epoch": 1.6189998327479511, - "grad_norm": 6.092318695370072, - "learning_rate": 1.5537189024764086e-06, - "logits/chosen": -0.5974392890930176, - "logits/rejected": -0.49662700295448303, - "logps/chosen": -1.2565548419952393, - "logps/rejected": -1.714015245437622, - "loss": 0.9882, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.2565548419952393, - "rewards/margins": 0.45746010541915894, - "rewards/rejected": -1.714015245437622, - "sft_loss": 1.3433637619018555, + "grad_norm": 7.357840446224466, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": -0.19443385303020477, + "logits/rejected": -0.05075268819928169, + "logps/chosen": -1.2535735368728638, + "logps/rejected": -1.5689475536346436, + "loss": 1.0252, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2535735368728638, + "rewards/margins": 0.3153740167617798, + "rewards/rejected": -1.5689475536346436, + "sft_loss": 1.3399714231491089, "step": 3025 }, { "epoch": 1.6216758655293528, - "grad_norm": 6.9247657211195754, - "learning_rate": 1.5490494385581599e-06, - "logits/chosen": -0.5492630004882812, - "logits/rejected": -0.451678991317749, - "logps/chosen": -1.363454818725586, - "logps/rejected": -1.8830769062042236, - "loss": 0.9923, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.363454818725586, - "rewards/margins": 0.5196219682693481, - "rewards/rejected": -1.8830769062042236, - "sft_loss": 1.3941650390625, + "grad_norm": 6.970022638713376, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": -0.168025940656662, + "logits/rejected": -0.02265249192714691, + "logps/chosen": -1.3693130016326904, + "logps/rejected": -1.6871531009674072, + "loss": 1.0515, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3693130016326904, + "rewards/margins": 0.31784000992774963, + "rewards/rejected": -1.6871531009674072, + "sft_loss": 1.3919140100479126, "step": 3030 }, { "epoch": 1.6243518983107543, - "grad_norm": 7.708345661189646, - "learning_rate": 1.5443794987620433e-06, - "logits/chosen": -0.4692181646823883, - "logits/rejected": -0.40992242097854614, - "logps/chosen": -1.3081228733062744, - "logps/rejected": -1.6529757976531982, - "loss": 0.997, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3081228733062744, - "rewards/margins": 0.34485286474227905, - "rewards/rejected": -1.6529757976531982, - "sft_loss": 1.3308913707733154, + "grad_norm": 10.504597847545845, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": -0.0708565041422844, + "logits/rejected": 0.0453130379319191, + "logps/chosen": -1.325622797012329, + "logps/rejected": -1.514797568321228, + "loss": 1.0581, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.325622797012329, + "rewards/margins": 0.18917487561702728, + "rewards/rejected": -1.514797568321228, + "sft_loss": 1.327514410018921, "step": 3035 }, { "epoch": 1.6270279310921558, - "grad_norm": 6.957139071318043, - "learning_rate": 1.539709128395835e-06, - "logits/chosen": -0.5676388144493103, - "logits/rejected": -0.5339337587356567, - "logps/chosen": -1.214019536972046, - "logps/rejected": -1.9025837182998657, - "loss": 0.9272, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.214019536972046, - "rewards/margins": 0.6885641813278198, - "rewards/rejected": -1.9025837182998657, - "sft_loss": 1.2781291007995605, + "grad_norm": 8.09179800215906, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": -0.18696202337741852, + "logits/rejected": -0.12388841807842255, + "logps/chosen": -1.2330793142318726, + "logps/rejected": -1.7086670398712158, + "loss": 0.9741, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2330793142318726, + "rewards/margins": 0.4755876660346985, + "rewards/rejected": -1.7086670398712158, + "sft_loss": 1.2789698839187622, "step": 3040 }, { "epoch": 1.6297039638735575, - "grad_norm": 31.39373896850739, - "learning_rate": 1.5350383727714888e-06, - "logits/chosen": -0.5760513544082642, - "logits/rejected": -0.5065566301345825, - "logps/chosen": -1.3135230541229248, - "logps/rejected": -1.7099521160125732, - "loss": 1.0353, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3135230541229248, - "rewards/margins": 0.3964292109012604, - "rewards/rejected": -1.7099521160125732, - "sft_loss": 1.3254320621490479, + "grad_norm": 12.564328773720826, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": -0.19994398951530457, + "logits/rejected": -0.10410158336162567, + "logps/chosen": -1.267035961151123, + "logps/rejected": -1.5420608520507812, + "loss": 1.0454, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.267035961151123, + "rewards/margins": 0.27502498030662537, + "rewards/rejected": -1.5420608520507812, + "sft_loss": 1.2928426265716553, "step": 3045 }, { "epoch": 1.632379996654959, - "grad_norm": 7.158147144834485, - "learning_rate": 1.5303672772046963e-06, - "logits/chosen": -0.5834034085273743, - "logits/rejected": -0.45677343010902405, - "logps/chosen": -1.3490737676620483, - "logps/rejected": -2.0836308002471924, - "loss": 0.9457, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3490737676620483, - "rewards/margins": 0.7345567345619202, - "rewards/rejected": -2.0836308002471924, - "sft_loss": 1.4182727336883545, + "grad_norm": 6.7703373107950044, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": -0.22879798710346222, + "logits/rejected": -0.07329122722148895, + "logps/chosen": -1.3567241430282593, + "logps/rejected": -1.8353170156478882, + "loss": 1.0171, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3567241430282593, + "rewards/margins": 0.47859278321266174, + "rewards/rejected": -1.8353170156478882, + "sft_loss": 1.4154313802719116, "step": 3050 }, { "epoch": 1.6350560294363605, - "grad_norm": 7.871853838413433, - "learning_rate": 1.525695887014447e-06, - "logits/chosen": -0.561724066734314, - "logits/rejected": -0.4565046429634094, - "logps/chosen": -1.30876886844635, - "logps/rejected": -1.8969253301620483, - "loss": 0.9524, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.30876886844635, - "rewards/margins": 0.5881567001342773, - "rewards/rejected": -1.8969253301620483, - "sft_loss": 1.3320848941802979, + "grad_norm": 8.358972041931533, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": -0.23752336204051971, + "logits/rejected": -0.10456007719039917, + "logps/chosen": -1.3062692880630493, + "logps/rejected": -1.7325499057769775, + "loss": 0.998, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3062692880630493, + "rewards/margins": 0.4262804090976715, + "rewards/rejected": -1.7325499057769775, + "sft_loss": 1.3278343677520752, "step": 3055 }, { "epoch": 1.6377320622177622, - "grad_norm": 7.269484826641823, - "learning_rate": 1.5210242475225896e-06, - "logits/chosen": -0.5328022837638855, - "logits/rejected": -0.3768201172351837, - "logps/chosen": -1.3358179330825806, - "logps/rejected": -1.868719458580017, - "loss": 1.0073, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3358179330825806, - "rewards/margins": 0.532901406288147, - "rewards/rejected": -1.868719458580017, - "sft_loss": 1.3920084238052368, + "grad_norm": 5.687794422303738, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": -0.22338104248046875, + "logits/rejected": -0.05207739397883415, + "logps/chosen": -1.3390872478485107, + "logps/rejected": -1.688347578048706, + "loss": 1.0552, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3390872478485107, + "rewards/margins": 0.349260538816452, + "rewards/rejected": -1.688347578048706, + "sft_loss": 1.382442831993103, "step": 3060 }, { "epoch": 1.6404080949991637, - "grad_norm": 14.670382231421788, - "learning_rate": 1.5163524040533903e-06, - "logits/chosen": -0.4326193928718567, - "logits/rejected": -0.4179346561431885, - "logps/chosen": -1.3334858417510986, - "logps/rejected": -1.9773250818252563, - "loss": 0.9607, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3334858417510986, - "rewards/margins": 0.6438394784927368, - "rewards/rejected": -1.9773250818252563, - "sft_loss": 1.373219609260559, + "grad_norm": 10.26839397605355, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": -0.12670452892780304, + "logits/rejected": -0.08819358795881271, + "logps/chosen": -1.3337876796722412, + "logps/rejected": -1.8196563720703125, + "loss": 1.0039, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3337876796722412, + "rewards/margins": 0.4858686327934265, + "rewards/rejected": -1.8196563720703125, + "sft_loss": 1.370620608329773, "step": 3065 }, { "epoch": 1.6430841277805652, - "grad_norm": 7.986154271307429, - "learning_rate": 1.5116804019330951e-06, - "logits/chosen": -0.5422690510749817, - "logits/rejected": -0.4372781217098236, - "logps/chosen": -1.3089624643325806, - "logps/rejected": -1.870825171470642, - "loss": 0.9804, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3089624643325806, - "rewards/margins": 0.5618628263473511, - "rewards/rejected": -1.870825171470642, - "sft_loss": 1.3825502395629883, + "grad_norm": 8.743863475193205, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": -0.25462573766708374, + "logits/rejected": -0.137288436293602, + "logps/chosen": -1.3446637392044067, + "logps/rejected": -1.7400169372558594, + "loss": 1.0353, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3446637392044067, + "rewards/margins": 0.395353227853775, + "rewards/rejected": -1.7400169372558594, + "sft_loss": 1.392261266708374, "step": 3070 }, { "epoch": 1.645760160561967, - "grad_norm": 5.749957550258941, - "learning_rate": 1.5070082864894892e-06, - "logits/chosen": -0.5507332682609558, - "logits/rejected": -0.5055480599403381, - "logps/chosen": -1.2318403720855713, - "logps/rejected": -1.7911930084228516, - "loss": 0.9293, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2318403720855713, - "rewards/margins": 0.5593525767326355, - "rewards/rejected": -1.7911930084228516, - "sft_loss": 1.256838321685791, + "grad_norm": 6.524113767489526, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": -0.2823813259601593, + "logits/rejected": -0.21783390641212463, + "logps/chosen": -1.274153232574463, + "logps/rejected": -1.6759366989135742, + "loss": 0.9917, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.274153232574463, + "rewards/margins": 0.40178337693214417, + "rewards/rejected": -1.6759366989135742, + "sft_loss": 1.2710778713226318, "step": 3075 }, { "epoch": 1.6484361933433684, - "grad_norm": 8.496437191360968, - "learning_rate": 1.5023361030514572e-06, - "logits/chosen": -0.6092488765716553, - "logits/rejected": -0.4492560029029846, - "logps/chosen": -1.192866563796997, - "logps/rejected": -1.7828502655029297, - "loss": 0.9257, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.192866563796997, - "rewards/margins": 0.5899838209152222, - "rewards/rejected": -1.7828502655029297, - "sft_loss": 1.277256965637207, + "grad_norm": 7.768779479863598, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": -0.33525434136390686, + "logits/rejected": -0.15368661284446716, + "logps/chosen": -1.2215659618377686, + "logps/rejected": -1.6280514001846313, + "loss": 0.9814, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2215659618377686, + "rewards/margins": 0.406485378742218, + "rewards/rejected": -1.6280514001846313, + "sft_loss": 1.2901760339736938, "step": 3080 }, { "epoch": 1.65111222612477, - "grad_norm": 7.625719573165574, - "learning_rate": 1.4976638969485433e-06, - "logits/chosen": -0.4337848722934723, - "logits/rejected": -0.4376908242702484, - "logps/chosen": -1.2819950580596924, - "logps/rejected": -1.8182960748672485, - "loss": 0.9591, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2819950580596924, - "rewards/margins": 0.5363009572029114, - "rewards/rejected": -1.8182960748672485, - "sft_loss": 1.3229899406433105, + "grad_norm": 5.962837403904988, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": -0.14228685200214386, + "logits/rejected": -0.13161325454711914, + "logps/chosen": -1.2799794673919678, + "logps/rejected": -1.6704368591308594, + "loss": 0.9987, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2799794673919678, + "rewards/margins": 0.3904576301574707, + "rewards/rejected": -1.6704368591308594, + "sft_loss": 1.3193469047546387, "step": 3085 }, { "epoch": 1.6537882589061716, - "grad_norm": 9.886210431426107, - "learning_rate": 1.492991713510511e-06, - "logits/chosen": -0.4346126914024353, - "logits/rejected": -0.3826829791069031, - "logps/chosen": -1.2966272830963135, - "logps/rejected": -1.7472527027130127, - "loss": 1.0168, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2966272830963135, - "rewards/margins": 0.45062533020973206, - "rewards/rejected": -1.7472527027130127, - "sft_loss": 1.3685801029205322, + "grad_norm": 7.4051418349634, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": -0.1406664401292801, + "logits/rejected": -0.0699944943189621, + "logps/chosen": -1.320504903793335, + "logps/rejected": -1.5901587009429932, + "loss": 1.0914, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.320504903793335, + "rewards/margins": 0.26965343952178955, + "rewards/rejected": -1.5901587009429932, + "sft_loss": 1.3954423666000366, "step": 3090 }, { "epoch": 1.6564642916875731, - "grad_norm": 7.0604512187525765, - "learning_rate": 1.4883195980669052e-06, - "logits/chosen": -0.512303352355957, - "logits/rejected": -0.40312275290489197, - "logps/chosen": -1.3184245824813843, - "logps/rejected": -1.9654176235198975, - "loss": 0.9278, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3184245824813843, - "rewards/margins": 0.6469929814338684, - "rewards/rejected": -1.9654176235198975, - "sft_loss": 1.3390653133392334, + "grad_norm": 6.757034888848797, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": -0.18777263164520264, + "logits/rejected": -0.04904399812221527, + "logps/chosen": -1.3198715448379517, + "logps/rejected": -1.7347841262817383, + "loss": 1.003, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3198715448379517, + "rewards/margins": 0.4149126410484314, + "rewards/rejected": -1.7347841262817383, + "sft_loss": 1.34419846534729, "step": 3095 }, { "epoch": 1.6591403244689746, - "grad_norm": 10.563683019206419, - "learning_rate": 1.48364759594661e-06, - "logits/chosen": -0.6492779850959778, - "logits/rejected": -0.5419107675552368, - "logps/chosen": -1.344524621963501, - "logps/rejected": -1.9468377828598022, - "loss": 0.9877, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.344524621963501, - "rewards/margins": 0.6023133993148804, - "rewards/rejected": -1.9468377828598022, - "sft_loss": 1.4405728578567505, + "grad_norm": 10.260517271328002, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": -0.336527019739151, + "logits/rejected": -0.20288416743278503, + "logps/chosen": -1.3640118837356567, + "logps/rejected": -1.6893068552017212, + "loss": 1.0775, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3640118837356567, + "rewards/margins": 0.32529503107070923, + "rewards/rejected": -1.6893068552017212, + "sft_loss": 1.4524770975112915, "step": 3100 }, { "epoch": 1.6618163572503764, - "grad_norm": 12.262873845911782, - "learning_rate": 1.4789757524774105e-06, - "logits/chosen": -0.5909110307693481, - "logits/rejected": -0.4422377645969391, - "logps/chosen": -1.3807677030563354, - "logps/rejected": -1.9295337200164795, - "loss": 0.9935, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3807677030563354, - "rewards/margins": 0.5487662553787231, - "rewards/rejected": -1.9295337200164795, - "sft_loss": 1.4326026439666748, + "grad_norm": 11.022266750671784, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": -0.27054914832115173, + "logits/rejected": -0.0914614200592041, + "logps/chosen": -1.3564226627349854, + "logps/rejected": -1.6926740407943726, + "loss": 1.0388, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3564226627349854, + "rewards/margins": 0.33625149726867676, + "rewards/rejected": -1.6926740407943726, + "sft_loss": 1.4034146070480347, "step": 3105 }, { "epoch": 1.6644923900317778, - "grad_norm": 10.11402429370191, - "learning_rate": 1.474304112985553e-06, - "logits/chosen": -0.5599151253700256, - "logits/rejected": -0.4676848351955414, - "logps/chosen": -1.3399332761764526, - "logps/rejected": -1.953974962234497, - "loss": 0.9437, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3399332761764526, - "rewards/margins": 0.614041805267334, - "rewards/rejected": -1.953974962234497, - "sft_loss": 1.312151312828064, + "grad_norm": 10.413816986747639, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": -0.19406765699386597, + "logits/rejected": -0.09133412688970566, + "logps/chosen": -1.3404884338378906, + "logps/rejected": -1.7077680826187134, + "loss": 1.0173, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3404884338378906, + "rewards/margins": 0.36727944016456604, + "rewards/rejected": -1.7077680826187134, + "sft_loss": 1.2941471338272095, "step": 3110 }, { "epoch": 1.6671684228131793, - "grad_norm": 8.156421754815042, - "learning_rate": 1.469632722795304e-06, - "logits/chosen": -0.4865991473197937, - "logits/rejected": -0.4286844730377197, - "logps/chosen": -1.3717714548110962, - "logps/rejected": -2.03859543800354, - "loss": 0.9726, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3717714548110962, - "rewards/margins": 0.6668239235877991, - "rewards/rejected": -2.03859543800354, - "sft_loss": 1.450972080230713, + "grad_norm": 8.872808421935437, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": -0.10099422931671143, + "logits/rejected": -0.016091059893369675, + "logps/chosen": -1.3404185771942139, + "logps/rejected": -1.7624202966690063, + "loss": 1.0256, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3404185771942139, + "rewards/margins": 0.4220016598701477, + "rewards/rejected": -1.7624202966690063, + "sft_loss": 1.4071005582809448, "step": 3115 }, { "epoch": 1.669844455594581, - "grad_norm": 7.9413448011852275, - "learning_rate": 1.4649616272285115e-06, - "logits/chosen": -0.591395914554596, - "logits/rejected": -0.48542946577072144, - "logps/chosen": -1.4039747714996338, - "logps/rejected": -2.049610137939453, - "loss": 1.0139, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.4039747714996338, - "rewards/margins": 0.6456353068351746, - "rewards/rejected": -2.049610137939453, - "sft_loss": 1.4193564653396606, + "grad_norm": 6.796383293872087, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": -0.2519132196903229, + "logits/rejected": -0.10128184407949448, + "logps/chosen": -1.403340220451355, + "logps/rejected": -1.7736599445343018, + "loss": 1.0702, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.403340220451355, + "rewards/margins": 0.370319664478302, + "rewards/rejected": -1.7736599445343018, + "sft_loss": 1.4098050594329834, "step": 3120 }, { "epoch": 1.6725204883759828, - "grad_norm": 6.763474728045645, - "learning_rate": 1.4602908716041651e-06, - "logits/chosen": -0.4781588912010193, - "logits/rejected": -0.39982470870018005, - "logps/chosen": -1.526206374168396, - "logps/rejected": -2.1994595527648926, - "loss": 0.9935, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.526206374168396, - "rewards/margins": 0.6732532978057861, - "rewards/rejected": -2.1994595527648926, - "sft_loss": 1.4044595956802368, + "grad_norm": 5.748312285692267, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": -0.15138813853263855, + "logits/rejected": -0.04131780564785004, + "logps/chosen": -1.4313924312591553, + "logps/rejected": -1.8283259868621826, + "loss": 1.037, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4313924312591553, + "rewards/margins": 0.39693355560302734, + "rewards/rejected": -1.8283259868621826, + "sft_loss": 1.3752673864364624, "step": 3125 }, { "epoch": 1.675196521157384, - "grad_norm": 9.44893034407677, - "learning_rate": 1.4556205012379568e-06, - "logits/chosen": -0.48927783966064453, - "logits/rejected": -0.3744940161705017, - "logps/chosen": -1.3688924312591553, - "logps/rejected": -1.95903742313385, - "loss": 0.9783, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3688924312591553, - "rewards/margins": 0.5901449918746948, - "rewards/rejected": -1.95903742313385, - "sft_loss": 1.4328769445419312, + "grad_norm": 8.76407108118166, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": -0.15335145592689514, + "logits/rejected": -0.016062330454587936, + "logps/chosen": -1.3842003345489502, + "logps/rejected": -1.7323811054229736, + "loss": 1.0399, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3842003345489502, + "rewards/margins": 0.3481805920600891, + "rewards/rejected": -1.7323811054229736, + "sft_loss": 1.4326179027557373, "step": 3130 }, { "epoch": 1.6778725539387858, - "grad_norm": 7.876706171928691, - "learning_rate": 1.4509505614418402e-06, - "logits/chosen": -0.4390248656272888, - "logits/rejected": -0.38640502095222473, - "logps/chosen": -1.4024837017059326, - "logps/rejected": -1.9528404474258423, - "loss": 0.9861, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4024837017059326, - "rewards/margins": 0.5503565073013306, - "rewards/rejected": -1.9528404474258423, - "sft_loss": 1.3517831563949585, + "grad_norm": 6.8240954594636385, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": -0.10055909305810928, + "logits/rejected": -0.03672239929437637, + "logps/chosen": -1.3982130289077759, + "logps/rejected": -1.7273247241973877, + "loss": 1.0455, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3982130289077759, + "rewards/margins": 0.3291115462779999, + "rewards/rejected": -1.7273247241973877, + "sft_loss": 1.3380857706069946, "step": 3135 }, { "epoch": 1.6805485867201875, - "grad_norm": 7.842891697104294, - "learning_rate": 1.4462810975235915e-06, - "logits/chosen": -0.6836906671524048, - "logits/rejected": -0.5594662427902222, - "logps/chosen": -1.2556931972503662, - "logps/rejected": -1.6881049871444702, - "loss": 0.992, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.2556931972503662, - "rewards/margins": 0.4324119985103607, - "rewards/rejected": -1.6881049871444702, - "sft_loss": 1.3282722234725952, + "grad_norm": 5.681251133368896, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": -0.38894954323768616, + "logits/rejected": -0.23619875311851501, + "logps/chosen": -1.250199317932129, + "logps/rejected": -1.519134283065796, + "loss": 1.0452, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.250199317932129, + "rewards/margins": 0.26893505454063416, + "rewards/rejected": -1.519134283065796, + "sft_loss": 1.3250980377197266, "step": 3140 }, { "epoch": 1.6832246195015887, - "grad_norm": 9.152902396575497, - "learning_rate": 1.4416121547863703e-06, - "logits/chosen": -0.49310851097106934, - "logits/rejected": -0.3880365490913391, - "logps/chosen": -1.3027580976486206, - "logps/rejected": -1.9210455417633057, - "loss": 0.9856, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3027580976486206, - "rewards/margins": 0.6182874441146851, - "rewards/rejected": -1.9210455417633057, - "sft_loss": 1.362561583518982, + "grad_norm": 8.042233248752146, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": -0.1970815658569336, + "logits/rejected": -0.060459040105342865, + "logps/chosen": -1.294335961341858, + "logps/rejected": -1.7021186351776123, + "loss": 1.0417, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.294335961341858, + "rewards/margins": 0.407782644033432, + "rewards/rejected": -1.7021186351776123, + "sft_loss": 1.3637042045593262, "step": 3145 }, { "epoch": 1.6859006522829905, - "grad_norm": 6.911340969081792, - "learning_rate": 1.4369437785282794e-06, - "logits/chosen": -0.6421962976455688, - "logits/rejected": -0.5590324401855469, - "logps/chosen": -1.3706694841384888, - "logps/rejected": -1.921740174293518, - "loss": 0.9652, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3706694841384888, - "rewards/margins": 0.5510705709457397, - "rewards/rejected": -1.921740174293518, - "sft_loss": 1.385704755783081, + "grad_norm": 9.010907080241779, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": -0.33565548062324524, + "logits/rejected": -0.21785983443260193, + "logps/chosen": -1.3821521997451782, + "logps/rejected": -1.7122135162353516, + "loss": 1.0358, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3821521997451782, + "rewards/margins": 0.33006131649017334, + "rewards/rejected": -1.7122135162353516, + "sft_loss": 1.374982237815857, "step": 3150 }, { "epoch": 1.6885766850643922, - "grad_norm": 8.799634282616347, - "learning_rate": 1.4322760140419259e-06, - "logits/chosen": -0.6151847839355469, - "logits/rejected": -0.5183078646659851, - "logps/chosen": -1.2506282329559326, - "logps/rejected": -1.8808501958847046, - "loss": 0.9542, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.2506282329559326, - "rewards/margins": 0.630221962928772, - "rewards/rejected": -1.8808501958847046, - "sft_loss": 1.3045356273651123, + "grad_norm": 9.202178181845822, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": -0.31604892015457153, + "logits/rejected": -0.2019950896501541, + "logps/chosen": -1.2382168769836426, + "logps/rejected": -1.6578218936920166, + "loss": 0.9991, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2382168769836426, + "rewards/margins": 0.41960495710372925, + "rewards/rejected": -1.6578218936920166, + "sft_loss": 1.2910674810409546, "step": 3155 }, { "epoch": 1.6912527178457935, - "grad_norm": 13.265562216769109, - "learning_rate": 1.427608906613981e-06, - "logits/chosen": -0.561122715473175, - "logits/rejected": -0.5639852285385132, - "logps/chosen": -1.3408920764923096, - "logps/rejected": -1.9823658466339111, - "loss": 0.9855, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3408920764923096, - "rewards/margins": 0.6414738893508911, - "rewards/rejected": -1.9823658466339111, - "sft_loss": 1.4433765411376953, + "grad_norm": 8.42016279895261, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": -0.26183784008026123, + "logits/rejected": -0.24288320541381836, + "logps/chosen": -1.3160405158996582, + "logps/rejected": -1.7478888034820557, + "loss": 1.0328, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3160405158996582, + "rewards/margins": 0.4318482279777527, + "rewards/rejected": -1.7478888034820557, + "sft_loss": 1.3997399806976318, "step": 3160 }, { "epoch": 1.6939287506271952, - "grad_norm": 8.517806199918638, - "learning_rate": 1.4229425015247414e-06, - "logits/chosen": -0.6105222105979919, - "logits/rejected": -0.5246438980102539, - "logps/chosen": -1.3622534275054932, - "logps/rejected": -1.8676481246948242, - "loss": 1.0286, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3622534275054932, - "rewards/margins": 0.5053948760032654, - "rewards/rejected": -1.8676481246948242, - "sft_loss": 1.4393728971481323, + "grad_norm": 5.810263975580805, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": -0.3238453269004822, + "logits/rejected": -0.23669204115867615, + "logps/chosen": -1.3417284488677979, + "logps/rejected": -1.6320661306381226, + "loss": 1.0733, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3417284488677979, + "rewards/margins": 0.2903375029563904, + "rewards/rejected": -1.6320661306381226, + "sft_loss": 1.40792715549469, "step": 3165 }, { "epoch": 1.6966047834085969, - "grad_norm": 6.247142456096901, - "learning_rate": 1.4182768440476904e-06, - "logits/chosen": -0.6173042058944702, - "logits/rejected": -0.5559664964675903, - "logps/chosen": -1.358097791671753, - "logps/rejected": -1.9720399379730225, - "loss": 0.9824, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.358097791671753, - "rewards/margins": 0.6139422059059143, - "rewards/rejected": -1.9720399379730225, - "sft_loss": 1.4049310684204102, + "grad_norm": 7.031312718505899, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": -0.2895796000957489, + "logits/rejected": -0.193088099360466, + "logps/chosen": -1.3276104927062988, + "logps/rejected": -1.6946378946304321, + "loss": 1.0454, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3276104927062988, + "rewards/margins": 0.36702749133110046, + "rewards/rejected": -1.6946378946304321, + "sft_loss": 1.3769690990447998, "step": 3170 }, { "epoch": 1.6992808161899984, - "grad_norm": 19.69526048686937, - "learning_rate": 1.4136119794490567e-06, - "logits/chosen": -0.6866458654403687, - "logits/rejected": -0.6137515902519226, - "logps/chosen": -1.3925710916519165, - "logps/rejected": -1.8843557834625244, - "loss": 1.0459, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3925710916519165, - "rewards/margins": 0.4917844831943512, - "rewards/rejected": -1.8843557834625244, - "sft_loss": 1.431840181350708, + "grad_norm": 11.568266998719043, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": -0.31578174233436584, + "logits/rejected": -0.22001910209655762, + "logps/chosen": -1.3103505373001099, + "logps/rejected": -1.603722333908081, + "loss": 1.0611, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3103505373001099, + "rewards/margins": 0.29337188601493835, + "rewards/rejected": -1.603722333908081, + "sft_loss": 1.345990538597107, "step": 3175 }, { "epoch": 1.7019568489713999, - "grad_norm": 6.359568180732788, - "learning_rate": 1.4089479529873773e-06, - "logits/chosen": -0.5308200716972351, - "logits/rejected": -0.4930025637149811, - "logps/chosen": -1.3771328926086426, - "logps/rejected": -2.063537359237671, - "loss": 0.9812, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3771328926086426, - "rewards/margins": 0.6864045858383179, - "rewards/rejected": -2.063537359237671, - "sft_loss": 1.3803982734680176, + "grad_norm": 6.16136392232105, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": -0.1511804759502411, + "logits/rejected": -0.0974348783493042, + "logps/chosen": -1.3440520763397217, + "logps/rejected": -1.8585773706436157, + "loss": 1.0114, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3440520763397217, + "rewards/margins": 0.5145252346992493, + "rewards/rejected": -1.8585773706436157, + "sft_loss": 1.354104995727539, "step": 3180 }, { "epoch": 1.7046328817528016, - "grad_norm": 7.135729281452068, - "learning_rate": 1.4042848099130574e-06, - "logits/chosen": -0.5700703263282776, - "logits/rejected": -0.5708822011947632, - "logps/chosen": -1.2795307636260986, - "logps/rejected": -1.7099710702896118, - "loss": 1.0013, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2795307636260986, - "rewards/margins": 0.43044036626815796, - "rewards/rejected": -1.7099710702896118, - "sft_loss": 1.358174204826355, + "grad_norm": 7.683076853167783, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": -0.20450139045715332, + "logits/rejected": -0.1865270435810089, + "logps/chosen": -1.297811508178711, + "logps/rejected": -1.5590054988861084, + "loss": 1.0452, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.297811508178711, + "rewards/margins": 0.2611939609050751, + "rewards/rejected": -1.5590054988861084, + "sft_loss": 1.3574228286743164, "step": 3185 }, { "epoch": 1.707308914534203, - "grad_norm": 5.886225397742045, - "learning_rate": 1.3996225954679317e-06, - "logits/chosen": -0.5978802442550659, - "logits/rejected": -0.4967038035392761, - "logps/chosen": -1.2683467864990234, - "logps/rejected": -1.8838962316513062, - "loss": 0.9146, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2683467864990234, - "rewards/margins": 0.6155495047569275, - "rewards/rejected": -1.8838962316513062, - "sft_loss": 1.2717249393463135, + "grad_norm": 7.340556040477444, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": -0.2812632918357849, + "logits/rejected": -0.1310562640428543, + "logps/chosen": -1.2653824090957642, + "logps/rejected": -1.74893057346344, + "loss": 0.9541, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2653824090957642, + "rewards/margins": 0.483548104763031, + "rewards/rejected": -1.74893057346344, + "sft_loss": 1.2568496465682983, "step": 3190 }, { "epoch": 1.7099849473156046, - "grad_norm": 6.2223063839513735, - "learning_rate": 1.3949613548848248e-06, - "logits/chosen": -0.6236740350723267, - "logits/rejected": -0.5328378677368164, - "logps/chosen": -1.2708802223205566, - "logps/rejected": -1.8795549869537354, - "loss": 0.9315, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2708802223205566, - "rewards/margins": 0.6086748242378235, - "rewards/rejected": -1.8795549869537354, - "sft_loss": 1.259526014328003, + "grad_norm": 8.825030310410854, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": -0.2938292920589447, + "logits/rejected": -0.1903020441532135, + "logps/chosen": -1.2760530710220337, + "logps/rejected": -1.7193348407745361, + "loss": 0.9749, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2760530710220337, + "rewards/margins": 0.4432816505432129, + "rewards/rejected": -1.7193348407745361, + "sft_loss": 1.2668559551239014, "step": 3195 }, { "epoch": 1.7126609800970063, - "grad_norm": 7.529280028879536, - "learning_rate": 1.3903011333871134e-06, - "logits/chosen": -0.5299532413482666, - "logits/rejected": -0.4078814387321472, - "logps/chosen": -1.3449041843414307, - "logps/rejected": -1.996220350265503, - "loss": 0.9824, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3449041843414307, - "rewards/margins": 0.6513162851333618, - "rewards/rejected": -1.996220350265503, - "sft_loss": 1.3764393329620361, + "grad_norm": 7.873635336903305, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": -0.19705058634281158, + "logits/rejected": -0.06154204532504082, + "logps/chosen": -1.3533506393432617, + "logps/rejected": -1.8990510702133179, + "loss": 1.0196, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3533506393432617, + "rewards/margins": 0.5457005500793457, + "rewards/rejected": -1.8990510702133179, + "sft_loss": 1.3797739744186401, "step": 3200 }, { "epoch": 1.7126609800970063, - "eval_logits/chosen": -0.29770681262016296, - "eval_logits/rejected": -0.23772414028644562, - "eval_logps/chosen": -1.3916462659835815, - "eval_logps/rejected": -1.8345197439193726, - "eval_loss": 1.0347309112548828, - "eval_rewards/accuracies": 0.6283382773399353, - "eval_rewards/chosen": -1.3916462659835815, - "eval_rewards/margins": 0.4428735375404358, - "eval_rewards/rejected": -1.8345197439193726, - "eval_runtime": 46.4526, - "eval_samples_per_second": 28.954, - "eval_sft_loss": 1.406299114227295, - "eval_steps_per_second": 7.255, + "eval_logits/chosen": 0.10069998353719711, + "eval_logits/rejected": 0.18723244965076447, + "eval_logps/chosen": -1.361523985862732, + "eval_logps/rejected": -1.7338141202926636, + "eval_loss": 1.0449482202529907, + "eval_rewards/accuracies": 0.6142433285713196, + "eval_rewards/chosen": -1.361523985862732, + "eval_rewards/margins": 0.37229031324386597, + "eval_rewards/rejected": -1.7338141202926636, + "eval_runtime": 43.4483, + "eval_samples_per_second": 30.956, + "eval_sft_loss": 1.384097933769226, + "eval_steps_per_second": 7.756, "step": 3200 }, { "epoch": 1.7153370128784078, - "grad_norm": 7.354120391704242, - "learning_rate": 1.3856419761882875e-06, - "logits/chosen": -0.6612704992294312, - "logits/rejected": -0.579429030418396, - "logps/chosen": -1.3465828895568848, - "logps/rejected": -1.9286384582519531, - "loss": 0.9623, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3465828895568848, - "rewards/margins": 0.5820555686950684, - "rewards/rejected": -1.9286384582519531, - "sft_loss": 1.3542835712432861, + "grad_norm": 6.844268168714967, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": -0.3229488432407379, + "logits/rejected": -0.20911166071891785, + "logps/chosen": -1.3647921085357666, + "logps/rejected": -1.7320101261138916, + "loss": 1.0377, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3647921085357666, + "rewards/margins": 0.3672178387641907, + "rewards/rejected": -1.7320101261138916, + "sft_loss": 1.368546485900879, "step": 3205 }, { "epoch": 1.7180130456598093, - "grad_norm": 8.205677696925335, - "learning_rate": 1.3809839284915096e-06, - "logits/chosen": -0.638985276222229, - "logits/rejected": -0.5678723454475403, - "logps/chosen": -1.3050861358642578, - "logps/rejected": -1.8729593753814697, - "loss": 0.9808, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3050861358642578, - "rewards/margins": 0.5678732991218567, - "rewards/rejected": -1.8729593753814697, - "sft_loss": 1.3473398685455322, + "grad_norm": 8.115234289434698, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": -0.28309884667396545, + "logits/rejected": -0.18161797523498535, + "logps/chosen": -1.3209306001663208, + "logps/rejected": -1.6985241174697876, + "loss": 1.0407, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3209306001663208, + "rewards/margins": 0.377593457698822, + "rewards/rejected": -1.6985241174697876, + "sft_loss": 1.3438318967819214, "step": 3210 }, { "epoch": 1.720689078441211, - "grad_norm": 8.31806706498762, - "learning_rate": 1.3763270354891795e-06, - "logits/chosen": -0.6113357543945312, - "logits/rejected": -0.5205150842666626, - "logps/chosen": -1.3317005634307861, - "logps/rejected": -1.945020079612732, - "loss": 0.9746, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3317005634307861, - "rewards/margins": 0.6133192777633667, - "rewards/rejected": -1.945020079612732, - "sft_loss": 1.3502792119979858, + "grad_norm": 8.894952523029621, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": -0.26940733194351196, + "logits/rejected": -0.14569416642189026, + "logps/chosen": -1.3172519207000732, + "logps/rejected": -1.7098270654678345, + "loss": 1.029, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3172519207000732, + "rewards/margins": 0.39257511496543884, + "rewards/rejected": -1.7098270654678345, + "sft_loss": 1.3400871753692627, "step": 3215 }, { "epoch": 1.7233651112226125, - "grad_norm": 6.374273487412651, - "learning_rate": 1.3716713423624936e-06, - "logits/chosen": -0.6377384662628174, - "logits/rejected": -0.4735463559627533, - "logps/chosen": -1.4612407684326172, - "logps/rejected": -2.114426374435425, - "loss": 1.0311, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4612407684326172, - "rewards/margins": 0.6531856656074524, - "rewards/rejected": -2.114426374435425, - "sft_loss": 1.4271241426467896, + "grad_norm": 8.060961097393376, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": -0.24872052669525146, + "logits/rejected": -0.03148692101240158, + "logps/chosen": -1.4487375020980835, + "logps/rejected": -1.8613462448120117, + "loss": 1.0828, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4487375020980835, + "rewards/margins": 0.4126088619232178, + "rewards/rejected": -1.8613462448120117, + "sft_loss": 1.4077409505844116, "step": 3220 }, { "epoch": 1.726041144004014, - "grad_norm": 7.714682568173162, - "learning_rate": 1.367016894281007e-06, - "logits/chosen": -0.6564761400222778, - "logits/rejected": -0.5606242418289185, - "logps/chosen": -1.2402262687683105, - "logps/rejected": -1.8998502492904663, - "loss": 0.9272, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2402262687683105, - "rewards/margins": 0.6596239805221558, - "rewards/rejected": -1.8998502492904663, - "sft_loss": 1.3146278858184814, + "grad_norm": 8.652416739646052, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": -0.24346446990966797, + "logits/rejected": -0.10656796395778656, + "logps/chosen": -1.257863998413086, + "logps/rejected": -1.7140146493911743, + "loss": 0.9896, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.257863998413086, + "rewards/margins": 0.456150621175766, + "rewards/rejected": -1.7140146493911743, + "sft_loss": 1.3103606700897217, "step": 3225 }, { "epoch": 1.7287171767854157, - "grad_norm": 9.634844022060468, - "learning_rate": 1.3623637364021952e-06, - "logits/chosen": -0.6817273497581482, - "logits/rejected": -0.565589427947998, - "logps/chosen": -1.3778315782546997, - "logps/rejected": -2.260098695755005, - "loss": 0.9018, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3778315782546997, - "rewards/margins": 0.88226717710495, - "rewards/rejected": -2.260098695755005, - "sft_loss": 1.3882973194122314, + "grad_norm": 7.284827283113485, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": -0.2828959822654724, + "logits/rejected": -0.13060639798641205, + "logps/chosen": -1.3334567546844482, + "logps/rejected": -1.8920681476593018, + "loss": 0.9674, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3334567546844482, + "rewards/margins": 0.558611273765564, + "rewards/rejected": -1.8920681476593018, + "sft_loss": 1.3370991945266724, "step": 3230 }, { "epoch": 1.7313932095668172, - "grad_norm": 7.761642142665585, - "learning_rate": 1.3577119138710165e-06, - "logits/chosen": -0.6394303441047668, - "logits/rejected": -0.6095726490020752, - "logps/chosen": -1.3704173564910889, - "logps/rejected": -1.9797813892364502, - "loss": 0.9781, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3704173564910889, - "rewards/margins": 0.6093640923500061, - "rewards/rejected": -1.9797813892364502, - "sft_loss": 1.4049458503723145, + "grad_norm": 6.933979823213048, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": -0.26814383268356323, + "logits/rejected": -0.20636215806007385, + "logps/chosen": -1.3193995952606201, + "logps/rejected": -1.7179428339004517, + "loss": 1.0148, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3193995952606201, + "rewards/margins": 0.3985433876514435, + "rewards/rejected": -1.7179428339004517, + "sft_loss": 1.3590309619903564, "step": 3235 }, { "epoch": 1.7340692423482187, - "grad_norm": 7.592780449296121, - "learning_rate": 1.3530614718194734e-06, - "logits/chosen": -0.588589072227478, - "logits/rejected": -0.5276697874069214, - "logps/chosen": -1.3299793004989624, - "logps/rejected": -2.089031219482422, - "loss": 0.9079, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3299793004989624, - "rewards/margins": 0.7590519189834595, - "rewards/rejected": -2.089031219482422, - "sft_loss": 1.279463291168213, + "grad_norm": 7.415555323593706, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": -0.18289199471473694, + "logits/rejected": -0.08508212119340897, + "logps/chosen": -1.292557954788208, + "logps/rejected": -1.7914676666259766, + "loss": 0.9544, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.292557954788208, + "rewards/margins": 0.4989096522331238, + "rewards/rejected": -1.7914676666259766, + "sft_loss": 1.2400840520858765, "step": 3240 }, { "epoch": 1.7367452751296204, - "grad_norm": 11.02640467263545, - "learning_rate": 1.3484124553661754e-06, - "logits/chosen": -0.7396196126937866, - "logits/rejected": -0.6345130205154419, - "logps/chosen": -1.380319595336914, - "logps/rejected": -2.030441999435425, - "loss": 0.9741, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.380319595336914, - "rewards/margins": 0.6501225233078003, - "rewards/rejected": -2.030441999435425, - "sft_loss": 1.3692419528961182, + "grad_norm": 14.699238080869227, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": -0.35700684785842896, + "logits/rejected": -0.21485145390033722, + "logps/chosen": -1.333528995513916, + "logps/rejected": -1.781053900718689, + "loss": 1.0136, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.333528995513916, + "rewards/margins": 0.4475248456001282, + "rewards/rejected": -1.781053900718689, + "sft_loss": 1.3286097049713135, "step": 3245 }, { "epoch": 1.739421307911022, - "grad_norm": 8.612600618686669, - "learning_rate": 1.3437649096159e-06, - "logits/chosen": -0.5689498782157898, - "logits/rejected": -0.496391624212265, - "logps/chosen": -1.3582226037979126, - "logps/rejected": -1.966665267944336, - "loss": 0.9627, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3582226037979126, - "rewards/margins": 0.6084426641464233, - "rewards/rejected": -1.966665267944336, - "sft_loss": 1.3722515106201172, + "grad_norm": 9.524357067386456, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": -0.13468368351459503, + "logits/rejected": 0.0022729337215423584, + "logps/chosen": -1.3335119485855103, + "logps/rejected": -1.7451871633529663, + "loss": 1.0217, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3335119485855103, + "rewards/margins": 0.4116753041744232, + "rewards/rejected": -1.7451871633529663, + "sft_loss": 1.3561052083969116, "step": 3250 }, { "epoch": 1.7420973406924234, - "grad_norm": 5.941814371197206, - "learning_rate": 1.3391188796591568e-06, - "logits/chosen": -0.6051737666130066, - "logits/rejected": -0.5670342445373535, - "logps/chosen": -1.398306131362915, - "logps/rejected": -1.9220012426376343, - "loss": 0.9966, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.398306131362915, - "rewards/margins": 0.5236951112747192, - "rewards/rejected": -1.9220012426376343, - "sft_loss": 1.4062070846557617, + "grad_norm": 6.71190547951929, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": -0.17234370112419128, + "logits/rejected": -0.09425880759954453, + "logps/chosen": -1.3979463577270508, + "logps/rejected": -1.6748193502426147, + "loss": 1.0709, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3979463577270508, + "rewards/margins": 0.27687305212020874, + "rewards/rejected": -1.6748193502426147, + "sft_loss": 1.3898849487304688, "step": 3255 }, { "epoch": 1.744773373473825, - "grad_norm": 7.529679820359014, - "learning_rate": 1.3344744105717487e-06, - "logits/chosen": -0.7107158303260803, - "logits/rejected": -0.6142610907554626, - "logps/chosen": -1.3456017971038818, - "logps/rejected": -1.86309814453125, - "loss": 0.9881, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3456017971038818, - "rewards/margins": 0.5174962878227234, - "rewards/rejected": -1.86309814453125, - "sft_loss": 1.3745471239089966, + "grad_norm": 7.512455926222893, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": -0.30011698603630066, + "logits/rejected": -0.16482755541801453, + "logps/chosen": -1.3577678203582764, + "logps/rejected": -1.6450729370117188, + "loss": 1.0584, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3577678203582764, + "rewards/margins": 0.28730517625808716, + "rewards/rejected": -1.6450729370117188, + "sft_loss": 1.364136815071106, "step": 3260 }, { "epoch": 1.7474494062552266, - "grad_norm": 11.00381742148276, - "learning_rate": 1.3298315474143354e-06, - "logits/chosen": -0.577115535736084, - "logits/rejected": -0.4973164200782776, - "logps/chosen": -1.2828433513641357, - "logps/rejected": -1.985243558883667, - "loss": 0.9345, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2828433513641357, - "rewards/margins": 0.7024003863334656, - "rewards/rejected": -1.985243558883667, - "sft_loss": 1.3437730073928833, + "grad_norm": 10.350971747439411, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": -0.19551385939121246, + "logits/rejected": -0.09423010051250458, + "logps/chosen": -1.283046841621399, + "logps/rejected": -1.7449703216552734, + "loss": 0.9992, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.283046841621399, + "rewards/margins": 0.4619235396385193, + "rewards/rejected": -1.7449703216552734, + "sft_loss": 1.334883689880371, "step": 3265 }, { "epoch": 1.750125439036628, - "grad_norm": 7.235230962019375, - "learning_rate": 1.3251903352319951e-06, - "logits/chosen": -0.5999752283096313, - "logits/rejected": -0.4686052203178406, - "logps/chosen": -1.2623176574707031, - "logps/rejected": -1.961260199546814, - "loss": 0.9324, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2623176574707031, - "rewards/margins": 0.6989427208900452, - "rewards/rejected": -1.961260199546814, - "sft_loss": 1.3062334060668945, + "grad_norm": 7.971566899292058, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": -0.19808469712734222, + "logits/rejected": -0.051246147602796555, + "logps/chosen": -1.2304213047027588, + "logps/rejected": -1.6856091022491455, + "loss": 0.975, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2304213047027588, + "rewards/margins": 0.4551876485347748, + "rewards/rejected": -1.6856091022491455, + "sft_loss": 1.2523653507232666, "step": 3270 }, { "epoch": 1.7528014718180298, - "grad_norm": 7.746349070805825, - "learning_rate": 1.3205508190537895e-06, - "logits/chosen": -0.6424199342727661, - "logits/rejected": -0.4257664680480957, - "logps/chosen": -1.3440327644348145, - "logps/rejected": -1.970116376876831, - "loss": 0.9803, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3440327644348145, - "rewards/margins": 0.6260837316513062, - "rewards/rejected": -1.970116376876831, - "sft_loss": 1.4007642269134521, + "grad_norm": 6.508714328333814, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": -0.25844448804855347, + "logits/rejected": 0.04663591459393501, + "logps/chosen": -1.3295787572860718, + "logps/rejected": -1.7243293523788452, + "loss": 1.0324, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3295787572860718, + "rewards/margins": 0.3947505056858063, + "rewards/rejected": -1.7243293523788452, + "sft_loss": 1.3863446712493896, "step": 3275 }, { "epoch": 1.7554775045994313, - "grad_norm": 11.270118234339037, - "learning_rate": 1.3159130438923242e-06, - "logits/chosen": -0.6333575248718262, - "logits/rejected": -0.5891388654708862, - "logps/chosen": -1.2919838428497314, - "logps/rejected": -1.8722622394561768, - "loss": 0.9688, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2919838428497314, - "rewards/margins": 0.5802782773971558, - "rewards/rejected": -1.8722622394561768, - "sft_loss": 1.3905534744262695, + "grad_norm": 12.375952625326908, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": -0.2323465794324875, + "logits/rejected": -0.14086218178272247, + "logps/chosen": -1.2785847187042236, + "logps/rejected": -1.6352514028549194, + "loss": 1.0352, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2785847187042236, + "rewards/margins": 0.3566668629646301, + "rewards/rejected": -1.6352514028549194, + "sft_loss": 1.3782927989959717, "step": 3280 }, { "epoch": 1.7581535373808328, - "grad_norm": 7.970392351680856, - "learning_rate": 1.3112770547433144e-06, - "logits/chosen": -0.6499245762825012, - "logits/rejected": -0.49632638692855835, - "logps/chosen": -1.3325730562210083, - "logps/rejected": -1.9567444324493408, - "loss": 0.9676, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3325730562210083, - "rewards/margins": 0.6241713166236877, - "rewards/rejected": -1.9567444324493408, - "sft_loss": 1.3798366785049438, + "grad_norm": 7.149176077230483, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": -0.25271302461624146, + "logits/rejected": -0.04393969476222992, + "logps/chosen": -1.289284586906433, + "logps/rejected": -1.7105737924575806, + "loss": 1.0153, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.289284586906433, + "rewards/margins": 0.4212891459465027, + "rewards/rejected": -1.7105737924575806, + "sft_loss": 1.3399940729141235, "step": 3285 }, { "epoch": 1.7608295701622345, - "grad_norm": 9.061602961372913, - "learning_rate": 1.3066428965851472e-06, - "logits/chosen": -0.6206027269363403, - "logits/rejected": -0.5510232448577881, - "logps/chosen": -1.4109750986099243, - "logps/rejected": -2.0197665691375732, - "loss": 1.0131, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.4109750986099243, - "rewards/margins": 0.6087915897369385, - "rewards/rejected": -2.0197665691375732, - "sft_loss": 1.4571069478988647, + "grad_norm": 9.483763943800556, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": -0.18309524655342102, + "logits/rejected": -0.07059819251298904, + "logps/chosen": -1.335192084312439, + "logps/rejected": -1.6796214580535889, + "loss": 1.0525, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.335192084312439, + "rewards/margins": 0.3444294333457947, + "rewards/rejected": -1.6796214580535889, + "sft_loss": 1.3960546255111694, "step": 3290 }, { "epoch": 1.763505602943636, - "grad_norm": 8.38575291192651, - "learning_rate": 1.3020106143784454e-06, - "logits/chosen": -0.6656057238578796, - "logits/rejected": -0.6355992555618286, - "logps/chosen": -1.4497514963150024, - "logps/rejected": -2.0534777641296387, - "loss": 1.0232, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.4497514963150024, - "rewards/margins": 0.6037260890007019, - "rewards/rejected": -2.0534777641296387, - "sft_loss": 1.4650858640670776, + "grad_norm": 9.271513968041686, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": -0.2182222157716751, + "logits/rejected": -0.14940352737903595, + "logps/chosen": -1.442674994468689, + "logps/rejected": -1.7633249759674072, + "loss": 1.1113, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.442674994468689, + "rewards/margins": 0.32064980268478394, + "rewards/rejected": -1.7633249759674072, + "sft_loss": 1.4603216648101807, "step": 3295 }, { "epoch": 1.7661816357250375, - "grad_norm": 7.755305600869868, - "learning_rate": 1.2973802530656314e-06, - "logits/chosen": -0.7491085529327393, - "logits/rejected": -0.6685279607772827, - "logps/chosen": -1.4618529081344604, - "logps/rejected": -2.1163430213928223, - "loss": 1.0177, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4618529081344604, - "rewards/margins": 0.6544899940490723, - "rewards/rejected": -2.1163430213928223, - "sft_loss": 1.5315229892730713, + "grad_norm": 7.3965118387087525, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": -0.31792744994163513, + "logits/rejected": -0.17529422044754028, + "logps/chosen": -1.4276206493377686, + "logps/rejected": -1.8709055185317993, + "loss": 1.0613, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4276206493377686, + "rewards/margins": 0.44328489899635315, + "rewards/rejected": -1.8709055185317993, + "sft_loss": 1.4553303718566895, "step": 3300 }, { "epoch": 1.7688576685064392, - "grad_norm": 12.168745838634132, - "learning_rate": 1.2927518575704906e-06, - "logits/chosen": -0.713337779045105, - "logits/rejected": -0.6121014952659607, - "logps/chosen": -1.3905467987060547, - "logps/rejected": -2.015956401824951, - "loss": 0.9914, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3905467987060547, - "rewards/margins": 0.6254096627235413, - "rewards/rejected": -2.015956401824951, - "sft_loss": 1.4138438701629639, + "grad_norm": 11.12920257714004, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": -0.2868819534778595, + "logits/rejected": -0.1467498242855072, + "logps/chosen": -1.3745849132537842, + "logps/rejected": -1.7479822635650635, + "loss": 1.0532, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3745849132537842, + "rewards/margins": 0.3733974099159241, + "rewards/rejected": -1.7479822635650635, + "sft_loss": 1.378862977027893, "step": 3305 }, { "epoch": 1.771533701287841, - "grad_norm": 7.931750234484632, - "learning_rate": 1.2881254727977365e-06, - "logits/chosen": -0.5274156332015991, - "logits/rejected": -0.5057668089866638, - "logps/chosen": -1.3538296222686768, - "logps/rejected": -1.9042785167694092, - "loss": 0.9768, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3538296222686768, - "rewards/margins": 0.5504489541053772, - "rewards/rejected": -1.9042785167694092, - "sft_loss": 1.382365107536316, + "grad_norm": 7.334585852212527, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": -0.12723413109779358, + "logits/rejected": -0.08206866681575775, + "logps/chosen": -1.3312435150146484, + "logps/rejected": -1.708142876625061, + "loss": 1.0238, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3312435150146484, + "rewards/margins": 0.37689924240112305, + "rewards/rejected": -1.708142876625061, + "sft_loss": 1.3588409423828125, "step": 3310 }, { "epoch": 1.7742097340692422, - "grad_norm": 9.237317453089084, - "learning_rate": 1.2835011436325749e-06, - "logits/chosen": -0.660834014415741, - "logits/rejected": -0.5497223734855652, - "logps/chosen": -1.314284086227417, - "logps/rejected": -1.891516923904419, - "loss": 0.9609, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.314284086227417, - "rewards/margins": 0.5772326588630676, - "rewards/rejected": -1.891516923904419, - "sft_loss": 1.344481348991394, + "grad_norm": 13.909124891292313, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": -0.32031458616256714, + "logits/rejected": -0.17457079887390137, + "logps/chosen": -1.3311392068862915, + "logps/rejected": -1.7172855138778687, + "loss": 1.0166, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3311392068862915, + "rewards/margins": 0.38614630699157715, + "rewards/rejected": -1.7172855138778687, + "sft_loss": 1.3369089365005493, "step": 3315 }, { "epoch": 1.776885766850644, - "grad_norm": 6.456649113595514, - "learning_rate": 1.278878914940267e-06, - "logits/chosen": -0.582197368144989, - "logits/rejected": -0.43381983041763306, - "logps/chosen": -1.3320949077606201, - "logps/rejected": -2.1774628162384033, - "loss": 0.9482, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3320949077606201, - "rewards/margins": 0.8453680872917175, - "rewards/rejected": -2.1774628162384033, - "sft_loss": 1.3886038064956665, + "grad_norm": 7.115716742325556, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": -0.22326748073101044, + "logits/rejected": -0.035021353513002396, + "logps/chosen": -1.3363620042800903, + "logps/rejected": -1.958900809288025, + "loss": 0.995, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3363620042800903, + "rewards/margins": 0.6225385665893555, + "rewards/rejected": -1.958900809288025, + "sft_loss": 1.3878682851791382, "step": 3320 }, { "epoch": 1.7795617996320456, - "grad_norm": 12.876972828783495, - "learning_rate": 1.2742588315656963e-06, - "logits/chosen": -0.6521707773208618, - "logits/rejected": -0.500217854976654, - "logps/chosen": -1.3390861749649048, - "logps/rejected": -2.0092623233795166, - "loss": 0.9634, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3390861749649048, - "rewards/margins": 0.6701762080192566, - "rewards/rejected": -2.0092623233795166, - "sft_loss": 1.4378734827041626, + "grad_norm": 8.268366165173441, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": -0.28518885374069214, + "logits/rejected": -0.10537783056497574, + "logps/chosen": -1.3669929504394531, + "logps/rejected": -1.8190317153930664, + "loss": 1.0318, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3669929504394531, + "rewards/margins": 0.4520387053489685, + "rewards/rejected": -1.8190317153930664, + "sft_loss": 1.4433027505874634, "step": 3325 }, { "epoch": 1.782237832413447, - "grad_norm": 6.924077732123589, - "learning_rate": 1.269640938332932e-06, - "logits/chosen": -0.5279954671859741, - "logits/rejected": -0.4550296366214752, - "logps/chosen": -1.2392055988311768, - "logps/rejected": -2.0392346382141113, - "loss": 0.8986, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2392055988311768, - "rewards/margins": 0.8000289797782898, - "rewards/rejected": -2.0392346382141113, - "sft_loss": 1.3009543418884277, + "grad_norm": 7.6136076819743, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": -0.1993323266506195, + "logits/rejected": -0.09304220974445343, + "logps/chosen": -1.283071756362915, + "logps/rejected": -1.7913768291473389, + "loss": 0.9885, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.283071756362915, + "rewards/margins": 0.5083053112030029, + "rewards/rejected": -1.7913768291473389, + "sft_loss": 1.3137251138687134, "step": 3330 }, { "epoch": 1.7849138651948486, - "grad_norm": 13.126758218379875, - "learning_rate": 1.265025280044794e-06, - "logits/chosen": -0.6107516884803772, - "logits/rejected": -0.4902040958404541, - "logps/chosen": -1.364108681678772, - "logps/rejected": -1.9469196796417236, - "loss": 0.9617, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.364108681678772, - "rewards/margins": 0.5828110575675964, - "rewards/rejected": -1.9469196796417236, - "sft_loss": 1.3647311925888062, + "grad_norm": 13.182307661983321, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": -0.22208702564239502, + "logits/rejected": -0.06606097519397736, + "logps/chosen": -1.3824620246887207, + "logps/rejected": -1.769325852394104, + "loss": 1.026, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3824620246887207, + "rewards/margins": 0.3868637979030609, + "rewards/rejected": -1.769325852394104, + "sft_loss": 1.3636934757232666, "step": 3335 }, { "epoch": 1.7875898979762503, - "grad_norm": 10.757237916977163, - "learning_rate": 1.2604119014824197e-06, - "logits/chosen": -0.5626201629638672, - "logits/rejected": -0.46400943398475647, - "logps/chosen": -1.3011457920074463, - "logps/rejected": -1.8755508661270142, - "loss": 0.9755, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3011457920074463, - "rewards/margins": 0.574405312538147, - "rewards/rejected": -1.8755508661270142, - "sft_loss": 1.3482170104980469, + "grad_norm": 7.351513845749289, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": -0.1672711819410324, + "logits/rejected": -0.013500380329787731, + "logps/chosen": -1.2844191789627075, + "logps/rejected": -1.7520344257354736, + "loss": 1.0129, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2844191789627075, + "rewards/margins": 0.4676152765750885, + "rewards/rejected": -1.7520344257354736, + "sft_loss": 1.3409457206726074, "step": 3340 }, { "epoch": 1.7902659307576518, - "grad_norm": 7.560167106069082, - "learning_rate": 1.2558008474048279e-06, - "logits/chosen": -0.5565083622932434, - "logits/rejected": -0.4244113862514496, - "logps/chosen": -1.230414628982544, - "logps/rejected": -1.8162529468536377, - "loss": 0.9346, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.230414628982544, - "rewards/margins": 0.5858383774757385, - "rewards/rejected": -1.8162529468536377, - "sft_loss": 1.293988585472107, + "grad_norm": 6.964401966409443, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": -0.16246755421161652, + "logits/rejected": 0.018709395080804825, + "logps/chosen": -1.2220256328582764, + "logps/rejected": -1.6237547397613525, + "loss": 0.9836, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2220256328582764, + "rewards/margins": 0.40172892808914185, + "rewards/rejected": -1.6237547397613525, + "sft_loss": 1.2745763063430786, "step": 3345 }, { "epoch": 1.7929419635390533, - "grad_norm": 10.80950410067998, - "learning_rate": 1.2511921625484857e-06, - "logits/chosen": -0.7061828374862671, - "logits/rejected": -0.6089943647384644, - "logps/chosen": -1.3991694450378418, - "logps/rejected": -1.9263055324554443, - "loss": 0.9943, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3991694450378418, - "rewards/margins": 0.5271362066268921, - "rewards/rejected": -1.9263055324554443, - "sft_loss": 1.4165757894515991, + "grad_norm": 8.104077888153768, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": -0.33674901723861694, + "logits/rejected": -0.18985441327095032, + "logps/chosen": -1.391183614730835, + "logps/rejected": -1.7601451873779297, + "loss": 1.0466, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.391183614730835, + "rewards/margins": 0.36896148324012756, + "rewards/rejected": -1.7601451873779297, + "sft_loss": 1.4082539081573486, "step": 3350 }, { "epoch": 1.795617996320455, - "grad_norm": 9.365451206885025, - "learning_rate": 1.2465858916268734e-06, - "logits/chosen": -0.4708705544471741, - "logits/rejected": -0.4554038643836975, - "logps/chosen": -1.4415693283081055, - "logps/rejected": -1.9839493036270142, - "loss": 1.0765, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -1.4415693283081055, - "rewards/margins": 0.5423800349235535, - "rewards/rejected": -1.9839493036270142, - "sft_loss": 1.4307851791381836, + "grad_norm": 8.728384432383228, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": -0.11587512493133545, + "logits/rejected": -0.07313670217990875, + "logps/chosen": -1.419593334197998, + "logps/rejected": -1.6886402368545532, + "loss": 1.1363, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.419593334197998, + "rewards/margins": 0.26904693245887756, + "rewards/rejected": -1.6886402368545532, + "sft_loss": 1.4004647731781006, "step": 3355 }, { "epoch": 1.7982940291018565, - "grad_norm": 11.867632025826655, - "learning_rate": 1.2419820793300526e-06, - "logits/chosen": -0.6369816064834595, - "logits/rejected": -0.4788491129875183, - "logps/chosen": -1.295993685722351, - "logps/rejected": -1.9095256328582764, - "loss": 0.9696, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -1.295993685722351, - "rewards/margins": 0.6135318875312805, - "rewards/rejected": -1.9095256328582764, - "sft_loss": 1.3290159702301025, + "grad_norm": 8.470114959808182, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": -0.2647283673286438, + "logits/rejected": -0.06007467582821846, + "logps/chosen": -1.2760438919067383, + "logps/rejected": -1.6822011470794678, + "loss": 1.0078, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2760438919067383, + "rewards/margins": 0.4061572551727295, + "rewards/rejected": -1.6822011470794678, + "sft_loss": 1.301793098449707, "step": 3360 }, { "epoch": 1.800970061883258, - "grad_norm": 8.543260387548337, - "learning_rate": 1.2373807703242293e-06, - "logits/chosen": -0.6585286259651184, - "logits/rejected": -0.5375559329986572, - "logps/chosen": -1.3580644130706787, - "logps/rejected": -1.9835560321807861, - "loss": 0.9655, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3580644130706787, - "rewards/margins": 0.625491738319397, - "rewards/rejected": -1.9835560321807861, - "sft_loss": 1.3997434377670288, + "grad_norm": 7.793389012007866, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": -0.2339763194322586, + "logits/rejected": -0.06817017495632172, + "logps/chosen": -1.318159818649292, + "logps/rejected": -1.762577772140503, + "loss": 0.9987, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.318159818649292, + "rewards/margins": 0.444417804479599, + "rewards/rejected": -1.762577772140503, + "sft_loss": 1.3555116653442383, "step": 3365 }, { "epoch": 1.8036460946646597, - "grad_norm": 8.393989138340482, - "learning_rate": 1.232782009251324e-06, - "logits/chosen": -0.643993079662323, - "logits/rejected": -0.5305663347244263, - "logps/chosen": -1.3146846294403076, - "logps/rejected": -1.8510560989379883, - "loss": 0.9927, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3146846294403076, - "rewards/margins": 0.536371648311615, - "rewards/rejected": -1.8510560989379883, - "sft_loss": 1.3597562313079834, + "grad_norm": 10.438996597528387, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": -0.20583009719848633, + "logits/rejected": -0.045845162123441696, + "logps/chosen": -1.356227159500122, + "logps/rejected": -1.6683311462402344, + "loss": 1.0769, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.356227159500122, + "rewards/margins": 0.31210413575172424, + "rewards/rejected": -1.6683311462402344, + "sft_loss": 1.351169228553772, "step": 3370 }, { "epoch": 1.8063221274460612, - "grad_norm": 11.44897952633478, - "learning_rate": 1.228185840728537e-06, - "logits/chosen": -0.5137637853622437, - "logits/rejected": -0.4967266023159027, - "logps/chosen": -1.4023181200027466, - "logps/rejected": -1.9296295642852783, - "loss": 1.0354, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.4023181200027466, - "rewards/margins": 0.5273114442825317, - "rewards/rejected": -1.9296295642852783, - "sft_loss": 1.4055049419403076, + "grad_norm": 12.764675577748738, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": -0.07709778100252151, + "logits/rejected": -0.026763681322336197, + "logps/chosen": -1.3959242105484009, + "logps/rejected": -1.683182954788208, + "loss": 1.0913, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3959242105484009, + "rewards/margins": 0.2872585356235504, + "rewards/rejected": -1.683182954788208, + "sft_loss": 1.3707129955291748, "step": 3375 }, { "epoch": 1.8089981602274627, - "grad_norm": 8.204998932114652, - "learning_rate": 1.2235923093479156e-06, - "logits/chosen": -0.7242423295974731, - "logits/rejected": -0.6163910627365112, - "logps/chosen": -1.3340586423873901, - "logps/rejected": -2.004737377166748, - "loss": 0.9414, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3340586423873901, - "rewards/margins": 0.670678973197937, - "rewards/rejected": -2.004737377166748, - "sft_loss": 1.3409759998321533, + "grad_norm": 5.923485220700442, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": -0.2798308730125427, + "logits/rejected": -0.13452866673469543, + "logps/chosen": -1.3274872303009033, + "logps/rejected": -1.761406660079956, + "loss": 0.983, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3274872303009033, + "rewards/margins": 0.4339195787906647, + "rewards/rejected": -1.761406660079956, + "sft_loss": 1.3321391344070435, "step": 3380 }, { "epoch": 1.8116741930088645, - "grad_norm": 7.287111485026014, - "learning_rate": 1.219001459675921e-06, - "logits/chosen": -0.6734879016876221, - "logits/rejected": -0.6657724380493164, - "logps/chosen": -1.3868675231933594, - "logps/rejected": -1.85861074924469, - "loss": 1.0186, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -1.3868675231933594, - "rewards/margins": 0.4717431962490082, - "rewards/rejected": -1.85861074924469, - "sft_loss": 1.4022619724273682, + "grad_norm": 7.301258798639009, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": -0.2560671865940094, + "logits/rejected": -0.22414672374725342, + "logps/chosen": -1.3753451108932495, + "logps/rejected": -1.6603202819824219, + "loss": 1.067, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3753451108932495, + "rewards/margins": 0.28497499227523804, + "rewards/rejected": -1.6603202819824219, + "sft_loss": 1.3981544971466064, "step": 3385 }, { "epoch": 1.814350225790266, - "grad_norm": 10.1264692240913, - "learning_rate": 1.2144133362529974e-06, - "logits/chosen": -0.6233974695205688, - "logits/rejected": -0.5367701053619385, - "logps/chosen": -1.418578863143921, - "logps/rejected": -1.9424394369125366, - "loss": 1.029, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.418578863143921, - "rewards/margins": 0.5238603949546814, - "rewards/rejected": -1.9424394369125366, - "sft_loss": 1.4560730457305908, + "grad_norm": 9.792834224375655, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": -0.1899821162223816, + "logits/rejected": -0.06347040086984634, + "logps/chosen": -1.397888422012329, + "logps/rejected": -1.7293627262115479, + "loss": 1.0746, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.397888422012329, + "rewards/margins": 0.3314744234085083, + "rewards/rejected": -1.7293627262115479, + "sft_loss": 1.4246985912322998, "step": 3390 }, { "epoch": 1.8170262585716674, - "grad_norm": 10.456040091114867, - "learning_rate": 1.2098279835931382e-06, - "logits/chosen": -0.6557838916778564, - "logits/rejected": -0.5996249914169312, - "logps/chosen": -1.2695906162261963, - "logps/rejected": -1.9603517055511475, - "loss": 0.9336, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2695906162261963, - "rewards/margins": 0.690761387348175, - "rewards/rejected": -1.9603517055511475, - "sft_loss": 1.2876946926116943, + "grad_norm": 9.254911328138565, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": -0.23466309905052185, + "logits/rejected": -0.12651152908802032, + "logps/chosen": -1.2661628723144531, + "logps/rejected": -1.7019329071044922, + "loss": 0.9972, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2661628723144531, + "rewards/margins": 0.43577009439468384, + "rewards/rejected": -1.7019329071044922, + "sft_loss": 1.2796809673309326, "step": 3395 }, { "epoch": 1.8197022913530692, - "grad_norm": 8.826313757148988, - "learning_rate": 1.2052454461834544e-06, - "logits/chosen": -0.6324909329414368, - "logits/rejected": -0.5516208410263062, - "logps/chosen": -1.356690764427185, - "logps/rejected": -1.9758851528167725, - "loss": 0.9906, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.356690764427185, - "rewards/margins": 0.6191944479942322, - "rewards/rejected": -1.9758851528167725, - "sft_loss": 1.3892412185668945, + "grad_norm": 8.433387175795398, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": -0.2272816002368927, + "logits/rejected": -0.11349409818649292, + "logps/chosen": -1.3540637493133545, + "logps/rejected": -1.7329727411270142, + "loss": 1.0411, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3540637493133545, + "rewards/margins": 0.3789089322090149, + "rewards/rejected": -1.7329727411270142, + "sft_loss": 1.3797038793563843, "step": 3400 }, { "epoch": 1.8223783241344707, - "grad_norm": 11.425711089424011, - "learning_rate": 1.2006657684837445e-06, - "logits/chosen": -0.6415581703186035, - "logits/rejected": -0.5432634353637695, - "logps/chosen": -1.3200055360794067, - "logps/rejected": -1.8696330785751343, - "loss": 0.9841, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.3200055360794067, - "rewards/margins": 0.5496276617050171, - "rewards/rejected": -1.8696330785751343, - "sft_loss": 1.399254560470581, + "grad_norm": 9.748980739349928, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": -0.22415952384471893, + "logits/rejected": -0.07019458711147308, + "logps/chosen": -1.3057830333709717, + "logps/rejected": -1.663480520248413, + "loss": 1.0315, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3057830333709717, + "rewards/margins": 0.35769766569137573, + "rewards/rejected": -1.663480520248413, + "sft_loss": 1.3861303329467773, "step": 3405 }, { "epoch": 1.8250543569158721, - "grad_norm": 7.832674288185595, - "learning_rate": 1.1960889949260613e-06, - "logits/chosen": -0.6560367345809937, - "logits/rejected": -0.5233356952667236, - "logps/chosen": -1.4361064434051514, - "logps/rejected": -1.959800362586975, - "loss": 1.0228, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.4361064434051514, - "rewards/margins": 0.5236939191818237, - "rewards/rejected": -1.959800362586975, - "sft_loss": 1.453070878982544, + "grad_norm": 14.291251597228019, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": -0.23121888935565948, + "logits/rejected": -0.008912255987524986, + "logps/chosen": -1.4583107233047485, + "logps/rejected": -1.7844129800796509, + "loss": 1.0807, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4583107233047485, + "rewards/margins": 0.32610228657722473, + "rewards/rejected": -1.7844129800796509, + "sft_loss": 1.458487629890442, "step": 3410 }, { "epoch": 1.8277303896972739, - "grad_norm": 15.075128594039345, - "learning_rate": 1.1915151699142825e-06, - "logits/chosen": -0.6657556891441345, - "logits/rejected": -0.6010826230049133, - "logps/chosen": -1.365739107131958, - "logps/rejected": -2.0313916206359863, - "loss": 1.0002, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.365739107131958, - "rewards/margins": 0.665652334690094, - "rewards/rejected": -2.0313916206359863, - "sft_loss": 1.4609452486038208, + "grad_norm": 9.983326050698938, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": -0.23669393360614777, + "logits/rejected": -0.13350990414619446, + "logps/chosen": -1.3569681644439697, + "logps/rejected": -1.793971061706543, + "loss": 1.0534, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3569681644439697, + "rewards/margins": 0.4370030462741852, + "rewards/rejected": -1.793971061706543, + "sft_loss": 1.4427287578582764, "step": 3415 }, { "epoch": 1.8304064224786754, - "grad_norm": 16.784397044420558, - "learning_rate": 1.1869443378236782e-06, - "logits/chosen": -0.6641906499862671, - "logits/rejected": -0.5761057734489441, - "logps/chosen": -1.437990427017212, - "logps/rejected": -2.15848708152771, - "loss": 1.02, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.437990427017212, - "rewards/margins": 0.7204967737197876, - "rewards/rejected": -2.15848708152771, - "sft_loss": 1.4786887168884277, + "grad_norm": 20.06942241202815, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": -0.17605897784233093, + "logits/rejected": -0.05872791260480881, + "logps/chosen": -1.4228013753890991, + "logps/rejected": -1.9332406520843506, + "loss": 1.0655, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4228013753890991, + "rewards/margins": 0.5104393362998962, + "rewards/rejected": -1.9332406520843506, + "sft_loss": 1.4420769214630127, "step": 3420 }, { "epoch": 1.8330824552600768, - "grad_norm": 13.981297309313103, - "learning_rate": 1.1823765430004812e-06, - "logits/chosen": -0.700829029083252, - "logits/rejected": -0.6927005648612976, - "logps/chosen": -1.3422982692718506, - "logps/rejected": -2.0163462162017822, - "loss": 0.9734, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3422982692718506, - "rewards/margins": 0.6740477085113525, - "rewards/rejected": -2.0163462162017822, - "sft_loss": 1.351973056793213, + "grad_norm": 6.966730048920975, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": -0.26047974824905396, + "logits/rejected": -0.21100255846977234, + "logps/chosen": -1.349595546722412, + "logps/rejected": -1.765178918838501, + "loss": 1.0365, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.349595546722412, + "rewards/margins": 0.41558337211608887, + "rewards/rejected": -1.765178918838501, + "sft_loss": 1.3722059726715088, "step": 3425 }, { "epoch": 1.8357584880414786, - "grad_norm": 8.964708336353103, - "learning_rate": 1.177811829761457e-06, - "logits/chosen": -0.6395770907402039, - "logits/rejected": -0.5657497048377991, - "logps/chosen": -1.3664880990982056, - "logps/rejected": -2.083770275115967, - "loss": 0.9664, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3664880990982056, - "rewards/margins": 0.7172822952270508, - "rewards/rejected": -2.083770275115967, - "sft_loss": 1.3898346424102783, + "grad_norm": 8.473349423749175, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": -0.2327476292848587, + "logits/rejected": -0.10556666553020477, + "logps/chosen": -1.3813903331756592, + "logps/rejected": -1.886596441268921, + "loss": 1.035, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3813903331756592, + "rewards/margins": 0.5052059888839722, + "rewards/rejected": -1.886596441268921, + "sft_loss": 1.3991137742996216, "step": 3430 }, { "epoch": 1.83843452082288, - "grad_norm": 8.986806024083583, - "learning_rate": 1.1732502423934737e-06, - "logits/chosen": -0.634931206703186, - "logits/rejected": -0.6005954742431641, - "logps/chosen": -1.3003346920013428, - "logps/rejected": -1.934922456741333, - "loss": 0.9261, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3003346920013428, - "rewards/margins": 0.6345877051353455, - "rewards/rejected": -1.934922456741333, - "sft_loss": 1.3663181066513062, + "grad_norm": 10.035402052308962, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": -0.23867163062095642, + "logits/rejected": -0.15210063755512238, + "logps/chosen": -1.3119127750396729, + "logps/rejected": -1.7596677541732788, + "loss": 0.9875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3119127750396729, + "rewards/margins": 0.44775494933128357, + "rewards/rejected": -1.7596677541732788, + "sft_loss": 1.3625072240829468, "step": 3435 }, { "epoch": 1.8411105536042816, - "grad_norm": 7.9950743058468925, - "learning_rate": 1.1686918251530716e-06, - "logits/chosen": -0.7094308137893677, - "logits/rejected": -0.6693183779716492, - "logps/chosen": -1.2470004558563232, - "logps/rejected": -2.106454372406006, - "loss": 0.9191, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2470004558563232, - "rewards/margins": 0.8594539761543274, - "rewards/rejected": -2.106454372406006, - "sft_loss": 1.2810404300689697, + "grad_norm": 8.330575167502747, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": -0.3020384907722473, + "logits/rejected": -0.23150360584259033, + "logps/chosen": -1.2680355310440063, + "logps/rejected": -1.8124500513076782, + "loss": 0.9956, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2680355310440063, + "rewards/margins": 0.5444144010543823, + "rewards/rejected": -1.8124500513076782, + "sft_loss": 1.2984832525253296, "step": 3440 }, { "epoch": 1.8437865863856833, - "grad_norm": 6.634098157658137, - "learning_rate": 1.164136622266035e-06, - "logits/chosen": -0.6816264390945435, - "logits/rejected": -0.5064053535461426, - "logps/chosen": -1.324466347694397, - "logps/rejected": -1.8923667669296265, - "loss": 0.9682, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.324466347694397, - "rewards/margins": 0.5679003596305847, - "rewards/rejected": -1.8923667669296265, - "sft_loss": 1.3814367055892944, + "grad_norm": 6.589495318347472, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": -0.26342055201530457, + "logits/rejected": -0.03827213495969772, + "logps/chosen": -1.3178520202636719, + "logps/rejected": -1.7128499746322632, + "loss": 1.0207, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3178520202636719, + "rewards/margins": 0.3949979543685913, + "rewards/rejected": -1.7128499746322632, + "sft_loss": 1.363925814628601, "step": 3445 }, { "epoch": 1.8464626191670848, - "grad_norm": 9.472889100430713, - "learning_rate": 1.1595846779269622e-06, - "logits/chosen": -0.7638979554176331, - "logits/rejected": -0.6376869082450867, - "logps/chosen": -1.3652839660644531, - "logps/rejected": -2.0850937366485596, - "loss": 0.9771, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3652839660644531, - "rewards/margins": 0.7198096513748169, - "rewards/rejected": -2.0850937366485596, - "sft_loss": 1.4393898248672485, + "grad_norm": 8.820902723158103, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": -0.35779404640197754, + "logits/rejected": -0.17605280876159668, + "logps/chosen": -1.3372957706451416, + "logps/rejected": -1.8262310028076172, + "loss": 1.0258, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3372957706451416, + "rewards/margins": 0.48893508315086365, + "rewards/rejected": -1.8262310028076172, + "sft_loss": 1.4235930442810059, "step": 3450 }, { "epoch": 1.8491386519484863, - "grad_norm": 8.955470407751477, - "learning_rate": 1.155036036298837e-06, - "logits/chosen": -0.6534041166305542, - "logits/rejected": -0.5347647666931152, - "logps/chosen": -1.522452712059021, - "logps/rejected": -2.2508766651153564, - "loss": 1.0283, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.522452712059021, - "rewards/margins": 0.7284238934516907, - "rewards/rejected": -2.2508766651153564, - "sft_loss": 1.5423953533172607, + "grad_norm": 10.074634622342717, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": -0.21685686707496643, + "logits/rejected": -0.05704827979207039, + "logps/chosen": -1.5073444843292236, + "logps/rejected": -1.9315385818481445, + "loss": 1.0891, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.5073444843292236, + "rewards/margins": 0.42419394850730896, + "rewards/rejected": -1.9315385818481445, + "sft_loss": 1.5247254371643066, "step": 3455 }, { "epoch": 1.851814684729888, - "grad_norm": 10.485052013304722, - "learning_rate": 1.1504907415126008e-06, - "logits/chosen": -0.5379031896591187, - "logits/rejected": -0.4626663327217102, - "logps/chosen": -1.3453060388565063, - "logps/rejected": -1.9888687133789062, - "loss": 0.9917, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3453060388565063, - "rewards/margins": 0.6435626149177551, - "rewards/rejected": -1.9888687133789062, - "sft_loss": 1.3984931707382202, + "grad_norm": 8.722866655496768, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": -0.11254153400659561, + "logits/rejected": 0.011176636442542076, + "logps/chosen": -1.3440258502960205, + "logps/rejected": -1.7505595684051514, + "loss": 1.052, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3440258502960205, + "rewards/margins": 0.4065338969230652, + "rewards/rejected": -1.7505595684051514, + "sft_loss": 1.385436773300171, "step": 3460 }, { "epoch": 1.8544907175112895, - "grad_norm": 6.859780274263937, - "learning_rate": 1.1459488376667235e-06, - "logits/chosen": -0.7011104822158813, - "logits/rejected": -0.6202434301376343, - "logps/chosen": -1.279159665107727, - "logps/rejected": -1.7484251260757446, - "loss": 0.969, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.279159665107727, - "rewards/margins": 0.46926528215408325, - "rewards/rejected": -1.7484251260757446, - "sft_loss": 1.3194999694824219, + "grad_norm": 6.386705264362839, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": -0.29518577456474304, + "logits/rejected": -0.17219959199428558, + "logps/chosen": -1.2693736553192139, + "logps/rejected": -1.5777580738067627, + "loss": 1.0049, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2693736553192139, + "rewards/margins": 0.30838438868522644, + "rewards/rejected": -1.5777580738067627, + "sft_loss": 1.3014901876449585, "step": 3465 }, { "epoch": 1.857166750292691, - "grad_norm": 10.019955364920394, - "learning_rate": 1.1414103688267756e-06, - "logits/chosen": -0.6622621417045593, - "logits/rejected": -0.5898221135139465, - "logps/chosen": -1.4044809341430664, - "logps/rejected": -2.0266287326812744, - "loss": 0.9957, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4044809341430664, - "rewards/margins": 0.6221475601196289, - "rewards/rejected": -2.0266287326812744, - "sft_loss": 1.4509638547897339, + "grad_norm": 12.161518236812636, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": -0.2759206295013428, + "logits/rejected": -0.17716960608959198, + "logps/chosen": -1.426809549331665, + "logps/rejected": -1.799734354019165, + "loss": 1.0697, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.426809549331665, + "rewards/margins": 0.3729247450828552, + "rewards/rejected": -1.799734354019165, + "sft_loss": 1.4560751914978027, "step": 3470 }, { "epoch": 1.8598427830740927, - "grad_norm": 11.18927324808781, - "learning_rate": 1.136875379025002e-06, - "logits/chosen": -0.6773689389228821, - "logits/rejected": -0.6406997442245483, - "logps/chosen": -1.3125277757644653, - "logps/rejected": -1.8904197216033936, - "loss": 0.9518, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3125277757644653, - "rewards/margins": 0.577892005443573, - "rewards/rejected": -1.8904197216033936, - "sft_loss": 1.3247658014297485, + "grad_norm": 7.8248066129413365, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": -0.3126507103443146, + "logits/rejected": -0.24135151505470276, + "logps/chosen": -1.3244651556015015, + "logps/rejected": -1.6968772411346436, + "loss": 1.0154, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3244651556015015, + "rewards/margins": 0.37241214513778687, + "rewards/rejected": -1.6968772411346436, + "sft_loss": 1.3305847644805908, "step": 3475 }, { "epoch": 1.8625188158554944, - "grad_norm": 6.368596092649226, - "learning_rate": 1.132343912259894e-06, - "logits/chosen": -0.5879019498825073, - "logits/rejected": -0.5750831365585327, - "logps/chosen": -1.3526540994644165, - "logps/rejected": -1.9275623559951782, - "loss": 0.9745, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3526540994644165, - "rewards/margins": 0.5749083757400513, - "rewards/rejected": -1.9275623559951782, - "sft_loss": 1.3961443901062012, + "grad_norm": 7.08266825370226, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": -0.16913610696792603, + "logits/rejected": -0.13287656009197235, + "logps/chosen": -1.357891321182251, + "logps/rejected": -1.7254955768585205, + "loss": 1.031, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.357891321182251, + "rewards/margins": 0.36760419607162476, + "rewards/rejected": -1.7254955768585205, + "sft_loss": 1.3800822496414185, "step": 3480 }, { "epoch": 1.8651948486368957, - "grad_norm": 8.96244156761912, - "learning_rate": 1.1278160124957617e-06, - "logits/chosen": -0.6188081502914429, - "logits/rejected": -0.5407212972640991, - "logps/chosen": -1.3009461164474487, - "logps/rejected": -1.8492584228515625, - "loss": 0.9641, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3009461164474487, - "rewards/margins": 0.5483121871948242, - "rewards/rejected": -1.8492584228515625, - "sft_loss": 1.3755310773849487, + "grad_norm": 7.992211502710734, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": -0.20583942532539368, + "logits/rejected": -0.09783850610256195, + "logps/chosen": -1.3103139400482178, + "logps/rejected": -1.6683019399642944, + "loss": 1.0259, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3103139400482178, + "rewards/margins": 0.3579878509044647, + "rewards/rejected": -1.6683019399642944, + "sft_loss": 1.3723644018173218, "step": 3485 }, { "epoch": 1.8678708814182974, - "grad_norm": 7.912781929815597, - "learning_rate": 1.1232917236623085e-06, - "logits/chosen": -0.5891327857971191, - "logits/rejected": -0.5167615413665771, - "logps/chosen": -1.3643978834152222, - "logps/rejected": -1.8810253143310547, - "loss": 1.012, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3643978834152222, - "rewards/margins": 0.516627311706543, - "rewards/rejected": -1.8810253143310547, - "sft_loss": 1.4596291780471802, + "grad_norm": 9.146193231875682, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": -0.16054293513298035, + "logits/rejected": -0.02962280437350273, + "logps/chosen": -1.341709017753601, + "logps/rejected": -1.5948436260223389, + "loss": 1.0828, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.341709017753601, + "rewards/margins": 0.253134548664093, + "rewards/rejected": -1.5948436260223389, + "sft_loss": 1.4336766004562378, "step": 3490 }, { "epoch": 1.870546914199699, - "grad_norm": 9.082992475981591, - "learning_rate": 1.1187710896542045e-06, - "logits/chosen": -0.7301222085952759, - "logits/rejected": -0.6070829033851624, - "logps/chosen": -1.3872734308242798, - "logps/rejected": -1.9226562976837158, - "loss": 0.9757, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3872734308242798, - "rewards/margins": 0.5353829264640808, - "rewards/rejected": -1.9226562976837158, - "sft_loss": 1.4221903085708618, + "grad_norm": 7.863417785744701, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": -0.2692742943763733, + "logits/rejected": -0.10102590173482895, + "logps/chosen": -1.3230074644088745, + "logps/rejected": -1.6514161825180054, + "loss": 1.0239, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3230074644088745, + "rewards/margins": 0.32840877771377563, + "rewards/rejected": -1.6514161825180054, + "sft_loss": 1.3536341190338135, "step": 3495 }, { "epoch": 1.8732229469811004, - "grad_norm": 9.516454972740851, - "learning_rate": 1.1142541543306603e-06, - "logits/chosen": -0.6769564747810364, - "logits/rejected": -0.565294623374939, - "logps/chosen": -1.4010789394378662, - "logps/rejected": -2.2788970470428467, - "loss": 0.9576, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.4010789394378662, - "rewards/margins": 0.8778184056282043, - "rewards/rejected": -2.2788970470428467, - "sft_loss": 1.4577972888946533, + "grad_norm": 9.644019735561926, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": -0.19013360142707825, + "logits/rejected": -0.03948694467544556, + "logps/chosen": -1.3543713092803955, + "logps/rejected": -1.8687629699707031, + "loss": 1.0206, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3543713092803955, + "rewards/margins": 0.5143915414810181, + "rewards/rejected": -1.8687629699707031, + "sft_loss": 1.4165542125701904, "step": 3500 }, { "epoch": 1.875898979762502, - "grad_norm": 11.30165208210217, - "learning_rate": 1.109740961515003e-06, - "logits/chosen": -0.6544634103775024, - "logits/rejected": -0.5810515880584717, - "logps/chosen": -1.4300733804702759, - "logps/rejected": -2.110495090484619, - "loss": 0.9866, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4300733804702759, - "rewards/margins": 0.680421769618988, - "rewards/rejected": -2.110495090484619, - "sft_loss": 1.4657166004180908, + "grad_norm": 11.96224025240736, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": -0.1905595362186432, + "logits/rejected": -0.060992609709501266, + "logps/chosen": -1.377895474433899, + "logps/rejected": -1.8040307760238647, + "loss": 1.034, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.377895474433899, + "rewards/margins": 0.42613524198532104, + "rewards/rejected": -1.8040307760238647, + "sft_loss": 1.4206275939941406, "step": 3505 }, { "epoch": 1.8785750125439038, - "grad_norm": 13.350266083433564, - "learning_rate": 1.1052315549942487e-06, - "logits/chosen": -0.668562650680542, - "logits/rejected": -0.6335813403129578, - "logps/chosen": -1.3456979990005493, - "logps/rejected": -2.0151984691619873, - "loss": 0.9477, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3456979990005493, - "rewards/margins": 0.6695006489753723, - "rewards/rejected": -2.0151984691619873, - "sft_loss": 1.3689993619918823, + "grad_norm": 13.195637439863178, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": -0.2208903580904007, + "logits/rejected": -0.140710711479187, + "logps/chosen": -1.3023154735565186, + "logps/rejected": -1.6787493228912354, + "loss": 1.0144, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3023154735565186, + "rewards/margins": 0.3764336109161377, + "rewards/rejected": -1.6787493228912354, + "sft_loss": 1.3435550928115845, "step": 3510 }, { "epoch": 1.881251045325305, - "grad_norm": 9.927984052359218, - "learning_rate": 1.100725978518679e-06, - "logits/chosen": -0.6724262833595276, - "logits/rejected": -0.5307937860488892, - "logps/chosen": -1.3404207229614258, - "logps/rejected": -1.971617341041565, - "loss": 0.9628, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3404207229614258, - "rewards/margins": 0.6311966180801392, - "rewards/rejected": -1.971617341041565, - "sft_loss": 1.348073959350586, + "grad_norm": 11.526309708973256, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": -0.1962347775697708, + "logits/rejected": 0.003413937985897064, + "logps/chosen": -1.3440295457839966, + "logps/rejected": -1.7574107646942139, + "loss": 1.0204, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3440295457839966, + "rewards/margins": 0.4133811593055725, + "rewards/rejected": -1.7574107646942139, + "sft_loss": 1.3372437953948975, "step": 3515 }, { "epoch": 1.8839270781067068, - "grad_norm": 9.773831218207205, - "learning_rate": 1.0962242758014169e-06, - "logits/chosen": -0.7042940855026245, - "logits/rejected": -0.6029044389724731, - "logps/chosen": -1.3506810665130615, - "logps/rejected": -2.053575038909912, - "loss": 0.9527, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3506810665130615, - "rewards/margins": 0.7028939127922058, - "rewards/rejected": -2.053575038909912, - "sft_loss": 1.4030991792678833, + "grad_norm": 7.910211511126699, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": -0.2631233334541321, + "logits/rejected": -0.11768689006567001, + "logps/chosen": -1.3593801259994507, + "logps/rejected": -1.8382713794708252, + "loss": 1.0193, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3593801259994507, + "rewards/margins": 0.4788913130760193, + "rewards/rejected": -1.8382713794708252, + "sft_loss": 1.4099977016448975, "step": 3520 }, { "epoch": 1.8866031108881085, - "grad_norm": 5.9940400713158, - "learning_rate": 1.091726490518002e-06, - "logits/chosen": -0.6179289817810059, - "logits/rejected": -0.4722275733947754, - "logps/chosen": -1.3348115682601929, - "logps/rejected": -1.9982001781463623, - "loss": 0.9753, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3348115682601929, - "rewards/margins": 0.6633888483047485, - "rewards/rejected": -1.9982001781463623, - "sft_loss": 1.4151699542999268, + "grad_norm": 8.743071655576298, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": -0.16788813471794128, + "logits/rejected": 0.0290222205221653, + "logps/chosen": -1.3322017192840576, + "logps/rejected": -1.7500145435333252, + "loss": 1.0442, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3322017192840576, + "rewards/margins": 0.4178128242492676, + "rewards/rejected": -1.7500145435333252, + "sft_loss": 1.411254644393921, "step": 3525 }, { "epoch": 1.88927914366951, - "grad_norm": 9.015576299378978, - "learning_rate": 1.0872326663059668e-06, - "logits/chosen": -0.6120736002922058, - "logits/rejected": -0.5604882836341858, - "logps/chosen": -1.3328831195831299, - "logps/rejected": -1.956390619277954, - "loss": 0.9804, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3328831195831299, - "rewards/margins": 0.6235072612762451, - "rewards/rejected": -1.956390619277954, - "sft_loss": 1.4265530109405518, + "grad_norm": 10.166980185767446, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": -0.17637817561626434, + "logits/rejected": -0.1042185053229332, + "logps/chosen": -1.307114601135254, + "logps/rejected": -1.732693076133728, + "loss": 1.0151, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.307114601135254, + "rewards/margins": 0.42557865381240845, + "rewards/rejected": -1.732693076133728, + "sft_loss": 1.3863542079925537, "step": 3530 }, { "epoch": 1.8919551764509115, - "grad_norm": 7.096923003122541, - "learning_rate": 1.0827428467644132e-06, - "logits/chosen": -0.7125923037528992, - "logits/rejected": -0.6133583188056946, - "logps/chosen": -1.281874179840088, - "logps/rejected": -1.9087066650390625, - "loss": 0.9619, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.281874179840088, - "rewards/margins": 0.6268326044082642, - "rewards/rejected": -1.9087066650390625, - "sft_loss": 1.33371102809906, + "grad_norm": 12.139404913555175, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": -0.2735113501548767, + "logits/rejected": -0.131699800491333, + "logps/chosen": -1.265926718711853, + "logps/rejected": -1.67597234249115, + "loss": 1.0242, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.265926718711853, + "rewards/margins": 0.410045862197876, + "rewards/rejected": -1.67597234249115, + "sft_loss": 1.325702428817749, "step": 3535 }, { "epoch": 1.8946312092323132, - "grad_norm": 9.935078763524709, - "learning_rate": 1.0782570754535903e-06, - "logits/chosen": -0.6627184748649597, - "logits/rejected": -0.4770810008049011, - "logps/chosen": -1.3245737552642822, - "logps/rejected": -1.7778218984603882, - "loss": 0.9977, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.3245737552642822, - "rewards/margins": 0.4532480239868164, - "rewards/rejected": -1.7778218984603882, - "sft_loss": 1.3664424419403076, + "grad_norm": 8.370663556151582, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": -0.222178652882576, + "logits/rejected": 0.014582176692783833, + "logps/chosen": -1.3194665908813477, + "logps/rejected": -1.6473604440689087, + "loss": 1.0412, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3194665908813477, + "rewards/margins": 0.3278936445713043, + "rewards/rejected": -1.6473604440689087, + "sft_loss": 1.3597601652145386, "step": 3540 }, { "epoch": 1.8973072420137147, - "grad_norm": 8.945704039529609, - "learning_rate": 1.0737753958944712e-06, - "logits/chosen": -0.7426556348800659, - "logits/rejected": -0.5525585412979126, - "logps/chosen": -1.3033052682876587, - "logps/rejected": -1.8459545373916626, - "loss": 0.9616, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3033052682876587, - "rewards/margins": 0.5426491498947144, - "rewards/rejected": -1.8459545373916626, - "sft_loss": 1.3472154140472412, + "grad_norm": 7.684311641797868, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": -0.35185542702674866, + "logits/rejected": -0.07574127614498138, + "logps/chosen": -1.3232667446136475, + "logps/rejected": -1.6644926071166992, + "loss": 1.0237, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3232667446136475, + "rewards/margins": 0.34122568368911743, + "rewards/rejected": -1.6644926071166992, + "sft_loss": 1.3519316911697388, "step": 3545 }, { "epoch": 1.8999832747951162, - "grad_norm": 7.642566646415059, - "learning_rate": 1.0692978515683305e-06, - "logits/chosen": -0.603020966053009, - "logits/rejected": -0.5429368019104004, - "logps/chosen": -1.2958600521087646, - "logps/rejected": -1.7832367420196533, - "loss": 0.9581, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2958600521087646, - "rewards/margins": 0.4873766303062439, - "rewards/rejected": -1.7832367420196533, - "sft_loss": 1.272925615310669, + "grad_norm": 7.797432185476643, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": -0.15763789415359497, + "logits/rejected": -0.06480036675930023, + "logps/chosen": -1.3108044862747192, + "logps/rejected": -1.6585772037506104, + "loss": 1.0056, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3108044862747192, + "rewards/margins": 0.3477727770805359, + "rewards/rejected": -1.6585772037506104, + "sft_loss": 1.279354214668274, "step": 3550 }, { "epoch": 1.902659307576518, - "grad_norm": 7.510300680479713, - "learning_rate": 1.0648244859163227e-06, - "logits/chosen": -0.7047315239906311, - "logits/rejected": -0.5987603068351746, - "logps/chosen": -1.2614771127700806, - "logps/rejected": -1.9058072566986084, - "loss": 0.9537, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.2614771127700806, - "rewards/margins": 0.6443303227424622, - "rewards/rejected": -1.9058072566986084, - "sft_loss": 1.29532790184021, + "grad_norm": 7.745216098418087, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": -0.29884886741638184, + "logits/rejected": -0.1565883904695511, + "logps/chosen": -1.283048152923584, + "logps/rejected": -1.7354202270507812, + "loss": 1.002, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.283048152923584, + "rewards/margins": 0.45237183570861816, + "rewards/rejected": -1.7354202270507812, + "sft_loss": 1.3008395433425903, "step": 3555 }, { "epoch": 1.9053353403579194, - "grad_norm": 8.516169618871453, - "learning_rate": 1.0603553423390612e-06, - "logits/chosen": -0.6130501627922058, - "logits/rejected": -0.561513364315033, - "logps/chosen": -1.3350327014923096, - "logps/rejected": -1.9226328134536743, - "loss": 0.9654, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3350327014923096, - "rewards/margins": 0.58760005235672, - "rewards/rejected": -1.9226328134536743, - "sft_loss": 1.3583624362945557, + "grad_norm": 10.78515921532401, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": -0.17739878594875336, + "logits/rejected": -0.08526907861232758, + "logps/chosen": -1.3533596992492676, + "logps/rejected": -1.7326189279556274, + "loss": 1.0338, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3533596992492676, + "rewards/margins": 0.3792593479156494, + "rewards/rejected": -1.7326189279556274, + "sft_loss": 1.3796982765197754, "step": 3560 }, { "epoch": 1.908011373139321, - "grad_norm": 10.22264712287352, - "learning_rate": 1.0558904641961966e-06, - "logits/chosen": -0.6046770811080933, - "logits/rejected": -0.5800749659538269, - "logps/chosen": -1.2874317169189453, - "logps/rejected": -2.0039634704589844, - "loss": 0.9636, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -1.2874317169189453, - "rewards/margins": 0.7165321111679077, - "rewards/rejected": -2.0039634704589844, - "sft_loss": 1.3508872985839844, + "grad_norm": 8.176774721936933, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": -0.1857794225215912, + "logits/rejected": -0.11552359908819199, + "logps/chosen": -1.297877550125122, + "logps/rejected": -1.8466908931732178, + "loss": 1.0068, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.297877550125122, + "rewards/margins": 0.5488134622573853, + "rewards/rejected": -1.8466908931732178, + "sft_loss": 1.3502676486968994, "step": 3565 }, { "epoch": 1.9106874059207226, - "grad_norm": 6.834050275139067, - "learning_rate": 1.0514298948059961e-06, - "logits/chosen": -0.6144019961357117, - "logits/rejected": -0.5128809809684753, - "logps/chosen": -1.293882966041565, - "logps/rejected": -1.8975296020507812, - "loss": 0.9465, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.293882966041565, - "rewards/margins": 0.6036466956138611, - "rewards/rejected": -1.8975296020507812, - "sft_loss": 1.345824956893921, + "grad_norm": 9.824605979924113, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": -0.2430846244096756, + "logits/rejected": -0.09735213220119476, + "logps/chosen": -1.3140398263931274, + "logps/rejected": -1.7592847347259521, + "loss": 0.9866, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3140398263931274, + "rewards/margins": 0.44524508714675903, + "rewards/rejected": -1.7592847347259521, + "sft_loss": 1.3423892259597778, "step": 3570 }, { "epoch": 1.913363438702124, - "grad_norm": 8.874431612379572, - "learning_rate": 1.0469736774449235e-06, - "logits/chosen": -0.5574930310249329, - "logits/rejected": -0.4778769612312317, - "logps/chosen": -1.2909395694732666, - "logps/rejected": -1.8894269466400146, - "loss": 0.9914, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.2909395694732666, - "rewards/margins": 0.5984874963760376, - "rewards/rejected": -1.8894269466400146, - "sft_loss": 1.3335301876068115, + "grad_norm": 6.999487909190867, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": -0.11612536013126373, + "logits/rejected": 0.008373789489269257, + "logps/chosen": -1.3217031955718994, + "logps/rejected": -1.7657520771026611, + "loss": 1.0348, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3217031955718994, + "rewards/margins": 0.44404906034469604, + "rewards/rejected": -1.7657520771026611, + "sft_loss": 1.343665361404419, "step": 3575 }, { "epoch": 1.9160394714835256, - "grad_norm": 6.988865773579564, - "learning_rate": 1.0425218553472193e-06, - "logits/chosen": -0.6077667474746704, - "logits/rejected": -0.5917859077453613, - "logps/chosen": -1.2812873125076294, - "logps/rejected": -1.9763896465301514, - "loss": 0.9109, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2812873125076294, - "rewards/margins": 0.6951022744178772, - "rewards/rejected": -1.9763896465301514, - "sft_loss": 1.332451581954956, + "grad_norm": 6.980014810377578, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": -0.21279378235340118, + "logits/rejected": -0.15889443457126617, + "logps/chosen": -1.3280820846557617, + "logps/rejected": -1.8464562892913818, + "loss": 0.9733, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3280820846557617, + "rewards/margins": 0.5183740854263306, + "rewards/rejected": -1.8464562892913818, + "sft_loss": 1.3642082214355469, "step": 3580 }, { "epoch": 1.9187155042649273, - "grad_norm": 7.559212482726521, - "learning_rate": 1.038074471704481e-06, - "logits/chosen": -0.5742273926734924, - "logits/rejected": -0.5489991307258606, - "logps/chosen": -1.417245864868164, - "logps/rejected": -2.0291554927825928, - "loss": 1.0054, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.417245864868164, - "rewards/margins": 0.6119096875190735, - "rewards/rejected": -2.0291554927825928, - "sft_loss": 1.5023760795593262, + "grad_norm": 6.949960470988114, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": -0.10336363315582275, + "logits/rejected": -0.05016554147005081, + "logps/chosen": -1.4357208013534546, + "logps/rejected": -1.8436673879623413, + "loss": 1.0583, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4357208013534546, + "rewards/margins": 0.4079464077949524, + "rewards/rejected": -1.8436673879623413, + "sft_loss": 1.4935412406921387, "step": 3585 }, { "epoch": 1.9213915370463288, - "grad_norm": 7.638506343278633, - "learning_rate": 1.033631569665244e-06, - "logits/chosen": -0.5891857743263245, - "logits/rejected": -0.535805344581604, - "logps/chosen": -1.3460886478424072, - "logps/rejected": -1.8601782321929932, - "loss": 1.012, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3460886478424072, - "rewards/margins": 0.5140894651412964, - "rewards/rejected": -1.8601782321929932, - "sft_loss": 1.3942530155181885, + "grad_norm": 9.413612625444546, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": -0.15116353332996368, + "logits/rejected": -0.07496649026870728, + "logps/chosen": -1.3940389156341553, + "logps/rejected": -1.6964954137802124, + "loss": 1.0913, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3940389156341553, + "rewards/margins": 0.30245649814605713, + "rewards/rejected": -1.6964954137802124, + "sft_loss": 1.4201014041900635, "step": 3590 }, { "epoch": 1.9240675698277303, - "grad_norm": 7.330430515610557, - "learning_rate": 1.0291931923345635e-06, - "logits/chosen": -0.730771541595459, - "logits/rejected": -0.5671923756599426, - "logps/chosen": -1.3746426105499268, - "logps/rejected": -2.0186867713928223, - "loss": 0.9488, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3746426105499268, - "rewards/margins": 0.6440441012382507, - "rewards/rejected": -2.0186867713928223, - "sft_loss": 1.3147354125976562, + "grad_norm": 7.514835548491341, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": -0.29490602016448975, + "logits/rejected": -0.08670699596405029, + "logps/chosen": -1.3553574085235596, + "logps/rejected": -1.8344428539276123, + "loss": 0.9764, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3553574085235596, + "rewards/margins": 0.4790855348110199, + "rewards/rejected": -1.8344428539276123, + "sft_loss": 1.3091545104980469, "step": 3595 }, { "epoch": 1.926743602609132, - "grad_norm": 8.909033918227077, - "learning_rate": 1.0247593827735966e-06, - "logits/chosen": -0.6141990423202515, - "logits/rejected": -0.4768144190311432, - "logps/chosen": -1.372605800628662, - "logps/rejected": -2.222141742706299, - "loss": 0.9557, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.372605800628662, - "rewards/margins": 0.8495360612869263, - "rewards/rejected": -2.222141742706299, - "sft_loss": 1.3834865093231201, + "grad_norm": 9.723844697548177, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": -0.13972511887550354, + "logits/rejected": 0.04370427876710892, + "logps/chosen": -1.3503167629241943, + "logps/rejected": -1.9329227209091187, + "loss": 1.0023, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3503167629241943, + "rewards/margins": 0.5826060175895691, + "rewards/rejected": -1.9329227209091187, + "sft_loss": 1.373500108718872, "step": 3600 }, { "epoch": 1.926743602609132, - "eval_logits/chosen": -0.34722036123275757, - "eval_logits/rejected": -0.29034626483917236, - "eval_logps/chosen": -1.4343228340148926, - "eval_logps/rejected": -1.9644039869308472, - "eval_loss": 1.0308538675308228, - "eval_rewards/accuracies": 0.6454005837440491, - "eval_rewards/chosen": -1.4343228340148926, - "eval_rewards/margins": 0.5300810933113098, - "eval_rewards/rejected": -1.9644039869308472, - "eval_runtime": 45.9074, - "eval_samples_per_second": 29.298, - "eval_sft_loss": 1.4319441318511963, - "eval_steps_per_second": 7.341, + "eval_logits/chosen": 0.13220730423927307, + "eval_logits/rejected": 0.22114311158657074, + "eval_logps/chosen": -1.376747727394104, + "eval_logps/rejected": -1.7830473184585571, + "eval_loss": 1.0404950380325317, + "eval_rewards/accuracies": 0.612017810344696, + "eval_rewards/chosen": -1.376747727394104, + "eval_rewards/margins": 0.4062995910644531, + "eval_rewards/rejected": -1.7830473184585571, + "eval_runtime": 43.3596, + "eval_samples_per_second": 31.02, + "eval_sft_loss": 1.3926547765731812, + "eval_steps_per_second": 7.772, "step": 3600 }, { "epoch": 1.9294196353905335, - "grad_norm": 9.575565595168888, - "learning_rate": 1.0203301839991816e-06, - "logits/chosen": -0.6674664616584778, - "logits/rejected": -0.6533695459365845, - "logps/chosen": -1.3234946727752686, - "logps/rejected": -1.7638345956802368, - "loss": 1.0147, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.3234946727752686, - "rewards/margins": 0.4403398633003235, - "rewards/rejected": -1.7638345956802368, - "sft_loss": 1.365957260131836, + "grad_norm": 8.133124681161464, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": -0.25169333815574646, + "logits/rejected": -0.20886509120464325, + "logps/chosen": -1.3518650531768799, + "logps/rejected": -1.6274452209472656, + "loss": 1.0831, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3518650531768799, + "rewards/margins": 0.2755802273750305, + "rewards/rejected": -1.6274452209472656, + "sft_loss": 1.380122184753418, "step": 3605 }, { "epoch": 1.932095668171935, - "grad_norm": 7.413916582912618, - "learning_rate": 1.0159056389834254e-06, - "logits/chosen": -0.6367475986480713, - "logits/rejected": -0.5504434704780579, - "logps/chosen": -1.320719838142395, - "logps/rejected": -1.9815881252288818, - "loss": 0.9514, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.320719838142395, - "rewards/margins": 0.6608681678771973, - "rewards/rejected": -1.9815881252288818, - "sft_loss": 1.4096839427947998, + "grad_norm": 7.469153856364965, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": -0.17526309192180634, + "logits/rejected": -0.051106780767440796, + "logps/chosen": -1.313644289970398, + "logps/rejected": -1.7661473751068115, + "loss": 1.0174, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.313644289970398, + "rewards/margins": 0.4525030255317688, + "rewards/rejected": -1.7661473751068115, + "sft_loss": 1.39413583278656, "step": 3610 }, { "epoch": 1.9347717009533367, - "grad_norm": 6.010152257530943, - "learning_rate": 1.0114857906532827e-06, - "logits/chosen": -0.5671981573104858, - "logits/rejected": -0.4917985796928406, - "logps/chosen": -1.3791894912719727, - "logps/rejected": -1.947789192199707, - "loss": 0.9871, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.3791894912719727, - "rewards/margins": 0.5685997009277344, - "rewards/rejected": -1.947789192199707, - "sft_loss": 1.3976194858551025, + "grad_norm": 6.523078696838851, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": -0.10688170045614243, + "logits/rejected": 0.013090262189507484, + "logps/chosen": -1.3751380443572998, + "logps/rejected": -1.7437824010849, + "loss": 1.046, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3751380443572998, + "rewards/margins": 0.36864447593688965, + "rewards/rejected": -1.7437824010849, + "sft_loss": 1.3935257196426392, "step": 3615 }, { "epoch": 1.9374477337347382, - "grad_norm": 12.21225851564805, - "learning_rate": 1.0070706818901417e-06, - "logits/chosen": -0.6242167353630066, - "logits/rejected": -0.5752514600753784, - "logps/chosen": -1.3984695672988892, - "logps/rejected": -1.881219506263733, - "loss": 1.0316, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3984695672988892, - "rewards/margins": 0.4827499985694885, - "rewards/rejected": -1.881219506263733, - "sft_loss": 1.4380195140838623, + "grad_norm": 12.122483026445424, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": -0.1743597835302353, + "logits/rejected": -0.10058524459600449, + "logps/chosen": -1.3639273643493652, + "logps/rejected": -1.6401739120483398, + "loss": 1.0758, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3639273643493652, + "rewards/margins": 0.276246577501297, + "rewards/rejected": -1.6401739120483398, + "sft_loss": 1.4094431400299072, "step": 3620 }, { "epoch": 1.9401237665161397, - "grad_norm": 6.902602838356776, - "learning_rate": 1.0026603555294073e-06, - "logits/chosen": -0.5850001573562622, - "logits/rejected": -0.5802913904190063, - "logps/chosen": -1.3279889822006226, - "logps/rejected": -1.8815221786499023, - "loss": 0.9881, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3279889822006226, - "rewards/margins": 0.5535333752632141, - "rewards/rejected": -1.8815221786499023, - "sft_loss": 1.4049403667449951, + "grad_norm": 8.402271918311031, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": -0.12908560037612915, + "logits/rejected": -0.10964863002300262, + "logps/chosen": -1.3172557353973389, + "logps/rejected": -1.6691465377807617, + "loss": 1.0411, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3172557353973389, + "rewards/margins": 0.3518907427787781, + "rewards/rejected": -1.6691465377807617, + "sft_loss": 1.3863766193389893, "step": 3625 }, { "epoch": 1.9427997992975414, - "grad_norm": 11.500033917000573, - "learning_rate": 9.982548543600843e-07, - "logits/chosen": -0.6528538465499878, - "logits/rejected": -0.6264825463294983, - "logps/chosen": -1.3921599388122559, - "logps/rejected": -1.9632352590560913, - "loss": 1.0585, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.3921599388122559, - "rewards/margins": 0.5710754990577698, - "rewards/rejected": -1.9632352590560913, - "sft_loss": 1.5010713338851929, + "grad_norm": 15.309824266627936, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": -0.22412028908729553, + "logits/rejected": -0.1645040512084961, + "logps/chosen": -1.3662104606628418, + "logps/rejected": -1.7294059991836548, + "loss": 1.0994, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3662104606628418, + "rewards/margins": 0.3631953299045563, + "rewards/rejected": -1.7294059991836548, + "sft_loss": 1.470768690109253, "step": 3630 }, { "epoch": 1.945475832078943, - "grad_norm": 8.069222931489321, - "learning_rate": 9.93854221124365e-07, - "logits/chosen": -0.6453899145126343, - "logits/rejected": -0.5964998602867126, - "logps/chosen": -1.3313370943069458, - "logps/rejected": -1.9300304651260376, - "loss": 0.9682, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3313370943069458, - "rewards/margins": 0.5986935496330261, - "rewards/rejected": -1.9300304651260376, - "sft_loss": 1.3976194858551025, + "grad_norm": 7.523094119971832, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": -0.23497335612773895, + "logits/rejected": -0.14777755737304688, + "logps/chosen": -1.3361382484436035, + "logps/rejected": -1.7206249237060547, + "loss": 1.0388, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3361382484436035, + "rewards/margins": 0.3844866156578064, + "rewards/rejected": -1.7206249237060547, + "sft_loss": 1.4018043279647827, "step": 3635 }, { "epoch": 1.9481518648603444, - "grad_norm": 6.284563214144554, - "learning_rate": 9.894584985172121e-07, - "logits/chosen": -0.6314951777458191, - "logits/rejected": -0.5464446544647217, - "logps/chosen": -1.4279075860977173, - "logps/rejected": -1.8439185619354248, - "loss": 1.0198, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4279075860977173, - "rewards/margins": 0.41601109504699707, - "rewards/rejected": -1.8439185619354248, - "sft_loss": 1.4700605869293213, + "grad_norm": 6.650681430076024, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": -0.1847231686115265, + "logits/rejected": -0.0686403438448906, + "logps/chosen": -1.4241373538970947, + "logps/rejected": -1.6333825588226318, + "loss": 1.0985, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4241373538970947, + "rewards/margins": 0.20924527943134308, + "rewards/rejected": -1.6333825588226318, + "sft_loss": 1.4570655822753906, "step": 3640 }, { "epoch": 1.9508278976417461, - "grad_norm": 9.537923550198874, - "learning_rate": 9.850677291859458e-07, - "logits/chosen": -0.6765953898429871, - "logits/rejected": -0.5869105458259583, - "logps/chosen": -1.4551875591278076, - "logps/rejected": -1.8798748254776, - "loss": 1.0508, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -1.4551875591278076, - "rewards/margins": 0.42468729615211487, - "rewards/rejected": -1.8798748254776, - "sft_loss": 1.5198633670806885, + "grad_norm": 6.703996052459396, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": -0.24172177910804749, + "logits/rejected": -0.11361400038003922, + "logps/chosen": -1.4382174015045166, + "logps/rejected": -1.6817821264266968, + "loss": 1.0878, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4382174015045166, + "rewards/margins": 0.24356460571289062, + "rewards/rejected": -1.6817821264266968, + "sft_loss": 1.47786545753479, "step": 3645 }, { "epoch": 1.9535039304231478, - "grad_norm": 7.939924398451612, - "learning_rate": 9.806819557298295e-07, - "logits/chosen": -0.6868590712547302, - "logits/rejected": -0.6186624765396118, - "logps/chosen": -1.3275146484375, - "logps/rejected": -1.836299180984497, - "loss": 0.9731, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3275146484375, - "rewards/margins": 0.5087844729423523, - "rewards/rejected": -1.836299180984497, - "sft_loss": 1.386508584022522, + "grad_norm": 6.809062902834572, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": -0.2671525776386261, + "logits/rejected": -0.16197296977043152, + "logps/chosen": -1.316367268562317, + "logps/rejected": -1.6348955631256104, + "loss": 1.035, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.316367268562317, + "rewards/margins": 0.31852835416793823, + "rewards/rejected": -1.6348955631256104, + "sft_loss": 1.3881739377975464, "step": 3650 }, { "epoch": 1.9561799632045491, - "grad_norm": 8.054517035710493, - "learning_rate": 9.76301220699656e-07, - "logits/chosen": -0.6554325222969055, - "logits/rejected": -0.5593774914741516, - "logps/chosen": -1.4057365655899048, - "logps/rejected": -1.9774185419082642, - "loss": 0.9963, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4057365655899048, - "rewards/margins": 0.5716819167137146, - "rewards/rejected": -1.9774185419082642, - "sft_loss": 1.4154739379882812, + "grad_norm": 8.59906216379134, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": -0.2066754400730133, + "logits/rejected": -0.06717734038829803, + "logps/chosen": -1.4142173528671265, + "logps/rejected": -1.7292635440826416, + "loss": 1.0753, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4142173528671265, + "rewards/margins": 0.31504613161087036, + "rewards/rejected": -1.7292635440826416, + "sft_loss": 1.4033355712890625, "step": 3655 }, { "epoch": 1.9588559959859508, - "grad_norm": 11.357176551974492, - "learning_rate": 9.719255665973365e-07, - "logits/chosen": -0.6747775077819824, - "logits/rejected": -0.5881116390228271, - "logps/chosen": -1.3360207080841064, - "logps/rejected": -1.9081058502197266, - "loss": 0.984, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3360207080841064, - "rewards/margins": 0.5720850229263306, - "rewards/rejected": -1.9081058502197266, - "sft_loss": 1.3956830501556396, + "grad_norm": 9.493755359677273, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": -0.23579907417297363, + "logits/rejected": -0.10296180099248886, + "logps/chosen": -1.2681344747543335, + "logps/rejected": -1.6588176488876343, + "loss": 1.0213, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2681344747543335, + "rewards/margins": 0.39068326354026794, + "rewards/rejected": -1.6588176488876343, + "sft_loss": 1.3464233875274658, "step": 3660 }, { "epoch": 1.9615320287673526, - "grad_norm": 7.848314944555806, - "learning_rate": 9.675550358754857e-07, - "logits/chosen": -0.6074212193489075, - "logits/rejected": -0.5252271294593811, - "logps/chosen": -1.272979497909546, - "logps/rejected": -1.9367396831512451, - "loss": 0.9582, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.272979497909546, - "rewards/margins": 0.6637603640556335, - "rewards/rejected": -1.9367396831512451, - "sft_loss": 1.2792924642562866, + "grad_norm": 7.389145463944783, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": -0.17442061007022858, + "logits/rejected": -0.05897391960024834, + "logps/chosen": -1.272640585899353, + "logps/rejected": -1.662667989730835, + "loss": 1.0202, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.272640585899353, + "rewards/margins": 0.39002761244773865, + "rewards/rejected": -1.662667989730835, + "sft_loss": 1.2678601741790771, "step": 3665 }, { "epoch": 1.9642080615487538, - "grad_norm": 8.533089389751797, - "learning_rate": 9.631896709370124e-07, - "logits/chosen": -0.6103826761245728, - "logits/rejected": -0.507732629776001, - "logps/chosen": -1.3028061389923096, - "logps/rejected": -2.1012096405029297, - "loss": 0.9137, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3028061389923096, - "rewards/margins": 0.7984035611152649, - "rewards/rejected": -2.1012096405029297, - "sft_loss": 1.4023349285125732, + "grad_norm": 6.70872777057386, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": -0.23170146346092224, + "logits/rejected": -0.08080779761075974, + "logps/chosen": -1.3061374425888062, + "logps/rejected": -1.7743968963623047, + "loss": 1.0124, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3061374425888062, + "rewards/margins": 0.46825942397117615, + "rewards/rejected": -1.7743968963623047, + "sft_loss": 1.4011518955230713, "step": 3670 }, { "epoch": 1.9668840943301555, - "grad_norm": 6.983088917340356, - "learning_rate": 9.588295141347055e-07, - "logits/chosen": -0.6318444013595581, - "logits/rejected": -0.529419481754303, - "logps/chosen": -1.4596143960952759, - "logps/rejected": -2.2197556495666504, - "loss": 1.0007, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.4596143960952759, - "rewards/margins": 0.7601410746574402, - "rewards/rejected": -2.2197556495666504, - "sft_loss": 1.4763944149017334, + "grad_norm": 5.941168181963396, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": -0.21781177818775177, + "logits/rejected": -0.07623197138309479, + "logps/chosen": -1.3767824172973633, + "logps/rejected": -1.8881441354751587, + "loss": 1.0171, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3767824172973633, + "rewards/margins": 0.5113617181777954, + "rewards/rejected": -1.8881441354751587, + "sft_loss": 1.4345794916152954, "step": 3675 }, { "epoch": 1.9695601271115573, - "grad_norm": 9.347057160760839, - "learning_rate": 9.544746077708263e-07, - "logits/chosen": -0.6308041214942932, - "logits/rejected": -0.5474078059196472, - "logps/chosen": -1.2283879518508911, - "logps/rejected": -1.8275114297866821, - "loss": 0.9208, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2283879518508911, - "rewards/margins": 0.5991234183311462, - "rewards/rejected": -1.8275114297866821, - "sft_loss": 1.294682264328003, + "grad_norm": 7.172600001585544, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": -0.21801035106182098, + "logits/rejected": -0.094429150223732, + "logps/chosen": -1.2056726217269897, + "logps/rejected": -1.6035264730453491, + "loss": 0.9565, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2056726217269897, + "rewards/margins": 0.39785367250442505, + "rewards/rejected": -1.6035264730453491, + "sft_loss": 1.2749571800231934, "step": 3680 }, { "epoch": 1.9722361598929585, - "grad_norm": 8.534486415963926, - "learning_rate": 9.50124994096695e-07, - "logits/chosen": -0.6402637362480164, - "logits/rejected": -0.5865155458450317, - "logps/chosen": -1.3168342113494873, - "logps/rejected": -1.9747432470321655, - "loss": 0.975, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3168342113494873, - "rewards/margins": 0.6579092144966125, - "rewards/rejected": -1.9747432470321655, - "sft_loss": 1.383578896522522, + "grad_norm": 8.996171174673789, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": -0.227299302816391, + "logits/rejected": -0.14037606120109558, + "logps/chosen": -1.2919366359710693, + "logps/rejected": -1.7213513851165771, + "loss": 1.0273, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2919366359710693, + "rewards/margins": 0.4294148087501526, + "rewards/rejected": -1.7213513851165771, + "sft_loss": 1.366469383239746, "step": 3685 }, { "epoch": 1.9749121926743602, - "grad_norm": 7.607105121263159, - "learning_rate": 9.457807153122826e-07, - "logits/chosen": -0.5896097421646118, - "logits/rejected": -0.4782601296901703, - "logps/chosen": -1.3596460819244385, - "logps/rejected": -2.096602439880371, - "loss": 0.9529, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.3596460819244385, - "rewards/margins": 0.7369564771652222, - "rewards/rejected": -2.096602439880371, - "sft_loss": 1.352794885635376, + "grad_norm": 6.584738685805881, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": -0.2102833241224289, + "logits/rejected": -0.0533757284283638, + "logps/chosen": -1.3438446521759033, + "logps/rejected": -1.7797075510025024, + "loss": 1.018, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3438446521759033, + "rewards/margins": 0.43586283922195435, + "rewards/rejected": -1.7797075510025024, + "sft_loss": 1.3491125106811523, "step": 3690 }, { "epoch": 1.977588225455762, - "grad_norm": 8.683135121423115, - "learning_rate": 9.41441813565801e-07, - "logits/chosen": -0.5771075487136841, - "logits/rejected": -0.5605219006538391, - "logps/chosen": -1.4253747463226318, - "logps/rejected": -2.057281970977783, - "loss": 1.0184, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4253747463226318, - "rewards/margins": 0.6319074034690857, - "rewards/rejected": -2.057281970977783, - "sft_loss": 1.5130159854888916, + "grad_norm": 8.115150453314252, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": -0.1443311721086502, + "logits/rejected": -0.09654446691274643, + "logps/chosen": -1.3931901454925537, + "logps/rejected": -1.8128507137298584, + "loss": 1.0565, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3931901454925537, + "rewards/margins": 0.4196605086326599, + "rewards/rejected": -1.8128507137298584, + "sft_loss": 1.4648798704147339, "step": 3695 }, { "epoch": 1.9802642582371635, - "grad_norm": 8.479150683466232, - "learning_rate": 9.371083309532938e-07, - "logits/chosen": -0.5546612739562988, - "logits/rejected": -0.47140711545944214, - "logps/chosen": -1.3317207098007202, - "logps/rejected": -1.954064130783081, - "loss": 0.9682, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3317207098007202, - "rewards/margins": 0.6223434805870056, - "rewards/rejected": -1.954064130783081, - "sft_loss": 1.3673937320709229, + "grad_norm": 8.432376252624964, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": -0.14914622902870178, + "logits/rejected": -0.04011151194572449, + "logps/chosen": -1.3027012348175049, + "logps/rejected": -1.7023481130599976, + "loss": 1.0173, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3027012348175049, + "rewards/margins": 0.3996468484401703, + "rewards/rejected": -1.7023481130599976, + "sft_loss": 1.332310438156128, "step": 3700 }, { "epoch": 1.982940291018565, - "grad_norm": 7.173261243532672, - "learning_rate": 9.327803095182284e-07, - "logits/chosen": -0.6001571416854858, - "logits/rejected": -0.5562934875488281, - "logps/chosen": -1.3848721981048584, - "logps/rejected": -2.006413698196411, - "loss": 0.963, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3848721981048584, - "rewards/margins": 0.6215416193008423, - "rewards/rejected": -2.006413698196411, - "sft_loss": 1.3758659362792969, + "grad_norm": 7.581891370204553, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": -0.2142951786518097, + "logits/rejected": -0.1357312798500061, + "logps/chosen": -1.3414753675460815, + "logps/rejected": -1.75808846950531, + "loss": 1.0063, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3414753675460815, + "rewards/margins": 0.4166131019592285, + "rewards/rejected": -1.75808846950531, + "sft_loss": 1.344266414642334, "step": 3705 }, { "epoch": 1.9856163237999667, - "grad_norm": 9.639346443973118, - "learning_rate": 9.28457791251088e-07, - "logits/chosen": -0.4836387634277344, - "logits/rejected": -0.45512452721595764, - "logps/chosen": -1.3742034435272217, - "logps/rejected": -1.8803613185882568, - "loss": 1.0281, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3742034435272217, - "rewards/margins": 0.506157636642456, - "rewards/rejected": -1.8803613185882568, - "sft_loss": 1.4473121166229248, + "grad_norm": 8.87844232756679, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": -0.06727816164493561, + "logits/rejected": -0.01013193279504776, + "logps/chosen": -1.3536124229431152, + "logps/rejected": -1.688381552696228, + "loss": 1.0899, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3536124229431152, + "rewards/margins": 0.3347693085670471, + "rewards/rejected": -1.688381552696228, + "sft_loss": 1.4273655414581299, "step": 3710 }, { "epoch": 1.9882923565813682, - "grad_norm": 6.039721125610725, - "learning_rate": 9.241408180889638e-07, - "logits/chosen": -0.5536881685256958, - "logits/rejected": -0.5073939561843872, - "logps/chosen": -1.3740195035934448, - "logps/rejected": -1.9659255743026733, - "loss": 0.9995, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3740195035934448, - "rewards/margins": 0.5919061899185181, - "rewards/rejected": -1.9659255743026733, - "sft_loss": 1.4292535781860352, + "grad_norm": 6.813440382150874, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": -0.1517767459154129, + "logits/rejected": -0.07338926941156387, + "logps/chosen": -1.3509794473648071, + "logps/rejected": -1.7539699077606201, + "loss": 1.0538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3509794473648071, + "rewards/margins": 0.4029907286167145, + "rewards/rejected": -1.7539699077606201, + "sft_loss": 1.3848661184310913, "step": 3715 }, { "epoch": 1.9909683893627697, - "grad_norm": 6.770977619657827, - "learning_rate": 9.198294319151478e-07, - "logits/chosen": -0.5691328644752502, - "logits/rejected": -0.5202389359474182, - "logps/chosen": -1.3304475545883179, - "logps/rejected": -1.7927255630493164, - "loss": 0.9824, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3304475545883179, - "rewards/margins": 0.462277889251709, - "rewards/rejected": -1.7927255630493164, - "sft_loss": 1.3431183099746704, + "grad_norm": 4.664992051222814, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": -0.19024668633937836, + "logits/rejected": -0.10702836513519287, + "logps/chosen": -1.3410247564315796, + "logps/rejected": -1.6328284740447998, + "loss": 1.0458, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3410247564315796, + "rewards/margins": 0.2918040156364441, + "rewards/rejected": -1.6328284740447998, + "sft_loss": 1.3327479362487793, "step": 3720 }, { "epoch": 1.9936444221441714, - "grad_norm": 7.303791106226416, - "learning_rate": 9.155236745587279e-07, - "logits/chosen": -0.6500542759895325, - "logits/rejected": -0.5855103135108948, - "logps/chosen": -1.3462293148040771, - "logps/rejected": -1.933521032333374, - "loss": 0.9937, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.3462293148040771, - "rewards/margins": 0.5872918367385864, - "rewards/rejected": -1.933521032333374, - "sft_loss": 1.4082921743392944, + "grad_norm": 7.138454236405234, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": -0.24577781558036804, + "logits/rejected": -0.13845011591911316, + "logps/chosen": -1.3480573892593384, + "logps/rejected": -1.739193320274353, + "loss": 1.0602, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3480573892593384, + "rewards/margins": 0.3911357820034027, + "rewards/rejected": -1.739193320274353, + "sft_loss": 1.4003136157989502, "step": 3725 }, { "epoch": 1.9963204549255729, - "grad_norm": 8.400362850544338, - "learning_rate": 9.112235877941808e-07, - "logits/chosen": -0.5426613688468933, - "logits/rejected": -0.44646692276000977, - "logps/chosen": -1.2635165452957153, - "logps/rejected": -1.8674116134643555, - "loss": 0.9185, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2635165452957153, - "rewards/margins": 0.6038950085639954, - "rewards/rejected": -1.8674116134643555, - "sft_loss": 1.2890541553497314, + "grad_norm": 5.283253532535059, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": -0.1491527259349823, + "logits/rejected": -0.01746211014688015, + "logps/chosen": -1.2778774499893188, + "logps/rejected": -1.7071552276611328, + "loss": 0.9776, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2778774499893188, + "rewards/margins": 0.42927759885787964, + "rewards/rejected": -1.7071552276611328, + "sft_loss": 1.2987234592437744, "step": 3730 }, { "epoch": 1.9989964877069744, - "grad_norm": 7.477633679990428, - "learning_rate": 9.069292133409672e-07, - "logits/chosen": -0.5348911881446838, - "logits/rejected": -0.48282891511917114, - "logps/chosen": -1.4278972148895264, - "logps/rejected": -1.913153886795044, - "loss": 1.0657, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.4278972148895264, - "rewards/margins": 0.48525673151016235, - "rewards/rejected": -1.913153886795044, - "sft_loss": 1.4490896463394165, + "grad_norm": 12.054790115213882, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": -0.11329762637615204, + "logits/rejected": -0.03464198112487793, + "logps/chosen": -1.4374887943267822, + "logps/rejected": -1.6887142658233643, + "loss": 1.1489, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4374887943267822, + "rewards/margins": 0.2512255609035492, + "rewards/rejected": -1.6887142658233643, + "sft_loss": 1.4471741914749146, "step": 3735 }, { "epoch": 2.001672520488376, - "grad_norm": 9.063452365996959, - "learning_rate": 9.026405928631269e-07, - "logits/chosen": -0.5262424349784851, - "logits/rejected": -0.5206266641616821, - "logps/chosen": -1.3640168905258179, - "logps/rejected": -1.8886162042617798, - "loss": 0.9904, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3640168905258179, - "rewards/margins": 0.5245994329452515, - "rewards/rejected": -1.8886162042617798, - "sft_loss": 1.3758842945098877, + "grad_norm": 7.143914332553867, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": -0.12631377577781677, + "logits/rejected": -0.07701162248849869, + "logps/chosen": -1.3984800577163696, + "logps/rejected": -1.6868093013763428, + "loss": 1.0818, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3984800577163696, + "rewards/margins": 0.28832948207855225, + "rewards/rejected": -1.6868093013763428, + "sft_loss": 1.379875659942627, "step": 3740 }, { "epoch": 2.0043485532697773, - "grad_norm": 5.506743461924594, - "learning_rate": 8.983577679688745e-07, - "logits/chosen": -0.6251953840255737, - "logits/rejected": -0.56153804063797, - "logps/chosen": -1.3155872821807861, - "logps/rejected": -2.001924991607666, - "loss": 0.9253, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3155872821807861, - "rewards/margins": 0.6863377690315247, - "rewards/rejected": -2.001924991607666, - "sft_loss": 1.3774052858352661, + "grad_norm": 7.623520104046133, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": -0.17279155552387238, + "logits/rejected": -0.06797249615192413, + "logps/chosen": -1.3755438327789307, + "logps/rejected": -1.7344719171524048, + "loss": 1.0419, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3755438327789307, + "rewards/margins": 0.3589281439781189, + "rewards/rejected": -1.7344719171524048, + "sft_loss": 1.3938535451889038, "step": 3745 }, { "epoch": 2.007024586051179, - "grad_norm": 7.6386048683501775, - "learning_rate": 8.940807802101961e-07, - "logits/chosen": -0.6785083413124084, - "logits/rejected": -0.6239103674888611, - "logps/chosen": -1.1901180744171143, - "logps/rejected": -1.972402811050415, - "loss": 0.8635, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.1901180744171143, - "rewards/margins": 0.7822847366333008, - "rewards/rejected": -1.972402811050415, - "sft_loss": 1.2562397718429565, + "grad_norm": 11.226955224513722, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": -0.23111264407634735, + "logits/rejected": -0.14523088932037354, + "logps/chosen": -1.250886082649231, + "logps/rejected": -1.725956678390503, + "loss": 0.9775, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.250886082649231, + "rewards/margins": 0.47507089376449585, + "rewards/rejected": -1.725956678390503, + "sft_loss": 1.287925124168396, "step": 3750 }, { "epoch": 2.0097006188325808, - "grad_norm": 9.068439135923542, - "learning_rate": 8.898096710824455e-07, - "logits/chosen": -0.6381465196609497, - "logits/rejected": -0.5730241537094116, - "logps/chosen": -1.2574265003204346, - "logps/rejected": -2.136043071746826, - "loss": 0.9042, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2574265003204346, - "rewards/margins": 0.8786169290542603, - "rewards/rejected": -2.136043071746826, - "sft_loss": 1.3654946088790894, + "grad_norm": 6.836683127732292, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": -0.19628411531448364, + "logits/rejected": -0.09107625484466553, + "logps/chosen": -1.2780523300170898, + "logps/rejected": -1.8557761907577515, + "loss": 0.9756, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2780523300170898, + "rewards/margins": 0.5777239203453064, + "rewards/rejected": -1.8557761907577515, + "sft_loss": 1.3645378351211548, "step": 3755 }, { "epoch": 2.0123766516139825, - "grad_norm": 5.629269515593526, - "learning_rate": 8.855444820239421e-07, - "logits/chosen": -0.6984506845474243, - "logits/rejected": -0.7039046883583069, - "logps/chosen": -1.2816941738128662, - "logps/rejected": -2.23903226852417, - "loss": 0.8963, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2816941738128662, - "rewards/margins": 0.9573379755020142, - "rewards/rejected": -2.23903226852417, - "sft_loss": 1.3613649606704712, + "grad_norm": 6.996028018688775, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": -0.2888035476207733, + "logits/rejected": -0.2642049789428711, + "logps/chosen": -1.3084895610809326, + "logps/rejected": -1.8495372533798218, + "loss": 1.0097, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3084895610809326, + "rewards/margins": 0.5410477519035339, + "rewards/rejected": -1.8495372533798218, + "sft_loss": 1.3794362545013428, "step": 3760 }, { "epoch": 2.0150526843953838, - "grad_norm": 6.517730028677668, - "learning_rate": 8.812852544155691e-07, - "logits/chosen": -0.6268309950828552, - "logits/rejected": -0.49097996950149536, - "logps/chosen": -1.2899795770645142, - "logps/rejected": -2.2022030353546143, - "loss": 0.8829, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2899795770645142, - "rewards/margins": 0.9122235178947449, - "rewards/rejected": -2.2022030353546143, - "sft_loss": 1.3458114862442017, + "grad_norm": 9.926606596207181, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": -0.1349642425775528, + "logits/rejected": 0.04218194633722305, + "logps/chosen": -1.3362480401992798, + "logps/rejected": -1.839485764503479, + "loss": 0.999, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3362480401992798, + "rewards/margins": 0.503237783908844, + "rewards/rejected": -1.839485764503479, + "sft_loss": 1.3468215465545654, "step": 3765 }, { "epoch": 2.0177287171767855, - "grad_norm": 10.89228362918389, - "learning_rate": 8.770320295803714e-07, - "logits/chosen": -0.713524580001831, - "logits/rejected": -0.6023358106613159, - "logps/chosen": -1.2371385097503662, - "logps/rejected": -2.269756317138672, - "loss": 0.8539, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2371385097503662, - "rewards/margins": 1.0326178073883057, - "rewards/rejected": -2.269756317138672, - "sft_loss": 1.3002526760101318, + "grad_norm": 14.972325401010377, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": -0.2785857617855072, + "logits/rejected": -0.11333181709051132, + "logps/chosen": -1.274951696395874, + "logps/rejected": -1.9533205032348633, + "loss": 0.9451, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.274951696395874, + "rewards/margins": 0.6783686876296997, + "rewards/rejected": -1.9533205032348633, + "sft_loss": 1.319342851638794, "step": 3770 }, { "epoch": 2.020404749958187, - "grad_norm": 9.872604944685474, - "learning_rate": 8.727848487831545e-07, - "logits/chosen": -0.6570896506309509, - "logits/rejected": -0.6487444639205933, - "logps/chosen": -1.2803922891616821, - "logps/rejected": -2.118070602416992, - "loss": 0.8801, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2803922891616821, - "rewards/margins": 0.8376782536506653, - "rewards/rejected": -2.118070602416992, - "sft_loss": 1.3236936330795288, + "grad_norm": 8.705233648780364, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": -0.23137220740318298, + "logits/rejected": -0.17029942572116852, + "logps/chosen": -1.3322160243988037, + "logps/rejected": -1.7487624883651733, + "loss": 1.025, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3322160243988037, + "rewards/margins": 0.4165467321872711, + "rewards/rejected": -1.7487624883651733, + "sft_loss": 1.3671302795410156, "step": 3775 }, { "epoch": 2.0230807827395885, - "grad_norm": 7.965795082559047, - "learning_rate": 8.685437532300863e-07, - "logits/chosen": -0.6047377586364746, - "logits/rejected": -0.6088879704475403, - "logps/chosen": -1.2858202457427979, - "logps/rejected": -2.0399012565612793, - "loss": 0.9426, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2858202457427979, - "rewards/margins": 0.7540808916091919, - "rewards/rejected": -2.0399012565612793, - "sft_loss": 1.3772176504135132, + "grad_norm": 7.650898256780712, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": -0.160761758685112, + "logits/rejected": -0.12005305290222168, + "logps/chosen": -1.323396921157837, + "logps/rejected": -1.7633053064346313, + "loss": 1.0471, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.323396921157837, + "rewards/margins": 0.43990859389305115, + "rewards/rejected": -1.7633053064346313, + "sft_loss": 1.4031734466552734, "step": 3780 }, { "epoch": 2.02575681552099, - "grad_norm": 8.500820535507295, - "learning_rate": 8.64308784068293e-07, - "logits/chosen": -0.6436256170272827, - "logits/rejected": -0.5537455677986145, - "logps/chosen": -1.3402982950210571, - "logps/rejected": -2.2292492389678955, - "loss": 0.8799, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3402982950210571, - "rewards/margins": 0.8889509439468384, - "rewards/rejected": -2.2292492389678955, - "sft_loss": 1.3502360582351685, + "grad_norm": 6.0976862564963, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": -0.21818462014198303, + "logits/rejected": -0.09445396065711975, + "logps/chosen": -1.3529255390167236, + "logps/rejected": -1.9128081798553467, + "loss": 0.9711, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3529255390167236, + "rewards/margins": 0.559882640838623, + "rewards/rejected": -1.9128081798553467, + "sft_loss": 1.3481228351593018, "step": 3785 }, { "epoch": 2.028432848302392, - "grad_norm": 3.8693780171767522, - "learning_rate": 8.600799823854655e-07, - "logits/chosen": -0.6947168111801147, - "logits/rejected": -0.5738447904586792, - "logps/chosen": -1.2864720821380615, - "logps/rejected": -2.260383129119873, - "loss": 0.8736, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2864720821380615, - "rewards/margins": 0.9739111065864563, - "rewards/rejected": -2.260383129119873, - "sft_loss": 1.3739585876464844, + "grad_norm": 5.441754368319895, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": -0.2935495674610138, + "logits/rejected": -0.11881496757268906, + "logps/chosen": -1.305868148803711, + "logps/rejected": -1.8669077157974243, + "loss": 0.9641, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.305868148803711, + "rewards/margins": 0.5610396862030029, + "rewards/rejected": -1.8669077157974243, + "sft_loss": 1.3788195848464966, "step": 3790 }, { "epoch": 2.031108881083793, - "grad_norm": 8.208137567928274, - "learning_rate": 8.558573892094547e-07, - "logits/chosen": -0.6459519863128662, - "logits/rejected": -0.6449599862098694, - "logps/chosen": -1.2557079792022705, - "logps/rejected": -1.9562523365020752, - "loss": 0.9317, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2557079792022705, - "rewards/margins": 0.7005443572998047, - "rewards/rejected": -1.9562523365020752, - "sft_loss": 1.3796627521514893, + "grad_norm": 9.345893259790794, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": -0.18723037838935852, + "logits/rejected": -0.15769711136817932, + "logps/chosen": -1.2819069623947144, + "logps/rejected": -1.6533581018447876, + "loss": 1.0239, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2819069623947144, + "rewards/margins": 0.37145131826400757, + "rewards/rejected": -1.6533581018447876, + "sft_loss": 1.3619247674942017, "step": 3795 }, { "epoch": 2.033784913865195, - "grad_norm": 15.685924831899001, - "learning_rate": 8.516410455078793e-07, - "logits/chosen": -0.6611881256103516, - "logits/rejected": -0.5716621279716492, - "logps/chosen": -1.3278257846832275, - "logps/rejected": -2.2204842567443848, - "loss": 0.9066, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.3278257846832275, - "rewards/margins": 0.8926587104797363, - "rewards/rejected": -2.2204842567443848, - "sft_loss": 1.4301295280456543, + "grad_norm": 11.061799198438894, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": -0.1916731595993042, + "logits/rejected": -0.059759993106126785, + "logps/chosen": -1.3480558395385742, + "logps/rejected": -1.8577535152435303, + "loss": 1.0046, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3480558395385742, + "rewards/margins": 0.509697675704956, + "rewards/rejected": -1.8577535152435303, + "sft_loss": 1.427194356918335, "step": 3800 }, { "epoch": 2.0364609466465966, - "grad_norm": 9.902269917183073, - "learning_rate": 8.474309921877238e-07, - "logits/chosen": -0.6370423436164856, - "logits/rejected": -0.5673704147338867, - "logps/chosen": -1.2646716833114624, - "logps/rejected": -2.070289373397827, - "loss": 0.8929, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2646716833114624, - "rewards/margins": 0.8056178092956543, - "rewards/rejected": -2.070289373397827, - "sft_loss": 1.324689269065857, + "grad_norm": 9.434706841711222, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": -0.1767081767320633, + "logits/rejected": -0.059535883367061615, + "logps/chosen": -1.2606967687606812, + "logps/rejected": -1.712166428565979, + "loss": 0.9757, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2606967687606812, + "rewards/margins": 0.4514695107936859, + "rewards/rejected": -1.712166428565979, + "sft_loss": 1.308203101158142, "step": 3805 }, { "epoch": 2.039136979427998, - "grad_norm": 10.820319319966085, - "learning_rate": 8.432272700949452e-07, - "logits/chosen": -0.590074896812439, - "logits/rejected": -0.5352758765220642, - "logps/chosen": -1.3020083904266357, - "logps/rejected": -2.3446125984191895, - "loss": 0.8485, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.3020083904266357, - "rewards/margins": 1.042603850364685, - "rewards/rejected": -2.3446125984191895, - "sft_loss": 1.3151377439498901, + "grad_norm": 8.219395893248242, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": -0.07792104035615921, + "logits/rejected": 0.010812275111675262, + "logps/chosen": -1.2784520387649536, + "logps/rejected": -1.8368396759033203, + "loss": 0.9606, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2784520387649536, + "rewards/margins": 0.558387815952301, + "rewards/rejected": -1.8368396759033203, + "sft_loss": 1.296967625617981, "step": 3810 }, { "epoch": 2.0418130122093996, - "grad_norm": 8.07034734287858, - "learning_rate": 8.390299200140712e-07, - "logits/chosen": -0.7658069729804993, - "logits/rejected": -0.6765872836112976, - "logps/chosen": -1.3506486415863037, - "logps/rejected": -2.1507608890533447, - "loss": 0.8976, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3506486415863037, - "rewards/margins": 0.8001121282577515, - "rewards/rejected": -2.1507608890533447, - "sft_loss": 1.339355707168579, + "grad_norm": 7.428218502025483, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": -0.30743715167045593, + "logits/rejected": -0.1746453046798706, + "logps/chosen": -1.3353490829467773, + "logps/rejected": -1.719951868057251, + "loss": 1.0081, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3353490829467773, + "rewards/margins": 0.3846026659011841, + "rewards/rejected": -1.719951868057251, + "sft_loss": 1.3209892511367798, "step": 3815 }, { "epoch": 2.0444890449908013, - "grad_norm": 7.603117365844863, - "learning_rate": 8.348389826678129e-07, - "logits/chosen": -0.7368592619895935, - "logits/rejected": -0.582007110118866, - "logps/chosen": -1.3428410291671753, - "logps/rejected": -2.237255811691284, - "loss": 0.9049, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3428410291671753, - "rewards/margins": 0.8944147229194641, - "rewards/rejected": -2.237255811691284, - "sft_loss": 1.389864206314087, + "grad_norm": 10.29560194460434, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": -0.281122624874115, + "logits/rejected": -0.08383277803659439, + "logps/chosen": -1.399206519126892, + "logps/rejected": -1.8089908361434937, + "loss": 1.0588, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.399206519126892, + "rewards/margins": 0.40978437662124634, + "rewards/rejected": -1.8089908361434937, + "sft_loss": 1.4134975671768188, "step": 3820 }, { "epoch": 2.0471650777722026, - "grad_norm": 9.380763525185147, - "learning_rate": 8.306544987166615e-07, - "logits/chosen": -0.6831952333450317, - "logits/rejected": -0.6276777982711792, - "logps/chosen": -1.2772094011306763, - "logps/rejected": -2.1352391242980957, - "loss": 0.8712, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.2772094011306763, - "rewards/margins": 0.8580294847488403, - "rewards/rejected": -2.1352391242980957, - "sft_loss": 1.344315767288208, + "grad_norm": 7.948370711282376, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": -0.23702120780944824, + "logits/rejected": -0.12606510519981384, + "logps/chosen": -1.2770016193389893, + "logps/rejected": -1.7505557537078857, + "loss": 0.971, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2770016193389893, + "rewards/margins": 0.47355398535728455, + "rewards/rejected": -1.7505557537078857, + "sft_loss": 1.3434475660324097, "step": 3825 }, { "epoch": 2.0498411105536043, - "grad_norm": 9.07721790035139, - "learning_rate": 8.264765087584998e-07, - "logits/chosen": -0.7254881858825684, - "logits/rejected": -0.6233684420585632, - "logps/chosen": -1.369759202003479, - "logps/rejected": -2.263784408569336, - "loss": 0.9069, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.369759202003479, - "rewards/margins": 0.8940251469612122, - "rewards/rejected": -2.263784408569336, - "sft_loss": 1.4023691415786743, + "grad_norm": 7.104542934372534, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": -0.31596413254737854, + "logits/rejected": -0.1691267192363739, + "logps/chosen": -1.3756964206695557, + "logps/rejected": -1.9007256031036377, + "loss": 0.9993, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3756964206695557, + "rewards/margins": 0.5250293016433716, + "rewards/rejected": -1.9007256031036377, + "sft_loss": 1.389954924583435, "step": 3830 }, { "epoch": 2.052517143335006, - "grad_norm": 5.792245981917584, - "learning_rate": 8.223050533282033e-07, - "logits/chosen": -0.6422053575515747, - "logits/rejected": -0.5379607677459717, - "logps/chosen": -1.3162529468536377, - "logps/rejected": -2.225130558013916, - "loss": 0.8708, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3162529468536377, - "rewards/margins": 0.9088780283927917, - "rewards/rejected": -2.225130558013916, - "sft_loss": 1.3632615804672241, + "grad_norm": 9.283822610907997, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": -0.189658060669899, + "logits/rejected": -0.0372190997004509, + "logps/chosen": -1.3490064144134521, + "logps/rejected": -1.820847511291504, + "loss": 0.9957, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3490064144134521, + "rewards/margins": 0.47184085845947266, + "rewards/rejected": -1.820847511291504, + "sft_loss": 1.3788495063781738, "step": 3835 }, { "epoch": 2.0551931761164073, - "grad_norm": 7.190187729992475, - "learning_rate": 8.181401728972522e-07, - "logits/chosen": -0.6327613592147827, - "logits/rejected": -0.5678998231887817, - "logps/chosen": -1.2547208070755005, - "logps/rejected": -2.2098288536071777, - "loss": 0.8403, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2547208070755005, - "rewards/margins": 0.9551082849502563, - "rewards/rejected": -2.2098288536071777, - "sft_loss": 1.2958214282989502, + "grad_norm": 7.308347252793988, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": -0.16798244416713715, + "logits/rejected": -0.0625995546579361, + "logps/chosen": -1.2780601978302002, + "logps/rejected": -1.8349239826202393, + "loss": 0.9607, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2780601978302002, + "rewards/margins": 0.5568638443946838, + "rewards/rejected": -1.8349239826202393, + "sft_loss": 1.3138271570205688, "step": 3840 }, { "epoch": 2.057869208897809, - "grad_norm": 8.609904902907434, - "learning_rate": 8.139819078733338e-07, - "logits/chosen": -0.7823083996772766, - "logits/rejected": -0.6090282201766968, - "logps/chosen": -1.3866506814956665, - "logps/rejected": -2.327561616897583, - "loss": 0.8912, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3866506814956665, - "rewards/margins": 0.940910816192627, - "rewards/rejected": -2.327561616897583, - "sft_loss": 1.4194447994232178, + "grad_norm": 5.961133659424594, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": -0.3501536250114441, + "logits/rejected": -0.10656338930130005, + "logps/chosen": -1.3864253759384155, + "logps/rejected": -1.8850123882293701, + "loss": 0.9978, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3864253759384155, + "rewards/margins": 0.4985869824886322, + "rewards/rejected": -1.8850123882293701, + "sft_loss": 1.4140044450759888, "step": 3845 }, { "epoch": 2.0605452416792107, - "grad_norm": 11.17797594329939, - "learning_rate": 8.098302985999547e-07, - "logits/chosen": -0.6843757033348083, - "logits/rejected": -0.5614744424819946, - "logps/chosen": -1.3497182130813599, - "logps/rejected": -2.1061267852783203, - "loss": 0.9361, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3497182130813599, - "rewards/margins": 0.7564086318016052, - "rewards/rejected": -2.1061267852783203, - "sft_loss": 1.395309567451477, + "grad_norm": 8.40981218131488, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": -0.24091164767742157, + "logits/rejected": -0.07308875769376755, + "logps/chosen": -1.3868906497955322, + "logps/rejected": -1.8397271633148193, + "loss": 1.0406, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3868906497955322, + "rewards/margins": 0.4528365731239319, + "rewards/rejected": -1.8397271633148193, + "sft_loss": 1.4096046686172485, "step": 3850 }, { "epoch": 2.063221274460612, - "grad_norm": 7.9813576056899125, - "learning_rate": 8.056853853560447e-07, - "logits/chosen": -0.6499954462051392, - "logits/rejected": -0.5020272731781006, - "logps/chosen": -1.2980072498321533, - "logps/rejected": -2.4210283756256104, - "loss": 0.842, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2980072498321533, - "rewards/margins": 1.123020887374878, - "rewards/rejected": -2.4210283756256104, - "sft_loss": 1.3265023231506348, + "grad_norm": 12.477651068225477, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": -0.17689739167690277, + "logits/rejected": 0.006243367679417133, + "logps/chosen": -1.3302204608917236, + "logps/rejected": -2.0581300258636475, + "loss": 0.932, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3302204608917236, + "rewards/margins": 0.7279095649719238, + "rewards/rejected": -2.0581300258636475, + "sft_loss": 1.333121657371521, "step": 3855 }, { "epoch": 2.0658973072420137, - "grad_norm": 6.769947516647681, - "learning_rate": 8.015472083555717e-07, - "logits/chosen": -0.6723402142524719, - "logits/rejected": -0.5541272163391113, - "logps/chosen": -1.279719591140747, - "logps/rejected": -2.26812744140625, - "loss": 0.8635, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.279719591140747, - "rewards/margins": 0.988408088684082, - "rewards/rejected": -2.26812744140625, - "sft_loss": 1.2970294952392578, + "grad_norm": 6.455403879486427, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": -0.20638510584831238, + "logits/rejected": -0.042838867753744125, + "logps/chosen": -1.3307113647460938, + "logps/rejected": -1.9355605840682983, + "loss": 0.9769, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3307113647460938, + "rewards/margins": 0.6048492193222046, + "rewards/rejected": -1.9355605840682983, + "sft_loss": 1.3224481344223022, "step": 3860 }, { "epoch": 2.0685733400234154, - "grad_norm": 8.99183976486184, - "learning_rate": 7.974158077471461e-07, - "logits/chosen": -0.788475513458252, - "logits/rejected": -0.667589008808136, - "logps/chosen": -1.3984959125518799, - "logps/rejected": -2.200364589691162, - "loss": 0.9518, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3984959125518799, - "rewards/margins": 0.8018687963485718, - "rewards/rejected": -2.200364589691162, - "sft_loss": 1.4584678411483765, + "grad_norm": 9.636876818748828, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": -0.32187479734420776, + "logits/rejected": -0.1283341944217682, + "logps/chosen": -1.4119688272476196, + "logps/rejected": -1.7951968908309937, + "loss": 1.0789, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4119688272476196, + "rewards/margins": 0.38322800397872925, + "rewards/rejected": -1.7951968908309937, + "sft_loss": 1.4519226551055908, "step": 3865 }, { "epoch": 2.0712493728048167, - "grad_norm": 6.984993509609364, - "learning_rate": 7.932912236136356e-07, - "logits/chosen": -0.6701870560646057, - "logits/rejected": -0.6589460968971252, - "logps/chosen": -1.1804810762405396, - "logps/rejected": -2.028503894805908, - "loss": 0.8487, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.1804810762405396, - "rewards/margins": 0.8480230569839478, - "rewards/rejected": -2.028503894805908, - "sft_loss": 1.2543269395828247, + "grad_norm": 10.56360714977184, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": -0.2144956886768341, + "logits/rejected": -0.1632927805185318, + "logps/chosen": -1.2403037548065186, + "logps/rejected": -1.7169328927993774, + "loss": 0.9702, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2403037548065186, + "rewards/margins": 0.4766291677951813, + "rewards/rejected": -1.7169328927993774, + "sft_loss": 1.270911693572998, "step": 3870 }, { "epoch": 2.0739254055862184, - "grad_norm": 8.13663272463278, - "learning_rate": 7.891734959717726e-07, - "logits/chosen": -0.6010066270828247, - "logits/rejected": -0.5039754509925842, - "logps/chosen": -1.330596923828125, - "logps/rejected": -2.231889486312866, - "loss": 0.8742, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.330596923828125, - "rewards/margins": 0.9012928009033203, - "rewards/rejected": -2.231889486312866, - "sft_loss": 1.4094765186309814, + "grad_norm": 9.584980104505666, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": -0.2386658638715744, + "logits/rejected": -0.12415851652622223, + "logps/chosen": -1.340667963027954, + "logps/rejected": -1.8312772512435913, + "loss": 0.979, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.340667963027954, + "rewards/margins": 0.49060922861099243, + "rewards/rejected": -1.8312772512435913, + "sft_loss": 1.4031498432159424, "step": 3875 }, { "epoch": 2.07660143836762, - "grad_norm": 7.834614174863811, - "learning_rate": 7.850626647717698e-07, - "logits/chosen": -0.6204323768615723, - "logits/rejected": -0.49028873443603516, - "logps/chosen": -1.2041785717010498, - "logps/rejected": -2.15822696685791, - "loss": 0.8209, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.2041785717010498, - "rewards/margins": 0.9540484547615051, - "rewards/rejected": -2.15822696685791, - "sft_loss": 1.2499016523361206, + "grad_norm": 13.524396726620859, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": -0.23326320946216583, + "logits/rejected": -0.06280764937400818, + "logps/chosen": -1.221289873123169, + "logps/rejected": -1.7749824523925781, + "loss": 0.9317, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.221289873123169, + "rewards/margins": 0.5536924600601196, + "rewards/rejected": -1.7749824523925781, + "sft_loss": 1.2569924592971802, "step": 3880 }, { "epoch": 2.0792774711490214, - "grad_norm": 6.98029199111734, - "learning_rate": 7.809587698969282e-07, - "logits/chosen": -0.5861740112304688, - "logits/rejected": -0.4753715395927429, - "logps/chosen": -1.2556285858154297, - "logps/rejected": -2.299781560897827, - "loss": 0.857, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2556285858154297, - "rewards/margins": 1.0441529750823975, - "rewards/rejected": -2.299781560897827, - "sft_loss": 1.3279292583465576, + "grad_norm": 9.48373908089574, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": -0.2690264582633972, + "logits/rejected": -0.11539246886968613, + "logps/chosen": -1.268319845199585, + "logps/rejected": -1.8795678615570068, + "loss": 0.9535, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.268319845199585, + "rewards/margins": 0.6112480163574219, + "rewards/rejected": -1.8795678615570068, + "sft_loss": 1.3194853067398071, "step": 3885 }, { "epoch": 2.081953503930423, - "grad_norm": 8.151958862375524, - "learning_rate": 7.768618511632555e-07, - "logits/chosen": -0.5082379579544067, - "logits/rejected": -0.42794299125671387, - "logps/chosen": -1.3039039373397827, - "logps/rejected": -2.215757131576538, - "loss": 0.9311, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3039039373397827, - "rewards/margins": 0.9118531942367554, - "rewards/rejected": -2.215757131576538, - "sft_loss": 1.3952258825302124, + "grad_norm": 7.272403165121372, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": -0.16457560658454895, + "logits/rejected": -0.06957674771547318, + "logps/chosen": -1.3222824335098267, + "logps/rejected": -1.7769525051116943, + "loss": 1.0518, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3222824335098267, + "rewards/margins": 0.4546701908111572, + "rewards/rejected": -1.7769525051116943, + "sft_loss": 1.375428557395935, "step": 3890 }, { "epoch": 2.084629536711825, - "grad_norm": 8.039789806789473, - "learning_rate": 7.727719483190737e-07, - "logits/chosen": -0.602218508720398, - "logits/rejected": -0.3982509970664978, - "logps/chosen": -1.3197612762451172, - "logps/rejected": -2.207941770553589, - "loss": 0.898, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3197612762451172, - "rewards/margins": 0.8881803750991821, - "rewards/rejected": -2.207941770553589, - "sft_loss": 1.3508869409561157, + "grad_norm": 6.596628800990976, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": -0.24087989330291748, + "logits/rejected": -0.0020778581965714693, + "logps/chosen": -1.288952112197876, + "logps/rejected": -1.8099241256713867, + "loss": 0.9846, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.288952112197876, + "rewards/margins": 0.5209718942642212, + "rewards/rejected": -1.8099241256713867, + "sft_loss": 1.3294023275375366, "step": 3895 }, { "epoch": 2.087305569493226, - "grad_norm": 8.145725584579546, - "learning_rate": 7.686891010446394e-07, - "logits/chosen": -0.46284013986587524, - "logits/rejected": -0.47182974219322205, - "logps/chosen": -1.3199361562728882, - "logps/rejected": -2.0793938636779785, - "loss": 0.9104, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3199361562728882, - "rewards/margins": 0.7594578266143799, - "rewards/rejected": -2.0793938636779785, - "sft_loss": 1.3835678100585938, + "grad_norm": 12.309650576815347, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": -0.06875167042016983, + "logits/rejected": -0.052970387041568756, + "logps/chosen": -1.3002475500106812, + "logps/rejected": -1.7216488122940063, + "loss": 1.0046, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3002475500106812, + "rewards/margins": 0.42140135169029236, + "rewards/rejected": -1.7216488122940063, + "sft_loss": 1.35751473903656, "step": 3900 }, { "epoch": 2.089981602274628, - "grad_norm": 5.976896677469458, - "learning_rate": 7.646133489517535e-07, - "logits/chosen": -0.5105876922607422, - "logits/rejected": -0.42746657133102417, - "logps/chosen": -1.323020577430725, - "logps/rejected": -2.170231819152832, - "loss": 0.8989, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.323020577430725, - "rewards/margins": 0.8472112417221069, - "rewards/rejected": -2.170231819152832, - "sft_loss": 1.3276392221450806, + "grad_norm": 7.051202750690179, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": -0.15310558676719666, + "logits/rejected": -0.05742264911532402, + "logps/chosen": -1.3195465803146362, + "logps/rejected": -1.7413108348846436, + "loss": 1.0062, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3195465803146362, + "rewards/margins": 0.4217642843723297, + "rewards/rejected": -1.7413108348846436, + "sft_loss": 1.3092045783996582, "step": 3905 }, { "epoch": 2.0926576350560295, - "grad_norm": 6.634770698723937, - "learning_rate": 7.605447315833821e-07, - "logits/chosen": -0.4753246307373047, - "logits/rejected": -0.3948332965373993, - "logps/chosen": -1.2177543640136719, - "logps/rejected": -2.0822913646698, - "loss": 0.8623, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2177543640136719, - "rewards/margins": 0.8645371198654175, - "rewards/rejected": -2.0822913646698, - "sft_loss": 1.2676682472229004, + "grad_norm": 8.940324337005347, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": -0.1288268119096756, + "logits/rejected": -0.006907902657985687, + "logps/chosen": -1.2185369729995728, + "logps/rejected": -1.6721731424331665, + "loss": 0.9723, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2185369729995728, + "rewards/margins": 0.4536362290382385, + "rewards/rejected": -1.6721731424331665, + "sft_loss": 1.2611117362976074, "step": 3910 }, { "epoch": 2.095333667837431, - "grad_norm": 9.030628197865246, - "learning_rate": 7.564832884132672e-07, - "logits/chosen": -0.5891907215118408, - "logits/rejected": -0.4459839463233948, - "logps/chosen": -1.3624029159545898, - "logps/rejected": -2.277984380722046, - "loss": 0.9139, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3624029159545898, - "rewards/margins": 0.9155814051628113, - "rewards/rejected": -2.277984380722046, - "sft_loss": 1.4179056882858276, + "grad_norm": 7.482529780113386, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": -0.22753730416297913, + "logits/rejected": -0.06723004579544067, + "logps/chosen": -1.3481438159942627, + "logps/rejected": -1.8852345943450928, + "loss": 0.9939, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3481438159942627, + "rewards/margins": 0.5370906591415405, + "rewards/rejected": -1.8852345943450928, + "sft_loss": 1.39449143409729, "step": 3915 }, { "epoch": 2.0980097006188325, - "grad_norm": 10.064942364992636, - "learning_rate": 7.524290588455499e-07, - "logits/chosen": -0.5865086913108826, - "logits/rejected": -0.4551008641719818, - "logps/chosen": -1.357023000717163, - "logps/rejected": -2.522416591644287, - "loss": 0.8519, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.357023000717163, - "rewards/margins": 1.165393590927124, - "rewards/rejected": -2.522416591644287, - "sft_loss": 1.3542098999023438, + "grad_norm": 13.492128161318169, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": -0.21619009971618652, + "logits/rejected": -0.05382479354739189, + "logps/chosen": -1.3529709577560425, + "logps/rejected": -2.009631633758545, + "loss": 0.9675, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3529709577560425, + "rewards/margins": 0.6566608548164368, + "rewards/rejected": -2.009631633758545, + "sft_loss": 1.3439533710479736, "step": 3920 }, { "epoch": 2.1006857334002342, - "grad_norm": 10.286245053566672, - "learning_rate": 7.483820822143816e-07, - "logits/chosen": -0.6092128157615662, - "logits/rejected": -0.5239665508270264, - "logps/chosen": -1.2598596811294556, - "logps/rejected": -2.266526937484741, - "loss": 0.8794, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2598596811294556, - "rewards/margins": 1.0066674947738647, - "rewards/rejected": -2.266526937484741, - "sft_loss": 1.3254220485687256, + "grad_norm": 6.67567810727143, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": -0.20344305038452148, + "logits/rejected": -0.08249272406101227, + "logps/chosen": -1.27012300491333, + "logps/rejected": -1.800536870956421, + "loss": 0.9979, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.27012300491333, + "rewards/margins": 0.5304139852523804, + "rewards/rejected": -1.800536870956421, + "sft_loss": 1.3256646394729614, "step": 3925 }, { "epoch": 2.103361766181636, - "grad_norm": 5.140048064303052, - "learning_rate": 7.443423977835487e-07, - "logits/chosen": -0.6823610663414001, - "logits/rejected": -0.576682984828949, - "logps/chosen": -1.3115248680114746, - "logps/rejected": -2.282677412033081, - "loss": 0.8606, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3115248680114746, - "rewards/margins": 0.971152663230896, - "rewards/rejected": -2.282677412033081, - "sft_loss": 1.3290514945983887, + "grad_norm": 5.538112523667141, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": -0.28186747431755066, + "logits/rejected": -0.1351318061351776, + "logps/chosen": -1.3375747203826904, + "logps/rejected": -1.8085401058197021, + "loss": 0.9924, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3375747203826904, + "rewards/margins": 0.4709654748439789, + "rewards/rejected": -1.8085401058197021, + "sft_loss": 1.341721773147583, "step": 3930 }, { "epoch": 2.106037798963037, - "grad_norm": 9.831200068099484, - "learning_rate": 7.403100447460861e-07, - "logits/chosen": -0.5606423020362854, - "logits/rejected": -0.5006273984909058, - "logps/chosen": -1.305371642112732, - "logps/rejected": -2.273237943649292, - "loss": 0.8726, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.305371642112732, - "rewards/margins": 0.9678661227226257, - "rewards/rejected": -2.273237943649292, - "sft_loss": 1.306117296218872, + "grad_norm": 6.646817354380201, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": -0.1404200941324234, + "logits/rejected": -0.07894755899906158, + "logps/chosen": -1.3222754001617432, + "logps/rejected": -1.8376655578613281, + "loss": 0.9938, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3222754001617432, + "rewards/margins": 0.5153903961181641, + "rewards/rejected": -1.8376655578613281, + "sft_loss": 1.3083678483963013, "step": 3935 }, { "epoch": 2.108713831744439, - "grad_norm": 10.618972493058113, - "learning_rate": 7.36285062223902e-07, - "logits/chosen": -0.5751175880432129, - "logits/rejected": -0.5403181910514832, - "logps/chosen": -1.2422279119491577, - "logps/rejected": -2.406665086746216, - "loss": 0.8025, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2422279119491577, - "rewards/margins": 1.1644370555877686, - "rewards/rejected": -2.406665086746216, - "sft_loss": 1.2548398971557617, + "grad_norm": 10.23679057616976, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": -0.17988917231559753, + "logits/rejected": -0.12885865569114685, + "logps/chosen": -1.2356412410736084, + "logps/rejected": -1.8562723398208618, + "loss": 0.9153, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2356412410736084, + "rewards/margins": 0.6206308603286743, + "rewards/rejected": -1.8562723398208618, + "sft_loss": 1.2337496280670166, "step": 3940 }, { "epoch": 2.1113898645258407, - "grad_norm": 8.628247938136461, - "learning_rate": 7.322674892673931e-07, - "logits/chosen": -0.5670984983444214, - "logits/rejected": -0.4008113741874695, - "logps/chosen": -1.3559355735778809, - "logps/rejected": -2.112020969390869, - "loss": 0.9507, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3559355735778809, - "rewards/margins": 0.7560855746269226, - "rewards/rejected": -2.112020969390869, - "sft_loss": 1.4266374111175537, + "grad_norm": 6.8928666859689045, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": -0.19816677272319794, + "logits/rejected": -0.0217067189514637, + "logps/chosen": -1.3648486137390137, + "logps/rejected": -1.738193154335022, + "loss": 1.0507, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3648486137390137, + "rewards/margins": 0.3733447790145874, + "rewards/rejected": -1.738193154335022, + "sft_loss": 1.3976587057113647, "step": 3945 }, { "epoch": 2.114065897307242, - "grad_norm": 7.079779269175077, - "learning_rate": 7.282573648550709e-07, - "logits/chosen": -0.48685067892074585, - "logits/rejected": -0.36240923404693604, - "logps/chosen": -1.3060166835784912, - "logps/rejected": -2.1700565814971924, - "loss": 0.9002, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3060166835784912, - "rewards/margins": 0.8640398979187012, - "rewards/rejected": -2.1700565814971924, - "sft_loss": 1.3409571647644043, + "grad_norm": 9.006350243530518, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": -0.11615003645420074, + "logits/rejected": 0.044729799032211304, + "logps/chosen": -1.3059442043304443, + "logps/rejected": -1.7530139684677124, + "loss": 1.0125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3059442043304443, + "rewards/margins": 0.4470697343349457, + "rewards/rejected": -1.7530139684677124, + "sft_loss": 1.3343216180801392, "step": 3950 }, { "epoch": 2.1167419300886436, - "grad_norm": 7.646818085113159, - "learning_rate": 7.242547278931792e-07, - "logits/chosen": -0.6195932030677795, - "logits/rejected": -0.5592155456542969, - "logps/chosen": -1.2778809070587158, - "logps/rejected": -2.3017425537109375, - "loss": 0.8653, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2778809070587158, - "rewards/margins": 1.0238616466522217, - "rewards/rejected": -2.3017425537109375, - "sft_loss": 1.3358291387557983, + "grad_norm": 6.81240384354102, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": -0.2713935077190399, + "logits/rejected": -0.18466126918792725, + "logps/chosen": -1.2866567373275757, + "logps/rejected": -1.8285629749298096, + "loss": 0.994, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2866567373275757, + "rewards/margins": 0.5419060587882996, + "rewards/rejected": -1.8285629749298096, + "sft_loss": 1.3387314081192017, "step": 3955 }, { "epoch": 2.1194179628700454, - "grad_norm": 9.883662610666288, - "learning_rate": 7.202596172153203e-07, - "logits/chosen": -0.525765597820282, - "logits/rejected": -0.4359908103942871, - "logps/chosen": -1.3174402713775635, - "logps/rejected": -2.382549524307251, - "loss": 0.8847, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3174402713775635, - "rewards/margins": 1.0651090145111084, - "rewards/rejected": -2.382549524307251, - "sft_loss": 1.4014842510223389, + "grad_norm": 12.535037949453782, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": -0.16105331480503082, + "logits/rejected": -0.04563795030117035, + "logps/chosen": -1.3320871591567993, + "logps/rejected": -2.02455472946167, + "loss": 0.9829, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3320871591567993, + "rewards/margins": 0.6924676895141602, + "rewards/rejected": -2.02455472946167, + "sft_loss": 1.413009762763977, "step": 3960 }, { "epoch": 2.1220939956514466, - "grad_norm": 8.474083790065452, - "learning_rate": 7.162720715820742e-07, - "logits/chosen": -0.5002860426902771, - "logits/rejected": -0.4082806706428528, - "logps/chosen": -1.2493045330047607, - "logps/rejected": -2.2874772548675537, - "loss": 0.8689, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2493045330047607, - "rewards/margins": 1.0381726026535034, - "rewards/rejected": -2.2874772548675537, - "sft_loss": 1.346387267112732, + "grad_norm": 6.675328687431673, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": -0.11590099334716797, + "logits/rejected": -0.0057983072474598885, + "logps/chosen": -1.2462270259857178, + "logps/rejected": -1.8627185821533203, + "loss": 0.9516, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2462270259857178, + "rewards/margins": 0.6164913773536682, + "rewards/rejected": -1.8627185821533203, + "sft_loss": 1.319265604019165, "step": 3965 }, { "epoch": 2.1247700284328483, - "grad_norm": 6.205278251934963, - "learning_rate": 7.122921296806278e-07, - "logits/chosen": -0.5613098740577698, - "logits/rejected": -0.48972979187965393, - "logps/chosen": -1.2905679941177368, - "logps/rejected": -2.313955068588257, - "loss": 0.8803, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2905679941177368, - "rewards/margins": 1.0233871936798096, - "rewards/rejected": -2.313955068588257, - "sft_loss": 1.3947936296463013, + "grad_norm": 6.603486037645406, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": -0.1910642683506012, + "logits/rejected": -0.09453106671571732, + "logps/chosen": -1.2924778461456299, + "logps/rejected": -1.8789180517196655, + "loss": 0.9891, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2924778461456299, + "rewards/margins": 0.5864402055740356, + "rewards/rejected": -1.8789180517196655, + "sft_loss": 1.3684488534927368, "step": 3970 }, { "epoch": 2.12744606121425, - "grad_norm": 9.86803358657914, - "learning_rate": 7.083198301243937e-07, - "logits/chosen": -0.5123628973960876, - "logits/rejected": -0.43953627347946167, - "logps/chosen": -1.180465817451477, - "logps/rejected": -1.981055498123169, - "loss": 0.847, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.180465817451477, - "rewards/margins": 0.8005896806716919, - "rewards/rejected": -1.981055498123169, - "sft_loss": 1.2407500743865967, + "grad_norm": 7.756837511314091, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": -0.13991785049438477, + "logits/rejected": -0.03918793797492981, + "logps/chosen": -1.211921215057373, + "logps/rejected": -1.6410366296768188, + "loss": 0.9586, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.211921215057373, + "rewards/margins": 0.4291153848171234, + "rewards/rejected": -1.6410366296768188, + "sft_loss": 1.2346084117889404, "step": 3975 }, { "epoch": 2.1301220939956513, - "grad_norm": 9.064455669725788, - "learning_rate": 7.043552114526395e-07, - "logits/chosen": -0.5958659052848816, - "logits/rejected": -0.53276127576828, - "logps/chosen": -1.1877778768539429, - "logps/rejected": -2.1918532848358154, - "loss": 0.8463, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.1877778768539429, - "rewards/margins": 1.0040756464004517, - "rewards/rejected": -2.1918532848358154, - "sft_loss": 1.3280725479125977, + "grad_norm": 7.276228405268265, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": -0.2536202073097229, + "logits/rejected": -0.1750868409872055, + "logps/chosen": -1.2155635356903076, + "logps/rejected": -1.861124038696289, + "loss": 0.9508, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2155635356903076, + "rewards/margins": 0.6455605626106262, + "rewards/rejected": -1.861124038696289, + "sft_loss": 1.329552412033081, "step": 3980 }, { "epoch": 2.132798126777053, - "grad_norm": 10.715834332859933, - "learning_rate": 7.003983121301139e-07, - "logits/chosen": -0.6565994620323181, - "logits/rejected": -0.5570293664932251, - "logps/chosen": -1.2925097942352295, - "logps/rejected": -2.3577346801757812, - "loss": 0.8334, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2925097942352295, - "rewards/margins": 1.0652248859405518, - "rewards/rejected": -2.3577346801757812, - "sft_loss": 1.341284990310669, + "grad_norm": 14.883539668534967, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": -0.2990095019340515, + "logits/rejected": -0.19773916900157928, + "logps/chosen": -1.3071012496948242, + "logps/rejected": -1.9256079196929932, + "loss": 0.944, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3071012496948242, + "rewards/margins": 0.6185065507888794, + "rewards/rejected": -1.9256079196929932, + "sft_loss": 1.3233211040496826, "step": 3985 }, { "epoch": 2.1354741595584548, - "grad_norm": 10.098791734156606, - "learning_rate": 6.964491705466704e-07, - "logits/chosen": -0.6845608949661255, - "logits/rejected": -0.5976250767707825, - "logps/chosen": -1.2707078456878662, - "logps/rejected": -2.3325119018554688, - "loss": 0.8549, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2707078456878662, - "rewards/margins": 1.0618040561676025, - "rewards/rejected": -2.3325119018554688, - "sft_loss": 1.322950839996338, + "grad_norm": 9.44597333332424, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": -0.29291418194770813, + "logits/rejected": -0.15362228453159332, + "logps/chosen": -1.2437551021575928, + "logps/rejected": -1.8007538318634033, + "loss": 0.9541, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2437551021575928, + "rewards/margins": 0.556998610496521, + "rewards/rejected": -1.8007538318634033, + "sft_loss": 1.307832956314087, "step": 3990 }, { "epoch": 2.138150192339856, - "grad_norm": 8.970579954518575, - "learning_rate": 6.92507825016899e-07, - "logits/chosen": -0.7098767161369324, - "logits/rejected": -0.4670625329017639, - "logps/chosen": -1.2964619398117065, - "logps/rejected": -2.4015231132507324, - "loss": 0.8596, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2964619398117065, - "rewards/margins": 1.1050611734390259, - "rewards/rejected": -2.4015231132507324, - "sft_loss": 1.360012412071228, + "grad_norm": 16.441148444934594, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": -0.3222687542438507, + "logits/rejected": -0.061028677970170975, + "logps/chosen": -1.3114013671875, + "logps/rejected": -1.8884906768798828, + "loss": 0.9779, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3114013671875, + "rewards/margins": 0.5770891308784485, + "rewards/rejected": -1.8884906768798828, + "sft_loss": 1.3354787826538086, "step": 3995 }, { "epoch": 2.1408262251212578, - "grad_norm": 11.765582068507088, - "learning_rate": 6.885743137797502e-07, - "logits/chosen": -0.5854183435440063, - "logits/rejected": -0.5112401843070984, - "logps/chosen": -1.2789857387542725, - "logps/rejected": -2.4257848262786865, - "loss": 0.8559, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2789857387542725, - "rewards/margins": 1.1467993259429932, - "rewards/rejected": -2.4257848262786865, - "sft_loss": 1.372701644897461, + "grad_norm": 16.965841060039175, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": -0.20985686779022217, + "logits/rejected": -0.10149633884429932, + "logps/chosen": -1.2924827337265015, + "logps/rejected": -1.9266865253448486, + "loss": 0.9654, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2924827337265015, + "rewards/margins": 0.6342039108276367, + "rewards/rejected": -1.9266865253448486, + "sft_loss": 1.3712036609649658, "step": 4000 }, { "epoch": 2.1408262251212578, - "eval_logits/chosen": -0.24339796602725983, - "eval_logits/rejected": -0.17608876526355743, - "eval_logps/chosen": -1.5362346172332764, - "eval_logps/rejected": -2.1634719371795654, - "eval_loss": 1.0419976711273193, - "eval_rewards/accuracies": 0.6550444960594177, - "eval_rewards/chosen": -1.5362346172332764, - "eval_rewards/margins": 0.6272372603416443, - "eval_rewards/rejected": -2.1634719371795654, - "eval_runtime": 43.6443, - "eval_samples_per_second": 30.817, - "eval_sft_loss": 1.4888213872909546, - "eval_steps_per_second": 7.722, + "eval_logits/chosen": 0.1482262760400772, + "eval_logits/rejected": 0.240483820438385, + "eval_logps/chosen": -1.3910187482833862, + "eval_logps/rejected": -1.8183025121688843, + "eval_loss": 1.0418007373809814, + "eval_rewards/accuracies": 0.6179525256156921, + "eval_rewards/chosen": -1.3910187482833862, + "eval_rewards/margins": 0.4272836744785309, + "eval_rewards/rejected": -1.8183025121688843, + "eval_runtime": 46.2356, + "eval_samples_per_second": 29.09, + "eval_sft_loss": 1.3967313766479492, + "eval_steps_per_second": 7.289, "step": 4000 }, { "epoch": 2.1435022579026595, - "grad_norm": 7.152007906365384, - "learning_rate": 6.846486749981684e-07, - "logits/chosen": -0.5965894460678101, - "logits/rejected": -0.41736525297164917, - "logps/chosen": -1.4271074533462524, - "logps/rejected": -2.2944045066833496, - "loss": 0.9183, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4271074533462524, - "rewards/margins": 0.8672970533370972, - "rewards/rejected": -2.2944045066833496, - "sft_loss": 1.4213173389434814, + "grad_norm": 9.133500661368627, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": -0.19903546571731567, + "logits/rejected": 0.02608914114534855, + "logps/chosen": -1.4473344087600708, + "logps/rejected": -1.8965390920639038, + "loss": 1.0441, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4473344087600708, + "rewards/margins": 0.44920483231544495, + "rewards/rejected": -1.8965390920639038, + "sft_loss": 1.4248669147491455, "step": 4005 }, { "epoch": 2.1461782906840607, - "grad_norm": 14.90474531035357, - "learning_rate": 6.807309467587173e-07, - "logits/chosen": -0.6228991150856018, - "logits/rejected": -0.5492960214614868, - "logps/chosen": -1.3263002634048462, - "logps/rejected": -2.23538875579834, - "loss": 0.8993, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3263002634048462, - "rewards/margins": 0.9090884923934937, - "rewards/rejected": -2.23538875579834, - "sft_loss": 1.3850328922271729, + "grad_norm": 9.074100048107182, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": -0.23106679320335388, + "logits/rejected": -0.13114657998085022, + "logps/chosen": -1.334062933921814, + "logps/rejected": -1.7841312885284424, + "loss": 1.0196, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.334062933921814, + "rewards/margins": 0.45006832480430603, + "rewards/rejected": -1.7841312885284424, + "sft_loss": 1.3713685274124146, "step": 4010 }, { "epoch": 2.1488543234654625, - "grad_norm": 10.686015445659427, - "learning_rate": 6.768211670712146e-07, - "logits/chosen": -0.6297835111618042, - "logits/rejected": -0.38724324107170105, - "logps/chosen": -1.3609362840652466, - "logps/rejected": -2.22047758102417, - "loss": 0.9235, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3609362840652466, - "rewards/margins": 0.8595415353775024, - "rewards/rejected": -2.22047758102417, - "sft_loss": 1.393182635307312, + "grad_norm": 9.618864174062091, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": -0.21874377131462097, + "logits/rejected": 0.03923650458455086, + "logps/chosen": -1.3294260501861572, + "logps/rejected": -1.780279517173767, + "loss": 1.026, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3294260501861572, + "rewards/margins": 0.45085349678993225, + "rewards/rejected": -1.780279517173767, + "sft_loss": 1.3640459775924683, "step": 4015 }, { "epoch": 2.151530356246864, - "grad_norm": 8.16760442315204, - "learning_rate": 6.729193738683589e-07, - "logits/chosen": -0.7085089683532715, - "logits/rejected": -0.60541832447052, - "logps/chosen": -1.3713706731796265, - "logps/rejected": -2.3267054557800293, - "loss": 0.9264, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3713706731796265, - "rewards/margins": 0.9553349614143372, - "rewards/rejected": -2.3267054557800293, - "sft_loss": 1.4503589868545532, + "grad_norm": 13.560338534464554, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": -0.29200059175491333, + "logits/rejected": -0.1397140622138977, + "logps/chosen": -1.364084243774414, + "logps/rejected": -1.825477957725525, + "loss": 1.0258, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.364084243774414, + "rewards/margins": 0.4613935351371765, + "rewards/rejected": -1.825477957725525, + "sft_loss": 1.4269052743911743, "step": 4020 }, { "epoch": 2.1542063890282654, - "grad_norm": 6.910080249548427, - "learning_rate": 6.690256050053652e-07, - "logits/chosen": -0.6153056025505066, - "logits/rejected": -0.5264173746109009, - "logps/chosen": -1.2847819328308105, - "logps/rejected": -2.305784225463867, - "loss": 0.8757, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2847819328308105, - "rewards/margins": 1.021002173423767, - "rewards/rejected": -2.305784225463867, - "sft_loss": 1.317858099937439, + "grad_norm": 7.098165162790431, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": -0.21481871604919434, + "logits/rejected": -0.10389243066310883, + "logps/chosen": -1.286023736000061, + "logps/rejected": -1.780125617980957, + "loss": 0.992, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.286023736000061, + "rewards/margins": 0.49410194158554077, + "rewards/rejected": -1.780125617980957, + "sft_loss": 1.304584264755249, "step": 4025 }, { "epoch": 2.156882421809667, - "grad_norm": 7.708333768885909, - "learning_rate": 6.651398982595967e-07, - "logits/chosen": -0.632286012172699, - "logits/rejected": -0.5932300090789795, - "logps/chosen": -1.2660969495773315, - "logps/rejected": -2.2909035682678223, - "loss": 0.8702, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2660969495773315, - "rewards/margins": 1.0248066186904907, - "rewards/rejected": -2.2909035682678223, - "sft_loss": 1.3490211963653564, + "grad_norm": 11.026115794573862, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": -0.25596949458122253, + "logits/rejected": -0.17235644161701202, + "logps/chosen": -1.2982758283615112, + "logps/rejected": -1.8246577978134155, + "loss": 0.9829, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2982758283615112, + "rewards/margins": 0.5263819694519043, + "rewards/rejected": -1.8246577978134155, + "sft_loss": 1.3357901573181152, "step": 4030 }, { "epoch": 2.159558454591069, - "grad_norm": 11.974974879307897, - "learning_rate": 6.612622913301961e-07, - "logits/chosen": -0.6110135316848755, - "logits/rejected": -0.6037889719009399, - "logps/chosen": -1.2175084352493286, - "logps/rejected": -1.889762282371521, - "loss": 0.9114, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2175084352493286, - "rewards/margins": 0.6722537875175476, - "rewards/rejected": -1.889762282371521, - "sft_loss": 1.3205323219299316, + "grad_norm": 9.030464811943805, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": -0.2392152100801468, + "logits/rejected": -0.20006027817726135, + "logps/chosen": -1.260817289352417, + "logps/rejected": -1.5614498853683472, + "loss": 1.0272, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.260817289352417, + "rewards/margins": 0.30063262581825256, + "rewards/rejected": -1.5614498853683472, + "sft_loss": 1.3343435525894165, "step": 4035 }, { "epoch": 2.16223448737247, - "grad_norm": 8.652725511123771, - "learning_rate": 6.573928218377243e-07, - "logits/chosen": -0.6040986776351929, - "logits/rejected": -0.6009663343429565, - "logps/chosen": -1.2341662645339966, - "logps/rejected": -2.081291675567627, - "loss": 0.8842, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2341662645339966, - "rewards/margins": 0.8471253514289856, - "rewards/rejected": -2.081291675567627, - "sft_loss": 1.2724459171295166, + "grad_norm": 5.569795891024788, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": -0.22639043629169464, + "logits/rejected": -0.18254801630973816, + "logps/chosen": -1.242897391319275, + "logps/rejected": -1.709593415260315, + "loss": 0.9835, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.242897391319275, + "rewards/margins": 0.46669578552246094, + "rewards/rejected": -1.709593415260315, + "sft_loss": 1.2660150527954102, "step": 4040 }, { "epoch": 2.164910520153872, - "grad_norm": 6.388012839682491, - "learning_rate": 6.5353152732379e-07, - "logits/chosen": -0.5803536772727966, - "logits/rejected": -0.47784996032714844, - "logps/chosen": -1.333279013633728, - "logps/rejected": -2.183581829071045, - "loss": 0.916, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.333279013633728, - "rewards/margins": 0.8503029942512512, - "rewards/rejected": -2.183581829071045, - "sft_loss": 1.4035253524780273, + "grad_norm": 6.791702368644484, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": -0.18503087759017944, + "logits/rejected": -0.05424821376800537, + "logps/chosen": -1.358148455619812, + "logps/rejected": -1.8204864263534546, + "loss": 1.0243, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.358148455619812, + "rewards/margins": 0.4623379111289978, + "rewards/rejected": -1.8204864263534546, + "sft_loss": 1.4118998050689697, "step": 4045 }, { "epoch": 2.1675865529352736, - "grad_norm": 10.295310498808199, - "learning_rate": 6.496784452506907e-07, - "logits/chosen": -0.665248692035675, - "logits/rejected": -0.5683599710464478, - "logps/chosen": -1.3648693561553955, - "logps/rejected": -2.200554609298706, - "loss": 0.9444, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3648693561553955, - "rewards/margins": 0.8356854319572449, - "rewards/rejected": -2.200554609298706, - "sft_loss": 1.4416182041168213, + "grad_norm": 8.644936297670235, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": -0.26354607939720154, + "logits/rejected": -0.15164721012115479, + "logps/chosen": -1.3595654964447021, + "logps/rejected": -1.7514352798461914, + "loss": 1.0539, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3595654964447021, + "rewards/margins": 0.39186978340148926, + "rewards/rejected": -1.7514352798461914, + "sft_loss": 1.4033119678497314, "step": 4050 }, { "epoch": 2.170262585716675, - "grad_norm": 7.63395836520522, - "learning_rate": 6.458336130010442e-07, - "logits/chosen": -0.573379397392273, - "logits/rejected": -0.5254855751991272, - "logps/chosen": -1.300022840499878, - "logps/rejected": -1.9362703561782837, - "loss": 0.8988, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.300022840499878, - "rewards/margins": 0.6362478137016296, - "rewards/rejected": -1.9362703561782837, - "sft_loss": 1.3450088500976562, + "grad_norm": 6.899153154126547, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": -0.1441766768693924, + "logits/rejected": -0.08697351068258286, + "logps/chosen": -1.338050365447998, + "logps/rejected": -1.6296417713165283, + "loss": 1.0226, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.338050365447998, + "rewards/margins": 0.2915913760662079, + "rewards/rejected": -1.6296417713165283, + "sft_loss": 1.3598458766937256, "step": 4055 }, { "epoch": 2.1729386184980766, - "grad_norm": 8.165944085428556, - "learning_rate": 6.419970678774312e-07, - "logits/chosen": -0.4763668477535248, - "logits/rejected": -0.381841242313385, - "logps/chosen": -1.202392578125, - "logps/rejected": -2.0925676822662354, - "loss": 0.8646, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.202392578125, - "rewards/margins": 0.8901751637458801, - "rewards/rejected": -2.0925676822662354, - "sft_loss": 1.3081729412078857, + "grad_norm": 21.304530755120677, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": -0.06472693383693695, + "logits/rejected": 0.04358845204114914, + "logps/chosen": -1.2369481325149536, + "logps/rejected": -1.7288429737091064, + "loss": 0.9767, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2369481325149536, + "rewards/margins": 0.4918946623802185, + "rewards/rejected": -1.7288429737091064, + "sft_loss": 1.2975510358810425, "step": 4060 }, { "epoch": 2.1756146512794783, - "grad_norm": 7.609107903515956, - "learning_rate": 6.381688471020282e-07, - "logits/chosen": -0.6313573122024536, - "logits/rejected": -0.584338366985321, - "logps/chosen": -1.2770190238952637, - "logps/rejected": -2.2510018348693848, - "loss": 0.8755, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2770190238952637, - "rewards/margins": 0.9739829897880554, - "rewards/rejected": -2.2510018348693848, - "sft_loss": 1.3386409282684326, + "grad_norm": 16.284974043783276, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": -0.2730258107185364, + "logits/rejected": -0.1723470240831375, + "logps/chosen": -1.3269093036651611, + "logps/rejected": -1.891418695449829, + "loss": 0.9966, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3269093036651611, + "rewards/margins": 0.5645094513893127, + "rewards/rejected": -1.891418695449829, + "sft_loss": 1.3565967082977295, "step": 4065 }, { "epoch": 2.1782906840608796, - "grad_norm": 10.51751349012702, - "learning_rate": 6.34348987816251e-07, - "logits/chosen": -0.5648465156555176, - "logits/rejected": -0.36703091859817505, - "logps/chosen": -1.2451423406600952, - "logps/rejected": -2.3362300395965576, - "loss": 0.8612, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.2451423406600952, - "rewards/margins": 1.0910876989364624, - "rewards/rejected": -2.3362300395965576, - "sft_loss": 1.3578553199768066, + "grad_norm": 11.971155874073554, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": -0.16100676357746124, + "logits/rejected": 0.049106527119874954, + "logps/chosen": -1.2725163698196411, + "logps/rejected": -1.8967195749282837, + "loss": 0.9728, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2725163698196411, + "rewards/margins": 0.624203085899353, + "rewards/rejected": -1.8967195749282837, + "sft_loss": 1.3371376991271973, "step": 4070 }, { "epoch": 2.1809667168422813, - "grad_norm": 12.273680244934537, - "learning_rate": 6.3053752708039e-07, - "logits/chosen": -0.5989198684692383, - "logits/rejected": -0.4523433744907379, - "logps/chosen": -1.350265622138977, - "logps/rejected": -2.188004732131958, - "loss": 0.9194, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.350265622138977, - "rewards/margins": 0.8377388119697571, - "rewards/rejected": -2.188004732131958, - "sft_loss": 1.4103902578353882, + "grad_norm": 12.42484802411317, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": -0.23265857994556427, + "logits/rejected": -0.04037471115589142, + "logps/chosen": -1.378442406654358, + "logps/rejected": -1.7594501972198486, + "loss": 1.0376, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.378442406654358, + "rewards/margins": 0.38100793957710266, + "rewards/rejected": -1.7594501972198486, + "sft_loss": 1.4049503803253174, "step": 4075 }, { "epoch": 2.183642749623683, - "grad_norm": 5.712507138294748, - "learning_rate": 6.267345018732552e-07, - "logits/chosen": -0.6406753659248352, - "logits/rejected": -0.5366859436035156, - "logps/chosen": -1.3789926767349243, - "logps/rejected": -2.4511501789093018, - "loss": 0.9131, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3789926767349243, - "rewards/margins": 1.0721572637557983, - "rewards/rejected": -2.4511501789093018, - "sft_loss": 1.467935562133789, + "grad_norm": 9.926837801001243, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": -0.24227485060691833, + "logits/rejected": -0.10362211614847183, + "logps/chosen": -1.3642479181289673, + "logps/rejected": -1.9338639974594116, + "loss": 1.016, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3642479181289673, + "rewards/margins": 0.5696161389350891, + "rewards/rejected": -1.9338639974594116, + "sft_loss": 1.4558117389678955, "step": 4080 }, { "epoch": 2.1863187824050843, - "grad_norm": 10.413751258895298, - "learning_rate": 6.229399490918126e-07, - "logits/chosen": -0.5594549775123596, - "logits/rejected": -0.5101550817489624, - "logps/chosen": -1.3116942644119263, - "logps/rejected": -2.1040501594543457, - "loss": 0.8984, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3116942644119263, - "rewards/margins": 0.7923555374145508, - "rewards/rejected": -2.1040501594543457, - "sft_loss": 1.3226666450500488, + "grad_norm": 10.447830711803674, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": -0.20619995892047882, + "logits/rejected": -0.11009863764047623, + "logps/chosen": -1.309762954711914, + "logps/rejected": -1.675286054611206, + "loss": 1.0163, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.309762954711914, + "rewards/margins": 0.3655230700969696, + "rewards/rejected": -1.675286054611206, + "sft_loss": 1.3190863132476807, "step": 4085 }, { "epoch": 2.188994815186486, - "grad_norm": 7.726743703110656, - "learning_rate": 6.19153905550831e-07, - "logits/chosen": -0.6734306216239929, - "logits/rejected": -0.5152979493141174, - "logps/chosen": -1.2853702306747437, - "logps/rejected": -2.1580288410186768, - "loss": 0.897, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2853702306747437, - "rewards/margins": 0.8726586103439331, - "rewards/rejected": -2.1580288410186768, - "sft_loss": 1.3541196584701538, + "grad_norm": 7.788529932811042, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": -0.2938595116138458, + "logits/rejected": -0.10211338102817535, + "logps/chosen": -1.279653549194336, + "logps/rejected": -1.7211673259735107, + "loss": 1.0055, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.279653549194336, + "rewards/margins": 0.4415138363838196, + "rewards/rejected": -1.7211673259735107, + "sft_loss": 1.3394492864608765, "step": 4090 }, { "epoch": 2.1916708479678877, - "grad_norm": 8.947962461451352, - "learning_rate": 6.153764079825211e-07, - "logits/chosen": -0.6520611047744751, - "logits/rejected": -0.5935763120651245, - "logps/chosen": -1.3681317567825317, - "logps/rejected": -2.20524525642395, - "loss": 0.925, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3681317567825317, - "rewards/margins": 0.8371133804321289, - "rewards/rejected": -2.20524525642395, - "sft_loss": 1.4108130931854248, + "grad_norm": 7.232847549769498, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": -0.2455311268568039, + "logits/rejected": -0.15797817707061768, + "logps/chosen": -1.3783433437347412, + "logps/rejected": -1.723474144935608, + "loss": 1.058, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3783433437347412, + "rewards/margins": 0.34513089060783386, + "rewards/rejected": -1.723474144935608, + "sft_loss": 1.4231681823730469, "step": 4095 }, { "epoch": 2.194346880749289, - "grad_norm": 10.456483262678793, - "learning_rate": 6.116074930361803e-07, - "logits/chosen": -0.5813900828361511, - "logits/rejected": -0.47301873564720154, - "logps/chosen": -1.253137469291687, - "logps/rejected": -2.296706199645996, - "loss": 0.8651, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.253137469291687, - "rewards/margins": 1.043568730354309, - "rewards/rejected": -2.296706199645996, - "sft_loss": 1.3607757091522217, + "grad_norm": 10.274974819074817, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": -0.17151767015457153, + "logits/rejected": -0.03050888516008854, + "logps/chosen": -1.212294340133667, + "logps/rejected": -1.779362678527832, + "loss": 0.9717, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.212294340133667, + "rewards/margins": 0.5670684576034546, + "rewards/rejected": -1.779362678527832, + "sft_loss": 1.330917239189148, "step": 4100 }, { "epoch": 2.1970229135306907, - "grad_norm": 7.759076142826036, - "learning_rate": 6.078471972778388e-07, - "logits/chosen": -0.592880368232727, - "logits/rejected": -0.4213104844093323, - "logps/chosen": -1.3892344236373901, - "logps/rejected": -2.3780176639556885, - "loss": 0.8894, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3892344236373901, - "rewards/margins": 0.9887831807136536, - "rewards/rejected": -2.3780176639556885, - "sft_loss": 1.4068810939788818, + "grad_norm": 10.474003261456547, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": -0.2251317799091339, + "logits/rejected": -0.031384989619255066, + "logps/chosen": -1.3947486877441406, + "logps/rejected": -1.8142874240875244, + "loss": 1.022, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3947486877441406, + "rewards/margins": 0.41953855752944946, + "rewards/rejected": -1.8142874240875244, + "sft_loss": 1.391324520111084, "step": 4105 }, { "epoch": 2.1996989463120924, - "grad_norm": 13.227310541931292, - "learning_rate": 6.040955571899018e-07, - "logits/chosen": -0.601898729801178, - "logits/rejected": -0.44615617394447327, - "logps/chosen": -1.33761727809906, - "logps/rejected": -2.3596558570861816, - "loss": 0.9046, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.33761727809906, - "rewards/margins": 1.0220386981964111, - "rewards/rejected": -2.3596558570861816, - "sft_loss": 1.4018828868865967, + "grad_norm": 22.073423191846164, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": -0.17992620170116425, + "logits/rejected": -0.00832604430615902, + "logps/chosen": -1.2970690727233887, + "logps/rejected": -1.7878872156143188, + "loss": 1.0178, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2970690727233887, + "rewards/margins": 0.49081793427467346, + "rewards/rejected": -1.7878872156143188, + "sft_loss": 1.368438959121704, "step": 4110 }, { "epoch": 2.202374979093494, - "grad_norm": 11.048455514959556, - "learning_rate": 6.003526091707986e-07, - "logits/chosen": -0.5543791055679321, - "logits/rejected": -0.5159690976142883, - "logps/chosen": -1.307888150215149, - "logps/rejected": -2.3227555751800537, - "loss": 0.8555, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.307888150215149, - "rewards/margins": 1.0148675441741943, - "rewards/rejected": -2.3227555751800537, - "sft_loss": 1.3637077808380127, + "grad_norm": 13.437164800427897, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": -0.15775267779827118, + "logits/rejected": -0.10851816833019257, + "logps/chosen": -1.3005092144012451, + "logps/rejected": -1.8442541360855103, + "loss": 0.963, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3005092144012451, + "rewards/margins": 0.5437448024749756, + "rewards/rejected": -1.8442541360855103, + "sft_loss": 1.3481507301330566, "step": 4115 }, { "epoch": 2.2050510118748954, - "grad_norm": 7.691464355723396, - "learning_rate": 5.966183895346264e-07, - "logits/chosen": -0.58518385887146, - "logits/rejected": -0.5370736122131348, - "logps/chosen": -1.245976209640503, - "logps/rejected": -2.251274585723877, - "loss": 0.8695, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.245976209640503, - "rewards/margins": 1.005298376083374, - "rewards/rejected": -2.251274585723877, - "sft_loss": 1.34295654296875, + "grad_norm": 8.324524926032959, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": -0.204762265086174, + "logits/rejected": -0.12053843587636948, + "logps/chosen": -1.254342794418335, + "logps/rejected": -1.835875153541565, + "loss": 0.9696, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.254342794418335, + "rewards/margins": 0.5815322399139404, + "rewards/rejected": -1.835875153541565, + "sft_loss": 1.3362596035003662, "step": 4120 }, { "epoch": 2.207727044656297, - "grad_norm": 7.3169015153864585, - "learning_rate": 5.928929345108015e-07, - "logits/chosen": -0.651535153388977, - "logits/rejected": -0.4840888977050781, - "logps/chosen": -1.2679508924484253, - "logps/rejected": -2.3214924335479736, - "loss": 0.8589, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2679508924484253, - "rewards/margins": 1.053541660308838, - "rewards/rejected": -2.3214924335479736, - "sft_loss": 1.3484123945236206, + "grad_norm": 11.093237168255653, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": -0.24492064118385315, + "logits/rejected": -0.025628382340073586, + "logps/chosen": -1.2933624982833862, + "logps/rejected": -1.8775631189346313, + "loss": 0.9738, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2933624982833862, + "rewards/margins": 0.5842007398605347, + "rewards/rejected": -1.8775631189346313, + "sft_loss": 1.3566689491271973, "step": 4125 }, { "epoch": 2.210403077437699, - "grad_norm": 13.563104283561207, - "learning_rate": 5.891762802437039e-07, - "logits/chosen": -0.5729442834854126, - "logits/rejected": -0.49847179651260376, - "logps/chosen": -1.315578579902649, - "logps/rejected": -2.3521034717559814, - "loss": 0.8863, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.315578579902649, - "rewards/margins": 1.036525011062622, - "rewards/rejected": -2.3521034717559814, - "sft_loss": 1.3842662572860718, + "grad_norm": 9.123638617689465, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": -0.16828450560569763, + "logits/rejected": -0.08324885368347168, + "logps/chosen": -1.3041160106658936, + "logps/rejected": -1.8632113933563232, + "loss": 0.9844, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3041160106658936, + "rewards/margins": 0.5590953826904297, + "rewards/rejected": -1.8632113933563232, + "sft_loss": 1.351027250289917, "step": 4130 }, { "epoch": 2.2130791102191, - "grad_norm": 8.362458070033808, - "learning_rate": 5.854684627923306e-07, - "logits/chosen": -0.5584360361099243, - "logits/rejected": -0.5975090265274048, - "logps/chosen": -1.3722999095916748, - "logps/rejected": -2.573744297027588, - "loss": 0.9023, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3722999095916748, - "rewards/margins": 1.2014447450637817, - "rewards/rejected": -2.573744297027588, - "sft_loss": 1.4072306156158447, + "grad_norm": 9.575295181352812, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": -0.15587007999420166, + "logits/rejected": -0.16831621527671814, + "logps/chosen": -1.4001448154449463, + "logps/rejected": -2.0440139770507812, + "loss": 1.0319, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4001448154449463, + "rewards/margins": 0.6438690423965454, + "rewards/rejected": -2.0440139770507812, + "sft_loss": 1.446777105331421, "step": 4135 }, { "epoch": 2.215755143000502, - "grad_norm": 7.472610360581948, - "learning_rate": 5.817695181299418e-07, - "logits/chosen": -0.6967195272445679, - "logits/rejected": -0.6487011313438416, - "logps/chosen": -1.2705659866333008, - "logps/rejected": -2.1925406455993652, - "loss": 0.879, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2705659866333008, - "rewards/margins": 0.9219745397567749, - "rewards/rejected": -2.1925406455993652, - "sft_loss": 1.3230068683624268, + "grad_norm": 9.246711856267877, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": -0.31833615899086, + "logits/rejected": -0.22816014289855957, + "logps/chosen": -1.2842254638671875, + "logps/rejected": -1.8089005947113037, + "loss": 0.9756, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2842254638671875, + "rewards/margins": 0.5246752500534058, + "rewards/rejected": -1.8089005947113037, + "sft_loss": 1.3310213088989258, "step": 4140 }, { "epoch": 2.2184311757819035, - "grad_norm": 8.964792124855025, - "learning_rate": 5.780794821437158e-07, - "logits/chosen": -0.519437313079834, - "logits/rejected": -0.3859899342060089, - "logps/chosen": -1.3835476636886597, - "logps/rejected": -2.3953795433044434, - "loss": 0.8809, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.3835476636886597, - "rewards/margins": 1.011831521987915, - "rewards/rejected": -2.3953795433044434, - "sft_loss": 1.3992650508880615, + "grad_norm": 9.691411327588451, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": -0.13669563829898834, + "logits/rejected": -0.0020809501875191927, + "logps/chosen": -1.4141411781311035, + "logps/rejected": -1.9502674341201782, + "loss": 1.0033, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4141411781311035, + "rewards/margins": 0.5361261367797852, + "rewards/rejected": -1.9502674341201782, + "sft_loss": 1.4029778242111206, "step": 4145 }, { "epoch": 2.221107208563305, - "grad_norm": 7.8812313935035405, - "learning_rate": 5.743983906343969e-07, - "logits/chosen": -0.5854192972183228, - "logits/rejected": -0.5049089193344116, - "logps/chosen": -1.2127283811569214, - "logps/rejected": -2.2633450031280518, - "loss": 0.8527, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.2127283811569214, - "rewards/margins": 1.0506165027618408, - "rewards/rejected": -2.2633450031280518, - "sft_loss": 1.3281142711639404, + "grad_norm": 7.939966528509989, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": -0.1929643601179123, + "logits/rejected": -0.09834329783916473, + "logps/chosen": -1.2604628801345825, + "logps/rejected": -1.8470321893692017, + "loss": 0.9755, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2604628801345825, + "rewards/margins": 0.58656907081604, + "rewards/rejected": -1.8470321893692017, + "sft_loss": 1.3392512798309326, "step": 4150 }, { "epoch": 2.2237832413447065, - "grad_norm": 8.011591459714527, - "learning_rate": 5.707262793159521e-07, - "logits/chosen": -0.5341406464576721, - "logits/rejected": -0.5728651881217957, - "logps/chosen": -1.2559373378753662, - "logps/rejected": -2.0911660194396973, - "loss": 0.8749, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.2559373378753662, - "rewards/margins": 0.8352285623550415, - "rewards/rejected": -2.0911660194396973, - "sft_loss": 1.3055473566055298, + "grad_norm": 7.6808240546636615, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": -0.1238643079996109, + "logits/rejected": -0.14029282331466675, + "logps/chosen": -1.2798162698745728, + "logps/rejected": -1.7341972589492798, + "loss": 0.9756, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2798162698745728, + "rewards/margins": 0.45438089966773987, + "rewards/rejected": -1.7341972589492798, + "sft_loss": 1.2994613647460938, "step": 4155 }, { "epoch": 2.2264592741261082, - "grad_norm": 12.086417724962727, - "learning_rate": 5.670631838152204e-07, - "logits/chosen": -0.5920090675354004, - "logits/rejected": -0.47571271657943726, - "logps/chosen": -1.3779784440994263, - "logps/rejected": -2.217038869857788, - "loss": 0.9123, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3779784440994263, - "rewards/margins": 0.8390604853630066, - "rewards/rejected": -2.217038869857788, - "sft_loss": 1.4312714338302612, + "grad_norm": 11.861276253624242, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": -0.185426265001297, + "logits/rejected": -0.026468921452760696, + "logps/chosen": -1.3891743421554565, + "logps/rejected": -1.7652900218963623, + "loss": 1.0455, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3891743421554565, + "rewards/margins": 0.3761156499385834, + "rewards/rejected": -1.7652900218963623, + "sft_loss": 1.4331741333007812, "step": 4160 }, { "epoch": 2.2291353069075095, - "grad_norm": 7.473769846664818, - "learning_rate": 5.634091396715716e-07, - "logits/chosen": -0.6135045886039734, - "logits/rejected": -0.5422523021697998, - "logps/chosen": -1.309459924697876, - "logps/rejected": -2.377963066101074, - "loss": 0.8797, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.309459924697876, - "rewards/margins": 1.0685032606124878, - "rewards/rejected": -2.377963066101074, - "sft_loss": 1.3874919414520264, + "grad_norm": 12.865007757777473, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": -0.20513662695884705, + "logits/rejected": -0.08602355420589447, + "logps/chosen": -1.3340986967086792, + "logps/rejected": -1.872727394104004, + "loss": 1.0218, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3340986967086792, + "rewards/margins": 0.5386286973953247, + "rewards/rejected": -1.872727394104004, + "sft_loss": 1.3980815410614014, "step": 4165 }, { "epoch": 2.231811339688911, - "grad_norm": 10.69968137640894, - "learning_rate": 5.59764182336557e-07, - "logits/chosen": -0.49654465913772583, - "logits/rejected": -0.4765149652957916, - "logps/chosen": -1.354604959487915, - "logps/rejected": -2.4720609188079834, - "loss": 0.8629, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.354604959487915, - "rewards/margins": 1.1174561977386475, - "rewards/rejected": -2.4720609188079834, - "sft_loss": 1.4114564657211304, + "grad_norm": 25.443956510077477, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": -0.0756000429391861, + "logits/rejected": -0.008861005306243896, + "logps/chosen": -1.3441126346588135, + "logps/rejected": -1.889350175857544, + "loss": 0.9776, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3441126346588135, + "rewards/margins": 0.5452374815940857, + "rewards/rejected": -1.889350175857544, + "sft_loss": 1.373456358909607, "step": 4170 }, { "epoch": 2.234487372470313, - "grad_norm": 10.193127728348049, - "learning_rate": 5.561283471735695e-07, - "logits/chosen": -0.5936630964279175, - "logits/rejected": -0.535660982131958, - "logps/chosen": -1.2989658117294312, - "logps/rejected": -2.1412882804870605, - "loss": 0.9114, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2989658117294312, - "rewards/margins": 0.8423227071762085, - "rewards/rejected": -2.1412882804870605, - "sft_loss": 1.340101957321167, + "grad_norm": 9.939323030947252, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": -0.2153160274028778, + "logits/rejected": -0.10737192630767822, + "logps/chosen": -1.32200026512146, + "logps/rejected": -1.6421655416488647, + "loss": 1.0551, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.32200026512146, + "rewards/margins": 0.32016539573669434, + "rewards/rejected": -1.6421655416488647, + "sft_loss": 1.335711121559143, "step": 4175 }, { "epoch": 2.237163405251714, - "grad_norm": 18.238471325259738, - "learning_rate": 5.52501669457497e-07, - "logits/chosen": -0.6202064752578735, - "logits/rejected": -0.41418686509132385, - "logps/chosen": -1.2645350694656372, - "logps/rejected": -2.3794612884521484, - "loss": 0.845, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2645350694656372, - "rewards/margins": 1.1149260997772217, - "rewards/rejected": -2.3794612884521484, - "sft_loss": 1.3265830278396606, + "grad_norm": 9.617887992384906, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": -0.2255008965730667, + "logits/rejected": -0.0034437566064298153, + "logps/chosen": -1.2645217180252075, + "logps/rejected": -1.8279240131378174, + "loss": 0.9514, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2645217180252075, + "rewards/margins": 0.5634021759033203, + "rewards/rejected": -1.8279240131378174, + "sft_loss": 1.313612461090088, "step": 4180 }, { "epoch": 2.239839438033116, - "grad_norm": 13.203807096035783, - "learning_rate": 5.488841843743833e-07, - "logits/chosen": -0.6738411784172058, - "logits/rejected": -0.6753481030464172, - "logps/chosen": -1.229512095451355, - "logps/rejected": -2.188352108001709, - "loss": 0.8635, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.229512095451355, - "rewards/margins": 0.958840012550354, - "rewards/rejected": -2.188352108001709, - "sft_loss": 1.2865744829177856, + "grad_norm": 7.5739248884185395, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": -0.2612372040748596, + "logits/rejected": -0.24644172191619873, + "logps/chosen": -1.2250291109085083, + "logps/rejected": -1.7144653797149658, + "loss": 0.9706, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2250291109085083, + "rewards/margins": 0.48943623900413513, + "rewards/rejected": -1.7144653797149658, + "sft_loss": 1.2682294845581055, "step": 4185 }, { "epoch": 2.2425154708145176, - "grad_norm": 8.750089998855545, - "learning_rate": 5.452759270210839e-07, - "logits/chosen": -0.5226529836654663, - "logits/rejected": -0.4423566460609436, - "logps/chosen": -1.278633952140808, - "logps/rejected": -2.46871280670166, - "loss": 0.8591, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.278633952140808, - "rewards/margins": 1.1900789737701416, - "rewards/rejected": -2.46871280670166, - "sft_loss": 1.3546621799468994, + "grad_norm": 7.365542445718876, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": -0.12606190145015717, + "logits/rejected": -0.014820749871432781, + "logps/chosen": -1.2924238443374634, + "logps/rejected": -1.892459511756897, + "loss": 0.9807, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2924238443374634, + "rewards/margins": 0.600035548210144, + "rewards/rejected": -1.892459511756897, + "sft_loss": 1.3498013019561768, "step": 4190 }, { "epoch": 2.245191503595919, - "grad_norm": 6.64950755241882, - "learning_rate": 5.416769324049282e-07, - "logits/chosen": -0.729025661945343, - "logits/rejected": -0.6177822351455688, - "logps/chosen": -1.3424952030181885, - "logps/rejected": -2.1164371967315674, - "loss": 0.9672, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3424952030181885, - "rewards/margins": 0.7739418745040894, - "rewards/rejected": -2.1164371967315674, - "sft_loss": 1.4235026836395264, + "grad_norm": 9.238459788940935, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": -0.3326072096824646, + "logits/rejected": -0.18114691972732544, + "logps/chosen": -1.3395802974700928, + "logps/rejected": -1.7088123559951782, + "loss": 1.0665, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3395802974700928, + "rewards/margins": 0.3692319989204407, + "rewards/rejected": -1.7088123559951782, + "sft_loss": 1.3958898782730103, "step": 4195 }, { "epoch": 2.2478675363773206, - "grad_norm": 8.79179757288585, - "learning_rate": 5.38087235443377e-07, - "logits/chosen": -0.4805554747581482, - "logits/rejected": -0.525387704372406, - "logps/chosen": -1.3676626682281494, - "logps/rejected": -2.3163561820983887, - "loss": 0.8948, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3676626682281494, - "rewards/margins": 0.9486930966377258, - "rewards/rejected": -2.3163561820983887, - "sft_loss": 1.4133599996566772, + "grad_norm": 9.822847787054164, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": -0.05810268968343735, + "logits/rejected": -0.06424954533576965, + "logps/chosen": -1.384313941001892, + "logps/rejected": -1.8423607349395752, + "loss": 1.0364, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.384313941001892, + "rewards/margins": 0.45804667472839355, + "rewards/rejected": -1.8423607349395752, + "sft_loss": 1.4010447263717651, "step": 4200 }, { "epoch": 2.2505435691587223, - "grad_norm": 5.380572243442791, - "learning_rate": 5.345068709636866e-07, - "logits/chosen": -0.6558794975280762, - "logits/rejected": -0.5935714244842529, - "logps/chosen": -1.278519630432129, - "logps/rejected": -2.2183380126953125, - "loss": 0.859, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.278519630432129, - "rewards/margins": 0.9398185610771179, - "rewards/rejected": -2.2183380126953125, - "sft_loss": 1.3346195220947266, + "grad_norm": 7.372415164748828, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": -0.25069189071655273, + "logits/rejected": -0.17634975910186768, + "logps/chosen": -1.283087134361267, + "logps/rejected": -1.765835165977478, + "loss": 0.9576, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.283087134361267, + "rewards/margins": 0.4827481210231781, + "rewards/rejected": -1.765835165977478, + "sft_loss": 1.3241028785705566, "step": 4205 }, { "epoch": 2.2532196019401236, - "grad_norm": 7.582818350173143, - "learning_rate": 5.309358737025682e-07, - "logits/chosen": -0.6234208941459656, - "logits/rejected": -0.5458418726921082, - "logps/chosen": -1.3497518301010132, - "logps/rejected": -2.6105995178222656, - "loss": 0.8876, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3497518301010132, - "rewards/margins": 1.260847806930542, - "rewards/rejected": -2.6105995178222656, - "sft_loss": 1.4059553146362305, + "grad_norm": 6.401387220245433, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": -0.228153795003891, + "logits/rejected": -0.1123252660036087, + "logps/chosen": -1.287208080291748, + "logps/rejected": -1.9734983444213867, + "loss": 0.9658, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.287208080291748, + "rewards/margins": 0.6862903833389282, + "rewards/rejected": -1.9734983444213867, + "sft_loss": 1.3538731336593628, "step": 4210 }, { "epoch": 2.2558956347215253, - "grad_norm": 15.432277361630593, - "learning_rate": 5.273742783058537e-07, - "logits/chosen": -0.6282011270523071, - "logits/rejected": -0.5331791639328003, - "logps/chosen": -1.3267197608947754, - "logps/rejected": -2.370863676071167, - "loss": 0.902, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3267197608947754, - "rewards/margins": 1.044143795967102, - "rewards/rejected": -2.370863676071167, - "sft_loss": 1.3646594285964966, + "grad_norm": 7.858392367779872, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": -0.2269498109817505, + "logits/rejected": -0.09576071798801422, + "logps/chosen": -1.3312914371490479, + "logps/rejected": -1.813730001449585, + "loss": 1.0197, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3312914371490479, + "rewards/margins": 0.4824386239051819, + "rewards/rejected": -1.813730001449585, + "sft_loss": 1.3493551015853882, "step": 4215 }, { "epoch": 2.258571667502927, - "grad_norm": 6.940650950295138, - "learning_rate": 5.23822119328157e-07, - "logits/chosen": -0.6763989329338074, - "logits/rejected": -0.5052696466445923, - "logps/chosen": -1.2935147285461426, - "logps/rejected": -2.270808458328247, - "loss": 0.8958, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2935147285461426, - "rewards/margins": 0.9772937893867493, - "rewards/rejected": -2.270808458328247, - "sft_loss": 1.3664653301239014, + "grad_norm": 6.255595915276127, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": -0.26727840304374695, + "logits/rejected": -0.058417391031980515, + "logps/chosen": -1.2808057069778442, + "logps/rejected": -1.8030893802642822, + "loss": 0.9971, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2808057069778442, + "rewards/margins": 0.522283673286438, + "rewards/rejected": -1.8030893802642822, + "sft_loss": 1.350306749343872, "step": 4220 }, { "epoch": 2.2612477002843283, - "grad_norm": 6.544634209943382, - "learning_rate": 5.202794312325399e-07, - "logits/chosen": -0.6599363684654236, - "logits/rejected": -0.4820350706577301, - "logps/chosen": -1.3575453758239746, - "logps/rejected": -2.430337429046631, - "loss": 0.8547, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.3575453758239746, - "rewards/margins": 1.0727920532226562, - "rewards/rejected": -2.430337429046631, - "sft_loss": 1.4071263074874878, + "grad_norm": 6.500444843165474, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": -0.25225088000297546, + "logits/rejected": -0.06664416939020157, + "logps/chosen": -1.3443737030029297, + "logps/rejected": -1.9076101779937744, + "loss": 0.9705, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3443737030029297, + "rewards/margins": 0.5632363557815552, + "rewards/rejected": -1.9076101779937744, + "sft_loss": 1.3907678127288818, "step": 4225 }, { "epoch": 2.26392373306573, - "grad_norm": 7.936678377225664, - "learning_rate": 5.167462483901773e-07, - "logits/chosen": -0.6433524489402771, - "logits/rejected": -0.582925021648407, - "logps/chosen": -1.3367130756378174, - "logps/rejected": -2.375605344772339, - "loss": 0.8797, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3367130756378174, - "rewards/margins": 1.038892388343811, - "rewards/rejected": -2.375605344772339, - "sft_loss": 1.3651541471481323, + "grad_norm": 7.920412917789237, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": -0.2684093713760376, + "logits/rejected": -0.1828213632106781, + "logps/chosen": -1.3897509574890137, + "logps/rejected": -1.9323326349258423, + "loss": 1.0114, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3897509574890137, + "rewards/margins": 0.5425814986228943, + "rewards/rejected": -1.9323326349258423, + "sft_loss": 1.3893733024597168, "step": 4230 }, { "epoch": 2.2665997658471317, - "grad_norm": 7.015436901921431, - "learning_rate": 5.132226050800256e-07, - "logits/chosen": -0.5926527380943298, - "logits/rejected": -0.5132274627685547, - "logps/chosen": -1.3561756610870361, - "logps/rejected": -2.0783355236053467, - "loss": 0.9461, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3561756610870361, - "rewards/margins": 0.7221602201461792, - "rewards/rejected": -2.0783355236053467, - "sft_loss": 1.437430739402771, + "grad_norm": 5.807782159852171, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": -0.19051989912986755, + "logits/rejected": -0.06599706411361694, + "logps/chosen": -1.3841588497161865, + "logps/rejected": -1.6875133514404297, + "loss": 1.0851, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3841588497161865, + "rewards/margins": 0.3033544421195984, + "rewards/rejected": -1.6875133514404297, + "sft_loss": 1.4423117637634277, "step": 4235 }, { "epoch": 2.269275798628533, - "grad_norm": 5.5716445207379515, - "learning_rate": 5.097085354884869e-07, - "logits/chosen": -0.5913428068161011, - "logits/rejected": -0.5225772857666016, - "logps/chosen": -1.2736221551895142, - "logps/rejected": -2.1757893562316895, - "loss": 0.8966, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2736221551895142, - "rewards/margins": 0.9021672010421753, - "rewards/rejected": -2.1757893562316895, - "sft_loss": 1.3726946115493774, + "grad_norm": 6.195477499990059, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": -0.18834324181079865, + "logits/rejected": -0.06376080214977264, + "logps/chosen": -1.305316686630249, + "logps/rejected": -1.6655584573745728, + "loss": 1.0326, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.305316686630249, + "rewards/margins": 0.36024174094200134, + "rewards/rejected": -1.6655584573745728, + "sft_loss": 1.385566234588623, "step": 4240 }, { "epoch": 2.2719518314099347, - "grad_norm": 9.07817146254204, - "learning_rate": 5.062040737090806e-07, - "logits/chosen": -0.655211329460144, - "logits/rejected": -0.5655419826507568, - "logps/chosen": -1.3266026973724365, - "logps/rejected": -2.2359845638275146, - "loss": 0.907, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3266026973724365, - "rewards/margins": 0.9093819856643677, - "rewards/rejected": -2.2359845638275146, - "sft_loss": 1.3721539974212646, + "grad_norm": 14.446952301432269, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": -0.2903379201889038, + "logits/rejected": -0.16157583892345428, + "logps/chosen": -1.3384888172149658, + "logps/rejected": -1.7572097778320312, + "loss": 1.0311, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3384888172149658, + "rewards/margins": 0.4187210202217102, + "rewards/rejected": -1.7572097778320312, + "sft_loss": 1.3747570514678955, "step": 4245 }, { "epoch": 2.2746278641913364, - "grad_norm": 10.049455974892844, - "learning_rate": 5.027092537421091e-07, - "logits/chosen": -0.6034508943557739, - "logits/rejected": -0.4758704602718353, - "logps/chosen": -1.3402130603790283, - "logps/rejected": -2.1698737144470215, - "loss": 0.9332, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3402130603790283, - "rewards/margins": 0.8296605944633484, - "rewards/rejected": -2.1698737144470215, - "sft_loss": 1.3776081800460815, + "grad_norm": 5.840044296050715, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": -0.18125630915164948, + "logits/rejected": -0.0014208063948899508, + "logps/chosen": -1.3621944189071655, + "logps/rejected": -1.7927730083465576, + "loss": 1.0284, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3621944189071655, + "rewards/margins": 0.4305785298347473, + "rewards/rejected": -1.7927730083465576, + "sft_loss": 1.3738056421279907, "step": 4250 }, { "epoch": 2.2773038969727377, - "grad_norm": 9.900693966782814, - "learning_rate": 4.992241094943326e-07, - "logits/chosen": -0.6390854716300964, - "logits/rejected": -0.41629552841186523, - "logps/chosen": -1.3275277614593506, - "logps/rejected": -2.4523518085479736, - "loss": 0.8547, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3275277614593506, - "rewards/margins": 1.124824047088623, - "rewards/rejected": -2.4523518085479736, - "sft_loss": 1.3700711727142334, + "grad_norm": 13.818786645322406, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": -0.2514936625957489, + "logits/rejected": -0.013041814789175987, + "logps/chosen": -1.3561211824417114, + "logps/rejected": -1.9976732730865479, + "loss": 0.9825, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3561211824417114, + "rewards/margins": 0.6415520310401917, + "rewards/rejected": -1.9976732730865479, + "sft_loss": 1.3830833435058594, "step": 4255 }, { "epoch": 2.2799799297541394, - "grad_norm": 12.835616709525304, - "learning_rate": 4.957486747786342e-07, - "logits/chosen": -0.6146548986434937, - "logits/rejected": -0.5280757546424866, - "logps/chosen": -1.2478665113449097, - "logps/rejected": -2.1629269123077393, - "loss": 0.8487, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.2478665113449097, - "rewards/margins": 0.9150603413581848, - "rewards/rejected": -2.1629269123077393, - "sft_loss": 1.2602465152740479, + "grad_norm": 14.17690399009336, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": -0.22364842891693115, + "logits/rejected": -0.1085544228553772, + "logps/chosen": -1.2623711824417114, + "logps/rejected": -1.7410624027252197, + "loss": 0.9569, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2623711824417114, + "rewards/margins": 0.47869133949279785, + "rewards/rejected": -1.7410624027252197, + "sft_loss": 1.2642004489898682, "step": 4260 }, { "epoch": 2.282655962535541, - "grad_norm": 7.320178337679792, - "learning_rate": 4.922829833136984e-07, - "logits/chosen": -0.7337331771850586, - "logits/rejected": -0.5915257930755615, - "logps/chosen": -1.2824734449386597, - "logps/rejected": -2.310690402984619, - "loss": 0.8835, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2824734449386597, - "rewards/margins": 1.0282166004180908, - "rewards/rejected": -2.310690402984619, - "sft_loss": 1.3565456867218018, + "grad_norm": 8.380378848216962, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": -0.3221927285194397, + "logits/rejected": -0.14524655044078827, + "logps/chosen": -1.2813420295715332, + "logps/rejected": -1.8537393808364868, + "loss": 0.9835, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2813420295715332, + "rewards/margins": 0.5723973512649536, + "rewards/rejected": -1.8537393808364868, + "sft_loss": 1.3393852710723877, "step": 4265 }, { "epoch": 2.285331995316943, - "grad_norm": 5.297260316677994, - "learning_rate": 4.888270687236773e-07, - "logits/chosen": -0.5888367891311646, - "logits/rejected": -0.3697664737701416, - "logps/chosen": -1.3420976400375366, - "logps/rejected": -2.446397304534912, - "loss": 0.862, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3420976400375366, - "rewards/margins": 1.104299783706665, - "rewards/rejected": -2.446397304534912, - "sft_loss": 1.3266279697418213, + "grad_norm": 7.55389047157114, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": -0.13835462927818298, + "logits/rejected": 0.10697861015796661, + "logps/chosen": -1.3332964181900024, + "logps/rejected": -2.028346300125122, + "loss": 0.9493, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3332964181900024, + "rewards/margins": 0.6950497627258301, + "rewards/rejected": -2.028346300125122, + "sft_loss": 1.313429355621338, "step": 4270 }, { "epoch": 2.288008028098344, - "grad_norm": 11.963344029150749, - "learning_rate": 4.853809645378709e-07, - "logits/chosen": -0.6401462554931641, - "logits/rejected": -0.5787710547447205, - "logps/chosen": -1.3975201845169067, - "logps/rejected": -2.3918261528015137, - "loss": 0.9225, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3975201845169067, - "rewards/margins": 0.9943059682846069, - "rewards/rejected": -2.3918261528015137, - "sft_loss": 1.4320608377456665, + "grad_norm": 11.316416033813448, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": -0.20318862795829773, + "logits/rejected": -0.11733639240264893, + "logps/chosen": -1.4059638977050781, + "logps/rejected": -1.9184436798095703, + "loss": 1.038, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4059638977050781, + "rewards/margins": 0.5124797224998474, + "rewards/rejected": -1.9184436798095703, + "sft_loss": 1.4158484935760498, "step": 4275 }, { "epoch": 2.290684060879746, - "grad_norm": 7.788506744129682, - "learning_rate": 4.81944704190396e-07, - "logits/chosen": -0.652570366859436, - "logits/rejected": -0.6069068312644958, - "logps/chosen": -1.2777897119522095, - "logps/rejected": -2.1783740520477295, - "loss": 0.8837, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2777897119522095, - "rewards/margins": 0.9005842208862305, - "rewards/rejected": -2.1783740520477295, - "sft_loss": 1.3518401384353638, + "grad_norm": 8.200841921982983, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": -0.21803493797779083, + "logits/rejected": -0.12491512298583984, + "logps/chosen": -1.25760018825531, + "logps/rejected": -1.7700488567352295, + "loss": 0.9609, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.25760018825531, + "rewards/margins": 0.5124487280845642, + "rewards/rejected": -1.7700488567352295, + "sft_loss": 1.3133172988891602, "step": 4280 }, { "epoch": 2.293360093661147, - "grad_norm": 11.72830078153376, - "learning_rate": 4.785183210198667e-07, - "logits/chosen": -0.5845471620559692, - "logits/rejected": -0.6229828000068665, - "logps/chosen": -1.2514235973358154, - "logps/rejected": -2.2576746940612793, - "loss": 0.8695, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2514235973358154, - "rewards/margins": 1.0062510967254639, - "rewards/rejected": -2.2576746940612793, - "sft_loss": 1.3402807712554932, + "grad_norm": 12.658528990316567, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": -0.12976686656475067, + "logits/rejected": -0.13357266783714294, + "logps/chosen": -1.2693363428115845, + "logps/rejected": -1.7833693027496338, + "loss": 0.9779, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2693363428115845, + "rewards/margins": 0.5140329599380493, + "rewards/rejected": -1.7833693027496338, + "sft_loss": 1.337398648262024, "step": 4285 }, { "epoch": 2.296036126442549, - "grad_norm": 14.698312110073433, - "learning_rate": 4.7510184826906626e-07, - "logits/chosen": -0.7108185887336731, - "logits/rejected": -0.5703717470169067, - "logps/chosen": -1.3999465703964233, - "logps/rejected": -2.3794710636138916, - "loss": 0.9243, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.3999465703964233, - "rewards/margins": 0.9795247912406921, - "rewards/rejected": -2.3794710636138916, - "sft_loss": 1.440507173538208, + "grad_norm": 10.497517676948384, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": -0.28647470474243164, + "logits/rejected": -0.10886237770318985, + "logps/chosen": -1.3986010551452637, + "logps/rejected": -1.8781064748764038, + "loss": 1.0317, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3986010551452637, + "rewards/margins": 0.4795055389404297, + "rewards/rejected": -1.8781064748764038, + "sft_loss": 1.412027359008789, "step": 4290 }, { "epoch": 2.2987121592239506, - "grad_norm": 8.97879561445652, - "learning_rate": 4.7169531908462953e-07, - "logits/chosen": -0.6745246648788452, - "logits/rejected": -0.6440014243125916, - "logps/chosen": -1.3127162456512451, - "logps/rejected": -2.2175207138061523, - "loss": 0.8733, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3127162456512451, - "rewards/margins": 0.9048043489456177, - "rewards/rejected": -2.2175207138061523, - "sft_loss": 1.3560574054718018, + "grad_norm": 8.056840404356597, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": -0.24646127223968506, + "logits/rejected": -0.19223013520240784, + "logps/chosen": -1.3325896263122559, + "logps/rejected": -1.8143196105957031, + "loss": 0.9822, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3325896263122559, + "rewards/margins": 0.48172998428344727, + "rewards/rejected": -1.8143196105957031, + "sft_loss": 1.343353033065796, "step": 4295 }, { "epoch": 2.3013881920053523, - "grad_norm": 5.615115793326001, - "learning_rate": 4.6829876651671636e-07, - "logits/chosen": -0.6029156446456909, - "logits/rejected": -0.5269027352333069, - "logps/chosen": -1.2948484420776367, - "logps/rejected": -2.2537930011749268, - "loss": 0.8801, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2948484420776367, - "rewards/margins": 0.9589444994926453, - "rewards/rejected": -2.2537930011749268, - "sft_loss": 1.3327890634536743, + "grad_norm": 9.084221826454508, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": -0.21413850784301758, + "logits/rejected": -0.0885709673166275, + "logps/chosen": -1.3127249479293823, + "logps/rejected": -1.7602211236953735, + "loss": 0.996, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3127249479293823, + "rewards/margins": 0.4474961757659912, + "rewards/rejected": -1.7602211236953735, + "sft_loss": 1.3310400247573853, "step": 4300 }, { "epoch": 2.3040642247867535, - "grad_norm": 19.92989352140432, - "learning_rate": 4.64912223518696e-07, - "logits/chosen": -0.7015405297279358, - "logits/rejected": -0.6147525906562805, - "logps/chosen": -1.2988426685333252, - "logps/rejected": -2.4416849613189697, - "loss": 0.842, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.2988426685333252, - "rewards/margins": 1.142842173576355, - "rewards/rejected": -2.4416849613189697, - "sft_loss": 1.3980969190597534, + "grad_norm": 11.449525862847201, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": -0.30147385597229004, + "logits/rejected": -0.18163147568702698, + "logps/chosen": -1.3303790092468262, + "logps/rejected": -1.975589394569397, + "loss": 0.9805, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3303790092468262, + "rewards/margins": 0.6452105641365051, + "rewards/rejected": -1.975589394569397, + "sft_loss": 1.419938325881958, "step": 4305 }, { "epoch": 2.3067402575681553, - "grad_norm": 7.255081147335957, - "learning_rate": 4.615357229468221e-07, - "logits/chosen": -0.6565300822257996, - "logits/rejected": -0.4925423562526703, - "logps/chosen": -1.289421796798706, - "logps/rejected": -2.5012295246124268, - "loss": 0.83, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.289421796798706, - "rewards/margins": 1.2118077278137207, - "rewards/rejected": -2.5012295246124268, - "sft_loss": 1.3298838138580322, + "grad_norm": 8.895095429807123, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": -0.2536405324935913, + "logits/rejected": -0.05459263175725937, + "logps/chosen": -1.3059298992156982, + "logps/rejected": -1.970513105392456, + "loss": 0.959, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3059298992156982, + "rewards/margins": 0.6645833253860474, + "rewards/rejected": -1.970513105392456, + "sft_loss": 1.3471238613128662, "step": 4310 }, { "epoch": 2.3094162903495565, - "grad_norm": 13.943097839852726, - "learning_rate": 4.581692975599192e-07, - "logits/chosen": -0.6478136777877808, - "logits/rejected": -0.5113622546195984, - "logps/chosen": -1.3406429290771484, - "logps/rejected": -2.235804319381714, - "loss": 0.8955, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3406429290771484, - "rewards/margins": 0.8951613306999207, - "rewards/rejected": -2.235804319381714, - "sft_loss": 1.4088243246078491, + "grad_norm": 12.012076106940423, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": -0.23025648295879364, + "logits/rejected": -0.07983642816543579, + "logps/chosen": -1.362376093864441, + "logps/rejected": -1.7934879064559937, + "loss": 1.0226, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.362376093864441, + "rewards/margins": 0.4311119616031647, + "rewards/rejected": -1.7934879064559937, + "sft_loss": 1.4036413431167603, "step": 4315 }, { "epoch": 2.3120923231309582, - "grad_norm": 10.306307118346142, - "learning_rate": 4.548129800190603e-07, - "logits/chosen": -0.6606016159057617, - "logits/rejected": -0.5660589933395386, - "logps/chosen": -1.312565803527832, - "logps/rejected": -2.348137617111206, - "loss": 0.8717, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.312565803527832, - "rewards/margins": 1.035571813583374, - "rewards/rejected": -2.348137617111206, - "sft_loss": 1.379921793937683, + "grad_norm": 8.975404973926123, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": -0.2329825609922409, + "logits/rejected": -0.09870786964893341, + "logps/chosen": -1.3620847463607788, + "logps/rejected": -1.8238025903701782, + "loss": 1.0331, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3620847463607788, + "rewards/margins": 0.46171754598617554, + "rewards/rejected": -1.8238025903701782, + "sft_loss": 1.3915579319000244, "step": 4320 }, { "epoch": 2.31476835591236, - "grad_norm": 11.13862485808852, - "learning_rate": 4.5146680288725367e-07, - "logits/chosen": -0.6532072424888611, - "logits/rejected": -0.5128600597381592, - "logps/chosen": -1.3058058023452759, - "logps/rejected": -2.372032880783081, - "loss": 0.8814, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3058058023452759, - "rewards/margins": 1.0662271976470947, - "rewards/rejected": -2.372032880783081, - "sft_loss": 1.3538084030151367, + "grad_norm": 10.39904637190083, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": -0.23698978126049042, + "logits/rejected": -0.0770409107208252, + "logps/chosen": -1.3104714155197144, + "logps/rejected": -1.920532464981079, + "loss": 0.998, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3104714155197144, + "rewards/margins": 0.61006098985672, + "rewards/rejected": -1.920532464981079, + "sft_loss": 1.3576505184173584, "step": 4325 }, { "epoch": 2.3174443886937617, - "grad_norm": 16.579937013749525, - "learning_rate": 4.481307986291237e-07, - "logits/chosen": -0.6807089447975159, - "logits/rejected": -0.6148806214332581, - "logps/chosen": -1.400302767753601, - "logps/rejected": -2.3534867763519287, - "loss": 0.9536, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.400302767753601, - "rewards/margins": 0.9531840085983276, - "rewards/rejected": -2.3534867763519287, - "sft_loss": 1.41280198097229, + "grad_norm": 10.299178045423748, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": -0.22072386741638184, + "logits/rejected": -0.1066427230834961, + "logps/chosen": -1.4406993389129639, + "logps/rejected": -1.9226884841918945, + "loss": 1.0741, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4406993389129639, + "rewards/margins": 0.4819890558719635, + "rewards/rejected": -1.9226884841918945, + "sft_loss": 1.416477084159851, "step": 4330 }, { "epoch": 2.320120421475163, - "grad_norm": 8.168795805694037, - "learning_rate": 4.4480499961059915e-07, - "logits/chosen": -0.6159850358963013, - "logits/rejected": -0.5662098526954651, - "logps/chosen": -1.3755680322647095, - "logps/rejected": -2.3326282501220703, - "loss": 0.8879, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3755680322647095, - "rewards/margins": 0.9570604562759399, - "rewards/rejected": -2.3326282501220703, - "sft_loss": 1.3728866577148438, + "grad_norm": 5.747903690377897, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": -0.17060045897960663, + "logits/rejected": -0.10004113614559174, + "logps/chosen": -1.4150245189666748, + "logps/rejected": -1.9153951406478882, + "loss": 1.0166, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4150245189666748, + "rewards/margins": 0.5003708600997925, + "rewards/rejected": -1.9153951406478882, + "sft_loss": 1.3764393329620361, "step": 4335 }, { "epoch": 2.3227964542565647, - "grad_norm": 9.224841659650155, - "learning_rate": 4.414894380985959e-07, - "logits/chosen": -0.695419192314148, - "logits/rejected": -0.5470012426376343, - "logps/chosen": -1.2756474018096924, - "logps/rejected": -2.5504016876220703, - "loss": 0.8475, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2756474018096924, - "rewards/margins": 1.2747540473937988, - "rewards/rejected": -2.5504016876220703, - "sft_loss": 1.3470367193222046, + "grad_norm": 9.207914170551703, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": -0.23539428412914276, + "logits/rejected": -0.06189621612429619, + "logps/chosen": -1.2505683898925781, + "logps/rejected": -1.98164963722229, + "loss": 0.9379, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2505683898925781, + "rewards/margins": 0.7310811877250671, + "rewards/rejected": -1.98164963722229, + "sft_loss": 1.3134586811065674, "step": 4340 }, { "epoch": 2.3254724870379664, - "grad_norm": 14.596642629245082, - "learning_rate": 4.3818414626070703e-07, - "logits/chosen": -0.716138482093811, - "logits/rejected": -0.6742798089981079, - "logps/chosen": -1.4133208990097046, - "logps/rejected": -2.321979522705078, - "loss": 0.9418, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4133208990097046, - "rewards/margins": 0.9086586236953735, - "rewards/rejected": -2.321979522705078, - "sft_loss": 1.4485996961593628, + "grad_norm": 13.990820654898867, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": -0.24911001324653625, + "logits/rejected": -0.16555368900299072, + "logps/chosen": -1.4709831476211548, + "logps/rejected": -1.7603830099105835, + "loss": 1.1095, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4709831476211548, + "rewards/margins": 0.2893998622894287, + "rewards/rejected": -1.7603830099105835, + "sft_loss": 1.459791898727417, "step": 4345 }, { "epoch": 2.3281485198193677, - "grad_norm": 10.813466223871607, - "learning_rate": 4.3488915616488757e-07, - "logits/chosen": -0.6477821469306946, - "logits/rejected": -0.6120352149009705, - "logps/chosen": -1.3689117431640625, - "logps/rejected": -2.4252209663391113, - "loss": 0.8642, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3689117431640625, - "rewards/margins": 1.0563093423843384, - "rewards/rejected": -2.4252209663391113, - "sft_loss": 1.3875305652618408, + "grad_norm": 7.4323654415717355, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": -0.15631893277168274, + "logits/rejected": -0.09263060986995697, + "logps/chosen": -1.3558018207550049, + "logps/rejected": -1.9541645050048828, + "loss": 0.9599, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3558018207550049, + "rewards/margins": 0.5983625650405884, + "rewards/rejected": -1.9541645050048828, + "sft_loss": 1.377393364906311, "step": 4350 }, { "epoch": 2.3308245526007694, - "grad_norm": 8.596530588605493, - "learning_rate": 4.316044997791469e-07, - "logits/chosen": -0.7162259817123413, - "logits/rejected": -0.6386939287185669, - "logps/chosen": -1.3806841373443604, - "logps/rejected": -2.3228540420532227, - "loss": 0.8994, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3806841373443604, - "rewards/margins": 0.9421700239181519, - "rewards/rejected": -2.3228540420532227, - "sft_loss": 1.4305603504180908, + "grad_norm": 10.017881800957118, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": -0.26166990399360657, + "logits/rejected": -0.14959931373596191, + "logps/chosen": -1.379631757736206, + "logps/rejected": -1.8234903812408447, + "loss": 1.0108, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.379631757736206, + "rewards/margins": 0.44385847449302673, + "rewards/rejected": -1.8234903812408447, + "sft_loss": 1.4106724262237549, "step": 4355 }, { "epoch": 2.333500585382171, - "grad_norm": 8.601381742032121, - "learning_rate": 4.283302089712348e-07, - "logits/chosen": -0.746851921081543, - "logits/rejected": -0.5745272040367126, - "logps/chosen": -1.4036058187484741, - "logps/rejected": -2.382904529571533, - "loss": 0.8812, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4036058187484741, - "rewards/margins": 0.9792987108230591, - "rewards/rejected": -2.382904529571533, - "sft_loss": 1.4322397708892822, + "grad_norm": 9.705232333372008, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": -0.31485554575920105, + "logits/rejected": -0.10140397399663925, + "logps/chosen": -1.4201301336288452, + "logps/rejected": -1.8692519664764404, + "loss": 1.025, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4201301336288452, + "rewards/margins": 0.44912201166152954, + "rewards/rejected": -1.8692519664764404, + "sft_loss": 1.43977952003479, "step": 4360 }, { "epoch": 2.3361766181635724, - "grad_norm": 9.74306816379491, - "learning_rate": 4.250663155083357e-07, - "logits/chosen": -0.5852586030960083, - "logits/rejected": -0.6144552826881409, - "logps/chosen": -1.3089172840118408, - "logps/rejected": -2.3125052452087402, - "loss": 0.8857, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3089172840118408, - "rewards/margins": 1.0035879611968994, - "rewards/rejected": -2.3125052452087402, - "sft_loss": 1.3435930013656616, + "grad_norm": 9.662438801196549, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": -0.11434853076934814, + "logits/rejected": -0.11242060363292694, + "logps/chosen": -1.3031599521636963, + "logps/rejected": -1.8188354969024658, + "loss": 0.9819, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3031599521636963, + "rewards/margins": 0.5156753659248352, + "rewards/rejected": -1.8188354969024658, + "sft_loss": 1.3415518999099731, "step": 4365 }, { "epoch": 2.338852650944974, - "grad_norm": 11.720966069939928, - "learning_rate": 4.218128510567578e-07, - "logits/chosen": -0.6212056875228882, - "logits/rejected": -0.5088070631027222, - "logps/chosen": -1.2342393398284912, - "logps/rejected": -2.3752951622009277, - "loss": 0.8039, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2342393398284912, - "rewards/margins": 1.1410560607910156, - "rewards/rejected": -2.3752951622009277, - "sft_loss": 1.2810344696044922, + "grad_norm": 10.933654268222613, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": -0.19458740949630737, + "logits/rejected": -0.054971061646938324, + "logps/chosen": -1.232940435409546, + "logps/rejected": -1.8467018604278564, + "loss": 0.929, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.232940435409546, + "rewards/margins": 0.6137614846229553, + "rewards/rejected": -1.8467018604278564, + "sft_loss": 1.2706055641174316, "step": 4370 }, { "epoch": 2.341528683726376, - "grad_norm": 8.919312733582453, - "learning_rate": 4.185698471816279e-07, - "logits/chosen": -0.7572565078735352, - "logits/rejected": -0.5971711874008179, - "logps/chosen": -1.3673795461654663, - "logps/rejected": -2.542800188064575, - "loss": 0.9014, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3673795461654663, - "rewards/margins": 1.1754207611083984, - "rewards/rejected": -2.542800188064575, - "sft_loss": 1.450248122215271, + "grad_norm": 7.592807117898535, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": -0.3091137409210205, + "logits/rejected": -0.11701484024524689, + "logps/chosen": -1.365443468093872, + "logps/rejected": -1.9199018478393555, + "loss": 1.0253, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.365443468093872, + "rewards/margins": 0.5544580817222595, + "rewards/rejected": -1.9199018478393555, + "sft_loss": 1.44214928150177, "step": 4375 }, { "epoch": 2.344204716507777, - "grad_norm": 10.14180915809746, - "learning_rate": 4.1533733534658326e-07, - "logits/chosen": -0.6978228688240051, - "logits/rejected": -0.5519328713417053, - "logps/chosen": -1.3382021188735962, - "logps/rejected": -2.3281280994415283, - "loss": 0.9024, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3382021188735962, - "rewards/margins": 0.9899260401725769, - "rewards/rejected": -2.3281280994415283, - "sft_loss": 1.3950769901275635, + "grad_norm": 7.076712114080835, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": -0.240199014544487, + "logits/rejected": -0.05072479322552681, + "logps/chosen": -1.355287790298462, + "logps/rejected": -1.8138258457183838, + "loss": 1.0286, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.355287790298462, + "rewards/margins": 0.45853790640830994, + "rewards/rejected": -1.8138258457183838, + "sft_loss": 1.3838344812393188, "step": 4380 }, { "epoch": 2.346880749289179, - "grad_norm": 7.8186537137711865, - "learning_rate": 4.121153469134686e-07, - "logits/chosen": -0.6784673929214478, - "logits/rejected": -0.563454270362854, - "logps/chosen": -1.3425476551055908, - "logps/rejected": -2.3465404510498047, - "loss": 0.8798, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3425476551055908, - "rewards/margins": 1.003993034362793, - "rewards/rejected": -2.3465404510498047, - "sft_loss": 1.385127067565918, + "grad_norm": 13.56410524646758, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": -0.2634291648864746, + "logits/rejected": -0.11340577900409698, + "logps/chosen": -1.3418443202972412, + "logps/rejected": -1.8998095989227295, + "loss": 0.9715, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3418443202972412, + "rewards/margins": 0.5579651594161987, + "rewards/rejected": -1.8998095989227295, + "sft_loss": 1.3820970058441162, "step": 4385 }, { "epoch": 2.3495567820705805, - "grad_norm": 7.009913853726649, - "learning_rate": 4.089039131420292e-07, - "logits/chosen": -0.6797121167182922, - "logits/rejected": -0.5956329107284546, - "logps/chosen": -1.2960842847824097, - "logps/rejected": -2.1306862831115723, - "loss": 0.9071, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2960842847824097, - "rewards/margins": 0.8346019983291626, - "rewards/rejected": -2.1306862831115723, - "sft_loss": 1.3619669675827026, + "grad_norm": 5.686835263850377, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": -0.23777596652507782, + "logits/rejected": -0.10516651719808578, + "logps/chosen": -1.288923978805542, + "logps/rejected": -1.724108338356018, + "loss": 1.0092, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.288923978805542, + "rewards/margins": 0.43518438935279846, + "rewards/rejected": -1.724108338356018, + "sft_loss": 1.348914384841919, "step": 4390 }, { "epoch": 2.3522328148519818, - "grad_norm": 9.23115558482239, - "learning_rate": 4.0570306518961027e-07, - "logits/chosen": -0.6509016752243042, - "logits/rejected": -0.5293477773666382, - "logps/chosen": -1.3495789766311646, - "logps/rejected": -2.419602394104004, - "loss": 0.9034, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3495789766311646, - "rewards/margins": 1.0700232982635498, - "rewards/rejected": -2.419602394104004, - "sft_loss": 1.4062663316726685, + "grad_norm": 11.53155326090315, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": -0.18005147576332092, + "logits/rejected": -0.01693601906299591, + "logps/chosen": -1.3340141773223877, + "logps/rejected": -1.9913352727890015, + "loss": 0.9824, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3340141773223877, + "rewards/margins": 0.6573209762573242, + "rewards/rejected": -1.9913352727890015, + "sft_loss": 1.3819031715393066, "step": 4395 }, { "epoch": 2.3549088476333835, - "grad_norm": 7.471901000006091, - "learning_rate": 4.025128341108517e-07, - "logits/chosen": -0.7326493263244629, - "logits/rejected": -0.5960140228271484, - "logps/chosen": -1.328127145767212, - "logps/rejected": -2.343733072280884, - "loss": 0.8788, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.328127145767212, - "rewards/margins": 1.0156059265136719, - "rewards/rejected": -2.343733072280884, - "sft_loss": 1.3994157314300537, + "grad_norm": 7.213618434542793, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": -0.2869768440723419, + "logits/rejected": -0.13710127770900726, + "logps/chosen": -1.3353666067123413, + "logps/rejected": -1.9065090417861938, + "loss": 0.9676, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3353666067123413, + "rewards/margins": 0.5711422562599182, + "rewards/rejected": -1.9065090417861938, + "sft_loss": 1.3584064245224, "step": 4400 }, { "epoch": 2.3549088476333835, - "eval_logits/chosen": -0.3551722764968872, - "eval_logits/rejected": -0.29631373286247253, - "eval_logps/chosen": -1.5272942781448364, - "eval_logps/rejected": -2.1771371364593506, - "eval_loss": 1.0414245128631592, - "eval_rewards/accuracies": 0.6468842625617981, - "eval_rewards/chosen": -1.5272942781448364, - "eval_rewards/margins": 0.6498429179191589, - "eval_rewards/rejected": -2.1771371364593506, - "eval_runtime": 44.1464, - "eval_samples_per_second": 30.467, - "eval_sft_loss": 1.4793506860733032, - "eval_steps_per_second": 7.634, + "eval_logits/chosen": 0.11583670228719711, + "eval_logits/rejected": 0.20641706883907318, + "eval_logps/chosen": -1.4061453342437744, + "eval_logps/rejected": -1.8540223836898804, + "eval_loss": 1.041849136352539, + "eval_rewards/accuracies": 0.6231454014778137, + "eval_rewards/chosen": -1.4061453342437744, + "eval_rewards/margins": 0.44787701964378357, + "eval_rewards/rejected": -1.8540223836898804, + "eval_runtime": 48.6845, + "eval_samples_per_second": 27.627, + "eval_sft_loss": 1.4054352045059204, + "eval_steps_per_second": 6.922, "step": 4400 }, { "epoch": 2.357584880414785, - "grad_norm": 18.24389181198718, - "learning_rate": 3.9933325085739047e-07, - "logits/chosen": -0.749717116355896, - "logits/rejected": -0.7404106259346008, - "logps/chosen": -1.2358829975128174, - "logps/rejected": -2.073561191558838, - "loss": 0.8849, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2358829975128174, - "rewards/margins": 0.8376781344413757, - "rewards/rejected": -2.073561191558838, - "sft_loss": 1.2863762378692627, + "grad_norm": 16.531761456585027, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": -0.2997193932533264, + "logits/rejected": -0.25852295756340027, + "logps/chosen": -1.2969939708709717, + "logps/rejected": -1.6899290084838867, + "loss": 1.0055, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2969939708709717, + "rewards/margins": 0.39293503761291504, + "rewards/rejected": -1.6899290084838867, + "sft_loss": 1.3143447637557983, "step": 4405 }, { "epoch": 2.3602609131961865, - "grad_norm": 8.743575469487237, - "learning_rate": 3.9616434627755624e-07, - "logits/chosen": -0.6609512567520142, - "logits/rejected": -0.6308881044387817, - "logps/chosen": -1.3922468423843384, - "logps/rejected": -2.649442195892334, - "loss": 0.8581, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3922468423843384, - "rewards/margins": 1.257195234298706, - "rewards/rejected": -2.649442195892334, - "sft_loss": 1.431348443031311, + "grad_norm": 7.539170451075638, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": -0.21077589690685272, + "logits/rejected": -0.1720072478055954, + "logps/chosen": -1.408328890800476, + "logps/rejected": -2.056427478790283, + "loss": 0.9926, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.408328890800476, + "rewards/margins": 0.6480986475944519, + "rewards/rejected": -2.056427478790283, + "sft_loss": 1.425734519958496, "step": 4410 }, { "epoch": 2.362936945977588, - "grad_norm": 8.223634442134767, - "learning_rate": 3.930061511160762e-07, - "logits/chosen": -0.6532543301582336, - "logits/rejected": -0.5277368426322937, - "logps/chosen": -1.3420674800872803, - "logps/rejected": -2.281698703765869, - "loss": 0.9015, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3420674800872803, - "rewards/margins": 0.939631462097168, - "rewards/rejected": -2.281698703765869, - "sft_loss": 1.3932065963745117, + "grad_norm": 8.089600230518249, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": -0.21279382705688477, + "logits/rejected": -0.05223933607339859, + "logps/chosen": -1.3679611682891846, + "logps/rejected": -1.826300859451294, + "loss": 1.0134, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3679611682891846, + "rewards/margins": 0.45833978056907654, + "rewards/rejected": -1.826300859451294, + "sft_loss": 1.3863824605941772, "step": 4415 }, { "epoch": 2.36561297875899, - "grad_norm": 13.739216943538223, - "learning_rate": 3.898586960137726e-07, - "logits/chosen": -0.6745472550392151, - "logits/rejected": -0.6361994743347168, - "logps/chosen": -1.3241745233535767, - "logps/rejected": -2.131701946258545, - "loss": 0.8933, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3241745233535767, - "rewards/margins": 0.8075275421142578, - "rewards/rejected": -2.131701946258545, - "sft_loss": 1.358970046043396, + "grad_norm": 10.501997573777427, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": -0.24000367522239685, + "logits/rejected": -0.1543852984905243, + "logps/chosen": -1.3647658824920654, + "logps/rejected": -1.688582420349121, + "loss": 1.0335, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3647658824920654, + "rewards/margins": 0.3238166272640228, + "rewards/rejected": -1.688582420349121, + "sft_loss": 1.36466383934021, "step": 4420 }, { "epoch": 2.368289011540391, - "grad_norm": 7.72059808047864, - "learning_rate": 3.867220115072696e-07, - "logits/chosen": -0.6497647166252136, - "logits/rejected": -0.5739953517913818, - "logps/chosen": -1.2007476091384888, - "logps/rejected": -2.001007318496704, - "loss": 0.87, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2007476091384888, - "rewards/margins": 0.8002594709396362, - "rewards/rejected": -2.001007318496704, - "sft_loss": 1.3230812549591064, + "grad_norm": 13.152802899680687, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": -0.1994973123073578, + "logits/rejected": -0.08989440649747849, + "logps/chosen": -1.2513803243637085, + "logps/rejected": -1.6642259359359741, + "loss": 1.0053, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2513803243637085, + "rewards/margins": 0.4128456115722656, + "rewards/rejected": -1.6642259359359741, + "sft_loss": 1.3685729503631592, "step": 4425 }, { "epoch": 2.370965044321793, - "grad_norm": 7.127098452943066, - "learning_rate": 3.8359612802869367e-07, - "logits/chosen": -0.6745830774307251, - "logits/rejected": -0.5413500070571899, - "logps/chosen": -1.3599244356155396, - "logps/rejected": -2.357084035873413, - "loss": 0.9164, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3599244356155396, - "rewards/margins": 0.9971596002578735, - "rewards/rejected": -2.357084035873413, - "sft_loss": 1.4182361364364624, + "grad_norm": 6.3273467397298875, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": -0.2455778419971466, + "logits/rejected": -0.07191842049360275, + "logps/chosen": -1.3764415979385376, + "logps/rejected": -1.8642089366912842, + "loss": 1.033, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3764415979385376, + "rewards/margins": 0.48776760697364807, + "rewards/rejected": -1.8642089366912842, + "sft_loss": 1.4149798154830933, "step": 4430 }, { "epoch": 2.3736410771031946, - "grad_norm": 8.195598404674085, - "learning_rate": 3.8048107590537987e-07, - "logits/chosen": -0.7300285696983337, - "logits/rejected": -0.5494762659072876, - "logps/chosen": -1.2751190662384033, - "logps/rejected": -2.339775323867798, - "loss": 0.8621, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2751190662384033, - "rewards/margins": 1.0646560192108154, - "rewards/rejected": -2.339775323867798, - "sft_loss": 1.3891193866729736, + "grad_norm": 6.326489480123552, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": -0.3077549934387207, + "logits/rejected": -0.08836190402507782, + "logps/chosen": -1.262527585029602, + "logps/rejected": -1.7917951345443726, + "loss": 0.9662, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.262527585029602, + "rewards/margins": 0.5292677283287048, + "rewards/rejected": -1.7917951345443726, + "sft_loss": 1.3449105024337769, "step": 4435 }, { "epoch": 2.376317109884596, - "grad_norm": 7.272577584258443, - "learning_rate": 3.773768853595774e-07, - "logits/chosen": -0.735929548740387, - "logits/rejected": -0.5498467683792114, - "logps/chosen": -1.3501901626586914, - "logps/rejected": -2.220968723297119, - "loss": 0.9322, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.3501901626586914, - "rewards/margins": 0.8707782626152039, - "rewards/rejected": -2.220968723297119, - "sft_loss": 1.4078781604766846, + "grad_norm": 9.481565949086859, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": -0.31242865324020386, + "logits/rejected": -0.09762193262577057, + "logps/chosen": -1.3546260595321655, + "logps/rejected": -1.7663938999176025, + "loss": 1.044, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3546260595321655, + "rewards/margins": 0.4117676615715027, + "rewards/rejected": -1.7663938999176025, + "sft_loss": 1.40595281124115, "step": 4440 }, { "epoch": 2.3789931426659976, - "grad_norm": 10.306160904948694, - "learning_rate": 3.7428358650815706e-07, - "logits/chosen": -0.7110830545425415, - "logits/rejected": -0.5415419340133667, - "logps/chosen": -1.3333743810653687, - "logps/rejected": -2.156614303588867, - "loss": 0.9339, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3333743810653687, - "rewards/margins": 0.8232399225234985, - "rewards/rejected": -2.156614303588867, - "sft_loss": 1.3927953243255615, + "grad_norm": 8.325158454192946, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": -0.27985435724258423, + "logits/rejected": -0.0948081687092781, + "logps/chosen": -1.3356212377548218, + "logps/rejected": -1.74164617061615, + "loss": 1.0353, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3356212377548218, + "rewards/margins": 0.4060249924659729, + "rewards/rejected": -1.74164617061615, + "sft_loss": 1.3741556406021118, "step": 4445 }, { "epoch": 2.3816691754473993, - "grad_norm": 7.818563194854539, - "learning_rate": 3.712012093623172e-07, - "logits/chosen": -0.6752433776855469, - "logits/rejected": -0.5796962380409241, - "logps/chosen": -1.3343051671981812, - "logps/rejected": -2.4307165145874023, - "loss": 0.8901, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3343051671981812, - "rewards/margins": 1.0964112281799316, - "rewards/rejected": -2.4307165145874023, - "sft_loss": 1.394907832145691, + "grad_norm": 13.884407642700152, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": -0.19278530776500702, + "logits/rejected": -0.06277438253164291, + "logps/chosen": -1.4006202220916748, + "logps/rejected": -1.9403584003448486, + "loss": 1.0405, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4006202220916748, + "rewards/margins": 0.5397380590438843, + "rewards/rejected": -1.9403584003448486, + "sft_loss": 1.402583360671997, "step": 4450 }, { "epoch": 2.384345208228801, - "grad_norm": 15.317372572588315, - "learning_rate": 3.6812978382729524e-07, - "logits/chosen": -0.7404896020889282, - "logits/rejected": -0.6768995523452759, - "logps/chosen": -1.3519471883773804, - "logps/rejected": -2.286410093307495, - "loss": 0.921, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.3519471883773804, - "rewards/margins": 0.9344632029533386, - "rewards/rejected": -2.286410093307495, - "sft_loss": 1.4139940738677979, + "grad_norm": 11.829877249857724, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": -0.2743862271308899, + "logits/rejected": -0.17100296914577484, + "logps/chosen": -1.3506872653961182, + "logps/rejected": -1.8455251455307007, + "loss": 1.0189, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3506872653961182, + "rewards/margins": 0.4948379099369049, + "rewards/rejected": -1.8455251455307007, + "sft_loss": 1.395976185798645, "step": 4455 }, { "epoch": 2.3870212410102023, - "grad_norm": 8.674143010837458, - "learning_rate": 3.650693397020744e-07, - "logits/chosen": -0.7839964628219604, - "logits/rejected": -0.6039905548095703, - "logps/chosen": -1.2913477420806885, - "logps/rejected": -2.494628429412842, - "loss": 0.8862, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2913477420806885, - "rewards/margins": 1.2032805681228638, - "rewards/rejected": -2.494628429412842, - "sft_loss": 1.3796261548995972, + "grad_norm": 8.967902356172923, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": -0.30608153343200684, + "logits/rejected": -0.0737648606300354, + "logps/chosen": -1.2880511283874512, + "logps/rejected": -1.9710843563079834, + "loss": 0.9693, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2880511283874512, + "rewards/margins": 0.6830333471298218, + "rewards/rejected": -1.9710843563079834, + "sft_loss": 1.3572402000427246, "step": 4460 }, { "epoch": 2.389697273791604, - "grad_norm": 14.304659649681543, - "learning_rate": 3.6201990667909774e-07, - "logits/chosen": -0.7501360177993774, - "logits/rejected": -0.6040843725204468, - "logps/chosen": -1.3918160200119019, - "logps/rejected": -2.316871166229248, - "loss": 0.9388, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3918160200119019, - "rewards/margins": 0.9250553250312805, - "rewards/rejected": -2.316871166229248, - "sft_loss": 1.4156030416488647, + "grad_norm": 13.691400586547886, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": -0.29284581542015076, + "logits/rejected": -0.10276402533054352, + "logps/chosen": -1.3815476894378662, + "logps/rejected": -1.816573143005371, + "loss": 1.0459, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3815476894378662, + "rewards/margins": 0.43502530455589294, + "rewards/rejected": -1.816573143005371, + "sft_loss": 1.3989099264144897, "step": 4465 }, { "epoch": 2.3923733065730053, - "grad_norm": 10.260220317471864, - "learning_rate": 3.589815143439772e-07, - "logits/chosen": -0.6317887902259827, - "logits/rejected": -0.5740104913711548, - "logps/chosen": -1.2195584774017334, - "logps/rejected": -2.1278574466705322, - "loss": 0.8879, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2195584774017334, - "rewards/margins": 0.9082988500595093, - "rewards/rejected": -2.1278574466705322, - "sft_loss": 1.2862308025360107, + "grad_norm": 7.438040374763478, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": -0.1603635549545288, + "logits/rejected": -0.08685074001550674, + "logps/chosen": -1.2549306154251099, + "logps/rejected": -1.7096410989761353, + "loss": 0.9981, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2549306154251099, + "rewards/margins": 0.4547103941440582, + "rewards/rejected": -1.7096410989761353, + "sft_loss": 1.2909096479415894, "step": 4470 }, { "epoch": 2.395049339354407, - "grad_norm": 7.903865291358914, - "learning_rate": 3.559541921752091e-07, - "logits/chosen": -0.7242652177810669, - "logits/rejected": -0.5375711917877197, - "logps/chosen": -1.3358646631240845, - "logps/rejected": -2.285754680633545, - "loss": 0.8906, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3358646631240845, - "rewards/margins": 0.9498898386955261, - "rewards/rejected": -2.285754680633545, - "sft_loss": 1.3705825805664062, + "grad_norm": 12.45344214229761, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": -0.2482972890138626, + "logits/rejected": -0.023926924914121628, + "logps/chosen": -1.334442377090454, + "logps/rejected": -1.807771921157837, + "loss": 0.9952, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.334442377090454, + "rewards/margins": 0.47332969307899475, + "rewards/rejected": -1.807771921157837, + "sft_loss": 1.3329417705535889, "step": 4475 }, { "epoch": 2.3977253721358087, - "grad_norm": 7.419993242744366, - "learning_rate": 3.5293796954388565e-07, - "logits/chosen": -0.6737462282180786, - "logits/rejected": -0.6052318811416626, - "logps/chosen": -1.2106980085372925, - "logps/rejected": -2.0494823455810547, - "loss": 0.8845, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2106980085372925, - "rewards/margins": 0.8387842178344727, - "rewards/rejected": -2.0494823455810547, - "sft_loss": 1.2878100872039795, + "grad_norm": 9.739524869817904, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": -0.23262295126914978, + "logits/rejected": -0.14341866970062256, + "logps/chosen": -1.207165241241455, + "logps/rejected": -1.6426270008087158, + "loss": 0.9879, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.207165241241455, + "rewards/margins": 0.4354618489742279, + "rewards/rejected": -1.6426270008087158, + "sft_loss": 1.2839418649673462, "step": 4480 }, { "epoch": 2.4004014049172104, - "grad_norm": 11.853678645168188, - "learning_rate": 3.499328757134129e-07, - "logits/chosen": -0.6386197209358215, - "logits/rejected": -0.5696219801902771, - "logps/chosen": -1.338358998298645, - "logps/rejected": -2.3903615474700928, - "loss": 0.8575, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.338358998298645, - "rewards/margins": 1.0520024299621582, - "rewards/rejected": -2.3903615474700928, - "sft_loss": 1.3504332304000854, + "grad_norm": 8.846852816642949, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": -0.20950651168823242, + "logits/rejected": -0.11397744715213776, + "logps/chosen": -1.3568718433380127, + "logps/rejected": -1.9146311283111572, + "loss": 0.9793, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3568718433380127, + "rewards/margins": 0.5577594041824341, + "rewards/rejected": -1.9146311283111572, + "sft_loss": 1.3494882583618164, "step": 4485 }, { "epoch": 2.4030774376986117, - "grad_norm": 8.859236562170793, - "learning_rate": 3.469389398392237e-07, - "logits/chosen": -0.6983398199081421, - "logits/rejected": -0.5085052251815796, - "logps/chosen": -1.2761619091033936, - "logps/rejected": -2.413107395172119, - "loss": 0.8225, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.2761619091033936, - "rewards/margins": 1.1369454860687256, - "rewards/rejected": -2.413107395172119, - "sft_loss": 1.339481234550476, + "grad_norm": 10.164110735715587, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": -0.2683698236942291, + "logits/rejected": -0.04780945926904678, + "logps/chosen": -1.2910733222961426, + "logps/rejected": -1.9252097606658936, + "loss": 0.9495, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2910733222961426, + "rewards/margins": 0.6341365575790405, + "rewards/rejected": -1.9252097606658936, + "sft_loss": 1.3434323072433472, "step": 4490 }, { "epoch": 2.4057534704800134, - "grad_norm": 10.898058882878603, - "learning_rate": 3.4395619096849764e-07, - "logits/chosen": -0.7793927788734436, - "logits/rejected": -0.5934855341911316, - "logps/chosen": -1.3146260976791382, - "logps/rejected": -2.3048288822174072, - "loss": 0.8978, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.3146260976791382, - "rewards/margins": 0.9902030825614929, - "rewards/rejected": -2.3048288822174072, - "sft_loss": 1.4020847082138062, + "grad_norm": 6.3916695628160545, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": -0.31511589884757996, + "logits/rejected": -0.09818333387374878, + "logps/chosen": -1.309179425239563, + "logps/rejected": -1.787996530532837, + "loss": 0.9986, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.309179425239563, + "rewards/margins": 0.47881707549095154, + "rewards/rejected": -1.787996530532837, + "sft_loss": 1.3815486431121826, "step": 4495 }, { "epoch": 2.408429503261415, - "grad_norm": 8.3769372005201, - "learning_rate": 3.409846580398766e-07, - "logits/chosen": -0.5743625164031982, - "logits/rejected": -0.6095589399337769, - "logps/chosen": -1.2932769060134888, - "logps/rejected": -2.2235419750213623, - "loss": 0.8752, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2932769060134888, - "rewards/margins": 0.9302651286125183, - "rewards/rejected": -2.2235419750213623, - "sft_loss": 1.3634895086288452, + "grad_norm": 5.9588450975580685, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": -0.12920327484607697, + "logits/rejected": -0.1269599050283432, + "logps/chosen": -1.3116642236709595, + "logps/rejected": -1.7722593545913696, + "loss": 1.0026, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3116642236709595, + "rewards/margins": 0.4605952203273773, + "rewards/rejected": -1.7722593545913696, + "sft_loss": 1.3667010068893433, "step": 4500 }, { "epoch": 2.4111055360428164, - "grad_norm": 8.348768620392361, - "learning_rate": 3.380243698831869e-07, - "logits/chosen": -0.7401924133300781, - "logits/rejected": -0.600095272064209, - "logps/chosen": -1.3059180974960327, - "logps/rejected": -2.298156261444092, - "loss": 0.8658, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3059180974960327, - "rewards/margins": 0.9922383427619934, - "rewards/rejected": -2.298156261444092, - "sft_loss": 1.372867226600647, + "grad_norm": 10.936536750088786, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": -0.2735688090324402, + "logits/rejected": -0.1245633214712143, + "logps/chosen": -1.3134715557098389, + "logps/rejected": -1.8533337116241455, + "loss": 0.965, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3134715557098389, + "rewards/margins": 0.5398621559143066, + "rewards/rejected": -1.8533337116241455, + "sft_loss": 1.350994348526001, "step": 4505 }, { "epoch": 2.413781568824218, - "grad_norm": 5.762119412659862, - "learning_rate": 3.350753552191563e-07, - "logits/chosen": -0.7536791563034058, - "logits/rejected": -0.6396313905715942, - "logps/chosen": -1.3253954648971558, - "logps/rejected": -2.276576519012451, - "loss": 0.8466, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.3253954648971558, - "rewards/margins": 0.9511808156967163, - "rewards/rejected": -2.276576519012451, - "sft_loss": 1.3070496320724487, + "grad_norm": 17.92011875022809, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": -0.2906729280948639, + "logits/rejected": -0.1478831022977829, + "logps/chosen": -1.3039506673812866, + "logps/rejected": -1.7351878881454468, + "loss": 0.9837, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3039506673812866, + "rewards/margins": 0.43123722076416016, + "rewards/rejected": -1.7351878881454468, + "sft_loss": 1.2791199684143066, "step": 4510 }, { "epoch": 2.41645760160562, - "grad_norm": 9.495811954480246, - "learning_rate": 3.3213764265913915e-07, - "logits/chosen": -0.7169691324234009, - "logits/rejected": -0.6960464715957642, - "logps/chosen": -1.315781593322754, - "logps/rejected": -2.1747562885284424, - "loss": 0.8986, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.315781593322754, - "rewards/margins": 0.8589746356010437, - "rewards/rejected": -2.1747562885284424, - "sft_loss": 1.3821439743041992, + "grad_norm": 6.916233284559286, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": -0.2609194815158844, + "logits/rejected": -0.1757608950138092, + "logps/chosen": -1.3002293109893799, + "logps/rejected": -1.7666009664535522, + "loss": 0.9892, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3002293109893799, + "rewards/margins": 0.4663717746734619, + "rewards/rejected": -1.7666009664535522, + "sft_loss": 1.34689462184906, "step": 4515 }, { "epoch": 2.419133634387021, - "grad_norm": 9.196657234975614, - "learning_rate": 3.292112607048343e-07, - "logits/chosen": -0.6809697151184082, - "logits/rejected": -0.6138942837715149, - "logps/chosen": -1.3078068494796753, - "logps/rejected": -2.3196234703063965, - "loss": 0.8486, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.3078068494796753, - "rewards/margins": 1.0118167400360107, - "rewards/rejected": -2.3196234703063965, - "sft_loss": 1.3212130069732666, + "grad_norm": 7.971397826258169, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": -0.24207957088947296, + "logits/rejected": -0.1476575881242752, + "logps/chosen": -1.281394362449646, + "logps/rejected": -1.8730818033218384, + "loss": 0.9371, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.281394362449646, + "rewards/margins": 0.5916873216629028, + "rewards/rejected": -1.8730818033218384, + "sft_loss": 1.2919895648956299, "step": 4520 }, { "epoch": 2.421809667168423, - "grad_norm": 10.931137947223332, - "learning_rate": 3.262962377480136e-07, - "logits/chosen": -0.7364736199378967, - "logits/rejected": -0.597502589225769, - "logps/chosen": -1.3951513767242432, - "logps/rejected": -2.4568734169006348, - "loss": 0.9168, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3951513767242432, - "rewards/margins": 1.0617221593856812, - "rewards/rejected": -2.4568734169006348, - "sft_loss": 1.4719269275665283, + "grad_norm": 11.465781260730042, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": -0.3093206286430359, + "logits/rejected": -0.12962278723716736, + "logps/chosen": -1.4094141721725464, + "logps/rejected": -1.9580589532852173, + "loss": 1.0531, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4094141721725464, + "rewards/margins": 0.5486448407173157, + "rewards/rejected": -1.9580589532852173, + "sft_loss": 1.4824426174163818, "step": 4525 }, { "epoch": 2.4244856999498245, - "grad_norm": 9.68105778607799, - "learning_rate": 3.233926020702414e-07, - "logits/chosen": -0.7190853953361511, - "logits/rejected": -0.6500726938247681, - "logps/chosen": -1.3541837930679321, - "logps/rejected": -2.039454221725464, - "loss": 0.9484, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3541837930679321, - "rewards/margins": 0.6852703094482422, - "rewards/rejected": -2.039454221725464, - "sft_loss": 1.359106183052063, + "grad_norm": 12.302336490667379, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": -0.27253293991088867, + "logits/rejected": -0.17779898643493652, + "logps/chosen": -1.3364722728729248, + "logps/rejected": -1.7032201290130615, + "loss": 1.0275, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3364722728729248, + "rewards/margins": 0.36674779653549194, + "rewards/rejected": -1.7032201290130615, + "sft_loss": 1.3226144313812256, "step": 4530 }, { "epoch": 2.427161732731226, - "grad_norm": 10.439717436730895, - "learning_rate": 3.205003818426047e-07, - "logits/chosen": -0.5770900845527649, - "logits/rejected": -0.5122250318527222, - "logps/chosen": -1.265426754951477, - "logps/rejected": -2.2611968517303467, - "loss": 0.8864, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.265426754951477, - "rewards/margins": 0.9957700967788696, - "rewards/rejected": -2.2611968517303467, - "sft_loss": 1.3501912355422974, + "grad_norm": 9.169403824852585, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": -0.11461882293224335, + "logits/rejected": 0.003153522266075015, + "logps/chosen": -1.2885421514511108, + "logps/rejected": -1.7717559337615967, + "loss": 1.0068, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2885421514511108, + "rewards/margins": 0.4832138121128082, + "rewards/rejected": -1.7717559337615967, + "sft_loss": 1.3321475982666016, "step": 4535 }, { "epoch": 2.4298377655126275, - "grad_norm": 9.77852965592999, - "learning_rate": 3.1761960512543627e-07, - "logits/chosen": -0.6387471556663513, - "logits/rejected": -0.5983433127403259, - "logps/chosen": -1.2862733602523804, - "logps/rejected": -2.2489261627197266, - "loss": 0.8922, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2862733602523804, - "rewards/margins": 0.9626529812812805, - "rewards/rejected": -2.2489261627197266, - "sft_loss": 1.3163095712661743, + "grad_norm": 6.341765185378913, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": -0.1359674036502838, + "logits/rejected": -0.057124294340610504, + "logps/chosen": -1.2703921794891357, + "logps/rejected": -1.7722088098526, + "loss": 0.989, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2703921794891357, + "rewards/margins": 0.5018167495727539, + "rewards/rejected": -1.7722088098526, + "sft_loss": 1.3047778606414795, "step": 4540 }, { "epoch": 2.4325137982940293, - "grad_norm": 24.67312605387757, - "learning_rate": 3.147502998680447e-07, - "logits/chosen": -0.6618883609771729, - "logits/rejected": -0.5567878484725952, - "logps/chosen": -1.2788108587265015, - "logps/rejected": -2.2833950519561768, - "loss": 0.8881, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2788108587265015, - "rewards/margins": 1.0045843124389648, - "rewards/rejected": -2.2833950519561768, - "sft_loss": 1.3379775285720825, + "grad_norm": 14.248618673793011, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": -0.18335461616516113, + "logits/rejected": -0.04374461621046066, + "logps/chosen": -1.2934150695800781, + "logps/rejected": -1.7944812774658203, + "loss": 1.007, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2934150695800781, + "rewards/margins": 0.5010663270950317, + "rewards/rejected": -1.7944812774658203, + "sft_loss": 1.3242768049240112, "step": 4545 }, { "epoch": 2.4351898310754305, - "grad_norm": 7.560659530220782, - "learning_rate": 3.11892493908442e-07, - "logits/chosen": -0.72434002161026, - "logits/rejected": -0.6553603410720825, - "logps/chosen": -1.2460191249847412, - "logps/rejected": -2.1280198097229004, - "loss": 0.8913, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2460191249847412, - "rewards/margins": 0.8820006251335144, - "rewards/rejected": -2.1280198097229004, - "sft_loss": 1.3037611246109009, + "grad_norm": 9.205092096895898, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": -0.25516027212142944, + "logits/rejected": -0.1437629908323288, + "logps/chosen": -1.2467381954193115, + "logps/rejected": -1.7445876598358154, + "loss": 0.9732, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2467381954193115, + "rewards/margins": 0.49784937500953674, + "rewards/rejected": -1.7445876598358154, + "sft_loss": 1.276613712310791, "step": 4550 }, { "epoch": 2.4378658638568322, - "grad_norm": 11.183665158896252, - "learning_rate": 3.0904621497307437e-07, - "logits/chosen": -0.6681724786758423, - "logits/rejected": -0.61586594581604, - "logps/chosen": -1.3558168411254883, - "logps/rejected": -2.169890880584717, - "loss": 0.9579, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3558168411254883, - "rewards/margins": 0.8140741586685181, - "rewards/rejected": -2.169890880584717, - "sft_loss": 1.463220477104187, + "grad_norm": 7.101111088315062, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": -0.19623471796512604, + "logits/rejected": -0.12268761545419693, + "logps/chosen": -1.3916569948196411, + "logps/rejected": -1.7594877481460571, + "loss": 1.0586, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3916569948196411, + "rewards/margins": 0.367830753326416, + "rewards/rejected": -1.7594877481460571, + "sft_loss": 1.4670602083206177, "step": 4555 }, { "epoch": 2.440541896638234, - "grad_norm": 10.577372485814985, - "learning_rate": 3.062114906765522e-07, - "logits/chosen": -0.703151524066925, - "logits/rejected": -0.5398627519607544, - "logps/chosen": -1.2571704387664795, - "logps/rejected": -2.2171876430511475, - "loss": 0.8799, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2571704387664795, - "rewards/margins": 0.960017204284668, - "rewards/rejected": -2.2171876430511475, - "sft_loss": 1.2801587581634521, + "grad_norm": 7.1133746677185785, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": -0.23669187724590302, + "logits/rejected": -0.045713476836681366, + "logps/chosen": -1.287642478942871, + "logps/rejected": -1.7388050556182861, + "loss": 1.0283, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.287642478942871, + "rewards/margins": 0.45116251707077026, + "rewards/rejected": -1.7388050556182861, + "sft_loss": 1.2754676342010498, "step": 4560 }, { "epoch": 2.4432179294196352, - "grad_norm": 13.685702261492946, - "learning_rate": 3.0338834852138346e-07, - "logits/chosen": -0.6743217706680298, - "logits/rejected": -0.6253520846366882, - "logps/chosen": -1.3779981136322021, - "logps/rejected": -2.291980743408203, - "loss": 0.9037, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3779981136322021, - "rewards/margins": 0.9139825105667114, - "rewards/rejected": -2.291980743408203, - "sft_loss": 1.38899564743042, + "grad_norm": 9.04971905642823, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": -0.182923823595047, + "logits/rejected": -0.09119474142789841, + "logps/chosen": -1.38874089717865, + "logps/rejected": -1.8274568319320679, + "loss": 1.0187, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.38874089717865, + "rewards/margins": 0.4387160837650299, + "rewards/rejected": -1.8274568319320679, + "sft_loss": 1.4005342721939087, "step": 4565 }, { "epoch": 2.445893962201037, - "grad_norm": 17.384020814403296, - "learning_rate": 3.0057681589770526e-07, - "logits/chosen": -0.6919450163841248, - "logits/rejected": -0.5843518972396851, - "logps/chosen": -1.3070205450057983, - "logps/rejected": -2.368257999420166, - "loss": 0.8608, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3070205450057983, - "rewards/margins": 1.0612374544143677, - "rewards/rejected": -2.368257999420166, - "sft_loss": 1.3402724266052246, + "grad_norm": 14.466176430071794, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": -0.14486274123191833, + "logits/rejected": 0.015377027913928032, + "logps/chosen": -1.2977030277252197, + "logps/rejected": -1.8409439325332642, + "loss": 0.9696, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2977030277252197, + "rewards/margins": 0.5432409644126892, + "rewards/rejected": -1.8409439325332642, + "sft_loss": 1.3190866708755493, "step": 4570 }, { "epoch": 2.4485699949824387, - "grad_norm": 13.322706624373525, - "learning_rate": 2.9777692008301993e-07, - "logits/chosen": -0.6535794138908386, - "logits/rejected": -0.6342029571533203, - "logps/chosen": -1.3068435192108154, - "logps/rejected": -2.229980230331421, - "loss": 0.887, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3068435192108154, - "rewards/margins": 0.9231365323066711, - "rewards/rejected": -2.229980230331421, - "sft_loss": 1.371297836303711, + "grad_norm": 16.65086539067498, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": -0.1298166811466217, + "logits/rejected": -0.07689005136489868, + "logps/chosen": -1.3006457090377808, + "logps/rejected": -1.753061056137085, + "loss": 1.009, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3006457090377808, + "rewards/margins": 0.4524151682853699, + "rewards/rejected": -1.753061056137085, + "sft_loss": 1.353380799293518, "step": 4575 }, { "epoch": 2.45124602776384, - "grad_norm": 7.737728756955118, - "learning_rate": 2.949886882419284e-07, - "logits/chosen": -0.7128503322601318, - "logits/rejected": -0.6816755533218384, - "logps/chosen": -1.2589917182922363, - "logps/rejected": -2.2378623485565186, - "loss": 0.8626, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2589917182922363, - "rewards/margins": 0.9788707494735718, - "rewards/rejected": -2.2378623485565186, - "sft_loss": 1.3179762363433838, + "grad_norm": 10.575069440877996, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": -0.21864600479602814, + "logits/rejected": -0.16788628697395325, + "logps/chosen": -1.2870771884918213, + "logps/rejected": -1.7425334453582764, + "loss": 0.9956, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2870771884918213, + "rewards/margins": 0.4554562568664551, + "rewards/rejected": -1.7425334453582764, + "sft_loss": 1.3278234004974365, "step": 4580 }, { "epoch": 2.4539220605452416, - "grad_norm": 8.316412011766696, - "learning_rate": 2.92212147425869e-07, - "logits/chosen": -0.6452383995056152, - "logits/rejected": -0.5411955714225769, - "logps/chosen": -1.3594417572021484, - "logps/rejected": -2.3032777309417725, - "loss": 0.9555, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3594417572021484, - "rewards/margins": 0.9438360929489136, - "rewards/rejected": -2.3032777309417725, - "sft_loss": 1.4443765878677368, + "grad_norm": 7.601451022217489, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": -0.19092920422554016, + "logits/rejected": -0.0491974875330925, + "logps/chosen": -1.3717070817947388, + "logps/rejected": -1.7525761127471924, + "loss": 1.083, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3717070817947388, + "rewards/margins": 0.3808690905570984, + "rewards/rejected": -1.7525761127471924, + "sft_loss": 1.4324686527252197, "step": 4585 }, { "epoch": 2.4565980933266434, - "grad_norm": 10.39657321444957, - "learning_rate": 2.894473245728518e-07, - "logits/chosen": -0.7684861421585083, - "logits/rejected": -0.6261590719223022, - "logps/chosen": -1.2524304389953613, - "logps/rejected": -2.1767070293426514, - "loss": 0.891, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2524304389953613, - "rewards/margins": 0.9242765307426453, - "rewards/rejected": -2.1767070293426514, - "sft_loss": 1.3225806951522827, + "grad_norm": 7.488293856429623, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": -0.307849258184433, + "logits/rejected": -0.1173395961523056, + "logps/chosen": -1.2461965084075928, + "logps/rejected": -1.6748631000518799, + "loss": 1.0013, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2461965084075928, + "rewards/margins": 0.4286664128303528, + "rewards/rejected": -1.6748631000518799, + "sft_loss": 1.3064314126968384, "step": 4590 }, { "epoch": 2.4592741261080446, - "grad_norm": 17.39160581642626, - "learning_rate": 2.866942465072014e-07, - "logits/chosen": -0.7339946031570435, - "logits/rejected": -0.6413661241531372, - "logps/chosen": -1.3248698711395264, - "logps/rejected": -2.3884971141815186, - "loss": 0.8961, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3248698711395264, - "rewards/margins": 1.063627004623413, - "rewards/rejected": -2.3884971141815186, - "sft_loss": 1.3735225200653076, + "grad_norm": 18.458289821572553, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": -0.25331372022628784, + "logits/rejected": -0.1301385462284088, + "logps/chosen": -1.3207085132598877, + "logps/rejected": -1.8877767324447632, + "loss": 0.9931, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3207085132598877, + "rewards/margins": 0.5670682191848755, + "rewards/rejected": -1.8877767324447632, + "sft_loss": 1.3454453945159912, "step": 4595 }, { "epoch": 2.4619501588894463, - "grad_norm": 5.919966901123824, - "learning_rate": 2.839529399392924e-07, - "logits/chosen": -0.7200514078140259, - "logits/rejected": -0.5268881320953369, - "logps/chosen": -1.3770151138305664, - "logps/rejected": -2.454486131668091, - "loss": 0.9059, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3770151138305664, - "rewards/margins": 1.077471137046814, - "rewards/rejected": -2.454486131668091, - "sft_loss": 1.4495172500610352, + "grad_norm": 7.635517310171866, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": -0.24821238219738007, + "logits/rejected": 0.020019862800836563, + "logps/chosen": -1.3479750156402588, + "logps/rejected": -1.9506289958953857, + "loss": 0.9808, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3479750156402588, + "rewards/margins": 0.6026536822319031, + "rewards/rejected": -1.9506289958953857, + "sft_loss": 1.4187161922454834, "step": 4600 }, { "epoch": 2.464626191670848, - "grad_norm": 9.71994547678538, - "learning_rate": 2.812234314652937e-07, - "logits/chosen": -0.6608083248138428, - "logits/rejected": -0.5510744452476501, - "logps/chosen": -1.3255417346954346, - "logps/rejected": -2.4279494285583496, - "loss": 0.8981, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3255417346954346, - "rewards/margins": 1.102407693862915, - "rewards/rejected": -2.4279494285583496, - "sft_loss": 1.3806711435317993, + "grad_norm": 11.600826497569802, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": -0.17614629864692688, + "logits/rejected": -0.031257856637239456, + "logps/chosen": -1.3354041576385498, + "logps/rejected": -1.9279369115829468, + "loss": 1.0027, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3354041576385498, + "rewards/margins": 0.5925329327583313, + "rewards/rejected": -1.9279369115829468, + "sft_loss": 1.3648275136947632, "step": 4605 }, { "epoch": 2.46730222445225, - "grad_norm": 6.834992459863064, - "learning_rate": 2.785057475669084e-07, - "logits/chosen": -0.7301166653633118, - "logits/rejected": -0.6108202934265137, - "logps/chosen": -1.302876591682434, - "logps/rejected": -2.4487080574035645, - "loss": 0.8634, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.302876591682434, - "rewards/margins": 1.1458313465118408, - "rewards/rejected": -2.4487080574035645, - "sft_loss": 1.3394935131072998, + "grad_norm": 7.982304524200157, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": -0.2403148114681244, + "logits/rejected": -0.08003642410039902, + "logps/chosen": -1.3246963024139404, + "logps/rejected": -1.8701080083847046, + "loss": 0.9869, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3246963024139404, + "rewards/margins": 0.5454118847846985, + "rewards/rejected": -1.8701080083847046, + "sft_loss": 1.344297170639038, "step": 4610 }, { "epoch": 2.469978257233651, - "grad_norm": 9.15944035800927, - "learning_rate": 2.75799914611117e-07, - "logits/chosen": -0.6518206000328064, - "logits/rejected": -0.5422743558883667, - "logps/chosen": -1.3434274196624756, - "logps/rejected": -2.491009473800659, - "loss": 0.8819, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3434274196624756, - "rewards/margins": 1.1475821733474731, - "rewards/rejected": -2.491009473800659, - "sft_loss": 1.3923437595367432, + "grad_norm": 9.660647937774073, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": -0.16011041402816772, + "logits/rejected": -0.0173188503831625, + "logps/chosen": -1.3368616104125977, + "logps/rejected": -2.026045560836792, + "loss": 0.9754, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3368616104125977, + "rewards/margins": 0.6891839504241943, + "rewards/rejected": -2.026045560836792, + "sft_loss": 1.380816102027893, "step": 4615 }, { "epoch": 2.4726542900150528, - "grad_norm": 9.054548708671863, - "learning_rate": 2.7310595884992354e-07, - "logits/chosen": -0.6681968569755554, - "logits/rejected": -0.5110117197036743, - "logps/chosen": -1.2326476573944092, - "logps/rejected": -2.231614589691162, - "loss": 0.8627, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2326476573944092, - "rewards/margins": 0.998967170715332, - "rewards/rejected": -2.231614589691162, - "sft_loss": 1.3625903129577637, + "grad_norm": 7.359053426609902, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": -0.1726696491241455, + "logits/rejected": 0.0374867208302021, + "logps/chosen": -1.2261712551116943, + "logps/rejected": -1.7587896585464478, + "loss": 0.9608, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2261712551116943, + "rewards/margins": 0.5326187014579773, + "rewards/rejected": -1.7587896585464478, + "sft_loss": 1.3288418054580688, "step": 4620 }, { "epoch": 2.475330322796454, - "grad_norm": 11.42134908729122, - "learning_rate": 2.7042390642009805e-07, - "logits/chosen": -0.7752343416213989, - "logits/rejected": -0.7648538947105408, - "logps/chosen": -1.2942503690719604, - "logps/rejected": -2.251347064971924, - "loss": 0.8968, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2942503690719604, - "rewards/margins": 0.9570968747138977, - "rewards/rejected": -2.251347064971924, - "sft_loss": 1.364689588546753, + "grad_norm": 6.227833451403368, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": -0.3180811405181885, + "logits/rejected": -0.2705082595348358, + "logps/chosen": -1.33169686794281, + "logps/rejected": -1.805450677871704, + "loss": 1.0013, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.33169686794281, + "rewards/margins": 0.4737536907196045, + "rewards/rejected": -1.805450677871704, + "sft_loss": 1.3847988843917847, "step": 4625 }, { "epoch": 2.4780063555778558, - "grad_norm": 7.018668458347168, - "learning_rate": 2.6775378334292543e-07, - "logits/chosen": -0.6319466233253479, - "logits/rejected": -0.5827672481536865, - "logps/chosen": -1.2391040325164795, - "logps/rejected": -2.199864387512207, - "loss": 0.8923, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2391040325164795, - "rewards/margins": 0.9607602953910828, - "rewards/rejected": -2.199864387512207, - "sft_loss": 1.33708918094635, + "grad_norm": 7.777287675881731, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": -0.17647233605384827, + "logits/rejected": -0.09123236685991287, + "logps/chosen": -1.2847613096237183, + "logps/rejected": -1.7988321781158447, + "loss": 1.0017, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2847613096237183, + "rewards/margins": 0.5140709280967712, + "rewards/rejected": -1.7988321781158447, + "sft_loss": 1.3374969959259033, "step": 4630 }, { "epoch": 2.4806823883592575, - "grad_norm": 24.407625854432688, - "learning_rate": 2.650956155239512e-07, - "logits/chosen": -0.6126594543457031, - "logits/rejected": -0.49098721146583557, - "logps/chosen": -1.276228666305542, - "logps/rejected": -2.399261236190796, - "loss": 0.846, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.276228666305542, - "rewards/margins": 1.123032569885254, - "rewards/rejected": -2.399261236190796, - "sft_loss": 1.2981501817703247, + "grad_norm": 16.697480712369682, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": -0.13744884729385376, + "logits/rejected": 0.016100814566016197, + "logps/chosen": -1.2873327732086182, + "logps/rejected": -1.947933554649353, + "loss": 0.9606, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2873327732086182, + "rewards/margins": 0.6606006026268005, + "rewards/rejected": -1.947933554649353, + "sft_loss": 1.306330680847168, "step": 4635 }, { "epoch": 2.483358421140659, - "grad_norm": 10.230795726796329, - "learning_rate": 2.6244942875273093e-07, - "logits/chosen": -0.6266258955001831, - "logits/rejected": -0.5506526231765747, - "logps/chosen": -1.3115276098251343, - "logps/rejected": -2.3939852714538574, - "loss": 0.8511, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3115276098251343, - "rewards/margins": 1.0824575424194336, - "rewards/rejected": -2.3939852714538574, - "sft_loss": 1.3581907749176025, + "grad_norm": 7.35333599354952, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": -0.1599302589893341, + "logits/rejected": -0.05215344950556755, + "logps/chosen": -1.3196965456008911, + "logps/rejected": -1.8524376153945923, + "loss": 0.9713, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3196965456008911, + "rewards/margins": 0.532741129398346, + "rewards/rejected": -1.8524376153945923, + "sft_loss": 1.3606996536254883, "step": 4640 }, { "epoch": 2.4860344539220605, - "grad_norm": 13.846920028469155, - "learning_rate": 2.59815248702581e-07, - "logits/chosen": -0.6682706475257874, - "logits/rejected": -0.555500864982605, - "logps/chosen": -1.2825028896331787, - "logps/rejected": -2.1850879192352295, - "loss": 0.8783, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2825028896331787, - "rewards/margins": 0.9025851488113403, - "rewards/rejected": -2.1850879192352295, - "sft_loss": 1.3643155097961426, + "grad_norm": 8.44997951397884, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": -0.19984640181064606, + "logits/rejected": -0.06128763407468796, + "logps/chosen": -1.294210433959961, + "logps/rejected": -1.7580476999282837, + "loss": 0.9853, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.294210433959961, + "rewards/margins": 0.4638374447822571, + "rewards/rejected": -1.7580476999282837, + "sft_loss": 1.3605902194976807, "step": 4645 }, { "epoch": 2.488710486703462, - "grad_norm": 10.690683184069727, - "learning_rate": 2.5719310093032695e-07, - "logits/chosen": -0.7059717774391174, - "logits/rejected": -0.5149275660514832, - "logps/chosen": -1.313106894493103, - "logps/rejected": -2.339468002319336, - "loss": 0.8633, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.313106894493103, - "rewards/margins": 1.026361107826233, - "rewards/rejected": -2.339468002319336, - "sft_loss": 1.337316632270813, + "grad_norm": 8.153798854643007, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": -0.24238982796669006, + "logits/rejected": 0.01320638693869114, + "logps/chosen": -1.3412058353424072, + "logps/rejected": -1.9187650680541992, + "loss": 0.9616, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3412058353424072, + "rewards/margins": 0.5775595903396606, + "rewards/rejected": -1.9187650680541992, + "sft_loss": 1.3528225421905518, "step": 4650 }, { "epoch": 2.4913865194848634, - "grad_norm": 12.241505842991586, - "learning_rate": 2.5458301087605876e-07, - "logits/chosen": -0.7407052516937256, - "logits/rejected": -0.6273609399795532, - "logps/chosen": -1.315629005432129, - "logps/rejected": -2.2260148525238037, - "loss": 0.9076, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.315629005432129, - "rewards/margins": 0.9103857278823853, - "rewards/rejected": -2.2260148525238037, - "sft_loss": 1.4132753610610962, + "grad_norm": 9.62999907350543, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": -0.26737847924232483, + "logits/rejected": -0.11783139407634735, + "logps/chosen": -1.3052794933319092, + "logps/rejected": -1.743168592453003, + "loss": 0.997, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3052794933319092, + "rewards/margins": 0.4378891587257385, + "rewards/rejected": -1.743168592453003, + "sft_loss": 1.3775510787963867, "step": 4655 }, { "epoch": 2.494062552266265, - "grad_norm": 12.546182844033622, - "learning_rate": 2.5198500386288083e-07, - "logits/chosen": -0.7224112749099731, - "logits/rejected": -0.6466668844223022, - "logps/chosen": -1.3775488138198853, - "logps/rejected": -2.372617721557617, - "loss": 0.8919, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3775488138198853, - "rewards/margins": 0.9950687289237976, - "rewards/rejected": -2.372617721557617, - "sft_loss": 1.4110504388809204, + "grad_norm": 9.773237204542284, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": -0.23270268738269806, + "logits/rejected": -0.1101439818739891, + "logps/chosen": -1.3616409301757812, + "logps/rejected": -1.8167692422866821, + "loss": 1.0086, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3616409301757812, + "rewards/margins": 0.4551284909248352, + "rewards/rejected": -1.8167692422866821, + "sft_loss": 1.3947080373764038, "step": 4660 }, { "epoch": 2.496738585047667, - "grad_norm": 9.800911861573296, - "learning_rate": 2.493991050966694e-07, - "logits/chosen": -0.6987338662147522, - "logits/rejected": -0.6448682546615601, - "logps/chosen": -1.3800842761993408, - "logps/rejected": -2.333311080932617, - "loss": 0.9007, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3800842761993408, - "rewards/margins": 0.9532268643379211, - "rewards/rejected": -2.333311080932617, - "sft_loss": 1.4070731401443481, + "grad_norm": 13.46926687344113, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": -0.22592651844024658, + "logits/rejected": -0.15461456775665283, + "logps/chosen": -1.4279885292053223, + "logps/rejected": -1.8917471170425415, + "loss": 1.0439, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4279885292053223, + "rewards/margins": 0.4637584686279297, + "rewards/rejected": -1.8917471170425415, + "sft_loss": 1.4091001749038696, "step": 4665 }, { "epoch": 2.4994146178290686, - "grad_norm": 25.998145847928697, - "learning_rate": 2.4682533966582494e-07, - "logits/chosen": -0.6867243647575378, - "logits/rejected": -0.5943757891654968, - "logps/chosen": -1.3054351806640625, - "logps/rejected": -2.0959885120391846, - "loss": 0.9185, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3054351806640625, - "rewards/margins": 0.7905532717704773, - "rewards/rejected": -2.0959885120391846, - "sft_loss": 1.400160789489746, + "grad_norm": 22.862875623764268, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": -0.23195526003837585, + "logits/rejected": -0.09586568176746368, + "logps/chosen": -1.3377292156219482, + "logps/rejected": -1.708822250366211, + "loss": 1.0443, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3377292156219482, + "rewards/margins": 0.3710930645465851, + "rewards/rejected": -1.708822250366211, + "sft_loss": 1.4078140258789062, "step": 4670 }, { "epoch": 2.50209065061047, - "grad_norm": 11.02779994160791, - "learning_rate": 2.442637325410316e-07, - "logits/chosen": -0.6484078168869019, - "logits/rejected": -0.5131920576095581, - "logps/chosen": -1.267917513847351, - "logps/rejected": -2.326103687286377, - "loss": 0.8481, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.267917513847351, - "rewards/margins": 1.0581862926483154, - "rewards/rejected": -2.326103687286377, - "sft_loss": 1.2982842922210693, + "grad_norm": 11.316385205834427, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": -0.17335200309753418, + "logits/rejected": -0.02181796357035637, + "logps/chosen": -1.2743074893951416, + "logps/rejected": -1.803501844406128, + "loss": 0.9722, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2743074893951416, + "rewards/margins": 0.5291942358016968, + "rewards/rejected": -1.803501844406128, + "sft_loss": 1.2910643815994263, "step": 4675 }, { "epoch": 2.5047666833918716, - "grad_norm": 14.251626374285026, - "learning_rate": 2.417143085750122e-07, - "logits/chosen": -0.6278024911880493, - "logits/rejected": -0.5599108934402466, - "logps/chosen": -1.2820518016815186, - "logps/rejected": -2.404381513595581, - "loss": 0.8423, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2820518016815186, - "rewards/margins": 1.1223294734954834, - "rewards/rejected": -2.404381513595581, - "sft_loss": 1.327433705329895, + "grad_norm": 21.98504110227831, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": -0.15896247327327728, + "logits/rejected": -0.049371711909770966, + "logps/chosen": -1.3332550525665283, + "logps/rejected": -1.9437267780303955, + "loss": 0.9747, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3332550525665283, + "rewards/margins": 0.6104718446731567, + "rewards/rejected": -1.9437267780303955, + "sft_loss": 1.3414416313171387, "step": 4680 }, { "epoch": 2.507442716173273, - "grad_norm": 11.805514240107307, - "learning_rate": 2.3917709250228994e-07, - "logits/chosen": -0.7355372309684753, - "logits/rejected": -0.590130627155304, - "logps/chosen": -1.3373448848724365, - "logps/rejected": -2.2430479526519775, - "loss": 0.8902, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3373448848724365, - "rewards/margins": 0.9057027697563171, - "rewards/rejected": -2.2430479526519775, - "sft_loss": 1.3762180805206299, + "grad_norm": 11.770340730002667, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": -0.2712337076663971, + "logits/rejected": -0.0732741728425026, + "logps/chosen": -1.3264377117156982, + "logps/rejected": -1.7903629541397095, + "loss": 0.9916, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3264377117156982, + "rewards/margins": 0.4639252722263336, + "rewards/rejected": -1.7903629541397095, + "sft_loss": 1.3691695928573608, "step": 4685 }, { "epoch": 2.5101187489546746, - "grad_norm": 7.8787461944719865, - "learning_rate": 2.3665210893894557e-07, - "logits/chosen": -0.6504893898963928, - "logits/rejected": -0.6293870806694031, - "logps/chosen": -1.3105475902557373, - "logps/rejected": -2.3168463706970215, - "loss": 0.8746, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3105475902557373, - "rewards/margins": 1.0062991380691528, - "rewards/rejected": -2.3168463706970215, - "sft_loss": 1.3457313776016235, + "grad_norm": 9.051087910498229, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": -0.1903747022151947, + "logits/rejected": -0.1403854936361313, + "logps/chosen": -1.324100375175476, + "logps/rejected": -1.8005352020263672, + "loss": 0.999, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.324100375175476, + "rewards/margins": 0.4764348864555359, + "rewards/rejected": -1.8005352020263672, + "sft_loss": 1.3471524715423584, "step": 4690 }, { "epoch": 2.5127947817360763, - "grad_norm": 7.173051141455469, - "learning_rate": 2.3413938238238157e-07, - "logits/chosen": -0.6418722867965698, - "logits/rejected": -0.4838961660861969, - "logps/chosen": -1.3251688480377197, - "logps/rejected": -2.3496623039245605, - "loss": 0.9154, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3251688480377197, - "rewards/margins": 1.0244933366775513, - "rewards/rejected": -2.3496623039245605, - "sft_loss": 1.3792946338653564, + "grad_norm": 7.615968755779422, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": -0.17845022678375244, + "logits/rejected": 0.018734300509095192, + "logps/chosen": -1.3418115377426147, + "logps/rejected": -1.8535064458847046, + "loss": 1.0332, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3418115377426147, + "rewards/margins": 0.5116950869560242, + "rewards/rejected": -1.8535064458847046, + "sft_loss": 1.3775501251220703, "step": 4695 }, { "epoch": 2.515470814517478, - "grad_norm": 7.241597522887762, - "learning_rate": 2.316389372110812e-07, - "logits/chosen": -0.7322367429733276, - "logits/rejected": -0.6537310481071472, - "logps/chosen": -1.282598614692688, - "logps/rejected": -2.2019360065460205, - "loss": 0.9101, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.282598614692688, - "rewards/margins": 0.9193374514579773, - "rewards/rejected": -2.2019360065460205, - "sft_loss": 1.368976354598999, + "grad_norm": 6.537479023406544, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": -0.27379363775253296, + "logits/rejected": -0.14433471858501434, + "logps/chosen": -1.2867431640625, + "logps/rejected": -1.788644790649414, + "loss": 1.0146, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2867431640625, + "rewards/margins": 0.5019017457962036, + "rewards/rejected": -1.788644790649414, + "sft_loss": 1.3657176494598389, "step": 4700 }, { "epoch": 2.5181468472988793, - "grad_norm": 9.936247951660189, - "learning_rate": 2.2915079768437514e-07, - "logits/chosen": -0.6218239665031433, - "logits/rejected": -0.6292901039123535, - "logps/chosen": -1.3534696102142334, - "logps/rejected": -2.283411979675293, - "loss": 0.9143, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3534696102142334, - "rewards/margins": 0.9299424886703491, - "rewards/rejected": -2.283411979675293, - "sft_loss": 1.364280343055725, + "grad_norm": 12.955282547417086, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": -0.14411696791648865, + "logits/rejected": -0.10907924175262451, + "logps/chosen": -1.3774569034576416, + "logps/rejected": -1.8264074325561523, + "loss": 1.0335, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3774569034576416, + "rewards/margins": 0.44895029067993164, + "rewards/rejected": -1.8264074325561523, + "sft_loss": 1.369371771812439, "step": 4705 }, { "epoch": 2.520822880080281, - "grad_norm": 11.31977470233518, - "learning_rate": 2.2667498794220326e-07, - "logits/chosen": -0.7041364312171936, - "logits/rejected": -0.5870811343193054, - "logps/chosen": -1.3467581272125244, - "logps/rejected": -2.507812261581421, - "loss": 0.8574, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3467581272125244, - "rewards/margins": 1.1610538959503174, - "rewards/rejected": -2.507812261581421, - "sft_loss": 1.3909904956817627, + "grad_norm": 10.083790285634256, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": -0.24078384041786194, + "logits/rejected": -0.08013347536325455, + "logps/chosen": -1.3708370923995972, + "logps/rejected": -1.9471477270126343, + "loss": 0.9875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3708370923995972, + "rewards/margins": 0.5763106346130371, + "rewards/rejected": -1.9471477270126343, + "sft_loss": 1.3982982635498047, "step": 4710 }, { "epoch": 2.5234989128616827, - "grad_norm": 9.736817733827035, - "learning_rate": 2.2421153200488332e-07, - "logits/chosen": -0.677000105381012, - "logits/rejected": -0.7056635022163391, - "logps/chosen": -1.414865255355835, - "logps/rejected": -2.4057819843292236, - "loss": 0.9029, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.414865255355835, - "rewards/margins": 0.9909166097640991, - "rewards/rejected": -2.4057819843292236, - "sft_loss": 1.4676659107208252, + "grad_norm": 9.32421974678991, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": -0.21982285380363464, + "logits/rejected": -0.18153129518032074, + "logps/chosen": -1.4248440265655518, + "logps/rejected": -1.8908706903457642, + "loss": 1.0332, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4248440265655518, + "rewards/margins": 0.4660266935825348, + "rewards/rejected": -1.8908706903457642, + "sft_loss": 1.477002739906311, "step": 4715 }, { "epoch": 2.526174945643084, - "grad_norm": 11.290414144270878, - "learning_rate": 2.217604537728749e-07, - "logits/chosen": -0.6715537309646606, - "logits/rejected": -0.6152840852737427, - "logps/chosen": -1.2174723148345947, - "logps/rejected": -2.1375701427459717, - "loss": 0.8517, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.2174723148345947, - "rewards/margins": 0.9200976490974426, - "rewards/rejected": -2.1375701427459717, - "sft_loss": 1.3102633953094482, + "grad_norm": 11.542221088014736, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": -0.1805545836687088, + "logits/rejected": -0.07476134598255157, + "logps/chosen": -1.2300540208816528, + "logps/rejected": -1.7072839736938477, + "loss": 0.9635, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2300540208816528, + "rewards/margins": 0.4772297739982605, + "rewards/rejected": -1.7072839736938477, + "sft_loss": 1.300767183303833, "step": 4720 }, { "epoch": 2.5288509784244857, - "grad_norm": 8.768687363299435, - "learning_rate": 2.1932177702655053e-07, - "logits/chosen": -0.7175818681716919, - "logits/rejected": -0.698711633682251, - "logps/chosen": -1.3792225122451782, - "logps/rejected": -2.303156614303589, - "loss": 0.9178, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3792225122451782, - "rewards/margins": 0.9239341020584106, - "rewards/rejected": -2.303156614303589, - "sft_loss": 1.4294236898422241, + "grad_norm": 11.64707144335193, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": -0.24052152037620544, + "logits/rejected": -0.176700621843338, + "logps/chosen": -1.3545494079589844, + "logps/rejected": -1.7904884815216064, + "loss": 1.021, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3545494079589844, + "rewards/margins": 0.43593907356262207, + "rewards/rejected": -1.7904884815216064, + "sft_loss": 1.394141674041748, "step": 4725 }, { "epoch": 2.5315270112058874, - "grad_norm": 8.362980908179956, - "learning_rate": 2.1689552542596232e-07, - "logits/chosen": -0.6957578063011169, - "logits/rejected": -0.5916758179664612, - "logps/chosen": -1.2203314304351807, - "logps/rejected": -2.3191776275634766, - "loss": 0.8499, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2203314304351807, - "rewards/margins": 1.098846197128296, - "rewards/rejected": -2.3191776275634766, - "sft_loss": 1.3278615474700928, + "grad_norm": 7.904436255203085, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": -0.17813178896903992, + "logits/rejected": -0.05374855920672417, + "logps/chosen": -1.2366796731948853, + "logps/rejected": -1.7965199947357178, + "loss": 0.9679, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2366796731948853, + "rewards/margins": 0.5598403811454773, + "rewards/rejected": -1.7965199947357178, + "sft_loss": 1.3370568752288818, "step": 4730 }, { "epoch": 2.5342030439872887, - "grad_norm": 5.208940835506882, - "learning_rate": 2.1448172251061338e-07, - "logits/chosen": -0.6650044322013855, - "logits/rejected": -0.7591149806976318, - "logps/chosen": -1.347022294998169, - "logps/rejected": -2.176450252532959, - "loss": 0.8969, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.347022294998169, - "rewards/margins": 0.8294281959533691, - "rewards/rejected": -2.176450252532959, - "sft_loss": 1.3947023153305054, + "grad_norm": 7.4402860611881465, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": -0.1650870144367218, + "logits/rejected": -0.2268679440021515, + "logps/chosen": -1.3688818216323853, + "logps/rejected": -1.739956259727478, + "loss": 1.0155, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3688818216323853, + "rewards/margins": 0.37107449769973755, + "rewards/rejected": -1.739956259727478, + "sft_loss": 1.3951529264450073, "step": 4735 }, { "epoch": 2.5368790767686904, - "grad_norm": 6.993025606054764, - "learning_rate": 2.1208039169923122e-07, - "logits/chosen": -0.7362990975379944, - "logits/rejected": -0.6220189929008484, - "logps/chosen": -1.3531649112701416, - "logps/rejected": -2.3682894706726074, - "loss": 0.875, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3531649112701416, - "rewards/margins": 1.0151245594024658, - "rewards/rejected": -2.3682894706726074, - "sft_loss": 1.3880125284194946, + "grad_norm": 7.044974235801137, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": -0.2574231028556824, + "logits/rejected": -0.10612340271472931, + "logps/chosen": -1.3511667251586914, + "logps/rejected": -1.880702257156372, + "loss": 0.9782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3511667251586914, + "rewards/margins": 0.5295354127883911, + "rewards/rejected": -1.880702257156372, + "sft_loss": 1.377986192703247, "step": 4740 }, { "epoch": 2.539555109550092, - "grad_norm": 8.38125688208826, - "learning_rate": 2.096915562895369e-07, - "logits/chosen": -0.6793426275253296, - "logits/rejected": -0.6831785440444946, - "logps/chosen": -1.3633360862731934, - "logps/rejected": -2.381472110748291, - "loss": 0.901, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3633360862731934, - "rewards/margins": 1.0181360244750977, - "rewards/rejected": -2.381472110748291, - "sft_loss": 1.4215288162231445, + "grad_norm": 10.285582997631959, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": -0.170863077044487, + "logits/rejected": -0.12588295340538025, + "logps/chosen": -1.3761823177337646, + "logps/rejected": -1.852365493774414, + "loss": 1.0204, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3761823177337646, + "rewards/margins": 0.4761830270290375, + "rewards/rejected": -1.852365493774414, + "sft_loss": 1.3957990407943726, "step": 4745 }, { "epoch": 2.5422311423314934, - "grad_norm": 9.175151061347913, - "learning_rate": 2.07315239458023e-07, - "logits/chosen": -0.6697893738746643, - "logits/rejected": -0.520065426826477, - "logps/chosen": -1.3501994609832764, - "logps/rejected": -2.576988458633423, - "loss": 0.8288, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.3501994609832764, - "rewards/margins": 1.2267887592315674, - "rewards/rejected": -2.576988458633423, - "sft_loss": 1.3498144149780273, + "grad_norm": 6.546259057130183, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": -0.1758599430322647, + "logits/rejected": 0.03030111826956272, + "logps/chosen": -1.297829031944275, + "logps/rejected": -2.0442402362823486, + "loss": 0.9007, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.297829031944275, + "rewards/margins": 0.746411144733429, + "rewards/rejected": -2.0442402362823486, + "sft_loss": 1.3134304285049438, "step": 4750 }, { "epoch": 2.544907175112895, - "grad_norm": 10.016426252351847, - "learning_rate": 2.0495146425972487e-07, - "logits/chosen": -0.7503206133842468, - "logits/rejected": -0.6195517778396606, - "logps/chosen": -1.2370802164077759, - "logps/rejected": -2.4151086807250977, - "loss": 0.8469, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2370802164077759, - "rewards/margins": 1.1780284643173218, - "rewards/rejected": -2.4151086807250977, - "sft_loss": 1.3105494976043701, + "grad_norm": 8.301983409544098, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": -0.24078845977783203, + "logits/rejected": -0.05983034521341324, + "logps/chosen": -1.232013463973999, + "logps/rejected": -1.936300277709961, + "loss": 0.9475, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.232013463973999, + "rewards/margins": 0.7042867541313171, + "rewards/rejected": -1.936300277709961, + "sft_loss": 1.2897391319274902, "step": 4755 }, { "epoch": 2.547583207894297, - "grad_norm": 12.292841806081597, - "learning_rate": 2.0260025362800078e-07, - "logits/chosen": -0.7724018096923828, - "logits/rejected": -0.6956478357315063, - "logps/chosen": -1.251227617263794, - "logps/rejected": -2.305633068084717, - "loss": 0.8592, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.251227617263794, - "rewards/margins": 1.0544054508209229, - "rewards/rejected": -2.305633068084717, - "sft_loss": 1.3491507768630981, + "grad_norm": 5.974846801942517, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": -0.27302032709121704, + "logits/rejected": -0.124132439494133, + "logps/chosen": -1.2491251230239868, + "logps/rejected": -1.7809514999389648, + "loss": 0.9751, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2491251230239868, + "rewards/margins": 0.5318263173103333, + "rewards/rejected": -1.7809514999389648, + "sft_loss": 1.3279565572738647, "step": 4760 }, { "epoch": 2.5502592406756985, - "grad_norm": 16.015556311358075, - "learning_rate": 2.002616303743059e-07, - "logits/chosen": -0.7809301614761353, - "logits/rejected": -0.6208760142326355, - "logps/chosen": -1.3786977529525757, - "logps/rejected": -2.4913034439086914, - "loss": 0.8944, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3786977529525757, - "rewards/margins": 1.1126058101654053, - "rewards/rejected": -2.4913034439086914, - "sft_loss": 1.4251824617385864, + "grad_norm": 12.996647544401874, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": -0.2610344886779785, + "logits/rejected": -0.07429017871618271, + "logps/chosen": -1.3629835844039917, + "logps/rejected": -1.8292680978775024, + "loss": 1.0442, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3629835844039917, + "rewards/margins": 0.4662845730781555, + "rewards/rejected": -1.8292680978775024, + "sft_loss": 1.3952610492706299, "step": 4765 }, { "epoch": 2.5529352734571, - "grad_norm": 8.73037919482241, - "learning_rate": 1.979356171879738e-07, - "logits/chosen": -0.7047054767608643, - "logits/rejected": -0.6488312482833862, - "logps/chosen": -1.3484723567962646, - "logps/rejected": -2.4292967319488525, - "loss": 0.8628, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.3484723567962646, - "rewards/margins": 1.0808244943618774, - "rewards/rejected": -2.4292967319488525, - "sft_loss": 1.3975765705108643, + "grad_norm": 10.682258574204342, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": -0.2108164131641388, + "logits/rejected": -0.09726519882678986, + "logps/chosen": -1.3561142683029175, + "logps/rejected": -1.9013593196868896, + "loss": 0.9947, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3561142683029175, + "rewards/margins": 0.5452449321746826, + "rewards/rejected": -1.9013593196868896, + "sft_loss": 1.3703248500823975, "step": 4770 }, { "epoch": 2.5556113062385015, - "grad_norm": 11.96525948420899, - "learning_rate": 1.9562223663599399e-07, - "logits/chosen": -0.6062943339347839, - "logits/rejected": -0.5331624150276184, - "logps/chosen": -1.3019130229949951, - "logps/rejected": -2.3833680152893066, - "loss": 0.8578, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3019130229949951, - "rewards/margins": 1.0814554691314697, - "rewards/rejected": -2.3833680152893066, - "sft_loss": 1.309452772140503, + "grad_norm": 8.396879522182823, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": -0.08900733292102814, + "logits/rejected": 0.02392936311662197, + "logps/chosen": -1.2975869178771973, + "logps/rejected": -1.862375020980835, + "loss": 0.9622, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2975869178771973, + "rewards/margins": 0.5647882223129272, + "rewards/rejected": -1.862375020980835, + "sft_loss": 1.2736012935638428, "step": 4775 }, { "epoch": 2.558287339019903, - "grad_norm": 19.670502841219136, - "learning_rate": 1.9332151116279557e-07, - "logits/chosen": -0.7169122695922852, - "logits/rejected": -0.6609517335891724, - "logps/chosen": -1.3092458248138428, - "logps/rejected": -2.282841920852661, - "loss": 0.8788, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3092458248138428, - "rewards/margins": 0.9735962748527527, - "rewards/rejected": -2.282841920852661, - "sft_loss": 1.3545231819152832, + "grad_norm": 7.986055069676312, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": -0.15881235897541046, + "logits/rejected": -0.06122690439224243, + "logps/chosen": -1.3280928134918213, + "logps/rejected": -1.8025190830230713, + "loss": 0.9947, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3280928134918213, + "rewards/margins": 0.47442618012428284, + "rewards/rejected": -1.8025190830230713, + "sft_loss": 1.3553526401519775, "step": 4780 }, { "epoch": 2.5609633718013045, - "grad_norm": 8.73783859329783, - "learning_rate": 1.9103346309002623e-07, - "logits/chosen": -0.6947210431098938, - "logits/rejected": -0.6709440350532532, - "logps/chosen": -1.3139759302139282, - "logps/rejected": -2.1969356536865234, - "loss": 0.9067, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3139759302139282, - "rewards/margins": 0.8829599618911743, - "rewards/rejected": -2.1969356536865234, - "sft_loss": 1.3645445108413696, + "grad_norm": 8.648469385118139, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": -0.16575714945793152, + "logits/rejected": -0.10236098617315292, + "logps/chosen": -1.33333420753479, + "logps/rejected": -1.7184255123138428, + "loss": 1.0239, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.33333420753479, + "rewards/margins": 0.3850913643836975, + "rewards/rejected": -1.7184255123138428, + "sft_loss": 1.3617249727249146, "step": 4785 }, { "epoch": 2.5636394045827062, - "grad_norm": 8.28415692660026, - "learning_rate": 1.887581146163394e-07, - "logits/chosen": -0.7579419016838074, - "logits/rejected": -0.6561330556869507, - "logps/chosen": -1.3163559436798096, - "logps/rejected": -2.490429401397705, - "loss": 0.8568, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3163559436798096, - "rewards/margins": 1.174073576927185, - "rewards/rejected": -2.490429401397705, - "sft_loss": 1.350839376449585, + "grad_norm": 9.109556486343607, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": -0.2730414867401123, + "logits/rejected": -0.12491519749164581, + "logps/chosen": -1.3478052616119385, + "logps/rejected": -1.9820867776870728, + "loss": 0.963, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3478052616119385, + "rewards/margins": 0.6342812776565552, + "rewards/rejected": -1.9820867776870728, + "sft_loss": 1.3628085851669312, "step": 4790 }, { "epoch": 2.566315437364108, - "grad_norm": 9.467127267461985, - "learning_rate": 1.8649548781717506e-07, - "logits/chosen": -0.6651458740234375, - "logits/rejected": -0.6147671937942505, - "logps/chosen": -1.3289358615875244, - "logps/rejected": -2.270233392715454, - "loss": 0.8755, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3289358615875244, - "rewards/margins": 0.9412969350814819, - "rewards/rejected": -2.270233392715454, - "sft_loss": 1.3590539693832397, + "grad_norm": 17.90572280189856, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": -0.15778687596321106, + "logits/rejected": -0.07748770713806152, + "logps/chosen": -1.366068959236145, + "logps/rejected": -1.822178840637207, + "loss": 1.0242, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.366068959236145, + "rewards/margins": 0.4561101496219635, + "rewards/rejected": -1.822178840637207, + "sft_loss": 1.3762224912643433, "step": 4795 }, { "epoch": 2.568991470145509, - "grad_norm": 6.597920954146079, - "learning_rate": 1.8424560464454891e-07, - "logits/chosen": -0.7337859869003296, - "logits/rejected": -0.6585230827331543, - "logps/chosen": -1.2692844867706299, - "logps/rejected": -2.1357951164245605, - "loss": 0.8747, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2692844867706299, - "rewards/margins": 0.8665107488632202, - "rewards/rejected": -2.1357951164245605, - "sft_loss": 1.3429874181747437, + "grad_norm": 6.340516969551728, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": -0.2035118043422699, + "logits/rejected": -0.08557195216417313, + "logps/chosen": -1.2682950496673584, + "logps/rejected": -1.7354037761688232, + "loss": 0.9789, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2682950496673584, + "rewards/margins": 0.46710890531539917, + "rewards/rejected": -1.7354037761688232, + "sft_loss": 1.3428736925125122, "step": 4800 }, { "epoch": 2.568991470145509, - "eval_logits/chosen": -0.44644811749458313, - "eval_logits/rejected": -0.39516207575798035, - "eval_logps/chosen": -1.5252902507781982, - "eval_logps/rejected": -2.175719976425171, - "eval_loss": 1.0419161319732666, - "eval_rewards/accuracies": 0.6454005837440491, - "eval_rewards/chosen": -1.5252902507781982, - "eval_rewards/margins": 0.6504298448562622, - "eval_rewards/rejected": -2.175719976425171, - "eval_runtime": 44.1168, - "eval_samples_per_second": 30.487, - "eval_sft_loss": 1.4755855798721313, - "eval_steps_per_second": 7.639, + "eval_logits/chosen": 0.09956033527851105, + "eval_logits/rejected": 0.18866664171218872, + "eval_logps/chosen": -1.3974061012268066, + "eval_logps/rejected": -1.8380084037780762, + "eval_loss": 1.0419869422912598, + "eval_rewards/accuracies": 0.6142433285713196, + "eval_rewards/chosen": -1.3974061012268066, + "eval_rewards/margins": 0.4406021535396576, + "eval_rewards/rejected": -1.8380084037780762, + "eval_runtime": 48.2365, + "eval_samples_per_second": 27.883, + "eval_sft_loss": 1.4008758068084717, + "eval_steps_per_second": 6.986, "step": 4800 }, { "epoch": 2.571667502926911, - "grad_norm": 15.103542790559718, - "learning_rate": 1.820084869268369e-07, - "logits/chosen": -0.7690774202346802, - "logits/rejected": -0.7059490084648132, - "logps/chosen": -1.3684237003326416, - "logps/rejected": -2.373002052307129, - "loss": 0.8923, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3684237003326416, - "rewards/margins": 1.0045783519744873, - "rewards/rejected": -2.373002052307129, - "sft_loss": 1.4043967723846436, + "grad_norm": 12.243034491589519, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": -0.3126266300678253, + "logits/rejected": -0.18945705890655518, + "logps/chosen": -1.3755854368209839, + "logps/rejected": -1.9291465282440186, + "loss": 0.9907, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3755854368209839, + "rewards/margins": 0.5535610914230347, + "rewards/rejected": -1.9291465282440186, + "sft_loss": 1.392539381980896, "step": 4805 }, { "epoch": 2.574343535708312, - "grad_norm": 10.299547408906609, - "learning_rate": 1.7978415636856571e-07, - "logits/chosen": -0.6917232871055603, - "logits/rejected": -0.6187536716461182, - "logps/chosen": -1.3753135204315186, - "logps/rejected": -2.3357646465301514, - "loss": 0.9299, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.3753135204315186, - "rewards/margins": 0.9604509472846985, - "rewards/rejected": -2.3357646465301514, - "sft_loss": 1.4127343893051147, + "grad_norm": 9.067815560279103, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": -0.17166857421398163, + "logits/rejected": -0.03303740173578262, + "logps/chosen": -1.394575834274292, + "logps/rejected": -1.859715223312378, + "loss": 1.0549, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.394575834274292, + "rewards/margins": 0.4651394784450531, + "rewards/rejected": -1.859715223312378, + "sft_loss": 1.4112951755523682, "step": 4810 }, { "epoch": 2.577019568489714, - "grad_norm": 11.14236285917446, - "learning_rate": 1.7757263455019906e-07, - "logits/chosen": -0.6763869524002075, - "logits/rejected": -0.5861636400222778, - "logps/chosen": -1.204840064048767, - "logps/rejected": -2.362485885620117, - "loss": 0.8418, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.204840064048767, - "rewards/margins": 1.1576459407806396, - "rewards/rejected": -2.362485885620117, - "sft_loss": 1.283821702003479, + "grad_norm": 7.556783097027108, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": -0.16894081234931946, + "logits/rejected": -0.034742534160614014, + "logps/chosen": -1.206220269203186, + "logps/rejected": -1.8776988983154297, + "loss": 0.927, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.206220269203186, + "rewards/margins": 0.6714786291122437, + "rewards/rejected": -1.8776988983154297, + "sft_loss": 1.2821996212005615, "step": 4815 }, { "epoch": 2.5796956012711156, - "grad_norm": 7.259647243572955, - "learning_rate": 1.7537394292793245e-07, - "logits/chosen": -0.6820626258850098, - "logits/rejected": -0.611841082572937, - "logps/chosen": -1.3615658283233643, - "logps/rejected": -2.250566005706787, - "loss": 0.9006, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3615658283233643, - "rewards/margins": 0.8890002965927124, - "rewards/rejected": -2.250566005706787, - "sft_loss": 1.3739036321640015, + "grad_norm": 8.883994735413456, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": -0.14478550851345062, + "logits/rejected": -0.06053388863801956, + "logps/chosen": -1.3819754123687744, + "logps/rejected": -1.8339745998382568, + "loss": 1.0138, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3819754123687744, + "rewards/margins": 0.4519991874694824, + "rewards/rejected": -1.8339745998382568, + "sft_loss": 1.3619210720062256, "step": 4820 }, { "epoch": 2.5823716340525174, - "grad_norm": 9.838289890502702, - "learning_rate": 1.731881028334808e-07, - "logits/chosen": -0.6777626872062683, - "logits/rejected": -0.5994135141372681, - "logps/chosen": -1.3140347003936768, - "logps/rejected": -2.183943748474121, - "loss": 0.8925, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3140347003936768, - "rewards/margins": 0.8699088096618652, - "rewards/rejected": -2.183943748474121, - "sft_loss": 1.322758436203003, + "grad_norm": 10.631060590250604, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": -0.15467044711112976, + "logits/rejected": -0.027900194749236107, + "logps/chosen": -1.306641936302185, + "logps/rejected": -1.7456728219985962, + "loss": 0.9949, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.306641936302185, + "rewards/margins": 0.43903082609176636, + "rewards/rejected": -1.7456728219985962, + "sft_loss": 1.302283525466919, "step": 4825 }, { "epoch": 2.5850476668339186, - "grad_norm": 9.648523193237793, - "learning_rate": 1.7101513547387487e-07, - "logits/chosen": -0.7211915850639343, - "logits/rejected": -0.620406448841095, - "logps/chosen": -1.3341495990753174, - "logps/rejected": -2.2978355884552, - "loss": 0.8812, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3341495990753174, - "rewards/margins": 0.9636863470077515, - "rewards/rejected": -2.2978355884552, - "sft_loss": 1.3428423404693604, + "grad_norm": 6.9724462295350085, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": -0.23680837452411652, + "logits/rejected": -0.08103035390377045, + "logps/chosen": -1.360740303993225, + "logps/rejected": -1.769418716430664, + "loss": 1.0285, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.360740303993225, + "rewards/margins": 0.4086781442165375, + "rewards/rejected": -1.769418716430664, + "sft_loss": 1.3670880794525146, "step": 4830 }, { "epoch": 2.5877236996153203, - "grad_norm": 12.784087079857935, - "learning_rate": 1.6885506193125306e-07, - "logits/chosen": -0.8385518193244934, - "logits/rejected": -0.6945291757583618, - "logps/chosen": -1.3321683406829834, - "logps/rejected": -2.4906907081604004, - "loss": 0.8596, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3321683406829834, - "rewards/margins": 1.158522367477417, - "rewards/rejected": -2.4906907081604004, - "sft_loss": 1.3666893243789673, + "grad_norm": 9.983272955806635, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": -0.3214946389198303, + "logits/rejected": -0.1256788820028305, + "logps/chosen": -1.3193713426589966, + "logps/rejected": -1.8835500478744507, + "loss": 0.981, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3193713426589966, + "rewards/margins": 0.5641787052154541, + "rewards/rejected": -1.8835500478744507, + "sft_loss": 1.346934199333191, "step": 4835 }, { "epoch": 2.5903997323967216, - "grad_norm": 11.568765499261328, - "learning_rate": 1.667079031626591e-07, - "logits/chosen": -0.7675267457962036, - "logits/rejected": -0.6532658338546753, - "logps/chosen": -1.3479506969451904, - "logps/rejected": -2.358936309814453, - "loss": 0.8862, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3479506969451904, - "rewards/margins": 1.0109856128692627, - "rewards/rejected": -2.358936309814453, - "sft_loss": 1.4017937183380127, + "grad_norm": 8.038456616467183, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": -0.27449679374694824, + "logits/rejected": -0.07452504336833954, + "logps/chosen": -1.3287158012390137, + "logps/rejected": -1.82450270652771, + "loss": 0.9938, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3287158012390137, + "rewards/margins": 0.49578672647476196, + "rewards/rejected": -1.82450270652771, + "sft_loss": 1.376766324043274, "step": 4840 }, { "epoch": 2.5930757651781233, - "grad_norm": 9.203820574490626, - "learning_rate": 1.6457367999983568e-07, - "logits/chosen": -0.7395308613777161, - "logits/rejected": -0.6636630892753601, - "logps/chosen": -1.3089097738265991, - "logps/rejected": -2.2758500576019287, - "loss": 0.8811, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.3089097738265991, - "rewards/margins": 0.9669402241706848, - "rewards/rejected": -2.2758500576019287, - "sft_loss": 1.4001891613006592, + "grad_norm": 9.078399651470578, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": -0.243708997964859, + "logits/rejected": -0.1337020993232727, + "logps/chosen": -1.3256762027740479, + "logps/rejected": -1.828753113746643, + "loss": 1.0052, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3256762027740479, + "rewards/margins": 0.5030766725540161, + "rewards/rejected": -1.828753113746643, + "sft_loss": 1.3922346830368042, "step": 4845 }, { "epoch": 2.595751797959525, - "grad_norm": 12.680296874297454, - "learning_rate": 1.6245241314902604e-07, - "logits/chosen": -0.8743961453437805, - "logits/rejected": -0.7420130968093872, - "logps/chosen": -1.295114278793335, - "logps/rejected": -2.3928744792938232, - "loss": 0.8639, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.295114278793335, - "rewards/margins": 1.097760558128357, - "rewards/rejected": -2.3928744792938232, - "sft_loss": 1.3053842782974243, + "grad_norm": 14.233014918580938, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": -0.3622373640537262, + "logits/rejected": -0.19894781708717346, + "logps/chosen": -1.3045145273208618, + "logps/rejected": -1.907910943031311, + "loss": 0.9678, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3045145273208618, + "rewards/margins": 0.603396475315094, + "rewards/rejected": -1.907910943031311, + "sft_loss": 1.2965425252914429, "step": 4850 }, { "epoch": 2.5984278307409268, - "grad_norm": 7.991829421367602, - "learning_rate": 1.6034412319077008e-07, - "logits/chosen": -0.6992126703262329, - "logits/rejected": -0.5952213406562805, - "logps/chosen": -1.2627414464950562, - "logps/rejected": -2.328612804412842, - "loss": 0.8834, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2627414464950562, - "rewards/margins": 1.065871238708496, - "rewards/rejected": -2.328612804412842, - "sft_loss": 1.3418805599212646, + "grad_norm": 7.813337647316198, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": -0.17479762434959412, + "logits/rejected": -0.0063132173381745815, + "logps/chosen": -1.259197473526001, + "logps/rejected": -1.8022098541259766, + "loss": 1.0029, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.259197473526001, + "rewards/margins": 0.5430123209953308, + "rewards/rejected": -1.8022098541259766, + "sft_loss": 1.326766014099121, "step": 4855 }, { "epoch": 2.601103863522328, - "grad_norm": 8.77204071833793, - "learning_rate": 1.582488305797068e-07, - "logits/chosen": -0.7220746278762817, - "logits/rejected": -0.6973734498023987, - "logps/chosen": -1.2161717414855957, - "logps/rejected": -2.3466389179229736, - "loss": 0.7956, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.2161717414855957, - "rewards/margins": 1.1304669380187988, - "rewards/rejected": -2.3466389179229736, - "sft_loss": 1.2758996486663818, + "grad_norm": 12.79303668954908, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": -0.23360638320446014, + "logits/rejected": -0.16707371175289154, + "logps/chosen": -1.2304644584655762, + "logps/rejected": -1.8639189004898071, + "loss": 0.9079, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2304644584655762, + "rewards/margins": 0.6334545612335205, + "rewards/rejected": -1.8639189004898071, + "sft_loss": 1.2692384719848633, "step": 4860 }, { "epoch": 2.6037798963037297, - "grad_norm": 8.753318218697338, - "learning_rate": 1.5616655564437354e-07, - "logits/chosen": -0.8045107126235962, - "logits/rejected": -0.7597325444221497, - "logps/chosen": -1.3217512369155884, - "logps/rejected": -2.395482301712036, - "loss": 0.8733, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3217512369155884, - "rewards/margins": 1.0737309455871582, - "rewards/rejected": -2.395482301712036, - "sft_loss": 1.347623586654663, + "grad_norm": 7.443452034913313, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": -0.34079301357269287, + "logits/rejected": -0.24996092915534973, + "logps/chosen": -1.2894681692123413, + "logps/rejected": -1.858069658279419, + "loss": 0.9614, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2894681692123413, + "rewards/margins": 0.5686012506484985, + "rewards/rejected": -1.858069658279419, + "sft_loss": 1.329217791557312, "step": 4865 }, { "epoch": 2.606455929085131, - "grad_norm": 7.614181712513337, - "learning_rate": 1.5409731858701154e-07, - "logits/chosen": -0.6682974696159363, - "logits/rejected": -0.5954986214637756, - "logps/chosen": -1.2902181148529053, - "logps/rejected": -2.3482024669647217, - "loss": 0.853, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2902181148529053, - "rewards/margins": 1.0579843521118164, - "rewards/rejected": -2.3482024669647217, - "sft_loss": 1.3140208721160889, + "grad_norm": 12.395401520728653, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": -0.17850585281848907, + "logits/rejected": -0.04492710903286934, + "logps/chosen": -1.32509183883667, + "logps/rejected": -1.837829828262329, + "loss": 0.9917, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.32509183883667, + "rewards/margins": 0.5127378702163696, + "rewards/rejected": -1.837829828262329, + "sft_loss": 1.3318425416946411, "step": 4870 }, { "epoch": 2.6091319618665327, - "grad_norm": 9.475477603706777, - "learning_rate": 1.5204113948336717e-07, - "logits/chosen": -0.5529733896255493, - "logits/rejected": -0.4986020028591156, - "logps/chosen": -1.2315846681594849, - "logps/rejected": -2.5735256671905518, - "loss": 0.8204, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.2315846681594849, - "rewards/margins": 1.341940999031067, - "rewards/rejected": -2.5735256671905518, - "sft_loss": 1.3268613815307617, + "grad_norm": 8.33277923667889, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": -0.06016062572598457, + "logits/rejected": 0.016124781221151352, + "logps/chosen": -1.2886050939559937, + "logps/rejected": -1.9926360845565796, + "loss": 0.9573, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2886050939559937, + "rewards/margins": 0.7040311694145203, + "rewards/rejected": -1.9926360845565796, + "sft_loss": 1.347915768623352, "step": 4875 }, { "epoch": 2.6118079946479344, - "grad_norm": 7.4222452619037576, - "learning_rate": 1.499980382824997e-07, - "logits/chosen": -0.6061524748802185, - "logits/rejected": -0.5214860439300537, - "logps/chosen": -1.258568525314331, - "logps/rejected": -2.432037830352783, - "loss": 0.8569, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.258568525314331, - "rewards/margins": 1.1734689474105835, - "rewards/rejected": -2.432037830352783, - "sft_loss": 1.3374583721160889, + "grad_norm": 12.743055750152957, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": -0.13470612466335297, + "logits/rejected": -0.010785120539367199, + "logps/chosen": -1.2654292583465576, + "logps/rejected": -1.8969109058380127, + "loss": 0.9741, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2654292583465576, + "rewards/margins": 0.6314815282821655, + "rewards/rejected": -1.8969109058380127, + "sft_loss": 1.3268860578536987, "step": 4880 }, { "epoch": 2.614484027429336, - "grad_norm": 11.640054133774896, - "learning_rate": 1.479680348065855e-07, - "logits/chosen": -0.6220036149024963, - "logits/rejected": -0.5831924676895142, - "logps/chosen": -1.4027297496795654, - "logps/rejected": -2.553523540496826, - "loss": 0.9172, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4027297496795654, - "rewards/margins": 1.1507937908172607, - "rewards/rejected": -2.553523540496826, - "sft_loss": 1.530013084411621, + "grad_norm": 9.596905530882646, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": -0.14197297394275665, + "logits/rejected": -0.06500792503356934, + "logps/chosen": -1.4103431701660156, + "logps/rejected": -1.9615188837051392, + "loss": 1.0443, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4103431701660156, + "rewards/margins": 0.5511755347251892, + "rewards/rejected": -1.9615188837051392, + "sft_loss": 1.5112073421478271, "step": 4885 }, { "epoch": 2.6171600602107374, - "grad_norm": 15.846563432651616, - "learning_rate": 1.4595114875072762e-07, - "logits/chosen": -0.7990840673446655, - "logits/rejected": -0.6822538375854492, - "logps/chosen": -1.3178495168685913, - "logps/rejected": -2.345160722732544, - "loss": 0.8832, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3178495168685913, - "rewards/margins": 1.027311086654663, - "rewards/rejected": -2.345160722732544, - "sft_loss": 1.3916982412338257, + "grad_norm": 9.503153296695984, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": -0.3041810095310211, + "logits/rejected": -0.13739736378192902, + "logps/chosen": -1.3125327825546265, + "logps/rejected": -1.88010573387146, + "loss": 0.9772, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3125327825546265, + "rewards/margins": 0.5675729513168335, + "rewards/rejected": -1.88010573387146, + "sft_loss": 1.3756803274154663, "step": 4890 }, { "epoch": 2.619836092992139, - "grad_norm": 7.443163924141347, - "learning_rate": 1.4394739968276293e-07, - "logits/chosen": -0.7228940725326538, - "logits/rejected": -0.6642226576805115, - "logps/chosen": -1.3509809970855713, - "logps/rejected": -2.127976179122925, - "loss": 0.9536, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3509809970855713, - "rewards/margins": 0.7769953012466431, - "rewards/rejected": -2.127976179122925, - "sft_loss": 1.4385178089141846, + "grad_norm": 7.885282658627051, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": -0.2626744210720062, + "logits/rejected": -0.17322595417499542, + "logps/chosen": -1.3757003545761108, + "logps/rejected": -1.753072738647461, + "loss": 1.0796, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3757003545761108, + "rewards/margins": 0.37737223505973816, + "rewards/rejected": -1.753072738647461, + "sft_loss": 1.458059549331665, "step": 4895 }, { "epoch": 2.622512125773541, - "grad_norm": 7.294244692854026, - "learning_rate": 1.4195680704307405e-07, - "logits/chosen": -0.6522566676139832, - "logits/rejected": -0.5300472974777222, - "logps/chosen": -1.2895146608352661, - "logps/rejected": -2.4499125480651855, - "loss": 0.8536, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2895146608352661, - "rewards/margins": 1.1603978872299194, - "rewards/rejected": -2.4499125480651855, - "sft_loss": 1.351528525352478, + "grad_norm": 6.666763867558161, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": -0.15900710225105286, + "logits/rejected": -0.023568501695990562, + "logps/chosen": -1.2843879461288452, + "logps/rejected": -1.8810021877288818, + "loss": 0.9673, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2843879461288452, + "rewards/margins": 0.5966143012046814, + "rewards/rejected": -1.8810021877288818, + "sft_loss": 1.3582007884979248, "step": 4900 }, { "epoch": 2.625188158554942, - "grad_norm": 17.234232738029313, - "learning_rate": 1.3997939014439926e-07, - "logits/chosen": -0.7080479860305786, - "logits/rejected": -0.5974324941635132, - "logps/chosen": -1.382468819618225, - "logps/rejected": -2.398974895477295, - "loss": 0.9083, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.382468819618225, - "rewards/margins": 1.0165059566497803, - "rewards/rejected": -2.398974895477295, - "sft_loss": 1.4537564516067505, + "grad_norm": 6.382501308102608, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": -0.21288824081420898, + "logits/rejected": -0.05509115010499954, + "logps/chosen": -1.3490147590637207, + "logps/rejected": -1.9092353582382202, + "loss": 1.0173, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3490147590637207, + "rewards/margins": 0.5602205395698547, + "rewards/rejected": -1.9092353582382202, + "sft_loss": 1.4373271465301514, "step": 4905 }, { "epoch": 2.627864191336344, - "grad_norm": 9.92835117922593, - "learning_rate": 1.380151681716465e-07, - "logits/chosen": -0.7300471067428589, - "logits/rejected": -0.7779293060302734, - "logps/chosen": -1.3747262954711914, - "logps/rejected": -2.7321619987487793, - "loss": 0.871, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3747262954711914, - "rewards/margins": 1.3574353456497192, - "rewards/rejected": -2.7321619987487793, - "sft_loss": 1.411215901374817, + "grad_norm": 8.782343504072141, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": -0.23608019948005676, + "logits/rejected": -0.2559540271759033, + "logps/chosen": -1.3334243297576904, + "logps/rejected": -2.139165163040161, + "loss": 0.9426, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3334243297576904, + "rewards/margins": 0.8057405352592468, + "rewards/rejected": -2.139165163040161, + "sft_loss": 1.3623217344284058, "step": 4910 }, { "epoch": 2.6305402241177456, - "grad_norm": 10.59200185807618, - "learning_rate": 1.3606416018170502e-07, - "logits/chosen": -0.6524641513824463, - "logits/rejected": -0.5611596703529358, - "logps/chosen": -1.2252461910247803, - "logps/rejected": -2.2592012882232666, - "loss": 0.848, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2252461910247803, - "rewards/margins": 1.0339548587799072, - "rewards/rejected": -2.2592012882232666, - "sft_loss": 1.3374149799346924, + "grad_norm": 10.02999649415756, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": -0.16636885702610016, + "logits/rejected": -0.058062683790922165, + "logps/chosen": -1.2314822673797607, + "logps/rejected": -1.8077586889266968, + "loss": 0.9453, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2314822673797607, + "rewards/margins": 0.5762763023376465, + "rewards/rejected": -1.8077586889266968, + "sft_loss": 1.3330705165863037, "step": 4915 }, { "epoch": 2.6332162568991473, - "grad_norm": 6.598038307883381, - "learning_rate": 1.3412638510326397e-07, - "logits/chosen": -0.6867167949676514, - "logits/rejected": -0.6597640514373779, - "logps/chosen": -1.3008304834365845, - "logps/rejected": -2.2904257774353027, - "loss": 0.8999, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3008304834365845, - "rewards/margins": 0.9895952939987183, - "rewards/rejected": -2.2904257774353027, - "sft_loss": 1.379055380821228, + "grad_norm": 18.166548359394767, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": -0.14276938140392303, + "logits/rejected": -0.05258003622293472, + "logps/chosen": -1.3031578063964844, + "logps/rejected": -1.8209069967269897, + "loss": 1.0079, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3031578063964844, + "rewards/margins": 0.5177491903305054, + "rewards/rejected": -1.8209069967269897, + "sft_loss": 1.3699431419372559, "step": 4920 }, { "epoch": 2.6358922896805486, - "grad_norm": 8.580472421138872, - "learning_rate": 1.3220186173662462e-07, - "logits/chosen": -0.8828998804092407, - "logits/rejected": -0.6815120577812195, - "logps/chosen": -1.2868249416351318, - "logps/rejected": -2.482501268386841, - "loss": 0.8522, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.2868249416351318, - "rewards/margins": 1.1956764459609985, - "rewards/rejected": -2.482501268386841, - "sft_loss": 1.3695834875106812, + "grad_norm": 11.801562049253334, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": -0.36980322003364563, + "logits/rejected": -0.11403369903564453, + "logps/chosen": -1.2860276699066162, + "logps/rejected": -1.892530083656311, + "loss": 0.9628, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2860276699066162, + "rewards/margins": 0.6065024137496948, + "rewards/rejected": -1.892530083656311, + "sft_loss": 1.355703592300415, "step": 4925 }, { "epoch": 2.6385683224619503, - "grad_norm": 8.937724503158526, - "learning_rate": 1.30290608753522e-07, - "logits/chosen": -0.6469255685806274, - "logits/rejected": -0.5224730968475342, - "logps/chosen": -1.3711665868759155, - "logps/rejected": -2.592311382293701, - "loss": 0.8565, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3711665868759155, - "rewards/margins": 1.2211449146270752, - "rewards/rejected": -2.592311382293701, - "sft_loss": 1.385009527206421, + "grad_norm": 9.188807326966096, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": -0.17843613028526306, + "logits/rejected": -0.025499451905488968, + "logps/chosen": -1.4194364547729492, + "logps/rejected": -2.0837531089782715, + "loss": 0.9735, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4194364547729492, + "rewards/margins": 0.6643165349960327, + "rewards/rejected": -2.0837531089782715, + "sft_loss": 1.3846460580825806, "step": 4930 }, { "epoch": 2.6412443552433515, - "grad_norm": 12.928969840134284, - "learning_rate": 1.2839264469694039e-07, - "logits/chosen": -0.7605575323104858, - "logits/rejected": -0.6379026174545288, - "logps/chosen": -1.3274918794631958, - "logps/rejected": -2.2872328758239746, - "loss": 0.9115, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3274918794631958, - "rewards/margins": 0.9597407579421997, - "rewards/rejected": -2.2872328758239746, - "sft_loss": 1.38313627243042, + "grad_norm": 10.722029916958498, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": -0.26976945996284485, + "logits/rejected": -0.10888000577688217, + "logps/chosen": -1.3152145147323608, + "logps/rejected": -1.7785648107528687, + "loss": 1.031, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3152145147323608, + "rewards/margins": 0.46335023641586304, + "rewards/rejected": -1.7785648107528687, + "sft_loss": 1.3738157749176025, "step": 4935 }, { "epoch": 2.6439203880247533, - "grad_norm": 8.512784932615155, - "learning_rate": 1.2650798798093577e-07, - "logits/chosen": -0.7268552780151367, - "logits/rejected": -0.6888422966003418, - "logps/chosen": -1.306331992149353, - "logps/rejected": -2.0488409996032715, - "loss": 0.9097, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.306331992149353, - "rewards/margins": 0.742509126663208, - "rewards/rejected": -2.0488409996032715, - "sft_loss": 1.3654850721359253, + "grad_norm": 6.422919879862994, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": -0.21269111335277557, + "logits/rejected": -0.13011713325977325, + "logps/chosen": -1.3373366594314575, + "logps/rejected": -1.6350433826446533, + "loss": 1.0387, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3373366594314575, + "rewards/margins": 0.29770660400390625, + "rewards/rejected": -1.6350433826446533, + "sft_loss": 1.350295066833496, "step": 4940 }, { "epoch": 2.646596420806155, - "grad_norm": 6.309196448048183, - "learning_rate": 1.2463665689045533e-07, - "logits/chosen": -0.7216583490371704, - "logits/rejected": -0.5565370321273804, - "logps/chosen": -1.2761034965515137, - "logps/rejected": -2.5478756427764893, - "loss": 0.8398, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.2761034965515137, - "rewards/margins": 1.271771788597107, - "rewards/rejected": -2.5478756427764893, - "sft_loss": 1.3512345552444458, + "grad_norm": 7.621540725374221, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": -0.22732429206371307, + "logits/rejected": -0.025684243068099022, + "logps/chosen": -1.304892897605896, + "logps/rejected": -2.0208868980407715, + "loss": 0.9539, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.304892897605896, + "rewards/margins": 0.7159940600395203, + "rewards/rejected": -2.0208868980407715, + "sft_loss": 1.3593806028366089, "step": 4945 }, { "epoch": 2.6492724535875567, - "grad_norm": 9.598407648987838, - "learning_rate": 1.2277866958116207e-07, - "logits/chosen": -0.7417315244674683, - "logits/rejected": -0.6495709419250488, - "logps/chosen": -1.3259822130203247, - "logps/rejected": -2.192164421081543, - "loss": 0.8991, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3259822130203247, - "rewards/margins": 0.8661823272705078, - "rewards/rejected": -2.192164421081543, - "sft_loss": 1.3365287780761719, + "grad_norm": 8.196773582663084, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": -0.2208227664232254, + "logits/rejected": -0.08531433343887329, + "logps/chosen": -1.3079783916473389, + "logps/rejected": -1.8024765253067017, + "loss": 0.9937, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3079783916473389, + "rewards/margins": 0.4944981038570404, + "rewards/rejected": -1.8024765253067017, + "sft_loss": 1.3162072896957397, "step": 4950 }, { "epoch": 2.651948486368958, - "grad_norm": 7.078745272401252, - "learning_rate": 1.2093404407925668e-07, - "logits/chosen": -0.7276274561882019, - "logits/rejected": -0.7417802214622498, - "logps/chosen": -1.3177086114883423, - "logps/rejected": -2.2994585037231445, - "loss": 0.8947, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3177086114883423, - "rewards/margins": 0.9817501306533813, - "rewards/rejected": -2.2994585037231445, - "sft_loss": 1.3993942737579346, + "grad_norm": 9.310700403381741, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": -0.22234082221984863, + "logits/rejected": -0.22068548202514648, + "logps/chosen": -1.3351229429244995, + "logps/rejected": -1.8221311569213867, + "loss": 1.0185, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3351229429244995, + "rewards/margins": 0.48700833320617676, + "rewards/rejected": -1.8221311569213867, + "sft_loss": 1.3827526569366455, "step": 4955 }, { "epoch": 2.6546245191503597, - "grad_norm": 8.760101354046029, - "learning_rate": 1.1910279828130405e-07, - "logits/chosen": -0.623112678527832, - "logits/rejected": -0.5573943853378296, - "logps/chosen": -1.2637913227081299, - "logps/rejected": -2.2288098335266113, - "loss": 0.8648, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2637913227081299, - "rewards/margins": 0.9650182723999023, - "rewards/rejected": -2.2288098335266113, - "sft_loss": 1.303546667098999, + "grad_norm": 8.80574012157952, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": -0.12468080222606659, + "logits/rejected": -0.0179353766143322, + "logps/chosen": -1.3131933212280273, + "logps/rejected": -1.8085600137710571, + "loss": 0.9783, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3131933212280273, + "rewards/margins": 0.4953668713569641, + "rewards/rejected": -1.8085600137710571, + "sft_loss": 1.3201087713241577, "step": 4960 }, { "epoch": 2.657300551931761, - "grad_norm": 9.425834805164445, - "learning_rate": 1.1728494995405876e-07, - "logits/chosen": -0.7425374984741211, - "logits/rejected": -0.6649678945541382, - "logps/chosen": -1.2058379650115967, - "logps/rejected": -2.2518324851989746, - "loss": 0.8527, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2058379650115967, - "rewards/margins": 1.0459946393966675, - "rewards/rejected": -2.2518324851989746, - "sft_loss": 1.3115302324295044, + "grad_norm": 7.285871873326477, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": -0.2541118264198303, + "logits/rejected": -0.12958697974681854, + "logps/chosen": -1.2113314867019653, + "logps/rejected": -1.7473266124725342, + "loss": 0.9582, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2113314867019653, + "rewards/margins": 0.5359951257705688, + "rewards/rejected": -1.7473266124725342, + "sft_loss": 1.306854486465454, "step": 4965 }, { "epoch": 2.6599765847131627, - "grad_norm": 8.040153775855304, - "learning_rate": 1.1548051673429366e-07, - "logits/chosen": -0.6462007164955139, - "logits/rejected": -0.6145657896995544, - "logps/chosen": -1.228861927986145, - "logps/rejected": -2.3314716815948486, - "loss": 0.8266, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.228861927986145, - "rewards/margins": 1.1026098728179932, - "rewards/rejected": -2.3314716815948486, - "sft_loss": 1.2801616191864014, + "grad_norm": 14.073938284183036, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": -0.13873368501663208, + "logits/rejected": -0.06717085093259811, + "logps/chosen": -1.2868283987045288, + "logps/rejected": -1.8329023122787476, + "loss": 0.9415, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2868283987045288, + "rewards/margins": 0.5460739135742188, + "rewards/rejected": -1.8329023122787476, + "sft_loss": 1.2820491790771484, "step": 4970 }, { "epoch": 2.6626526174945644, - "grad_norm": 10.12979124482581, - "learning_rate": 1.136895161286271e-07, - "logits/chosen": -0.704356849193573, - "logits/rejected": -0.6970897912979126, - "logps/chosen": -1.3229091167449951, - "logps/rejected": -2.3172924518585205, - "loss": 0.87, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3229091167449951, - "rewards/margins": 0.9943831562995911, - "rewards/rejected": -2.3172924518585205, - "sft_loss": 1.3527791500091553, + "grad_norm": 9.513931999414476, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": -0.2162855565547943, + "logits/rejected": -0.16296498477458954, + "logps/chosen": -1.3681292533874512, + "logps/rejected": -1.8026014566421509, + "loss": 1.0238, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3681292533874512, + "rewards/margins": 0.4344722628593445, + "rewards/rejected": -1.8026014566421509, + "sft_loss": 1.3663793802261353, "step": 4975 }, { "epoch": 2.665328650275966, - "grad_norm": 9.107056984707933, - "learning_rate": 1.1191196551335547e-07, - "logits/chosen": -0.6133028864860535, - "logits/rejected": -0.5777004957199097, - "logps/chosen": -1.442797303199768, - "logps/rejected": -2.3889315128326416, - "loss": 0.9202, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.442797303199768, - "rewards/margins": 0.9461342096328735, - "rewards/rejected": -2.3889315128326416, - "sft_loss": 1.3789355754852295, + "grad_norm": 8.955292464980037, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": -0.11974634230136871, + "logits/rejected": -0.04977312684059143, + "logps/chosen": -1.4495785236358643, + "logps/rejected": -1.9651418924331665, + "loss": 1.0168, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4495785236358643, + "rewards/margins": 0.5155635476112366, + "rewards/rejected": -1.9651418924331665, + "sft_loss": 1.3548001050949097, "step": 4980 }, { "epoch": 2.6680046830573674, - "grad_norm": 10.841673758953423, - "learning_rate": 1.1014788213428206e-07, - "logits/chosen": -0.681205153465271, - "logits/rejected": -0.5534544587135315, - "logps/chosen": -1.2655446529388428, - "logps/rejected": -2.481712818145752, - "loss": 0.8421, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2655446529388428, - "rewards/margins": 1.2161678075790405, - "rewards/rejected": -2.481712818145752, - "sft_loss": 1.3065024614334106, + "grad_norm": 10.663220628394305, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": -0.15686583518981934, + "logits/rejected": 0.01727641560137272, + "logps/chosen": -1.2853853702545166, + "logps/rejected": -1.9318078756332397, + "loss": 0.9685, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2853853702545166, + "rewards/margins": 0.6464226841926575, + "rewards/rejected": -1.9318078756332397, + "sft_loss": 1.2928669452667236, "step": 4985 }, { "epoch": 2.670680715838769, - "grad_norm": 8.353892759307273, - "learning_rate": 1.08397283106552e-07, - "logits/chosen": -0.8035761713981628, - "logits/rejected": -0.6475404500961304, - "logps/chosen": -1.2617781162261963, - "logps/rejected": -2.3890726566314697, - "loss": 0.8401, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.2617781162261963, - "rewards/margins": 1.1272943019866943, - "rewards/rejected": -2.3890726566314697, - "sft_loss": 1.3372514247894287, + "grad_norm": 11.53115506279403, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": -0.289564311504364, + "logits/rejected": -0.07157912105321884, + "logps/chosen": -1.2882611751556396, + "logps/rejected": -1.8648605346679688, + "loss": 0.9553, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2882611751556396, + "rewards/margins": 0.5765994787216187, + "rewards/rejected": -1.8648605346679688, + "sft_loss": 1.3371655941009521, "step": 4990 }, { "epoch": 2.6733567486201704, - "grad_norm": 11.899115678259255, - "learning_rate": 1.0666018541448442e-07, - "logits/chosen": -0.7440560460090637, - "logits/rejected": -0.7424860000610352, - "logps/chosen": -1.3631649017333984, - "logps/rejected": -2.1925854682922363, - "loss": 0.9325, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3631649017333984, - "rewards/margins": 0.8294209241867065, - "rewards/rejected": -2.1925854682922363, - "sft_loss": 1.4264438152313232, + "grad_norm": 6.390690113174573, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": -0.2623346745967865, + "logits/rejected": -0.24493210017681122, + "logps/chosen": -1.3538005352020264, + "logps/rejected": -1.7726166248321533, + "loss": 1.023, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3538005352020264, + "rewards/margins": 0.418815940618515, + "rewards/rejected": -1.7726166248321533, + "sft_loss": 1.3910917043685913, "step": 4995 }, { "epoch": 2.676032781401572, - "grad_norm": 8.637571793111615, - "learning_rate": 1.0493660591140919e-07, - "logits/chosen": -0.7476707100868225, - "logits/rejected": -0.7413309216499329, - "logps/chosen": -1.3109577894210815, - "logps/rejected": -2.4442172050476074, - "loss": 0.8531, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3109577894210815, - "rewards/margins": 1.133259654045105, - "rewards/rejected": -2.4442172050476074, - "sft_loss": 1.3718430995941162, + "grad_norm": 6.85135820752156, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": -0.22708475589752197, + "logits/rejected": -0.20830416679382324, + "logps/chosen": -1.3401455879211426, + "logps/rejected": -1.891880750656128, + "loss": 0.9958, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3401455879211426, + "rewards/margins": 0.5517350435256958, + "rewards/rejected": -1.891880750656128, + "sft_loss": 1.3599205017089844, "step": 5000 }, { "epoch": 2.678708814182974, - "grad_norm": 6.938299568282176, - "learning_rate": 1.0322656131950165e-07, - "logits/chosen": -0.6714180707931519, - "logits/rejected": -0.6517351865768433, - "logps/chosen": -1.2863237857818604, - "logps/rejected": -2.285841703414917, - "loss": 0.8472, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.2863237857818604, - "rewards/margins": 0.999518096446991, - "rewards/rejected": -2.285841703414917, - "sft_loss": 1.3182358741760254, + "grad_norm": 9.741872318805992, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": -0.1690005362033844, + "logits/rejected": -0.11986930668354034, + "logps/chosen": -1.3109533786773682, + "logps/rejected": -1.815737009048462, + "loss": 0.9727, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3109533786773682, + "rewards/margins": 0.5047835111618042, + "rewards/rejected": -1.815737009048462, + "sft_loss": 1.3245909214019775, "step": 5005 }, { "epoch": 2.6813848469643755, - "grad_norm": 9.804769515443496, - "learning_rate": 1.0153006822962246e-07, - "logits/chosen": -0.6418807506561279, - "logits/rejected": -0.6069843173027039, - "logps/chosen": -1.4389764070510864, - "logps/rejected": -2.4578685760498047, - "loss": 0.9649, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4389764070510864, - "rewards/margins": 1.0188924074172974, - "rewards/rejected": -2.4578685760498047, - "sft_loss": 1.4984257221221924, + "grad_norm": 8.275910236168725, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": -0.14948007464408875, + "logits/rejected": -0.049628086388111115, + "logps/chosen": -1.4493799209594727, + "logps/rejected": -1.9683153629302979, + "loss": 1.0734, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4493799209594727, + "rewards/margins": 0.5189353227615356, + "rewards/rejected": -1.9683153629302979, + "sft_loss": 1.4960986375808716, "step": 5010 }, { "epoch": 2.684060879745777, - "grad_norm": 10.572158439743552, - "learning_rate": 9.984714310115434e-08, - "logits/chosen": -0.7636522650718689, - "logits/rejected": -0.72252357006073, - "logps/chosen": -1.4187383651733398, - "logps/rejected": -2.4914615154266357, - "loss": 0.8708, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.4187383651733398, - "rewards/margins": 1.072723150253296, - "rewards/rejected": -2.4914615154266357, - "sft_loss": 1.3292744159698486, + "grad_norm": 10.495542277964995, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": -0.2823304831981659, + "logits/rejected": -0.21650946140289307, + "logps/chosen": -1.3710321187973022, + "logps/rejected": -1.846612572669983, + "loss": 0.9938, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3710321187973022, + "rewards/margins": 0.47558069229125977, + "rewards/rejected": -1.846612572669983, + "sft_loss": 1.3259201049804688, "step": 5015 }, { "epoch": 2.6867369125271785, - "grad_norm": 23.55508226743853, - "learning_rate": 9.817780226184509e-08, - "logits/chosen": -0.7631696462631226, - "logits/rejected": -0.6109831929206848, - "logps/chosen": -1.2823731899261475, - "logps/rejected": -2.268428087234497, - "loss": 0.8635, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2823731899261475, - "rewards/margins": 0.9860547780990601, - "rewards/rejected": -2.268428087234497, - "sft_loss": 1.319327712059021, + "grad_norm": 13.036528693824037, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": -0.2800753116607666, + "logits/rejected": -0.08348731696605682, + "logps/chosen": -1.2873401641845703, + "logps/rejected": -1.763296127319336, + "loss": 0.9832, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2873401641845703, + "rewards/margins": 0.47595587372779846, + "rewards/rejected": -1.763296127319336, + "sft_loss": 1.3176186084747314, "step": 5020 }, { "epoch": 2.6894129453085798, - "grad_norm": 12.154954085554968, - "learning_rate": 9.652206190764611e-08, - "logits/chosen": -0.7588266134262085, - "logits/rejected": -0.6622456312179565, - "logps/chosen": -1.321897268295288, - "logps/rejected": -2.219815254211426, - "loss": 0.8903, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.321897268295288, - "rewards/margins": 0.8979180455207825, - "rewards/rejected": -2.219815254211426, - "sft_loss": 1.3334851264953613, + "grad_norm": 8.752888024463486, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": -0.2731943428516388, + "logits/rejected": -0.1365620493888855, + "logps/chosen": -1.3452825546264648, + "logps/rejected": -1.8296034336090088, + "loss": 1.0007, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3452825546264648, + "rewards/margins": 0.4843209385871887, + "rewards/rejected": -1.8296034336090088, + "sft_loss": 1.337990164756775, "step": 5025 }, { "epoch": 2.6920889780899815, - "grad_norm": 6.100600505123231, - "learning_rate": 9.487993810255823e-08, - "logits/chosen": -0.7056714296340942, - "logits/rejected": -0.6703065633773804, - "logps/chosen": -1.3172643184661865, - "logps/rejected": -2.510986566543579, - "loss": 0.8796, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3172643184661865, - "rewards/margins": 1.1937217712402344, - "rewards/rejected": -2.510986566543579, - "sft_loss": 1.3384928703308105, + "grad_norm": 6.755409189563525, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": -0.20813412964344025, + "logits/rejected": -0.13779008388519287, + "logps/chosen": -1.3276698589324951, + "logps/rejected": -1.9187484979629517, + "loss": 0.9926, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3276698589324951, + "rewards/margins": 0.5910786390304565, + "rewards/rejected": -1.9187484979629517, + "sft_loss": 1.333012342453003, "step": 5030 }, { "epoch": 2.694765010871383, - "grad_norm": 9.24875569951955, - "learning_rate": 9.325144677847325e-08, - "logits/chosen": -0.7563332915306091, - "logits/rejected": -0.6742985248565674, - "logps/chosen": -1.351980447769165, - "logps/rejected": -2.3332836627960205, - "loss": 0.8956, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.351980447769165, - "rewards/margins": 0.9813033938407898, - "rewards/rejected": -2.3332836627960205, - "sft_loss": 1.4207156896591187, + "grad_norm": 14.67877774359697, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": -0.2622719407081604, + "logits/rejected": -0.13872763514518738, + "logps/chosen": -1.3953707218170166, + "logps/rejected": -1.8387365341186523, + "loss": 1.0281, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3953707218170166, + "rewards/margins": 0.4433657228946686, + "rewards/rejected": -1.8387365341186523, + "sft_loss": 1.4280171394348145, "step": 5035 }, { "epoch": 2.697441043652785, - "grad_norm": 8.06269783556905, - "learning_rate": 9.163660373502158e-08, - "logits/chosen": -0.5251580476760864, - "logits/rejected": -0.5942437648773193, - "logps/chosen": -1.333310604095459, - "logps/rejected": -2.270529270172119, - "loss": 0.8927, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.333310604095459, - "rewards/margins": 0.9372186660766602, - "rewards/rejected": -2.270529270172119, - "sft_loss": 1.3476473093032837, + "grad_norm": 10.452396185649386, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": -0.014820876531302929, + "logits/rejected": -0.058113228529691696, + "logps/chosen": -1.3324902057647705, + "logps/rejected": -1.8379656076431274, + "loss": 0.9951, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3324902057647705, + "rewards/margins": 0.5054755210876465, + "rewards/rejected": -1.8379656076431274, + "sft_loss": 1.342046856880188, "step": 5040 }, { "epoch": 2.700117076434186, - "grad_norm": 15.59237219629854, - "learning_rate": 9.003542463941711e-08, - "logits/chosen": -0.6538597941398621, - "logits/rejected": -0.6683381199836731, - "logps/chosen": -1.2633765935897827, - "logps/rejected": -2.2634286880493164, - "loss": 0.8548, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2633765935897827, - "rewards/margins": 1.0000520944595337, - "rewards/rejected": -2.2634286880493164, - "sft_loss": 1.3013274669647217, + "grad_norm": 10.706514654691368, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": -0.1659650355577469, + "logits/rejected": -0.13472406566143036, + "logps/chosen": -1.2917453050613403, + "logps/rejected": -1.7612578868865967, + "loss": 0.9835, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2917453050613403, + "rewards/margins": 0.4695127606391907, + "rewards/rejected": -1.7612578868865967, + "sft_loss": 1.319317102432251, "step": 5045 }, { "epoch": 2.702793109215588, - "grad_norm": 8.31848775007685, - "learning_rate": 8.844792502630705e-08, - "logits/chosen": -0.7369416356086731, - "logits/rejected": -0.682004988193512, - "logps/chosen": -1.1934149265289307, - "logps/rejected": -2.214322328567505, - "loss": 0.8138, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.1934149265289307, - "rewards/margins": 1.0209074020385742, - "rewards/rejected": -2.214322328567505, - "sft_loss": 1.252170205116272, + "grad_norm": 7.839518418772648, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": -0.22668242454528809, + "logits/rejected": -0.16178184747695923, + "logps/chosen": -1.185306429862976, + "logps/rejected": -1.7473878860473633, + "loss": 0.9152, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.185306429862976, + "rewards/margins": 0.5620813369750977, + "rewards/rejected": -1.7473878860473633, + "sft_loss": 1.2352666854858398, "step": 5050 }, { "epoch": 2.7054691419969896, - "grad_norm": 7.300347450169716, - "learning_rate": 8.687412029761866e-08, - "logits/chosen": -0.8128924369812012, - "logits/rejected": -0.7599584460258484, - "logps/chosen": -1.2087875604629517, - "logps/rejected": -2.244777202606201, - "loss": 0.843, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2087875604629517, - "rewards/margins": 1.03598952293396, - "rewards/rejected": -2.244777202606201, - "sft_loss": 1.281432867050171, + "grad_norm": 10.126397618064189, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": -0.322266548871994, + "logits/rejected": -0.23900584876537323, + "logps/chosen": -1.2218904495239258, + "logps/rejected": -1.7490746974945068, + "loss": 0.9703, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2218904495239258, + "rewards/margins": 0.5271841883659363, + "rewards/rejected": -1.7490746974945068, + "sft_loss": 1.2784899473190308, "step": 5055 }, { "epoch": 2.708145174778391, - "grad_norm": 8.97940117033596, - "learning_rate": 8.531402572241325e-08, - "logits/chosen": -0.7046575546264648, - "logits/rejected": -0.669869065284729, - "logps/chosen": -1.2762138843536377, - "logps/rejected": -2.200396776199341, - "loss": 0.9019, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2762138843536377, - "rewards/margins": 0.9241830110549927, - "rewards/rejected": -2.200396776199341, - "sft_loss": 1.3880784511566162, + "grad_norm": 15.316353952433701, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": -0.1940038502216339, + "logits/rejected": -0.12784381210803986, + "logps/chosen": -1.3368351459503174, + "logps/rejected": -1.7661672830581665, + "loss": 1.0231, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3368351459503174, + "rewards/margins": 0.42933225631713867, + "rewards/rejected": -1.7661672830581665, + "sft_loss": 1.4155434370040894, "step": 5060 }, { "epoch": 2.7108212075597926, - "grad_norm": 9.349225873443023, - "learning_rate": 8.376765643673462e-08, - "logits/chosen": -0.737822413444519, - "logits/rejected": -0.5608315467834473, - "logps/chosen": -1.3172581195831299, - "logps/rejected": -2.219006061553955, - "loss": 0.8674, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3172581195831299, - "rewards/margins": 0.90174800157547, - "rewards/rejected": -2.219006061553955, - "sft_loss": 1.3546133041381836, + "grad_norm": 17.394926694549877, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": -0.23021197319030762, + "logits/rejected": 0.012189358472824097, + "logps/chosen": -1.347095251083374, + "logps/rejected": -1.8356088399887085, + "loss": 0.9938, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.347095251083374, + "rewards/margins": 0.4885135293006897, + "rewards/rejected": -1.8356088399887085, + "sft_loss": 1.3754655122756958, "step": 5065 }, { "epoch": 2.7134972403411943, - "grad_norm": 13.166356373380424, - "learning_rate": 8.223502744346484e-08, - "logits/chosen": -0.6462982892990112, - "logits/rejected": -0.5416436791419983, - "logps/chosen": -1.232508897781372, - "logps/rejected": -2.0593743324279785, - "loss": 0.871, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.232508897781372, - "rewards/margins": 0.8268651962280273, - "rewards/rejected": -2.0593743324279785, - "sft_loss": 1.2882734537124634, + "grad_norm": 11.886876249226376, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": -0.11982126533985138, + "logits/rejected": 0.03776576370000839, + "logps/chosen": -1.2209315299987793, + "logps/rejected": -1.650206208229065, + "loss": 0.9695, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2209315299987793, + "rewards/margins": 0.4292744994163513, + "rewards/rejected": -1.650206208229065, + "sft_loss": 1.2713295221328735, "step": 5070 }, { "epoch": 2.7161732731225956, - "grad_norm": 11.578676537207762, - "learning_rate": 8.071615361217648e-08, - "logits/chosen": -0.6779338121414185, - "logits/rejected": -0.6354584097862244, - "logps/chosen": -1.2588183879852295, - "logps/rejected": -1.9801623821258545, - "loss": 0.9133, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2588183879852295, - "rewards/margins": 0.7213441133499146, - "rewards/rejected": -1.9801623821258545, - "sft_loss": 1.3248854875564575, + "grad_norm": 15.403235946354835, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": -0.18403539061546326, + "logits/rejected": -0.1058233380317688, + "logps/chosen": -1.2756471633911133, + "logps/rejected": -1.5969749689102173, + "loss": 1.0286, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2756471633911133, + "rewards/margins": 0.32132771611213684, + "rewards/rejected": -1.5969749689102173, + "sft_loss": 1.3122966289520264, "step": 5075 }, { "epoch": 2.7188493059039973, - "grad_norm": 11.088766453636934, - "learning_rate": 7.92110496789909e-08, - "logits/chosen": -0.7551140189170837, - "logits/rejected": -0.6507441997528076, - "logps/chosen": -1.3093791007995605, - "logps/rejected": -2.214526414871216, - "loss": 0.8765, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3093791007995605, - "rewards/margins": 0.9051472544670105, - "rewards/rejected": -2.214526414871216, - "sft_loss": 1.3492542505264282, + "grad_norm": 6.884410383974212, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": -0.23184525966644287, + "logits/rejected": -0.09298277646303177, + "logps/chosen": -1.3421884775161743, + "logps/rejected": -1.8177179098129272, + "loss": 0.9939, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3421884775161743, + "rewards/margins": 0.4755295217037201, + "rewards/rejected": -1.8177179098129272, + "sft_loss": 1.3477542400360107, "step": 5080 }, { "epoch": 2.721525338685399, - "grad_norm": 32.57712911465191, - "learning_rate": 7.771973024643241e-08, - "logits/chosen": -0.7945700287818909, - "logits/rejected": -0.7233412861824036, - "logps/chosen": -1.2418968677520752, - "logps/rejected": -2.426748037338257, - "loss": 0.8114, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2418968677520752, - "rewards/margins": 1.184851050376892, - "rewards/rejected": -2.426748037338257, - "sft_loss": 1.2658183574676514, + "grad_norm": 17.033262401083658, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": -0.2996065616607666, + "logits/rejected": -0.1711694747209549, + "logps/chosen": -1.2369778156280518, + "logps/rejected": -1.918890357017517, + "loss": 0.9122, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2369778156280518, + "rewards/margins": 0.6819124817848206, + "rewards/rejected": -1.918890357017517, + "sft_loss": 1.2690715789794922, "step": 5085 }, { "epoch": 2.7242013714668003, - "grad_norm": 11.21305824615193, - "learning_rate": 7.624220978328905e-08, - "logits/chosen": -0.8018733859062195, - "logits/rejected": -0.6985114216804504, - "logps/chosen": -1.3311643600463867, - "logps/rejected": -2.403780221939087, - "loss": 0.8759, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3311643600463867, - "rewards/margins": 1.0726158618927002, - "rewards/rejected": -2.403780221939087, - "sft_loss": 1.3794212341308594, + "grad_norm": 16.15761950655529, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": -0.3268240690231323, + "logits/rejected": -0.1671230047941208, + "logps/chosen": -1.3656808137893677, + "logps/rejected": -1.9123808145523071, + "loss": 1.0189, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3656808137893677, + "rewards/margins": 0.5466999411582947, + "rewards/rejected": -1.9123808145523071, + "sft_loss": 1.3982808589935303, "step": 5090 }, { "epoch": 2.726877404248202, - "grad_norm": 13.044135461749839, - "learning_rate": 7.477850262447056e-08, - "logits/chosen": -0.8470147252082825, - "logits/rejected": -0.6896313428878784, - "logps/chosen": -1.2608357667922974, - "logps/rejected": -2.455876111984253, - "loss": 0.8377, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2608357667922974, - "rewards/margins": 1.1950404644012451, - "rewards/rejected": -2.455876111984253, - "sft_loss": 1.3385980129241943, + "grad_norm": 7.783888684416137, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": -0.3442414402961731, + "logits/rejected": -0.1430184692144394, + "logps/chosen": -1.286874532699585, + "logps/rejected": -1.9325740337371826, + "loss": 0.9666, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.286874532699585, + "rewards/margins": 0.6456994414329529, + "rewards/rejected": -1.9325740337371826, + "sft_loss": 1.3336836099624634, "step": 5095 }, { "epoch": 2.7295534370296037, - "grad_norm": 8.305080535633154, - "learning_rate": 7.332862297087073e-08, - "logits/chosen": -0.6654651761054993, - "logits/rejected": -0.5675476789474487, - "logps/chosen": -1.2314693927764893, - "logps/rejected": -2.658346652984619, - "loss": 0.8194, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2314693927764893, - "rewards/margins": 1.4268771409988403, - "rewards/rejected": -2.658346652984619, - "sft_loss": 1.2853989601135254, + "grad_norm": 10.148748409509505, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": -0.14277896285057068, + "logits/rejected": 0.0038436322938650846, + "logps/chosen": -1.2554153203964233, + "logps/rejected": -2.050020933151245, + "loss": 0.9078, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2554153203964233, + "rewards/margins": 0.7946058511734009, + "rewards/rejected": -2.050020933151245, + "sft_loss": 1.2769153118133545, "step": 5100 }, { "epoch": 2.7322294698110055, - "grad_norm": 10.450685745088888, - "learning_rate": 7.189258488922768e-08, - "logits/chosen": -0.6975101232528687, - "logits/rejected": -0.5831706523895264, - "logps/chosen": -1.3284635543823242, - "logps/rejected": -2.352112293243408, - "loss": 0.8555, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.3284635543823242, - "rewards/margins": 1.023648738861084, - "rewards/rejected": -2.352112293243408, - "sft_loss": 1.3477023839950562, + "grad_norm": 12.320620324937778, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": -0.20719130337238312, + "logits/rejected": -0.04225752875208855, + "logps/chosen": -1.3282561302185059, + "logps/rejected": -1.8030502796173096, + "loss": 0.9795, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3282561302185059, + "rewards/margins": 0.4747942388057709, + "rewards/rejected": -1.8030502796173096, + "sft_loss": 1.3411647081375122, "step": 5105 }, { "epoch": 2.7349055025924067, - "grad_norm": 12.362442246584587, - "learning_rate": 7.047040231198959e-08, - "logits/chosen": -0.7626334428787231, - "logits/rejected": -0.6651133298873901, - "logps/chosen": -1.2891103029251099, - "logps/rejected": -2.2566936016082764, - "loss": 0.8845, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2891103029251099, - "rewards/margins": 0.9675832986831665, - "rewards/rejected": -2.2566936016082764, - "sft_loss": 1.3317053318023682, + "grad_norm": 8.552167395329858, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": -0.2896295487880707, + "logits/rejected": -0.14427712559700012, + "logps/chosen": -1.325205683708191, + "logps/rejected": -1.812347173690796, + "loss": 0.996, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.325205683708191, + "rewards/margins": 0.4871414303779602, + "rewards/rejected": -1.812347173690796, + "sft_loss": 1.332587718963623, "step": 5110 }, { "epoch": 2.7375815353738084, - "grad_norm": 7.641707637535059, - "learning_rate": 6.906208903717787e-08, - "logits/chosen": -0.8081218004226685, - "logits/rejected": -0.6535941362380981, - "logps/chosen": -1.2907568216323853, - "logps/rejected": -2.467261552810669, - "loss": 0.8587, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2907568216323853, - "rewards/margins": 1.1765047311782837, - "rewards/rejected": -2.467261552810669, - "sft_loss": 1.351701021194458, + "grad_norm": 8.47360607592217, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": -0.3206334710121155, + "logits/rejected": -0.10871531069278717, + "logps/chosen": -1.2992509603500366, + "logps/rejected": -1.9598051309585571, + "loss": 0.9699, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2992509603500366, + "rewards/margins": 0.6605542302131653, + "rewards/rejected": -1.9598051309585571, + "sft_loss": 1.3578704595565796, "step": 5115 }, { "epoch": 2.7402575681552097, - "grad_norm": 9.531204664336729, - "learning_rate": 6.76676587282542e-08, - "logits/chosen": -0.7317078709602356, - "logits/rejected": -0.7078748345375061, - "logps/chosen": -1.4110424518585205, - "logps/rejected": -2.46642804145813, - "loss": 0.8744, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.4110424518585205, - "rewards/margins": 1.0553853511810303, - "rewards/rejected": -2.46642804145813, - "sft_loss": 1.419112205505371, + "grad_norm": 12.042241142188479, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": -0.2356669157743454, + "logits/rejected": -0.17199601233005524, + "logps/chosen": -1.4252959489822388, + "logps/rejected": -2.001298189163208, + "loss": 0.9816, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4252959489822388, + "rewards/margins": 0.5760020017623901, + "rewards/rejected": -2.001298189163208, + "sft_loss": 1.4144552946090698, "step": 5120 }, { "epoch": 2.7429336009366114, - "grad_norm": 13.469773428881165, - "learning_rate": 6.628712491398736e-08, - "logits/chosen": -0.8352164030075073, - "logits/rejected": -0.6624903678894043, - "logps/chosen": -1.2661771774291992, - "logps/rejected": -2.2277333736419678, - "loss": 0.8784, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2661771774291992, - "rewards/margins": 0.9615561366081238, - "rewards/rejected": -2.2277333736419678, - "sft_loss": 1.3674657344818115, + "grad_norm": 27.851101467081993, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": -0.34884732961654663, + "logits/rejected": -0.12272150814533234, + "logps/chosen": -1.2981888055801392, + "logps/rejected": -1.770521879196167, + "loss": 1.0054, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2981888055801392, + "rewards/margins": 0.47233304381370544, + "rewards/rejected": -1.770521879196167, + "sft_loss": 1.3615667819976807, "step": 5125 }, { "epoch": 2.745609633718013, - "grad_norm": 9.16478362847294, - "learning_rate": 6.492050098832281e-08, - "logits/chosen": -0.8760141134262085, - "logits/rejected": -0.7233066558837891, - "logps/chosen": -1.2933028936386108, - "logps/rejected": -2.4618935585021973, - "loss": 0.8477, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.2933028936386108, - "rewards/margins": 1.1685909032821655, - "rewards/rejected": -2.4618935585021973, - "sft_loss": 1.3852190971374512, + "grad_norm": 6.017299596428813, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": -0.4026873707771301, + "logits/rejected": -0.20588211715221405, + "logps/chosen": -1.291322112083435, + "logps/rejected": -1.9024931192398071, + "loss": 0.9745, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.291322112083435, + "rewards/margins": 0.6111709475517273, + "rewards/rejected": -1.9024931192398071, + "sft_loss": 1.3745701313018799, "step": 5130 }, { "epoch": 2.748285666499415, - "grad_norm": 13.989699607246886, - "learning_rate": 6.356780021025161e-08, - "logits/chosen": -0.6575424075126648, - "logits/rejected": -0.6311792135238647, - "logps/chosen": -1.3274948596954346, - "logps/rejected": -2.2456724643707275, - "loss": 0.8865, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3274948596954346, - "rewards/margins": 0.9181777238845825, - "rewards/rejected": -2.2456724643707275, - "sft_loss": 1.379867434501648, + "grad_norm": 9.072085408916156, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": -0.15757055580615997, + "logits/rejected": -0.08821666985750198, + "logps/chosen": -1.319150686264038, + "logps/rejected": -1.785361647605896, + "loss": 0.9952, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.319150686264038, + "rewards/margins": 0.4662107825279236, + "rewards/rejected": -1.785361647605896, + "sft_loss": 1.3475697040557861, "step": 5135 }, { "epoch": 2.750961699280816, - "grad_norm": 7.570676290771131, - "learning_rate": 6.222903570368288e-08, - "logits/chosen": -0.6666676998138428, - "logits/rejected": -0.5848277807235718, - "logps/chosen": -1.3754925727844238, - "logps/rejected": -2.2089810371398926, - "loss": 0.9377, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3754925727844238, - "rewards/margins": 0.8334886431694031, - "rewards/rejected": -2.2089810371398926, - "sft_loss": 1.4269959926605225, + "grad_norm": 7.2791490733710225, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": -0.16408401727676392, + "logits/rejected": -0.02998713217675686, + "logps/chosen": -1.3824737071990967, + "logps/rejected": -1.8074893951416016, + "loss": 1.0441, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3824737071990967, + "rewards/margins": 0.42501574754714966, + "rewards/rejected": -1.8074893951416016, + "sft_loss": 1.4203590154647827, "step": 5140 }, { "epoch": 2.753637732062218, - "grad_norm": 10.834118294330352, - "learning_rate": 6.090422045731525e-08, - "logits/chosen": -0.7254561185836792, - "logits/rejected": -0.5795365571975708, - "logps/chosen": -1.2621824741363525, - "logps/rejected": -2.258971691131592, - "loss": 0.8837, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2621824741363525, - "rewards/margins": 0.9967893362045288, - "rewards/rejected": -2.258971691131592, - "sft_loss": 1.325939416885376, + "grad_norm": 9.76020026843918, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": -0.23930442333221436, + "logits/rejected": -0.06129174679517746, + "logps/chosen": -1.2756531238555908, + "logps/rejected": -1.7888301610946655, + "loss": 0.9928, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2756531238555908, + "rewards/margins": 0.5131770968437195, + "rewards/rejected": -1.7888301610946655, + "sft_loss": 1.3264212608337402, "step": 5145 }, { "epoch": 2.756313764843619, - "grad_norm": 10.56441198792611, - "learning_rate": 5.9593367324512593e-08, - "logits/chosen": -0.7619807720184326, - "logits/rejected": -0.6812753081321716, - "logps/chosen": -1.2736539840698242, - "logps/rejected": -2.1827409267425537, - "loss": 0.853, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2736539840698242, - "rewards/margins": 0.9090871810913086, - "rewards/rejected": -2.1827409267425537, - "sft_loss": 1.314415454864502, + "grad_norm": 7.203198297540759, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": -0.30419832468032837, + "logits/rejected": -0.18057909607887268, + "logps/chosen": -1.2716641426086426, + "logps/rejected": -1.7480897903442383, + "loss": 0.9584, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2716641426086426, + "rewards/margins": 0.4764255881309509, + "rewards/rejected": -1.7480897903442383, + "sft_loss": 1.31839120388031, "step": 5150 }, { "epoch": 2.758989797625021, - "grad_norm": 7.411699622579818, - "learning_rate": 5.8296489023177305e-08, - "logits/chosen": -0.7957712411880493, - "logits/rejected": -0.7316339015960693, - "logps/chosen": -1.3173013925552368, - "logps/rejected": -2.193631172180176, - "loss": 0.8966, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3173013925552368, - "rewards/margins": 0.8763298988342285, - "rewards/rejected": -2.193631172180176, - "sft_loss": 1.4033677577972412, + "grad_norm": 7.7771625212828, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": -0.29892662167549133, + "logits/rejected": -0.1932951807975769, + "logps/chosen": -1.3394827842712402, + "logps/rejected": -1.8271324634552002, + "loss": 0.9999, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3394827842712402, + "rewards/margins": 0.4876495897769928, + "rewards/rejected": -1.8271324634552002, + "sft_loss": 1.4048898220062256, "step": 5155 }, { "epoch": 2.7616658304064226, - "grad_norm": 5.6330080840511245, - "learning_rate": 5.7013598135628895e-08, - "logits/chosen": -0.73341304063797, - "logits/rejected": -0.7175203561782837, - "logps/chosen": -1.2750922441482544, - "logps/rejected": -2.3670904636383057, - "loss": 0.8476, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2750922441482544, - "rewards/margins": 1.0919979810714722, - "rewards/rejected": -2.3670904636383057, - "sft_loss": 1.3750401735305786, + "grad_norm": 6.302749082938999, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": -0.25966915488243103, + "logits/rejected": -0.19929108023643494, + "logps/chosen": -1.2809369564056396, + "logps/rejected": -1.862352967262268, + "loss": 0.9495, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2809369564056396, + "rewards/margins": 0.5814159512519836, + "rewards/rejected": -1.862352967262268, + "sft_loss": 1.3612562417984009, "step": 5160 }, { "epoch": 2.7643418631878243, - "grad_norm": 11.765520522447899, - "learning_rate": 5.5744707108479784e-08, - "logits/chosen": -0.7418981790542603, - "logits/rejected": -0.6119092106819153, - "logps/chosen": -1.2623355388641357, - "logps/rejected": -2.2879061698913574, - "loss": 0.8443, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.2623355388641357, - "rewards/margins": 1.0255706310272217, - "rewards/rejected": -2.2879061698913574, - "sft_loss": 1.3115851879119873, + "grad_norm": 9.166286039381573, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": -0.23846633732318878, + "logits/rejected": -0.04908045381307602, + "logps/chosen": -1.2551108598709106, + "logps/rejected": -1.8049733638763428, + "loss": 0.9493, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2551108598709106, + "rewards/margins": 0.5498624444007874, + "rewards/rejected": -1.8049733638763428, + "sft_loss": 1.3031388521194458, "step": 5165 }, { "epoch": 2.7670178959692255, - "grad_norm": 8.656161954299588, - "learning_rate": 5.448982825251686e-08, - "logits/chosen": -0.7418988943099976, - "logits/rejected": -0.6640129089355469, - "logps/chosen": -1.3426659107208252, - "logps/rejected": -2.3372931480407715, - "loss": 0.9129, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3426659107208252, - "rewards/margins": 0.9946274757385254, - "rewards/rejected": -2.3372931480407715, - "sft_loss": 1.417859673500061, + "grad_norm": 6.057023934672095, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": -0.21795444190502167, + "logits/rejected": -0.08611872047185898, + "logps/chosen": -1.3664335012435913, + "logps/rejected": -1.8410437107086182, + "loss": 1.0505, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3664335012435913, + "rewards/margins": 0.4746101498603821, + "rewards/rejected": -1.8410437107086182, + "sft_loss": 1.4286975860595703, "step": 5170 }, { "epoch": 2.7696939287506273, - "grad_norm": 15.682599029347866, - "learning_rate": 5.324897374257959e-08, - "logits/chosen": -0.7691652774810791, - "logits/rejected": -0.7162500619888306, - "logps/chosen": -1.3604718446731567, - "logps/rejected": -2.3567006587982178, - "loss": 0.9071, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3604718446731567, - "rewards/margins": 0.9962291717529297, - "rewards/rejected": -2.3567006587982178, - "sft_loss": 1.3918625116348267, + "grad_norm": 12.377505883238616, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": -0.26580289006233215, + "logits/rejected": -0.15877142548561096, + "logps/chosen": -1.3597075939178467, + "logps/rejected": -1.8588300943374634, + "loss": 1.0197, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3597075939178467, + "rewards/margins": 0.499122679233551, + "rewards/rejected": -1.8588300943374634, + "sft_loss": 1.3698407411575317, "step": 5175 }, { "epoch": 2.7723699615320285, - "grad_norm": 6.815161618210579, - "learning_rate": 5.202215561744461e-08, - "logits/chosen": -0.6741605997085571, - "logits/rejected": -0.6667729616165161, - "logps/chosen": -1.3192849159240723, - "logps/rejected": -2.3011367321014404, - "loss": 0.9035, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3192849159240723, - "rewards/margins": 0.9818517565727234, - "rewards/rejected": -2.3011367321014404, - "sft_loss": 1.397717833518982, + "grad_norm": 10.005247094936804, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": -0.16776493191719055, + "logits/rejected": -0.11533866822719574, + "logps/chosen": -1.3729407787322998, + "logps/rejected": -1.8863117694854736, + "loss": 1.0264, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3729407787322998, + "rewards/margins": 0.5133708715438843, + "rewards/rejected": -1.8863117694854736, + "sft_loss": 1.424452781677246, "step": 5180 }, { "epoch": 2.7750459943134302, - "grad_norm": 8.747190731241487, - "learning_rate": 5.080938577970617e-08, - "logits/chosen": -0.7463937997817993, - "logits/rejected": -0.6645978093147278, - "logps/chosen": -1.219977617263794, - "logps/rejected": -2.464294195175171, - "loss": 0.8587, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.219977617263794, - "rewards/margins": 1.2443166971206665, - "rewards/rejected": -2.464294195175171, - "sft_loss": 1.3175201416015625, + "grad_norm": 8.47032457767384, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": -0.19939279556274414, + "logits/rejected": -0.08026957511901855, + "logps/chosen": -1.2351583242416382, + "logps/rejected": -1.9103492498397827, + "loss": 0.9511, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2351583242416382, + "rewards/margins": 0.675190806388855, + "rewards/rejected": -1.9103492498397827, + "sft_loss": 1.3065307140350342, "step": 5185 }, { "epoch": 2.777722027094832, - "grad_norm": 9.688695160055701, - "learning_rate": 4.961067599566305e-08, - "logits/chosen": -0.8435994386672974, - "logits/rejected": -0.7417990565299988, - "logps/chosen": -1.281051754951477, - "logps/rejected": -2.3396692276000977, - "loss": 0.8574, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.281051754951477, - "rewards/margins": 1.0586177110671997, - "rewards/rejected": -2.3396692276000977, - "sft_loss": 1.3897156715393066, + "grad_norm": 10.78552905594794, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": -0.3108990788459778, + "logits/rejected": -0.14208605885505676, + "logps/chosen": -1.284757137298584, + "logps/rejected": -1.823754906654358, + "loss": 0.9872, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.284757137298584, + "rewards/margins": 0.5389977097511292, + "rewards/rejected": -1.823754906654358, + "sft_loss": 1.3961396217346191, "step": 5190 }, { "epoch": 2.7803980598762337, - "grad_norm": 7.8819553620316025, - "learning_rate": 4.8426037895202277e-08, - "logits/chosen": -0.7271918654441833, - "logits/rejected": -0.6405032873153687, - "logps/chosen": -1.3014874458312988, - "logps/rejected": -2.2847611904144287, - "loss": 0.8617, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3014874458312988, - "rewards/margins": 0.9832738041877747, - "rewards/rejected": -2.2847611904144287, - "sft_loss": 1.3530741930007935, + "grad_norm": 11.737073842324726, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": -0.21096226572990417, + "logits/rejected": -0.0763673335313797, + "logps/chosen": -1.3107333183288574, + "logps/rejected": -1.7774174213409424, + "loss": 0.983, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3107333183288574, + "rewards/margins": 0.4666841924190521, + "rewards/rejected": -1.7774174213409424, + "sft_loss": 1.3360531330108643, "step": 5195 }, { "epoch": 2.783074092657635, - "grad_norm": 11.952336263629748, - "learning_rate": 4.725548297168847e-08, - "logits/chosen": -0.7949420809745789, - "logits/rejected": -0.7019721269607544, - "logps/chosen": -1.2483434677124023, - "logps/rejected": -2.2791337966918945, - "loss": 0.8717, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2483434677124023, - "rewards/margins": 1.0307904481887817, - "rewards/rejected": -2.2791337966918945, - "sft_loss": 1.3429853916168213, + "grad_norm": 11.707716272263704, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": -0.27849727869033813, + "logits/rejected": -0.12080128490924835, + "logps/chosen": -1.269670844078064, + "logps/rejected": -1.8024629354476929, + "loss": 1.0003, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.269670844078064, + "rewards/margins": 0.5327920913696289, + "rewards/rejected": -1.8024629354476929, + "sft_loss": 1.3441965579986572, "step": 5200 }, { "epoch": 2.783074092657635, - "eval_logits/chosen": -0.49635621905326843, - "eval_logits/rejected": -0.4497099220752716, - "eval_logps/chosen": -1.5370229482650757, - "eval_logps/rejected": -2.2062909603118896, - "eval_loss": 1.0438063144683838, - "eval_rewards/accuracies": 0.6468842625617981, - "eval_rewards/chosen": -1.5370229482650757, - "eval_rewards/margins": 0.6692681312561035, - "eval_rewards/rejected": -2.2062909603118896, - "eval_runtime": 42.9859, - "eval_samples_per_second": 31.289, - "eval_sft_loss": 1.4854966402053833, - "eval_steps_per_second": 7.84, + "eval_logits/chosen": 0.11369408667087555, + "eval_logits/rejected": 0.20460036396980286, + "eval_logps/chosen": -1.398575782775879, + "eval_logps/rejected": -1.8438172340393066, + "eval_loss": 1.041297197341919, + "eval_rewards/accuracies": 0.6186943650245667, + "eval_rewards/chosen": -1.398575782775879, + "eval_rewards/margins": 0.4452415108680725, + "eval_rewards/rejected": -1.8438172340393066, + "eval_runtime": 50.9806, + "eval_samples_per_second": 26.383, + "eval_sft_loss": 1.4026674032211304, + "eval_steps_per_second": 6.61, "step": 5200 }, { "epoch": 2.7857501254390367, - "grad_norm": 8.386269839505383, - "learning_rate": 4.609902258185017e-08, - "logits/chosen": -0.6943266987800598, - "logits/rejected": -0.7278541326522827, - "logps/chosen": -1.2799112796783447, - "logps/rejected": -2.147169589996338, - "loss": 0.919, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2799112796783447, - "rewards/margins": 0.8672581911087036, - "rewards/rejected": -2.147169589996338, - "sft_loss": 1.3275152444839478, + "grad_norm": 7.79934160621283, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": -0.16183073818683624, + "logits/rejected": -0.1267414540052414, + "logps/chosen": -1.3054921627044678, + "logps/rejected": -1.7374639511108398, + "loss": 1.042, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3054921627044678, + "rewards/margins": 0.43197187781333923, + "rewards/rejected": -1.7374639511108398, + "sft_loss": 1.331822156906128, "step": 5205 }, { "epoch": 2.788426158220438, - "grad_norm": 8.29830813055329, - "learning_rate": 4.4956667945671496e-08, - "logits/chosen": -0.7525221109390259, - "logits/rejected": -0.7058395147323608, - "logps/chosen": -1.2669576406478882, - "logps/rejected": -2.4628257751464844, - "loss": 0.8309, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2669576406478882, - "rewards/margins": 1.1958680152893066, - "rewards/rejected": -2.4628257751464844, - "sft_loss": 1.3189142942428589, + "grad_norm": 14.886700148689938, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": -0.21894225478172302, + "logits/rejected": -0.12526299059391022, + "logps/chosen": -1.3163914680480957, + "logps/rejected": -1.8971529006958008, + "loss": 0.9634, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3163914680480957, + "rewards/margins": 0.5807615518569946, + "rewards/rejected": -1.8971529006958008, + "sft_loss": 1.3194328546524048, "step": 5210 }, { "epoch": 2.7911021910018396, - "grad_norm": 12.635756214928179, - "learning_rate": 4.382843014628168e-08, - "logits/chosen": -0.7203118205070496, - "logits/rejected": -0.6712735891342163, - "logps/chosen": -1.270727515220642, - "logps/rejected": -2.275433301925659, - "loss": 0.8641, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.270727515220642, - "rewards/margins": 1.004705548286438, - "rewards/rejected": -2.275433301925659, - "sft_loss": 1.3368369340896606, + "grad_norm": 5.547064945350949, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": -0.23416312038898468, + "logits/rejected": -0.13896510004997253, + "logps/chosen": -1.2732197046279907, + "logps/rejected": -1.8109039068222046, + "loss": 0.974, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2732197046279907, + "rewards/margins": 0.5376842021942139, + "rewards/rejected": -1.8109039068222046, + "sft_loss": 1.3302028179168701, "step": 5215 }, { "epoch": 2.7937782237832414, - "grad_norm": 10.654987854599767, - "learning_rate": 4.271432012984938e-08, - "logits/chosen": -0.7634333372116089, - "logits/rejected": -0.7373124361038208, - "logps/chosen": -1.282454252243042, - "logps/rejected": -2.5014779567718506, - "loss": 0.8665, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.282454252243042, - "rewards/margins": 1.2190238237380981, - "rewards/rejected": -2.5014779567718506, - "sft_loss": 1.385162591934204, + "grad_norm": 11.434534387420785, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": -0.2328489124774933, + "logits/rejected": -0.15601322054862976, + "logps/chosen": -1.2758675813674927, + "logps/rejected": -1.9007642269134521, + "loss": 0.991, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2758675813674927, + "rewards/margins": 0.6248966455459595, + "rewards/rejected": -1.9007642269134521, + "sft_loss": 1.3788985013961792, "step": 5220 }, { "epoch": 2.796454256564643, - "grad_norm": 21.230618820765667, - "learning_rate": 4.1614348705474534e-08, - "logits/chosen": -0.689985454082489, - "logits/rejected": -0.5928815603256226, - "logps/chosen": -1.3926451206207275, - "logps/rejected": -2.5388782024383545, - "loss": 0.8896, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3926451206207275, - "rewards/margins": 1.146233320236206, - "rewards/rejected": -2.5388782024383545, - "sft_loss": 1.416735291481018, + "grad_norm": 28.682691199755357, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": -0.16524925827980042, + "logits/rejected": -0.023778708651661873, + "logps/chosen": -1.376541018486023, + "logps/rejected": -1.9343907833099365, + "loss": 1.0012, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.376541018486023, + "rewards/margins": 0.5578497052192688, + "rewards/rejected": -1.9343907833099365, + "sft_loss": 1.3966842889785767, "step": 5225 }, { "epoch": 2.7991302893460444, - "grad_norm": 9.824058265140998, - "learning_rate": 4.052852654508482e-08, - "logits/chosen": -0.8481475710868835, - "logits/rejected": -0.7754641771316528, - "logps/chosen": -1.3186979293823242, - "logps/rejected": -2.2451705932617188, - "loss": 0.8758, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3186979293823242, - "rewards/margins": 0.9264729619026184, - "rewards/rejected": -2.2451705932617188, - "sft_loss": 1.3459546566009521, + "grad_norm": 10.160924466266978, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": -0.3029932677745819, + "logits/rejected": -0.1541920006275177, + "logps/chosen": -1.3313233852386475, + "logps/rejected": -1.7757488489151, + "loss": 0.9956, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3313233852386475, + "rewards/margins": 0.4444255828857422, + "rewards/rejected": -1.7757488489151, + "sft_loss": 1.331716775894165, "step": 5230 }, { "epoch": 2.801806322127446, - "grad_norm": 12.835873812310274, - "learning_rate": 3.9456864183331557e-08, - "logits/chosen": -0.7798875570297241, - "logits/rejected": -0.7219721078872681, - "logps/chosen": -1.3479766845703125, - "logps/rejected": -2.30627703666687, - "loss": 0.873, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3479766845703125, - "rewards/margins": 0.9583007097244263, - "rewards/rejected": -2.30627703666687, - "sft_loss": 1.3579721450805664, + "grad_norm": 7.942468487830706, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": -0.2790893018245697, + "logits/rejected": -0.16795600950717926, + "logps/chosen": -1.3298932313919067, + "logps/rejected": -1.826585054397583, + "loss": 0.9702, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3298932313919067, + "rewards/margins": 0.49669161438941956, + "rewards/rejected": -1.826585054397583, + "sft_loss": 1.3156096935272217, "step": 5235 }, { "epoch": 2.804482354908848, - "grad_norm": 8.458792436519417, - "learning_rate": 3.839937201748744e-08, - "logits/chosen": -0.7576145529747009, - "logits/rejected": -0.5977579951286316, - "logps/chosen": -1.3713600635528564, - "logps/rejected": -2.4814581871032715, - "loss": 0.9009, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3713600635528564, - "rewards/margins": 1.110097885131836, - "rewards/rejected": -2.4814581871032715, - "sft_loss": 1.3729455471038818, + "grad_norm": 11.404925923136052, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": -0.26472631096839905, + "logits/rejected": -0.053306348621845245, + "logps/chosen": -1.34926438331604, + "logps/rejected": -1.8661677837371826, + "loss": 1.0163, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.34926438331604, + "rewards/margins": 0.5169033408164978, + "rewards/rejected": -1.8661677837371826, + "sft_loss": 1.3601570129394531, "step": 5240 }, { "epoch": 2.807158387690249, - "grad_norm": 7.784021685248867, - "learning_rate": 3.735606030734651e-08, - "logits/chosen": -0.7065932154655457, - "logits/rejected": -0.6710800528526306, - "logps/chosen": -1.2697302103042603, - "logps/rejected": -2.2421200275421143, - "loss": 0.897, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.2697302103042603, - "rewards/margins": 0.9723899960517883, - "rewards/rejected": -2.2421200275421143, - "sft_loss": 1.3095524311065674, + "grad_norm": 9.149071007317527, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": -0.17115184664726257, + "logits/rejected": -0.11456646770238876, + "logps/chosen": -1.292288064956665, + "logps/rejected": -1.7565841674804688, + "loss": 1.0066, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.292288064956665, + "rewards/margins": 0.46429625153541565, + "rewards/rejected": -1.7565841674804688, + "sft_loss": 1.303889274597168, "step": 5245 }, { "epoch": 2.8098344204716508, - "grad_norm": 15.526328851790549, - "learning_rate": 3.632693917512331e-08, - "logits/chosen": -0.7905303239822388, - "logits/rejected": -0.6953557729721069, - "logps/chosen": -1.36759352684021, - "logps/rejected": -2.4535601139068604, - "loss": 0.932, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.36759352684021, - "rewards/margins": 1.0859668254852295, - "rewards/rejected": -2.4535601139068604, - "sft_loss": 1.415594458580017, + "grad_norm": 14.640790871582416, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": -0.2621290683746338, + "logits/rejected": -0.1196102723479271, + "logps/chosen": -1.3476142883300781, + "logps/rejected": -1.876542091369629, + "loss": 1.0228, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3476142883300781, + "rewards/margins": 0.5289276838302612, + "rewards/rejected": -1.876542091369629, + "sft_loss": 1.3881253004074097, "step": 5250 }, { "epoch": 2.8125104532530525, - "grad_norm": 11.082167378180044, - "learning_rate": 3.531201860535588e-08, - "logits/chosen": -0.7412352561950684, - "logits/rejected": -0.5843526721000671, - "logps/chosen": -1.3602845668792725, - "logps/rejected": -2.3672142028808594, - "loss": 0.8927, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3602845668792725, - "rewards/margins": 1.006929636001587, - "rewards/rejected": -2.3672142028808594, - "sft_loss": 1.3555505275726318, + "grad_norm": 12.441149190766268, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": -0.2256254404783249, + "logits/rejected": -0.0006707877037115395, + "logps/chosen": -1.341567873954773, + "logps/rejected": -1.817436933517456, + "loss": 1.0167, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.341567873954773, + "rewards/margins": 0.4758690297603607, + "rewards/rejected": -1.817436933517456, + "sft_loss": 1.3392161130905151, "step": 5255 }, { "epoch": 2.8151864860344538, - "grad_norm": 12.359836615146241, - "learning_rate": 3.431130844480762e-08, - "logits/chosen": -0.7371433973312378, - "logits/rejected": -0.7097469568252563, - "logps/chosen": -1.3078466653823853, - "logps/rejected": -2.363670825958252, - "loss": 0.8994, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3078466653823853, - "rewards/margins": 1.0558240413665771, - "rewards/rejected": -2.363670825958252, - "sft_loss": 1.4063899517059326, + "grad_norm": 16.34873624523049, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": -0.19952070713043213, + "logits/rejected": -0.12068512290716171, + "logps/chosen": -1.3146910667419434, + "logps/rejected": -1.8844077587127686, + "loss": 1.0118, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3146910667419434, + "rewards/margins": 0.5697168707847595, + "rewards/rejected": -1.8844077587127686, + "sft_loss": 1.3984930515289307, "step": 5260 }, { "epoch": 2.8178625188158555, - "grad_norm": 8.790703213167431, - "learning_rate": 3.332481840237306e-08, - "logits/chosen": -0.831184983253479, - "logits/rejected": -0.7051724195480347, - "logps/chosen": -1.4858452081680298, - "logps/rejected": -2.5289244651794434, - "loss": 0.935, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4858452081680298, - "rewards/margins": 1.043079137802124, - "rewards/rejected": -2.5289244651794434, - "sft_loss": 1.5154434442520142, + "grad_norm": 9.920009189503926, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": -0.3461669683456421, + "logits/rejected": -0.1222844123840332, + "logps/chosen": -1.5000916719436646, + "logps/rejected": -1.9649989604949951, + "loss": 1.0746, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.5000916719436646, + "rewards/margins": 0.4649074673652649, + "rewards/rejected": -1.9649989604949951, + "sft_loss": 1.5144903659820557, "step": 5265 }, { "epoch": 2.820538551597257, - "grad_norm": 10.28811582940433, - "learning_rate": 3.235255804898307e-08, - "logits/chosen": -0.7143429517745972, - "logits/rejected": -0.6302151083946228, - "logps/chosen": -1.255330204963684, - "logps/rejected": -2.2524116039276123, - "loss": 0.8299, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.255330204963684, - "rewards/margins": 0.9970816373825073, - "rewards/rejected": -2.2524116039276123, - "sft_loss": 1.288132905960083, + "grad_norm": 7.557753564737677, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": -0.21707001328468323, + "logits/rejected": -0.0680532231926918, + "logps/chosen": -1.266671061515808, + "logps/rejected": -1.8083187341690063, + "loss": 0.9414, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.266671061515808, + "rewards/margins": 0.5416474938392639, + "rewards/rejected": -1.8083187341690063, + "sft_loss": 1.2934260368347168, "step": 5270 }, { "epoch": 2.8232145843786585, - "grad_norm": 5.87670585733461, - "learning_rate": 3.1394536817511475e-08, - "logits/chosen": -0.7384647727012634, - "logits/rejected": -0.6223892569541931, - "logps/chosen": -1.3244822025299072, - "logps/rejected": -2.414888381958008, - "loss": 0.845, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3244822025299072, - "rewards/margins": 1.090406060218811, - "rewards/rejected": -2.414888381958008, - "sft_loss": 1.3457294702529907, + "grad_norm": 7.800923952205403, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": -0.19317008554935455, + "logits/rejected": -0.03749823570251465, + "logps/chosen": -1.3289610147476196, + "logps/rejected": -1.878650426864624, + "loss": 0.9596, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3289610147476196, + "rewards/margins": 0.5496894717216492, + "rewards/rejected": -1.878650426864624, + "sft_loss": 1.3256250619888306, "step": 5275 }, { "epoch": 2.82589061716006, - "grad_norm": 7.378821074283863, - "learning_rate": 3.0450764002684926e-08, - "logits/chosen": -0.6905801296234131, - "logits/rejected": -0.5800179839134216, - "logps/chosen": -1.4378401041030884, - "logps/rejected": -2.594846248626709, - "loss": 0.8967, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4378401041030884, - "rewards/margins": 1.1570061445236206, - "rewards/rejected": -2.594846248626709, - "sft_loss": 1.4353234767913818, + "grad_norm": 15.100479281827543, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": -0.2256535291671753, + "logits/rejected": -0.05598093196749687, + "logps/chosen": -1.4376709461212158, + "logps/rejected": -2.014469623565674, + "loss": 1.0343, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4376709461212158, + "rewards/margins": 0.5767990350723267, + "rewards/rejected": -2.014469623565674, + "sft_loss": 1.4365990161895752, "step": 5280 }, { "epoch": 2.828566649941462, - "grad_norm": 9.299567439189415, - "learning_rate": 2.9521248760991158e-08, - "logits/chosen": -0.7614275813102722, - "logits/rejected": -0.7028220891952515, - "logps/chosen": -1.2835723161697388, - "logps/rejected": -2.5371744632720947, - "loss": 0.8219, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.2835723161697388, - "rewards/margins": 1.253602147102356, - "rewards/rejected": -2.5371744632720947, - "sft_loss": 1.306063175201416, + "grad_norm": 14.542188718953193, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": -0.2840477526187897, + "logits/rejected": -0.18323290348052979, + "logps/chosen": -1.2720750570297241, + "logps/rejected": -1.9221827983856201, + "loss": 0.9368, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2720750570297241, + "rewards/margins": 0.6501076221466064, + "rewards/rejected": -1.9221827983856201, + "sft_loss": 1.2888672351837158, "step": 5285 }, { "epoch": 2.8312426827228636, - "grad_norm": 8.787277207068174, - "learning_rate": 2.8606000110591224e-08, - "logits/chosen": -0.7085649371147156, - "logits/rejected": -0.61260586977005, - "logps/chosen": -1.3210874795913696, - "logps/rejected": -2.2309603691101074, - "loss": 0.9032, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3210874795913696, - "rewards/margins": 0.9098728895187378, - "rewards/rejected": -2.2309603691101074, - "sft_loss": 1.3892929553985596, + "grad_norm": 12.497966768346453, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": -0.21206767857074738, + "logits/rejected": -0.06609787046909332, + "logps/chosen": -1.3235752582550049, + "logps/rejected": -1.765629768371582, + "loss": 1.0268, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3235752582550049, + "rewards/margins": 0.44205451011657715, + "rewards/rejected": -1.765629768371582, + "sft_loss": 1.3885878324508667, "step": 5290 }, { "epoch": 2.833918715504265, - "grad_norm": 8.989265420899805, - "learning_rate": 2.770502693123139e-08, - "logits/chosen": -0.7841325998306274, - "logits/rejected": -0.6407720446586609, - "logps/chosen": -1.4084599018096924, - "logps/rejected": -2.4518253803253174, - "loss": 0.9106, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.4084599018096924, - "rewards/margins": 1.043365716934204, - "rewards/rejected": -2.4518253803253174, - "sft_loss": 1.4504055976867676, + "grad_norm": 4.858483991361914, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": -0.293332576751709, + "logits/rejected": -0.09560652077198029, + "logps/chosen": -1.422760009765625, + "logps/rejected": -1.9979000091552734, + "loss": 1.0268, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.422760009765625, + "rewards/margins": 0.5751398801803589, + "rewards/rejected": -1.9979000091552734, + "sft_loss": 1.4550334215164185, "step": 5295 }, { "epoch": 2.8365947482856666, - "grad_norm": 7.355372041676527, - "learning_rate": 2.6818337964157726e-08, - "logits/chosen": -0.749289870262146, - "logits/rejected": -0.7474431991577148, - "logps/chosen": -1.3349918127059937, - "logps/rejected": -2.496898889541626, - "loss": 0.8718, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3349918127059937, - "rewards/margins": 1.1619070768356323, - "rewards/rejected": -2.496898889541626, - "sft_loss": 1.3594180345535278, + "grad_norm": 9.142542353654257, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": -0.23723764717578888, + "logits/rejected": -0.1911575049161911, + "logps/chosen": -1.3420722484588623, + "logps/rejected": -1.9288721084594727, + "loss": 0.986, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3420722484588623, + "rewards/margins": 0.5867999196052551, + "rewards/rejected": -1.9288721084594727, + "sft_loss": 1.3467390537261963, "step": 5300 }, { "epoch": 2.839270781067068, - "grad_norm": 10.195419419106704, - "learning_rate": 2.5945941812029973e-08, - "logits/chosen": -0.7281011343002319, - "logits/rejected": -0.6409409642219543, - "logps/chosen": -1.372420310974121, - "logps/rejected": -2.2830145359039307, - "loss": 0.934, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.372420310974121, - "rewards/margins": 0.9105939865112305, - "rewards/rejected": -2.2830145359039307, - "sft_loss": 1.4592607021331787, + "grad_norm": 11.931970720363225, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": -0.2269028127193451, + "logits/rejected": -0.08750694990158081, + "logps/chosen": -1.360556960105896, + "logps/rejected": -1.8920097351074219, + "loss": 1.0135, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.360556960105896, + "rewards/margins": 0.5314527750015259, + "rewards/rejected": -1.8920097351074219, + "sft_loss": 1.4528709650039673, "step": 5305 }, { "epoch": 2.8419468138484696, - "grad_norm": 10.851374419235961, - "learning_rate": 2.5087846938839976e-08, - "logits/chosen": -0.8489446640014648, - "logits/rejected": -0.6587695479393005, - "logps/chosen": -1.2906196117401123, - "logps/rejected": -2.5040271282196045, - "loss": 0.863, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2906196117401123, - "rewards/margins": 1.2134075164794922, - "rewards/rejected": -2.5040271282196045, - "sft_loss": 1.3395346403121948, + "grad_norm": 6.881015565821513, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": -0.3738090693950653, + "logits/rejected": -0.11932375282049179, + "logps/chosen": -1.277724027633667, + "logps/rejected": -1.9794114828109741, + "loss": 0.9526, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.277724027633667, + "rewards/margins": 0.701687216758728, + "rewards/rejected": -1.9794114828109741, + "sft_loss": 1.326658844947815, "step": 5310 }, { "epoch": 2.8446228466298713, - "grad_norm": 10.678429689134765, - "learning_rate": 2.42440616698274e-08, - "logits/chosen": -0.6294640302658081, - "logits/rejected": -0.5675816535949707, - "logps/chosen": -1.2999883890151978, - "logps/rejected": -2.193481922149658, - "loss": 0.8862, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2999883890151978, - "rewards/margins": 0.8934933543205261, - "rewards/rejected": -2.193481922149658, - "sft_loss": 1.374890685081482, + "grad_norm": 11.822782857418908, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": -0.12437667697668076, + "logits/rejected": -0.001963780727237463, + "logps/chosen": -1.3134911060333252, + "logps/rejected": -1.7377033233642578, + "loss": 0.9931, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3134911060333252, + "rewards/margins": 0.4242123067378998, + "rewards/rejected": -1.7377033233642578, + "sft_loss": 1.3535457849502563, "step": 5315 }, { "epoch": 2.847298879411273, - "grad_norm": 7.048939436252608, - "learning_rate": 2.3414594191401128e-08, - "logits/chosen": -0.657579779624939, - "logits/rejected": -0.6235214471817017, - "logps/chosen": -1.2714077234268188, - "logps/rejected": -2.1557183265686035, - "loss": 0.8777, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2714077234268188, - "rewards/margins": 0.8843106031417847, - "rewards/rejected": -2.1557183265686035, - "sft_loss": 1.2815296649932861, + "grad_norm": 7.61898243948295, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": -0.1452607810497284, + "logits/rejected": -0.06799864023923874, + "logps/chosen": -1.2906428575515747, + "logps/rejected": -1.701115369796753, + "loss": 0.9981, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2906428575515747, + "rewards/margins": 0.41047239303588867, + "rewards/rejected": -1.701115369796753, + "sft_loss": 1.2899892330169678, "step": 5320 }, { "epoch": 2.8499749121926743, - "grad_norm": 10.24828223806133, - "learning_rate": 2.2599452551057998e-08, - "logits/chosen": -0.6916013956069946, - "logits/rejected": -0.5996135473251343, - "logps/chosen": -1.3933923244476318, - "logps/rejected": -2.5210020542144775, - "loss": 0.8655, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3933923244476318, - "rewards/margins": 1.1276098489761353, - "rewards/rejected": -2.5210020542144775, - "sft_loss": 1.4164988994598389, + "grad_norm": 7.697647234000022, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": -0.18606527149677277, + "logits/rejected": -0.041612427681684494, + "logps/chosen": -1.3773971796035767, + "logps/rejected": -1.968849539756775, + "loss": 0.9795, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3773971796035767, + "rewards/margins": 0.5914527177810669, + "rewards/rejected": -1.968849539756775, + "sft_loss": 1.3904547691345215, "step": 5325 }, { "epoch": 2.852650944974076, - "grad_norm": 14.831723599703547, - "learning_rate": 2.1798644657305857e-08, - "logits/chosen": -0.6822465062141418, - "logits/rejected": -0.6394556164741516, - "logps/chosen": -1.2614357471466064, - "logps/rejected": -2.4309158325195312, - "loss": 0.8739, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2614357471466064, - "rewards/margins": 1.1694800853729248, - "rewards/rejected": -2.4309158325195312, - "sft_loss": 1.3675239086151123, + "grad_norm": 9.375056323214285, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": -0.14773902297019958, + "logits/rejected": -0.07479329407215118, + "logps/chosen": -1.2629927396774292, + "logps/rejected": -1.8598219156265259, + "loss": 0.9804, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2629927396774292, + "rewards/margins": 0.5968291759490967, + "rewards/rejected": -1.8598219156265259, + "sft_loss": 1.350647211074829, "step": 5330 }, { "epoch": 2.8553269777554773, - "grad_norm": 9.26451309003692, - "learning_rate": 2.1012178279586293e-08, - "logits/chosen": -0.6579137444496155, - "logits/rejected": -0.6982561349868774, - "logps/chosen": -1.275101900100708, - "logps/rejected": -2.031686782836914, - "loss": 0.9113, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.275101900100708, - "rewards/margins": 0.7565848231315613, - "rewards/rejected": -2.031686782836914, - "sft_loss": 1.2985403537750244, + "grad_norm": 15.012322470417251, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": -0.11493344604969025, + "logits/rejected": -0.13788627088069916, + "logps/chosen": -1.3153281211853027, + "logps/rejected": -1.6620845794677734, + "loss": 1.0355, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3153281211853027, + "rewards/margins": 0.3467564284801483, + "rewards/rejected": -1.6620845794677734, + "sft_loss": 1.3026233911514282, "step": 5335 }, { "epoch": 2.858003010536879, - "grad_norm": 14.334556180897604, - "learning_rate": 2.02400610481997e-08, - "logits/chosen": -0.684406578540802, - "logits/rejected": -0.6869625449180603, - "logps/chosen": -1.3207378387451172, - "logps/rejected": -2.1499147415161133, - "loss": 0.891, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3207378387451172, - "rewards/margins": 0.8291767239570618, - "rewards/rejected": -2.1499147415161133, - "sft_loss": 1.3247309923171997, + "grad_norm": 12.17512336395831, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": -0.16038838028907776, + "logits/rejected": -0.12360197305679321, + "logps/chosen": -1.3261483907699585, + "logps/rejected": -1.7351865768432617, + "loss": 0.9976, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3261483907699585, + "rewards/margins": 0.40903812646865845, + "rewards/rejected": -1.7351865768432617, + "sft_loss": 1.319050908088684, "step": 5340 }, { "epoch": 2.8606790433182807, - "grad_norm": 7.025935672166714, - "learning_rate": 1.948230045423083e-08, - "logits/chosen": -0.8215829730033875, - "logits/rejected": -0.6989858150482178, - "logps/chosen": -1.2560803890228271, - "logps/rejected": -2.2751643657684326, - "loss": 0.823, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.2560803890228271, - "rewards/margins": 1.0190842151641846, - "rewards/rejected": -2.2751643657684326, - "sft_loss": 1.3089632987976074, + "grad_norm": 4.8655149717993105, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": -0.3073890507221222, + "logits/rejected": -0.12378803640604019, + "logps/chosen": -1.2660466432571411, + "logps/rejected": -1.792517900466919, + "loss": 0.9352, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2660466432571411, + "rewards/margins": 0.5264711380004883, + "rewards/rejected": -1.792517900466919, + "sft_loss": 1.3165446519851685, "step": 5345 }, { "epoch": 2.8633550760996824, - "grad_norm": 10.410442099194597, - "learning_rate": 1.8738903849476186e-08, - "logits/chosen": -0.6630287170410156, - "logits/rejected": -0.7192557454109192, - "logps/chosen": -1.3754017353057861, - "logps/rejected": -2.25844669342041, - "loss": 0.9245, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3754017353057861, - "rewards/margins": 0.8830450177192688, - "rewards/rejected": -2.25844669342041, - "sft_loss": 1.3733446598052979, + "grad_norm": 7.823717651654698, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": -0.15731899440288544, + "logits/rejected": -0.17895503342151642, + "logps/chosen": -1.3960539102554321, + "logps/rejected": -1.7920535802841187, + "loss": 1.0472, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3960539102554321, + "rewards/margins": 0.3959996700286865, + "rewards/rejected": -1.7920535802841187, + "sft_loss": 1.3741943836212158, "step": 5350 }, { "epoch": 2.8660311088810837, - "grad_norm": 21.160878628046465, - "learning_rate": 1.8009878446373083e-08, - "logits/chosen": -0.7304435968399048, - "logits/rejected": -0.6887432336807251, - "logps/chosen": -1.3410518169403076, - "logps/rejected": -2.280390977859497, - "loss": 0.9104, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3410518169403076, - "rewards/margins": 0.9393390417098999, - "rewards/rejected": -2.280390977859497, - "sft_loss": 1.3831353187561035, + "grad_norm": 10.189124237163066, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": -0.23294535279273987, + "logits/rejected": -0.14471940696239471, + "logps/chosen": -1.3473031520843506, + "logps/rejected": -1.8221263885498047, + "loss": 1.0107, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3473031520843506, + "rewards/margins": 0.4748232960700989, + "rewards/rejected": -1.8221263885498047, + "sft_loss": 1.3688666820526123, "step": 5355 }, { "epoch": 2.8687071416624854, - "grad_norm": 12.445287099306254, - "learning_rate": 1.729523131792887e-08, - "logits/chosen": -0.7037655115127563, - "logits/rejected": -0.5557045936584473, - "logps/chosen": -1.3285108804702759, - "logps/rejected": -2.325253963470459, - "loss": 0.9054, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3285108804702759, - "rewards/margins": 0.996743381023407, - "rewards/rejected": -2.325253963470459, - "sft_loss": 1.4038374423980713, + "grad_norm": 12.1538866560791, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": -0.205631285905838, + "logits/rejected": -0.008093352429568768, + "logps/chosen": -1.3273183107376099, + "logps/rejected": -1.844382643699646, + "loss": 1.0101, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3273183107376099, + "rewards/margins": 0.5170644521713257, + "rewards/rejected": -1.844382643699646, + "sft_loss": 1.3940155506134033, "step": 5360 }, { "epoch": 2.8713831744438867, - "grad_norm": 8.398774874498331, - "learning_rate": 1.6594969397653316e-08, - "logits/chosen": -0.775101363658905, - "logits/rejected": -0.6764003038406372, - "logps/chosen": -1.3454972505569458, - "logps/rejected": -2.4815573692321777, - "loss": 0.8661, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3454972505569458, - "rewards/margins": 1.136060357093811, - "rewards/rejected": -2.4815573692321777, - "sft_loss": 1.4039502143859863, + "grad_norm": 8.240418561246397, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": -0.27251097559928894, + "logits/rejected": -0.10316653549671173, + "logps/chosen": -1.3439300060272217, + "logps/rejected": -1.923710823059082, + "loss": 0.9782, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3439300060272217, + "rewards/margins": 0.5797806978225708, + "rewards/rejected": -1.923710823059082, + "sft_loss": 1.3883768320083618, "step": 5365 }, { "epoch": 2.8740592072252884, - "grad_norm": 9.084274857994668, - "learning_rate": 1.5909099479490653e-08, - "logits/chosen": -0.6702944040298462, - "logits/rejected": -0.6756909489631653, - "logps/chosen": -1.3259707689285278, - "logps/rejected": -2.08377742767334, - "loss": 0.9366, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3259707689285278, - "rewards/margins": 0.7578068375587463, - "rewards/rejected": -2.08377742767334, - "sft_loss": 1.3615634441375732, + "grad_norm": 8.628519080580295, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": -0.13890625536441803, + "logits/rejected": -0.10441068559885025, + "logps/chosen": -1.3346256017684937, + "logps/rejected": -1.661921739578247, + "loss": 1.0487, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3346256017684937, + "rewards/margins": 0.3272959589958191, + "rewards/rejected": -1.661921739578247, + "sft_loss": 1.3400561809539795, "step": 5370 }, { "epoch": 2.87673524000669, - "grad_norm": 10.559950123474135, - "learning_rate": 1.5237628217753818e-08, - "logits/chosen": -0.7217191457748413, - "logits/rejected": -0.679119884967804, - "logps/chosen": -1.2826389074325562, - "logps/rejected": -2.4561946392059326, - "loss": 0.8817, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2826389074325562, - "rewards/margins": 1.1735559701919556, - "rewards/rejected": -2.4561946392059326, - "sft_loss": 1.3704030513763428, + "grad_norm": 7.675909185183875, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": -0.19847624003887177, + "logits/rejected": -0.10837669670581818, + "logps/chosen": -1.29621160030365, + "logps/rejected": -1.9675801992416382, + "loss": 0.9917, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.29621160030365, + "rewards/margins": 0.6713687181472778, + "rewards/rejected": -1.9675801992416382, + "sft_loss": 1.3709373474121094, "step": 5375 }, { "epoch": 2.879411272788092, - "grad_norm": 11.42448679848264, - "learning_rate": 1.4580562127059994e-08, - "logits/chosen": -0.7507520914077759, - "logits/rejected": -0.5890139937400818, - "logps/chosen": -1.43178129196167, - "logps/rejected": -2.6365647315979004, - "loss": 0.8829, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.43178129196167, - "rewards/margins": 1.2047832012176514, - "rewards/rejected": -2.6365647315979004, - "sft_loss": 1.4908645153045654, + "grad_norm": 8.00604306816368, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": -0.25982865691185, + "logits/rejected": -0.041426219046115875, + "logps/chosen": -1.4465641975402832, + "logps/rejected": -2.004077911376953, + "loss": 1.0053, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4465641975402832, + "rewards/margins": 0.5575135350227356, + "rewards/rejected": -2.004077911376953, + "sft_loss": 1.4928638935089111, "step": 5380 }, { "epoch": 2.882087305569493, - "grad_norm": 7.472431729509608, - "learning_rate": 1.3937907582267151e-08, - "logits/chosen": -0.6748173236846924, - "logits/rejected": -0.6350966095924377, - "logps/chosen": -1.2746866941452026, - "logps/rejected": -2.2813777923583984, - "loss": 0.8562, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.2746866941452026, - "rewards/margins": 1.0066910982131958, - "rewards/rejected": -2.2813777923583984, - "sft_loss": 1.3331294059753418, + "grad_norm": 6.391211316990939, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": -0.1506432145833969, + "logits/rejected": -0.06374558061361313, + "logps/chosen": -1.3080568313598633, + "logps/rejected": -1.7327619791030884, + "loss": 0.9912, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3080568313598633, + "rewards/margins": 0.42470502853393555, + "rewards/rejected": -1.7327619791030884, + "sft_loss": 1.3371647596359253, "step": 5385 }, { "epoch": 2.884763338350895, - "grad_norm": 11.134637185940472, - "learning_rate": 1.3309670818412446e-08, - "logits/chosen": -0.7103601694107056, - "logits/rejected": -0.6358092427253723, - "logps/chosen": -1.3732722997665405, - "logps/rejected": -2.3026444911956787, - "loss": 0.8945, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3732722997665405, - "rewards/margins": 0.9293721914291382, - "rewards/rejected": -2.3026444911956787, - "sft_loss": 1.4193050861358643, + "grad_norm": 8.583832219329038, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": -0.19303080439567566, + "logits/rejected": -0.07137370109558105, + "logps/chosen": -1.3770928382873535, + "logps/rejected": -1.7824920415878296, + "loss": 1.0351, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3770928382873535, + "rewards/margins": 0.4053993225097656, + "rewards/rejected": -1.7824920415878296, + "sft_loss": 1.4261677265167236, "step": 5390 }, { "epoch": 2.887439371132296, - "grad_norm": 9.166135528851378, - "learning_rate": 1.2695857930651921e-08, - "logits/chosen": -0.8693428039550781, - "logits/rejected": -0.7057567834854126, - "logps/chosen": -1.2538936138153076, - "logps/rejected": -2.3252408504486084, - "loss": 0.8164, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.2538936138153076, - "rewards/margins": 1.0713472366333008, - "rewards/rejected": -2.3252408504486084, - "sft_loss": 1.3077678680419922, + "grad_norm": 9.393850330779484, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": -0.3502636253833771, + "logits/rejected": -0.14578810334205627, + "logps/chosen": -1.2524877786636353, + "logps/rejected": -1.7917158603668213, + "loss": 0.9224, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2524877786636353, + "rewards/margins": 0.5392279028892517, + "rewards/rejected": -1.7917158603668213, + "sft_loss": 1.2977898120880127, "step": 5395 }, { "epoch": 2.890115403913698, - "grad_norm": 6.076040614686364, - "learning_rate": 1.2096474874200735e-08, - "logits/chosen": -0.7578507661819458, - "logits/rejected": -0.5721181631088257, - "logps/chosen": -1.3199083805084229, - "logps/rejected": -2.6234889030456543, - "loss": 0.8339, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3199083805084229, - "rewards/margins": 1.3035802841186523, - "rewards/rejected": -2.6234889030456543, - "sft_loss": 1.34604811668396, + "grad_norm": 8.113996362616188, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": -0.25449585914611816, + "logits/rejected": 0.0005387455457821488, + "logps/chosen": -1.3301738500595093, + "logps/rejected": -2.1094958782196045, + "loss": 0.9356, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3301738500595093, + "rewards/margins": 0.7793217897415161, + "rewards/rejected": -2.1094958782196045, + "sft_loss": 1.334867238998413, "step": 5400 }, { "epoch": 2.8927914366950995, - "grad_norm": 8.014889456039318, - "learning_rate": 1.1511527464276194e-08, - "logits/chosen": -0.6812966465950012, - "logits/rejected": -0.6842519640922546, - "logps/chosen": -1.414838194847107, - "logps/rejected": -2.4990859031677246, - "loss": 0.8919, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -1.414838194847107, - "rewards/margins": 1.0842478275299072, - "rewards/rejected": -2.4990859031677246, - "sft_loss": 1.4632160663604736, + "grad_norm": 7.448239479874202, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": -0.15206363797187805, + "logits/rejected": -0.10554593801498413, + "logps/chosen": -1.4665147066116333, + "logps/rejected": -1.9461534023284912, + "loss": 1.0438, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4665147066116333, + "rewards/margins": 0.4796389043331146, + "rewards/rejected": -1.9461534023284912, + "sft_loss": 1.4484292268753052, "step": 5405 }, { "epoch": 2.8954674694765012, - "grad_norm": 5.700276972210124, - "learning_rate": 1.0941021376040305e-08, - "logits/chosen": -0.6988664865493774, - "logits/rejected": -0.6308866739273071, - "logps/chosen": -1.3098427057266235, - "logps/rejected": -2.4279820919036865, - "loss": 0.8976, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.3098427057266235, - "rewards/margins": 1.1181391477584839, - "rewards/rejected": -2.4279820919036865, - "sft_loss": 1.3720207214355469, + "grad_norm": 5.693368841094515, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": -0.14152638614177704, + "logits/rejected": -0.011924123391509056, + "logps/chosen": -1.3331220149993896, + "logps/rejected": -2.045759677886963, + "loss": 0.9854, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3331220149993896, + "rewards/margins": 0.712637722492218, + "rewards/rejected": -2.045759677886963, + "sft_loss": 1.3652178049087524, "step": 5410 }, { "epoch": 2.8981435022579025, - "grad_norm": 8.648526708006992, - "learning_rate": 1.0384962144545818e-08, - "logits/chosen": -0.7574768662452698, - "logits/rejected": -0.6414147615432739, - "logps/chosen": -1.3453338146209717, - "logps/rejected": -2.267055034637451, - "loss": 0.9057, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3453338146209717, - "rewards/margins": 0.9217211008071899, - "rewards/rejected": -2.267055034637451, - "sft_loss": 1.4229142665863037, + "grad_norm": 6.693209813625698, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": -0.25429773330688477, + "logits/rejected": -0.08107346296310425, + "logps/chosen": -1.3584120273590088, + "logps/rejected": -1.7874362468719482, + "loss": 1.023, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3584120273590088, + "rewards/margins": 0.42902421951293945, + "rewards/rejected": -1.7874362468719482, + "sft_loss": 1.411090612411499, "step": 5415 }, { "epoch": 2.9008195350393042, - "grad_norm": 9.044442234302466, - "learning_rate": 9.843355164681767e-09, - "logits/chosen": -0.7178055047988892, - "logits/rejected": -0.6718038320541382, - "logps/chosen": -1.2801578044891357, - "logps/rejected": -2.220691680908203, - "loss": 0.8944, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2801578044891357, - "rewards/margins": 0.9405338168144226, - "rewards/rejected": -2.220691680908203, - "sft_loss": 1.3160831928253174, + "grad_norm": 7.726299637751278, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": -0.18577158451080322, + "logits/rejected": -0.11131541430950165, + "logps/chosen": -1.2643746137619019, + "logps/rejected": -1.72647225856781, + "loss": 0.9986, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2643746137619019, + "rewards/margins": 0.46209773421287537, + "rewards/rejected": -1.72647225856781, + "sft_loss": 1.2958446741104126, "step": 5420 }, { "epoch": 2.903495567820706, - "grad_norm": 9.813349046424374, - "learning_rate": 9.316205691121515e-09, - "logits/chosen": -0.7028988599777222, - "logits/rejected": -0.6428855657577515, - "logps/chosen": -1.3650051355361938, - "logps/rejected": -2.4545657634735107, - "loss": 0.8675, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3650051355361938, - "rewards/margins": 1.0895609855651855, - "rewards/rejected": -2.4545657634735107, - "sft_loss": 1.3854036331176758, + "grad_norm": 10.809253240200837, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": -0.1767970770597458, + "logits/rejected": -0.03920573741197586, + "logps/chosen": -1.3743648529052734, + "logps/rejected": -1.9125168323516846, + "loss": 0.9875, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3743648529052734, + "rewards/margins": 0.5381518602371216, + "rewards/rejected": -1.9125168323516846, + "sft_loss": 1.3702685832977295, "step": 5425 }, { "epoch": 2.906171600602107, - "grad_norm": 10.624268559701752, - "learning_rate": 8.803518838271463e-09, - "logits/chosen": -0.7422298192977905, - "logits/rejected": -0.6166383028030396, - "logps/chosen": -1.3253414630889893, - "logps/rejected": -2.4383633136749268, - "loss": 0.84, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.3253414630889893, - "rewards/margins": 1.1130220890045166, - "rewards/rejected": -2.4383633136749268, - "sft_loss": 1.3755017518997192, + "grad_norm": 8.151264446447978, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": -0.20741066336631775, + "logits/rejected": -0.03023364581167698, + "logps/chosen": -1.3413336277008057, + "logps/rejected": -1.9218580722808838, + "loss": 0.9577, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3413336277008057, + "rewards/margins": 0.5805243849754333, + "rewards/rejected": -1.9218580722808838, + "sft_loss": 1.3622300624847412, "step": 5430 }, { "epoch": 2.908847633383509, - "grad_norm": 8.741716309198573, - "learning_rate": 8.305299580221748e-09, - "logits/chosen": -0.8222736120223999, - "logits/rejected": -0.7537822723388672, - "logps/chosen": -1.273291826248169, - "logps/rejected": -2.3553757667541504, - "loss": 0.8519, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.273291826248169, - "rewards/margins": 1.082083821296692, - "rewards/rejected": -2.3553757667541504, - "sft_loss": 1.3592718839645386, + "grad_norm": 9.717039548252071, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": -0.30634820461273193, + "logits/rejected": -0.1974448412656784, + "logps/chosen": -1.2954027652740479, + "logps/rejected": -1.873197317123413, + "loss": 0.9574, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2954027652740479, + "rewards/margins": 0.5777946710586548, + "rewards/rejected": -1.873197317123413, + "sft_loss": 1.3467910289764404, "step": 5435 }, { "epoch": 2.9115236661649107, - "grad_norm": 43.86752680722663, - "learning_rate": 7.821552750697958e-09, - "logits/chosen": -0.7530517578125, - "logits/rejected": -0.6569010019302368, - "logps/chosen": -1.3031551837921143, - "logps/rejected": -2.2156496047973633, - "loss": 0.9016, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3031551837921143, - "rewards/margins": 0.9124943017959595, - "rewards/rejected": -2.2156496047973633, - "sft_loss": 1.390745997428894, + "grad_norm": 16.560899956258325, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": -0.25282543897628784, + "logits/rejected": -0.13164076209068298, + "logps/chosen": -1.3265902996063232, + "logps/rejected": -1.8097198009490967, + "loss": 1.0067, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3265902996063232, + "rewards/margins": 0.4831293523311615, + "rewards/rejected": -1.8097198009490967, + "sft_loss": 1.3786664009094238, "step": 5440 }, { "epoch": 2.9141996989463124, - "grad_norm": 7.838655429793774, - "learning_rate": 7.3522830430136635e-09, - "logits/chosen": -0.5364211797714233, - "logits/rejected": -0.5294418931007385, - "logps/chosen": -1.3198676109313965, - "logps/rejected": -2.8044791221618652, - "loss": 0.8214, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3198676109313965, - "rewards/margins": 1.4846115112304688, - "rewards/rejected": -2.8044791221618652, - "sft_loss": 1.3252191543579102, + "grad_norm": 5.990699359674066, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": -0.011542147025465965, + "logits/rejected": 0.005524858832359314, + "logps/chosen": -1.3121052980422974, + "logps/rejected": -2.1332852840423584, + "loss": 0.9434, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3121052980422974, + "rewards/margins": 0.8211800456047058, + "rewards/rejected": -2.1332852840423584, + "sft_loss": 1.3157904148101807, "step": 5445 }, { "epoch": 2.9168757317277136, - "grad_norm": 9.352079097559933, - "learning_rate": 6.897495010025956e-09, - "logits/chosen": -0.6472350358963013, - "logits/rejected": -0.581468939781189, - "logps/chosen": -1.3410637378692627, - "logps/rejected": -2.4063053131103516, - "loss": 0.8721, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3410637378692627, - "rewards/margins": 1.0652415752410889, - "rewards/rejected": -2.4063053131103516, - "sft_loss": 1.3743155002593994, + "grad_norm": 12.619599450749567, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": -0.09076647460460663, + "logits/rejected": 0.011687842197716236, + "logps/chosen": -1.3570753335952759, + "logps/rejected": -1.9621295928955078, + "loss": 0.9804, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3570753335952759, + "rewards/margins": 0.6050541996955872, + "rewards/rejected": -1.9621295928955078, + "sft_loss": 1.368789792060852, "step": 5450 }, { "epoch": 2.9195517645091154, - "grad_norm": 10.857747514119612, - "learning_rate": 6.4571930640899835e-09, - "logits/chosen": -0.7794079780578613, - "logits/rejected": -0.6243848204612732, - "logps/chosen": -1.3348716497421265, - "logps/rejected": -2.204594135284424, - "loss": 0.9159, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3348716497421265, - "rewards/margins": 0.8697225451469421, - "rewards/rejected": -2.204594135284424, - "sft_loss": 1.3703720569610596, + "grad_norm": 7.831340047201271, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": -0.23645658791065216, + "logits/rejected": -0.062863290309906, + "logps/chosen": -1.3370959758758545, + "logps/rejected": -1.778796911239624, + "loss": 1.0223, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3370959758758545, + "rewards/margins": 0.44170108437538147, + "rewards/rejected": -1.778796911239624, + "sft_loss": 1.366168737411499, "step": 5455 }, { "epoch": 2.9222277972905166, - "grad_norm": 9.561445401671, - "learning_rate": 6.0313814770174836e-09, - "logits/chosen": -0.7363497614860535, - "logits/rejected": -0.675298810005188, - "logps/chosen": -1.3376637697219849, - "logps/rejected": -2.3302359580993652, - "loss": 0.9026, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3376637697219849, - "rewards/margins": 0.9925724267959595, - "rewards/rejected": -2.3302359580993652, - "sft_loss": 1.426440715789795, + "grad_norm": 10.40011225408446, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": -0.22707609832286835, + "logits/rejected": -0.1244509220123291, + "logps/chosen": -1.340163230895996, + "logps/rejected": -1.9151283502578735, + "loss": 0.9828, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.340163230895996, + "rewards/margins": 0.5749651193618774, + "rewards/rejected": -1.9151283502578735, + "sft_loss": 1.402766466140747, "step": 5460 }, { "epoch": 2.9249038300719183, - "grad_norm": 13.182686708577272, - "learning_rate": 5.620064380033985e-09, - "logits/chosen": -0.7910588979721069, - "logits/rejected": -0.6436706185340881, - "logps/chosen": -1.403331995010376, - "logps/rejected": -2.356701374053955, - "loss": 0.8904, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.403331995010376, - "rewards/margins": 0.9533694386482239, - "rewards/rejected": -2.356701374053955, - "sft_loss": 1.4023396968841553, + "grad_norm": 8.544508926159043, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": -0.2974518835544586, + "logits/rejected": -0.10885222256183624, + "logps/chosen": -1.3845881223678589, + "logps/rejected": -1.844294786453247, + "loss": 1.0019, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3845881223678589, + "rewards/margins": 0.45970669388771057, + "rewards/rejected": -1.844294786453247, + "sft_loss": 1.3685612678527832, "step": 5465 }, { "epoch": 2.92757986285332, - "grad_norm": 30.005707028226283, - "learning_rate": 5.22324576374017e-09, - "logits/chosen": -0.7054346203804016, - "logits/rejected": -0.6411979794502258, - "logps/chosen": -1.2877601385116577, - "logps/rejected": -2.226062059402466, - "loss": 0.8736, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2877601385116577, - "rewards/margins": 0.9383017420768738, - "rewards/rejected": -2.226062059402466, - "sft_loss": 1.3081693649291992, + "grad_norm": 10.556707029804059, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": -0.15490223467350006, + "logits/rejected": -0.06371744722127914, + "logps/chosen": -1.2789368629455566, + "logps/rejected": -1.7651761770248413, + "loss": 0.9691, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2789368629455566, + "rewards/margins": 0.4862392544746399, + "rewards/rejected": -1.7651761770248413, + "sft_loss": 1.267244577407837, "step": 5470 }, { "epoch": 2.9302558956347218, - "grad_norm": 9.514563994159747, - "learning_rate": 4.840929478071576e-09, - "logits/chosen": -0.6584609746932983, - "logits/rejected": -0.7336791753768921, - "logps/chosen": -1.247133493423462, - "logps/rejected": -2.2455291748046875, - "loss": 0.8666, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.247133493423462, - "rewards/margins": 0.9983956217765808, - "rewards/rejected": -2.2455291748046875, - "sft_loss": 1.3202401399612427, + "grad_norm": 16.86907330565408, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": -0.14799180626869202, + "logits/rejected": -0.1888667643070221, + "logps/chosen": -1.2474968433380127, + "logps/rejected": -1.7585350275039673, + "loss": 0.9819, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2474968433380127, + "rewards/margins": 0.5110381841659546, + "rewards/rejected": -1.7585350275039673, + "sft_loss": 1.3227674961090088, "step": 5475 }, { "epoch": 2.932931928416123, - "grad_norm": 16.65588273040812, - "learning_rate": 4.47311923226279e-09, - "logits/chosen": -0.7217914462089539, - "logits/rejected": -0.6552537679672241, - "logps/chosen": -1.321942687034607, - "logps/rejected": -2.1904098987579346, - "loss": 0.9158, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.321942687034607, - "rewards/margins": 0.868466854095459, - "rewards/rejected": -2.1904098987579346, - "sft_loss": 1.4025959968566895, + "grad_norm": 8.141074083564165, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": -0.20287397503852844, + "logits/rejected": -0.09254992008209229, + "logps/chosen": -1.3348429203033447, + "logps/rejected": -1.7784709930419922, + "loss": 1.0147, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3348429203033447, + "rewards/margins": 0.4436280131340027, + "rewards/rejected": -1.7784709930419922, + "sft_loss": 1.3954198360443115, "step": 5480 }, { "epoch": 2.9356079611975248, - "grad_norm": 8.627080547408749, - "learning_rate": 4.119818594810476e-09, - "logits/chosen": -0.6494681239128113, - "logits/rejected": -0.5289129018783569, - "logps/chosen": -1.2953909635543823, - "logps/rejected": -2.254089832305908, - "loss": 0.8834, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.2953909635543823, - "rewards/margins": 0.9586986303329468, - "rewards/rejected": -2.254089832305908, - "sft_loss": 1.3672096729278564, + "grad_norm": 9.844583759460882, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": -0.1399911642074585, + "logits/rejected": 0.027894001454114914, + "logps/chosen": -1.2917319536209106, + "logps/rejected": -1.8091261386871338, + "loss": 0.9802, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2917319536209106, + "rewards/margins": 0.5173942446708679, + "rewards/rejected": -1.8091261386871338, + "sft_loss": 1.3570094108581543, "step": 5485 }, { "epoch": 2.938283993978926, - "grad_norm": 11.394554940053027, - "learning_rate": 3.781030993438573e-09, - "logits/chosen": -0.726294994354248, - "logits/rejected": -0.7348896265029907, - "logps/chosen": -1.2835429906845093, - "logps/rejected": -2.3471598625183105, - "loss": 0.8917, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2835429906845093, - "rewards/margins": 1.0636169910430908, - "rewards/rejected": -2.3471598625183105, - "sft_loss": 1.3875757455825806, + "grad_norm": 10.753233019071727, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": -0.22496263682842255, + "logits/rejected": -0.18973574042320251, + "logps/chosen": -1.2890931367874146, + "logps/rejected": -1.7965633869171143, + "loss": 0.9967, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2890931367874146, + "rewards/margins": 0.5074703693389893, + "rewards/rejected": -1.7965633869171143, + "sft_loss": 1.371532678604126, "step": 5490 }, { "epoch": 2.9409600267603278, - "grad_norm": 8.284869142815294, - "learning_rate": 3.4567597150663155e-09, - "logits/chosen": -0.811735987663269, - "logits/rejected": -0.6471236944198608, - "logps/chosen": -1.2578198909759521, - "logps/rejected": -2.3566677570343018, - "loss": 0.834, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -1.2578198909759521, - "rewards/margins": 1.0988481044769287, - "rewards/rejected": -2.3566677570343018, - "sft_loss": 1.3185293674468994, + "grad_norm": 14.825230069609155, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": -0.2755940854549408, + "logits/rejected": -0.07098913192749023, + "logps/chosen": -1.267250418663025, + "logps/rejected": -1.843408226966858, + "loss": 0.9524, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.267250418663025, + "rewards/margins": 0.5761579275131226, + "rewards/rejected": -1.843408226966858, + "sft_loss": 1.3246123790740967, "step": 5495 }, { "epoch": 2.9436360595417295, - "grad_norm": 7.885958963716733, - "learning_rate": 3.147007905774768e-09, - "logits/chosen": -0.667231023311615, - "logits/rejected": -0.601595401763916, - "logps/chosen": -1.3667452335357666, - "logps/rejected": -2.311988592147827, - "loss": 0.9335, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3667452335357666, - "rewards/margins": 0.9452434778213501, - "rewards/rejected": -2.311988592147827, - "sft_loss": 1.391640305519104, + "grad_norm": 8.70591824815608, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": -0.16660761833190918, + "logits/rejected": -0.049916822463274, + "logps/chosen": -1.386541485786438, + "logps/rejected": -1.805859923362732, + "loss": 1.0579, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.386541485786438, + "rewards/margins": 0.41931843757629395, + "rewards/rejected": -1.805859923362732, + "sft_loss": 1.4034945964813232, "step": 5500 }, { "epoch": 2.946312092323131, - "grad_norm": 17.05972389177579, - "learning_rate": 2.851778570777508e-09, - "logits/chosen": -0.6465167999267578, - "logits/rejected": -0.6802867650985718, - "logps/chosen": -1.351560354232788, - "logps/rejected": -2.2371137142181396, - "loss": 0.8927, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.351560354232788, - "rewards/margins": 0.8855533599853516, - "rewards/rejected": -2.2371137142181396, - "sft_loss": 1.376105546951294, + "grad_norm": 10.87677449432048, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": -0.12635710835456848, + "logits/rejected": -0.11135026067495346, + "logps/chosen": -1.346790075302124, + "logps/rejected": -1.7952775955200195, + "loss": 1.0127, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.346790075302124, + "rewards/margins": 0.4484872817993164, + "rewards/rejected": -1.7952775955200195, + "sft_loss": 1.376283884048462, "step": 5505 }, { "epoch": 2.9489881251045325, - "grad_norm": 11.071659410964866, - "learning_rate": 2.5710745743908192e-09, - "logits/chosen": -0.7512981295585632, - "logits/rejected": -0.6664119958877563, - "logps/chosen": -1.331237554550171, - "logps/rejected": -2.683913469314575, - "loss": 0.8378, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.331237554550171, - "rewards/margins": 1.3526757955551147, - "rewards/rejected": -2.683913469314575, - "sft_loss": 1.3605666160583496, + "grad_norm": 12.222939159055558, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": -0.2253093421459198, + "logits/rejected": -0.10633256286382675, + "logps/chosen": -1.342167615890503, + "logps/rejected": -2.1339049339294434, + "loss": 0.9421, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.342167615890503, + "rewards/margins": 0.79173743724823, + "rewards/rejected": -2.1339049339294434, + "sft_loss": 1.3634456396102905, "step": 5510 }, { "epoch": 2.951664157885934, - "grad_norm": 10.604952132522218, - "learning_rate": 2.304898640006048e-09, - "logits/chosen": -0.7933308482170105, - "logits/rejected": -0.6962018013000488, - "logps/chosen": -1.2568343877792358, - "logps/rejected": -2.4431662559509277, - "loss": 0.8777, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2568343877792358, - "rewards/margins": 1.186332106590271, - "rewards/rejected": -2.4431662559509277, - "sft_loss": 1.3736810684204102, + "grad_norm": 38.57193275680594, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": -0.2967481017112732, + "logits/rejected": -0.15350167453289032, + "logps/chosen": -1.3245993852615356, + "logps/rejected": -1.9612815380096436, + "loss": 1.0, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3245993852615356, + "rewards/margins": 0.6366821527481079, + "rewards/rejected": -1.9612815380096436, + "sft_loss": 1.37442147731781, "step": 5515 }, { "epoch": 2.9543401906673354, - "grad_norm": 10.915979227431771, - "learning_rate": 2.0532533500631225e-09, - "logits/chosen": -0.6986032128334045, - "logits/rejected": -0.6873008012771606, - "logps/chosen": -1.2808072566986084, - "logps/rejected": -2.178983449935913, - "loss": 0.8949, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2808072566986084, - "rewards/margins": 0.8981763124465942, - "rewards/rejected": -2.178983449935913, - "sft_loss": 1.3327544927597046, + "grad_norm": 10.18404680838618, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": -0.18095073103904724, + "logits/rejected": -0.12094295024871826, + "logps/chosen": -1.290932297706604, + "logps/rejected": -1.7183071374893188, + "loss": 1.01, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.290932297706604, + "rewards/margins": 0.42737483978271484, + "rewards/rejected": -1.7183071374893188, + "sft_loss": 1.324407935142517, "step": 5520 }, { "epoch": 2.957016223448737, - "grad_norm": 15.710577343617024, - "learning_rate": 1.8161411460262401e-09, - "logits/chosen": -0.7469355463981628, - "logits/rejected": -0.6454081535339355, - "logps/chosen": -1.3567637205123901, - "logps/rejected": -2.6256422996520996, - "loss": 0.8532, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3567637205123901, - "rewards/margins": 1.2688785791397095, - "rewards/rejected": -2.6256422996520996, - "sft_loss": 1.385542631149292, + "grad_norm": 9.147815224087866, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": -0.21934518218040466, + "logits/rejected": -0.09977545589208603, + "logps/chosen": -1.3488126993179321, + "logps/rejected": -1.9975097179412842, + "loss": 0.974, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3488126993179321, + "rewards/margins": 0.6486971378326416, + "rewards/rejected": -1.9975097179412842, + "sft_loss": 1.367114782333374, "step": 5525 }, { "epoch": 2.959692256230139, - "grad_norm": 7.801556195681117, - "learning_rate": 1.5935643283585545e-09, - "logits/chosen": -0.7634005546569824, - "logits/rejected": -0.5943415760993958, - "logps/chosen": -1.4132963418960571, - "logps/rejected": -2.3316051959991455, - "loss": 0.9049, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.4132963418960571, - "rewards/margins": 0.9183089137077332, - "rewards/rejected": -2.3316051959991455, - "sft_loss": 1.4461170434951782, + "grad_norm": 10.109138141206513, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": -0.27003321051597595, + "logits/rejected": -0.04579021781682968, + "logps/chosen": -1.4224035739898682, + "logps/rejected": -1.8239049911499023, + "loss": 1.0401, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4224035739898682, + "rewards/margins": 0.40150150656700134, + "rewards/rejected": -1.8239049911499023, + "sft_loss": 1.4241381883621216, "step": 5530 }, { "epoch": 2.9623682890115406, - "grad_norm": 11.690526948729639, - "learning_rate": 1.3855250565015244e-09, - "logits/chosen": -0.732628583908081, - "logits/rejected": -0.73069167137146, - "logps/chosen": -1.2804659605026245, - "logps/rejected": -2.1868271827697754, - "loss": 0.9143, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2804659605026245, - "rewards/margins": 0.906360924243927, - "rewards/rejected": -2.1868271827697754, - "sft_loss": 1.358859658241272, + "grad_norm": 12.128324494377605, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": -0.20196180045604706, + "logits/rejected": -0.15698209404945374, + "logps/chosen": -1.295867681503296, + "logps/rejected": -1.7471435070037842, + "loss": 1.0313, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.295867681503296, + "rewards/margins": 0.45127612352371216, + "rewards/rejected": -1.7471435070037842, + "sft_loss": 1.3786756992340088, "step": 5535 }, { "epoch": 2.965044321792942, - "grad_norm": 6.098410094413616, - "learning_rate": 1.1920253488530986e-09, - "logits/chosen": -0.8392502069473267, - "logits/rejected": -0.7265399098396301, - "logps/chosen": -1.337065577507019, - "logps/rejected": -2.251086950302124, - "loss": 0.8909, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.337065577507019, - "rewards/margins": 0.9140211939811707, - "rewards/rejected": -2.251086950302124, - "sft_loss": 1.3136518001556396, + "grad_norm": 11.33136853984457, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": -0.3168911635875702, + "logits/rejected": -0.15934643149375916, + "logps/chosen": -1.3818787336349487, + "logps/rejected": -1.8020331859588623, + "loss": 1.0248, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3818787336349487, + "rewards/margins": 0.4201543927192688, + "rewards/rejected": -1.8020331859588623, + "sft_loss": 1.336635947227478, "step": 5540 }, { "epoch": 2.9677203545743436, - "grad_norm": 17.435180135775486, - "learning_rate": 1.0130670827482314e-09, - "logits/chosen": -0.724540114402771, - "logits/rejected": -0.6843401789665222, - "logps/chosen": -1.2680495977401733, - "logps/rejected": -2.138017416000366, - "loss": 0.8746, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2680495977401733, - "rewards/margins": 0.8699675798416138, - "rewards/rejected": -2.138017416000366, - "sft_loss": 1.3009154796600342, + "grad_norm": 12.551932178201476, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": -0.21031954884529114, + "logits/rejected": -0.09710313379764557, + "logps/chosen": -1.2512221336364746, + "logps/rejected": -1.681840181350708, + "loss": 0.9822, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2512221336364746, + "rewards/margins": 0.4306180477142334, + "rewards/rejected": -1.681840181350708, + "sft_loss": 1.3028347492218018, "step": 5545 }, { "epoch": 2.970396387355745, - "grad_norm": 7.694728883809673, - "learning_rate": 8.4865199444073e-10, - "logits/chosen": -0.6581524610519409, - "logits/rejected": -0.5862508416175842, - "logps/chosen": -1.3748828172683716, - "logps/rejected": -2.388700485229492, - "loss": 0.8962, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3748828172683716, - "rewards/margins": 1.0138174295425415, - "rewards/rejected": -2.388700485229492, - "sft_loss": 1.4283702373504639, + "grad_norm": 6.543632163538888, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": -0.12163994461297989, + "logits/rejected": -0.00886031985282898, + "logps/chosen": -1.377091646194458, + "logps/rejected": -1.926664113998413, + "loss": 1.0066, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.377091646194458, + "rewards/margins": 0.5495725274085999, + "rewards/rejected": -1.926664113998413, + "sft_loss": 1.4350866079330444, "step": 5550 }, { "epoch": 2.9730724201371466, - "grad_norm": 13.217299665435, - "learning_rate": 6.987816790866019e-10, - "logits/chosen": -0.7316558361053467, - "logits/rejected": -0.5861397981643677, - "logps/chosen": -1.4063125848770142, - "logps/rejected": -2.501913547515869, - "loss": 0.9166, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.4063125848770142, - "rewards/margins": 1.0956008434295654, - "rewards/rejected": -2.501913547515869, - "sft_loss": 1.4392328262329102, + "grad_norm": 11.995615591283471, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": -0.22242912650108337, + "logits/rejected": -0.008000977337360382, + "logps/chosen": -1.4153683185577393, + "logps/rejected": -1.983817458152771, + "loss": 1.0265, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4153683185577393, + "rewards/margins": 0.568449079990387, + "rewards/rejected": -1.983817458152771, + "sft_loss": 1.4314537048339844, "step": 5555 }, { "epoch": 2.9757484529185483, - "grad_norm": 11.587722175371816, - "learning_rate": 5.634575907284001e-10, - "logits/chosen": -0.6962507963180542, - "logits/rejected": -0.718319296836853, - "logps/chosen": -1.3200267553329468, - "logps/rejected": -2.1147451400756836, - "loss": 0.9395, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3200267553329468, - "rewards/margins": 0.7947185635566711, - "rewards/rejected": -2.1147451400756836, - "sft_loss": 1.4130859375, + "grad_norm": 13.761928785661214, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": -0.14891259372234344, + "logits/rejected": -0.1366450935602188, + "logps/chosen": -1.354978322982788, + "logps/rejected": -1.6971426010131836, + "loss": 1.0707, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.354978322982788, + "rewards/margins": 0.3421640992164612, + "rewards/rejected": -1.6971426010131836, + "sft_loss": 1.4287164211273193, "step": 5560 }, { "epoch": 2.97842448569995, - "grad_norm": 9.125117910711788, - "learning_rate": 4.426810422809013e-10, - "logits/chosen": -0.7751784324645996, - "logits/rejected": -0.7629925608634949, - "logps/chosen": -1.2659528255462646, - "logps/rejected": -2.2525923252105713, - "loss": 0.878, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2659528255462646, - "rewards/margins": 0.9866394996643066, - "rewards/rejected": -2.2525923252105713, - "sft_loss": 1.3184149265289307, + "grad_norm": 9.61519890066985, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": -0.2660152316093445, + "logits/rejected": -0.21024027466773987, + "logps/chosen": -1.3015344142913818, + "logps/rejected": -1.741166353225708, + "loss": 1.0001, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3015344142913818, + "rewards/margins": 0.4396318793296814, + "rewards/rejected": -1.741166353225708, + "sft_loss": 1.3129467964172363, "step": 5565 }, { "epoch": 2.9811005184813513, - "grad_norm": 9.994838583436563, - "learning_rate": 3.36453205518783e-10, - "logits/chosen": -0.7178460359573364, - "logits/rejected": -0.6534903645515442, - "logps/chosen": -1.2815896272659302, - "logps/rejected": -2.5624938011169434, - "loss": 0.8331, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2815896272659302, - "rewards/margins": 1.2809040546417236, - "rewards/rejected": -2.5624938011169434, - "sft_loss": 1.349057912826538, + "grad_norm": 8.956481812838017, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": -0.18123656511306763, + "logits/rejected": -0.045481182634830475, + "logps/chosen": -1.2856550216674805, + "logps/rejected": -2.00970721244812, + "loss": 0.95, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2856550216674805, + "rewards/margins": 0.7240523099899292, + "rewards/rejected": -2.00970721244812, + "sft_loss": 1.3504688739776611, "step": 5570 }, { "epoch": 2.983776551262753, - "grad_norm": 10.932926394809643, - "learning_rate": 2.447751110647989e-10, - "logits/chosen": -0.7367149591445923, - "logits/rejected": -0.6385723352432251, - "logps/chosen": -1.2736537456512451, - "logps/rejected": -2.4375550746917725, - "loss": 0.8645, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.2736537456512451, - "rewards/margins": 1.1639012098312378, - "rewards/rejected": -2.4375550746917725, - "sft_loss": 1.3699533939361572, + "grad_norm": 7.346514532996517, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": -0.2186519205570221, + "logits/rejected": -0.06879440695047379, + "logps/chosen": -1.2897922992706299, + "logps/rejected": -1.8777774572372437, + "loss": 0.9716, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2897922992706299, + "rewards/margins": 0.5879851579666138, + "rewards/rejected": -1.8777774572372437, + "sft_loss": 1.3677794933319092, "step": 5575 }, { "epoch": 2.9864525840441547, - "grad_norm": 7.8711121475014325, - "learning_rate": 1.6764764838045342e-10, - "logits/chosen": -0.8285869359970093, - "logits/rejected": -0.63426673412323, - "logps/chosen": -1.2631531953811646, - "logps/rejected": -2.3292622566223145, - "loss": 0.8533, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2631531953811646, - "rewards/margins": 1.0661091804504395, - "rewards/rejected": -2.3292622566223145, - "sft_loss": 1.32146418094635, + "grad_norm": 7.005036649911722, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": -0.3067344129085541, + "logits/rejected": -0.051812849938869476, + "logps/chosen": -1.2479729652404785, + "logps/rejected": -1.8473840951919556, + "loss": 0.9648, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2479729652404785, + "rewards/margins": 0.599411129951477, + "rewards/rejected": -1.8473840951919556, + "sft_loss": 1.3181512355804443, "step": 5580 }, { "epoch": 2.989128616825556, - "grad_norm": 6.485888323981134, - "learning_rate": 1.0507156575650934e-10, - "logits/chosen": -0.8164284825325012, - "logits/rejected": -0.6994372010231018, - "logps/chosen": -1.3235969543457031, - "logps/rejected": -2.549525260925293, - "loss": 0.8687, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.3235969543457031, - "rewards/margins": 1.225928544998169, - "rewards/rejected": -2.549525260925293, - "sft_loss": 1.4341703653335571, + "grad_norm": 5.346289798835596, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": -0.2633610963821411, + "logits/rejected": -0.09879405796527863, + "logps/chosen": -1.3504259586334229, + "logps/rejected": -1.948297142982483, + "loss": 0.9951, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3504259586334229, + "rewards/margins": 0.5978710651397705, + "rewards/rejected": -1.948297142982483, + "sft_loss": 1.4504520893096924, "step": 5585 }, { "epoch": 2.9918046496069577, - "grad_norm": 8.46512671115117, - "learning_rate": 5.7047470306659246e-11, - "logits/chosen": -0.7358273267745972, - "logits/rejected": -0.7163251638412476, - "logps/chosen": -1.3835117816925049, - "logps/rejected": -2.445448398590088, - "loss": 0.9048, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3835117816925049, - "rewards/margins": 1.061936616897583, - "rewards/rejected": -2.445448398590088, - "sft_loss": 1.3738961219787598, + "grad_norm": 6.752210598255841, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": -0.18967227637767792, + "logits/rejected": -0.1351352035999298, + "logps/chosen": -1.3926022052764893, + "logps/rejected": -1.875583291053772, + "loss": 1.0273, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3926022052764893, + "rewards/margins": 0.48298120498657227, + "rewards/rejected": -1.875583291053772, + "sft_loss": 1.3738386631011963, "step": 5590 }, { "epoch": 2.9944806823883594, - "grad_norm": 7.695148971648861, - "learning_rate": 2.3575827960697906e-11, - "logits/chosen": -0.7481427192687988, - "logits/rejected": -0.6636000871658325, - "logps/chosen": -1.2466938495635986, - "logps/rejected": -2.349560499191284, - "loss": 0.826, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.2466938495635986, - "rewards/margins": 1.1028664112091064, - "rewards/rejected": -2.349560499191284, - "sft_loss": 1.316910743713379, + "grad_norm": 14.99272495049364, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": -0.21403300762176514, + "logits/rejected": -0.07583233714103699, + "logps/chosen": -1.2375662326812744, + "logps/rejected": -1.7883220911026, + "loss": 0.9561, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2375662326812744, + "rewards/margins": 0.5507559180259705, + "rewards/rejected": -1.7883220911026, + "sft_loss": 1.3005077838897705, "step": 5595 }, { "epoch": 2.9971567151697607, - "grad_norm": 8.539557104214142, - "learning_rate": 4.656963460691888e-12, - "logits/chosen": -0.753442645072937, - "logits/rejected": -0.7010436058044434, - "logps/chosen": -1.3138887882232666, - "logps/rejected": -2.5131661891937256, - "loss": 0.8816, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3138887882232666, - "rewards/margins": 1.199277639389038, - "rewards/rejected": -2.5131661891937256, - "sft_loss": 1.4080010652542114, + "grad_norm": 9.373094728599023, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": -0.20850276947021484, + "logits/rejected": -0.10659182071685791, + "logps/chosen": -1.3211060762405396, + "logps/rejected": -2.0065088272094727, + "loss": 0.9909, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3211060762405396, + "rewards/margins": 0.6854029297828674, + "rewards/rejected": -2.0065088272094727, + "sft_loss": 1.4024105072021484, "step": 5600 }, { "epoch": 2.9971567151697607, - "eval_logits/chosen": -0.47351476550102234, - "eval_logits/rejected": -0.42483875155448914, - "eval_logps/chosen": -1.5353264808654785, - "eval_logps/rejected": -2.2017006874084473, - "eval_loss": 1.0436354875564575, - "eval_rewards/accuracies": 0.6476261019706726, - "eval_rewards/chosen": -1.5353264808654785, - "eval_rewards/margins": 0.6663743257522583, - "eval_rewards/rejected": -2.2017006874084473, - "eval_runtime": 43.0659, - "eval_samples_per_second": 31.231, - "eval_sft_loss": 1.4855337142944336, - "eval_steps_per_second": 7.825, + "eval_logits/chosen": 0.12686319649219513, + "eval_logits/rejected": 0.21865931153297424, + "eval_logps/chosen": -1.3990031480789185, + "eval_logps/rejected": -1.843998670578003, + "eval_loss": 1.0415899753570557, + "eval_rewards/accuracies": 0.6157270073890686, + "eval_rewards/chosen": -1.3990031480789185, + "eval_rewards/margins": 0.44499555230140686, + "eval_rewards/rejected": -1.843998670578003, + "eval_runtime": 50.6079, + "eval_samples_per_second": 26.577, + "eval_sft_loss": 1.403135061264038, + "eval_steps_per_second": 6.659, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, - "train_loss": 0.9803723535479859, - "train_runtime": 32947.1599, - "train_samples_per_second": 5.444, - "train_steps_per_second": 0.17 + "train_loss": 1.0402103091545567, + "train_runtime": 33797.3752, + "train_samples_per_second": 5.307, + "train_steps_per_second": 0.166 } ], "logging_steps": 5,