{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 2.5307798952804768, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.06028543785214424, "logits/rejected": 0.15203383564949036, "logps/chosen": -1.716343641281128, "logps/rejected": -1.8897250890731812, "loss": 0.7351, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.716343641281128, "rewards/margins": 0.17338162660598755, "rewards/rejected": -1.8897250890731812, "sft_loss": 1.4684785604476929, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 1.3045011604582515, "learning_rate": 1.7825311942959e-08, "logits/chosen": 0.006154696457087994, "logits/rejected": 0.12953761219978333, "logps/chosen": -1.8027820587158203, "logps/rejected": -1.8465938568115234, "loss": 0.7432, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8027820587158203, "rewards/margins": 0.043811749666929245, "rewards/rejected": -1.8465938568115234, "sft_loss": 1.5083377361297607, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 1.3049558006137096, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.038712628185749054, "logits/rejected": 0.06149368733167648, "logps/chosen": -1.6352713108062744, "logps/rejected": -1.7654964923858643, "loss": 0.7566, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.6352713108062744, "rewards/margins": 0.13022512197494507, "rewards/rejected": -1.7654964923858643, "sft_loss": 1.5005992650985718, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 1.7589380784200994, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.039089519530534744, "logits/rejected": 0.049571335315704346, "logps/chosen": -1.7260818481445312, "logps/rejected": -1.807185411453247, "loss": 0.7573, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7260818481445312, "rewards/margins": 0.08110358566045761, "rewards/rejected": -1.807185411453247, "sft_loss": 1.5007972717285156, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 1.6148810466509307, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.056121088564395905, "logits/rejected": 0.031073397025465965, "logps/chosen": -1.8696680068969727, "logps/rejected": -1.7794153690338135, "loss": 0.788, "rewards/accuracies": 0.375, "rewards/chosen": -1.8696680068969727, "rewards/margins": -0.09025251865386963, "rewards/rejected": -1.7794153690338135, "sft_loss": 1.5459704399108887, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 1.134138477310001, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.0943167433142662, "logits/rejected": 0.0011650652159005404, "logps/chosen": -1.9078174829483032, "logps/rejected": -1.8316199779510498, "loss": 0.7459, "rewards/accuracies": 0.4375, "rewards/chosen": -1.9078174829483032, "rewards/margins": -0.0761973112821579, "rewards/rejected": -1.8316199779510498, "sft_loss": 1.646023154258728, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 1.6619700873288943, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.05790115147829056, "logits/rejected": 0.10320062935352325, "logps/chosen": -1.8472083806991577, "logps/rejected": -1.9982259273529053, "loss": 0.7611, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8472083806991577, "rewards/margins": 0.15101750195026398, "rewards/rejected": -1.9982259273529053, "sft_loss": 1.5622081756591797, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 1.464719715628282, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.02323739603161812, "logits/rejected": 0.19785355031490326, "logps/chosen": -1.8846538066864014, "logps/rejected": -1.7455508708953857, "loss": 0.7666, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8846538066864014, "rewards/margins": -0.13910314440727234, "rewards/rejected": -1.7455508708953857, "sft_loss": 1.5199060440063477, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 1.5585864902929252, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.021629726514220238, "logits/rejected": 0.22333423793315887, "logps/chosen": -1.8406168222427368, "logps/rejected": -1.8752384185791016, "loss": 0.7563, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8406168222427368, "rewards/margins": 0.034621547907590866, "rewards/rejected": -1.8752384185791016, "sft_loss": 1.5375287532806396, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 1.3244054200487152, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.05435756966471672, "logits/rejected": 0.09777506440877914, "logps/chosen": -1.9022912979125977, "logps/rejected": -1.7811915874481201, "loss": 0.7565, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.9022912979125977, "rewards/margins": -0.12109962850809097, "rewards/rejected": -1.7811915874481201, "sft_loss": 1.5845015048980713, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 1.4183844835158061, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.10434381663799286, "logits/rejected": 0.1210247278213501, "logps/chosen": -1.8414385318756104, "logps/rejected": -1.875012755393982, "loss": 0.7441, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.8414385318756104, "rewards/margins": 0.033574365079402924, "rewards/rejected": -1.875012755393982, "sft_loss": 1.5868983268737793, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 1.4194354948659196, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.08948967605829239, "logits/rejected": 0.10288163274526596, "logps/chosen": -1.8002837896347046, "logps/rejected": -1.9058860540390015, "loss": 0.7422, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.8002837896347046, "rewards/margins": 0.10560242086648941, "rewards/rejected": -1.9058860540390015, "sft_loss": 1.5473297834396362, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 1.3481178192361205, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.04388038441538811, "logits/rejected": 0.10528695583343506, "logps/chosen": -1.6494039297103882, "logps/rejected": -1.78204345703125, "loss": 0.7463, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.6494039297103882, "rewards/margins": 0.1326395571231842, "rewards/rejected": -1.78204345703125, "sft_loss": 1.4804751873016357, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 2.16778718518737, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.07954644411802292, "logits/rejected": 0.07445457577705383, "logps/chosen": -1.7811418771743774, "logps/rejected": -1.829134225845337, "loss": 0.761, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7811418771743774, "rewards/margins": 0.04799215868115425, "rewards/rejected": -1.829134225845337, "sft_loss": 1.638705849647522, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 1.2302737702389388, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.05034886673092842, "logits/rejected": 0.1335323303937912, "logps/chosen": -1.8009684085845947, "logps/rejected": -2.0643763542175293, "loss": 0.7454, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.8009684085845947, "rewards/margins": 0.26340797543525696, "rewards/rejected": -2.0643763542175293, "sft_loss": 1.5755311250686646, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 1.3121530039846816, "learning_rate": 1.42602495543672e-07, "logits/chosen": 0.0018813014030456543, "logits/rejected": 0.10915534198284149, "logps/chosen": -1.7485237121582031, "logps/rejected": -1.7818000316619873, "loss": 0.7563, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7485237121582031, "rewards/margins": 0.03327634930610657, "rewards/rejected": -1.7818000316619873, "sft_loss": 1.5394573211669922, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 1.4021077740070411, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.1436150074005127, "logits/rejected": 0.10774292796850204, "logps/chosen": -1.829763412475586, "logps/rejected": -2.016918659210205, "loss": 0.7571, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.829763412475586, "rewards/margins": 0.1871553659439087, "rewards/rejected": -2.016918659210205, "sft_loss": 1.5105193853378296, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 1.086482718318956, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.09334637224674225, "logits/rejected": 0.05624980852007866, "logps/chosen": -1.8011209964752197, "logps/rejected": -1.8162376880645752, "loss": 0.7581, "rewards/accuracies": 0.46875, "rewards/chosen": -1.8011209964752197, "rewards/margins": 0.015116686001420021, "rewards/rejected": -1.8162376880645752, "sft_loss": 1.4740904569625854, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 1.2496592116011893, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.06257010996341705, "logits/rejected": 0.09181664139032364, "logps/chosen": -1.8811146020889282, "logps/rejected": -1.9701454639434814, "loss": 0.7473, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.8811146020889282, "rewards/margins": 0.08903058618307114, "rewards/rejected": -1.9701454639434814, "sft_loss": 1.5524765253067017, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 1.1358051297964684, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.032475076615810394, "logits/rejected": 0.03443972021341324, "logps/chosen": -1.7509632110595703, "logps/rejected": -1.8610137701034546, "loss": 0.7443, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.7509632110595703, "rewards/margins": 0.11005053669214249, "rewards/rejected": -1.8610137701034546, "sft_loss": 1.5149767398834229, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 1.1498117979009437, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.07438740134239197, "logits/rejected": 0.10223875194787979, "logps/chosen": -1.7091327905654907, "logps/rejected": -1.874015212059021, "loss": 0.742, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7091327905654907, "rewards/margins": 0.16488228738307953, "rewards/rejected": -1.874015212059021, "sft_loss": 1.463929295539856, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 1.2631632504843453, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.020029287785291672, "logits/rejected": 0.11946004629135132, "logps/chosen": -1.7755727767944336, "logps/rejected": -1.8332843780517578, "loss": 0.7567, "rewards/accuracies": 0.5, "rewards/chosen": -1.7755727767944336, "rewards/margins": 0.0577116496860981, "rewards/rejected": -1.8332843780517578, "sft_loss": 1.5115814208984375, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 1.3979503483361755, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.019500691443681717, "logits/rejected": 0.2292514592409134, "logps/chosen": -1.7710542678833008, "logps/rejected": -2.0842669010162354, "loss": 0.7296, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.7710542678833008, "rewards/margins": 0.3132126033306122, "rewards/rejected": -2.0842669010162354, "sft_loss": 1.6256048679351807, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 0.9471896408017881, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.06396186351776123, "logits/rejected": 0.11650919914245605, "logps/chosen": -1.8994373083114624, "logps/rejected": -2.0369832515716553, "loss": 0.7397, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8994373083114624, "rewards/margins": 0.1375458985567093, "rewards/rejected": -2.0369832515716553, "sft_loss": 1.6425449848175049, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 1.501264995813309, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.0704985037446022, "logits/rejected": 0.06473371386528015, "logps/chosen": -1.814263939857483, "logps/rejected": -1.7392761707305908, "loss": 0.7617, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.814263939857483, "rewards/margins": -0.07498808205127716, "rewards/rejected": -1.7392761707305908, "sft_loss": 1.5751149654388428, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 1.8340291888002895, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.0581393726170063, "logits/rejected": 0.20125238597393036, "logps/chosen": -1.877229928970337, "logps/rejected": -1.9977684020996094, "loss": 0.7481, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.877229928970337, "rewards/margins": 0.12053883075714111, "rewards/rejected": -1.9977684020996094, "sft_loss": 1.6528816223144531, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 1.0603454056081036, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.014934045262634754, "logits/rejected": 0.1119217649102211, "logps/chosen": -1.9535691738128662, "logps/rejected": -1.9393174648284912, "loss": 0.7532, "rewards/accuracies": 0.5, "rewards/chosen": -1.9535691738128662, "rewards/margins": -0.014251927845180035, "rewards/rejected": -1.9393174648284912, "sft_loss": 1.6135581731796265, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 1.70352983220921, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.03259888291358948, "logits/rejected": 0.14035694301128387, "logps/chosen": -1.9172674417495728, "logps/rejected": -2.143113613128662, "loss": 0.7388, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9172674417495728, "rewards/margins": 0.22584640979766846, "rewards/rejected": -2.143113613128662, "sft_loss": 1.6537431478500366, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 1.1788956191636877, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.008710386231541634, "logits/rejected": 0.1506728231906891, "logps/chosen": -1.8602443933486938, "logps/rejected": -1.9983711242675781, "loss": 0.7427, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.8602443933486938, "rewards/margins": 0.13812680542469025, "rewards/rejected": -1.9983711242675781, "sft_loss": 1.5861378908157349, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 1.611957958433947, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.047285713255405426, "logits/rejected": 0.1237465962767601, "logps/chosen": -1.8235307931900024, "logps/rejected": -1.8251911401748657, "loss": 0.7505, "rewards/accuracies": 0.53125, "rewards/chosen": -1.8235307931900024, "rewards/margins": 0.0016601771349087358, "rewards/rejected": -1.8251911401748657, "sft_loss": 1.445634126663208, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 1.4133899671490275, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.028733108192682266, "logits/rejected": 0.02427602931857109, "logps/chosen": -1.8935750722885132, "logps/rejected": -1.9562524557113647, "loss": 0.7509, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.8935750722885132, "rewards/margins": 0.0626775249838829, "rewards/rejected": -1.9562524557113647, "sft_loss": 1.5830233097076416, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 1.177019125859732, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.12482740730047226, "logits/rejected": 0.0246548131108284, "logps/chosen": -2.0720441341400146, "logps/rejected": -2.0472512245178223, "loss": 0.7583, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -2.0720441341400146, "rewards/margins": -0.024792974814772606, "rewards/rejected": -2.0472512245178223, "sft_loss": 1.6636087894439697, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 1.2937098303864403, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.025919023901224136, "logits/rejected": 0.16027899086475372, "logps/chosen": -1.853724718093872, "logps/rejected": -2.1069750785827637, "loss": 0.745, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.853724718093872, "rewards/margins": 0.2532506287097931, "rewards/rejected": -2.1069750785827637, "sft_loss": 1.5339034795761108, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 1.1908623116367458, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.06941623985767365, "logits/rejected": -0.014124035835266113, "logps/chosen": -2.095472812652588, "logps/rejected": -2.0944128036499023, "loss": 0.7431, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.095472812652588, "rewards/margins": -0.001059868955053389, "rewards/rejected": -2.0944128036499023, "sft_loss": 1.6449720859527588, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 1.1539069879301553, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.05122692137956619, "logits/rejected": 0.050775568932294846, "logps/chosen": -1.9801479578018188, "logps/rejected": -2.052694797515869, "loss": 0.7693, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.9801479578018188, "rewards/margins": 0.0725465789437294, "rewards/rejected": -2.052694797515869, "sft_loss": 1.6534878015518188, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 1.0008286652422143, "learning_rate": 3.2085561497326203e-07, "logits/chosen": 0.050390541553497314, "logits/rejected": 0.055344052612781525, "logps/chosen": -2.0598304271698, "logps/rejected": -2.077967643737793, "loss": 0.7446, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.0598304271698, "rewards/margins": 0.01813710294663906, "rewards/rejected": -2.077967643737793, "sft_loss": 1.6423956155776978, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 1.3549860343592595, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.11505118757486343, "logits/rejected": -0.019612614065408707, "logps/chosen": -2.0028810501098633, "logps/rejected": -2.0836894512176514, "loss": 0.7561, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.0028810501098633, "rewards/margins": 0.0808083564043045, "rewards/rejected": -2.0836894512176514, "sft_loss": 1.6119740009307861, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 1.7311733616395397, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.036365706473588943, "logits/rejected": 0.09691213071346283, "logps/chosen": -2.3790314197540283, "logps/rejected": -2.2984354496002197, "loss": 0.7477, "rewards/accuracies": 0.46875, "rewards/chosen": -2.3790314197540283, "rewards/margins": -0.08059573173522949, "rewards/rejected": -2.2984354496002197, "sft_loss": 1.7958142757415771, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 0.9496432892268047, "learning_rate": 3.475935828877005e-07, "logits/chosen": 0.04074074327945709, "logits/rejected": 0.20683078467845917, "logps/chosen": -1.8889601230621338, "logps/rejected": -1.9778417348861694, "loss": 0.7422, "rewards/accuracies": 0.53125, "rewards/chosen": -1.8889601230621338, "rewards/margins": 0.08888188749551773, "rewards/rejected": -1.9778417348861694, "sft_loss": 1.5175426006317139, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 0.9622038848794177, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.045106835663318634, "logits/rejected": 0.10371474921703339, "logps/chosen": -2.222414493560791, "logps/rejected": -2.04349684715271, "loss": 0.7524, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.222414493560791, "rewards/margins": -0.17891743779182434, "rewards/rejected": -2.04349684715271, "sft_loss": 1.725873589515686, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 1.201290427678159, "learning_rate": 3.654188948306595e-07, "logits/chosen": 0.0018949396908283234, "logits/rejected": 0.16811925172805786, "logps/chosen": -2.366450548171997, "logps/rejected": -2.164440631866455, "loss": 0.7477, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.366450548171997, "rewards/margins": -0.20200976729393005, "rewards/rejected": -2.164440631866455, "sft_loss": 1.6500478982925415, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 0.9519143647498421, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.1441155970096588, "logits/rejected": 0.054268885403871536, "logps/chosen": -2.2122931480407715, "logps/rejected": -2.50825834274292, "loss": 0.727, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.2122931480407715, "rewards/margins": 0.2959652543067932, "rewards/rejected": -2.50825834274292, "sft_loss": 1.7047412395477295, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 0.9147143276129032, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.13151441514492035, "logits/rejected": 0.13415369391441345, "logps/chosen": -2.0408544540405273, "logps/rejected": -2.140831708908081, "loss": 0.7324, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0408544540405273, "rewards/margins": 0.09997712820768356, "rewards/rejected": -2.140831708908081, "sft_loss": 1.689650535583496, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 0.9340469402771395, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.09782375395298004, "logits/rejected": 0.21198837459087372, "logps/chosen": -2.1927573680877686, "logps/rejected": -2.5048155784606934, "loss": 0.7261, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1927573680877686, "rewards/margins": 0.31205856800079346, "rewards/rejected": -2.5048155784606934, "sft_loss": 1.7473742961883545, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 1.1940860708131598, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.06439894437789917, "logits/rejected": 0.12158125638961792, "logps/chosen": -2.0465662479400635, "logps/rejected": -2.1566920280456543, "loss": 0.7288, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0465662479400635, "rewards/margins": 0.11012595891952515, "rewards/rejected": -2.1566920280456543, "sft_loss": 1.579954743385315, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 1.1570769692704002, "learning_rate": 4.09982174688057e-07, "logits/chosen": 0.024921538308262825, "logits/rejected": 0.11274313926696777, "logps/chosen": -2.291510820388794, "logps/rejected": -2.380722761154175, "loss": 0.7372, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.291510820388794, "rewards/margins": 0.08921203017234802, "rewards/rejected": -2.380722761154175, "sft_loss": 1.6209876537322998, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 0.9947029798361241, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.052441079169511795, "logits/rejected": 0.2144056260585785, "logps/chosen": -2.117339611053467, "logps/rejected": -2.3475327491760254, "loss": 0.7204, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.117339611053467, "rewards/margins": 0.23019298911094666, "rewards/rejected": -2.3475327491760254, "sft_loss": 1.594679594039917, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 1.0455265087391556, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.0020845234394073486, "logits/rejected": 0.1302955150604248, "logps/chosen": -2.195319414138794, "logps/rejected": -2.3318018913269043, "loss": 0.7474, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.195319414138794, "rewards/margins": 0.13648256659507751, "rewards/rejected": -2.3318018913269043, "sft_loss": 1.7364375591278076, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 1.0013964505950739, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.053445588797330856, "logits/rejected": 0.18915502727031708, "logps/chosen": -2.10256028175354, "logps/rejected": -2.3874642848968506, "loss": 0.7297, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.10256028175354, "rewards/margins": 0.2849038243293762, "rewards/rejected": -2.3874642848968506, "sft_loss": 1.7636394500732422, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 1.1313050273773242, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.025862867012619972, "logits/rejected": 0.15776577591896057, "logps/chosen": -2.5357606410980225, "logps/rejected": -2.5316543579101562, "loss": 0.745, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.5357606410980225, "rewards/margins": -0.004106348846107721, "rewards/rejected": -2.5316543579101562, "sft_loss": 1.7009462118148804, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 1.198688397864217, "learning_rate": 4.545454545454545e-07, "logits/chosen": 0.027461037039756775, "logits/rejected": 0.20019881427288055, "logps/chosen": -1.969020128250122, "logps/rejected": -2.2718300819396973, "loss": 0.7298, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.969020128250122, "rewards/margins": 0.3028102517127991, "rewards/rejected": -2.2718300819396973, "sft_loss": 1.5253115892410278, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 0.9576003040336463, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.217337965965271, "logits/rejected": -0.09969816356897354, "logps/chosen": -2.3528895378112793, "logps/rejected": -2.2599847316741943, "loss": 0.7258, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.3528895378112793, "rewards/margins": -0.09290491044521332, "rewards/rejected": -2.2599847316741943, "sft_loss": 1.7216367721557617, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 0.8083878838357342, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.04917464405298233, "logits/rejected": 0.04295088350772858, "logps/chosen": -2.5756170749664307, "logps/rejected": -2.369469404220581, "loss": 0.7435, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.5756170749664307, "rewards/margins": -0.2061474323272705, "rewards/rejected": -2.369469404220581, "sft_loss": 1.998988389968872, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 0.8439805417282846, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.06389988213777542, "logits/rejected": 0.09119514375925064, "logps/chosen": -2.194108486175537, "logps/rejected": -2.3751087188720703, "loss": 0.7411, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.194108486175537, "rewards/margins": 0.18100018799304962, "rewards/rejected": -2.3751087188720703, "sft_loss": 1.682721734046936, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 1.2479379665264587, "learning_rate": 4.901960784313725e-07, "logits/chosen": 0.014311921782791615, "logits/rejected": 0.12128366529941559, "logps/chosen": -2.3143210411071777, "logps/rejected": -2.603527307510376, "loss": 0.7489, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.3143210411071777, "rewards/margins": 0.28920650482177734, "rewards/rejected": -2.603527307510376, "sft_loss": 1.7431834936141968, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 0.8807325016661434, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.07187201082706451, "logits/rejected": 0.1185765266418457, "logps/chosen": -2.59013295173645, "logps/rejected": -2.569979190826416, "loss": 0.7275, "rewards/accuracies": 0.5625, "rewards/chosen": -2.59013295173645, "rewards/margins": -0.02015404775738716, "rewards/rejected": -2.569979190826416, "sft_loss": 2.0005898475646973, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 1.262434366720682, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.0466768741607666, "logits/rejected": 0.11643379926681519, "logps/chosen": -2.5148816108703613, "logps/rejected": -2.536932945251465, "loss": 0.7479, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.5148816108703613, "rewards/margins": 0.022051483392715454, "rewards/rejected": -2.536932945251465, "sft_loss": 1.7881135940551758, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 0.7384226605508317, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.10155355930328369, "logits/rejected": 0.2345721274614334, "logps/chosen": -2.307314157485962, "logps/rejected": -2.776353120803833, "loss": 0.7086, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.307314157485962, "rewards/margins": 0.46903902292251587, "rewards/rejected": -2.776353120803833, "sft_loss": 1.829089879989624, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 0.7690015544470431, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.04389738291501999, "logits/rejected": 0.02246524766087532, "logps/chosen": -2.7992782592773438, "logps/rejected": -2.68961763381958, "loss": 0.7361, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.7992782592773438, "rewards/margins": -0.10966069996356964, "rewards/rejected": -2.68961763381958, "sft_loss": 2.0393753051757812, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 0.9284960969608202, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.07447122037410736, "logits/rejected": 0.11382939666509628, "logps/chosen": -3.0339980125427246, "logps/rejected": -3.1267342567443848, "loss": 0.7372, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.0339980125427246, "rewards/margins": 0.09273599088191986, "rewards/rejected": -3.1267342567443848, "sft_loss": 1.9921964406967163, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 0.8614208468355983, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.0066381096839904785, "logits/rejected": 0.07297073304653168, "logps/chosen": -2.851527690887451, "logps/rejected": -2.8573780059814453, "loss": 0.739, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -2.851527690887451, "rewards/margins": 0.005850297398865223, "rewards/rejected": -2.8573780059814453, "sft_loss": 2.166574239730835, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 0.6501720560581197, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.18907728791236877, "logits/rejected": -0.0829610675573349, "logps/chosen": -3.2818069458007812, "logps/rejected": -3.3691933155059814, "loss": 0.728, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -3.2818069458007812, "rewards/margins": 0.08738609403371811, "rewards/rejected": -3.3691933155059814, "sft_loss": 2.39288592338562, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 0.6372708648025124, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.006549348589032888, "logits/rejected": 0.17332328855991364, "logps/chosen": -3.1714937686920166, "logps/rejected": -3.2386155128479004, "loss": 0.7312, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.1714937686920166, "rewards/margins": 0.0671217292547226, "rewards/rejected": -3.2386155128479004, "sft_loss": 2.2824227809906006, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 0.8740635051818171, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.03601797670125961, "logits/rejected": 0.11112309992313385, "logps/chosen": -3.0045642852783203, "logps/rejected": -2.9035542011260986, "loss": 0.7351, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.0045642852783203, "rewards/margins": -0.101010262966156, "rewards/rejected": -2.9035542011260986, "sft_loss": 2.2101263999938965, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 0.5771610112338228, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.11896149069070816, "logits/rejected": 0.018192211166024208, "logps/chosen": -3.610837459564209, "logps/rejected": -3.8876991271972656, "loss": 0.725, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.610837459564209, "rewards/margins": 0.27686089277267456, "rewards/rejected": -3.8876991271972656, "sft_loss": 2.300236225128174, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 0.6229477054686579, "learning_rate": 5.88235294117647e-07, "logits/chosen": 0.005486331880092621, "logits/rejected": 0.177880197763443, "logps/chosen": -2.7989001274108887, "logps/rejected": -3.7798194885253906, "loss": 0.7251, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.7989001274108887, "rewards/margins": 0.980919361114502, "rewards/rejected": -3.7798194885253906, "sft_loss": 2.2227303981781006, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 0.7181549721950545, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.053232062608003616, "logits/rejected": 0.1841447651386261, "logps/chosen": -3.7734217643737793, "logps/rejected": -3.7936007976531982, "loss": 0.7314, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.7734217643737793, "rewards/margins": 0.0201789028942585, "rewards/rejected": -3.7936007976531982, "sft_loss": 2.1736080646514893, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 0.70309398793489, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.006882413290441036, "logits/rejected": 0.1695895940065384, "logps/chosen": -3.509596347808838, "logps/rejected": -3.7672221660614014, "loss": 0.7219, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.509596347808838, "rewards/margins": 0.25762563943862915, "rewards/rejected": -3.7672221660614014, "sft_loss": 2.3050999641418457, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 1.0684204512933249, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.06146972253918648, "logits/rejected": 0.1032826155424118, "logps/chosen": -3.7491230964660645, "logps/rejected": -3.746976375579834, "loss": 0.733, "rewards/accuracies": 0.5, "rewards/chosen": -3.7491230964660645, "rewards/margins": -0.00214635138399899, "rewards/rejected": -3.746976375579834, "sft_loss": 2.372309446334839, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 0.674534244813876, "learning_rate": 6.238859180035651e-07, "logits/chosen": 0.04469449073076248, "logits/rejected": 0.15989665687084198, "logps/chosen": -3.837616443634033, "logps/rejected": -3.3015296459198, "loss": 0.729, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.837616443634033, "rewards/margins": -0.5360864400863647, "rewards/rejected": -3.3015296459198, "sft_loss": 2.3684537410736084, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 0.4798276107270202, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.03864552825689316, "logits/rejected": 0.21794693171977997, "logps/chosen": -3.891691207885742, "logps/rejected": -3.7515900135040283, "loss": 0.7264, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.891691207885742, "rewards/margins": -0.14010128378868103, "rewards/rejected": -3.7515900135040283, "sft_loss": 2.518442153930664, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 1.1965823226283256, "learning_rate": 6.417112299465241e-07, "logits/chosen": 0.004572421312332153, "logits/rejected": 0.10303233563899994, "logps/chosen": -4.1401591300964355, "logps/rejected": -4.2200188636779785, "loss": 0.7301, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -4.1401591300964355, "rewards/margins": 0.07985991984605789, "rewards/rejected": -4.2200188636779785, "sft_loss": 2.446254253387451, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 1.0181453679763794, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.06938910484313965, "logits/rejected": 0.1773306131362915, "logps/chosen": -3.9401297569274902, "logps/rejected": -3.458475112915039, "loss": 0.727, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.9401297569274902, "rewards/margins": -0.48165464401245117, "rewards/rejected": -3.458475112915039, "sft_loss": 2.5714945793151855, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 0.7034205710786969, "learning_rate": 6.59536541889483e-07, "logits/chosen": 0.04681248217821121, "logits/rejected": 0.16646014153957367, "logps/chosen": -3.4816317558288574, "logps/rejected": -3.414193630218506, "loss": 0.7407, "rewards/accuracies": 0.5, "rewards/chosen": -3.4816317558288574, "rewards/margins": -0.0674385279417038, "rewards/rejected": -3.414193630218506, "sft_loss": 2.5620157718658447, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 0.9349621887687155, "learning_rate": 6.684491978609626e-07, "logits/chosen": 0.029973220080137253, "logits/rejected": 0.22433066368103027, "logps/chosen": -3.6492698192596436, "logps/rejected": -4.301482677459717, "loss": 0.7198, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.6492698192596436, "rewards/margins": 0.6522127389907837, "rewards/rejected": -4.301482677459717, "sft_loss": 2.334585666656494, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 0.6534594197266338, "learning_rate": 6.77361853832442e-07, "logits/chosen": 0.04294043034315109, "logits/rejected": 0.15177340805530548, "logps/chosen": -3.961987257003784, "logps/rejected": -4.119868278503418, "loss": 0.7196, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.961987257003784, "rewards/margins": 0.15788108110427856, "rewards/rejected": -4.119868278503418, "sft_loss": 2.639615058898926, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 0.8195329529138362, "learning_rate": 6.862745098039216e-07, "logits/chosen": 0.05825083702802658, "logits/rejected": 0.1578684002161026, "logps/chosen": -3.07454514503479, "logps/rejected": -3.358675479888916, "loss": 0.7206, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -3.07454514503479, "rewards/margins": 0.28413036465644836, "rewards/rejected": -3.358675479888916, "sft_loss": 2.2494893074035645, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 0.8015314543916019, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.1552068293094635, "logits/rejected": 0.3621278405189514, "logps/chosen": -3.0863308906555176, "logps/rejected": -3.399913787841797, "loss": 0.7198, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.0863308906555176, "rewards/margins": 0.3135828375816345, "rewards/rejected": -3.399913787841797, "sft_loss": 2.292431354522705, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 0.8421949387057387, "learning_rate": 7.040998217468806e-07, "logits/chosen": 0.031766343861818314, "logits/rejected": 0.23609808087348938, "logps/chosen": -2.7795357704162598, "logps/rejected": -3.061073064804077, "loss": 0.7247, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.7795357704162598, "rewards/margins": 0.2815372347831726, "rewards/rejected": -3.061073064804077, "sft_loss": 2.0890800952911377, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 1.070064093059005, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.14136911928653717, "logits/rejected": 0.2708422541618347, "logps/chosen": -3.2854278087615967, "logps/rejected": -3.46429705619812, "loss": 0.7149, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.2854278087615967, "rewards/margins": 0.17886866629123688, "rewards/rejected": -3.46429705619812, "sft_loss": 2.1866886615753174, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.4407428205013275, "eval_logits/rejected": 0.5533545613288879, "eval_logps/chosen": -3.312464952468872, "eval_logps/rejected": -3.568161964416504, "eval_loss": 0.7231929898262024, "eval_rewards/accuracies": 0.5200296640396118, "eval_rewards/chosen": -3.312464952468872, "eval_rewards/margins": 0.25569629669189453, "eval_rewards/rejected": -3.568161964416504, "eval_runtime": 44.1302, "eval_samples_per_second": 30.478, "eval_sft_loss": 2.1337332725524902, "eval_steps_per_second": 7.636, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 0.8380463757471741, "learning_rate": 7.219251336898395e-07, "logits/chosen": 0.12377011775970459, "logits/rejected": 0.23273494839668274, "logps/chosen": -3.467702865600586, "logps/rejected": -3.855557680130005, "loss": 0.7318, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -3.467702865600586, "rewards/margins": 0.3878548741340637, "rewards/rejected": -3.855557680130005, "sft_loss": 2.629897356033325, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 0.8015436429519107, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.15863196551799774, "logits/rejected": 0.3188668489456177, "logps/chosen": -3.4703338146209717, "logps/rejected": -3.9953055381774902, "loss": 0.7181, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -3.4703338146209717, "rewards/margins": 0.5249720215797424, "rewards/rejected": -3.9953055381774902, "sft_loss": 2.4155757427215576, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 1.183682024326887, "learning_rate": 7.397504456327985e-07, "logits/chosen": 0.14013604819774628, "logits/rejected": 0.21396835148334503, "logps/chosen": -3.6671295166015625, "logps/rejected": -3.4590630531311035, "loss": 0.7319, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -3.6671295166015625, "rewards/margins": -0.20806679129600525, "rewards/rejected": -3.4590630531311035, "sft_loss": 2.605466365814209, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 1.3858150606864232, "learning_rate": 7.486631016042781e-07, "logits/chosen": 0.06939506530761719, "logits/rejected": 0.3356233537197113, "logps/chosen": -2.94575572013855, "logps/rejected": -3.125661849975586, "loss": 0.7168, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.94575572013855, "rewards/margins": 0.17990592122077942, "rewards/rejected": -3.125661849975586, "sft_loss": 2.1718907356262207, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 1.0438361542866905, "learning_rate": 7.575757575757575e-07, "logits/chosen": 0.058596737682819366, "logits/rejected": 0.30125856399536133, "logps/chosen": -3.111341714859009, "logps/rejected": -3.4133172035217285, "loss": 0.7194, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.111341714859009, "rewards/margins": 0.3019755184650421, "rewards/rejected": -3.4133172035217285, "sft_loss": 2.271104335784912, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 0.9585500950106343, "learning_rate": 7.664884135472371e-07, "logits/chosen": 0.012262892909348011, "logits/rejected": 0.2687227129936218, "logps/chosen": -2.8309836387634277, "logps/rejected": -3.7098381519317627, "loss": 0.7209, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.8309836387634277, "rewards/margins": 0.8788547515869141, "rewards/rejected": -3.7098381519317627, "sft_loss": 2.2115042209625244, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 0.7878172279611957, "learning_rate": 7.754010695187165e-07, "logits/chosen": 0.11982943117618561, "logits/rejected": 0.2392585277557373, "logps/chosen": -3.093507766723633, "logps/rejected": -3.1052567958831787, "loss": 0.7209, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.093507766723633, "rewards/margins": 0.011749285273253918, "rewards/rejected": -3.1052567958831787, "sft_loss": 2.3222641944885254, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 1.1063814156047223, "learning_rate": 7.84313725490196e-07, "logits/chosen": 0.08916598558425903, "logits/rejected": 0.21588215231895447, "logps/chosen": -2.7143657207489014, "logps/rejected": -3.1528379917144775, "loss": 0.724, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.7143657207489014, "rewards/margins": 0.4384719729423523, "rewards/rejected": -3.1528379917144775, "sft_loss": 2.295525074005127, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 0.883130703007747, "learning_rate": 7.932263814616755e-07, "logits/chosen": 0.07394398748874664, "logits/rejected": 0.21995148062705994, "logps/chosen": -2.920626163482666, "logps/rejected": -3.506260395050049, "loss": 0.7206, "rewards/accuracies": 0.59375, "rewards/chosen": -2.920626163482666, "rewards/margins": 0.5856344103813171, "rewards/rejected": -3.506260395050049, "sft_loss": 2.224306106567383, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 0.8695281019667966, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.09777168929576874, "logits/rejected": 0.2562565207481384, "logps/chosen": -2.7872490882873535, "logps/rejected": -3.0836105346679688, "loss": 0.7174, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.7872490882873535, "rewards/margins": 0.29636120796203613, "rewards/rejected": -3.0836105346679688, "sft_loss": 2.252509117126465, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 1.1522578168512165, "learning_rate": 8.110516934046346e-07, "logits/chosen": 0.10990069806575775, "logits/rejected": 0.21983602643013, "logps/chosen": -2.5689430236816406, "logps/rejected": -2.998944044113159, "loss": 0.7104, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.5689430236816406, "rewards/margins": 0.4300007224082947, "rewards/rejected": -2.998944044113159, "sft_loss": 2.086578845977783, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 2.435980412813483, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.014444696716964245, "logits/rejected": 0.15174110233783722, "logps/chosen": -2.6563944816589355, "logps/rejected": -2.8838610649108887, "loss": 0.7268, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.6563944816589355, "rewards/margins": 0.2274663895368576, "rewards/rejected": -2.8838610649108887, "sft_loss": 2.333906888961792, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 2.2069324871389653, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.22611455619335175, "logits/rejected": 0.2659430503845215, "logps/chosen": -2.8453681468963623, "logps/rejected": -3.1539711952209473, "loss": 0.7222, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.8453681468963623, "rewards/margins": 0.3086031377315521, "rewards/rejected": -3.1539711952209473, "sft_loss": 2.4192214012145996, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 3.704990416624573, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.2801414728164673, "logits/rejected": 0.22712154686450958, "logps/chosen": -2.801309108734131, "logps/rejected": -2.847769260406494, "loss": 0.733, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.801309108734131, "rewards/margins": 0.04645988345146179, "rewards/rejected": -2.847769260406494, "sft_loss": 2.284790515899658, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 1.068048481080084, "learning_rate": 8.467023172905525e-07, "logits/chosen": -8.206814527511597e-05, "logits/rejected": 0.1875239610671997, "logps/chosen": -2.40893816947937, "logps/rejected": -3.194272518157959, "loss": 0.7097, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.40893816947937, "rewards/margins": 0.7853342890739441, "rewards/rejected": -3.194272518157959, "sft_loss": 2.097153425216675, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 2.1566987984823514, "learning_rate": 8.55614973262032e-07, "logits/chosen": 0.04935279116034508, "logits/rejected": 0.28145831823349, "logps/chosen": -2.404567241668701, "logps/rejected": -2.732786178588867, "loss": 0.7217, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.404567241668701, "rewards/margins": 0.32821884751319885, "rewards/rejected": -2.732786178588867, "sft_loss": 2.01057505607605, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 0.9623812340325298, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.08504694700241089, "logits/rejected": 0.14895778894424438, "logps/chosen": -2.759326457977295, "logps/rejected": -2.824343681335449, "loss": 0.7263, "rewards/accuracies": 0.53125, "rewards/chosen": -2.759326457977295, "rewards/margins": 0.06501699984073639, "rewards/rejected": -2.824343681335449, "sft_loss": 2.2212133407592773, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 1.569603368409317, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.14935262501239777, "logits/rejected": 0.23176102340221405, "logps/chosen": -2.831303358078003, "logps/rejected": -3.133817195892334, "loss": 0.7248, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.831303358078003, "rewards/margins": 0.30251362919807434, "rewards/rejected": -3.133817195892334, "sft_loss": 2.3560450077056885, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 1.4572523265368869, "learning_rate": 8.823529411764705e-07, "logits/chosen": 0.0733940601348877, "logits/rejected": 0.11172135919332504, "logps/chosen": -2.8521978855133057, "logps/rejected": -3.004929304122925, "loss": 0.7243, "rewards/accuracies": 0.53125, "rewards/chosen": -2.8521978855133057, "rewards/margins": 0.15273188054561615, "rewards/rejected": -3.004929304122925, "sft_loss": 2.426811695098877, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 1.5609730411038651, "learning_rate": 8.912655971479501e-07, "logits/chosen": 0.04248486086726189, "logits/rejected": 0.1631307303905487, "logps/chosen": -2.8464436531066895, "logps/rejected": -3.2533068656921387, "loss": 0.7231, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.8464436531066895, "rewards/margins": 0.4068628251552582, "rewards/rejected": -3.2533068656921387, "sft_loss": 2.2971839904785156, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 1.4689783802748066, "learning_rate": 9.001782531194295e-07, "logits/chosen": 0.003608876373618841, "logits/rejected": 0.17241023480892181, "logps/chosen": -2.7894625663757324, "logps/rejected": -2.8479764461517334, "loss": 0.7173, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.7894625663757324, "rewards/margins": 0.05851361155509949, "rewards/rejected": -2.8479764461517334, "sft_loss": 2.290825843811035, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 1.250742204693819, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.152287095785141, "logits/rejected": 0.2180653065443039, "logps/chosen": -2.5529842376708984, "logps/rejected": -3.0081677436828613, "loss": 0.7178, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.5529842376708984, "rewards/margins": 0.45518356561660767, "rewards/rejected": -3.0081677436828613, "sft_loss": 2.1848416328430176, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 1.1650240017903066, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.07927284389734268, "logits/rejected": 0.20023027062416077, "logps/chosen": -2.3548150062561035, "logps/rejected": -2.520425319671631, "loss": 0.7125, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.3548150062561035, "rewards/margins": 0.16561046242713928, "rewards/rejected": -2.520425319671631, "sft_loss": 2.0264971256256104, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 1.4028935287772106, "learning_rate": 9.26916221033868e-07, "logits/chosen": 0.00641799857839942, "logits/rejected": 0.16784623265266418, "logps/chosen": -2.4290060997009277, "logps/rejected": -2.6969335079193115, "loss": 0.7192, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.4290060997009277, "rewards/margins": 0.2679271101951599, "rewards/rejected": -2.6969335079193115, "sft_loss": 2.2241756916046143, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 1.5865055700177921, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.15192261338233948, "logits/rejected": 0.2448718100786209, "logps/chosen": -2.57245135307312, "logps/rejected": -2.901125192642212, "loss": 0.7168, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.57245135307312, "rewards/margins": 0.3286738395690918, "rewards/rejected": -2.901125192642212, "sft_loss": 2.272670269012451, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 1.6613335130368763, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.09941687434911728, "logits/rejected": 0.18588587641716003, "logps/chosen": -2.3438611030578613, "logps/rejected": -2.6406006813049316, "loss": 0.7257, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.3438611030578613, "rewards/margins": 0.296739399433136, "rewards/rejected": -2.6406006813049316, "sft_loss": 2.1210086345672607, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 1.5684653787288134, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.08004043996334076, "logits/rejected": 0.21586282551288605, "logps/chosen": -2.391448974609375, "logps/rejected": -2.769453525543213, "loss": 0.7077, "rewards/accuracies": 0.65625, "rewards/chosen": -2.391448974609375, "rewards/margins": 0.3780044615268707, "rewards/rejected": -2.769453525543213, "sft_loss": 2.076185464859009, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 1.3887045431816825, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.04192466661334038, "logits/rejected": 0.13054180145263672, "logps/chosen": -2.6236674785614014, "logps/rejected": -2.9908204078674316, "loss": 0.7208, "rewards/accuracies": 0.5625, "rewards/chosen": -2.6236674785614014, "rewards/margins": 0.3671533465385437, "rewards/rejected": -2.9908204078674316, "sft_loss": 2.410619020462036, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 1.3678970419412904, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.06351624429225922, "logits/rejected": 0.15162459015846252, "logps/chosen": -2.6514830589294434, "logps/rejected": -2.9466261863708496, "loss": 0.7095, "rewards/accuracies": 0.59375, "rewards/chosen": -2.6514830589294434, "rewards/margins": 0.29514291882514954, "rewards/rejected": -2.9466261863708496, "sft_loss": 2.2425341606140137, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 1.1688094755564253, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.06675631552934647, "logits/rejected": 0.14489200711250305, "logps/chosen": -2.3897202014923096, "logps/rejected": -2.7485108375549316, "loss": 0.7146, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.3897202014923096, "rewards/margins": 0.3587908148765564, "rewards/rejected": -2.7485108375549316, "sft_loss": 2.1949100494384766, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 1.2334375243791638, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.040539491921663284, "logits/rejected": 0.09747248142957687, "logps/chosen": -2.6100172996520996, "logps/rejected": -2.8595051765441895, "loss": 0.7208, "rewards/accuracies": 0.59375, "rewards/chosen": -2.6100172996520996, "rewards/margins": 0.24948802590370178, "rewards/rejected": -2.8595051765441895, "sft_loss": 2.3463802337646484, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 1.2178582513233482, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.05656442791223526, "logits/rejected": 0.08870618045330048, "logps/chosen": -2.4351911544799805, "logps/rejected": -2.8274285793304443, "loss": 0.7146, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.4351911544799805, "rewards/margins": 0.3922370970249176, "rewards/rejected": -2.8274285793304443, "sft_loss": 2.3474295139312744, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 1.4646884322093898, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.016786161810159683, "logits/rejected": 0.07795722037553787, "logps/chosen": -2.3994269371032715, "logps/rejected": -2.7498245239257812, "loss": 0.7173, "rewards/accuracies": 0.625, "rewards/chosen": -2.3994269371032715, "rewards/margins": 0.35039767622947693, "rewards/rejected": -2.7498245239257812, "sft_loss": 2.2927894592285156, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 2.280916772571687, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.07725201547145844, "logits/rejected": 0.14675331115722656, "logps/chosen": -2.3151724338531494, "logps/rejected": -2.637024402618408, "loss": 0.714, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.3151724338531494, "rewards/margins": 0.3218519687652588, "rewards/rejected": -2.637024402618408, "sft_loss": 2.2025227546691895, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 1.1544623408371042, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.03033895418047905, "logits/rejected": 0.02668929658830166, "logps/chosen": -2.313297748565674, "logps/rejected": -2.608497381210327, "loss": 0.7186, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.313297748565674, "rewards/margins": 0.29519957304000854, "rewards/rejected": -2.608497381210327, "sft_loss": 2.221764326095581, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 1.454393332211814, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.060431670397520065, "logits/rejected": 0.11492051184177399, "logps/chosen": -2.400331735610962, "logps/rejected": -2.877683162689209, "loss": 0.7171, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.400331735610962, "rewards/margins": 0.47735172510147095, "rewards/rejected": -2.877683162689209, "sft_loss": 2.3011434078216553, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 1.9403262233624872, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.06489763408899307, "logits/rejected": 0.027169320732355118, "logps/chosen": -2.6530721187591553, "logps/rejected": -2.9842185974121094, "loss": 0.7182, "rewards/accuracies": 0.59375, "rewards/chosen": -2.6530721187591553, "rewards/margins": 0.331146240234375, "rewards/rejected": -2.9842185974121094, "sft_loss": 2.4599719047546387, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 7.880831844211117, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.1074734479188919, "logits/rejected": 0.011968502774834633, "logps/chosen": -2.530820608139038, "logps/rejected": -2.730072498321533, "loss": 0.7164, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.530820608139038, "rewards/margins": 0.1992516666650772, "rewards/rejected": -2.730072498321533, "sft_loss": 2.421886920928955, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 1.6012133474306431, "learning_rate": 9.998878489314937e-07, "logits/chosen": -0.0028941601049154997, "logits/rejected": 0.1417306363582611, "logps/chosen": -2.3665213584899902, "logps/rejected": -2.799224853515625, "loss": 0.7165, "rewards/accuracies": 0.625, "rewards/chosen": -2.3665213584899902, "rewards/margins": 0.43270349502563477, "rewards/rejected": -2.799224853515625, "sft_loss": 2.3110907077789307, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 1.6966709570248353, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.0750584825873375, "logits/rejected": 0.08908344805240631, "logps/chosen": -2.4441514015197754, "logps/rejected": -2.7474427223205566, "loss": 0.7173, "rewards/accuracies": 0.59375, "rewards/chosen": -2.4441514015197754, "rewards/margins": 0.3032917380332947, "rewards/rejected": -2.7474427223205566, "sft_loss": 2.3465425968170166, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 1.7054325905287304, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.07087988406419754, "logits/rejected": 0.022867068648338318, "logps/chosen": -2.6355044841766357, "logps/rejected": -2.976886034011841, "loss": 0.7131, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.6355044841766357, "rewards/margins": 0.34138113260269165, "rewards/rejected": -2.976886034011841, "sft_loss": 2.448537826538086, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 1.949138520574182, "learning_rate": 9.997670727736379e-07, "logits/chosen": -0.034800779074430466, "logits/rejected": 0.15406827628612518, "logps/chosen": -2.5420567989349365, "logps/rejected": -2.9066779613494873, "loss": 0.7156, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.5420567989349365, "rewards/margins": 0.3646214008331299, "rewards/rejected": -2.9066779613494873, "sft_loss": 2.3323192596435547, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 1.092636916919118, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.014688342809677124, "logits/rejected": 0.09319359064102173, "logps/chosen": -2.4575400352478027, "logps/rejected": -2.952061891555786, "loss": 0.7115, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.4575400352478027, "rewards/margins": 0.4945217967033386, "rewards/rejected": -2.952061891555786, "sft_loss": 2.2898201942443848, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 1.2412022266614404, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.044067852199077606, "logits/rejected": 0.1323404163122177, "logps/chosen": -2.5102686882019043, "logps/rejected": -2.789283275604248, "loss": 0.7118, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.5102686882019043, "rewards/margins": 0.2790144383907318, "rewards/rejected": -2.789283275604248, "sft_loss": 2.3239073753356934, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 1.7171630474485193, "learning_rate": 9.996026582170488e-07, "logits/chosen": -0.002364107873290777, "logits/rejected": 0.15507212281227112, "logps/chosen": -2.3736510276794434, "logps/rejected": -2.9077327251434326, "loss": 0.7087, "rewards/accuracies": 0.625, "rewards/chosen": -2.3736510276794434, "rewards/margins": 0.5340819358825684, "rewards/rejected": -2.9077327251434326, "sft_loss": 2.198334217071533, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 1.2450982262768642, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.032905466854572296, "logits/rejected": 0.09662505239248276, "logps/chosen": -2.36788272857666, "logps/rejected": -2.794607162475586, "loss": 0.71, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.36788272857666, "rewards/margins": 0.42672476172447205, "rewards/rejected": -2.794607162475586, "sft_loss": 2.192002534866333, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 3.122728313345638, "learning_rate": 9.994688118905471e-07, "logits/chosen": 0.021928105503320694, "logits/rejected": 0.299724280834198, "logps/chosen": -2.40183687210083, "logps/rejected": -2.842911720275879, "loss": 0.7159, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.40183687210083, "rewards/margins": 0.44107475876808167, "rewards/rejected": -2.842911720275879, "sft_loss": 2.27349591255188, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 1.3092436802370593, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.03172747418284416, "logits/rejected": 0.1991158276796341, "logps/chosen": -2.472367763519287, "logps/rejected": -2.981828212738037, "loss": 0.7139, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.472367763519287, "rewards/margins": 0.5094602704048157, "rewards/rejected": -2.981828212738037, "sft_loss": 2.336221218109131, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 1.6621210076695574, "learning_rate": 9.993155822166455e-07, "logits/chosen": 0.03128929063677788, "logits/rejected": 0.11202099174261093, "logps/chosen": -2.5021328926086426, "logps/rejected": -3.0028514862060547, "loss": 0.7124, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.5021328926086426, "rewards/margins": 0.5007185339927673, "rewards/rejected": -3.0028514862060547, "sft_loss": 2.308013439178467, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 1.3392167687694965, "learning_rate": 9.992317004533313e-07, "logits/chosen": 0.06702680140733719, "logits/rejected": 0.19247296452522278, "logps/chosen": -2.5227482318878174, "logps/rejected": -3.011077642440796, "loss": 0.7163, "rewards/accuracies": 0.65625, "rewards/chosen": -2.5227482318878174, "rewards/margins": 0.4883296489715576, "rewards/rejected": -3.011077642440796, "sft_loss": 2.4135658740997314, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 1.8709007126693438, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.07573200017213821, "logits/rejected": 0.10133898258209229, "logps/chosen": -2.5253071784973145, "logps/rejected": -2.9615590572357178, "loss": 0.7166, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.5253071784973145, "rewards/margins": 0.43625155091285706, "rewards/rejected": -2.9615590572357178, "sft_loss": 2.3898026943206787, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 2.3425277167206215, "learning_rate": 9.99049407143074e-07, "logits/chosen": -0.012026980519294739, "logits/rejected": 0.13976624608039856, "logps/chosen": -2.443476915359497, "logps/rejected": -2.661541700363159, "loss": 0.7152, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.443476915359497, "rewards/margins": 0.2180650681257248, "rewards/rejected": -2.661541700363159, "sft_loss": 2.2523090839385986, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 2.129722647662276, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.006276947446167469, "logits/rejected": 0.15791794657707214, "logps/chosen": -2.2242307662963867, "logps/rejected": -2.573673963546753, "loss": 0.7116, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.2242307662963867, "rewards/margins": 0.349443256855011, "rewards/rejected": -2.573673963546753, "sft_loss": 2.207887887954712, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 2.3242009017910963, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.041744984686374664, "logits/rejected": 0.14908739924430847, "logps/chosen": -2.3040108680725098, "logps/rejected": -2.683948516845703, "loss": 0.7153, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.3040108680725098, "rewards/margins": 0.37993746995925903, "rewards/rejected": -2.683948516845703, "sft_loss": 2.2961134910583496, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 1.6993879707708788, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.037251681089401245, "logits/rejected": 0.058357805013656616, "logps/chosen": -2.2798972129821777, "logps/rejected": -2.778377056121826, "loss": 0.7093, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2798972129821777, "rewards/margins": 0.4984796643257141, "rewards/rejected": -2.778377056121826, "sft_loss": 2.2684168815612793, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 4.211312152650652, "learning_rate": 9.986267271350631e-07, "logits/chosen": -0.0022398352157324553, "logits/rejected": 0.15500149130821228, "logps/chosen": -2.3524091243743896, "logps/rejected": -2.7513551712036133, "loss": 0.7179, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.3524091243743896, "rewards/margins": 0.39894604682922363, "rewards/rejected": -2.7513551712036133, "sft_loss": 2.2516465187072754, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 1.6333954397076826, "learning_rate": 9.985089602559123e-07, "logits/chosen": -0.022715812548995018, "logits/rejected": 0.1583533138036728, "logps/chosen": -2.3316569328308105, "logps/rejected": -2.73073148727417, "loss": 0.7089, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3316569328308105, "rewards/margins": 0.3990745544433594, "rewards/rejected": -2.73073148727417, "sft_loss": 2.2660598754882812, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 2.8513640368794304, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.06075746566057205, "logits/rejected": 0.09779484570026398, "logps/chosen": -2.4610791206359863, "logps/rejected": -2.9051265716552734, "loss": 0.7085, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.4610791206359863, "rewards/margins": 0.44404739141464233, "rewards/rejected": -2.9051265716552734, "sft_loss": 2.4020164012908936, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 2.353937482827338, "learning_rate": 9.982589180787532e-07, "logits/chosen": -0.026162762194871902, "logits/rejected": 0.06587730348110199, "logps/chosen": -2.3094372749328613, "logps/rejected": -2.7674665451049805, "loss": 0.7063, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3094372749328613, "rewards/margins": 0.45802921056747437, "rewards/rejected": -2.7674665451049805, "sft_loss": 2.2600460052490234, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 2.3243908024294724, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.10598037391901016, "logits/rejected": 0.03518640622496605, "logps/chosen": -2.3793280124664307, "logps/rejected": -2.6736900806427, "loss": 0.7147, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.3793280124664307, "rewards/margins": 0.2943619191646576, "rewards/rejected": -2.6736900806427, "sft_loss": 2.2658851146698, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 1.4643334623964388, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.05929265171289444, "logits/rejected": 0.14080263674259186, "logps/chosen": -2.3748557567596436, "logps/rejected": -2.955979824066162, "loss": 0.7055, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3748557567596436, "rewards/margins": 0.5811238288879395, "rewards/rejected": -2.955979824066162, "sft_loss": 2.2898504734039307, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 2.667905722196229, "learning_rate": 9.9784760231197e-07, "logits/chosen": -0.010344207286834717, "logits/rejected": 0.10175009071826935, "logps/chosen": -2.370831251144409, "logps/rejected": -2.8821828365325928, "loss": 0.7045, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.370831251144409, "rewards/margins": 0.5113516449928284, "rewards/rejected": -2.8821828365325928, "sft_loss": 2.232008457183838, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 1.3278072182397616, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.10242640972137451, "logits/rejected": 0.06512792408466339, "logps/chosen": -2.4747941493988037, "logps/rejected": -2.8564939498901367, "loss": 0.7109, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.4747941493988037, "rewards/margins": 0.38169991970062256, "rewards/rejected": -2.8564939498901367, "sft_loss": 2.218146800994873, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 2.136760100983307, "learning_rate": 9.97549238985662e-07, "logits/chosen": -0.02182629704475403, "logits/rejected": 0.16530433297157288, "logps/chosen": -2.430757999420166, "logps/rejected": -2.852879524230957, "loss": 0.713, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.430757999420166, "rewards/margins": 0.4221215844154358, "rewards/rejected": -2.852879524230957, "sft_loss": 2.345231533050537, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 4.6770537298554204, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.09692031145095825, "logits/rejected": 0.055900346487760544, "logps/chosen": -2.342827320098877, "logps/rejected": -2.8272900581359863, "loss": 0.7095, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.342827320098877, "rewards/margins": 0.48446279764175415, "rewards/rejected": -2.8272900581359863, "sft_loss": 2.2327513694763184, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 2.2551930524603083, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.15884767472743988, "logits/rejected": -0.019425716251134872, "logps/chosen": -2.444471836090088, "logps/rejected": -2.7931387424468994, "loss": 0.7094, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.444471836090088, "rewards/margins": 0.3486667275428772, "rewards/rejected": -2.7931387424468994, "sft_loss": 2.2840495109558105, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 1.446991465344738, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.1402565836906433, "logits/rejected": -0.01722204126417637, "logps/chosen": -2.2898192405700684, "logps/rejected": -2.594698190689087, "loss": 0.7104, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.2898192405700684, "rewards/margins": 0.30487877130508423, "rewards/rejected": -2.594698190689087, "sft_loss": 2.2371888160705566, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 2.254533551657452, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.17079803347587585, "logits/rejected": -0.001663824892602861, "logps/chosen": -2.373328685760498, "logps/rejected": -2.7454586029052734, "loss": 0.7082, "rewards/accuracies": 0.65625, "rewards/chosen": -2.373328685760498, "rewards/margins": 0.3721300959587097, "rewards/rejected": -2.7454586029052734, "sft_loss": 2.2215752601623535, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 2.899391862149443, "learning_rate": 9.967188816117726e-07, "logits/chosen": -0.05076109245419502, "logits/rejected": 0.045064955949783325, "logps/chosen": -2.3542187213897705, "logps/rejected": -2.8510525226593018, "loss": 0.7089, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3542187213897705, "rewards/margins": 0.4968341886997223, "rewards/rejected": -2.8510525226593018, "sft_loss": 2.2298872470855713, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 1.8550090633215264, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.09492182731628418, "logits/rejected": 0.08217965066432953, "logps/chosen": -2.2518415451049805, "logps/rejected": -2.728621244430542, "loss": 0.7047, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2518415451049805, "rewards/margins": 0.47677963972091675, "rewards/rejected": -2.728621244430542, "sft_loss": 2.1824424266815186, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 2.039586623800378, "learning_rate": 9.963529928746533e-07, "logits/chosen": -0.01995522901415825, "logits/rejected": 0.12563300132751465, "logps/chosen": -2.253056049346924, "logps/rejected": -2.7270843982696533, "loss": 0.7117, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.253056049346924, "rewards/margins": 0.47402825951576233, "rewards/rejected": -2.7270843982696533, "sft_loss": 2.203734874725342, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 1.6289697484565457, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.10875172913074493, "logits/rejected": -0.02619188465178013, "logps/chosen": -2.293623924255371, "logps/rejected": -2.7733473777770996, "loss": 0.7158, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.293623924255371, "rewards/margins": 0.47972339391708374, "rewards/rejected": -2.7733473777770996, "sft_loss": 2.2108073234558105, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 2.0602281192051457, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.06123873591423035, "logits/rejected": 0.04782121628522873, "logps/chosen": -2.481053352355957, "logps/rejected": -2.871842384338379, "loss": 0.7139, "rewards/accuracies": 0.59375, "rewards/chosen": -2.481053352355957, "rewards/margins": 0.3907889425754547, "rewards/rejected": -2.871842384338379, "sft_loss": 2.2680141925811768, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 1.621361099423288, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.03413773700594902, "logits/rejected": 0.18661415576934814, "logps/chosen": -2.510728597640991, "logps/rejected": -3.1267201900482178, "loss": 0.7073, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.510728597640991, "rewards/margins": 0.6159918904304504, "rewards/rejected": -3.1267201900482178, "sft_loss": 2.2942333221435547, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 1.2629751944575442, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.11486305296421051, "logits/rejected": 0.08284200727939606, "logps/chosen": -2.491071939468384, "logps/rejected": -2.922267436981201, "loss": 0.7097, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.491071939468384, "rewards/margins": 0.43119579553604126, "rewards/rejected": -2.922267436981201, "sft_loss": 2.2405238151550293, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 2.3807379465238987, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.15121929347515106, "logits/rejected": 0.02654470130801201, "logps/chosen": -2.372063636779785, "logps/rejected": -2.7418456077575684, "loss": 0.7143, "rewards/accuracies": 0.59375, "rewards/chosen": -2.372063636779785, "rewards/margins": 0.3697819113731384, "rewards/rejected": -2.7418456077575684, "sft_loss": 2.2500219345092773, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 2.5285997561445415, "learning_rate": 9.951398126243133e-07, "logits/chosen": -0.04930179938673973, "logits/rejected": 0.08506530523300171, "logps/chosen": -2.4192235469818115, "logps/rejected": -2.856471538543701, "loss": 0.7154, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.4192235469818115, "rewards/margins": 0.43724822998046875, "rewards/rejected": -2.856471538543701, "sft_loss": 2.2640137672424316, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 1.7388598414744387, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.08470118045806885, "logits/rejected": 0.04234440252184868, "logps/chosen": -2.269394874572754, "logps/rejected": -2.7940497398376465, "loss": 0.7059, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.269394874572754, "rewards/margins": 0.5246550440788269, "rewards/rejected": -2.7940497398376465, "sft_loss": 2.1549248695373535, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 1.2915713925072447, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.11627205461263657, "logits/rejected": 0.08593938499689102, "logps/chosen": -2.153496503829956, "logps/rejected": -2.681713581085205, "loss": 0.7077, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.153496503829956, "rewards/margins": 0.5282168388366699, "rewards/rejected": -2.681713581085205, "sft_loss": 2.069179058074951, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 2.743951187418527, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.0455852746963501, "logits/rejected": 0.04003704711794853, "logps/chosen": -2.217771530151367, "logps/rejected": -2.646599769592285, "loss": 0.7105, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.217771530151367, "rewards/margins": 0.4288281500339508, "rewards/rejected": -2.646599769592285, "sft_loss": 2.1145291328430176, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.2856700122356415, "eval_logits/rejected": 0.3870464861392975, "eval_logps/chosen": -2.235349655151367, "eval_logps/rejected": -2.7243196964263916, "eval_loss": 0.7055234313011169, "eval_rewards/accuracies": 0.6446587443351746, "eval_rewards/chosen": -2.235349655151367, "eval_rewards/margins": 0.4889698326587677, "eval_rewards/rejected": -2.7243196964263916, "eval_runtime": 43.3921, "eval_samples_per_second": 30.996, "eval_sft_loss": 2.1066486835479736, "eval_steps_per_second": 7.766, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 1.6950654692711624, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.13364177942276, "logits/rejected": 0.04280375689268112, "logps/chosen": -2.181466579437256, "logps/rejected": -2.7770562171936035, "loss": 0.6995, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.181466579437256, "rewards/margins": 0.595589280128479, "rewards/rejected": -2.7770562171936035, "sft_loss": 2.1175734996795654, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 3.107157954891758, "learning_rate": 9.939967071845424e-07, "logits/chosen": -0.0517873540520668, "logits/rejected": 0.02858349122107029, "logps/chosen": -2.2146787643432617, "logps/rejected": -2.631208896636963, "loss": 0.7107, "rewards/accuracies": 0.65625, "rewards/chosen": -2.2146787643432617, "rewards/margins": 0.41653013229370117, "rewards/rejected": -2.631208896636963, "sft_loss": 2.177154064178467, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 3.62624736111432, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.0011221707100048661, "logits/rejected": 0.12757374346256256, "logps/chosen": -2.143000602722168, "logps/rejected": -2.7792890071868896, "loss": 0.6987, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.143000602722168, "rewards/margins": 0.6362886428833008, "rewards/rejected": -2.7792890071868896, "sft_loss": 2.0995757579803467, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 4.632145814822578, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.02935582958161831, "logits/rejected": 0.08219500631093979, "logps/chosen": -2.1796677112579346, "logps/rejected": -2.678680658340454, "loss": 0.7122, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1796677112579346, "rewards/margins": 0.4990130364894867, "rewards/rejected": -2.678680658340454, "sft_loss": 2.1346020698547363, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 2.5174642262048366, "learning_rate": 9.932533129839333e-07, "logits/chosen": 0.019097473472356796, "logits/rejected": 0.16284245252609253, "logps/chosen": -2.130647659301758, "logps/rejected": -2.58423113822937, "loss": 0.7126, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.130647659301758, "rewards/margins": 0.4535837173461914, "rewards/rejected": -2.58423113822937, "sft_loss": 2.1586554050445557, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 2.555830574006697, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.1632825881242752, "logits/rejected": 0.3447878956794739, "logps/chosen": -2.2275099754333496, "logps/rejected": -2.6180667877197266, "loss": 0.7146, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2275099754333496, "rewards/margins": 0.39055711030960083, "rewards/rejected": -2.6180667877197266, "sft_loss": 2.1432220935821533, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 3.672400135778624, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.08189113438129425, "logits/rejected": 0.22178630530834198, "logps/chosen": -2.357943058013916, "logps/rejected": -2.7608790397644043, "loss": 0.7142, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.357943058013916, "rewards/margins": 0.4029361605644226, "rewards/rejected": -2.7608790397644043, "sft_loss": 2.332587242126465, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 3.5725253771617753, "learning_rate": 9.924668491496474e-07, "logits/chosen": 0.06914591789245605, "logits/rejected": 0.32668501138687134, "logps/chosen": -2.4014992713928223, "logps/rejected": -2.8846962451934814, "loss": 0.7154, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.4014992713928223, "rewards/margins": 0.48319679498672485, "rewards/rejected": -2.8846962451934814, "sft_loss": 2.3815293312072754, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 3.1294455746461116, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.1843930184841156, "logits/rejected": 0.2454400509595871, "logps/chosen": -2.3867859840393066, "logps/rejected": -2.7935075759887695, "loss": 0.7121, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.3867859840393066, "rewards/margins": 0.4067217707633972, "rewards/rejected": -2.7935075759887695, "sft_loss": 2.3315231800079346, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 3.0367682866830297, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.15431417524814606, "logits/rejected": 0.2713518738746643, "logps/chosen": -2.562764883041382, "logps/rejected": -3.004469394683838, "loss": 0.71, "rewards/accuracies": 0.65625, "rewards/chosen": -2.562764883041382, "rewards/margins": 0.4417042136192322, "rewards/rejected": -3.004469394683838, "sft_loss": 2.359578847885132, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 2.2239671562004606, "learning_rate": 9.9163738435372e-07, "logits/chosen": 0.17244210839271545, "logits/rejected": 0.34257280826568604, "logps/chosen": -2.564682960510254, "logps/rejected": -3.1213314533233643, "loss": 0.7148, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.564682960510254, "rewards/margins": 0.5566484332084656, "rewards/rejected": -3.1213314533233643, "sft_loss": 2.3615224361419678, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 1.0920440799644606, "learning_rate": 9.913513527293234e-07, "logits/chosen": 0.09747910499572754, "logits/rejected": 0.3031081259250641, "logps/chosen": -2.531813144683838, "logps/rejected": -3.162034034729004, "loss": 0.7055, "rewards/accuracies": 0.65625, "rewards/chosen": -2.531813144683838, "rewards/margins": 0.630220890045166, "rewards/rejected": -3.162034034729004, "sft_loss": 2.3049185276031494, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 4.842613450105217, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.18088313937187195, "logits/rejected": 0.3242112696170807, "logps/chosen": -2.553947925567627, "logps/rejected": -3.1162753105163574, "loss": 0.7099, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.553947925567627, "rewards/margins": 0.56232750415802, "rewards/rejected": -3.1162753105163574, "sft_loss": 2.3149704933166504, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 1.676184646304957, "learning_rate": 9.907649910229227e-07, "logits/chosen": 0.1025063768029213, "logits/rejected": 0.42056649923324585, "logps/chosen": -2.3593525886535645, "logps/rejected": -2.967888355255127, "loss": 0.7043, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.3593525886535645, "rewards/margins": 0.608535647392273, "rewards/rejected": -2.967888355255127, "sft_loss": 2.2593531608581543, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 2.947417875013683, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.17791959643363953, "logits/rejected": 0.27456963062286377, "logps/chosen": -2.3868355751037598, "logps/rejected": -2.758819580078125, "loss": 0.7185, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.3868355751037598, "rewards/margins": 0.37198397517204285, "rewards/rejected": -2.758819580078125, "sft_loss": 2.237908124923706, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 3.535073328729538, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.129004567861557, "logits/rejected": 0.3492968678474426, "logps/chosen": -2.2729153633117676, "logps/rejected": -2.8254361152648926, "loss": 0.7008, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2729153633117676, "rewards/margins": 0.552520751953125, "rewards/rejected": -2.8254361152648926, "sft_loss": 2.122039318084717, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 1.9883077785571128, "learning_rate": 9.898497453324384e-07, "logits/chosen": 0.08982595801353455, "logits/rejected": 0.19023357331752777, "logps/chosen": -2.2362680435180664, "logps/rejected": -2.804640531539917, "loss": 0.7015, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2362680435180664, "rewards/margins": 0.568372905254364, "rewards/rejected": -2.804640531539917, "sft_loss": 2.1717028617858887, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 1.7328895899308072, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.07798051089048386, "logits/rejected": 0.08264918625354767, "logps/chosen": -2.19826602935791, "logps/rejected": -2.604795455932617, "loss": 0.7121, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.19826602935791, "rewards/margins": 0.4065292775630951, "rewards/rejected": -2.604795455932617, "sft_loss": 2.130378246307373, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 2.423264934459615, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.06881462037563324, "logits/rejected": 0.18817448616027832, "logps/chosen": -2.142784833908081, "logps/rejected": -2.5093910694122314, "loss": 0.7047, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.142784833908081, "rewards/margins": 0.36660611629486084, "rewards/rejected": -2.5093910694122314, "sft_loss": 2.0799660682678223, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 2.4706182493332007, "learning_rate": 9.88891727199209e-07, "logits/chosen": 0.026805415749549866, "logits/rejected": 0.1365644782781601, "logps/chosen": -2.190398693084717, "logps/rejected": -2.69905686378479, "loss": 0.7007, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.190398693084717, "rewards/margins": 0.508658230304718, "rewards/rejected": -2.69905686378479, "sft_loss": 2.1007237434387207, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 2.2734688952192457, "learning_rate": 9.885628971850641e-07, "logits/chosen": 0.025261899456381798, "logits/rejected": 0.23059232532978058, "logps/chosen": -2.249539852142334, "logps/rejected": -2.822911024093628, "loss": 0.7067, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.249539852142334, "rewards/margins": 0.573371171951294, "rewards/rejected": -2.822911024093628, "sft_loss": 2.253406524658203, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 1.905964454067589, "learning_rate": 9.882293271315481e-07, "logits/chosen": 0.062464743852615356, "logits/rejected": 0.18661728501319885, "logps/chosen": -2.2542724609375, "logps/rejected": -2.6662635803222656, "loss": 0.7263, "rewards/accuracies": 0.5625, "rewards/chosen": -2.2542724609375, "rewards/margins": 0.41199105978012085, "rewards/rejected": -2.6662635803222656, "sft_loss": 2.176217555999756, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 2.078130359969576, "learning_rate": 9.878910202749589e-07, "logits/chosen": 0.060682039707899094, "logits/rejected": 0.2705400288105011, "logps/chosen": -2.239940881729126, "logps/rejected": -2.697516441345215, "loss": 0.7101, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.239940881729126, "rewards/margins": 0.45757532119750977, "rewards/rejected": -2.697516441345215, "sft_loss": 2.1451468467712402, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 2.0532116326739382, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.14619162678718567, "logits/rejected": 0.3306914269924164, "logps/chosen": -2.332319498062134, "logps/rejected": -2.874541759490967, "loss": 0.7052, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.332319498062134, "rewards/margins": 0.5422223210334778, "rewards/rejected": -2.874541759490967, "sft_loss": 2.18951153755188, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 1.3305226834529542, "learning_rate": 9.87200209327504e-07, "logits/chosen": 0.07990497350692749, "logits/rejected": 0.30456340312957764, "logps/chosen": -2.451970338821411, "logps/rejected": -2.8729283809661865, "loss": 0.7083, "rewards/accuracies": 0.625, "rewards/chosen": -2.451970338821411, "rewards/margins": 0.4209579527378082, "rewards/rejected": -2.8729283809661865, "sft_loss": 2.1967291831970215, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 2.3836534390528743, "learning_rate": 9.868477119388894e-07, "logits/chosen": 0.08301494270563126, "logits/rejected": 0.18483616411685944, "logps/chosen": -2.343928337097168, "logps/rejected": -2.943770170211792, "loss": 0.7128, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.343928337097168, "rewards/margins": 0.5998419523239136, "rewards/rejected": -2.943770170211792, "sft_loss": 2.2015366554260254, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 1.9175709672468506, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.14284172654151917, "logits/rejected": 0.2308415174484253, "logps/chosen": -2.4166016578674316, "logps/rejected": -2.889622211456299, "loss": 0.7092, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.4166016578674316, "rewards/margins": 0.4730204939842224, "rewards/rejected": -2.889622211456299, "sft_loss": 2.2566981315612793, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 1.7468529370557997, "learning_rate": 9.861285504315084e-07, "logits/chosen": 0.14955595135688782, "logits/rejected": 0.26237112283706665, "logps/chosen": -2.4178216457366943, "logps/rejected": -2.961228609085083, "loss": 0.7034, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.4178216457366943, "rewards/margins": 0.54340660572052, "rewards/rejected": -2.961228609085083, "sft_loss": 2.302690267562866, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 1.7821999235187895, "learning_rate": 9.857618932900502e-07, "logits/chosen": 0.04973750561475754, "logits/rejected": 0.23481830954551697, "logps/chosen": -2.339595079421997, "logps/rejected": -2.958242893218994, "loss": 0.6998, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.339595079421997, "rewards/margins": 0.6186478137969971, "rewards/rejected": -2.958242893218994, "sft_loss": 2.2271862030029297, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 2.375938544297863, "learning_rate": 9.853905232845727e-07, "logits/chosen": 0.03858000785112381, "logits/rejected": 0.2346847802400589, "logps/chosen": -2.3574490547180176, "logps/rejected": -2.7179243564605713, "loss": 0.714, "rewards/accuracies": 0.59375, "rewards/chosen": -2.3574490547180176, "rewards/margins": 0.3604753017425537, "rewards/rejected": -2.7179243564605713, "sft_loss": 2.230804204940796, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 1.3797533963570972, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.10901288688182831, "logits/rejected": 0.3531850278377533, "logps/chosen": -2.3547780513763428, "logps/rejected": -2.7378902435302734, "loss": 0.7157, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.3547780513763428, "rewards/margins": 0.38311222195625305, "rewards/rejected": -2.7378902435302734, "sft_loss": 2.27116322517395, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 2.4267629639987205, "learning_rate": 9.846336591393832e-07, "logits/chosen": 0.12777450680732727, "logits/rejected": 0.3147388696670532, "logps/chosen": -2.2933754920959473, "logps/rejected": -2.753551959991455, "loss": 0.7081, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.2933754920959473, "rewards/margins": 0.46017637848854065, "rewards/rejected": -2.753551959991455, "sft_loss": 2.24653697013855, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 1.876050989117389, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.16586394608020782, "logits/rejected": 0.19615688920021057, "logps/chosen": -2.46045184135437, "logps/rejected": -2.9221224784851074, "loss": 0.7113, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.46045184135437, "rewards/margins": 0.46167030930519104, "rewards/rejected": -2.9221224784851074, "sft_loss": 2.3795344829559326, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 2.8189772657931726, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.1627335250377655, "logits/rejected": 0.17865543067455292, "logps/chosen": -2.2735114097595215, "logps/rejected": -2.731293201446533, "loss": 0.7084, "rewards/accuracies": 0.625, "rewards/chosen": -2.2735114097595215, "rewards/margins": 0.45778173208236694, "rewards/rejected": -2.731293201446533, "sft_loss": 2.200697898864746, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 1.3613854877272462, "learning_rate": 9.834631080014457e-07, "logits/chosen": 0.047867465764284134, "logits/rejected": 0.27975499629974365, "logps/chosen": -2.2784676551818848, "logps/rejected": -2.8016064167022705, "loss": 0.7053, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2784676551818848, "rewards/margins": 0.5231388807296753, "rewards/rejected": -2.8016064167022705, "sft_loss": 2.259875774383545, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 2.045704139911155, "learning_rate": 9.830635380734312e-07, "logits/chosen": 0.03475247696042061, "logits/rejected": 0.26037245988845825, "logps/chosen": -2.4209530353546143, "logps/rejected": -2.78556489944458, "loss": 0.7137, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.4209530353546143, "rewards/margins": 0.3646118640899658, "rewards/rejected": -2.78556489944458, "sft_loss": 2.3162875175476074, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 2.1236606797076285, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.1473628282546997, "logits/rejected": 0.3909761905670166, "logps/chosen": -2.32572603225708, "logps/rejected": -2.7549567222595215, "loss": 0.7103, "rewards/accuracies": 0.65625, "rewards/chosen": -2.32572603225708, "rewards/margins": 0.42923077940940857, "rewards/rejected": -2.7549567222595215, "sft_loss": 2.276968002319336, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 2.2436977480189575, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.2726105749607086, "logits/rejected": 0.2880789637565613, "logps/chosen": -2.1168220043182373, "logps/rejected": -2.6466643810272217, "loss": 0.7027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1168220043182373, "rewards/margins": 0.5298421382904053, "rewards/rejected": -2.6466643810272217, "sft_loss": 2.1228058338165283, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 2.0435846330257834, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.22745075821876526, "logits/rejected": 0.3045138418674469, "logps/chosen": -2.261383295059204, "logps/rejected": -2.6280386447906494, "loss": 0.718, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.261383295059204, "rewards/margins": 0.366655170917511, "rewards/rejected": -2.6280386447906494, "sft_loss": 2.2003605365753174, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 1.4535000931898072, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.1563439518213272, "logits/rejected": 0.34087103605270386, "logps/chosen": -2.3654720783233643, "logps/rejected": -2.9726266860961914, "loss": 0.7015, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3654720783233643, "rewards/margins": 0.607154369354248, "rewards/rejected": -2.9726266860961914, "sft_loss": 2.3127644062042236, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 2.08251689341471, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.18138954043388367, "logits/rejected": 0.34113839268684387, "logps/chosen": -2.468475818634033, "logps/rejected": -3.0173211097717285, "loss": 0.7131, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.468475818634033, "rewards/margins": 0.5488454103469849, "rewards/rejected": -3.0173211097717285, "sft_loss": 2.478551149368286, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 1.9358358256624035, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.1209973692893982, "logits/rejected": 0.3399454951286316, "logps/chosen": -2.2919278144836426, "logps/rejected": -2.8712244033813477, "loss": 0.706, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.2919278144836426, "rewards/margins": 0.5792968273162842, "rewards/rejected": -2.8712244033813477, "sft_loss": 2.263434886932373, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 2.0145294277350794, "learning_rate": 9.801355442251625e-07, "logits/chosen": 0.1270841807126999, "logits/rejected": 0.3247852921485901, "logps/chosen": -2.1929144859313965, "logps/rejected": -2.6835200786590576, "loss": 0.7043, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1929144859313965, "rewards/margins": 0.49060574173927307, "rewards/rejected": -2.6835200786590576, "sft_loss": 2.1795461177825928, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 3.9739830078495095, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.13059914112091064, "logits/rejected": 0.3214924931526184, "logps/chosen": -2.224752187728882, "logps/rejected": -2.8040366172790527, "loss": 0.7077, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.224752187728882, "rewards/margins": 0.5792843103408813, "rewards/rejected": -2.8040366172790527, "sft_loss": 2.2124037742614746, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 3.151943102651046, "learning_rate": 9.792569880987724e-07, "logits/chosen": 0.10221121460199356, "logits/rejected": 0.2548214793205261, "logps/chosen": -2.177299976348877, "logps/rejected": -2.803210496902466, "loss": 0.6956, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.177299976348877, "rewards/margins": 0.625910222530365, "rewards/rejected": -2.803210496902466, "sft_loss": 2.131620168685913, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 4.302780014141352, "learning_rate": 9.788107332632493e-07, "logits/chosen": 0.16803228855133057, "logits/rejected": 0.27060407400131226, "logps/chosen": -2.1256794929504395, "logps/rejected": -2.6368584632873535, "loss": 0.7032, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1256794929504395, "rewards/margins": 0.5111792087554932, "rewards/rejected": -2.6368584632873535, "sft_loss": 2.1204867362976074, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 1.9088328959701046, "learning_rate": 9.783598330038924e-07, "logits/chosen": 0.1321832537651062, "logits/rejected": 0.27038392424583435, "logps/chosen": -2.1786580085754395, "logps/rejected": -2.5679731369018555, "loss": 0.7235, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1786580085754395, "rewards/margins": 0.3893152177333832, "rewards/rejected": -2.5679731369018555, "sft_loss": 2.169933319091797, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 1.5228203778779503, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.1220581904053688, "logits/rejected": 0.3341611623764038, "logps/chosen": -2.1551034450531006, "logps/rejected": -2.824923038482666, "loss": 0.6967, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1551034450531006, "rewards/margins": 0.6698193550109863, "rewards/rejected": -2.824923038482666, "sft_loss": 2.188110828399658, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 1.8722753805757277, "learning_rate": 9.774441137572487e-07, "logits/chosen": 0.0969678983092308, "logits/rejected": 0.2719166874885559, "logps/chosen": -2.218916654586792, "logps/rejected": -2.8767879009246826, "loss": 0.7044, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.218916654586792, "rewards/margins": 0.657870888710022, "rewards/rejected": -2.8767879009246826, "sft_loss": 2.1699187755584717, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 2.524700899597618, "learning_rate": 9.76979303654274e-07, "logits/chosen": 0.07900562137365341, "logits/rejected": 0.2019214928150177, "logps/chosen": -2.3817086219787598, "logps/rejected": -2.9759418964385986, "loss": 0.7069, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.3817086219787598, "rewards/margins": 0.5942331552505493, "rewards/rejected": -2.9759418964385986, "sft_loss": 2.3367366790771484, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 2.778143248389814, "learning_rate": 9.765098658960035e-07, "logits/chosen": 0.15386387705802917, "logits/rejected": 0.22167351841926575, "logps/chosen": -2.329440116882324, "logps/rejected": -2.9883041381835938, "loss": 0.6992, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.329440116882324, "rewards/margins": 0.6588642001152039, "rewards/rejected": -2.9883041381835938, "sft_loss": 2.254359483718872, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 2.163166547109692, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.20119425654411316, "logits/rejected": 0.4054529070854187, "logps/chosen": -2.332470655441284, "logps/rejected": -2.8844902515411377, "loss": 0.703, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.332470655441284, "rewards/margins": 0.5520192384719849, "rewards/rejected": -2.8844902515411377, "sft_loss": 2.2267251014709473, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 2.572689395806096, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.14655031263828278, "logits/rejected": 0.2958758473396301, "logps/chosen": -2.2035248279571533, "logps/rejected": -2.95106840133667, "loss": 0.6966, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2035248279571533, "rewards/margins": 0.7475437521934509, "rewards/rejected": -2.95106840133667, "sft_loss": 2.208761692047119, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 2.4396453685753428, "learning_rate": 9.750738324585097e-07, "logits/chosen": 0.07241447269916534, "logits/rejected": 0.3418889343738556, "logps/chosen": -2.136514902114868, "logps/rejected": -2.6713004112243652, "loss": 0.7007, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.136514902114868, "rewards/margins": 0.5347855687141418, "rewards/rejected": -2.6713004112243652, "sft_loss": 2.092672109603882, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 3.0120905040874506, "learning_rate": 9.74585930072237e-07, "logits/chosen": 0.18253377079963684, "logits/rejected": 0.33938705921173096, "logps/chosen": -2.040200710296631, "logps/rejected": -2.693777561187744, "loss": 0.7011, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.040200710296631, "rewards/margins": 0.6535772085189819, "rewards/rejected": -2.693777561187744, "sft_loss": 2.049227237701416, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 4.291489973247969, "learning_rate": 9.740934232511892e-07, "logits/chosen": 0.08960796147584915, "logits/rejected": 0.21009111404418945, "logps/chosen": -2.233616828918457, "logps/rejected": -2.7052865028381348, "loss": 0.7068, "rewards/accuracies": 0.65625, "rewards/chosen": -2.233616828918457, "rewards/margins": 0.4716699719429016, "rewards/rejected": -2.7052865028381348, "sft_loss": 2.2202603816986084, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 1.6984441890547815, "learning_rate": 9.735963167736698e-07, "logits/chosen": 0.14620177447795868, "logits/rejected": 0.3416219651699066, "logps/chosen": -2.303943157196045, "logps/rejected": -2.638916492462158, "loss": 0.7126, "rewards/accuracies": 0.5625, "rewards/chosen": -2.303943157196045, "rewards/margins": 0.3349727988243103, "rewards/rejected": -2.638916492462158, "sft_loss": 2.2308120727539062, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 2.0181546365872443, "learning_rate": 9.730946154626078e-07, "logits/chosen": 0.16837917268276215, "logits/rejected": 0.2764458954334259, "logps/chosen": -2.180739164352417, "logps/rejected": -2.62367582321167, "loss": 0.7072, "rewards/accuracies": 0.59375, "rewards/chosen": -2.180739164352417, "rewards/margins": 0.44293683767318726, "rewards/rejected": -2.62367582321167, "sft_loss": 2.108156204223633, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 2.3660885845943467, "learning_rate": 9.725883241855117e-07, "logits/chosen": 0.0687054768204689, "logits/rejected": 0.2612117528915405, "logps/chosen": -2.339418888092041, "logps/rejected": -2.985769748687744, "loss": 0.7088, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.339418888092041, "rewards/margins": 0.6463507413864136, "rewards/rejected": -2.985769748687744, "sft_loss": 2.275740385055542, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 2.294313228399238, "learning_rate": 9.720774478544218e-07, "logits/chosen": 0.18929219245910645, "logits/rejected": 0.35968270897865295, "logps/chosen": -2.3253414630889893, "logps/rejected": -3.0628182888031006, "loss": 0.7024, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.3253414630889893, "rewards/margins": 0.7374764680862427, "rewards/rejected": -3.0628182888031006, "sft_loss": 2.2206921577453613, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 2.4074171680487217, "learning_rate": 9.715619914258624e-07, "logits/chosen": 0.10449817031621933, "logits/rejected": 0.21964387595653534, "logps/chosen": -2.3780767917633057, "logps/rejected": -2.781838893890381, "loss": 0.7068, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.3780767917633057, "rewards/margins": 0.40376242995262146, "rewards/rejected": -2.781838893890381, "sft_loss": 2.1239264011383057, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 1.7207229215774509, "learning_rate": 9.710419599007937e-07, "logits/chosen": 0.16424152255058289, "logits/rejected": 0.33380651473999023, "logps/chosen": -2.299243927001953, "logps/rejected": -2.7112672328948975, "loss": 0.7182, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.299243927001953, "rewards/margins": 0.41202330589294434, "rewards/rejected": -2.7112672328948975, "sft_loss": 2.189572811126709, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 2.233411754559403, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.14826945960521698, "logits/rejected": 0.3402601182460785, "logps/chosen": -2.322040557861328, "logps/rejected": -3.035202980041504, "loss": 0.704, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.322040557861328, "rewards/margins": 0.7131628394126892, "rewards/rejected": -3.035202980041504, "sft_loss": 2.1495418548583984, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 2.6376907033408323, "learning_rate": 9.699881917868609e-07, "logits/chosen": 0.04995732381939888, "logits/rejected": 0.19983510673046112, "logps/chosen": -2.2585396766662598, "logps/rejected": -2.790225028991699, "loss": 0.7016, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.2585396766662598, "rewards/margins": 0.5316852927207947, "rewards/rejected": -2.790225028991699, "sft_loss": 2.169877290725708, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 2.155798516432301, "learning_rate": 9.694544654216594e-07, "logits/chosen": 0.048025552183389664, "logits/rejected": 0.272845596075058, "logps/chosen": -2.3054137229919434, "logps/rejected": -2.8888356685638428, "loss": 0.7, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.3054137229919434, "rewards/margins": 0.583422064781189, "rewards/rejected": -2.8888356685638428, "sft_loss": 2.1739730834960938, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 2.161030120954079, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.19647592306137085, "logits/rejected": 0.30404067039489746, "logps/chosen": -2.1804988384246826, "logps/rejected": -2.6770386695861816, "loss": 0.7074, "rewards/accuracies": 0.625, "rewards/chosen": -2.1804988384246826, "rewards/margins": 0.4965395927429199, "rewards/rejected": -2.6770386695861816, "sft_loss": 2.0427918434143066, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 1.7786484279015686, "learning_rate": 9.683733539658138e-07, "logits/chosen": 0.10375823080539703, "logits/rejected": 0.3111613094806671, "logps/chosen": -2.209398031234741, "logps/rejected": -2.770206928253174, "loss": 0.697, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.209398031234741, "rewards/margins": 0.5608090162277222, "rewards/rejected": -2.770206928253174, "sft_loss": 2.001215934753418, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 2.0623461324880252, "learning_rate": 9.678259793641178e-07, "logits/chosen": 0.14811334013938904, "logits/rejected": 0.20558318495750427, "logps/chosen": -2.205653190612793, "logps/rejected": -2.572145700454712, "loss": 0.7099, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.205653190612793, "rewards/margins": 0.3664925694465637, "rewards/rejected": -2.572145700454712, "sft_loss": 2.1566948890686035, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 2.2424158455070224, "learning_rate": 9.672740659127183e-07, "logits/chosen": 0.0038683507591485977, "logits/rejected": 0.1579521894454956, "logps/chosen": -2.1706321239471436, "logps/rejected": -2.7472522258758545, "loss": 0.7053, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1706321239471436, "rewards/margins": 0.5766201615333557, "rewards/rejected": -2.7472522258758545, "sft_loss": 2.0863354206085205, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 1.9617321040832172, "learning_rate": 9.667176189662818e-07, "logits/chosen": 0.06456903368234634, "logits/rejected": 0.24067726731300354, "logps/chosen": -2.1974761486053467, "logps/rejected": -2.812023401260376, "loss": 0.6997, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1974761486053467, "rewards/margins": 0.6145472526550293, "rewards/rejected": -2.812023401260376, "sft_loss": 2.060354709625244, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 2.16126487305779, "learning_rate": 9.661566439234592e-07, "logits/chosen": 0.15495900809764862, "logits/rejected": 0.27409639954566956, "logps/chosen": -2.3435518741607666, "logps/rejected": -2.7621870040893555, "loss": 0.7097, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.3435518741607666, "rewards/margins": 0.4186350405216217, "rewards/rejected": -2.7621870040893555, "sft_loss": 2.2268261909484863, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 2.6944212251680324, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.1989332139492035, "logits/rejected": 0.3290112018585205, "logps/chosen": -2.212402820587158, "logps/rejected": -2.9126648902893066, "loss": 0.6954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.212402820587158, "rewards/margins": 0.700262188911438, "rewards/rejected": -2.9126648902893066, "sft_loss": 2.144258737564087, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 2.1501122351517363, "learning_rate": 9.650211313628636e-07, "logits/chosen": 0.1345038115978241, "logits/rejected": 0.2468636929988861, "logps/chosen": -2.232625722885132, "logps/rejected": -2.700748920440674, "loss": 0.7052, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.232625722885132, "rewards/margins": 0.4681231379508972, "rewards/rejected": -2.700748920440674, "sft_loss": 2.1846437454223633, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 2.681493179683401, "learning_rate": 9.644466048618386e-07, "logits/chosen": 0.15378674864768982, "logits/rejected": 0.34652647376060486, "logps/chosen": -2.3802332878112793, "logps/rejected": -2.8429431915283203, "loss": 0.7063, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.3802332878112793, "rewards/margins": 0.46271008253097534, "rewards/rejected": -2.8429431915283203, "sft_loss": 2.2455978393554688, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 3.349380471785963, "learning_rate": 9.63867572297816e-07, "logits/chosen": 0.12027013301849365, "logits/rejected": 0.34836727380752563, "logps/chosen": -2.1819441318511963, "logps/rejected": -2.6743319034576416, "loss": 0.7072, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1819441318511963, "rewards/margins": 0.49238792061805725, "rewards/rejected": -2.6743319034576416, "sft_loss": 2.1214489936828613, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 1.7856091716715792, "learning_rate": 9.632840392885727e-07, "logits/chosen": 0.12357620894908905, "logits/rejected": 0.3255251944065094, "logps/chosen": -2.2828421592712402, "logps/rejected": -2.861924886703491, "loss": 0.7083, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2828421592712402, "rewards/margins": 0.5790826082229614, "rewards/rejected": -2.861924886703491, "sft_loss": 2.2045605182647705, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 3.1641412359972074, "learning_rate": 9.626960114955483e-07, "logits/chosen": 0.16735532879829407, "logits/rejected": 0.3422599732875824, "logps/chosen": -2.2051072120666504, "logps/rejected": -3.099365234375, "loss": 0.6991, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2051072120666504, "rewards/margins": 0.8942579030990601, "rewards/rejected": -3.099365234375, "sft_loss": 2.128654956817627, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 1.9470313023305175, "learning_rate": 9.621034946237909e-07, "logits/chosen": 0.09066956490278244, "logits/rejected": 0.2607859671115875, "logps/chosen": -2.3099732398986816, "logps/rejected": -3.01261305809021, "loss": 0.6981, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3099732398986816, "rewards/margins": 0.7026399970054626, "rewards/rejected": -3.01261305809021, "sft_loss": 2.2505602836608887, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 3.184610641159135, "learning_rate": 9.615064944219021e-07, "logits/chosen": 0.1539539098739624, "logits/rejected": 0.30308666825294495, "logps/chosen": -2.1489720344543457, "logps/rejected": -2.784179449081421, "loss": 0.695, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1489720344543457, "rewards/margins": 0.6352076530456543, "rewards/rejected": -2.784179449081421, "sft_loss": 2.13791561126709, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 4.4377039888981775, "learning_rate": 9.609050166819803e-07, "logits/chosen": 0.12475017458200455, "logits/rejected": 0.21696901321411133, "logps/chosen": -2.19476580619812, "logps/rejected": -2.6287224292755127, "loss": 0.7071, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.19476580619812, "rewards/margins": 0.4339566230773926, "rewards/rejected": -2.6287224292755127, "sft_loss": 2.0920956134796143, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.555233359336853, "eval_logits/rejected": 0.6762645244598389, "eval_logps/chosen": -2.1362555027008057, "eval_logps/rejected": -2.7640209197998047, "eval_loss": 0.6987842917442322, "eval_rewards/accuracies": 0.6691394448280334, "eval_rewards/chosen": -2.1362555027008057, "eval_rewards/margins": 0.6277655363082886, "eval_rewards/rejected": -2.7640209197998047, "eval_runtime": 43.2828, "eval_samples_per_second": 31.075, "eval_sft_loss": 2.044482946395874, "eval_steps_per_second": 7.786, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 3.830774189470102, "learning_rate": 9.602990672395653e-07, "logits/chosen": 0.00027870089979842305, "logits/rejected": 0.22381892800331116, "logps/chosen": -2.0898985862731934, "logps/rejected": -2.686279773712158, "loss": 0.6929, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0898985862731934, "rewards/margins": 0.5963811874389648, "rewards/rejected": -2.686279773712158, "sft_loss": 2.0494701862335205, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 4.25157046723289, "learning_rate": 9.59688651973581e-07, "logits/chosen": 0.021906504407525063, "logits/rejected": 0.2690677046775818, "logps/chosen": -2.0723724365234375, "logps/rejected": -2.5781683921813965, "loss": 0.7024, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0723724365234375, "rewards/margins": 0.5057964324951172, "rewards/rejected": -2.5781683921813965, "sft_loss": 2.0166726112365723, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 1.7915990533349566, "learning_rate": 9.590737768062792e-07, "logits/chosen": 0.06573444604873657, "logits/rejected": 0.22309072315692902, "logps/chosen": -2.202409029006958, "logps/rejected": -2.7307677268981934, "loss": 0.7096, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.202409029006958, "rewards/margins": 0.528359055519104, "rewards/rejected": -2.7307677268981934, "sft_loss": 2.1225969791412354, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 2.306640871794144, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.2040838748216629, "logits/rejected": 0.36291855573654175, "logps/chosen": -2.194674015045166, "logps/rejected": -2.743324041366577, "loss": 0.7043, "rewards/accuracies": 0.65625, "rewards/chosen": -2.194674015045166, "rewards/margins": 0.5486500859260559, "rewards/rejected": -2.743324041366577, "sft_loss": 2.0980405807495117, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 2.2545845017591235, "learning_rate": 9.578306706730215e-07, "logits/chosen": 0.010583726689219475, "logits/rejected": 0.27537912130355835, "logps/chosen": -2.328881025314331, "logps/rejected": -2.8073489665985107, "loss": 0.7045, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.328881025314331, "rewards/margins": 0.4784678816795349, "rewards/rejected": -2.8073489665985107, "sft_loss": 2.170609951019287, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 3.663446419735623, "learning_rate": 9.572024517676865e-07, "logits/chosen": 0.0909842699766159, "logits/rejected": 0.22236530482769012, "logps/chosen": -2.3348231315612793, "logps/rejected": -2.9920623302459717, "loss": 0.7037, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3348231315612793, "rewards/margins": 0.6572390198707581, "rewards/rejected": -2.9920623302459717, "sft_loss": 2.1873583793640137, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 1.4543890342842616, "learning_rate": 9.565697970821593e-07, "logits/chosen": 0.1440887749195099, "logits/rejected": 0.33302026987075806, "logps/chosen": -2.454163074493408, "logps/rejected": -3.0908660888671875, "loss": 0.7083, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.454163074493408, "rewards/margins": 0.6367031335830688, "rewards/rejected": -3.0908660888671875, "sft_loss": 2.254920721054077, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 2.4722340588869978, "learning_rate": 9.559327127544585e-07, "logits/chosen": 0.026768425479531288, "logits/rejected": 0.19142289459705353, "logps/chosen": -2.4390079975128174, "logps/rejected": -3.031845808029175, "loss": 0.7071, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.4390079975128174, "rewards/margins": 0.592837929725647, "rewards/rejected": -3.031845808029175, "sft_loss": 2.2959556579589844, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 1.8974146027052907, "learning_rate": 9.552912049655789e-07, "logits/chosen": 0.04718981683254242, "logits/rejected": 0.26890960335731506, "logps/chosen": -2.355416774749756, "logps/rejected": -2.8213212490081787, "loss": 0.7085, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.355416774749756, "rewards/margins": 0.46590447425842285, "rewards/rejected": -2.8213212490081787, "sft_loss": 2.2121856212615967, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 2.5707161962938048, "learning_rate": 9.546452799394315e-07, "logits/chosen": 0.08500464260578156, "logits/rejected": 0.3310920298099518, "logps/chosen": -2.398761749267578, "logps/rejected": -2.832510471343994, "loss": 0.7127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.398761749267578, "rewards/margins": 0.43374842405319214, "rewards/rejected": -2.832510471343994, "sft_loss": 2.2532973289489746, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 2.777155725892231, "learning_rate": 9.539949439427846e-07, "logits/chosen": 0.10809852927923203, "logits/rejected": 0.26978474855422974, "logps/chosen": -2.203864336013794, "logps/rejected": -2.7843985557556152, "loss": 0.703, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.203864336013794, "rewards/margins": 0.5805341005325317, "rewards/rejected": -2.7843985557556152, "sft_loss": 2.1802217960357666, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 2.6576584593887658, "learning_rate": 9.533402032852002e-07, "logits/chosen": 0.07701893150806427, "logits/rejected": 0.2485908716917038, "logps/chosen": -2.1946444511413574, "logps/rejected": -2.8757450580596924, "loss": 0.7118, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1946444511413574, "rewards/margins": 0.681100606918335, "rewards/rejected": -2.8757450580596924, "sft_loss": 2.120004177093506, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 2.2163120337499542, "learning_rate": 9.526810643189754e-07, "logits/chosen": 0.1574430763721466, "logits/rejected": 0.3811032474040985, "logps/chosen": -2.132166862487793, "logps/rejected": -2.809222459793091, "loss": 0.7002, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.132166862487793, "rewards/margins": 0.6770555973052979, "rewards/rejected": -2.809222459793091, "sft_loss": 2.09527850151062, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 2.0634743227388324, "learning_rate": 9.52017533439079e-07, "logits/chosen": 0.061243295669555664, "logits/rejected": 0.16142085194587708, "logps/chosen": -2.1922335624694824, "logps/rejected": -2.8089139461517334, "loss": 0.6999, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1922335624694824, "rewards/margins": 0.6166807413101196, "rewards/rejected": -2.8089139461517334, "sft_loss": 2.1539809703826904, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 1.340766627146057, "learning_rate": 9.513496170830909e-07, "logits/chosen": 0.1113109439611435, "logits/rejected": 0.21860043704509735, "logps/chosen": -2.2608723640441895, "logps/rejected": -2.8428518772125244, "loss": 0.7003, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.2608723640441895, "rewards/margins": 0.5819796323776245, "rewards/rejected": -2.8428518772125244, "sft_loss": 2.1284279823303223, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 5.504206460094653, "learning_rate": 9.506773217311382e-07, "logits/chosen": 0.10970310121774673, "logits/rejected": 0.3257525861263275, "logps/chosen": -2.2973437309265137, "logps/rejected": -2.927894115447998, "loss": 0.7095, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2973437309265137, "rewards/margins": 0.6305506229400635, "rewards/rejected": -2.927894115447998, "sft_loss": 2.231661319732666, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 2.9750399204385043, "learning_rate": 9.500006539058334e-07, "logits/chosen": 0.1559508889913559, "logits/rejected": 0.3778040409088135, "logps/chosen": -2.1892244815826416, "logps/rejected": -2.6358699798583984, "loss": 0.7002, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1892244815826416, "rewards/margins": 0.4466456472873688, "rewards/rejected": -2.6358699798583984, "sft_loss": 2.093132972717285, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 2.4309268153225263, "learning_rate": 9.493196201722109e-07, "logits/chosen": 0.05222257971763611, "logits/rejected": 0.2760353684425354, "logps/chosen": -2.3155436515808105, "logps/rejected": -2.7205491065979004, "loss": 0.7059, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.3155436515808105, "rewards/margins": 0.4050050377845764, "rewards/rejected": -2.7205491065979004, "sft_loss": 2.2433018684387207, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 3.117488409175474, "learning_rate": 9.486342271376628e-07, "logits/chosen": 0.16181819140911102, "logits/rejected": 0.15442785620689392, "logps/chosen": -2.341973066329956, "logps/rejected": -3.1276779174804688, "loss": 0.701, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.341973066329956, "rewards/margins": 0.7857049703598022, "rewards/rejected": -3.1276779174804688, "sft_loss": 2.221696138381958, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 2.080673352225228, "learning_rate": 9.479444814518755e-07, "logits/chosen": 0.13009627163410187, "logits/rejected": 0.4935070872306824, "logps/chosen": -2.3354337215423584, "logps/rejected": -3.1901021003723145, "loss": 0.6983, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3354337215423584, "rewards/margins": 0.8546679615974426, "rewards/rejected": -3.1901021003723145, "sft_loss": 2.2007999420166016, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 3.078906430649519, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.33531293272972107, "logits/rejected": 0.4072476327419281, "logps/chosen": -2.3676962852478027, "logps/rejected": -2.9815027713775635, "loss": 0.7061, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.3676962852478027, "rewards/margins": 0.6138062477111816, "rewards/rejected": -2.9815027713775635, "sft_loss": 2.1546168327331543, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 2.5677069132991104, "learning_rate": 9.465519589364099e-07, "logits/chosen": 0.29916244745254517, "logits/rejected": 0.40381136536598206, "logps/chosen": -2.371689796447754, "logps/rejected": -2.9242143630981445, "loss": 0.7138, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.371689796447754, "rewards/margins": 0.5525246262550354, "rewards/rejected": -2.9242143630981445, "sft_loss": 2.1695141792297363, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 1.7137276143457714, "learning_rate": 9.458491956169914e-07, "logits/chosen": 0.16459494829177856, "logits/rejected": 0.4160476624965668, "logps/chosen": -2.3387157917022705, "logps/rejected": -3.0234005451202393, "loss": 0.6976, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.3387157917022705, "rewards/margins": 0.6846847534179688, "rewards/rejected": -3.0234005451202393, "sft_loss": 2.14802885055542, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 2.263175751819887, "learning_rate": 9.451421066667215e-07, "logits/chosen": 0.005195315927267075, "logits/rejected": 0.2853628098964691, "logps/chosen": -2.2528247833251953, "logps/rejected": -2.944413661956787, "loss": 0.6995, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.2528247833251953, "rewards/margins": 0.6915886998176575, "rewards/rejected": -2.944413661956787, "sft_loss": 2.1431026458740234, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 1.9875831630039975, "learning_rate": 9.444306989457805e-07, "logits/chosen": 0.22154255211353302, "logits/rejected": 0.3978469967842102, "logps/chosen": -2.3452460765838623, "logps/rejected": -2.894991397857666, "loss": 0.7071, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.3452460765838623, "rewards/margins": 0.5497456789016724, "rewards/rejected": -2.894991397857666, "sft_loss": 2.125516891479492, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 2.534540663345523, "learning_rate": 9.437149793562489e-07, "logits/chosen": 0.18094339966773987, "logits/rejected": 0.34638968110084534, "logps/chosen": -2.3127498626708984, "logps/rejected": -2.7011842727661133, "loss": 0.7069, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.3127498626708984, "rewards/margins": 0.38843435049057007, "rewards/rejected": -2.7011842727661133, "sft_loss": 2.1503615379333496, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 2.0331521044506142, "learning_rate": 9.429949548420417e-07, "logits/chosen": 0.21482765674591064, "logits/rejected": 0.348909854888916, "logps/chosen": -2.3520545959472656, "logps/rejected": -2.8570454120635986, "loss": 0.7012, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3520545959472656, "rewards/margins": 0.5049908757209778, "rewards/rejected": -2.8570454120635986, "sft_loss": 2.2471046447753906, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 2.2432607952668615, "learning_rate": 9.422706323888396e-07, "logits/chosen": 0.20482811331748962, "logits/rejected": 0.276005357503891, "logps/chosen": -2.2051875591278076, "logps/rejected": -2.8122718334198, "loss": 0.7017, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2051875591278076, "rewards/margins": 0.6070839166641235, "rewards/rejected": -2.8122718334198, "sft_loss": 2.1116836071014404, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 2.1432552849114233, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.20497314631938934, "logits/rejected": 0.47388821840286255, "logps/chosen": -2.1197383403778076, "logps/rejected": -2.9532077312469482, "loss": 0.6895, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1197383403778076, "rewards/margins": 0.8334692716598511, "rewards/rejected": -2.9532077312469482, "sft_loss": 2.102599620819092, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 2.5423288896269143, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.19564411044120789, "logits/rejected": 0.3067219853401184, "logps/chosen": -2.1597135066986084, "logps/rejected": -2.5702147483825684, "loss": 0.7134, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1597135066986084, "rewards/margins": 0.4105011522769928, "rewards/rejected": -2.5702147483825684, "sft_loss": 2.123806953430176, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 1.8893484799831797, "learning_rate": 9.400719478771449e-07, "logits/chosen": 0.1256810575723648, "logits/rejected": 0.49158114194869995, "logps/chosen": -2.386765956878662, "logps/rejected": -2.8819260597229004, "loss": 0.7046, "rewards/accuracies": 0.6875, "rewards/chosen": -2.386765956878662, "rewards/margins": 0.49516019225120544, "rewards/rejected": -2.8819260597229004, "sft_loss": 2.235278606414795, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 1.9988518198555865, "learning_rate": 9.393305043577209e-07, "logits/chosen": 0.03835318237543106, "logits/rejected": 0.19741104543209076, "logps/chosen": -2.27360463142395, "logps/rejected": -2.9089667797088623, "loss": 0.702, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.27360463142395, "rewards/margins": 0.6353622674942017, "rewards/rejected": -2.9089667797088623, "sft_loss": 2.2150821685791016, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 1.5629991357488582, "learning_rate": 9.38584798451817e-07, "logits/chosen": 0.03389846533536911, "logits/rejected": 0.19717106223106384, "logps/chosen": -2.255682945251465, "logps/rejected": -2.724970579147339, "loss": 0.7011, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.255682945251465, "rewards/margins": 0.4692877233028412, "rewards/rejected": -2.724970579147339, "sft_loss": 2.155445098876953, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 1.8213600131915777, "learning_rate": 9.37834837394275e-07, "logits/chosen": 0.07529856264591217, "logits/rejected": 0.2234765738248825, "logps/chosen": -2.281263828277588, "logps/rejected": -3.119943141937256, "loss": 0.6933, "rewards/accuracies": 0.71875, "rewards/chosen": -2.281263828277588, "rewards/margins": 0.8386794328689575, "rewards/rejected": -3.119943141937256, "sft_loss": 2.2313129901885986, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 2.9933299216173577, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.016869869083166122, "logits/rejected": 0.16321472823619843, "logps/chosen": -2.199903964996338, "logps/rejected": -2.920362949371338, "loss": 0.6944, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.199903964996338, "rewards/margins": 0.720458984375, "rewards/rejected": -2.920362949371338, "sft_loss": 2.1823477745056152, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 5.500730072855545, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.075008325278759, "logits/rejected": 0.0719195008277893, "logps/chosen": -2.2905290126800537, "logps/rejected": -2.7570128440856934, "loss": 0.7169, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.2905290126800537, "rewards/margins": 0.4664839208126068, "rewards/rejected": -2.7570128440856934, "sft_loss": 2.2204031944274902, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 5.900709315702155, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.041843343526124954, "logits/rejected": 0.1123179942369461, "logps/chosen": -2.1659350395202637, "logps/rejected": -2.8170411586761475, "loss": 0.7046, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1659350395202637, "rewards/margins": 0.6511061787605286, "rewards/rejected": -2.8170411586761475, "sft_loss": 2.1369853019714355, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 3.0244073453127034, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.009002335369586945, "logits/rejected": 0.1299920678138733, "logps/chosen": -2.429741382598877, "logps/rejected": -2.960287570953369, "loss": 0.7078, "rewards/accuracies": 0.65625, "rewards/chosen": -2.429741382598877, "rewards/margins": 0.5305465459823608, "rewards/rejected": -2.960287570953369, "sft_loss": 2.339808225631714, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 2.5131062987246704, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.0465545579791069, "logits/rejected": 0.15400944650173187, "logps/chosen": -2.3714404106140137, "logps/rejected": -2.8975820541381836, "loss": 0.712, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.3714404106140137, "rewards/margins": 0.5261418223381042, "rewards/rejected": -2.8975820541381836, "sft_loss": 2.300178050994873, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 1.4429172448843048, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.11528120189905167, "logits/rejected": 0.02020915411412716, "logps/chosen": -2.4631452560424805, "logps/rejected": -3.0314419269561768, "loss": 0.7063, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4631452560424805, "rewards/margins": 0.5682967901229858, "rewards/rejected": -3.0314419269561768, "sft_loss": 2.417720317840576, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 2.038326194928855, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.07167818397283554, "logits/rejected": 0.14383158087730408, "logps/chosen": -2.2117011547088623, "logps/rejected": -3.1124587059020996, "loss": 0.7104, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.2117011547088623, "rewards/margins": 0.9007574319839478, "rewards/rejected": -3.1124587059020996, "sft_loss": 2.210836887359619, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 2.578398933156557, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.05414128303527832, "logits/rejected": 0.14086315035820007, "logps/chosen": -2.2781615257263184, "logps/rejected": -2.977079391479492, "loss": 0.6949, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2781615257263184, "rewards/margins": 0.6989179253578186, "rewards/rejected": -2.977079391479492, "sft_loss": 2.234966278076172, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 2.6551004124751256, "learning_rate": 9.30894920180659e-07, "logits/chosen": 0.03887999802827835, "logits/rejected": 0.17733149230480194, "logps/chosen": -2.222642183303833, "logps/rejected": -2.6063666343688965, "loss": 0.7132, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.222642183303833, "rewards/margins": 0.3837243914604187, "rewards/rejected": -2.6063666343688965, "sft_loss": 2.07389497756958, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 2.1780359850681617, "learning_rate": 9.301028145701543e-07, "logits/chosen": 0.01445702277123928, "logits/rejected": 0.16482463479042053, "logps/chosen": -2.2106032371520996, "logps/rejected": -2.9935824871063232, "loss": 0.7007, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.2106032371520996, "rewards/margins": 0.7829793691635132, "rewards/rejected": -2.9935824871063232, "sft_loss": 2.214637517929077, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 3.254513497125433, "learning_rate": 9.293065361002563e-07, "logits/chosen": 0.003981569316238165, "logits/rejected": 0.13027504086494446, "logps/chosen": -2.1432437896728516, "logps/rejected": -2.844395637512207, "loss": 0.6936, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1432437896728516, "rewards/margins": 0.7011516690254211, "rewards/rejected": -2.844395637512207, "sft_loss": 2.105501174926758, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 2.192853514273409, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.037188876420259476, "logits/rejected": 0.09554673731327057, "logps/chosen": -2.263479709625244, "logps/rejected": -2.7253289222717285, "loss": 0.7029, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.263479709625244, "rewards/margins": 0.4618498682975769, "rewards/rejected": -2.7253289222717285, "sft_loss": 2.08933687210083, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 1.81295648344349, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.06525443494319916, "logits/rejected": 0.12765257060527802, "logps/chosen": -2.149930715560913, "logps/rejected": -2.93646502494812, "loss": 0.6994, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.149930715560913, "rewards/margins": 0.786534309387207, "rewards/rejected": -2.93646502494812, "sft_loss": 2.0745248794555664, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 1.915371902448545, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.06192860007286072, "logits/rejected": 0.05368726700544357, "logps/chosen": -2.1278603076934814, "logps/rejected": -2.6336026191711426, "loss": 0.7073, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1278603076934814, "rewards/margins": 0.5057421922683716, "rewards/rejected": -2.6336026191711426, "sft_loss": 2.1015305519104004, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 2.198257788150926, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.11924233287572861, "logits/rejected": 0.10633299499750137, "logps/chosen": -2.256054401397705, "logps/rejected": -2.803920030593872, "loss": 0.7021, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.256054401397705, "rewards/margins": 0.5478653311729431, "rewards/rejected": -2.803920030593872, "sft_loss": 2.262547254562378, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 7.705182659305554, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.03130624443292618, "logits/rejected": 0.09935715794563293, "logps/chosen": -2.404046058654785, "logps/rejected": -2.8623976707458496, "loss": 0.7054, "rewards/accuracies": 0.625, "rewards/chosen": -2.404046058654785, "rewards/margins": 0.4583517611026764, "rewards/rejected": -2.8623976707458496, "sft_loss": 2.363663673400879, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 2.4972398291785893, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.08602894097566605, "logits/rejected": 0.10091432183980942, "logps/chosen": -2.3144848346710205, "logps/rejected": -2.9508140087127686, "loss": 0.6973, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.3144848346710205, "rewards/margins": 0.6363292932510376, "rewards/rejected": -2.9508140087127686, "sft_loss": 2.3109982013702393, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 2.2190636962447945, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.05645718052983284, "logits/rejected": 0.1200685054063797, "logps/chosen": -2.2083747386932373, "logps/rejected": -2.9030609130859375, "loss": 0.6911, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.2083747386932373, "rewards/margins": 0.6946861147880554, "rewards/rejected": -2.9030609130859375, "sft_loss": 2.1226391792297363, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 1.5901442821916314, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.034431345760822296, "logits/rejected": 0.10250719636678696, "logps/chosen": -2.251410961151123, "logps/rejected": -2.75331711769104, "loss": 0.7042, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.251410961151123, "rewards/margins": 0.5019063353538513, "rewards/rejected": -2.75331711769104, "sft_loss": 2.1372828483581543, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 1.9633230437398599, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.11807402223348618, "logits/rejected": -0.08948754519224167, "logps/chosen": -2.2486908435821533, "logps/rejected": -2.7955987453460693, "loss": 0.7079, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.2486908435821533, "rewards/margins": 0.5469079613685608, "rewards/rejected": -2.7955987453460693, "sft_loss": 2.15276837348938, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 2.2725449419839645, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.1662970334291458, "logits/rejected": 0.02462906390428543, "logps/chosen": -2.1715939044952393, "logps/rejected": -2.8786914348602295, "loss": 0.7046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1715939044952393, "rewards/margins": 0.7070977091789246, "rewards/rejected": -2.8786914348602295, "sft_loss": 2.0866100788116455, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 1.8485713742971077, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.04839160293340683, "logits/rejected": 0.05790669471025467, "logps/chosen": -2.131850004196167, "logps/rejected": -2.935936450958252, "loss": 0.7021, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.131850004196167, "rewards/margins": 0.8040862083435059, "rewards/rejected": -2.935936450958252, "sft_loss": 2.0599653720855713, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 2.894647340821409, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.09339950978755951, "logits/rejected": 0.06900927424430847, "logps/chosen": -2.15094256401062, "logps/rejected": -2.8568568229675293, "loss": 0.6949, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.15094256401062, "rewards/margins": 0.7059140205383301, "rewards/rejected": -2.8568568229675293, "sft_loss": 2.0355353355407715, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 4.426364323187453, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.11539479345083237, "logits/rejected": -0.005151323974132538, "logps/chosen": -2.1997592449188232, "logps/rejected": -2.8636884689331055, "loss": 0.7035, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1997592449188232, "rewards/margins": 0.6639290452003479, "rewards/rejected": -2.8636884689331055, "sft_loss": 2.1363730430603027, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 3.428026695044482, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.08635888993740082, "logits/rejected": 0.03695956617593765, "logps/chosen": -2.423877000808716, "logps/rejected": -2.8618521690368652, "loss": 0.7105, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.423877000808716, "rewards/margins": 0.43797531723976135, "rewards/rejected": -2.8618521690368652, "sft_loss": 2.333068370819092, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 3.0184449422286423, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.09940381348133087, "logits/rejected": 0.029827887192368507, "logps/chosen": -2.283815622329712, "logps/rejected": -2.930316686630249, "loss": 0.7062, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.283815622329712, "rewards/margins": 0.6465011835098267, "rewards/rejected": -2.930316686630249, "sft_loss": 2.2652838230133057, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 2.8768669638127267, "learning_rate": 9.16004998712373e-07, "logits/chosen": -0.03785894066095352, "logits/rejected": 0.05646789073944092, "logps/chosen": -2.2821617126464844, "logps/rejected": -2.957378625869751, "loss": 0.6949, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2821617126464844, "rewards/margins": 0.6752170324325562, "rewards/rejected": -2.957378625869751, "sft_loss": 2.1811747550964355, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 3.142060880674868, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.14265461266040802, "logits/rejected": 0.1681184321641922, "logps/chosen": -2.211472511291504, "logps/rejected": -3.15592360496521, "loss": 0.6955, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.211472511291504, "rewards/margins": 0.9444509744644165, "rewards/rejected": -3.15592360496521, "sft_loss": 2.178743362426758, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 2.8514252011165087, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.05743175745010376, "logits/rejected": 0.003899590577930212, "logps/chosen": -2.146831750869751, "logps/rejected": -2.847151279449463, "loss": 0.6963, "rewards/accuracies": 0.6875, "rewards/chosen": -2.146831750869751, "rewards/margins": 0.7003197073936462, "rewards/rejected": -2.847151279449463, "sft_loss": 2.150350570678711, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 4.351890597339401, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.07250859588384628, "logits/rejected": 0.03235817700624466, "logps/chosen": -2.158828020095825, "logps/rejected": -2.7288763523101807, "loss": 0.6976, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.158828020095825, "rewards/margins": 0.5700482130050659, "rewards/rejected": -2.7288763523101807, "sft_loss": 2.229212522506714, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 1.8152384973896596, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.16011783480644226, "logits/rejected": 0.024740198627114296, "logps/chosen": -2.182220458984375, "logps/rejected": -2.8084864616394043, "loss": 0.7102, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.182220458984375, "rewards/margins": 0.626266360282898, "rewards/rejected": -2.8084864616394043, "sft_loss": 2.122053623199463, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 1.713996615511991, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.13271503150463104, "logits/rejected": -0.0041678594425320625, "logps/chosen": -2.298312187194824, "logps/rejected": -2.7899975776672363, "loss": 0.7086, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.298312187194824, "rewards/margins": 0.4916854798793793, "rewards/rejected": -2.7899975776672363, "sft_loss": 2.1895220279693604, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 2.8142124312312458, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.15361139178276062, "logits/rejected": 0.03482117876410484, "logps/chosen": -2.194044351577759, "logps/rejected": -2.812548875808716, "loss": 0.6983, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.194044351577759, "rewards/margins": 0.618504524230957, "rewards/rejected": -2.812548875808716, "sft_loss": 2.105254650115967, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 2.503275869310201, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.13203874230384827, "logits/rejected": 0.0336499884724617, "logps/chosen": -2.146395444869995, "logps/rejected": -2.9748904705047607, "loss": 0.6888, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.146395444869995, "rewards/margins": 0.8284950256347656, "rewards/rejected": -2.9748904705047607, "sft_loss": 2.065767765045166, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 2.2382958133005126, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.16713622212409973, "logits/rejected": -0.03363850340247154, "logps/chosen": -2.1303060054779053, "logps/rejected": -2.7613275051116943, "loss": 0.7082, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1303060054779053, "rewards/margins": 0.6310214996337891, "rewards/rejected": -2.7613275051116943, "sft_loss": 2.1318306922912598, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 1.94318796170751, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.3367980122566223, "logits/rejected": -0.08581961691379547, "logps/chosen": -2.160841226577759, "logps/rejected": -2.9789345264434814, "loss": 0.6953, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.160841226577759, "rewards/margins": 0.8180931806564331, "rewards/rejected": -2.9789345264434814, "sft_loss": 2.1677775382995605, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 1.526861078294158, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.2882223427295685, "logits/rejected": -0.060058873146772385, "logps/chosen": -2.2199759483337402, "logps/rejected": -2.96525239944458, "loss": 0.7042, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2199759483337402, "rewards/margins": 0.7452765107154846, "rewards/rejected": -2.96525239944458, "sft_loss": 2.131960391998291, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 2.3404760802143896, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.1957237422466278, "logits/rejected": -0.13356651365756989, "logps/chosen": -2.247565746307373, "logps/rejected": -2.748612880706787, "loss": 0.7003, "rewards/accuracies": 0.65625, "rewards/chosen": -2.247565746307373, "rewards/margins": 0.5010470151901245, "rewards/rejected": -2.748612880706787, "sft_loss": 2.2203712463378906, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 1.9202765729064233, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.1572050154209137, "logits/rejected": -0.0726763978600502, "logps/chosen": -2.1353588104248047, "logps/rejected": -2.6677019596099854, "loss": 0.6987, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1353588104248047, "rewards/margins": 0.5323430299758911, "rewards/rejected": -2.6677019596099854, "sft_loss": 2.1538896560668945, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 3.2305496709675805, "learning_rate": 9.044352511642661e-07, "logits/chosen": -0.19083921611309052, "logits/rejected": -0.14574851095676422, "logps/chosen": -2.2574779987335205, "logps/rejected": -2.750894546508789, "loss": 0.7094, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.2574779987335205, "rewards/margins": 0.4934166967868805, "rewards/rejected": -2.750894546508789, "sft_loss": 2.267965793609619, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 3.084250669754339, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.20070838928222656, "logits/rejected": -0.11343379318714142, "logps/chosen": -2.117205858230591, "logps/rejected": -2.580235481262207, "loss": 0.7087, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.117205858230591, "rewards/margins": 0.46302956342697144, "rewards/rejected": -2.580235481262207, "sft_loss": 2.0681068897247314, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 2.7526441438895723, "learning_rate": 9.025959508580436e-07, "logits/chosen": -0.1787761151790619, "logits/rejected": 0.023528896272182465, "logps/chosen": -2.194460391998291, "logps/rejected": -2.870157241821289, "loss": 0.7, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.194460391998291, "rewards/margins": 0.6756970882415771, "rewards/rejected": -2.870157241821289, "sft_loss": 2.1022377014160156, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 2.5042614370667486, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.20925016701221466, "logits/rejected": -0.05993008613586426, "logps/chosen": -2.0619091987609863, "logps/rejected": -2.9169676303863525, "loss": 0.6934, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0619091987609863, "rewards/margins": 0.8550586700439453, "rewards/rejected": -2.9169676303863525, "sft_loss": 2.0523006916046143, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 3.3434727474771644, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.27181947231292725, "logits/rejected": -0.2028999626636505, "logps/chosen": -2.173649311065674, "logps/rejected": -2.6213297843933105, "loss": 0.7035, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.173649311065674, "rewards/margins": 0.4476805627346039, "rewards/rejected": -2.6213297843933105, "sft_loss": 2.2011539936065674, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 2.2836497429851823, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.1945187747478485, "logits/rejected": -0.1373414248228073, "logps/chosen": -2.3528316020965576, "logps/rejected": -2.9335505962371826, "loss": 0.7054, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.3528316020965576, "rewards/margins": 0.5807192921638489, "rewards/rejected": -2.9335505962371826, "sft_loss": 2.2219395637512207, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 2.070645749062221, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.2843402922153473, "logits/rejected": -0.12146928161382675, "logps/chosen": -2.226685047149658, "logps/rejected": -3.1664788722991943, "loss": 0.6909, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.226685047149658, "rewards/margins": 0.9397938847541809, "rewards/rejected": -3.1664788722991943, "sft_loss": 2.2412779331207275, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": -0.03452831879258156, "eval_logits/rejected": 0.041437409818172455, "eval_logps/chosen": -2.3066861629486084, "eval_logps/rejected": -3.0784785747528076, "eval_loss": 0.6950539350509644, "eval_rewards/accuracies": 0.6824925541877747, "eval_rewards/chosen": -2.3066861629486084, "eval_rewards/margins": 0.7717921137809753, "eval_rewards/rejected": -3.0784785747528076, "eval_runtime": 47.6551, "eval_samples_per_second": 28.224, "eval_sft_loss": 2.231623888015747, "eval_steps_per_second": 7.072, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 5.5586703197383605, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.295482873916626, "logits/rejected": -0.09522996842861176, "logps/chosen": -2.3428032398223877, "logps/rejected": -3.139549493789673, "loss": 0.6925, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.3428032398223877, "rewards/margins": 0.7967461347579956, "rewards/rejected": -3.139549493789673, "sft_loss": 2.2508833408355713, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 2.2505753653004286, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.2747432589530945, "logits/rejected": -0.12129826843738556, "logps/chosen": -2.196272134780884, "logps/rejected": -3.2144463062286377, "loss": 0.692, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.196272134780884, "rewards/margins": 1.018174171447754, "rewards/rejected": -3.2144463062286377, "sft_loss": 2.209385395050049, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 8.156452408256476, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.20282170176506042, "logits/rejected": -0.11459418386220932, "logps/chosen": -2.337512493133545, "logps/rejected": -3.029510974884033, "loss": 0.7022, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.337512493133545, "rewards/margins": 0.6919983625411987, "rewards/rejected": -3.029510974884033, "sft_loss": 2.3106839656829834, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 2.748174363839319, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.2138424813747406, "logits/rejected": -0.09318797290325165, "logps/chosen": -2.16756010055542, "logps/rejected": -2.8914496898651123, "loss": 0.7034, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.16756010055542, "rewards/margins": 0.7238895297050476, "rewards/rejected": -2.8914496898651123, "sft_loss": 2.1276867389678955, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 2.239202934001568, "learning_rate": 8.941267982915213e-07, "logits/chosen": -0.1431226283311844, "logits/rejected": -0.10495742410421371, "logps/chosen": -2.298262119293213, "logps/rejected": -2.645540475845337, "loss": 0.7115, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.298262119293213, "rewards/margins": 0.3472786545753479, "rewards/rejected": -2.645540475845337, "sft_loss": 2.180662155151367, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 2.4065553816037566, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.15369094908237457, "logits/rejected": -0.022520026192069054, "logps/chosen": -2.1215758323669434, "logps/rejected": -2.918578624725342, "loss": 0.6991, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1215758323669434, "rewards/margins": 0.7970027923583984, "rewards/rejected": -2.918578624725342, "sft_loss": 2.0732903480529785, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 3.1905090140746006, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.2666686177253723, "logits/rejected": -0.1523967683315277, "logps/chosen": -2.1386024951934814, "logps/rejected": -2.9105467796325684, "loss": 0.7043, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1386024951934814, "rewards/margins": 0.7719441652297974, "rewards/rejected": -2.9105467796325684, "sft_loss": 2.1258816719055176, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 2.1615218808447074, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.21783633530139923, "logits/rejected": -0.1169201135635376, "logps/chosen": -2.1727309226989746, "logps/rejected": -3.094940662384033, "loss": 0.7014, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1727309226989746, "rewards/margins": 0.9222098588943481, "rewards/rejected": -3.094940662384033, "sft_loss": 2.1299543380737305, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 2.867818877426616, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.2079753428697586, "logits/rejected": -0.15814396739006042, "logps/chosen": -2.1862399578094482, "logps/rejected": -2.729067802429199, "loss": 0.7032, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1862399578094482, "rewards/margins": 0.5428280830383301, "rewards/rejected": -2.729067802429199, "sft_loss": 2.2186481952667236, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 3.0966118330484886, "learning_rate": 8.892874524469537e-07, "logits/chosen": -0.14018157124519348, "logits/rejected": -0.09165972471237183, "logps/chosen": -2.090341329574585, "logps/rejected": -2.7218966484069824, "loss": 0.7045, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.090341329574585, "rewards/margins": 0.631555438041687, "rewards/rejected": -2.7218966484069824, "sft_loss": 1.9962527751922607, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 3.3357022059764745, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.26899534463882446, "logits/rejected": -0.16346599161624908, "logps/chosen": -2.192856550216675, "logps/rejected": -2.9463510513305664, "loss": 0.698, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.192856550216675, "rewards/margins": 0.753494381904602, "rewards/rejected": -2.9463510513305664, "sft_loss": 2.127333402633667, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 2.332298426755994, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.2414066046476364, "logits/rejected": -0.20738832652568817, "logps/chosen": -2.234240770339966, "logps/rejected": -2.9939560890197754, "loss": 0.7017, "rewards/accuracies": 0.6875, "rewards/chosen": -2.234240770339966, "rewards/margins": 0.7597158551216125, "rewards/rejected": -2.9939560890197754, "sft_loss": 2.1499571800231934, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 2.7484518511406284, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.24828548729419708, "logits/rejected": -0.18916873633861542, "logps/chosen": -2.4132347106933594, "logps/rejected": -2.844573497772217, "loss": 0.7114, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.4132347106933594, "rewards/margins": 0.4313390254974365, "rewards/rejected": -2.844573497772217, "sft_loss": 2.32940411567688, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 2.7554671577395555, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.18388070166110992, "logits/rejected": -0.037655822932720184, "logps/chosen": -2.497715711593628, "logps/rejected": -3.0130228996276855, "loss": 0.721, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.497715711593628, "rewards/margins": 0.5153070688247681, "rewards/rejected": -3.0130228996276855, "sft_loss": 2.3574628829956055, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 3.1412922575918687, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.19547554850578308, "logits/rejected": -0.030999530106782913, "logps/chosen": -2.3352713584899902, "logps/rejected": -3.2001075744628906, "loss": 0.7033, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.3352713584899902, "rewards/margins": 0.8648357391357422, "rewards/rejected": -3.2001075744628906, "sft_loss": 2.3229193687438965, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 2.6452804771537854, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.288478285074234, "logits/rejected": -0.19373884797096252, "logps/chosen": -2.3482749462127686, "logps/rejected": -2.9911880493164062, "loss": 0.703, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3482749462127686, "rewards/margins": 0.6429128646850586, "rewards/rejected": -2.9911880493164062, "sft_loss": 2.2894492149353027, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 2.6461804264836823, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.19792509078979492, "logits/rejected": -0.007888046093285084, "logps/chosen": -2.3069212436676025, "logps/rejected": -3.104051113128662, "loss": 0.7007, "rewards/accuracies": 0.625, "rewards/chosen": -2.3069212436676025, "rewards/margins": 0.7971299290657043, "rewards/rejected": -3.104051113128662, "sft_loss": 2.2453808784484863, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 2.5770833660332007, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.23112145066261292, "logits/rejected": -0.05248458310961723, "logps/chosen": -2.1011803150177, "logps/rejected": -3.042440414428711, "loss": 0.6902, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1011803150177, "rewards/margins": 0.9412603378295898, "rewards/rejected": -3.042440414428711, "sft_loss": 2.0605461597442627, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 4.86659798854765, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.3249500095844269, "logits/rejected": -0.16356393694877625, "logps/chosen": -2.191807270050049, "logps/rejected": -2.9690449237823486, "loss": 0.7006, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.191807270050049, "rewards/margins": 0.7772378325462341, "rewards/rejected": -2.9690449237823486, "sft_loss": 2.2248458862304688, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 3.1252711270501443, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.18225380778312683, "logits/rejected": -0.23605379462242126, "logps/chosen": -2.2445120811462402, "logps/rejected": -2.6155567169189453, "loss": 0.7126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2445120811462402, "rewards/margins": 0.37104448676109314, "rewards/rejected": -2.6155567169189453, "sft_loss": 2.242246150970459, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 2.1193829188326876, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.18783244490623474, "logits/rejected": -0.06622517108917236, "logps/chosen": -2.0862059593200684, "logps/rejected": -2.738128900527954, "loss": 0.7036, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0862059593200684, "rewards/margins": 0.6519228219985962, "rewards/rejected": -2.738128900527954, "sft_loss": 2.0497243404388428, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 2.80200570752565, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.20885543525218964, "logits/rejected": -0.13907435536384583, "logps/chosen": -2.239652633666992, "logps/rejected": -2.8518741130828857, "loss": 0.6982, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.239652633666992, "rewards/margins": 0.6122216582298279, "rewards/rejected": -2.8518741130828857, "sft_loss": 2.1831889152526855, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 2.6772905972911647, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.24115291237831116, "logits/rejected": -0.070436030626297, "logps/chosen": -2.3881723880767822, "logps/rejected": -3.046025037765503, "loss": 0.7064, "rewards/accuracies": 0.625, "rewards/chosen": -2.3881723880767822, "rewards/margins": 0.6578529477119446, "rewards/rejected": -3.046025037765503, "sft_loss": 2.255465030670166, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 2.452321438573616, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.19311857223510742, "logits/rejected": -0.13626573979854584, "logps/chosen": -2.209670066833496, "logps/rejected": -3.022451400756836, "loss": 0.6979, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.209670066833496, "rewards/margins": 0.8127814531326294, "rewards/rejected": -3.022451400756836, "sft_loss": 2.1349611282348633, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 2.472216839769144, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.18031716346740723, "logits/rejected": -0.0725378543138504, "logps/chosen": -2.1493165493011475, "logps/rejected": -3.2853176593780518, "loss": 0.6855, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1493165493011475, "rewards/margins": 1.1360008716583252, "rewards/rejected": -3.2853176593780518, "sft_loss": 2.1569228172302246, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 4.870408947854243, "learning_rate": 8.731729746982068e-07, "logits/chosen": -0.19539253413677216, "logits/rejected": -0.1260274350643158, "logps/chosen": -2.1418533325195312, "logps/rejected": -2.6719813346862793, "loss": 0.7017, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1418533325195312, "rewards/margins": 0.530128002166748, "rewards/rejected": -2.6719813346862793, "sft_loss": 2.1912734508514404, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 2.734847887314104, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.2868293821811676, "logits/rejected": -0.13878083229064941, "logps/chosen": -2.084522008895874, "logps/rejected": -2.909510374069214, "loss": 0.692, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.084522008895874, "rewards/margins": 0.824988067150116, "rewards/rejected": -2.909510374069214, "sft_loss": 2.0666072368621826, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 1.8351539850699354, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.22179599106311798, "logits/rejected": -0.0967031866312027, "logps/chosen": -2.1067683696746826, "logps/rejected": -2.756479501724243, "loss": 0.6962, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1067683696746826, "rewards/margins": 0.649711012840271, "rewards/rejected": -2.756479501724243, "sft_loss": 2.08931040763855, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 2.9730415144289664, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.193211168050766, "logits/rejected": -0.1608760505914688, "logps/chosen": -2.051504373550415, "logps/rejected": -2.6706979274749756, "loss": 0.6959, "rewards/accuracies": 0.625, "rewards/chosen": -2.051504373550415, "rewards/margins": 0.6191936731338501, "rewards/rejected": -2.6706979274749756, "sft_loss": 2.0490097999572754, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 2.540363578802009, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.1945440024137497, "logits/rejected": -0.14428547024726868, "logps/chosen": -2.2550501823425293, "logps/rejected": -2.7136588096618652, "loss": 0.7086, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.2550501823425293, "rewards/margins": 0.45860838890075684, "rewards/rejected": -2.7136588096618652, "sft_loss": 2.216766357421875, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 1.8463985419387632, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.22097475826740265, "logits/rejected": -0.12107206881046295, "logps/chosen": -2.1033809185028076, "logps/rejected": -2.739773988723755, "loss": 0.6929, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1033809185028076, "rewards/margins": 0.6363930702209473, "rewards/rejected": -2.739773988723755, "sft_loss": 2.14668607711792, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 3.4086278612305723, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.2078239619731903, "logits/rejected": -0.08686795830726624, "logps/chosen": -2.072483539581299, "logps/rejected": -2.6630349159240723, "loss": 0.7048, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.072483539581299, "rewards/margins": 0.5905511975288391, "rewards/rejected": -2.6630349159240723, "sft_loss": 2.0812599658966064, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 7.215629129560617, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.1843900978565216, "logits/rejected": -0.1560697853565216, "logps/chosen": -2.170839309692383, "logps/rejected": -2.9429659843444824, "loss": 0.7124, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.170839309692383, "rewards/margins": 0.7721266150474548, "rewards/rejected": -2.9429659843444824, "sft_loss": 2.1287245750427246, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 2.0259578444564186, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.20086932182312012, "logits/rejected": -0.031804632395505905, "logps/chosen": -2.117952823638916, "logps/rejected": -2.6687893867492676, "loss": 0.6943, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.117952823638916, "rewards/margins": 0.5508365035057068, "rewards/rejected": -2.6687893867492676, "sft_loss": 2.1530754566192627, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 3.40133907473232, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.2689778208732605, "logits/rejected": -0.16882191598415375, "logps/chosen": -2.109886407852173, "logps/rejected": -2.7701923847198486, "loss": 0.6897, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.109886407852173, "rewards/margins": 0.6603060364723206, "rewards/rejected": -2.7701923847198486, "sft_loss": 2.1185245513916016, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 6.154439580864391, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.2206428498029709, "logits/rejected": -0.11337701231241226, "logps/chosen": -2.1953907012939453, "logps/rejected": -2.889822244644165, "loss": 0.6981, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1953907012939453, "rewards/margins": 0.6944314241409302, "rewards/rejected": -2.889822244644165, "sft_loss": 2.189736843109131, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 2.35738290621667, "learning_rate": 8.615542215511389e-07, "logits/chosen": -0.1755184829235077, "logits/rejected": -0.13018499314785004, "logps/chosen": -2.3339505195617676, "logps/rejected": -2.749356746673584, "loss": 0.7126, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.3339505195617676, "rewards/margins": 0.4154065251350403, "rewards/rejected": -2.749356746673584, "sft_loss": 2.2870373725891113, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 3.3112070687276605, "learning_rate": 8.604767176061241e-07, "logits/chosen": -0.12363386154174805, "logits/rejected": -0.03265656903386116, "logps/chosen": -2.342581033706665, "logps/rejected": -2.935011386871338, "loss": 0.7009, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.342581033706665, "rewards/margins": 0.5924302935600281, "rewards/rejected": -2.935011386871338, "sft_loss": 2.305471658706665, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 4.891263450981831, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.24133631587028503, "logits/rejected": -0.11299635469913483, "logps/chosen": -2.2154290676116943, "logps/rejected": -3.2079403400421143, "loss": 0.682, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2154290676116943, "rewards/margins": 0.9925110936164856, "rewards/rejected": -3.2079403400421143, "sft_loss": 2.215498685836792, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 2.0208424129718083, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.15862031280994415, "logits/rejected": -0.08435139060020447, "logps/chosen": -2.3127713203430176, "logps/rejected": -2.894026041030884, "loss": 0.7017, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3127713203430176, "rewards/margins": 0.581254780292511, "rewards/rejected": -2.894026041030884, "sft_loss": 2.3109865188598633, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 2.8507836505925166, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.24791434407234192, "logits/rejected": -0.13634294271469116, "logps/chosen": -2.1066908836364746, "logps/rejected": -3.230231761932373, "loss": 0.6909, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1066908836364746, "rewards/margins": 1.1235411167144775, "rewards/rejected": -3.230231761932373, "sft_loss": 2.103177547454834, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 2.280443520636504, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.15570509433746338, "logits/rejected": -0.02451663836836815, "logps/chosen": -2.162179470062256, "logps/rejected": -2.8802807331085205, "loss": 0.6978, "rewards/accuracies": 0.6875, "rewards/chosen": -2.162179470062256, "rewards/margins": 0.7181010246276855, "rewards/rejected": -2.8802807331085205, "sft_loss": 2.0782687664031982, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 2.6339671478110964, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.17519035935401917, "logits/rejected": -0.06617375463247299, "logps/chosen": -2.117783784866333, "logps/rejected": -2.8188655376434326, "loss": 0.6921, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.117783784866333, "rewards/margins": 0.7010820508003235, "rewards/rejected": -2.8188655376434326, "sft_loss": 2.0838098526000977, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 2.1942284258327236, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.17862842977046967, "logits/rejected": -0.12102198600769043, "logps/chosen": -2.0949339866638184, "logps/rejected": -2.6766610145568848, "loss": 0.6962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0949339866638184, "rewards/margins": 0.5817269086837769, "rewards/rejected": -2.6766610145568848, "sft_loss": 1.989256501197815, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 2.0531383973558808, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.2294219732284546, "logits/rejected": -0.09604024887084961, "logps/chosen": -2.1103386878967285, "logps/rejected": -3.1935291290283203, "loss": 0.6921, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1103386878967285, "rewards/margins": 1.0831904411315918, "rewards/rejected": -3.1935291290283203, "sft_loss": 2.0963447093963623, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 2.1561448351403474, "learning_rate": 8.51731666796467e-07, "logits/chosen": -0.10031883418560028, "logits/rejected": -0.06307245790958405, "logps/chosen": -2.1303870677948, "logps/rejected": -2.627593755722046, "loss": 0.7075, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1303870677948, "rewards/margins": 0.4972063899040222, "rewards/rejected": -2.627593755722046, "sft_loss": 2.038759708404541, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 1.8915094325573296, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.24092257022857666, "logits/rejected": -0.18085786700248718, "logps/chosen": -2.1491005420684814, "logps/rejected": -2.653780698776245, "loss": 0.7051, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1491005420684814, "rewards/margins": 0.5046799182891846, "rewards/rejected": -2.653780698776245, "sft_loss": 2.12119722366333, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 1.3368718107290438, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.20756366848945618, "logits/rejected": -0.0775209441781044, "logps/chosen": -2.227262258529663, "logps/rejected": -2.9371085166931152, "loss": 0.6965, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.227262258529663, "rewards/margins": 0.7098467946052551, "rewards/rejected": -2.9371085166931152, "sft_loss": 2.236367702484131, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 3.199698485846893, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.18606507778167725, "logits/rejected": -0.11772701889276505, "logps/chosen": -2.1976842880249023, "logps/rejected": -2.954611301422119, "loss": 0.6986, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1976842880249023, "rewards/margins": 0.7569268941879272, "rewards/rejected": -2.954611301422119, "sft_loss": 2.1414358615875244, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 1.7084071257906217, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.3265012800693512, "logits/rejected": -0.20926764607429504, "logps/chosen": -2.269092082977295, "logps/rejected": -2.8173460960388184, "loss": 0.7033, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.269092082977295, "rewards/margins": 0.5482543110847473, "rewards/rejected": -2.8173460960388184, "sft_loss": 2.1899359226226807, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 2.617818345800502, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.23281869292259216, "logits/rejected": -0.1132984310388565, "logps/chosen": -2.226372241973877, "logps/rejected": -3.053133726119995, "loss": 0.6979, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.226372241973877, "rewards/margins": 0.8267615437507629, "rewards/rejected": -3.053133726119995, "sft_loss": 2.1588146686553955, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 2.2899406517874445, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.2197950780391693, "logits/rejected": -0.12617357075214386, "logps/chosen": -2.483006715774536, "logps/rejected": -3.114408254623413, "loss": 0.699, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.483006715774536, "rewards/margins": 0.631401538848877, "rewards/rejected": -3.114408254623413, "sft_loss": 2.2906270027160645, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 2.758295228529619, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.2070378065109253, "logits/rejected": -0.13232679665088654, "logps/chosen": -2.27894926071167, "logps/rejected": -3.149691104888916, "loss": 0.688, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.27894926071167, "rewards/margins": 0.8707423210144043, "rewards/rejected": -3.149691104888916, "sft_loss": 2.22861647605896, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 2.265093438441994, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.23553189635276794, "logits/rejected": -0.09476248174905777, "logps/chosen": -2.338883876800537, "logps/rejected": -3.0769600868225098, "loss": 0.7056, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.338883876800537, "rewards/margins": 0.7380759119987488, "rewards/rejected": -3.0769600868225098, "sft_loss": 2.221405029296875, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 2.4026215375782627, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.2625536322593689, "logits/rejected": -0.12275469303131104, "logps/chosen": -2.4061732292175293, "logps/rejected": -2.9844717979431152, "loss": 0.7021, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.4061732292175293, "rewards/margins": 0.5782989263534546, "rewards/rejected": -2.9844717979431152, "sft_loss": 2.265167713165283, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 2.8656083212996326, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.13358765840530396, "logits/rejected": -0.01683134213089943, "logps/chosen": -2.2469773292541504, "logps/rejected": -3.156306028366089, "loss": 0.6982, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.2469773292541504, "rewards/margins": 0.909328818321228, "rewards/rejected": -3.156306028366089, "sft_loss": 2.1257612705230713, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 1.8757342641313572, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.19678595662117004, "logits/rejected": -0.12138146162033081, "logps/chosen": -2.1376781463623047, "logps/rejected": -2.8857154846191406, "loss": 0.6857, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1376781463623047, "rewards/margins": 0.7480372190475464, "rewards/rejected": -2.8857154846191406, "sft_loss": 2.0610835552215576, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 2.4086283181485046, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.20785903930664062, "logits/rejected": 0.004422978963702917, "logps/chosen": -2.1714940071105957, "logps/rejected": -2.8997912406921387, "loss": 0.696, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1714940071105957, "rewards/margins": 0.7282973527908325, "rewards/rejected": -2.8997912406921387, "sft_loss": 2.206144094467163, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 2.9374003746764936, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.1559458076953888, "logits/rejected": -0.1223495751619339, "logps/chosen": -2.1147940158843994, "logps/rejected": -2.761025905609131, "loss": 0.6979, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1147940158843994, "rewards/margins": 0.6462318301200867, "rewards/rejected": -2.761025905609131, "sft_loss": 2.0338943004608154, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 2.4265583317054165, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.2967333197593689, "logits/rejected": -0.06024886295199394, "logps/chosen": -2.209847927093506, "logps/rejected": -2.8883144855499268, "loss": 0.6918, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.209847927093506, "rewards/margins": 0.6784664392471313, "rewards/rejected": -2.8883144855499268, "sft_loss": 2.218695878982544, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 2.3172804009040386, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.21461844444274902, "logits/rejected": -0.08843618631362915, "logps/chosen": -2.2572195529937744, "logps/rejected": -2.836730480194092, "loss": 0.7036, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2572195529937744, "rewards/margins": 0.5795107483863831, "rewards/rejected": -2.836730480194092, "sft_loss": 2.1930465698242188, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 2.3140340280324105, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.2957096993923187, "logits/rejected": -0.19099418818950653, "logps/chosen": -2.1492276191711426, "logps/rejected": -2.731222629547119, "loss": 0.703, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1492276191711426, "rewards/margins": 0.5819951295852661, "rewards/rejected": -2.731222629547119, "sft_loss": 2.0989179611206055, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 2.0832491114899745, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.24538812041282654, "logits/rejected": -0.12193469703197479, "logps/chosen": -2.225022315979004, "logps/rejected": -2.7711994647979736, "loss": 0.6912, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.225022315979004, "rewards/margins": 0.5461770296096802, "rewards/rejected": -2.7711994647979736, "sft_loss": 2.191540002822876, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 2.838280581924798, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.2832343578338623, "logits/rejected": -0.20158131420612335, "logps/chosen": -2.139833688735962, "logps/rejected": -3.0244996547698975, "loss": 0.6821, "rewards/accuracies": 0.71875, "rewards/chosen": -2.139833688735962, "rewards/margins": 0.8846660852432251, "rewards/rejected": -3.0244996547698975, "sft_loss": 2.187866687774658, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 2.8720659961289083, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.36418408155441284, "logits/rejected": -0.18453466892242432, "logps/chosen": -2.2548022270202637, "logps/rejected": -2.8945319652557373, "loss": 0.6983, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2548022270202637, "rewards/margins": 0.6397296190261841, "rewards/rejected": -2.8945319652557373, "sft_loss": 2.2566208839416504, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 3.008003242968152, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.3070002496242523, "logits/rejected": -0.18967430293560028, "logps/chosen": -2.070561408996582, "logps/rejected": -2.795137882232666, "loss": 0.6882, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.070561408996582, "rewards/margins": 0.7245765328407288, "rewards/rejected": -2.795137882232666, "sft_loss": 2.110377073287964, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 2.6165536235285054, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.344722181558609, "logits/rejected": -0.1587430238723755, "logps/chosen": -2.2222800254821777, "logps/rejected": -2.9637227058410645, "loss": 0.6992, "rewards/accuracies": 0.625, "rewards/chosen": -2.2222800254821777, "rewards/margins": 0.7414425611495972, "rewards/rejected": -2.9637227058410645, "sft_loss": 2.1827409267425537, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 2.543549696020325, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.27377718687057495, "logits/rejected": -0.18468934297561646, "logps/chosen": -2.0899338722229004, "logps/rejected": -2.9180965423583984, "loss": 0.689, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0899338722229004, "rewards/margins": 0.8281623721122742, "rewards/rejected": -2.9180965423583984, "sft_loss": 2.0393636226654053, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 2.932102047024119, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.3700665235519409, "logits/rejected": -0.23408885300159454, "logps/chosen": -2.2393529415130615, "logps/rejected": -3.1915836334228516, "loss": 0.6907, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2393529415130615, "rewards/margins": 0.9522306323051453, "rewards/rejected": -3.1915836334228516, "sft_loss": 2.2528598308563232, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 1.5721902382966666, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.23069548606872559, "logits/rejected": -0.05136293172836304, "logps/chosen": -2.2052998542785645, "logps/rejected": -2.8688933849334717, "loss": 0.6997, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.2052998542785645, "rewards/margins": 0.6635935306549072, "rewards/rejected": -2.8688933849334717, "sft_loss": 2.2509541511535645, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 2.9671467167640593, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.2968910038471222, "logits/rejected": -0.21758243441581726, "logps/chosen": -2.156068801879883, "logps/rejected": -2.8106884956359863, "loss": 0.6953, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.156068801879883, "rewards/margins": 0.6546195149421692, "rewards/rejected": -2.8106884956359863, "sft_loss": 2.221719741821289, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 1.8944266269742676, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.2681064009666443, "logits/rejected": -0.2201647311449051, "logps/chosen": -2.1123995780944824, "logps/rejected": -2.6658949851989746, "loss": 0.7007, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1123995780944824, "rewards/margins": 0.5534952282905579, "rewards/rejected": -2.6658949851989746, "sft_loss": 2.088008403778076, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 3.797915875820107, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.2441168576478958, "logits/rejected": -0.16347715258598328, "logps/chosen": -2.0733633041381836, "logps/rejected": -3.0631089210510254, "loss": 0.6884, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0733633041381836, "rewards/margins": 0.9897457361221313, "rewards/rejected": -3.0631089210510254, "sft_loss": 2.0929739475250244, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 2.693248674578846, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.29714250564575195, "logits/rejected": -0.1393791288137436, "logps/chosen": -2.1041672229766846, "logps/rejected": -2.7866697311401367, "loss": 0.6924, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1041672229766846, "rewards/margins": 0.6825023293495178, "rewards/rejected": -2.7866697311401367, "sft_loss": 2.0851259231567383, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 3.095263291230212, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.1551038771867752, "logits/rejected": -0.08851303160190582, "logps/chosen": -2.1007044315338135, "logps/rejected": -2.8332302570343018, "loss": 0.6959, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1007044315338135, "rewards/margins": 0.7325260043144226, "rewards/rejected": -2.8332302570343018, "sft_loss": 2.104759931564331, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 2.697160940766407, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.16222915053367615, "logits/rejected": -0.05635574460029602, "logps/chosen": -2.30318546295166, "logps/rejected": -3.0286173820495605, "loss": 0.709, "rewards/accuracies": 0.65625, "rewards/chosen": -2.30318546295166, "rewards/margins": 0.7254319190979004, "rewards/rejected": -3.0286173820495605, "sft_loss": 2.258232593536377, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 2.5573208863556633, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.20433588325977325, "logits/rejected": -0.06622826308012009, "logps/chosen": -2.348832607269287, "logps/rejected": -2.923677921295166, "loss": 0.7034, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.348832607269287, "rewards/margins": 0.5748453140258789, "rewards/rejected": -2.923677921295166, "sft_loss": 2.2062716484069824, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 2.3634203813791435, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.22017395496368408, "logits/rejected": -0.11190620809793472, "logps/chosen": -2.3437857627868652, "logps/rejected": -2.913809299468994, "loss": 0.7014, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.3437857627868652, "rewards/margins": 0.5700234770774841, "rewards/rejected": -2.913809299468994, "sft_loss": 2.219799757003784, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 3.1518573609297094, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.117152139544487, "logits/rejected": 0.03829234093427658, "logps/chosen": -2.151608467102051, "logps/rejected": -3.1825308799743652, "loss": 0.6873, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.151608467102051, "rewards/margins": 1.030922532081604, "rewards/rejected": -3.1825308799743652, "sft_loss": 2.1608572006225586, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 3.0095602307906972, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.2610512375831604, "logits/rejected": -0.09405355155467987, "logps/chosen": -2.3147358894348145, "logps/rejected": -3.0925610065460205, "loss": 0.6992, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3147358894348145, "rewards/margins": 0.7778247594833374, "rewards/rejected": -3.0925610065460205, "sft_loss": 2.1939456462860107, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.03744465112686157, "eval_logits/rejected": 0.1252625286579132, "eval_logps/chosen": -2.138370990753174, "eval_logps/rejected": -2.963397264480591, "eval_loss": 0.6926766633987427, "eval_rewards/accuracies": 0.6765578389167786, "eval_rewards/chosen": -2.138370990753174, "eval_rewards/margins": 0.8250265121459961, "eval_rewards/rejected": -2.963397264480591, "eval_runtime": 47.8648, "eval_samples_per_second": 28.1, "eval_sft_loss": 2.067228317260742, "eval_steps_per_second": 7.041, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 3.4557100064183324, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.15685082972049713, "logits/rejected": -0.08967246115207672, "logps/chosen": -2.15382719039917, "logps/rejected": -2.877662420272827, "loss": 0.689, "rewards/accuracies": 0.65625, "rewards/chosen": -2.15382719039917, "rewards/margins": 0.7238351106643677, "rewards/rejected": -2.877662420272827, "sft_loss": 2.0683882236480713, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 2.971452618858938, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.15677562355995178, "logits/rejected": -0.05232198163866997, "logps/chosen": -2.0423381328582764, "logps/rejected": -2.7903552055358887, "loss": 0.6874, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0423381328582764, "rewards/margins": 0.7480170726776123, "rewards/rejected": -2.7903552055358887, "sft_loss": 2.012303113937378, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 3.295583092430273, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.169949471950531, "logits/rejected": -0.023434199392795563, "logps/chosen": -2.22428035736084, "logps/rejected": -2.977260112762451, "loss": 0.6924, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.22428035736084, "rewards/margins": 0.7529802918434143, "rewards/rejected": -2.977260112762451, "sft_loss": 2.100477695465088, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 2.7415581486831146, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.28308627009391785, "logits/rejected": -0.0862099900841713, "logps/chosen": -2.241190195083618, "logps/rejected": -3.168811321258545, "loss": 0.6876, "rewards/accuracies": 0.75, "rewards/chosen": -2.241190195083618, "rewards/margins": 0.9276212453842163, "rewards/rejected": -3.168811321258545, "sft_loss": 2.171774387359619, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 3.9811990236395136, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.13680367171764374, "logits/rejected": 0.019465472549200058, "logps/chosen": -2.2143447399139404, "logps/rejected": -3.1631412506103516, "loss": 0.7011, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2143447399139404, "rewards/margins": 0.9487963914871216, "rewards/rejected": -3.1631412506103516, "sft_loss": 2.1971354484558105, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 2.8975017177302185, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.22913579642772675, "logits/rejected": -0.1947326809167862, "logps/chosen": -2.102335214614868, "logps/rejected": -2.733572006225586, "loss": 0.6979, "rewards/accuracies": 0.65625, "rewards/chosen": -2.102335214614868, "rewards/margins": 0.6312370300292969, "rewards/rejected": -2.733572006225586, "sft_loss": 2.0779881477355957, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 3.014099252817148, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.22015182673931122, "logits/rejected": -0.05069545656442642, "logps/chosen": -2.243253469467163, "logps/rejected": -2.77600359916687, "loss": 0.702, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.243253469467163, "rewards/margins": 0.5327504277229309, "rewards/rejected": -2.77600359916687, "sft_loss": 2.220588445663452, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 3.498294436246651, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.28015822172164917, "logits/rejected": -0.210425466299057, "logps/chosen": -2.179668426513672, "logps/rejected": -2.826927423477173, "loss": 0.6961, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.179668426513672, "rewards/margins": 0.6472587585449219, "rewards/rejected": -2.826927423477173, "sft_loss": 2.0574076175689697, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 2.6428402943889853, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.16431409120559692, "logits/rejected": -0.11079733073711395, "logps/chosen": -2.178668737411499, "logps/rejected": -2.9302544593811035, "loss": 0.6863, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.178668737411499, "rewards/margins": 0.7515857219696045, "rewards/rejected": -2.9302544593811035, "sft_loss": 2.0807926654815674, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 5.6146039853408105, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.19174602627754211, "logits/rejected": -0.03761805593967438, "logps/chosen": -2.1677002906799316, "logps/rejected": -3.0381433963775635, "loss": 0.7032, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1677002906799316, "rewards/margins": 0.8704432249069214, "rewards/rejected": -3.0381433963775635, "sft_loss": 2.1944992542266846, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 4.095182368933817, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.14897744357585907, "logits/rejected": 0.013398826122283936, "logps/chosen": -2.1722137928009033, "logps/rejected": -3.0760350227355957, "loss": 0.688, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1722137928009033, "rewards/margins": 0.903821587562561, "rewards/rejected": -3.0760350227355957, "sft_loss": 2.233502149581909, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 2.3074695699932373, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.07054503262042999, "logits/rejected": -0.003586306469514966, "logps/chosen": -2.15727162361145, "logps/rejected": -3.1748225688934326, "loss": 0.6837, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.15727162361145, "rewards/margins": 1.017551302909851, "rewards/rejected": -3.1748225688934326, "sft_loss": 2.1737217903137207, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 3.6905084049689956, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.12484552711248398, "logits/rejected": 0.03302082419395447, "logps/chosen": -2.2111077308654785, "logps/rejected": -2.932865619659424, "loss": 0.6925, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.2111077308654785, "rewards/margins": 0.7217577695846558, "rewards/rejected": -2.932865619659424, "sft_loss": 2.117246389389038, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 3.7559632713951587, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.09252647310495377, "logits/rejected": 0.08411063998937607, "logps/chosen": -2.143631935119629, "logps/rejected": -3.171995162963867, "loss": 0.681, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.143631935119629, "rewards/margins": 1.0283634662628174, "rewards/rejected": -3.171995162963867, "sft_loss": 2.054680585861206, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 3.785218662060919, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.1127227172255516, "logits/rejected": -0.006171461194753647, "logps/chosen": -2.428866147994995, "logps/rejected": -3.1094393730163574, "loss": 0.6954, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.428866147994995, "rewards/margins": 0.6805731654167175, "rewards/rejected": -3.1094393730163574, "sft_loss": 2.4352684020996094, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 2.9886125247810438, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.16224198043346405, "logits/rejected": 0.03348865360021591, "logps/chosen": -2.052062511444092, "logps/rejected": -3.012722969055176, "loss": 0.6824, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.052062511444092, "rewards/margins": 0.9606603384017944, "rewards/rejected": -3.012722969055176, "sft_loss": 2.0672218799591064, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 7.474193348670629, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.11525171995162964, "logits/rejected": -0.021411413326859474, "logps/chosen": -2.1601333618164062, "logps/rejected": -3.1377646923065186, "loss": 0.6911, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1601333618164062, "rewards/margins": 0.9776315689086914, "rewards/rejected": -3.1377646923065186, "sft_loss": 2.1464765071868896, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 3.675852847011412, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.14804470539093018, "logits/rejected": -0.12396585941314697, "logps/chosen": -2.142313241958618, "logps/rejected": -2.747668743133545, "loss": 0.696, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.142313241958618, "rewards/margins": 0.605355978012085, "rewards/rejected": -2.747668743133545, "sft_loss": 2.142691135406494, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 1.827307395063867, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.144433856010437, "logits/rejected": 0.03242563083767891, "logps/chosen": -2.1680703163146973, "logps/rejected": -2.841989517211914, "loss": 0.705, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1680703163146973, "rewards/margins": 0.6739190816879272, "rewards/rejected": -2.841989517211914, "sft_loss": 2.108734369277954, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 5.850728721372914, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.02978910505771637, "logits/rejected": 0.028155144304037094, "logps/chosen": -2.1252212524414062, "logps/rejected": -2.815303325653076, "loss": 0.6879, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1252212524414062, "rewards/margins": 0.6900821924209595, "rewards/rejected": -2.815303325653076, "sft_loss": 2.041010856628418, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 2.2595190183874685, "learning_rate": 7.860009773741896e-07, "logits/chosen": 0.035829830914735794, "logits/rejected": 0.18875843286514282, "logps/chosen": -2.093374729156494, "logps/rejected": -3.026752233505249, "loss": 0.6914, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.093374729156494, "rewards/margins": 0.933377742767334, "rewards/rejected": -3.026752233505249, "sft_loss": 2.0471291542053223, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 2.895533679660134, "learning_rate": 7.84722128999652e-07, "logits/chosen": 0.055860865861177444, "logits/rejected": 0.2418518364429474, "logps/chosen": -2.094313859939575, "logps/rejected": -3.223344087600708, "loss": 0.6916, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.094313859939575, "rewards/margins": 1.1290298700332642, "rewards/rejected": -3.223344087600708, "sft_loss": 2.103170156478882, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 4.157992270118887, "learning_rate": 7.834405182497699e-07, "logits/chosen": 0.1447199285030365, "logits/rejected": 0.21992520987987518, "logps/chosen": -2.229947566986084, "logps/rejected": -2.9657130241394043, "loss": 0.6907, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.229947566986084, "rewards/margins": 0.7357650995254517, "rewards/rejected": -2.9657130241394043, "sft_loss": 2.187542676925659, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 4.614746209370739, "learning_rate": 7.821561575587368e-07, "logits/chosen": 0.028188159689307213, "logits/rejected": 0.07980314642190933, "logps/chosen": -2.128232955932617, "logps/rejected": -2.8294498920440674, "loss": 0.6957, "rewards/accuracies": 0.6875, "rewards/chosen": -2.128232955932617, "rewards/margins": 0.7012170553207397, "rewards/rejected": -2.8294498920440674, "sft_loss": 2.120162010192871, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 2.6420371183347204, "learning_rate": 7.808690593874254e-07, "logits/chosen": 0.020165940746665, "logits/rejected": 0.1338595598936081, "logps/chosen": -2.051849603652954, "logps/rejected": -3.0664772987365723, "loss": 0.6796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.051849603652954, "rewards/margins": 1.0146278142929077, "rewards/rejected": -3.0664772987365723, "sft_loss": 2.0553438663482666, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 3.764557498320856, "learning_rate": 7.79579236223268e-07, "logits/chosen": 0.07896244525909424, "logits/rejected": 0.3769746422767639, "logps/chosen": -2.0657172203063965, "logps/rejected": -3.0718822479248047, "loss": 0.6851, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0657172203063965, "rewards/margins": 1.006164789199829, "rewards/rejected": -3.0718822479248047, "sft_loss": 2.0536303520202637, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 3.9943672104879697, "learning_rate": 7.782867005801346e-07, "logits/chosen": 0.03182698413729668, "logits/rejected": 0.26468613743782043, "logps/chosen": -2.0125889778137207, "logps/rejected": -2.906167507171631, "loss": 0.6767, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0125889778137207, "rewards/margins": 0.8935783505439758, "rewards/rejected": -2.906167507171631, "sft_loss": 2.0478873252868652, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 4.330009826251876, "learning_rate": 7.769914649982117e-07, "logits/chosen": 0.061708033084869385, "logits/rejected": 0.2412114441394806, "logps/chosen": -2.091811180114746, "logps/rejected": -2.896186351776123, "loss": 0.6874, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.091811180114746, "rewards/margins": 0.8043753504753113, "rewards/rejected": -2.896186351776123, "sft_loss": 2.1207423210144043, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 9.02744531612043, "learning_rate": 7.756935420438803e-07, "logits/chosen": 0.034645337611436844, "logits/rejected": 0.1325272023677826, "logps/chosen": -2.0987558364868164, "logps/rejected": -3.0625805854797363, "loss": 0.6892, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0987558364868164, "rewards/margins": 0.9638249278068542, "rewards/rejected": -3.0625805854797363, "sft_loss": 2.1816165447235107, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 1.866573424200562, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.0053394995629787445, "logits/rejected": 0.0790596604347229, "logps/chosen": -2.267483711242676, "logps/rejected": -3.1621835231781006, "loss": 0.6956, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.267483711242676, "rewards/margins": 0.8946998715400696, "rewards/rejected": -3.1621835231781006, "sft_loss": 2.173220157623291, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 2.2911175585506394, "learning_rate": 7.730896844137609e-07, "logits/chosen": 0.014605918899178505, "logits/rejected": 0.10750974714756012, "logps/chosen": -2.3764493465423584, "logps/rejected": -3.1504592895507812, "loss": 0.6958, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3764493465423584, "rewards/margins": 0.7740097045898438, "rewards/rejected": -3.1504592895507812, "sft_loss": 2.3708794116973877, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 2.1480728234079156, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.03691485524177551, "logits/rejected": 0.07787419110536575, "logps/chosen": -2.2911763191223145, "logps/rejected": -3.2916271686553955, "loss": 0.6771, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2911763191223145, "rewards/margins": 1.000450611114502, "rewards/rejected": -3.2916271686553955, "sft_loss": 2.260286331176758, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 1.8362749505910247, "learning_rate": 7.704752287400832e-07, "logits/chosen": 0.013022062368690968, "logits/rejected": 0.22734710574150085, "logps/chosen": -2.223485231399536, "logps/rejected": -3.106647491455078, "loss": 0.6952, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.223485231399536, "rewards/margins": 0.8831623792648315, "rewards/rejected": -3.106647491455078, "sft_loss": 2.1939940452575684, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 3.314984005450456, "learning_rate": 7.691640583277004e-07, "logits/chosen": 0.0009105164790526032, "logits/rejected": 0.19259147346019745, "logps/chosen": -2.1488661766052246, "logps/rejected": -3.095223903656006, "loss": 0.684, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1488661766052246, "rewards/margins": 0.9463576078414917, "rewards/rejected": -3.095223903656006, "sft_loss": 2.160351037979126, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 2.426602568608894, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.05211140587925911, "logits/rejected": 0.16416163742542267, "logps/chosen": -2.1900627613067627, "logps/rejected": -2.8962035179138184, "loss": 0.6942, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1900627613067627, "rewards/margins": 0.7061406970024109, "rewards/rejected": -2.8962035179138184, "sft_loss": 2.170743465423584, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 3.4497849360982396, "learning_rate": 7.665338959566288e-07, "logits/chosen": 0.022833820432424545, "logits/rejected": 0.14317765831947327, "logps/chosen": -2.170104742050171, "logps/rejected": -3.0590224266052246, "loss": 0.6728, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.170104742050171, "rewards/margins": 0.8889178037643433, "rewards/rejected": -3.0590224266052246, "sft_loss": 2.172898769378662, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 2.939900637807476, "learning_rate": 7.652149295157868e-07, "logits/chosen": 0.10830320417881012, "logits/rejected": 0.29697293043136597, "logps/chosen": -2.2700865268707275, "logps/rejected": -2.937676429748535, "loss": 0.6844, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2700865268707275, "rewards/margins": 0.6675900816917419, "rewards/rejected": -2.937676429748535, "sft_loss": 2.2042338848114014, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 3.276701370462819, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.16689571738243103, "logits/rejected": 0.2072555124759674, "logps/chosen": -2.110347032546997, "logps/rejected": -3.064884901046753, "loss": 0.6828, "rewards/accuracies": 0.75, "rewards/chosen": -2.110347032546997, "rewards/margins": 0.9545377492904663, "rewards/rejected": -3.064884901046753, "sft_loss": 2.2188868522644043, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 3.985887115424679, "learning_rate": 7.625692901064573e-07, "logits/chosen": 0.06006260961294174, "logits/rejected": 0.18580766022205353, "logps/chosen": -2.265280246734619, "logps/rejected": -3.1473121643066406, "loss": 0.6864, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.265280246734619, "rewards/margins": 0.8820323944091797, "rewards/rejected": -3.1473121643066406, "sft_loss": 2.2683613300323486, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 3.1283800272101887, "learning_rate": 7.61242642805975e-07, "logits/chosen": 0.05172146484255791, "logits/rejected": 0.04666357487440109, "logps/chosen": -2.2359461784362793, "logps/rejected": -2.916210889816284, "loss": 0.6922, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.2359461784362793, "rewards/margins": 0.6802645325660706, "rewards/rejected": -2.916210889816284, "sft_loss": 2.3265719413757324, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 2.438764303944812, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.08242259919643402, "logits/rejected": 0.1345296949148178, "logps/chosen": -2.2915401458740234, "logps/rejected": -2.9627060890197754, "loss": 0.7024, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.2915401458740234, "rewards/margins": 0.6711658239364624, "rewards/rejected": -2.9627060890197754, "sft_loss": 2.277489423751831, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 3.011823113421532, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.07217409461736679, "logits/rejected": 0.06581093370914459, "logps/chosen": -2.0234055519104004, "logps/rejected": -3.072521686553955, "loss": 0.6714, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0234055519104004, "rewards/margins": 1.0491163730621338, "rewards/rejected": -3.072521686553955, "sft_loss": 2.065383195877075, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 2.8564716043679446, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.02109135314822197, "logits/rejected": 0.08697710931301117, "logps/chosen": -2.264143705368042, "logps/rejected": -3.120563268661499, "loss": 0.6907, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.264143705368042, "rewards/margins": 0.856419563293457, "rewards/rejected": -3.120563268661499, "sft_loss": 2.190354347229004, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 2.7426036225809076, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.07922705262899399, "logits/rejected": 0.06583412736654282, "logps/chosen": -2.1012039184570312, "logps/rejected": -2.7338175773620605, "loss": 0.7093, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1012039184570312, "rewards/margins": 0.6326137781143188, "rewards/rejected": -2.7338175773620605, "sft_loss": 2.1062769889831543, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 4.963715932295364, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.08481200039386749, "logits/rejected": 0.1758030205965042, "logps/chosen": -2.024156093597412, "logps/rejected": -3.14689564704895, "loss": 0.6812, "rewards/accuracies": 0.71875, "rewards/chosen": -2.024156093597412, "rewards/margins": 1.1227390766143799, "rewards/rejected": -3.14689564704895, "sft_loss": 2.032785415649414, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 10.948141582318529, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.06958422064781189, "logits/rejected": 0.13180723786354065, "logps/chosen": -2.123347043991089, "logps/rejected": -3.3177521228790283, "loss": 0.6991, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.123347043991089, "rewards/margins": 1.194405198097229, "rewards/rejected": -3.3177521228790283, "sft_loss": 2.058178663253784, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 2.5259745067937915, "learning_rate": 7.518858674054838e-07, "logits/chosen": 0.00844159722328186, "logits/rejected": 0.24382808804512024, "logps/chosen": -2.0869460105895996, "logps/rejected": -2.9912471771240234, "loss": 0.6802, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0869460105895996, "rewards/margins": 0.9043010473251343, "rewards/rejected": -2.9912471771240234, "sft_loss": 2.063582420349121, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 3.379119975688467, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.015473166480660439, "logits/rejected": 0.1876002997159958, "logps/chosen": -2.170123815536499, "logps/rejected": -3.030186891555786, "loss": 0.6819, "rewards/accuracies": 0.71875, "rewards/chosen": -2.170123815536499, "rewards/margins": 0.860063374042511, "rewards/rejected": -3.030186891555786, "sft_loss": 2.189866781234741, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 3.51519766181329, "learning_rate": 7.491903137181501e-07, "logits/chosen": 0.052792083472013474, "logits/rejected": 0.10203012079000473, "logps/chosen": -2.100395679473877, "logps/rejected": -2.775660991668701, "loss": 0.6966, "rewards/accuracies": 0.71875, "rewards/chosen": -2.100395679473877, "rewards/margins": 0.6752654314041138, "rewards/rejected": -2.775660991668701, "sft_loss": 2.113642454147339, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 3.8963991140526897, "learning_rate": 7.478389038622441e-07, "logits/chosen": 0.10614132881164551, "logits/rejected": 0.15122470259666443, "logps/chosen": -2.0579097270965576, "logps/rejected": -2.9311022758483887, "loss": 0.6794, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0579097270965576, "rewards/margins": 0.8731926679611206, "rewards/rejected": -2.9311022758483887, "sft_loss": 2.034806728363037, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 2.5495966112877837, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.05587650090456009, "logits/rejected": 0.05207020044326782, "logps/chosen": -2.127136468887329, "logps/rejected": -2.8834292888641357, "loss": 0.6947, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.127136468887329, "rewards/margins": 0.7562929391860962, "rewards/rejected": -2.8834292888641357, "sft_loss": 2.09360408782959, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 2.5223855716345147, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.001058913767337799, "logits/rejected": 0.004094363190233707, "logps/chosen": -2.078620433807373, "logps/rejected": -2.7044858932495117, "loss": 0.6921, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.078620433807373, "rewards/margins": 0.6258653998374939, "rewards/rejected": -2.7044858932495117, "sft_loss": 2.1078619956970215, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 3.504723988742914, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.04328620433807373, "logits/rejected": 0.06203915923833847, "logps/chosen": -2.0743579864501953, "logps/rejected": -2.8254854679107666, "loss": 0.6925, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0743579864501953, "rewards/margins": 0.7511274814605713, "rewards/rejected": -2.8254854679107666, "sft_loss": 2.123765468597412, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 4.403498456561451, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.18621978163719177, "logits/rejected": 0.007757553365081549, "logps/chosen": -1.9891008138656616, "logps/rejected": -3.0335018634796143, "loss": 0.6723, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9891008138656616, "rewards/margins": 1.044400930404663, "rewards/rejected": -3.0335018634796143, "sft_loss": 1.9587218761444092, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 3.4931091756307078, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.1046561747789383, "logits/rejected": 0.038362883031368256, "logps/chosen": -2.0162036418914795, "logps/rejected": -3.0622410774230957, "loss": 0.6688, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0162036418914795, "rewards/margins": 1.0460376739501953, "rewards/rejected": -3.0622410774230957, "sft_loss": 1.9905481338500977, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 13.147933058773736, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.16363480687141418, "logits/rejected": 0.05883455276489258, "logps/chosen": -1.9224354028701782, "logps/rejected": -2.836124897003174, "loss": 0.6824, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9224354028701782, "rewards/margins": 0.9136892557144165, "rewards/rejected": -2.836124897003174, "sft_loss": 1.9475923776626587, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 4.1661904330642665, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.08059743791818619, "logits/rejected": 0.12536141276359558, "logps/chosen": -2.1742968559265137, "logps/rejected": -3.200460433959961, "loss": 0.6847, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1742968559265137, "rewards/margins": 1.0261633396148682, "rewards/rejected": -3.200460433959961, "sft_loss": 2.1663155555725098, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 4.642258363325745, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.18666820228099823, "logits/rejected": -0.010999524965882301, "logps/chosen": -2.0745177268981934, "logps/rejected": -3.116467237472534, "loss": 0.7008, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0745177268981934, "rewards/margins": 1.0419495105743408, "rewards/rejected": -3.116467237472534, "sft_loss": 2.03446888923645, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 3.6794418828233146, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.14328600466251373, "logits/rejected": -0.031096193939447403, "logps/chosen": -2.2875900268554688, "logps/rejected": -3.104003429412842, "loss": 0.7005, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2875900268554688, "rewards/margins": 0.8164132833480835, "rewards/rejected": -3.104003429412842, "sft_loss": 2.327225923538208, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 3.7502568359924355, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.1312979757785797, "logits/rejected": 0.020264511927962303, "logps/chosen": -2.2415642738342285, "logps/rejected": -3.072537422180176, "loss": 0.6959, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.2415642738342285, "rewards/margins": 0.8309730291366577, "rewards/rejected": -3.072537422180176, "sft_loss": 2.2999441623687744, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 2.6776205141460423, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.1437736451625824, "logits/rejected": 0.004277849104255438, "logps/chosen": -2.3677988052368164, "logps/rejected": -3.3778228759765625, "loss": 0.6894, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3677988052368164, "rewards/margins": 1.0100243091583252, "rewards/rejected": -3.3778228759765625, "sft_loss": 2.324275493621826, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 4.915817636495881, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.1395511031150818, "logits/rejected": 0.1314028948545456, "logps/chosen": -2.116849899291992, "logps/rejected": -3.1788382530212402, "loss": 0.6852, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.116849899291992, "rewards/margins": 1.0619887113571167, "rewards/rejected": -3.1788382530212402, "sft_loss": 2.091264247894287, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 2.9800217980885235, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.15477004647254944, "logits/rejected": 0.0680699497461319, "logps/chosen": -2.162829875946045, "logps/rejected": -2.9792914390563965, "loss": 0.6822, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.162829875946045, "rewards/margins": 0.8164618611335754, "rewards/rejected": -2.9792914390563965, "sft_loss": 2.13439679145813, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 3.16895488754896, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.05908023193478584, "logits/rejected": -0.02534184232354164, "logps/chosen": -2.12937593460083, "logps/rejected": -2.8794002532958984, "loss": 0.6953, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.12937593460083, "rewards/margins": 0.7500244379043579, "rewards/rejected": -2.8794002532958984, "sft_loss": 2.089693784713745, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 4.977184632311022, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.01070446241647005, "logits/rejected": 0.08270622789859772, "logps/chosen": -2.0935487747192383, "logps/rejected": -2.8940985202789307, "loss": 0.6912, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0935487747192383, "rewards/margins": 0.8005493879318237, "rewards/rejected": -2.8940985202789307, "sft_loss": 2.0332162380218506, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 3.0684148312667245, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.1379140168428421, "logits/rejected": -0.010696396231651306, "logps/chosen": -2.1980350017547607, "logps/rejected": -3.0138888359069824, "loss": 0.6841, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1980350017547607, "rewards/margins": 0.8158538937568665, "rewards/rejected": -3.0138888359069824, "sft_loss": 2.1696524620056152, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 3.6399155538573518, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.052761245518922806, "logits/rejected": 0.0353345051407814, "logps/chosen": -2.1806437969207764, "logps/rejected": -3.1056928634643555, "loss": 0.6827, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1806437969207764, "rewards/margins": 0.9250493049621582, "rewards/rejected": -3.1056928634643555, "sft_loss": 2.1760284900665283, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 2.599231904829798, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.22138762474060059, "logits/rejected": -0.01419213879853487, "logps/chosen": -2.241899013519287, "logps/rejected": -3.0917603969573975, "loss": 0.6936, "rewards/accuracies": 0.6875, "rewards/chosen": -2.241899013519287, "rewards/margins": 0.8498618006706238, "rewards/rejected": -3.0917603969573975, "sft_loss": 2.1691365242004395, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 2.7305141878885832, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.13783904910087585, "logits/rejected": 0.042418282479047775, "logps/chosen": -2.161641836166382, "logps/rejected": -3.363574266433716, "loss": 0.6809, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.161641836166382, "rewards/margins": 1.2019329071044922, "rewards/rejected": -3.363574266433716, "sft_loss": 2.0902111530303955, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 2.878417908727133, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.09307009726762772, "logits/rejected": 0.06742610782384872, "logps/chosen": -2.1318047046661377, "logps/rejected": -2.8575503826141357, "loss": 0.6898, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1318047046661377, "rewards/margins": 0.7257457375526428, "rewards/rejected": -2.8575503826141357, "sft_loss": 2.0882675647735596, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 4.200935469789636, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.08697251975536346, "logits/rejected": 0.056865572929382324, "logps/chosen": -2.0718178749084473, "logps/rejected": -3.113229751586914, "loss": 0.6907, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0718178749084473, "rewards/margins": 1.0414116382598877, "rewards/rejected": -3.113229751586914, "sft_loss": 2.051907539367676, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 4.0478106630832515, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.09301309287548065, "logits/rejected": 0.0913057029247284, "logps/chosen": -2.084655284881592, "logps/rejected": -3.1130805015563965, "loss": 0.6852, "rewards/accuracies": 0.6875, "rewards/chosen": -2.084655284881592, "rewards/margins": 1.0284250974655151, "rewards/rejected": -3.1130805015563965, "sft_loss": 2.156642436981201, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 2.4032541950503625, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.07560623437166214, "logits/rejected": 0.08645117282867432, "logps/chosen": -2.083491802215576, "logps/rejected": -3.286257266998291, "loss": 0.6805, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.083491802215576, "rewards/margins": 1.2027655839920044, "rewards/rejected": -3.286257266998291, "sft_loss": 2.120220184326172, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 4.0463490987662265, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.011971333995461464, "logits/rejected": 0.16497957706451416, "logps/chosen": -2.0660626888275146, "logps/rejected": -2.881598711013794, "loss": 0.6863, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0660626888275146, "rewards/margins": 0.815536379814148, "rewards/rejected": -2.881598711013794, "sft_loss": 2.107175827026367, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 4.071360410507156, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.18575963377952576, "logits/rejected": 0.02247786521911621, "logps/chosen": -1.9347765445709229, "logps/rejected": -2.760807514190674, "loss": 0.6917, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.9347765445709229, "rewards/margins": 0.8260312080383301, "rewards/rejected": -2.760807514190674, "sft_loss": 1.933650016784668, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 3.135509516176366, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.09797381609678268, "logits/rejected": 0.10881340503692627, "logps/chosen": -2.189521312713623, "logps/rejected": -3.2549827098846436, "loss": 0.6822, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.189521312713623, "rewards/margins": 1.06546151638031, "rewards/rejected": -3.2549827098846436, "sft_loss": 2.204078197479248, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 2.486331336437711, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.00970968697220087, "logits/rejected": 0.11408871412277222, "logps/chosen": -2.2408223152160645, "logps/rejected": -3.1621487140655518, "loss": 0.6857, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2408223152160645, "rewards/margins": 0.9213263392448425, "rewards/rejected": -3.1621487140655518, "sft_loss": 2.1868247985839844, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 2.688927072409448, "learning_rate": 7.090717170722817e-07, "logits/chosen": 0.014814998023211956, "logits/rejected": 0.10555567592382431, "logps/chosen": -2.166475534439087, "logps/rejected": -3.28424072265625, "loss": 0.6846, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.166475534439087, "rewards/margins": 1.117765188217163, "rewards/rejected": -3.28424072265625, "sft_loss": 2.1495277881622314, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 7.139311582165276, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.17499220371246338, "logits/rejected": -0.010774696245789528, "logps/chosen": -2.0613772869110107, "logps/rejected": -2.8371224403381348, "loss": 0.707, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0613772869110107, "rewards/margins": 0.7757450938224792, "rewards/rejected": -2.8371224403381348, "sft_loss": 2.0831761360168457, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 3.6126041542575633, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.09352283179759979, "logits/rejected": 0.05490085482597351, "logps/chosen": -2.0558700561523438, "logps/rejected": -2.79360032081604, "loss": 0.6894, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0558700561523438, "rewards/margins": 0.7377304434776306, "rewards/rejected": -2.79360032081604, "sft_loss": 2.055211305618286, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.24242374300956726, "eval_logits/rejected": 0.34696489572525024, "eval_logps/chosen": -2.1527044773101807, "eval_logps/rejected": -3.098733901977539, "eval_loss": 0.6908385753631592, "eval_rewards/accuracies": 0.6810088753700256, "eval_rewards/chosen": -2.1527044773101807, "eval_rewards/margins": 0.9460291862487793, "eval_rewards/rejected": -3.098733901977539, "eval_runtime": 46.6607, "eval_samples_per_second": 28.825, "eval_sft_loss": 2.1131699085235596, "eval_steps_per_second": 7.222, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 3.0229337105586116, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.04934023693203926, "logits/rejected": 0.10366056859493256, "logps/chosen": -2.0795083045959473, "logps/rejected": -3.3166897296905518, "loss": 0.6839, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0795083045959473, "rewards/margins": 1.2371810674667358, "rewards/rejected": -3.3166897296905518, "sft_loss": 2.0751585960388184, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 2.077199559958702, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.12134470790624619, "logits/rejected": 0.07075067609548569, "logps/chosen": -2.217790365219116, "logps/rejected": -2.803415298461914, "loss": 0.6975, "rewards/accuracies": 0.6875, "rewards/chosen": -2.217790365219116, "rewards/margins": 0.5856245756149292, "rewards/rejected": -2.803415298461914, "sft_loss": 2.1529674530029297, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 10.27680753319747, "learning_rate": 7.019730732632681e-07, "logits/chosen": 0.008393806405365467, "logits/rejected": 0.11342382431030273, "logps/chosen": -2.071897268295288, "logps/rejected": -3.249005079269409, "loss": 0.675, "rewards/accuracies": 0.71875, "rewards/chosen": -2.071897268295288, "rewards/margins": 1.177107572555542, "rewards/rejected": -3.249005079269409, "sft_loss": 2.075561046600342, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 4.082283072340448, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.12556777894496918, "logits/rejected": 0.016571324318647385, "logps/chosen": -2.1948039531707764, "logps/rejected": -3.23891019821167, "loss": 0.6776, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1948039531707764, "rewards/margins": 1.0441062450408936, "rewards/rejected": -3.23891019821167, "sft_loss": 2.202822208404541, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 3.0767883445658026, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.051589228212833405, "logits/rejected": 0.05762894079089165, "logps/chosen": -1.9911400079727173, "logps/rejected": -2.8439207077026367, "loss": 0.681, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9911400079727173, "rewards/margins": 0.8527809381484985, "rewards/rejected": -2.8439207077026367, "sft_loss": 2.1229031085968018, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 1.8886738530903202, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.046624403446912766, "logits/rejected": 0.036238282918930054, "logps/chosen": -2.253139019012451, "logps/rejected": -3.0171914100646973, "loss": 0.7037, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.253139019012451, "rewards/margins": 0.7640522718429565, "rewards/rejected": -3.0171914100646973, "sft_loss": 2.2068352699279785, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 2.411245095502254, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.0738714337348938, "logits/rejected": 0.08386780321598053, "logps/chosen": -2.155534505844116, "logps/rejected": -2.863119125366211, "loss": 0.6965, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.155534505844116, "rewards/margins": 0.7075840830802917, "rewards/rejected": -2.863119125366211, "sft_loss": 2.1776299476623535, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 3.537028745158427, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.12377981841564178, "logits/rejected": 0.02569674327969551, "logps/chosen": -2.1482458114624023, "logps/rejected": -3.1123929023742676, "loss": 0.6931, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1482458114624023, "rewards/margins": 0.9641472101211548, "rewards/rejected": -3.1123929023742676, "sft_loss": 2.0953750610351562, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 2.2661497164287674, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.1352919489145279, "logits/rejected": 0.03508482128381729, "logps/chosen": -2.173128604888916, "logps/rejected": -2.9652352333068848, "loss": 0.6932, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.173128604888916, "rewards/margins": 0.7921067476272583, "rewards/rejected": -2.9652352333068848, "sft_loss": 2.1787595748901367, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 2.8400594818591367, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.08295096457004547, "logits/rejected": 0.07789567112922668, "logps/chosen": -2.1355133056640625, "logps/rejected": -3.1785805225372314, "loss": 0.6797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1355133056640625, "rewards/margins": 1.0430670976638794, "rewards/rejected": -3.1785805225372314, "sft_loss": 2.096576452255249, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 1.7924089901573266, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.1479923576116562, "logits/rejected": 0.13825397193431854, "logps/chosen": -2.13856840133667, "logps/rejected": -3.0264368057250977, "loss": 0.6844, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.13856840133667, "rewards/margins": 0.8878685832023621, "rewards/rejected": -3.0264368057250977, "sft_loss": 2.0962822437286377, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 2.7934562089784354, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.08230151981115341, "logits/rejected": 0.0383109524846077, "logps/chosen": -2.1695401668548584, "logps/rejected": -2.992586612701416, "loss": 0.6834, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1695401668548584, "rewards/margins": 0.8230465054512024, "rewards/rejected": -2.992586612701416, "sft_loss": 2.1208584308624268, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 2.4992385655272455, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.08335469663143158, "logits/rejected": 0.017959536984562874, "logps/chosen": -2.0857455730438232, "logps/rejected": -3.0713672637939453, "loss": 0.6792, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0857455730438232, "rewards/margins": 0.9856218099594116, "rewards/rejected": -3.0713672637939453, "sft_loss": 2.0089364051818848, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 10.105793611988156, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.21285517513751984, "logits/rejected": -0.1051969975233078, "logps/chosen": -2.1936020851135254, "logps/rejected": -2.995419979095459, "loss": 0.701, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1936020851135254, "rewards/margins": 0.8018182516098022, "rewards/rejected": -2.995419979095459, "sft_loss": 2.244130849838257, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 2.575060081317489, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.07824737578630447, "logits/rejected": -0.01782209798693657, "logps/chosen": -2.265883445739746, "logps/rejected": -3.1407861709594727, "loss": 0.6982, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.265883445739746, "rewards/margins": 0.8749030232429504, "rewards/rejected": -3.1407861709594727, "sft_loss": 2.172591209411621, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 3.530855461527871, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.20899859070777893, "logits/rejected": -0.006174634210765362, "logps/chosen": -2.1980414390563965, "logps/rejected": -3.2681076526641846, "loss": 0.6907, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1980414390563965, "rewards/margins": 1.070065975189209, "rewards/rejected": -3.2681076526641846, "sft_loss": 2.2746310234069824, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 4.209542371097667, "learning_rate": 6.818417974097246e-07, "logits/chosen": 0.0029031604062765837, "logits/rejected": 0.18295882642269135, "logps/chosen": -2.1312851905822754, "logps/rejected": -3.35154390335083, "loss": 0.6764, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1312851905822754, "rewards/margins": 1.2202587127685547, "rewards/rejected": -3.35154390335083, "sft_loss": 2.1908926963806152, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 2.804247444623222, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.11487329006195068, "logits/rejected": -0.03248930722475052, "logps/chosen": -2.2412500381469727, "logps/rejected": -3.1712937355041504, "loss": 0.6925, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2412500381469727, "rewards/margins": 0.9300435185432434, "rewards/rejected": -3.1712937355041504, "sft_loss": 2.2203731536865234, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 3.353038019020451, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.06891624629497528, "logits/rejected": -0.036821819841861725, "logps/chosen": -2.2421700954437256, "logps/rejected": -3.067368984222412, "loss": 0.6936, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2421700954437256, "rewards/margins": 0.8251991271972656, "rewards/rejected": -3.067368984222412, "sft_loss": 2.243100643157959, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 3.458953722038985, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.057009391486644745, "logits/rejected": 0.11005090177059174, "logps/chosen": -2.0556159019470215, "logps/rejected": -2.845609188079834, "loss": 0.687, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0556159019470215, "rewards/margins": 0.7899934649467468, "rewards/rejected": -2.845609188079834, "sft_loss": 2.098008632659912, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 3.94927382740404, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.06724351644515991, "logits/rejected": 0.1374901980161667, "logps/chosen": -2.1209895610809326, "logps/rejected": -3.237943172454834, "loss": 0.6795, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1209895610809326, "rewards/margins": 1.116953730583191, "rewards/rejected": -3.237943172454834, "sft_loss": 2.0984835624694824, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 5.166669704979176, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.0967196449637413, "logits/rejected": 0.0657767504453659, "logps/chosen": -2.0116353034973145, "logps/rejected": -3.0105011463165283, "loss": 0.6755, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0116353034973145, "rewards/margins": 0.9988659024238586, "rewards/rejected": -3.0105011463165283, "sft_loss": 2.0762581825256348, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 2.1962659992003672, "learning_rate": 6.731060147567236e-07, "logits/chosen": 0.0395740307867527, "logits/rejected": 0.173264741897583, "logps/chosen": -2.1408400535583496, "logps/rejected": -3.083125114440918, "loss": 0.6806, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1408400535583496, "rewards/margins": 0.9422849416732788, "rewards/rejected": -3.083125114440918, "sft_loss": 2.105454206466675, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 1.9316971958150804, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.04250287264585495, "logits/rejected": 0.05922934412956238, "logps/chosen": -2.09122633934021, "logps/rejected": -2.858992338180542, "loss": 0.6944, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.09122633934021, "rewards/margins": 0.7677661180496216, "rewards/rejected": -2.858992338180542, "sft_loss": 1.9963337182998657, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 4.502744430309704, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.02525998279452324, "logits/rejected": 0.0633404478430748, "logps/chosen": -2.0883584022521973, "logps/rejected": -2.9223198890686035, "loss": 0.6841, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0883584022521973, "rewards/margins": 0.8339619636535645, "rewards/rejected": -2.9223198890686035, "sft_loss": 2.147000789642334, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 17.42620499120433, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.04304816573858261, "logits/rejected": 0.05292002111673355, "logps/chosen": -2.245084285736084, "logps/rejected": -2.8690338134765625, "loss": 0.7071, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.245084285736084, "rewards/margins": 0.6239495873451233, "rewards/rejected": -2.8690338134765625, "sft_loss": 2.1775736808776855, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 2.675042367787873, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.12775865197181702, "logits/rejected": 0.05053830146789551, "logps/chosen": -2.1142070293426514, "logps/rejected": -3.1125524044036865, "loss": 0.6889, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1142070293426514, "rewards/margins": 0.9983454942703247, "rewards/rejected": -3.1125524044036865, "sft_loss": 2.073481559753418, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 9.224772976262004, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.04525815695524216, "logits/rejected": 0.15498168766498566, "logps/chosen": -2.4505741596221924, "logps/rejected": -3.086620807647705, "loss": 0.698, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4505741596221924, "rewards/margins": 0.6360467076301575, "rewards/rejected": -3.086620807647705, "sft_loss": 2.2530603408813477, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 2.7095060842080856, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.05475207418203354, "logits/rejected": 0.17921461164951324, "logps/chosen": -2.172621965408325, "logps/rejected": -3.311145782470703, "loss": 0.6835, "rewards/accuracies": 0.71875, "rewards/chosen": -2.172621965408325, "rewards/margins": 1.138523817062378, "rewards/rejected": -3.311145782470703, "sft_loss": 2.169924259185791, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 2.836953925052455, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.042983829975128174, "logits/rejected": 0.14592500030994415, "logps/chosen": -2.232512950897217, "logps/rejected": -2.960404634475708, "loss": 0.6976, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.232512950897217, "rewards/margins": 0.7278915643692017, "rewards/rejected": -2.960404634475708, "sft_loss": 2.1754684448242188, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 4.065292751766514, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.020567212253808975, "logits/rejected": 0.1053391695022583, "logps/chosen": -2.131559133529663, "logps/rejected": -3.206552505493164, "loss": 0.6834, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.131559133529663, "rewards/margins": 1.074993371963501, "rewards/rejected": -3.206552505493164, "sft_loss": 2.1917171478271484, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 5.133914146344989, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.00458473339676857, "logits/rejected": 0.14149120450019836, "logps/chosen": -2.0979244709014893, "logps/rejected": -3.1505038738250732, "loss": 0.6871, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0979244709014893, "rewards/margins": 1.0525795221328735, "rewards/rejected": -3.1505038738250732, "sft_loss": 2.0719380378723145, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 5.66680961721883, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.03058113530278206, "logits/rejected": 0.11164456605911255, "logps/chosen": -2.146927833557129, "logps/rejected": -3.096108913421631, "loss": 0.6887, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.146927833557129, "rewards/margins": 0.9491811990737915, "rewards/rejected": -3.096108913421631, "sft_loss": 2.147765874862671, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 6.270360510361445, "learning_rate": 6.569356025551454e-07, "logits/chosen": 0.0009026816114783287, "logits/rejected": 0.06363946944475174, "logps/chosen": -2.144469738006592, "logps/rejected": -2.9771530628204346, "loss": 0.681, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.144469738006592, "rewards/margins": 0.832683265209198, "rewards/rejected": -2.9771530628204346, "sft_loss": 2.064734935760498, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 8.729703440563144, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.12333973497152328, "logits/rejected": 0.06011788919568062, "logps/chosen": -2.118159770965576, "logps/rejected": -3.038071393966675, "loss": 0.6885, "rewards/accuracies": 0.6875, "rewards/chosen": -2.118159770965576, "rewards/margins": 0.9199115037918091, "rewards/rejected": -3.038071393966675, "sft_loss": 2.1175942420959473, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 4.973931538666568, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.07782983034849167, "logits/rejected": 0.012289203703403473, "logps/chosen": -2.131692409515381, "logps/rejected": -2.7401137351989746, "loss": 0.6987, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.131692409515381, "rewards/margins": 0.6084216833114624, "rewards/rejected": -2.7401137351989746, "sft_loss": 2.1399919986724854, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 3.318965976930725, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.0142856789752841, "logits/rejected": 0.15498195588588715, "logps/chosen": -2.137073516845703, "logps/rejected": -3.057272434234619, "loss": 0.6765, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.137073516845703, "rewards/margins": 0.9201983213424683, "rewards/rejected": -3.057272434234619, "sft_loss": 2.040189027786255, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 5.2308567106999595, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.09322819858789444, "logits/rejected": 0.04602864384651184, "logps/chosen": -2.081974983215332, "logps/rejected": -3.0379087924957275, "loss": 0.6917, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.081974983215332, "rewards/margins": 0.9559333920478821, "rewards/rejected": -3.0379087924957275, "sft_loss": 2.123471736907959, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 4.1952284473055075, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.1261480301618576, "logits/rejected": -0.0629156082868576, "logps/chosen": -2.166938304901123, "logps/rejected": -2.9580254554748535, "loss": 0.6908, "rewards/accuracies": 0.6875, "rewards/chosen": -2.166938304901123, "rewards/margins": 0.7910870313644409, "rewards/rejected": -2.9580254554748535, "sft_loss": 2.124786853790283, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 4.768424623304929, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.014707823283970356, "logits/rejected": 0.1322084218263626, "logps/chosen": -2.1736626625061035, "logps/rejected": -2.9306247234344482, "loss": 0.68, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1736626625061035, "rewards/margins": 0.7569620013237, "rewards/rejected": -2.9306247234344482, "sft_loss": 2.120302200317383, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 2.7539073753290637, "learning_rate": 6.465482192203129e-07, "logits/chosen": 0.013084961101412773, "logits/rejected": 0.0828789547085762, "logps/chosen": -2.197340250015259, "logps/rejected": -2.9716684818267822, "loss": 0.6936, "rewards/accuracies": 0.625, "rewards/chosen": -2.197340250015259, "rewards/margins": 0.774328351020813, "rewards/rejected": -2.9716684818267822, "sft_loss": 2.27701735496521, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 4.566894831208481, "learning_rate": 6.45058504694559e-07, "logits/chosen": 0.021959930658340454, "logits/rejected": 0.07701905816793442, "logps/chosen": -2.2421679496765137, "logps/rejected": -3.062756061553955, "loss": 0.69, "rewards/accuracies": 0.65625, "rewards/chosen": -2.2421679496765137, "rewards/margins": 0.8205882906913757, "rewards/rejected": -3.062756061553955, "sft_loss": 2.292219638824463, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 4.95882769400696, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.07913017272949219, "logits/rejected": 0.06965837627649307, "logps/chosen": -2.2385611534118652, "logps/rejected": -3.219817638397217, "loss": 0.6896, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2385611534118652, "rewards/margins": 0.9812566041946411, "rewards/rejected": -3.219817638397217, "sft_loss": 2.2946548461914062, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 9.493816800187712, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.06189330667257309, "logits/rejected": -0.11217920482158661, "logps/chosen": -2.3079352378845215, "logps/rejected": -2.866248369216919, "loss": 0.7073, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.3079352378845215, "rewards/margins": 0.5583130121231079, "rewards/rejected": -2.866248369216919, "sft_loss": 2.3487675189971924, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 2.054016193827397, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.14082583785057068, "logits/rejected": 0.02557968534529209, "logps/chosen": -2.4099087715148926, "logps/rejected": -3.370522975921631, "loss": 0.6846, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.4099087715148926, "rewards/margins": 0.9606143236160278, "rewards/rejected": -3.370522975921631, "sft_loss": 2.304067850112915, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 4.057502087752727, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.22423699498176575, "logits/rejected": -0.01131142396479845, "logps/chosen": -2.3235392570495605, "logps/rejected": -3.3511900901794434, "loss": 0.6996, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3235392570495605, "rewards/margins": 1.0276509523391724, "rewards/rejected": -3.3511900901794434, "sft_loss": 2.3187010288238525, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 2.2240564234525264, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.2010897397994995, "logits/rejected": -0.10360528528690338, "logps/chosen": -2.27347993850708, "logps/rejected": -3.3287010192871094, "loss": 0.6869, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.27347993850708, "rewards/margins": 1.0552215576171875, "rewards/rejected": -3.3287010192871094, "sft_loss": 2.262712001800537, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 3.3316564064774665, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.0734337568283081, "logits/rejected": 0.030337577685713768, "logps/chosen": -2.405501127243042, "logps/rejected": -3.0734591484069824, "loss": 0.6951, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.405501127243042, "rewards/margins": 0.6679580211639404, "rewards/rejected": -3.0734591484069824, "sft_loss": 2.271773338317871, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 3.1025024967632397, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.11891385167837143, "logits/rejected": -0.03588943928480148, "logps/chosen": -2.1229982376098633, "logps/rejected": -3.171241283416748, "loss": 0.674, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1229982376098633, "rewards/margins": 1.0482432842254639, "rewards/rejected": -3.171241283416748, "sft_loss": 2.0762438774108887, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 2.418702025653042, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.12587347626686096, "logits/rejected": 0.03255102410912514, "logps/chosen": -2.1877565383911133, "logps/rejected": -3.266651153564453, "loss": 0.7039, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1877565383911133, "rewards/margins": 1.0788941383361816, "rewards/rejected": -3.266651153564453, "sft_loss": 2.1533803939819336, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 4.455130732749031, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.11404607445001602, "logits/rejected": 0.067964106798172, "logps/chosen": -2.112873077392578, "logps/rejected": -3.0061681270599365, "loss": 0.6883, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.112873077392578, "rewards/margins": 0.8932951092720032, "rewards/rejected": -3.0061681270599365, "sft_loss": 2.1034371852874756, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 3.2683981759932887, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.05666341260075569, "logits/rejected": 0.016394445672631264, "logps/chosen": -2.1205034255981445, "logps/rejected": -3.087794542312622, "loss": 0.6923, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1205034255981445, "rewards/margins": 0.9672911763191223, "rewards/rejected": -3.087794542312622, "sft_loss": 2.069809913635254, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 3.5313129911679066, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.21050508320331573, "logits/rejected": -0.06813536584377289, "logps/chosen": -2.082973003387451, "logps/rejected": -3.1073174476623535, "loss": 0.6829, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.082973003387451, "rewards/margins": 1.0243443250656128, "rewards/rejected": -3.1073174476623535, "sft_loss": 2.097914934158325, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 3.284569886518826, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.054897792637348175, "logits/rejected": 0.06013824790716171, "logps/chosen": -2.1149497032165527, "logps/rejected": -2.877192974090576, "loss": 0.6872, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1149497032165527, "rewards/margins": 0.7622435688972473, "rewards/rejected": -2.877192974090576, "sft_loss": 2.0054376125335693, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 3.659317295834073, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.15437868237495422, "logits/rejected": 0.0225119199603796, "logps/chosen": -2.2297348976135254, "logps/rejected": -2.859705924987793, "loss": 0.6925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2297348976135254, "rewards/margins": 0.6299708485603333, "rewards/rejected": -2.859705924987793, "sft_loss": 1.9377400875091553, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 19.207448858430492, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.1754283607006073, "logits/rejected": 0.010037758387625217, "logps/chosen": -2.1164345741271973, "logps/rejected": -3.0492329597473145, "loss": 0.7039, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1164345741271973, "rewards/margins": 0.9327982068061829, "rewards/rejected": -3.0492329597473145, "sft_loss": 2.0898313522338867, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 5.091046328651422, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.1769765168428421, "logits/rejected": 0.059815000742673874, "logps/chosen": -2.1104233264923096, "logps/rejected": -3.0075087547302246, "loss": 0.6808, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1104233264923096, "rewards/margins": 0.8970853686332703, "rewards/rejected": -3.0075087547302246, "sft_loss": 2.111417293548584, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 3.8557371204085675, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.2643243670463562, "logits/rejected": 0.016842365264892578, "logps/chosen": -2.2394261360168457, "logps/rejected": -3.551403760910034, "loss": 0.687, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.2394261360168457, "rewards/margins": 1.3119779825210571, "rewards/rejected": -3.551403760910034, "sft_loss": 2.1109273433685303, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 3.2034004384790236, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.04526636004447937, "logits/rejected": -0.016321176663041115, "logps/chosen": -2.2352993488311768, "logps/rejected": -3.2442870140075684, "loss": 0.6926, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2352993488311768, "rewards/margins": 1.0089879035949707, "rewards/rejected": -3.2442870140075684, "sft_loss": 2.1991376876831055, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 2.9629505412965678, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.10669227689504623, "logits/rejected": 0.10436113178730011, "logps/chosen": -2.392090082168579, "logps/rejected": -3.246337890625, "loss": 0.7042, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.392090082168579, "rewards/margins": 0.8542478680610657, "rewards/rejected": -3.246337890625, "sft_loss": 2.3559975624084473, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 2.4741411627768453, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.14265497028827667, "logits/rejected": 0.1273798644542694, "logps/chosen": -2.1982200145721436, "logps/rejected": -3.619431257247925, "loss": 0.6771, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1982200145721436, "rewards/margins": 1.4212112426757812, "rewards/rejected": -3.619431257247925, "sft_loss": 2.028993844985962, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 3.10751717393839, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.0072515071369707584, "logits/rejected": 0.017031148076057434, "logps/chosen": -2.2495055198669434, "logps/rejected": -3.25390625, "loss": 0.6831, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2495055198669434, "rewards/margins": 1.0044009685516357, "rewards/rejected": -3.25390625, "sft_loss": 2.162059783935547, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 6.7866290750320655, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.1543581187725067, "logits/rejected": -0.029458314180374146, "logps/chosen": -2.0542521476745605, "logps/rejected": -2.9937496185302734, "loss": 0.6866, "rewards/accuracies": 0.75, "rewards/chosen": -2.0542521476745605, "rewards/margins": 0.939497172832489, "rewards/rejected": -2.9937496185302734, "sft_loss": 2.108109951019287, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 4.616865662276508, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.0960715264081955, "logits/rejected": 0.04589344188570976, "logps/chosen": -2.112891912460327, "logps/rejected": -2.9276225566864014, "loss": 0.6929, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.112891912460327, "rewards/margins": 0.8147305250167847, "rewards/rejected": -2.9276225566864014, "sft_loss": 1.950931191444397, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 4.386497894315425, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.06804818660020828, "logits/rejected": 0.04135305806994438, "logps/chosen": -2.1073989868164062, "logps/rejected": -2.949591875076294, "loss": 0.6957, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1073989868164062, "rewards/margins": 0.8421930074691772, "rewards/rejected": -2.949591875076294, "sft_loss": 2.000044822692871, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 5.891030771344286, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.08829103410243988, "logits/rejected": 0.1070912703871727, "logps/chosen": -1.9999017715454102, "logps/rejected": -3.1407268047332764, "loss": 0.6803, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9999017715454102, "rewards/margins": 1.1408249139785767, "rewards/rejected": -3.1407268047332764, "sft_loss": 1.9781440496444702, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 3.2961844654332677, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.09192325174808502, "logits/rejected": 0.13550327718257904, "logps/chosen": -1.9177381992340088, "logps/rejected": -2.9311869144439697, "loss": 0.6763, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9177381992340088, "rewards/margins": 1.01344895362854, "rewards/rejected": -2.9311869144439697, "sft_loss": 1.8968242406845093, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 4.013646920419382, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.12752971053123474, "logits/rejected": -0.021973803639411926, "logps/chosen": -2.0632452964782715, "logps/rejected": -3.4493393898010254, "loss": 0.6846, "rewards/accuracies": 0.75, "rewards/chosen": -2.0632452964782715, "rewards/margins": 1.3860942125320435, "rewards/rejected": -3.4493393898010254, "sft_loss": 2.0599193572998047, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 3.427883733138985, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.08913668245077133, "logits/rejected": 0.06239492446184158, "logps/chosen": -2.0854339599609375, "logps/rejected": -2.9506967067718506, "loss": 0.6901, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0854339599609375, "rewards/margins": 0.8652628064155579, "rewards/rejected": -2.9506967067718506, "sft_loss": 1.9990270137786865, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 3.744537290651893, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.05436503887176514, "logits/rejected": 0.09435670077800751, "logps/chosen": -2.314728260040283, "logps/rejected": -3.2752997875213623, "loss": 0.687, "rewards/accuracies": 0.71875, "rewards/chosen": -2.314728260040283, "rewards/margins": 0.9605711698532104, "rewards/rejected": -3.2752997875213623, "sft_loss": 2.152890205383301, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 3.8042713205231786, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.04366375878453255, "logits/rejected": 0.09801409393548965, "logps/chosen": -2.060361862182617, "logps/rejected": -3.076183319091797, "loss": 0.6847, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.060361862182617, "rewards/margins": 1.0158214569091797, "rewards/rejected": -3.076183319091797, "sft_loss": 2.054492473602295, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 3.5111005283603847, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.13682511448860168, "logits/rejected": 0.10204527527093887, "logps/chosen": -2.1179380416870117, "logps/rejected": -3.244584560394287, "loss": 0.6811, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1179380416870117, "rewards/margins": 1.1266463994979858, "rewards/rejected": -3.244584560394287, "sft_loss": 2.091881513595581, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 4.3374137874837295, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.006934487726539373, "logits/rejected": 0.15117433667182922, "logps/chosen": -2.162091016769409, "logps/rejected": -3.2057461738586426, "loss": 0.6782, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.162091016769409, "rewards/margins": 1.0436547994613647, "rewards/rejected": -3.2057461738586426, "sft_loss": 2.1595187187194824, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 4.683417026061879, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.1065053790807724, "logits/rejected": -0.020374376326799393, "logps/chosen": -2.004215955734253, "logps/rejected": -2.893080234527588, "loss": 0.6834, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.004215955734253, "rewards/margins": 0.8888643383979797, "rewards/rejected": -2.893080234527588, "sft_loss": 2.09393572807312, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 3.037548868110086, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.02161308005452156, "logits/rejected": 0.0724690780043602, "logps/chosen": -2.0727245807647705, "logps/rejected": -3.1097216606140137, "loss": 0.6877, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0727245807647705, "rewards/margins": 1.0369970798492432, "rewards/rejected": -3.1097216606140137, "sft_loss": 2.0997555255889893, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 3.8869330057851443, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.022613149136304855, "logits/rejected": 0.11061491817235947, "logps/chosen": -2.1283986568450928, "logps/rejected": -2.840240955352783, "loss": 0.7006, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1283986568450928, "rewards/margins": 0.7118419408798218, "rewards/rejected": -2.840240955352783, "sft_loss": 2.047914505004883, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 2.2609095311694754, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.0316866859793663, "logits/rejected": 0.14357516169548035, "logps/chosen": -2.233394145965576, "logps/rejected": -3.324572801589966, "loss": 0.6881, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.233394145965576, "rewards/margins": 1.0911785364151, "rewards/rejected": -3.324572801589966, "sft_loss": 2.155008316040039, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 3.6441905285982346, "learning_rate": 5.906047197571541e-07, "logits/chosen": 0.05673060566186905, "logits/rejected": 0.051228396594524384, "logps/chosen": -2.1823010444641113, "logps/rejected": -3.0657646656036377, "loss": 0.6946, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1823010444641113, "rewards/margins": 0.8834635019302368, "rewards/rejected": -3.0657646656036377, "sft_loss": 2.2928757667541504, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 3.314604897832424, "learning_rate": 5.890726635828919e-07, "logits/chosen": 0.14910665154457092, "logits/rejected": 0.16638293862342834, "logps/chosen": -2.0749332904815674, "logps/rejected": -2.994952917098999, "loss": 0.6884, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0749332904815674, "rewards/margins": 0.9200199246406555, "rewards/rejected": -2.994952917098999, "sft_loss": 1.9762952327728271, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 11.038593057821188, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.007317324168980122, "logits/rejected": 0.09911766648292542, "logps/chosen": -2.163130044937134, "logps/rejected": -3.0095102787017822, "loss": 0.6881, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.163130044937134, "rewards/margins": 0.8463799357414246, "rewards/rejected": -3.0095102787017822, "sft_loss": 2.1841933727264404, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.4063626527786255, "eval_logits/rejected": 0.5238215327262878, "eval_logps/chosen": -2.230652332305908, "eval_logps/rejected": -3.1887776851654053, "eval_loss": 0.6908450126647949, "eval_rewards/accuracies": 0.6862017512321472, "eval_rewards/chosen": -2.230652332305908, "eval_rewards/margins": 0.9581254720687866, "eval_rewards/rejected": -3.1887776851654053, "eval_runtime": 46.198, "eval_samples_per_second": 29.114, "eval_sft_loss": 2.1384212970733643, "eval_steps_per_second": 7.295, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 5.249129536525488, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.16110579669475555, "logits/rejected": 0.025376638397574425, "logps/chosen": -2.0280165672302246, "logps/rejected": -3.126965045928955, "loss": 0.6871, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0280165672302246, "rewards/margins": 1.098948359489441, "rewards/rejected": -3.126965045928955, "sft_loss": 1.9881798028945923, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 6.613956597776934, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.003742316272109747, "logits/rejected": 0.08696229755878448, "logps/chosen": -2.16291880607605, "logps/rejected": -3.104762554168701, "loss": 0.6848, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.16291880607605, "rewards/margins": 0.9418438076972961, "rewards/rejected": -3.104762554168701, "sft_loss": 2.196422815322876, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 3.279375142433588, "learning_rate": 5.829359458171714e-07, "logits/chosen": 0.0026434913743287325, "logits/rejected": 0.14492645859718323, "logps/chosen": -2.1458747386932373, "logps/rejected": -3.288313388824463, "loss": 0.6775, "rewards/accuracies": 0.75, "rewards/chosen": -2.1458747386932373, "rewards/margins": 1.142438530921936, "rewards/rejected": -3.288313388824463, "sft_loss": 2.1363162994384766, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 4.937155891046909, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.04307904094457626, "logits/rejected": 0.17259356379508972, "logps/chosen": -2.2710137367248535, "logps/rejected": -3.0655980110168457, "loss": 0.6946, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2710137367248535, "rewards/margins": 0.7945840358734131, "rewards/rejected": -3.0655980110168457, "sft_loss": 2.290271520614624, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 4.550612774789899, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.13909904658794403, "logits/rejected": 0.06880120187997818, "logps/chosen": -2.2514679431915283, "logps/rejected": -3.4120922088623047, "loss": 0.6979, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2514679431915283, "rewards/margins": 1.1606245040893555, "rewards/rejected": -3.4120922088623047, "sft_loss": 2.19282865524292, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 6.180464443390251, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.031425271183252335, "logits/rejected": 0.10157237946987152, "logps/chosen": -2.109801769256592, "logps/rejected": -2.9821524620056152, "loss": 0.6971, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.109801769256592, "rewards/margins": 0.872350811958313, "rewards/rejected": -2.9821524620056152, "sft_loss": 2.1158018112182617, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 5.626113358092249, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.07793781906366348, "logits/rejected": 0.08719579130411148, "logps/chosen": -2.122753381729126, "logps/rejected": -3.166059970855713, "loss": 0.683, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.122753381729126, "rewards/margins": 1.0433070659637451, "rewards/rejected": -3.166059970855713, "sft_loss": 2.135249614715576, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 5.6941103393042685, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.05138932541012764, "logits/rejected": 0.03822798654437065, "logps/chosen": -2.108766794204712, "logps/rejected": -2.8018271923065186, "loss": 0.6886, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.108766794204712, "rewards/margins": 0.693060576915741, "rewards/rejected": -2.8018271923065186, "sft_loss": 2.0910696983337402, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 4.607030383324202, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.017714444547891617, "logits/rejected": 0.08581845462322235, "logps/chosen": -2.2201247215270996, "logps/rejected": -2.962507724761963, "loss": 0.7, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.2201247215270996, "rewards/margins": 0.7423833608627319, "rewards/rejected": -2.962507724761963, "sft_loss": 2.1716628074645996, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 4.468149326200883, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.09650838375091553, "logits/rejected": 0.023339275270700455, "logps/chosen": -2.1109557151794434, "logps/rejected": -3.225125551223755, "loss": 0.6867, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1109557151794434, "rewards/margins": 1.1141700744628906, "rewards/rejected": -3.225125551223755, "sft_loss": 2.039635419845581, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 2.453265911287917, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.14216859638690948, "logits/rejected": 0.0720430314540863, "logps/chosen": -2.2796530723571777, "logps/rejected": -3.128239631652832, "loss": 0.7017, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.2796530723571777, "rewards/margins": 0.8485862612724304, "rewards/rejected": -3.128239631652832, "sft_loss": 2.2780544757843018, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 2.607407849995168, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.1372571736574173, "logits/rejected": 0.084715835750103, "logps/chosen": -2.3706908226013184, "logps/rejected": -3.081747531890869, "loss": 0.7004, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3706908226013184, "rewards/margins": 0.7110565304756165, "rewards/rejected": -3.081747531890869, "sft_loss": 2.333536148071289, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 8.064558298484231, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.1745007485151291, "logits/rejected": -0.011042768135666847, "logps/chosen": -2.287440061569214, "logps/rejected": -3.0668773651123047, "loss": 0.7036, "rewards/accuracies": 0.71875, "rewards/chosen": -2.287440061569214, "rewards/margins": 0.7794371247291565, "rewards/rejected": -3.0668773651123047, "sft_loss": 2.3433339595794678, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 4.2037700192625325, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.11125187575817108, "logits/rejected": 0.031043073162436485, "logps/chosen": -2.205533981323242, "logps/rejected": -2.9975638389587402, "loss": 0.6995, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.205533981323242, "rewards/margins": 0.7920295000076294, "rewards/rejected": -2.9975638389587402, "sft_loss": 2.1208176612854004, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 2.7759824802719097, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.1375740021467209, "logits/rejected": -0.0133648831397295, "logps/chosen": -2.174509048461914, "logps/rejected": -3.10461163520813, "loss": 0.6828, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.174509048461914, "rewards/margins": 0.9301024675369263, "rewards/rejected": -3.10461163520813, "sft_loss": 2.068380117416382, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 4.088577632674356, "learning_rate": 5.629076484188952e-07, "logits/chosen": 0.006173181347548962, "logits/rejected": 0.1368529349565506, "logps/chosen": -2.194118022918701, "logps/rejected": -3.136410713195801, "loss": 0.6821, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.194118022918701, "rewards/margins": 0.9422923922538757, "rewards/rejected": -3.136410713195801, "sft_loss": 2.1732704639434814, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 4.426292415082979, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.09883008897304535, "logits/rejected": 0.051213592290878296, "logps/chosen": -2.1624035835266113, "logps/rejected": -3.1205196380615234, "loss": 0.6886, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1624035835266113, "rewards/margins": 0.9581155776977539, "rewards/rejected": -3.1205196380615234, "sft_loss": 2.1758995056152344, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 2.4395152212735476, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.16651707887649536, "logits/rejected": 0.04488401114940643, "logps/chosen": -2.1542766094207764, "logps/rejected": -3.1055610179901123, "loss": 0.6897, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1542766094207764, "rewards/margins": 0.9512848854064941, "rewards/rejected": -3.1055610179901123, "sft_loss": 2.0944085121154785, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 3.603265268828746, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.17208468914031982, "logits/rejected": 0.0010690949857234955, "logps/chosen": -2.142911672592163, "logps/rejected": -3.1632254123687744, "loss": 0.681, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.142911672592163, "rewards/margins": 1.0203137397766113, "rewards/rejected": -3.1632254123687744, "sft_loss": 2.0967581272125244, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 6.547466770834866, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.07524513453245163, "logits/rejected": 0.041724175214767456, "logps/chosen": -1.978674292564392, "logps/rejected": -3.111820936203003, "loss": 0.684, "rewards/accuracies": 0.71875, "rewards/chosen": -1.978674292564392, "rewards/margins": 1.1331464052200317, "rewards/rejected": -3.111820936203003, "sft_loss": 1.9212112426757812, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 3.2321118214909372, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.03188861533999443, "logits/rejected": -0.00413927435874939, "logps/chosen": -2.118699312210083, "logps/rejected": -2.957021951675415, "loss": 0.6808, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.118699312210083, "rewards/margins": 0.8383227586746216, "rewards/rejected": -2.957021951675415, "sft_loss": 2.168797254562378, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 2.8278055693680786, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.14892300963401794, "logits/rejected": -0.030514398589730263, "logps/chosen": -2.0119025707244873, "logps/rejected": -2.8636975288391113, "loss": 0.6799, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0119025707244873, "rewards/margins": 0.8517950177192688, "rewards/rejected": -2.8636975288391113, "sft_loss": 2.0259180068969727, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 2.988004895135199, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.08654950559139252, "logits/rejected": 0.1146860346198082, "logps/chosen": -2.184629201889038, "logps/rejected": -3.116959810256958, "loss": 0.6849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.184629201889038, "rewards/margins": 0.9323304891586304, "rewards/rejected": -3.116959810256958, "sft_loss": 2.261133909225464, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 1.9290604318268172, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.09083691239356995, "logits/rejected": 0.026575163006782532, "logps/chosen": -2.237321376800537, "logps/rejected": -3.0823521614074707, "loss": 0.6958, "rewards/accuracies": 0.6875, "rewards/chosen": -2.237321376800537, "rewards/margins": 0.8450304865837097, "rewards/rejected": -3.0823521614074707, "sft_loss": 2.248321294784546, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 3.790374916646817, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.07825301587581635, "logits/rejected": 0.017550267279148102, "logps/chosen": -2.110146999359131, "logps/rejected": -2.972914218902588, "loss": 0.676, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.110146999359131, "rewards/margins": 0.8627673387527466, "rewards/rejected": -2.972914218902588, "sft_loss": 2.1116256713867188, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 2.9220937274086882, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.14792025089263916, "logits/rejected": 0.03304235264658928, "logps/chosen": -2.1069369316101074, "logps/rejected": -3.6364331245422363, "loss": 0.6804, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1069369316101074, "rewards/margins": 1.529496192932129, "rewards/rejected": -3.6364331245422363, "sft_loss": 2.1745944023132324, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 3.72442445552988, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.15431484580039978, "logits/rejected": -0.0035203725565224886, "logps/chosen": -2.216923236846924, "logps/rejected": -3.1889290809631348, "loss": 0.6908, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.216923236846924, "rewards/margins": 0.9720057249069214, "rewards/rejected": -3.1889290809631348, "sft_loss": 2.2095744609832764, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 5.576256203094089, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.18151506781578064, "logits/rejected": -0.02000053972005844, "logps/chosen": -2.1487581729888916, "logps/rejected": -3.3196964263916016, "loss": 0.6884, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1487581729888916, "rewards/margins": 1.17093825340271, "rewards/rejected": -3.3196964263916016, "sft_loss": 2.141321897506714, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 7.924339527105727, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.1652074158191681, "logits/rejected": -0.029881944879889488, "logps/chosen": -2.250629425048828, "logps/rejected": -3.0189013481140137, "loss": 0.696, "rewards/accuracies": 0.6875, "rewards/chosen": -2.250629425048828, "rewards/margins": 0.7682719230651855, "rewards/rejected": -3.0189013481140137, "sft_loss": 2.1879518032073975, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 3.9403049633715552, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.14509066939353943, "logits/rejected": 0.10351689159870148, "logps/chosen": -2.1401453018188477, "logps/rejected": -3.30322003364563, "loss": 0.6795, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1401453018188477, "rewards/margins": 1.1630749702453613, "rewards/rejected": -3.30322003364563, "sft_loss": 2.1067137718200684, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 4.295090476521195, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.06472639739513397, "logits/rejected": 0.06039848178625107, "logps/chosen": -2.192394971847534, "logps/rejected": -3.0823774337768555, "loss": 0.6897, "rewards/accuracies": 0.6875, "rewards/chosen": -2.192394971847534, "rewards/margins": 0.8899825811386108, "rewards/rejected": -3.0823774337768555, "sft_loss": 2.183534860610962, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 4.286368424165502, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.18022814393043518, "logits/rejected": -0.02799808979034424, "logps/chosen": -2.1506359577178955, "logps/rejected": -3.170880079269409, "loss": 0.6758, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1506359577178955, "rewards/margins": 1.0202442407608032, "rewards/rejected": -3.170880079269409, "sft_loss": 2.0969057083129883, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 4.374127775780103, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.19103659689426422, "logits/rejected": -0.04271901398897171, "logps/chosen": -2.113696575164795, "logps/rejected": -3.0879569053649902, "loss": 0.6918, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.113696575164795, "rewards/margins": 0.9742606282234192, "rewards/rejected": -3.0879569053649902, "sft_loss": 2.064755916595459, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 5.628471622661212, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.15977461636066437, "logits/rejected": 0.03489946201443672, "logps/chosen": -2.130821943283081, "logps/rejected": -3.102430820465088, "loss": 0.6887, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.130821943283081, "rewards/margins": 0.9716089367866516, "rewards/rejected": -3.102430820465088, "sft_loss": 2.1123976707458496, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 3.6802442032569367, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.10645937919616699, "logits/rejected": 0.058650027960538864, "logps/chosen": -2.086397409439087, "logps/rejected": -3.0651001930236816, "loss": 0.6836, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.086397409439087, "rewards/margins": 0.9787028431892395, "rewards/rejected": -3.0651001930236816, "sft_loss": 2.1481781005859375, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 4.276156693995655, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.15901954472064972, "logits/rejected": 0.015384090133011341, "logps/chosen": -2.1815383434295654, "logps/rejected": -2.9221997261047363, "loss": 0.6974, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1815383434295654, "rewards/margins": 0.7406615018844604, "rewards/rejected": -2.9221997261047363, "sft_loss": 2.1552226543426514, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 3.0153592254368684, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.16179385781288147, "logits/rejected": 0.04337051510810852, "logps/chosen": -2.2223448753356934, "logps/rejected": -3.1061816215515137, "loss": 0.6933, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2223448753356934, "rewards/margins": 0.8838367462158203, "rewards/rejected": -3.1061816215515137, "sft_loss": 2.18412709236145, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 4.186038813179471, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.1903652846813202, "logits/rejected": 0.03294439986348152, "logps/chosen": -2.03208589553833, "logps/rejected": -2.955350399017334, "loss": 0.6845, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.03208589553833, "rewards/margins": 0.9232648611068726, "rewards/rejected": -2.955350399017334, "sft_loss": 2.0463991165161133, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 3.66438788945719, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.07795457541942596, "logits/rejected": 0.044455841183662415, "logps/chosen": -2.093140125274658, "logps/rejected": -3.4263968467712402, "loss": 0.6704, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.093140125274658, "rewards/margins": 1.3332566022872925, "rewards/rejected": -3.4263968467712402, "sft_loss": 2.0226621627807617, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 3.151988106221695, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.15561290085315704, "logits/rejected": 0.033493004739284515, "logps/chosen": -2.302936553955078, "logps/rejected": -3.164163589477539, "loss": 0.7015, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.302936553955078, "rewards/margins": 0.8612270355224609, "rewards/rejected": -3.164163589477539, "sft_loss": 2.196629762649536, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 2.8628032654728264, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.2185770720243454, "logits/rejected": -0.029982399195432663, "logps/chosen": -2.2461745738983154, "logps/rejected": -2.9426379203796387, "loss": 0.6946, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2461745738983154, "rewards/margins": 0.6964629888534546, "rewards/rejected": -2.9426379203796387, "sft_loss": 2.1740994453430176, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 3.1255261422064433, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.20517203211784363, "logits/rejected": -0.06608210504055023, "logps/chosen": -2.1317734718322754, "logps/rejected": -2.9222843647003174, "loss": 0.6975, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1317734718322754, "rewards/margins": 0.7905106544494629, "rewards/rejected": -2.9222843647003174, "sft_loss": 2.0713953971862793, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 5.768165969138451, "learning_rate": 5.210187404905735e-07, "logits/chosen": 0.0029327759984880686, "logits/rejected": 0.11774277687072754, "logps/chosen": -2.297055244445801, "logps/rejected": -3.0728812217712402, "loss": 0.685, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.297055244445801, "rewards/margins": 0.7758262157440186, "rewards/rejected": -3.0728812217712402, "sft_loss": 2.2605652809143066, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 2.5702006766937258, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.15413738787174225, "logits/rejected": 0.0005656067514792085, "logps/chosen": -2.1795623302459717, "logps/rejected": -2.9262468814849854, "loss": 0.6957, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1795623302459717, "rewards/margins": 0.7466843128204346, "rewards/rejected": -2.9262468814849854, "sft_loss": 2.213491439819336, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 5.271967894864706, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.1758485734462738, "logits/rejected": 0.023218151181936264, "logps/chosen": -2.1578831672668457, "logps/rejected": -2.9815473556518555, "loss": 0.7062, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1578831672668457, "rewards/margins": 0.8236640095710754, "rewards/rejected": -2.9815473556518555, "sft_loss": 2.1854913234710693, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 2.075048894960246, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.11611847579479218, "logits/rejected": 0.05111664533615112, "logps/chosen": -2.231825351715088, "logps/rejected": -3.0683205127716064, "loss": 0.6903, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.231825351715088, "rewards/margins": 0.8364952206611633, "rewards/rejected": -3.0683205127716064, "sft_loss": 2.1659626960754395, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 3.5925567745372113, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.02528810128569603, "logits/rejected": 0.12856647372245789, "logps/chosen": -2.1949684619903564, "logps/rejected": -2.9283640384674072, "loss": 0.6903, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1949684619903564, "rewards/margins": 0.7333954572677612, "rewards/rejected": -2.9283640384674072, "sft_loss": 2.0562968254089355, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 4.432527418254685, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.1314159482717514, "logits/rejected": -0.04454684257507324, "logps/chosen": -1.9996497631072998, "logps/rejected": -3.1647861003875732, "loss": 0.6715, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9996497631072998, "rewards/margins": 1.1651363372802734, "rewards/rejected": -3.1647861003875732, "sft_loss": 1.9506438970565796, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 3.3377921145216276, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.09291049093008041, "logits/rejected": 0.02424928918480873, "logps/chosen": -2.110121250152588, "logps/rejected": -2.952497720718384, "loss": 0.6783, "rewards/accuracies": 0.65625, "rewards/chosen": -2.110121250152588, "rewards/margins": 0.8423765897750854, "rewards/rejected": -2.952497720718384, "sft_loss": 2.107722043991089, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 3.1045058013576843, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.10040037333965302, "logits/rejected": 0.06632226705551147, "logps/chosen": -2.252039909362793, "logps/rejected": -3.2612273693084717, "loss": 0.6813, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.252039909362793, "rewards/margins": 1.0091878175735474, "rewards/rejected": -3.2612273693084717, "sft_loss": 2.16507625579834, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 3.153065379729183, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.16843988001346588, "logits/rejected": 0.000528356060385704, "logps/chosen": -2.3047170639038086, "logps/rejected": -3.4012997150421143, "loss": 0.6845, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3047170639038086, "rewards/margins": 1.0965824127197266, "rewards/rejected": -3.4012997150421143, "sft_loss": 2.2305407524108887, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 3.9368702121084267, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.147186279296875, "logits/rejected": 0.05198676139116287, "logps/chosen": -2.1867175102233887, "logps/rejected": -2.9347281455993652, "loss": 0.703, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1867175102233887, "rewards/margins": 0.7480108141899109, "rewards/rejected": -2.9347281455993652, "sft_loss": 2.173590660095215, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 3.785919690362723, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.07448790222406387, "logits/rejected": 0.0009125359356403351, "logps/chosen": -2.205174684524536, "logps/rejected": -3.205883026123047, "loss": 0.6793, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.205174684524536, "rewards/margins": 1.0007085800170898, "rewards/rejected": -3.205883026123047, "sft_loss": 2.1745455265045166, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 3.4168904526951103, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.1712881475687027, "logits/rejected": -0.04074662923812866, "logps/chosen": -2.1688315868377686, "logps/rejected": -3.120831251144409, "loss": 0.6868, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.1688315868377686, "rewards/margins": 0.9519997835159302, "rewards/rejected": -3.120831251144409, "sft_loss": 2.1889805793762207, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 6.0006636293693365, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.18512877821922302, "logits/rejected": -0.08922187983989716, "logps/chosen": -2.178508758544922, "logps/rejected": -3.1332712173461914, "loss": 0.6942, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.178508758544922, "rewards/margins": 0.9547624588012695, "rewards/rejected": -3.1332712173461914, "sft_loss": 2.1173133850097656, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 4.0880950992063525, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.22838346660137177, "logits/rejected": -0.04794811084866524, "logps/chosen": -1.9763202667236328, "logps/rejected": -3.038149118423462, "loss": 0.6844, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9763202667236328, "rewards/margins": 1.06182861328125, "rewards/rejected": -3.038149118423462, "sft_loss": 2.046569347381592, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 5.322380708277623, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.03665539249777794, "logits/rejected": -0.000216527289012447, "logps/chosen": -2.0814623832702637, "logps/rejected": -3.04225492477417, "loss": 0.6778, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0814623832702637, "rewards/margins": 0.9607928395271301, "rewards/rejected": -3.04225492477417, "sft_loss": 2.105666399002075, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 5.2965558466896585, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.06266290694475174, "logits/rejected": 0.02715703286230564, "logps/chosen": -2.153308391571045, "logps/rejected": -2.997816324234009, "loss": 0.7017, "rewards/accuracies": 0.625, "rewards/chosen": -2.153308391571045, "rewards/margins": 0.8445073366165161, "rewards/rejected": -2.997816324234009, "sft_loss": 2.1758735179901123, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 5.050427817653276, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.11321164667606354, "logits/rejected": 0.04749942943453789, "logps/chosen": -2.0724129676818848, "logps/rejected": -3.036367416381836, "loss": 0.6885, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0724129676818848, "rewards/margins": 0.9639546275138855, "rewards/rejected": -3.036367416381836, "sft_loss": 2.055748701095581, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 2.8593400384428778, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.21733930706977844, "logits/rejected": -0.06262455135583878, "logps/chosen": -2.0864243507385254, "logps/rejected": -2.8974545001983643, "loss": 0.7004, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0864243507385254, "rewards/margins": 0.8110300302505493, "rewards/rejected": -2.8974545001983643, "sft_loss": 2.1405720710754395, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 4.075564805059594, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.162893608212471, "logits/rejected": 0.03598882630467415, "logps/chosen": -2.1391289234161377, "logps/rejected": -2.9104714393615723, "loss": 0.6947, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1391289234161377, "rewards/margins": 0.7713426351547241, "rewards/rejected": -2.9104714393615723, "sft_loss": 2.18563175201416, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 2.585986685688883, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.13090942800045013, "logits/rejected": 0.01913970336318016, "logps/chosen": -2.203361749649048, "logps/rejected": -3.140873432159424, "loss": 0.6839, "rewards/accuracies": 0.71875, "rewards/chosen": -2.203361749649048, "rewards/margins": 0.9375116229057312, "rewards/rejected": -3.140873432159424, "sft_loss": 2.044311285018921, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 2.7794459362126056, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.05823718383908272, "logits/rejected": 0.07972903549671173, "logps/chosen": -2.183250904083252, "logps/rejected": -3.225594997406006, "loss": 0.674, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.183250904083252, "rewards/margins": 1.0423442125320435, "rewards/rejected": -3.225594997406006, "sft_loss": 2.2276806831359863, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 2.3950145551511017, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.14604242146015167, "logits/rejected": 0.03321395069360733, "logps/chosen": -2.200139284133911, "logps/rejected": -3.2287135124206543, "loss": 0.6935, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.200139284133911, "rewards/margins": 1.0285742282867432, "rewards/rejected": -3.2287135124206543, "sft_loss": 2.1648683547973633, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 2.6956670959006748, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.07609611749649048, "logits/rejected": 0.061819422990083694, "logps/chosen": -2.2954039573669434, "logps/rejected": -3.36649751663208, "loss": 0.6823, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2954039573669434, "rewards/margins": 1.0710933208465576, "rewards/rejected": -3.36649751663208, "sft_loss": 2.121478319168091, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 3.976046023660791, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.04199652001261711, "logits/rejected": 0.13352611660957336, "logps/chosen": -2.2372660636901855, "logps/rejected": -3.020014524459839, "loss": 0.6982, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.2372660636901855, "rewards/margins": 0.7827486991882324, "rewards/rejected": -3.020014524459839, "sft_loss": 2.188228130340576, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 3.9131279165094113, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.0072118318639695644, "logits/rejected": 0.11471160501241684, "logps/chosen": -2.3043265342712402, "logps/rejected": -3.1066927909851074, "loss": 0.6893, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3043265342712402, "rewards/margins": 0.8023663759231567, "rewards/rejected": -3.1066927909851074, "sft_loss": 2.17824387550354, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 2.6934657257650065, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.23482973873615265, "logits/rejected": -0.0825137048959732, "logps/chosen": -1.9831600189208984, "logps/rejected": -2.984605312347412, "loss": 0.6748, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9831600189208984, "rewards/margins": 1.0014454126358032, "rewards/rejected": -2.984605312347412, "sft_loss": 1.999122977256775, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 9.744426748480588, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.03684605285525322, "logits/rejected": 0.11848436295986176, "logps/chosen": -2.0876357555389404, "logps/rejected": -3.0805702209472656, "loss": 0.6917, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0876357555389404, "rewards/margins": 0.9929342269897461, "rewards/rejected": -3.0805702209472656, "sft_loss": 1.9902302026748657, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 2.702808170076698, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.16841986775398254, "logits/rejected": -0.032748930156230927, "logps/chosen": -2.0487492084503174, "logps/rejected": -2.90828275680542, "loss": 0.6772, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0487492084503174, "rewards/margins": 0.8595331311225891, "rewards/rejected": -2.90828275680542, "sft_loss": 1.9349241256713867, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 5.340687027874895, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.1761549711227417, "logits/rejected": -0.05665317177772522, "logps/chosen": -1.9597280025482178, "logps/rejected": -2.958338975906372, "loss": 0.6794, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9597280025482178, "rewards/margins": 0.9986109733581543, "rewards/rejected": -2.958338975906372, "sft_loss": 1.918230652809143, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 5.223228830052162, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.06425870954990387, "logits/rejected": -0.08404186367988586, "logps/chosen": -2.114051342010498, "logps/rejected": -3.1076130867004395, "loss": 0.6814, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.114051342010498, "rewards/margins": 0.9935620427131653, "rewards/rejected": -3.1076130867004395, "sft_loss": 2.173743724822998, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 2.937341760162775, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.21452081203460693, "logits/rejected": -0.0759705901145935, "logps/chosen": -2.1878199577331543, "logps/rejected": -2.7924647331237793, "loss": 0.7066, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1878199577331543, "rewards/margins": 0.6046445965766907, "rewards/rejected": -2.7924647331237793, "sft_loss": 2.2030882835388184, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 2.172738048264022, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.1627374291419983, "logits/rejected": -0.03917828947305679, "logps/chosen": -2.130401611328125, "logps/rejected": -3.2156753540039062, "loss": 0.69, "rewards/accuracies": 0.71875, "rewards/chosen": -2.130401611328125, "rewards/margins": 1.0852737426757812, "rewards/rejected": -3.2156753540039062, "sft_loss": 2.1349804401397705, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 4.130207287016142, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.1787964105606079, "logits/rejected": -0.02974070981144905, "logps/chosen": -2.211841583251953, "logps/rejected": -2.8528695106506348, "loss": 0.7068, "rewards/accuracies": 0.625, "rewards/chosen": -2.211841583251953, "rewards/margins": 0.6410278677940369, "rewards/rejected": -2.8528695106506348, "sft_loss": 2.191458225250244, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 2.1139808890437104, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.015827927738428116, "logits/rejected": 0.04487539082765579, "logps/chosen": -2.256462335586548, "logps/rejected": -3.1642420291900635, "loss": 0.6976, "rewards/accuracies": 0.6875, "rewards/chosen": -2.256462335586548, "rewards/margins": 0.9077796936035156, "rewards/rejected": -3.1642420291900635, "sft_loss": 2.2019569873809814, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 4.308994349875126, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.09209471940994263, "logits/rejected": -0.031523577868938446, "logps/chosen": -2.2957634925842285, "logps/rejected": -2.847827672958374, "loss": 0.6975, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2957634925842285, "rewards/margins": 0.5520642995834351, "rewards/rejected": -2.847827672958374, "sft_loss": 2.2830893993377686, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 3.0778188600677643, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.18241338431835175, "logits/rejected": 0.012217411771416664, "logps/chosen": -2.1110739707946777, "logps/rejected": -3.1337497234344482, "loss": 0.6808, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1110739707946777, "rewards/margins": 1.0226755142211914, "rewards/rejected": -3.1337497234344482, "sft_loss": 2.089545965194702, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 3.611694227509719, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.17740394175052643, "logits/rejected": -0.041568391025066376, "logps/chosen": -2.113341808319092, "logps/rejected": -3.0952255725860596, "loss": 0.6859, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.113341808319092, "rewards/margins": 0.9818838238716125, "rewards/rejected": -3.0952255725860596, "sft_loss": 2.0244381427764893, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 3.5878150063322045, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.04258178547024727, "logits/rejected": 0.125771164894104, "logps/chosen": -2.2499518394470215, "logps/rejected": -3.1202502250671387, "loss": 0.6998, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2499518394470215, "rewards/margins": 0.8702983856201172, "rewards/rejected": -3.1202502250671387, "sft_loss": 2.2162721157073975, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.16938525438308716, "eval_logits/rejected": 0.26880332827568054, "eval_logps/chosen": -2.171912670135498, "eval_logps/rejected": -3.1258187294006348, "eval_loss": 0.6900331377983093, "eval_rewards/accuracies": 0.6936202049255371, "eval_rewards/chosen": -2.171912670135498, "eval_rewards/margins": 0.9539060592651367, "eval_rewards/rejected": -3.1258187294006348, "eval_runtime": 44.4262, "eval_samples_per_second": 30.275, "eval_sft_loss": 2.1093075275421143, "eval_steps_per_second": 7.586, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 2.5065271531069038, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.21862120926380157, "logits/rejected": -0.10379710048437119, "logps/chosen": -2.1867823600769043, "logps/rejected": -3.1316921710968018, "loss": 0.6825, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1867823600769043, "rewards/margins": 0.9449104070663452, "rewards/rejected": -3.1316921710968018, "sft_loss": 2.16475248336792, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 3.89401502233373, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.19211474061012268, "logits/rejected": -0.04500243440270424, "logps/chosen": -2.1232149600982666, "logps/rejected": -2.9413015842437744, "loss": 0.6963, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1232149600982666, "rewards/margins": 0.8180867433547974, "rewards/rejected": -2.9413015842437744, "sft_loss": 2.0833516120910645, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 2.8031214070077213, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.058820150792598724, "logits/rejected": 0.07056679576635361, "logps/chosen": -2.1051268577575684, "logps/rejected": -3.0708067417144775, "loss": 0.6871, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1051268577575684, "rewards/margins": 0.9656797647476196, "rewards/rejected": -3.0708067417144775, "sft_loss": 2.090538501739502, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 2.4297870549412246, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.11713247001171112, "logits/rejected": 0.16286148130893707, "logps/chosen": -2.40877366065979, "logps/rejected": -3.234921932220459, "loss": 0.6931, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.40877366065979, "rewards/margins": 0.8261480331420898, "rewards/rejected": -3.234921932220459, "sft_loss": 2.234438419342041, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 4.840884673603746, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.0701625794172287, "logits/rejected": 0.08214187622070312, "logps/chosen": -1.9803073406219482, "logps/rejected": -2.982759952545166, "loss": 0.6848, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9803073406219482, "rewards/margins": 1.0024524927139282, "rewards/rejected": -2.982759952545166, "sft_loss": 1.9953334331512451, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 5.248148076110207, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.09757836908102036, "logits/rejected": 0.09544403105974197, "logps/chosen": -2.1850745677948, "logps/rejected": -3.2705535888671875, "loss": 0.683, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1850745677948, "rewards/margins": 1.0854787826538086, "rewards/rejected": -3.2705535888671875, "sft_loss": 2.143946886062622, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 3.887898220382095, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.07574521005153656, "logits/rejected": 0.03172556310892105, "logps/chosen": -2.0378127098083496, "logps/rejected": -2.9185850620269775, "loss": 0.6867, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0378127098083496, "rewards/margins": 0.8807722926139832, "rewards/rejected": -2.9185850620269775, "sft_loss": 2.0330047607421875, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 2.682458799901896, "learning_rate": 4.510204906064911e-07, "logits/chosen": 0.013766959309577942, "logits/rejected": 0.13985109329223633, "logps/chosen": -2.1408016681671143, "logps/rejected": -3.178284168243408, "loss": 0.672, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1408016681671143, "rewards/margins": 1.037482500076294, "rewards/rejected": -3.178284168243408, "sft_loss": 1.9422342777252197, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 4.050477821888417, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.19383029639720917, "logits/rejected": -0.046612389385700226, "logps/chosen": -2.194392681121826, "logps/rejected": -2.949484348297119, "loss": 0.6904, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.194392681121826, "rewards/margins": 0.7550913095474243, "rewards/rejected": -2.949484348297119, "sft_loss": 2.1109633445739746, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 4.141531108529457, "learning_rate": 4.479216365386333e-07, "logits/chosen": 0.006335936486721039, "logits/rejected": 0.19332371652126312, "logps/chosen": -1.9818389415740967, "logps/rejected": -2.979146718978882, "loss": 0.6811, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.9818389415740967, "rewards/margins": 0.9973075985908508, "rewards/rejected": -2.979146718978882, "sft_loss": 1.9491941928863525, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 2.906960024292214, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.039695125073194504, "logits/rejected": 0.06806042045354843, "logps/chosen": -2.106238603591919, "logps/rejected": -2.9520974159240723, "loss": 0.6843, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.106238603591919, "rewards/margins": 0.8458584547042847, "rewards/rejected": -2.9520974159240723, "sft_loss": 2.095874309539795, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 2.5598016246844804, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.12927773594856262, "logits/rejected": 0.04138825461268425, "logps/chosen": -2.0338780879974365, "logps/rejected": -3.0352959632873535, "loss": 0.6752, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0338780879974365, "rewards/margins": 1.001417875289917, "rewards/rejected": -3.0352959632873535, "sft_loss": 2.0644609928131104, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 5.3129089347850345, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.026618679985404015, "logits/rejected": 0.12305717170238495, "logps/chosen": -2.0354652404785156, "logps/rejected": -2.9555282592773438, "loss": 0.6874, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0354652404785156, "rewards/margins": 0.9200627207756042, "rewards/rejected": -2.9555282592773438, "sft_loss": 2.086569309234619, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 17.067815849238478, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.09709037095308304, "logits/rejected": 0.05840582400560379, "logps/chosen": -1.999385118484497, "logps/rejected": -2.9995360374450684, "loss": 0.6862, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.999385118484497, "rewards/margins": 1.0001510381698608, "rewards/rejected": -2.9995360374450684, "sft_loss": 2.018332004547119, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 4.184862860164015, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.154140904545784, "logits/rejected": 0.15783226490020752, "logps/chosen": -2.037508487701416, "logps/rejected": -2.9666733741760254, "loss": 0.6768, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.037508487701416, "rewards/margins": 0.9291653633117676, "rewards/rejected": -2.9666733741760254, "sft_loss": 2.0432841777801514, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 4.83664371080636, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.12879234552383423, "logits/rejected": -0.055740825831890106, "logps/chosen": -2.003026247024536, "logps/rejected": -3.0168797969818115, "loss": 0.6803, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.003026247024536, "rewards/margins": 1.013853907585144, "rewards/rejected": -3.0168797969818115, "sft_loss": 2.0483086109161377, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 3.6468834080002828, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.11835076659917831, "logits/rejected": 0.12381935119628906, "logps/chosen": -2.1517369747161865, "logps/rejected": -3.050192356109619, "loss": 0.6867, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1517369747161865, "rewards/margins": 0.8984552621841431, "rewards/rejected": -3.050192356109619, "sft_loss": 2.141003370285034, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 3.8191587634646424, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.07299528270959854, "logits/rejected": 0.04519936814904213, "logps/chosen": -2.209841012954712, "logps/rejected": -3.1058108806610107, "loss": 0.6919, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.209841012954712, "rewards/margins": 0.8959699869155884, "rewards/rejected": -3.1058108806610107, "sft_loss": 2.2350947856903076, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 4.917189460703717, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.09429631382226944, "logits/rejected": 0.01863674819469452, "logps/chosen": -2.3670341968536377, "logps/rejected": -3.198622941970825, "loss": 0.7021, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.3670341968536377, "rewards/margins": 0.8315887451171875, "rewards/rejected": -3.198622941970825, "sft_loss": 2.329556941986084, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 6.2948407219727445, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.20224657654762268, "logits/rejected": -0.026348227635025978, "logps/chosen": -2.357205390930176, "logps/rejected": -3.389937162399292, "loss": 0.6903, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.357205390930176, "rewards/margins": 1.0327322483062744, "rewards/rejected": -3.389937162399292, "sft_loss": 2.300279378890991, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 3.535278360715077, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.19258543848991394, "logits/rejected": 0.0005183167522773147, "logps/chosen": -2.2726645469665527, "logps/rejected": -3.235269546508789, "loss": 0.6941, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2726645469665527, "rewards/margins": 0.9626048803329468, "rewards/rejected": -3.235269546508789, "sft_loss": 2.2232964038848877, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 3.821952745409955, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.022418078035116196, "logits/rejected": 0.03682177886366844, "logps/chosen": -2.3169894218444824, "logps/rejected": -3.344888687133789, "loss": 0.6867, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3169894218444824, "rewards/margins": 1.0278997421264648, "rewards/rejected": -3.344888687133789, "sft_loss": 2.3466386795043945, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 5.968262115040822, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.17735466361045837, "logits/rejected": 0.028507202863693237, "logps/chosen": -2.2298970222473145, "logps/rejected": -3.1068997383117676, "loss": 0.6935, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2298970222473145, "rewards/margins": 0.8770028948783875, "rewards/rejected": -3.1068997383117676, "sft_loss": 2.2399425506591797, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 5.369649985568279, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.11486073583364487, "logits/rejected": 0.12659046053886414, "logps/chosen": -2.2763190269470215, "logps/rejected": -3.341658115386963, "loss": 0.698, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2763190269470215, "rewards/margins": 1.065339207649231, "rewards/rejected": -3.341658115386963, "sft_loss": 2.2652580738067627, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 3.865119586160214, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.1526700258255005, "logits/rejected": 0.07134034484624863, "logps/chosen": -2.319143295288086, "logps/rejected": -3.345414400100708, "loss": 0.6854, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.319143295288086, "rewards/margins": 1.0262712240219116, "rewards/rejected": -3.345414400100708, "sft_loss": 2.3606820106506348, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 6.205044855109865, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.037674445658922195, "logits/rejected": 0.0866425484418869, "logps/chosen": -2.041942834854126, "logps/rejected": -3.265056610107422, "loss": 0.6686, "rewards/accuracies": 0.75, "rewards/chosen": -2.041942834854126, "rewards/margins": 1.223113775253296, "rewards/rejected": -3.265056610107422, "sft_loss": 2.0743894577026367, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 3.912801244648953, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.11040419340133667, "logits/rejected": 0.08194796741008759, "logps/chosen": -2.2899370193481445, "logps/rejected": -3.1266794204711914, "loss": 0.6925, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.2899370193481445, "rewards/margins": 0.8367422223091125, "rewards/rejected": -3.1266794204711914, "sft_loss": 2.195441722869873, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 3.7753776596782953, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.05143510550260544, "logits/rejected": 0.11683867126703262, "logps/chosen": -1.96332585811615, "logps/rejected": -3.1719508171081543, "loss": 0.6688, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.96332585811615, "rewards/margins": 1.2086249589920044, "rewards/rejected": -3.1719508171081543, "sft_loss": 1.982080101966858, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 4.414998388539221, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.08478949964046478, "logits/rejected": 0.1309826523065567, "logps/chosen": -1.9758306741714478, "logps/rejected": -2.9565446376800537, "loss": 0.6721, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9758306741714478, "rewards/margins": 0.9807138442993164, "rewards/rejected": -2.9565446376800537, "sft_loss": 2.01054310798645, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 3.5291379444260276, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.1986711025238037, "logits/rejected": -0.03366810828447342, "logps/chosen": -2.1853854656219482, "logps/rejected": -3.0746588706970215, "loss": 0.6907, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1853854656219482, "rewards/margins": 0.889273464679718, "rewards/rejected": -3.0746588706970215, "sft_loss": 2.1545419692993164, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 14.564448690812977, "learning_rate": 4.1552863054229116e-07, "logits/chosen": 0.016578923910856247, "logits/rejected": 0.09158939123153687, "logps/chosen": -2.2663533687591553, "logps/rejected": -3.0901200771331787, "loss": 0.7024, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2663533687591553, "rewards/margins": 0.8237667083740234, "rewards/rejected": -3.0901200771331787, "sft_loss": 2.151644229888916, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 7.208621192726958, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.09454993903636932, "logits/rejected": 0.11722595989704132, "logps/chosen": -2.0083000659942627, "logps/rejected": -2.8931920528411865, "loss": 0.6947, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0083000659942627, "rewards/margins": 0.8848922848701477, "rewards/rejected": -2.8931920528411865, "sft_loss": 1.980495810508728, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 4.309712271139695, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.10693303495645523, "logits/rejected": 0.0911029800772667, "logps/chosen": -2.194310426712036, "logps/rejected": -3.086164951324463, "loss": 0.6874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.194310426712036, "rewards/margins": 0.8918546438217163, "rewards/rejected": -3.086164951324463, "sft_loss": 2.226395606994629, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 3.0636279711589687, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.10850661993026733, "logits/rejected": 0.06401662528514862, "logps/chosen": -2.139122724533081, "logps/rejected": -2.977569103240967, "loss": 0.6881, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.139122724533081, "rewards/margins": 0.8384467363357544, "rewards/rejected": -2.977569103240967, "sft_loss": 2.096174478530884, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 5.109255566645895, "learning_rate": 4.093952802428457e-07, "logits/chosen": 0.06034206226468086, "logits/rejected": 0.14231623709201813, "logps/chosen": -2.2905266284942627, "logps/rejected": -3.098337173461914, "loss": 0.704, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2905266284942627, "rewards/margins": 0.8078103065490723, "rewards/rejected": -3.098337173461914, "sft_loss": 2.2123026847839355, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 2.453292045111366, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.16326487064361572, "logits/rejected": 0.026889164000749588, "logps/chosen": -2.2471578121185303, "logps/rejected": -3.129439353942871, "loss": 0.6912, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2471578121185303, "rewards/margins": 0.8822811245918274, "rewards/rejected": -3.129439353942871, "sft_loss": 2.154139757156372, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 5.551184128221047, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.11775940656661987, "logits/rejected": -0.075945183634758, "logps/chosen": -2.2515721321105957, "logps/rejected": -2.957139015197754, "loss": 0.6936, "rewards/accuracies": 0.625, "rewards/chosen": -2.2515721321105957, "rewards/margins": 0.7055668830871582, "rewards/rejected": -2.957139015197754, "sft_loss": 2.2185423374176025, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 4.7354448751058325, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.07824783027172089, "logits/rejected": 0.07530321180820465, "logps/chosen": -2.460838556289673, "logps/rejected": -3.3306336402893066, "loss": 0.6956, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.460838556289673, "rewards/margins": 0.8697946667671204, "rewards/rejected": -3.3306336402893066, "sft_loss": 2.426344394683838, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 3.179067217939218, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.12580683827400208, "logits/rejected": 0.03872048109769821, "logps/chosen": -2.021254777908325, "logps/rejected": -3.144026279449463, "loss": 0.6864, "rewards/accuracies": 0.71875, "rewards/chosen": -2.021254777908325, "rewards/margins": 1.1227715015411377, "rewards/rejected": -3.144026279449463, "sft_loss": 2.0137553215026855, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 8.507807005568804, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.07757692039012909, "logits/rejected": 0.06929562985897064, "logps/chosen": -2.216972827911377, "logps/rejected": -3.2000606060028076, "loss": 0.6837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.216972827911377, "rewards/margins": 0.9830881357192993, "rewards/rejected": -3.2000606060028076, "sft_loss": 2.1666877269744873, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 4.622987862680945, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.12145348638296127, "logits/rejected": 0.05669483542442322, "logps/chosen": -2.1411659717559814, "logps/rejected": -3.0339109897613525, "loss": 0.6954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1411659717559814, "rewards/margins": 0.8927448391914368, "rewards/rejected": -3.0339109897613525, "sft_loss": 2.104557752609253, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 2.1985941281016737, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.12451644241809845, "logits/rejected": 0.11840645968914032, "logps/chosen": -2.269446849822998, "logps/rejected": -3.09425687789917, "loss": 0.6923, "rewards/accuracies": 0.6875, "rewards/chosen": -2.269446849822998, "rewards/margins": 0.824809730052948, "rewards/rejected": -3.09425687789917, "sft_loss": 2.1708335876464844, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 5.604133301731, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.13375402987003326, "logits/rejected": 0.01883949711918831, "logps/chosen": -2.252318859100342, "logps/rejected": -3.217984437942505, "loss": 0.6836, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.252318859100342, "rewards/margins": 0.9656656384468079, "rewards/rejected": -3.217984437942505, "sft_loss": 2.30187726020813, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 4.047489800275802, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.0439617820084095, "logits/rejected": 0.08844329416751862, "logps/chosen": -2.4110324382781982, "logps/rejected": -3.316443920135498, "loss": 0.6971, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.4110324382781982, "rewards/margins": 0.905411422252655, "rewards/rejected": -3.316443920135498, "sft_loss": 2.425555467605591, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 2.617352281279491, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.15715794265270233, "logits/rejected": -0.09440397471189499, "logps/chosen": -2.112018585205078, "logps/rejected": -3.1406939029693604, "loss": 0.6917, "rewards/accuracies": 0.71875, "rewards/chosen": -2.112018585205078, "rewards/margins": 1.0286751985549927, "rewards/rejected": -3.1406939029693604, "sft_loss": 2.0670018196105957, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 3.4875857107376875, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.1380632072687149, "logits/rejected": 0.014524638652801514, "logps/chosen": -2.0882363319396973, "logps/rejected": -3.4028327465057373, "loss": 0.6773, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0882363319396973, "rewards/margins": 1.314596176147461, "rewards/rejected": -3.4028327465057373, "sft_loss": 2.0673441886901855, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 4.008539983512021, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.11774700880050659, "logits/rejected": 0.0142317283898592, "logps/chosen": -2.068218946456909, "logps/rejected": -3.1259474754333496, "loss": 0.6774, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.068218946456909, "rewards/margins": 1.0577284097671509, "rewards/rejected": -3.1259474754333496, "sft_loss": 2.041489362716675, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 3.2923612802327025, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.16679182648658752, "logits/rejected": -0.05302317813038826, "logps/chosen": -2.1182868480682373, "logps/rejected": -3.375195264816284, "loss": 0.698, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1182868480682373, "rewards/margins": 1.2569081783294678, "rewards/rejected": -3.375195264816284, "sft_loss": 2.1025192737579346, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 3.9802507216073133, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.15935611724853516, "logits/rejected": 0.06508435308933258, "logps/chosen": -2.0967845916748047, "logps/rejected": -3.107313632965088, "loss": 0.6773, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.0967845916748047, "rewards/margins": 1.0105292797088623, "rewards/rejected": -3.107313632965088, "sft_loss": 2.0953187942504883, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 4.551116828204739, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.18554912507534027, "logits/rejected": 0.003855127142742276, "logps/chosen": -2.1789658069610596, "logps/rejected": -3.0571799278259277, "loss": 0.685, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1789658069610596, "rewards/margins": 0.8782145380973816, "rewards/rejected": -3.0571799278259277, "sft_loss": 2.1960291862487793, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 3.8012645362562836, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.11601553112268448, "logits/rejected": 0.09878197312355042, "logps/chosen": -2.219163179397583, "logps/rejected": -3.2052559852600098, "loss": 0.6935, "rewards/accuracies": 0.6875, "rewards/chosen": -2.219163179397583, "rewards/margins": 0.9860928654670715, "rewards/rejected": -3.2052559852600098, "sft_loss": 2.1843903064727783, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 3.2793922911834215, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.0153552470728755, "logits/rejected": 0.11092513799667358, "logps/chosen": -2.2136993408203125, "logps/rejected": -3.1592071056365967, "loss": 0.6989, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2136993408203125, "rewards/margins": 0.9455081820487976, "rewards/rejected": -3.1592071056365967, "sft_loss": 2.1067147254943848, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 2.3731437209575814, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.14444135129451752, "logits/rejected": -0.005835582502186298, "logps/chosen": -1.9672229290008545, "logps/rejected": -2.7969489097595215, "loss": 0.6861, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.9672229290008545, "rewards/margins": 0.8297258615493774, "rewards/rejected": -2.7969489097595215, "sft_loss": 1.9027982950210571, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 3.2927142582400855, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.18615327775478363, "logits/rejected": -0.06812240928411484, "logps/chosen": -2.227973699569702, "logps/rejected": -3.227752685546875, "loss": 0.6889, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.227973699569702, "rewards/margins": 0.9997787475585938, "rewards/rejected": -3.227752685546875, "sft_loss": 2.143904447555542, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 3.9052609817318227, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.186161071062088, "logits/rejected": -0.10436218976974487, "logps/chosen": -2.1326229572296143, "logps/rejected": -3.1862387657165527, "loss": 0.673, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1326229572296143, "rewards/margins": 1.0536160469055176, "rewards/rejected": -3.1862387657165527, "sft_loss": 2.1006031036376953, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 3.9607780631187084, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.047240983694791794, "logits/rejected": -0.005119943525642157, "logps/chosen": -2.162609815597534, "logps/rejected": -3.0216145515441895, "loss": 0.6888, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.162609815597534, "rewards/margins": 0.8590046763420105, "rewards/rejected": -3.0216145515441895, "sft_loss": 2.1431949138641357, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 8.937980779204771, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.1427825391292572, "logits/rejected": 0.022900383919477463, "logps/chosen": -2.215710163116455, "logps/rejected": -3.103562116622925, "loss": 0.7099, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.215710163116455, "rewards/margins": 0.8878521919250488, "rewards/rejected": -3.103562116622925, "sft_loss": 2.205852746963501, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 3.5475971310646393, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.09783361852169037, "logits/rejected": 0.010669758543372154, "logps/chosen": -1.9912612438201904, "logps/rejected": -2.944227695465088, "loss": 0.6821, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9912612438201904, "rewards/margins": 0.9529666900634766, "rewards/rejected": -2.944227695465088, "sft_loss": 2.081857681274414, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 4.171379109244644, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.19658830761909485, "logits/rejected": -0.015646522864699364, "logps/chosen": -2.268094301223755, "logps/rejected": -3.0871119499206543, "loss": 0.6827, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.268094301223755, "rewards/margins": 0.8190175294876099, "rewards/rejected": -3.0871119499206543, "sft_loss": 2.2411656379699707, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 3.6592301505173337, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.10057705640792847, "logits/rejected": 0.07258979976177216, "logps/chosen": -2.230692148208618, "logps/rejected": -3.1516542434692383, "loss": 0.6814, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.230692148208618, "rewards/margins": 0.9209620356559753, "rewards/rejected": -3.1516542434692383, "sft_loss": 2.102454900741577, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 13.970667089680054, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.1707434505224228, "logits/rejected": -0.012985003180801868, "logps/chosen": -2.1211893558502197, "logps/rejected": -3.187798500061035, "loss": 0.6924, "rewards/accuracies": 0.75, "rewards/chosen": -2.1211893558502197, "rewards/margins": 1.0666093826293945, "rewards/rejected": -3.187798500061035, "sft_loss": 2.0635311603546143, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 3.938830189364275, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.16227789223194122, "logits/rejected": -0.06405164301395416, "logps/chosen": -1.9964349269866943, "logps/rejected": -2.9089252948760986, "loss": 0.6764, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9964349269866943, "rewards/margins": 0.9124904870986938, "rewards/rejected": -2.9089252948760986, "sft_loss": 2.015360116958618, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 3.3631850849437015, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.16847261786460876, "logits/rejected": 0.046769242733716965, "logps/chosen": -2.1035525798797607, "logps/rejected": -3.0158162117004395, "loss": 0.7048, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1035525798797607, "rewards/margins": 0.9122636914253235, "rewards/rejected": -3.0158162117004395, "sft_loss": 2.0737602710723877, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 3.04163797970243, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.20176240801811218, "logits/rejected": -0.026735246181488037, "logps/chosen": -2.0932791233062744, "logps/rejected": -3.039994955062866, "loss": 0.6867, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0932791233062744, "rewards/margins": 0.9467160105705261, "rewards/rejected": -3.039994955062866, "sft_loss": 2.1171672344207764, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 4.348264134637991, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.1465270072221756, "logits/rejected": 0.055618755519390106, "logps/chosen": -2.162325382232666, "logps/rejected": -3.1199283599853516, "loss": 0.6885, "rewards/accuracies": 0.71875, "rewards/chosen": -2.162325382232666, "rewards/margins": 0.957602858543396, "rewards/rejected": -3.1199283599853516, "sft_loss": 2.1590447425842285, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 2.7569866499321365, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.1441269963979721, "logits/rejected": -0.05600646883249283, "logps/chosen": -2.1244869232177734, "logps/rejected": -2.8845534324645996, "loss": 0.6818, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1244869232177734, "rewards/margins": 0.760066568851471, "rewards/rejected": -2.8845534324645996, "sft_loss": 2.1846280097961426, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 4.013388373110339, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.2562624216079712, "logits/rejected": -0.10145537555217743, "logps/chosen": -1.9844735860824585, "logps/rejected": -2.9652676582336426, "loss": 0.6852, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.9844735860824585, "rewards/margins": 0.9807940721511841, "rewards/rejected": -2.9652676582336426, "sft_loss": 2.021907329559326, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 4.317218275979374, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.21493549644947052, "logits/rejected": 0.025484537705779076, "logps/chosen": -2.120227336883545, "logps/rejected": -2.944889545440674, "loss": 0.6887, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.120227336883545, "rewards/margins": 0.8246625661849976, "rewards/rejected": -2.944889545440674, "sft_loss": 2.1267037391662598, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 3.918133212884738, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.30855846405029297, "logits/rejected": -0.0393378920853138, "logps/chosen": -2.0020647048950195, "logps/rejected": -2.9337689876556396, "loss": 0.6859, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0020647048950195, "rewards/margins": 0.9317046403884888, "rewards/rejected": -2.9337689876556396, "sft_loss": 1.9564940929412842, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 4.687343497306164, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.12303660809993744, "logits/rejected": -0.012298181653022766, "logps/chosen": -2.117335557937622, "logps/rejected": -2.9036712646484375, "loss": 0.693, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.117335557937622, "rewards/margins": 0.7863359451293945, "rewards/rejected": -2.9036712646484375, "sft_loss": 1.9888889789581299, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 2.6110618995856543, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.23396828770637512, "logits/rejected": -0.10798802226781845, "logps/chosen": -2.1209850311279297, "logps/rejected": -2.962287187576294, "loss": 0.7036, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1209850311279297, "rewards/margins": 0.8413020372390747, "rewards/rejected": -2.962287187576294, "sft_loss": 2.107417345046997, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 3.61417843766057, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.17204193770885468, "logits/rejected": -0.059635408222675323, "logps/chosen": -2.118635416030884, "logps/rejected": -2.945610761642456, "loss": 0.692, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.118635416030884, "rewards/margins": 0.8269752264022827, "rewards/rejected": -2.945610761642456, "sft_loss": 2.101919412612915, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 3.291048566978385, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.12395143508911133, "logits/rejected": -0.029307598248124123, "logps/chosen": -2.0599448680877686, "logps/rejected": -3.3115997314453125, "loss": 0.6826, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0599448680877686, "rewards/margins": 1.2516547441482544, "rewards/rejected": -3.3115997314453125, "sft_loss": 2.1145453453063965, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 3.3791336666800333, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.1779370754957199, "logits/rejected": 0.0018944144248962402, "logps/chosen": -2.1011738777160645, "logps/rejected": -2.851292133331299, "loss": 0.6844, "rewards/accuracies": 0.75, "rewards/chosen": -2.1011738777160645, "rewards/margins": 0.7501183748245239, "rewards/rejected": -2.851292133331299, "sft_loss": 1.9948400259017944, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 5.853961944038886, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.08530310541391373, "logits/rejected": 0.05743807554244995, "logps/chosen": -2.105227470397949, "logps/rejected": -3.1187050342559814, "loss": 0.6814, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.105227470397949, "rewards/margins": 1.0134776830673218, "rewards/rejected": -3.1187050342559814, "sft_loss": 2.0023372173309326, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 5.6431451193262046, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.13398191332817078, "logits/rejected": -0.0687723383307457, "logps/chosen": -2.034705638885498, "logps/rejected": -3.189368486404419, "loss": 0.6815, "rewards/accuracies": 0.71875, "rewards/chosen": -2.034705638885498, "rewards/margins": 1.1546627283096313, "rewards/rejected": -3.189368486404419, "sft_loss": 2.022989273071289, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 2.9075238752199537, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.051149677485227585, "logits/rejected": 0.013508930802345276, "logps/chosen": -2.290102481842041, "logps/rejected": -3.2551655769348145, "loss": 0.6793, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.290102481842041, "rewards/margins": 0.9650629758834839, "rewards/rejected": -3.2551655769348145, "sft_loss": 2.3164639472961426, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 2.749266267504163, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.05810176208615303, "logits/rejected": 0.024312706664204597, "logps/chosen": -2.317183017730713, "logps/rejected": -3.0569870471954346, "loss": 0.6871, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.317183017730713, "rewards/margins": 0.7398041486740112, "rewards/rejected": -3.0569870471954346, "sft_loss": 2.2860019207000732, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 5.114260269298789, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.21410679817199707, "logits/rejected": 0.03596454858779907, "logps/chosen": -2.273670196533203, "logps/rejected": -3.280043840408325, "loss": 0.6887, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.273670196533203, "rewards/margins": 1.0063737630844116, "rewards/rejected": -3.280043840408325, "sft_loss": 2.1528635025024414, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 5.059702714512347, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.13041231036186218, "logits/rejected": 0.07254637032747269, "logps/chosen": -2.240907669067383, "logps/rejected": -3.2653861045837402, "loss": 0.6837, "rewards/accuracies": 0.6875, "rewards/chosen": -2.240907669067383, "rewards/margins": 1.0244789123535156, "rewards/rejected": -3.2653861045837402, "sft_loss": 2.2703652381896973, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.19961495697498322, "eval_logits/rejected": 0.3035580515861511, "eval_logps/chosen": -2.2075111865997314, "eval_logps/rejected": -3.2094106674194336, "eval_loss": 0.6897660493850708, "eval_rewards/accuracies": 0.6965875625610352, "eval_rewards/chosen": -2.2075111865997314, "eval_rewards/margins": 1.0018997192382812, "eval_rewards/rejected": -3.2094106674194336, "eval_runtime": 64.7993, "eval_samples_per_second": 20.756, "eval_sft_loss": 2.142155885696411, "eval_steps_per_second": 5.201, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 5.684340495868352, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.15685749053955078, "logits/rejected": -0.12646745145320892, "logps/chosen": -2.1684701442718506, "logps/rejected": -2.9581832885742188, "loss": 0.6845, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1684701442718506, "rewards/margins": 0.7897127866744995, "rewards/rejected": -2.9581832885742188, "sft_loss": 2.157496213912964, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 3.709795179506504, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.15225179493427277, "logits/rejected": 0.003287592204287648, "logps/chosen": -2.1358022689819336, "logps/rejected": -3.234461545944214, "loss": 0.6841, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1358022689819336, "rewards/margins": 1.0986593961715698, "rewards/rejected": -3.234461545944214, "sft_loss": 2.13959002494812, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 2.028879346077501, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.06895577162504196, "logits/rejected": 0.06460914760828018, "logps/chosen": -2.071643829345703, "logps/rejected": -2.862349033355713, "loss": 0.6955, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.071643829345703, "rewards/margins": 0.7907050848007202, "rewards/rejected": -2.862349033355713, "sft_loss": 1.998337984085083, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 3.7319291071719776, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.15587207674980164, "logits/rejected": -0.04816683381795883, "logps/chosen": -2.1375787258148193, "logps/rejected": -3.0189738273620605, "loss": 0.688, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1375787258148193, "rewards/margins": 0.8813952207565308, "rewards/rejected": -3.0189738273620605, "sft_loss": 2.1859066486358643, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 4.087648916223257, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.05187777429819107, "logits/rejected": -0.058933060616254807, "logps/chosen": -2.1178596019744873, "logps/rejected": -2.950225353240967, "loss": 0.6814, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1178596019744873, "rewards/margins": 0.8323656916618347, "rewards/rejected": -2.950225353240967, "sft_loss": 2.124027729034424, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 3.6139847946934567, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.1192985326051712, "logits/rejected": -0.07342572510242462, "logps/chosen": -2.1341209411621094, "logps/rejected": -3.061249017715454, "loss": 0.6863, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1341209411621094, "rewards/margins": 0.9271281361579895, "rewards/rejected": -3.061249017715454, "sft_loss": 2.2415740489959717, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 4.746970107460784, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.19757425785064697, "logits/rejected": -0.09125449508428574, "logps/chosen": -2.0948214530944824, "logps/rejected": -3.0264580249786377, "loss": 0.6848, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0948214530944824, "rewards/margins": 0.9316363334655762, "rewards/rejected": -3.0264580249786377, "sft_loss": 2.099250078201294, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 4.281285407499179, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.16194570064544678, "logits/rejected": -0.05726348236203194, "logps/chosen": -2.2279863357543945, "logps/rejected": -2.8229076862335205, "loss": 0.6964, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.2279863357543945, "rewards/margins": 0.594921350479126, "rewards/rejected": -2.8229076862335205, "sft_loss": 2.206334114074707, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 5.525223650970297, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.1582116037607193, "logits/rejected": -0.0008633792167529464, "logps/chosen": -2.1605565547943115, "logps/rejected": -2.8597676753997803, "loss": 0.6914, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1605565547943115, "rewards/margins": 0.6992112398147583, "rewards/rejected": -2.8597676753997803, "sft_loss": 2.1679415702819824, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 6.5294785588226345, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.21969719231128693, "logits/rejected": -0.11124607175588608, "logps/chosen": -2.1147501468658447, "logps/rejected": -3.0070154666900635, "loss": 0.6848, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1147501468658447, "rewards/margins": 0.8922654986381531, "rewards/rejected": -3.0070154666900635, "sft_loss": 2.1740405559539795, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 6.510859827780001, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.14866802096366882, "logits/rejected": -0.011895375326275826, "logps/chosen": -2.295565366744995, "logps/rejected": -3.2828688621520996, "loss": 0.6939, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.295565366744995, "rewards/margins": 0.9873035550117493, "rewards/rejected": -3.2828688621520996, "sft_loss": 2.1841020584106445, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 5.159193596048617, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.17693455517292023, "logits/rejected": -0.02780548296868801, "logps/chosen": -2.0335822105407715, "logps/rejected": -2.9066224098205566, "loss": 0.6871, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0335822105407715, "rewards/margins": 0.8730396032333374, "rewards/rejected": -2.9066224098205566, "sft_loss": 2.043599843978882, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 7.022194499754583, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.16167587041854858, "logits/rejected": -0.033538367599248886, "logps/chosen": -2.103761672973633, "logps/rejected": -3.2121028900146484, "loss": 0.6846, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.103761672973633, "rewards/margins": 1.1083409786224365, "rewards/rejected": -3.2121028900146484, "sft_loss": 2.0176119804382324, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 2.7854409335237453, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.18212896585464478, "logits/rejected": -0.04557342082262039, "logps/chosen": -2.0921874046325684, "logps/rejected": -3.319629669189453, "loss": 0.671, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0921874046325684, "rewards/margins": 1.2274422645568848, "rewards/rejected": -3.319629669189453, "sft_loss": 2.175380229949951, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 7.211839596635454, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.18180282413959503, "logits/rejected": -0.031157314777374268, "logps/chosen": -2.2092483043670654, "logps/rejected": -3.370548725128174, "loss": 0.6885, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.2092483043670654, "rewards/margins": 1.1612999439239502, "rewards/rejected": -3.370548725128174, "sft_loss": 2.266460418701172, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 5.69334372057066, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.18586108088493347, "logits/rejected": -0.03296568989753723, "logps/chosen": -1.992720365524292, "logps/rejected": -3.099524974822998, "loss": 0.6764, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.992720365524292, "rewards/margins": 1.1068050861358643, "rewards/rejected": -3.099524974822998, "sft_loss": 2.023709297180176, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 7.694932668337779, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.18205882608890533, "logits/rejected": -0.08366916328668594, "logps/chosen": -2.1995675563812256, "logps/rejected": -3.248079776763916, "loss": 0.673, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1995675563812256, "rewards/margins": 1.0485122203826904, "rewards/rejected": -3.248079776763916, "sft_loss": 2.2780404090881348, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 3.373708554633476, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.14092735946178436, "logits/rejected": 0.03805799037218094, "logps/chosen": -2.1809241771698, "logps/rejected": -3.150590181350708, "loss": 0.6913, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1809241771698, "rewards/margins": 0.9696657061576843, "rewards/rejected": -3.150590181350708, "sft_loss": 2.0995025634765625, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 3.9354177004042206, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.1317126303911209, "logits/rejected": -0.03440633416175842, "logps/chosen": -2.1461985111236572, "logps/rejected": -3.2417449951171875, "loss": 0.6794, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1461985111236572, "rewards/margins": 1.0955464839935303, "rewards/rejected": -3.2417449951171875, "sft_loss": 2.2087178230285645, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 3.3353589299966653, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.11530756950378418, "logits/rejected": 0.01841713674366474, "logps/chosen": -2.188054323196411, "logps/rejected": -3.0514755249023438, "loss": 0.6847, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.188054323196411, "rewards/margins": 0.8634212613105774, "rewards/rejected": -3.0514755249023438, "sft_loss": 2.1960723400115967, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 5.582960140846795, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.1665990799665451, "logits/rejected": -0.07521601766347885, "logps/chosen": -2.076925039291382, "logps/rejected": -3.199763536453247, "loss": 0.6767, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.076925039291382, "rewards/margins": 1.1228384971618652, "rewards/rejected": -3.199763536453247, "sft_loss": 2.0471179485321045, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 3.3886986437836453, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.024591270834207535, "logits/rejected": 0.020447324961423874, "logps/chosen": -2.1345932483673096, "logps/rejected": -3.0656065940856934, "loss": 0.6925, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1345932483673096, "rewards/margins": 0.9310134053230286, "rewards/rejected": -3.0656065940856934, "sft_loss": 2.1308083534240723, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 3.4491897887450347, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.10499121248722076, "logits/rejected": -0.03471050411462784, "logps/chosen": -2.078094482421875, "logps/rejected": -3.24504017829895, "loss": 0.6731, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.078094482421875, "rewards/margins": 1.1669456958770752, "rewards/rejected": -3.24504017829895, "sft_loss": 2.100672960281372, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 4.077241980257639, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.17399199306964874, "logits/rejected": -0.06918822228908539, "logps/chosen": -2.1585144996643066, "logps/rejected": -3.043644428253174, "loss": 0.6944, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1585144996643066, "rewards/margins": 0.8851300477981567, "rewards/rejected": -3.043644428253174, "sft_loss": 2.036146879196167, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 3.0355512441322587, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.231519415974617, "logits/rejected": -0.12187705188989639, "logps/chosen": -2.1352591514587402, "logps/rejected": -3.182976007461548, "loss": 0.6935, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1352591514587402, "rewards/margins": 1.0477169752120972, "rewards/rejected": -3.182976007461548, "sft_loss": 2.189821720123291, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 3.9927289170007514, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.16165432333946228, "logits/rejected": -0.0339629128575325, "logps/chosen": -2.1408815383911133, "logps/rejected": -3.0898704528808594, "loss": 0.6771, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1408815383911133, "rewards/margins": 0.9489887952804565, "rewards/rejected": -3.0898704528808594, "sft_loss": 2.140064239501953, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 3.471098137156809, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.142087921500206, "logits/rejected": -0.038915760815143585, "logps/chosen": -2.316383123397827, "logps/rejected": -3.0058162212371826, "loss": 0.7038, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.316383123397827, "rewards/margins": 0.6894328594207764, "rewards/rejected": -3.0058162212371826, "sft_loss": 2.2604458332061768, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 2.6153338498303493, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.08909028023481369, "logits/rejected": -0.00841111596673727, "logps/chosen": -2.352787494659424, "logps/rejected": -3.154552936553955, "loss": 0.6918, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.352787494659424, "rewards/margins": 0.8017654418945312, "rewards/rejected": -3.154552936553955, "sft_loss": 2.252534866333008, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 2.975653477681364, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.18091581761837006, "logits/rejected": -0.08059791475534439, "logps/chosen": -2.198948383331299, "logps/rejected": -3.2791762351989746, "loss": 0.6726, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.198948383331299, "rewards/margins": 1.0802279710769653, "rewards/rejected": -3.2791762351989746, "sft_loss": 2.107489824295044, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 3.7792912907628646, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.21311037242412567, "logits/rejected": -0.11145709455013275, "logps/chosen": -2.069913387298584, "logps/rejected": -3.192847728729248, "loss": 0.6641, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.069913387298584, "rewards/margins": 1.122934103012085, "rewards/rejected": -3.192847728729248, "sft_loss": 2.018747329711914, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 4.791967935712868, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.14493045210838318, "logits/rejected": 0.007240000180900097, "logps/chosen": -1.9749542474746704, "logps/rejected": -3.348956346511841, "loss": 0.6626, "rewards/accuracies": 0.75, "rewards/chosen": -1.9749542474746704, "rewards/margins": 1.3740019798278809, "rewards/rejected": -3.348956346511841, "sft_loss": 2.058170795440674, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 2.782908308940945, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.23584513366222382, "logits/rejected": -0.1729080229997635, "logps/chosen": -2.0240962505340576, "logps/rejected": -3.370525360107422, "loss": 0.6659, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0240962505340576, "rewards/margins": 1.3464288711547852, "rewards/rejected": -3.370525360107422, "sft_loss": 2.124987840652466, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 4.361969659880761, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.1731630265712738, "logits/rejected": 0.024024654179811478, "logps/chosen": -2.1874585151672363, "logps/rejected": -3.3264641761779785, "loss": 0.6743, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1874585151672363, "rewards/margins": 1.1390055418014526, "rewards/rejected": -3.3264641761779785, "sft_loss": 2.13765287399292, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 5.451218204119892, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.2533346712589264, "logits/rejected": -0.09153694659471512, "logps/chosen": -1.9931917190551758, "logps/rejected": -3.301081895828247, "loss": 0.651, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9931917190551758, "rewards/margins": 1.3078901767730713, "rewards/rejected": -3.301081895828247, "sft_loss": 2.0416078567504883, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 5.230901570538194, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.16264775395393372, "logits/rejected": -0.11411754041910172, "logps/chosen": -2.0707695484161377, "logps/rejected": -3.1301207542419434, "loss": 0.6779, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0707695484161377, "rewards/margins": 1.0593516826629639, "rewards/rejected": -3.1301207542419434, "sft_loss": 2.0640697479248047, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 5.682295649586755, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.13695378601551056, "logits/rejected": -0.08521612733602524, "logps/chosen": -2.0727250576019287, "logps/rejected": -3.0686042308807373, "loss": 0.6753, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0727250576019287, "rewards/margins": 0.9958791732788086, "rewards/rejected": -3.0686042308807373, "sft_loss": 2.040381908416748, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 5.444971493009763, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.20039144158363342, "logits/rejected": -0.03169285133481026, "logps/chosen": -2.0249133110046387, "logps/rejected": -3.1004390716552734, "loss": 0.666, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0249133110046387, "rewards/margins": 1.0755256414413452, "rewards/rejected": -3.1004390716552734, "sft_loss": 1.98683762550354, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 3.2688059376125125, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.23091144859790802, "logits/rejected": -0.05502986162900925, "logps/chosen": -1.965471625328064, "logps/rejected": -3.2271828651428223, "loss": 0.6567, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.965471625328064, "rewards/margins": 1.2617111206054688, "rewards/rejected": -3.2271828651428223, "sft_loss": 2.0017428398132324, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 3.3719898656196956, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.10328583419322968, "logits/rejected": -0.06386563926935196, "logps/chosen": -2.1545088291168213, "logps/rejected": -2.9280734062194824, "loss": 0.681, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1545088291168213, "rewards/margins": 0.7735646963119507, "rewards/rejected": -2.9280734062194824, "sft_loss": 2.226701498031616, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 6.9516884717445, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.13225576281547546, "logits/rejected": 0.003281575394794345, "logps/chosen": -2.168238639831543, "logps/rejected": -3.1461145877838135, "loss": 0.6668, "rewards/accuracies": 0.71875, "rewards/chosen": -2.168238639831543, "rewards/margins": 0.9778760671615601, "rewards/rejected": -3.1461145877838135, "sft_loss": 2.202183246612549, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 9.636546332290473, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.10065107047557831, "logits/rejected": 0.04348466545343399, "logps/chosen": -2.024111032485962, "logps/rejected": -2.97385835647583, "loss": 0.6713, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.024111032485962, "rewards/margins": 0.9497473835945129, "rewards/rejected": -2.97385835647583, "sft_loss": 2.0276641845703125, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 3.6337683737206623, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.025926511734724045, "logits/rejected": 0.07824753224849701, "logps/chosen": -2.019010066986084, "logps/rejected": -3.3775219917297363, "loss": 0.6536, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.019010066986084, "rewards/margins": 1.3585115671157837, "rewards/rejected": -3.3775219917297363, "sft_loss": 1.9829899072647095, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 3.4543113118103754, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.221477672457695, "logits/rejected": -0.0890105590224266, "logps/chosen": -2.1813979148864746, "logps/rejected": -3.1783928871154785, "loss": 0.6884, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1813979148864746, "rewards/margins": 0.9969952702522278, "rewards/rejected": -3.1783928871154785, "sft_loss": 2.1206278800964355, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 2.225127207591837, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.2266666144132614, "logits/rejected": 0.025529295206069946, "logps/chosen": -2.2012128829956055, "logps/rejected": -3.4361069202423096, "loss": 0.6803, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.2012128829956055, "rewards/margins": 1.2348941564559937, "rewards/rejected": -3.4361069202423096, "sft_loss": 2.183763265609741, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 5.048394544503118, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.15154269337654114, "logits/rejected": -0.07468608021736145, "logps/chosen": -2.091416120529175, "logps/rejected": -3.096597671508789, "loss": 0.672, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.091416120529175, "rewards/margins": 1.0051820278167725, "rewards/rejected": -3.096597671508789, "sft_loss": 2.18349552154541, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 4.868925838328225, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.23196709156036377, "logits/rejected": -0.06467144191265106, "logps/chosen": -2.288769483566284, "logps/rejected": -3.4297847747802734, "loss": 0.6714, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.288769483566284, "rewards/margins": 1.1410152912139893, "rewards/rejected": -3.4297847747802734, "sft_loss": 2.199639081954956, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 3.7753382928988053, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.15316972136497498, "logits/rejected": 0.026164349168539047, "logps/chosen": -2.1056418418884277, "logps/rejected": -3.047377347946167, "loss": 0.6807, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1056418418884277, "rewards/margins": 0.941735565662384, "rewards/rejected": -3.047377347946167, "sft_loss": 2.1292455196380615, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 5.316549877681543, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.10193123668432236, "logits/rejected": 0.05660524219274521, "logps/chosen": -2.0485825538635254, "logps/rejected": -3.189666748046875, "loss": 0.6608, "rewards/accuracies": 0.75, "rewards/chosen": -2.0485825538635254, "rewards/margins": 1.1410835981369019, "rewards/rejected": -3.189666748046875, "sft_loss": 2.0899040699005127, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 4.1783885206300715, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.25660255551338196, "logits/rejected": -0.02843310870230198, "logps/chosen": -2.236860513687134, "logps/rejected": -3.3940634727478027, "loss": 0.6791, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.236860513687134, "rewards/margins": 1.1572033166885376, "rewards/rejected": -3.3940634727478027, "sft_loss": 2.1988136768341064, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 11.66407269024, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.1540280282497406, "logits/rejected": 0.03630285710096359, "logps/chosen": -2.119597911834717, "logps/rejected": -2.970017910003662, "loss": 0.6864, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.119597911834717, "rewards/margins": 0.8504198789596558, "rewards/rejected": -2.970017910003662, "sft_loss": 2.150630474090576, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 5.514295095964714, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.12381670624017715, "logits/rejected": 0.08592347055673599, "logps/chosen": -2.0796501636505127, "logps/rejected": -3.2541632652282715, "loss": 0.6813, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0796501636505127, "rewards/margins": 1.1745132207870483, "rewards/rejected": -3.2541632652282715, "sft_loss": 2.063962697982788, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 3.2622503609392655, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.1520196497440338, "logits/rejected": 0.003301681485027075, "logps/chosen": -2.108696460723877, "logps/rejected": -3.3694000244140625, "loss": 0.6808, "rewards/accuracies": 0.71875, "rewards/chosen": -2.108696460723877, "rewards/margins": 1.2607039213180542, "rewards/rejected": -3.3694000244140625, "sft_loss": 2.0896289348602295, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 5.528824956645526, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.23660194873809814, "logits/rejected": -0.08224891126155853, "logps/chosen": -2.1784911155700684, "logps/rejected": -3.1277122497558594, "loss": 0.6737, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1784911155700684, "rewards/margins": 0.9492212533950806, "rewards/rejected": -3.1277122497558594, "sft_loss": 2.115309953689575, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 4.300814713228508, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.24351458251476288, "logits/rejected": -0.14606614410877228, "logps/chosen": -1.8937175273895264, "logps/rejected": -3.051975727081299, "loss": 0.6562, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8937175273895264, "rewards/margins": 1.1582581996917725, "rewards/rejected": -3.051975727081299, "sft_loss": 1.9756580591201782, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 4.5057064274808845, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.18646974861621857, "logits/rejected": -0.06458055973052979, "logps/chosen": -2.1127471923828125, "logps/rejected": -3.1759142875671387, "loss": 0.6719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1127471923828125, "rewards/margins": 1.0631673336029053, "rewards/rejected": -3.1759142875671387, "sft_loss": 2.1598129272460938, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 3.146074222965387, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.19290082156658173, "logits/rejected": -0.03169599175453186, "logps/chosen": -1.9636176824569702, "logps/rejected": -3.1199774742126465, "loss": 0.6533, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9636176824569702, "rewards/margins": 1.1563596725463867, "rewards/rejected": -3.1199774742126465, "sft_loss": 1.9573945999145508, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 3.8083854291835872, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.1969573199748993, "logits/rejected": -0.06371529400348663, "logps/chosen": -1.9405794143676758, "logps/rejected": -3.257289409637451, "loss": 0.6626, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9405794143676758, "rewards/margins": 1.316710114479065, "rewards/rejected": -3.257289409637451, "sft_loss": 1.9770195484161377, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 9.200591844421464, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.14464333653450012, "logits/rejected": -0.02460755966603756, "logps/chosen": -2.0023107528686523, "logps/rejected": -3.093677043914795, "loss": 0.6801, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0023107528686523, "rewards/margins": 1.0913660526275635, "rewards/rejected": -3.093677043914795, "sft_loss": 2.0361132621765137, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 13.13456351467358, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.21079924702644348, "logits/rejected": 0.017463264986872673, "logps/chosen": -2.0185232162475586, "logps/rejected": -2.9785799980163574, "loss": 0.6773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0185232162475586, "rewards/margins": 0.9600569605827332, "rewards/rejected": -2.9785799980163574, "sft_loss": 1.9870984554290771, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 5.554726366552014, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.06970573216676712, "logits/rejected": -0.04162493348121643, "logps/chosen": -1.9545644521713257, "logps/rejected": -2.937133550643921, "loss": 0.663, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9545644521713257, "rewards/margins": 0.9825690984725952, "rewards/rejected": -2.937133550643921, "sft_loss": 2.006101608276367, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 3.8188131833797123, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.13708598911762238, "logits/rejected": -0.02338588610291481, "logps/chosen": -2.116072654724121, "logps/rejected": -2.9913673400878906, "loss": 0.6832, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.116072654724121, "rewards/margins": 0.8752948641777039, "rewards/rejected": -2.9913673400878906, "sft_loss": 2.0157470703125, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 3.9159541413812673, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.14079007506370544, "logits/rejected": -0.014591905288398266, "logps/chosen": -1.8882776498794556, "logps/rejected": -3.012269973754883, "loss": 0.6634, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.8882776498794556, "rewards/margins": 1.1239923238754272, "rewards/rejected": -3.012269973754883, "sft_loss": 1.9577144384384155, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 2.949276861968296, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.19639217853546143, "logits/rejected": -0.03421352431178093, "logps/chosen": -2.181985855102539, "logps/rejected": -3.15867280960083, "loss": 0.6872, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.181985855102539, "rewards/margins": 0.976686954498291, "rewards/rejected": -3.15867280960083, "sft_loss": 2.2240633964538574, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 2.795612284396159, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.20567235350608826, "logits/rejected": -0.016475200653076172, "logps/chosen": -2.110926389694214, "logps/rejected": -3.4056496620178223, "loss": 0.6727, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.110926389694214, "rewards/margins": 1.2947235107421875, "rewards/rejected": -3.4056496620178223, "sft_loss": 2.080010175704956, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 5.9208482659550254, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.1726199984550476, "logits/rejected": -0.077762171626091, "logps/chosen": -1.995126485824585, "logps/rejected": -3.295370578765869, "loss": 0.647, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.995126485824585, "rewards/margins": 1.3002442121505737, "rewards/rejected": -3.295370578765869, "sft_loss": 1.9892040491104126, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 3.6624647106926527, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.26526567339897156, "logits/rejected": -0.10189725458621979, "logps/chosen": -2.0908007621765137, "logps/rejected": -3.1439366340637207, "loss": 0.6734, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0908007621765137, "rewards/margins": 1.0531362295150757, "rewards/rejected": -3.1439366340637207, "sft_loss": 2.0779733657836914, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 4.246459738842489, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.1405525803565979, "logits/rejected": -0.05778896063566208, "logps/chosen": -2.1469337940216064, "logps/rejected": -3.331935405731201, "loss": 0.6666, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1469337940216064, "rewards/margins": 1.1850017309188843, "rewards/rejected": -3.331935405731201, "sft_loss": 2.1456828117370605, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 4.555051792056545, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.18042424321174622, "logits/rejected": -0.11148138344287872, "logps/chosen": -2.0728728771209717, "logps/rejected": -3.2742888927459717, "loss": 0.6619, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0728728771209717, "rewards/margins": 1.201416015625, "rewards/rejected": -3.2742888927459717, "sft_loss": 1.9689483642578125, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 4.193908772515439, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.24373853206634521, "logits/rejected": -0.06800062954425812, "logps/chosen": -2.1934638023376465, "logps/rejected": -3.0918126106262207, "loss": 0.6948, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1934638023376465, "rewards/margins": 0.8983484506607056, "rewards/rejected": -3.0918126106262207, "sft_loss": 2.211327075958252, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 5.861821260358606, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.10776664316654205, "logits/rejected": 0.05332046002149582, "logps/chosen": -2.101513385772705, "logps/rejected": -3.1856369972229004, "loss": 0.6712, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.101513385772705, "rewards/margins": 1.0841234922409058, "rewards/rejected": -3.1856369972229004, "sft_loss": 2.065294027328491, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 3.6790955676732113, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.2536991238594055, "logits/rejected": -0.16523298621177673, "logps/chosen": -2.126446485519409, "logps/rejected": -3.386033296585083, "loss": 0.6738, "rewards/accuracies": 0.6875, "rewards/chosen": -2.126446485519409, "rewards/margins": 1.2595865726470947, "rewards/rejected": -3.386033296585083, "sft_loss": 2.1513524055480957, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 5.781208465642567, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.1769980937242508, "logits/rejected": -0.07903625816106796, "logps/chosen": -2.05415940284729, "logps/rejected": -3.569599151611328, "loss": 0.6588, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.05415940284729, "rewards/margins": 1.5154398679733276, "rewards/rejected": -3.569599151611328, "sft_loss": 2.114835262298584, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 4.243018773470032, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.1651601493358612, "logits/rejected": -0.03835087642073631, "logps/chosen": -2.100426435470581, "logps/rejected": -3.3876376152038574, "loss": 0.6671, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.100426435470581, "rewards/margins": 1.2872109413146973, "rewards/rejected": -3.3876376152038574, "sft_loss": 2.1940059661865234, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 3.40614489973044, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.12180168926715851, "logits/rejected": -0.016960904002189636, "logps/chosen": -2.2647228240966797, "logps/rejected": -3.349940538406372, "loss": 0.6672, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.2647228240966797, "rewards/margins": 1.0852177143096924, "rewards/rejected": -3.349940538406372, "sft_loss": 2.2234692573547363, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 5.217256447022118, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.11785624921321869, "logits/rejected": -0.009331837296485901, "logps/chosen": -1.9256277084350586, "logps/rejected": -2.9111785888671875, "loss": 0.6615, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9256277084350586, "rewards/margins": 0.9855508804321289, "rewards/rejected": -2.9111785888671875, "sft_loss": 1.9216800928115845, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 5.574292090869096, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.15140271186828613, "logits/rejected": -0.06172577291727066, "logps/chosen": -2.0302653312683105, "logps/rejected": -3.386890411376953, "loss": 0.6488, "rewards/accuracies": 0.75, "rewards/chosen": -2.0302653312683105, "rewards/margins": 1.3566254377365112, "rewards/rejected": -3.386890411376953, "sft_loss": 2.1113438606262207, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 3.8311914809355456, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.2237207591533661, "logits/rejected": -0.09334935247898102, "logps/chosen": -2.256551742553711, "logps/rejected": -3.352893352508545, "loss": 0.6682, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.256551742553711, "rewards/margins": 1.0963413715362549, "rewards/rejected": -3.352893352508545, "sft_loss": 2.2960710525512695, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 3.673517695626827, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.2837563455104828, "logits/rejected": -0.11372779309749603, "logps/chosen": -1.8886947631835938, "logps/rejected": -3.2296433448791504, "loss": 0.6524, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8886947631835938, "rewards/margins": 1.3409483432769775, "rewards/rejected": -3.2296433448791504, "sft_loss": 1.920108437538147, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 3.627622935982085, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.2605969309806824, "logits/rejected": 0.02325456589460373, "logps/chosen": -2.215508460998535, "logps/rejected": -3.2646713256835938, "loss": 0.676, "rewards/accuracies": 0.71875, "rewards/chosen": -2.215508460998535, "rewards/margins": 1.0491631031036377, "rewards/rejected": -3.2646713256835938, "sft_loss": 2.1782376766204834, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 5.5650806107361825, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.16451916098594666, "logits/rejected": -0.05491523817181587, "logps/chosen": -1.9280630350112915, "logps/rejected": -3.411128282546997, "loss": 0.6446, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9280630350112915, "rewards/margins": 1.4830653667449951, "rewards/rejected": -3.411128282546997, "sft_loss": 1.9921998977661133, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.12216214835643768, "eval_logits/rejected": 0.22047200798988342, "eval_logps/chosen": -2.1866872310638428, "eval_logps/rejected": -3.2140347957611084, "eval_loss": 0.6902045607566833, "eval_rewards/accuracies": 0.6854599118232727, "eval_rewards/chosen": -2.1866872310638428, "eval_rewards/margins": 1.0273479223251343, "eval_rewards/rejected": -3.2140347957611084, "eval_runtime": 45.1516, "eval_samples_per_second": 29.789, "eval_sft_loss": 2.16142201423645, "eval_steps_per_second": 7.464, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 2.3070059749730167, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.16050508618354797, "logits/rejected": 0.05246647074818611, "logps/chosen": -2.288259983062744, "logps/rejected": -3.2600135803222656, "loss": 0.6917, "rewards/accuracies": 0.6875, "rewards/chosen": -2.288259983062744, "rewards/margins": 0.9717534184455872, "rewards/rejected": -3.2600135803222656, "sft_loss": 2.1639692783355713, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 4.274171833193289, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.21428146958351135, "logits/rejected": -0.1049589067697525, "logps/chosen": -2.0670275688171387, "logps/rejected": -3.170724868774414, "loss": 0.6748, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0670275688171387, "rewards/margins": 1.103697419166565, "rewards/rejected": -3.170724868774414, "sft_loss": 2.0204029083251953, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 5.09358664314769, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.2076890915632248, "logits/rejected": 0.05643507093191147, "logps/chosen": -2.120525360107422, "logps/rejected": -3.1633400917053223, "loss": 0.6826, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.120525360107422, "rewards/margins": 1.0428144931793213, "rewards/rejected": -3.1633400917053223, "sft_loss": 2.194258451461792, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 4.215634181369833, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.27006030082702637, "logits/rejected": -0.10320593416690826, "logps/chosen": -2.1332249641418457, "logps/rejected": -3.0910048484802246, "loss": 0.6764, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1332249641418457, "rewards/margins": 0.9577800035476685, "rewards/rejected": -3.0910048484802246, "sft_loss": 2.1724190711975098, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 3.5195337375890996, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.1685423105955124, "logits/rejected": -0.056601088494062424, "logps/chosen": -2.1004130840301514, "logps/rejected": -3.147625684738159, "loss": 0.6908, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1004130840301514, "rewards/margins": 1.0472123622894287, "rewards/rejected": -3.147625684738159, "sft_loss": 2.126330614089966, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 5.293014240915332, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.2239580899477005, "logits/rejected": -0.13606879115104675, "logps/chosen": -2.025062084197998, "logps/rejected": -3.244068145751953, "loss": 0.6582, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.025062084197998, "rewards/margins": 1.2190057039260864, "rewards/rejected": -3.244068145751953, "sft_loss": 2.1253952980041504, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 10.634182633611681, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.19350138306617737, "logits/rejected": -0.1688581109046936, "logps/chosen": -1.9272750616073608, "logps/rejected": -2.822693347930908, "loss": 0.6764, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9272750616073608, "rewards/margins": 0.8954181671142578, "rewards/rejected": -2.822693347930908, "sft_loss": 1.9559904336929321, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 6.431058173776684, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.16342909634113312, "logits/rejected": -0.13806693255901337, "logps/chosen": -1.9453058242797852, "logps/rejected": -3.038445234298706, "loss": 0.6754, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9453058242797852, "rewards/margins": 1.0931396484375, "rewards/rejected": -3.038445234298706, "sft_loss": 1.92788565158844, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 4.244287447305024, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.1588595062494278, "logits/rejected": -0.008251415565609932, "logps/chosen": -2.0741567611694336, "logps/rejected": -2.979722738265991, "loss": 0.6788, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0741567611694336, "rewards/margins": 0.9055658578872681, "rewards/rejected": -2.979722738265991, "sft_loss": 2.0933797359466553, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 3.260292178088132, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.23745575547218323, "logits/rejected": -0.10504372417926788, "logps/chosen": -2.184518337249756, "logps/rejected": -2.9294536113739014, "loss": 0.7078, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.184518337249756, "rewards/margins": 0.7449353337287903, "rewards/rejected": -2.9294536113739014, "sft_loss": 2.254260540008545, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 4.8497247516539055, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.13812163472175598, "logits/rejected": -0.07096612453460693, "logps/chosen": -2.1685566902160645, "logps/rejected": -2.8709235191345215, "loss": 0.6785, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1685566902160645, "rewards/margins": 0.7023668885231018, "rewards/rejected": -2.8709235191345215, "sft_loss": 2.1655735969543457, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 3.3485727001851484, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.08121892064809799, "logits/rejected": 0.061185263097286224, "logps/chosen": -2.0909311771392822, "logps/rejected": -3.0841729640960693, "loss": 0.6681, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0909311771392822, "rewards/margins": 0.9932416677474976, "rewards/rejected": -3.0841729640960693, "sft_loss": 2.140881061553955, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 3.707981491158516, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.23737430572509766, "logits/rejected": -0.1496736705303192, "logps/chosen": -2.0120530128479004, "logps/rejected": -3.2726852893829346, "loss": 0.6584, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0120530128479004, "rewards/margins": 1.2606327533721924, "rewards/rejected": -3.2726852893829346, "sft_loss": 2.0266661643981934, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 6.024021365435233, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.1443227231502533, "logits/rejected": 0.07983455806970596, "logps/chosen": -2.093372106552124, "logps/rejected": -3.282695770263672, "loss": 0.6714, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.093372106552124, "rewards/margins": 1.1893236637115479, "rewards/rejected": -3.282695770263672, "sft_loss": 2.143284320831299, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 3.6485022760546504, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.21213392913341522, "logits/rejected": -0.04222031682729721, "logps/chosen": -2.104381561279297, "logps/rejected": -3.0034842491149902, "loss": 0.6689, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.104381561279297, "rewards/margins": 0.8991022109985352, "rewards/rejected": -3.0034842491149902, "sft_loss": 2.0348763465881348, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 2.5335317808819533, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.2206738442182541, "logits/rejected": -0.08761437982320786, "logps/chosen": -2.207284927368164, "logps/rejected": -3.4129977226257324, "loss": 0.6775, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.207284927368164, "rewards/margins": 1.2057130336761475, "rewards/rejected": -3.4129977226257324, "sft_loss": 2.2189645767211914, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 4.87628005158134, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.15324094891548157, "logits/rejected": -0.0661562904715538, "logps/chosen": -1.9889189004898071, "logps/rejected": -3.1456923484802246, "loss": 0.6702, "rewards/accuracies": 0.75, "rewards/chosen": -1.9889189004898071, "rewards/margins": 1.156773328781128, "rewards/rejected": -3.1456923484802246, "sft_loss": 2.0339088439941406, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 4.3138329473464285, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.26100295782089233, "logits/rejected": -0.08350460231304169, "logps/chosen": -2.0644030570983887, "logps/rejected": -3.0298380851745605, "loss": 0.6784, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0644030570983887, "rewards/margins": 0.9654353260993958, "rewards/rejected": -3.0298380851745605, "sft_loss": 2.0661580562591553, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 7.085430632636539, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.2247125655412674, "logits/rejected": -0.11863106489181519, "logps/chosen": -2.1816582679748535, "logps/rejected": -3.0513510704040527, "loss": 0.6741, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1816582679748535, "rewards/margins": 0.8696924448013306, "rewards/rejected": -3.0513510704040527, "sft_loss": 2.1615498065948486, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 6.182948105987573, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.1700625717639923, "logits/rejected": -0.017289992421865463, "logps/chosen": -1.9123014211654663, "logps/rejected": -3.273717164993286, "loss": 0.6535, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9123014211654663, "rewards/margins": 1.3614153861999512, "rewards/rejected": -3.273717164993286, "sft_loss": 2.0608344078063965, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 8.383302226866206, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.15090781450271606, "logits/rejected": 0.02349255420267582, "logps/chosen": -2.2728543281555176, "logps/rejected": -3.1122283935546875, "loss": 0.6844, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2728543281555176, "rewards/margins": 0.8393740653991699, "rewards/rejected": -3.1122283935546875, "sft_loss": 2.1716437339782715, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 4.71009773090505, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.14930608868598938, "logits/rejected": 0.0531165674328804, "logps/chosen": -2.2689640522003174, "logps/rejected": -3.132922649383545, "loss": 0.6823, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2689640522003174, "rewards/margins": 0.8639589548110962, "rewards/rejected": -3.132922649383545, "sft_loss": 2.2543208599090576, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 4.544348221456484, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.16687729954719543, "logits/rejected": -0.08343629539012909, "logps/chosen": -2.053715229034424, "logps/rejected": -3.274094820022583, "loss": 0.6721, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.053715229034424, "rewards/margins": 1.2203797101974487, "rewards/rejected": -3.274094820022583, "sft_loss": 2.095189094543457, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 2.7263042944114275, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.1594991385936737, "logits/rejected": -0.05406603962182999, "logps/chosen": -1.9413394927978516, "logps/rejected": -3.1444528102874756, "loss": 0.6586, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9413394927978516, "rewards/margins": 1.203113079071045, "rewards/rejected": -3.1444528102874756, "sft_loss": 1.9781215190887451, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 3.424507807498148, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.21377746760845184, "logits/rejected": -0.018960028886795044, "logps/chosen": -2.046653985977173, "logps/rejected": -3.401824474334717, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": -2.046653985977173, "rewards/margins": 1.3551702499389648, "rewards/rejected": -3.401824474334717, "sft_loss": 2.1029536724090576, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 6.063552367358618, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.13719500601291656, "logits/rejected": -0.02102483995258808, "logps/chosen": -2.0080060958862305, "logps/rejected": -3.1949410438537598, "loss": 0.6718, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0080060958862305, "rewards/margins": 1.1869351863861084, "rewards/rejected": -3.1949410438537598, "sft_loss": 2.161597967147827, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 9.965307089691409, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.10167713463306427, "logits/rejected": -0.1033143624663353, "logps/chosen": -2.1930813789367676, "logps/rejected": -3.31758451461792, "loss": 0.6724, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1930813789367676, "rewards/margins": 1.1245031356811523, "rewards/rejected": -3.31758451461792, "sft_loss": 2.1610350608825684, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 2.4400753492629454, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.296092689037323, "logits/rejected": -0.20539577305316925, "logps/chosen": -2.0341458320617676, "logps/rejected": -3.2220683097839355, "loss": 0.6643, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0341458320617676, "rewards/margins": 1.187922716140747, "rewards/rejected": -3.2220683097839355, "sft_loss": 2.0283610820770264, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 4.024281354117429, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.09105809032917023, "logits/rejected": 0.06579498946666718, "logps/chosen": -2.2511210441589355, "logps/rejected": -3.344057559967041, "loss": 0.6852, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2511210441589355, "rewards/margins": 1.092936635017395, "rewards/rejected": -3.344057559967041, "sft_loss": 2.2170441150665283, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 4.525395256208599, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.15142369270324707, "logits/rejected": -0.036278653889894485, "logps/chosen": -2.0554230213165283, "logps/rejected": -3.2157158851623535, "loss": 0.6603, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0554230213165283, "rewards/margins": 1.1602928638458252, "rewards/rejected": -3.2157158851623535, "sft_loss": 2.0941882133483887, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 3.8622033531060147, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.12516744434833527, "logits/rejected": -0.09860964119434357, "logps/chosen": -2.007272243499756, "logps/rejected": -2.9931883811950684, "loss": 0.6599, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.007272243499756, "rewards/margins": 0.9859158396720886, "rewards/rejected": -2.9931883811950684, "sft_loss": 1.9754520654678345, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 3.5309672373675083, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.15712139010429382, "logits/rejected": -0.0017227933276444674, "logps/chosen": -2.1674985885620117, "logps/rejected": -3.099808931350708, "loss": 0.6778, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1674985885620117, "rewards/margins": 0.9323102831840515, "rewards/rejected": -3.099808931350708, "sft_loss": 2.1875224113464355, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 4.705264536623013, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.11597056686878204, "logits/rejected": 0.009610396809875965, "logps/chosen": -2.099177122116089, "logps/rejected": -3.3595035076141357, "loss": 0.67, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.099177122116089, "rewards/margins": 1.2603263854980469, "rewards/rejected": -3.3595035076141357, "sft_loss": 2.085693836212158, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 3.9246632050914343, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.012894287705421448, "logits/rejected": 0.04866151511669159, "logps/chosen": -2.1491246223449707, "logps/rejected": -3.175144672393799, "loss": 0.6787, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1491246223449707, "rewards/margins": 1.0260196924209595, "rewards/rejected": -3.175144672393799, "sft_loss": 2.2446813583374023, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 4.522581248647605, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.12783598899841309, "logits/rejected": -0.040727607905864716, "logps/chosen": -2.1029319763183594, "logps/rejected": -2.9529013633728027, "loss": 0.6796, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1029319763183594, "rewards/margins": 0.8499695062637329, "rewards/rejected": -2.9529013633728027, "sft_loss": 2.15362548828125, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 6.090100570578844, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.16989929974079132, "logits/rejected": 0.0463339164853096, "logps/chosen": -2.0245769023895264, "logps/rejected": -3.166522264480591, "loss": 0.668, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0245769023895264, "rewards/margins": 1.141945242881775, "rewards/rejected": -3.166522264480591, "sft_loss": 2.0397515296936035, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 5.252724677953759, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.17356649041175842, "logits/rejected": -0.1362331509590149, "logps/chosen": -1.9837900400161743, "logps/rejected": -3.133542776107788, "loss": 0.66, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9837900400161743, "rewards/margins": 1.1497529745101929, "rewards/rejected": -3.133542776107788, "sft_loss": 1.993292212486267, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 4.974097176371491, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.08385307341814041, "logits/rejected": -0.008767305873334408, "logps/chosen": -2.0469765663146973, "logps/rejected": -3.248431444168091, "loss": 0.6696, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0469765663146973, "rewards/margins": 1.2014553546905518, "rewards/rejected": -3.248431444168091, "sft_loss": 2.076141357421875, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 3.215829779280696, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.2681628167629242, "logits/rejected": -0.11363331228494644, "logps/chosen": -2.019747257232666, "logps/rejected": -2.9316470623016357, "loss": 0.6757, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.019747257232666, "rewards/margins": 0.9119003415107727, "rewards/rejected": -2.9316470623016357, "sft_loss": 2.0684947967529297, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 5.504838028510426, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.017854904755949974, "logits/rejected": -0.030091866850852966, "logps/chosen": -2.1185128688812256, "logps/rejected": -3.105051040649414, "loss": 0.6683, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1185128688812256, "rewards/margins": 0.9865385890007019, "rewards/rejected": -3.105051040649414, "sft_loss": 2.1398489475250244, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 4.497631500600767, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.19890213012695312, "logits/rejected": -0.10730002075433731, "logps/chosen": -1.9607737064361572, "logps/rejected": -2.923576831817627, "loss": 0.6748, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9607737064361572, "rewards/margins": 0.962803065776825, "rewards/rejected": -2.923576831817627, "sft_loss": 1.987640619277954, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 5.105379180206469, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.17512214183807373, "logits/rejected": -0.05550699308514595, "logps/chosen": -2.1245641708374023, "logps/rejected": -3.594120502471924, "loss": 0.6786, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1245641708374023, "rewards/margins": 1.4695560932159424, "rewards/rejected": -3.594120502471924, "sft_loss": 2.149122714996338, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 3.83029784211302, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.16118036210536957, "logits/rejected": -0.005187329836189747, "logps/chosen": -2.12148118019104, "logps/rejected": -3.23693585395813, "loss": 0.666, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.12148118019104, "rewards/margins": 1.1154545545578003, "rewards/rejected": -3.23693585395813, "sft_loss": 2.113344430923462, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 6.599358418640735, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.18872864544391632, "logits/rejected": 0.03380126133561134, "logps/chosen": -1.9155935049057007, "logps/rejected": -3.2058091163635254, "loss": 0.6609, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9155935049057007, "rewards/margins": 1.2902159690856934, "rewards/rejected": -3.2058091163635254, "sft_loss": 1.9777272939682007, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 16.060417700477547, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.20039887726306915, "logits/rejected": 0.0246284157037735, "logps/chosen": -1.9851608276367188, "logps/rejected": -3.1726815700531006, "loss": 0.6812, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9851608276367188, "rewards/margins": 1.1875207424163818, "rewards/rejected": -3.1726815700531006, "sft_loss": 2.0441513061523438, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 11.398195564231516, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.16940069198608398, "logits/rejected": -0.08222613483667374, "logps/chosen": -2.179595470428467, "logps/rejected": -3.1289970874786377, "loss": 0.6783, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.179595470428467, "rewards/margins": 0.9494016766548157, "rewards/rejected": -3.1289970874786377, "sft_loss": 2.0225982666015625, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 2.8613846338117335, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.13604898750782013, "logits/rejected": -0.008158263750374317, "logps/chosen": -2.219383716583252, "logps/rejected": -3.0880210399627686, "loss": 0.6858, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.219383716583252, "rewards/margins": 0.8686376810073853, "rewards/rejected": -3.0880210399627686, "sft_loss": 2.2598366737365723, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 5.814034674569944, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.1635938584804535, "logits/rejected": -0.054556943476200104, "logps/chosen": -1.9896701574325562, "logps/rejected": -3.0219662189483643, "loss": 0.6849, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9896701574325562, "rewards/margins": 1.0322957038879395, "rewards/rejected": -3.0219662189483643, "sft_loss": 2.025726318359375, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 4.641368804567021, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.1995202600955963, "logits/rejected": -0.05748230963945389, "logps/chosen": -2.0775208473205566, "logps/rejected": -3.147552967071533, "loss": 0.672, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0775208473205566, "rewards/margins": 1.0700318813323975, "rewards/rejected": -3.147552967071533, "sft_loss": 2.0811524391174316, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 2.18664544571687, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.18338724970817566, "logits/rejected": -0.01710915006697178, "logps/chosen": -2.08484148979187, "logps/rejected": -3.206913709640503, "loss": 0.6653, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.08484148979187, "rewards/margins": 1.1220722198486328, "rewards/rejected": -3.206913709640503, "sft_loss": 2.0524466037750244, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 4.943999541276936, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.21756012737751007, "logits/rejected": 0.05011097341775894, "logps/chosen": -2.2237229347229004, "logps/rejected": -3.426621198654175, "loss": 0.6713, "rewards/accuracies": 0.75, "rewards/chosen": -2.2237229347229004, "rewards/margins": 1.2028987407684326, "rewards/rejected": -3.426621198654175, "sft_loss": 2.1406548023223877, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 5.408164318031505, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.1468680053949356, "logits/rejected": -0.02393932268023491, "logps/chosen": -2.07031512260437, "logps/rejected": -3.1913414001464844, "loss": 0.6687, "rewards/accuracies": 0.71875, "rewards/chosen": -2.07031512260437, "rewards/margins": 1.1210265159606934, "rewards/rejected": -3.1913414001464844, "sft_loss": 2.0023207664489746, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 2.629647851286111, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.26578372716903687, "logits/rejected": -0.10726435482501984, "logps/chosen": -2.0501198768615723, "logps/rejected": -3.2494723796844482, "loss": 0.6659, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0501198768615723, "rewards/margins": 1.199352502822876, "rewards/rejected": -3.2494723796844482, "sft_loss": 2.0937790870666504, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 4.192809371025622, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.1099371537566185, "logits/rejected": 0.11603529751300812, "logps/chosen": -2.0957791805267334, "logps/rejected": -3.2949626445770264, "loss": 0.679, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.0957791805267334, "rewards/margins": 1.1991835832595825, "rewards/rejected": -3.2949626445770264, "sft_loss": 2.0239791870117188, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 3.903537454392507, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.16982577741146088, "logits/rejected": -0.06521300226449966, "logps/chosen": -2.1850740909576416, "logps/rejected": -3.1623456478118896, "loss": 0.6874, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1850740909576416, "rewards/margins": 0.977271556854248, "rewards/rejected": -3.1623456478118896, "sft_loss": 2.226618528366089, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 3.437569990163379, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.19504033029079437, "logits/rejected": -0.09787000715732574, "logps/chosen": -1.9441182613372803, "logps/rejected": -3.1635780334472656, "loss": 0.6554, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9441182613372803, "rewards/margins": 1.2194595336914062, "rewards/rejected": -3.1635780334472656, "sft_loss": 2.0067336559295654, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 5.147925937687013, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.11027028411626816, "logits/rejected": -0.09639479219913483, "logps/chosen": -2.033560276031494, "logps/rejected": -3.3017265796661377, "loss": 0.6618, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.033560276031494, "rewards/margins": 1.2681666612625122, "rewards/rejected": -3.3017265796661377, "sft_loss": 2.10772442817688, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 4.305073371928673, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.22954753041267395, "logits/rejected": -0.06323157250881195, "logps/chosen": -2.1516566276550293, "logps/rejected": -3.090576171875, "loss": 0.6806, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1516566276550293, "rewards/margins": 0.9389199018478394, "rewards/rejected": -3.090576171875, "sft_loss": 2.0620648860931396, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 3.278783746807796, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.20301339030265808, "logits/rejected": -0.11684314906597137, "logps/chosen": -2.1644604206085205, "logps/rejected": -3.050349712371826, "loss": 0.687, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1644604206085205, "rewards/margins": 0.8858893513679504, "rewards/rejected": -3.050349712371826, "sft_loss": 2.1849312782287598, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 4.657115472273005, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.14204177260398865, "logits/rejected": -0.010100598447024822, "logps/chosen": -2.1115527153015137, "logps/rejected": -3.1037027835845947, "loss": 0.6725, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1115527153015137, "rewards/margins": 0.992149829864502, "rewards/rejected": -3.1037027835845947, "sft_loss": 1.9344184398651123, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 5.544476019095504, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.2005896270275116, "logits/rejected": -0.07895728200674057, "logps/chosen": -2.069153070449829, "logps/rejected": -3.3531486988067627, "loss": 0.6483, "rewards/accuracies": 0.75, "rewards/chosen": -2.069153070449829, "rewards/margins": 1.2839953899383545, "rewards/rejected": -3.3531486988067627, "sft_loss": 2.10805606842041, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 3.3931901015368893, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.173683762550354, "logits/rejected": 0.01994958147406578, "logps/chosen": -2.0747077465057373, "logps/rejected": -3.4305903911590576, "loss": 0.6662, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0747077465057373, "rewards/margins": 1.3558826446533203, "rewards/rejected": -3.4305903911590576, "sft_loss": 2.0860283374786377, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 4.20577772629158, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.19935302436351776, "logits/rejected": -0.022657129913568497, "logps/chosen": -2.242023468017578, "logps/rejected": -2.900801658630371, "loss": 0.6863, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.242023468017578, "rewards/margins": 0.6587780714035034, "rewards/rejected": -2.900801658630371, "sft_loss": 2.1713993549346924, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 5.825064489064668, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.1561446338891983, "logits/rejected": -0.0014340400230139494, "logps/chosen": -2.1472461223602295, "logps/rejected": -3.130239963531494, "loss": 0.6756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1472461223602295, "rewards/margins": 0.9829939007759094, "rewards/rejected": -3.130239963531494, "sft_loss": 2.094031810760498, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 7.255559691191783, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.1678655445575714, "logits/rejected": -0.01243885699659586, "logps/chosen": -2.26615571975708, "logps/rejected": -3.3633294105529785, "loss": 0.6772, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.26615571975708, "rewards/margins": 1.0971730947494507, "rewards/rejected": -3.3633294105529785, "sft_loss": 2.252499580383301, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 3.1621430151557117, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.1939752995967865, "logits/rejected": -0.04376544803380966, "logps/chosen": -2.2893729209899902, "logps/rejected": -3.1768245697021484, "loss": 0.6798, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2893729209899902, "rewards/margins": 0.8874520063400269, "rewards/rejected": -3.1768245697021484, "sft_loss": 2.222339391708374, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 5.010713776256952, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.1629539281129837, "logits/rejected": -0.07679332047700882, "logps/chosen": -2.2452406883239746, "logps/rejected": -3.230379581451416, "loss": 0.6962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2452406883239746, "rewards/margins": 0.9851387739181519, "rewards/rejected": -3.230379581451416, "sft_loss": 2.1471447944641113, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 5.8548545448021745, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.18137940764427185, "logits/rejected": 0.02409942075610161, "logps/chosen": -2.000059127807617, "logps/rejected": -3.428389072418213, "loss": 0.6619, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.000059127807617, "rewards/margins": 1.4283303022384644, "rewards/rejected": -3.428389072418213, "sft_loss": 2.0621116161346436, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 4.338810629298849, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.2061314880847931, "logits/rejected": -0.1214604377746582, "logps/chosen": -2.3152942657470703, "logps/rejected": -3.1802566051483154, "loss": 0.6856, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.3152942657470703, "rewards/margins": 0.8649622201919556, "rewards/rejected": -3.1802566051483154, "sft_loss": 2.219158172607422, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 2.656680842948589, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.16902683675289154, "logits/rejected": -0.0923638790845871, "logps/chosen": -2.280181407928467, "logps/rejected": -3.3735251426696777, "loss": 0.6743, "rewards/accuracies": 0.71875, "rewards/chosen": -2.280181407928467, "rewards/margins": 1.0933433771133423, "rewards/rejected": -3.3735251426696777, "sft_loss": 2.243540048599243, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 3.460901158364958, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.26293832063674927, "logits/rejected": -0.12125550210475922, "logps/chosen": -2.193232774734497, "logps/rejected": -3.2326667308807373, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": -2.193232774734497, "rewards/margins": 1.0394338369369507, "rewards/rejected": -3.2326667308807373, "sft_loss": 2.1737167835235596, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 3.3471424841073096, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.22366316616535187, "logits/rejected": -0.0031981945503503084, "logps/chosen": -2.270132064819336, "logps/rejected": -3.141998052597046, "loss": 0.6843, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.270132064819336, "rewards/margins": 0.8718658685684204, "rewards/rejected": -3.141998052597046, "sft_loss": 2.204784870147705, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 4.734326537984999, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.06445908546447754, "logits/rejected": -0.037993717938661575, "logps/chosen": -2.031144618988037, "logps/rejected": -3.1980462074279785, "loss": 0.6646, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.031144618988037, "rewards/margins": 1.1669015884399414, "rewards/rejected": -3.1980462074279785, "sft_loss": 2.078840494155884, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 3.7313523331518557, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.15151730179786682, "logits/rejected": -0.022474870085716248, "logps/chosen": -1.991729497909546, "logps/rejected": -3.368088483810425, "loss": 0.6591, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.991729497909546, "rewards/margins": 1.3763587474822998, "rewards/rejected": -3.368088483810425, "sft_loss": 2.025282144546509, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 7.025836233543962, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.25764456391334534, "logits/rejected": -0.055336516350507736, "logps/chosen": -2.141427993774414, "logps/rejected": -3.1318140029907227, "loss": 0.6718, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.141427993774414, "rewards/margins": 0.9903861880302429, "rewards/rejected": -3.1318140029907227, "sft_loss": 2.2568488121032715, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 5.938234903067426, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.21786122024059296, "logits/rejected": -0.018373187631368637, "logps/chosen": -2.062985420227051, "logps/rejected": -3.075976848602295, "loss": 0.6773, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.062985420227051, "rewards/margins": 1.0129914283752441, "rewards/rejected": -3.075976848602295, "sft_loss": 2.0868048667907715, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 5.219983872580531, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.24924568831920624, "logits/rejected": -0.11489690840244293, "logps/chosen": -2.0819058418273926, "logps/rejected": -3.0514657497406006, "loss": 0.6719, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0819058418273926, "rewards/margins": 0.9695600271224976, "rewards/rejected": -3.0514657497406006, "sft_loss": 2.061774253845215, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 3.94212703161178, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.21287953853607178, "logits/rejected": -0.06039626523852348, "logps/chosen": -1.9492467641830444, "logps/rejected": -3.088829517364502, "loss": 0.6674, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9492467641830444, "rewards/margins": 1.1395829916000366, "rewards/rejected": -3.088829517364502, "sft_loss": 1.9745715856552124, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 3.346019533502611, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.16004632413387299, "logits/rejected": -0.011574303731322289, "logps/chosen": -2.0461299419403076, "logps/rejected": -3.428122043609619, "loss": 0.6685, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0461299419403076, "rewards/margins": 1.3819921016693115, "rewards/rejected": -3.428122043609619, "sft_loss": 2.0484619140625, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 2.957888286685369, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.22204062342643738, "logits/rejected": -0.0745166540145874, "logps/chosen": -2.1117758750915527, "logps/rejected": -3.0526363849639893, "loss": 0.6694, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1117758750915527, "rewards/margins": 0.9408601522445679, "rewards/rejected": -3.0526363849639893, "sft_loss": 2.1839990615844727, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.1483335644006729, "eval_logits/rejected": 0.24738433957099915, "eval_logps/chosen": -2.15895938873291, "eval_logps/rejected": -3.1864676475524902, "eval_loss": 0.6887222528457642, "eval_rewards/accuracies": 0.6921365261077881, "eval_rewards/chosen": -2.15895938873291, "eval_rewards/margins": 1.02750825881958, "eval_rewards/rejected": -3.1864676475524902, "eval_runtime": 44.3003, "eval_samples_per_second": 30.361, "eval_sft_loss": 2.1145176887512207, "eval_steps_per_second": 7.607, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 5.817831977279581, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.25421690940856934, "logits/rejected": -0.20666737854480743, "logps/chosen": -2.0045931339263916, "logps/rejected": -2.980300188064575, "loss": 0.6699, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0045931339263916, "rewards/margins": 0.9757073521614075, "rewards/rejected": -2.980300188064575, "sft_loss": 2.0653605461120605, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 5.5107099677882445, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.1670076549053192, "logits/rejected": -0.07222741842269897, "logps/chosen": -2.15533185005188, "logps/rejected": -3.3656413555145264, "loss": 0.6632, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.15533185005188, "rewards/margins": 1.2103097438812256, "rewards/rejected": -3.3656413555145264, "sft_loss": 2.161409854888916, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 6.296428782704514, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.17989398539066315, "logits/rejected": 0.0022085390519350767, "logps/chosen": -2.2136988639831543, "logps/rejected": -3.173541307449341, "loss": 0.6729, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2136988639831543, "rewards/margins": 0.9598420858383179, "rewards/rejected": -3.173541307449341, "sft_loss": 2.1450047492980957, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 9.545366280350136, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.18045374751091003, "logits/rejected": -0.09958475083112717, "logps/chosen": -2.098446846008301, "logps/rejected": -2.973470449447632, "loss": 0.6849, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.098446846008301, "rewards/margins": 0.8750236630439758, "rewards/rejected": -2.973470449447632, "sft_loss": 2.033970355987549, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 5.860959261583987, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.18542693555355072, "logits/rejected": -0.09705062210559845, "logps/chosen": -1.8457778692245483, "logps/rejected": -2.8968265056610107, "loss": 0.6507, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8457778692245483, "rewards/margins": 1.0510485172271729, "rewards/rejected": -2.8968265056610107, "sft_loss": 1.942373514175415, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 3.2555514045628327, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.22500939667224884, "logits/rejected": -0.05397395044565201, "logps/chosen": -2.0629329681396484, "logps/rejected": -3.292663097381592, "loss": 0.6686, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0629329681396484, "rewards/margins": 1.229730248451233, "rewards/rejected": -3.292663097381592, "sft_loss": 2.043607473373413, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 5.048163709079541, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.2142733782529831, "logits/rejected": -0.016649195924401283, "logps/chosen": -2.1254544258117676, "logps/rejected": -3.3514981269836426, "loss": 0.6675, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1254544258117676, "rewards/margins": 1.226043462753296, "rewards/rejected": -3.3514981269836426, "sft_loss": 2.1830906867980957, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 6.011265373821511, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.28187423944473267, "logits/rejected": -0.04165396839380264, "logps/chosen": -2.0677714347839355, "logps/rejected": -3.1529834270477295, "loss": 0.6743, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0677714347839355, "rewards/margins": 1.085211992263794, "rewards/rejected": -3.1529834270477295, "sft_loss": 2.1271486282348633, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 4.059665837066231, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.23464807868003845, "logits/rejected": -0.03467894718050957, "logps/chosen": -2.2200443744659424, "logps/rejected": -3.086019992828369, "loss": 0.7057, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2200443744659424, "rewards/margins": 0.8659753799438477, "rewards/rejected": -3.086019992828369, "sft_loss": 2.2082483768463135, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 4.484062105655913, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.1839439868927002, "logits/rejected": -0.01578974723815918, "logps/chosen": -2.3946101665496826, "logps/rejected": -3.2734806537628174, "loss": 0.6811, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3946101665496826, "rewards/margins": 0.8788701891899109, "rewards/rejected": -3.2734806537628174, "sft_loss": 2.288024663925171, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 2.701679577284141, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.2606489658355713, "logits/rejected": -0.14730049669742584, "logps/chosen": -2.2450132369995117, "logps/rejected": -3.4032225608825684, "loss": 0.6725, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2450132369995117, "rewards/margins": 1.1582093238830566, "rewards/rejected": -3.4032225608825684, "sft_loss": 2.2701351642608643, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 3.5960375002298557, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.21604426205158234, "logits/rejected": 0.00216987868770957, "logps/chosen": -2.0344831943511963, "logps/rejected": -3.2819888591766357, "loss": 0.6703, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0344831943511963, "rewards/margins": 1.2475056648254395, "rewards/rejected": -3.2819888591766357, "sft_loss": 2.083733558654785, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 2.383947641733567, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.2589126229286194, "logits/rejected": -0.046416062861680984, "logps/chosen": -2.1839864253997803, "logps/rejected": -3.235879421234131, "loss": 0.6849, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1839864253997803, "rewards/margins": 1.0518933534622192, "rewards/rejected": -3.235879421234131, "sft_loss": 2.14011287689209, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 4.902499961393214, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.1358111947774887, "logits/rejected": -0.049908898770809174, "logps/chosen": -2.076606512069702, "logps/rejected": -3.116420269012451, "loss": 0.6687, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.076606512069702, "rewards/margins": 1.0398141145706177, "rewards/rejected": -3.116420269012451, "sft_loss": 2.177868366241455, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 3.999033078306384, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.22534795105457306, "logits/rejected": -0.011211365461349487, "logps/chosen": -2.2466542720794678, "logps/rejected": -3.222562313079834, "loss": 0.6767, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2466542720794678, "rewards/margins": 0.975908100605011, "rewards/rejected": -3.222562313079834, "sft_loss": 2.2334485054016113, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 2.9046584204611103, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.22426724433898926, "logits/rejected": -0.12043038755655289, "logps/chosen": -1.967124342918396, "logps/rejected": -2.9338607788085938, "loss": 0.6683, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.967124342918396, "rewards/margins": 0.9667367935180664, "rewards/rejected": -2.9338607788085938, "sft_loss": 1.9687387943267822, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 16.514901421405394, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.164666548371315, "logits/rejected": -0.06656551361083984, "logps/chosen": -2.102875232696533, "logps/rejected": -3.20685076713562, "loss": 0.6849, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.102875232696533, "rewards/margins": 1.1039756536483765, "rewards/rejected": -3.20685076713562, "sft_loss": 2.0535407066345215, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 4.517911779314552, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.21307143568992615, "logits/rejected": -0.02924274280667305, "logps/chosen": -2.1048378944396973, "logps/rejected": -3.3505256175994873, "loss": 0.6664, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1048378944396973, "rewards/margins": 1.24568772315979, "rewards/rejected": -3.3505256175994873, "sft_loss": 2.151399850845337, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 7.700116019487075, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.273918479681015, "logits/rejected": -0.07913421839475632, "logps/chosen": -2.106267213821411, "logps/rejected": -3.02795672416687, "loss": 0.6886, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.106267213821411, "rewards/margins": 0.921689510345459, "rewards/rejected": -3.02795672416687, "sft_loss": 2.160156011581421, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 3.9073485908549697, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.11443676799535751, "logits/rejected": -0.10150516033172607, "logps/chosen": -1.9929697513580322, "logps/rejected": -3.0766334533691406, "loss": 0.6686, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9929697513580322, "rewards/margins": 1.0836635828018188, "rewards/rejected": -3.0766334533691406, "sft_loss": 2.0109825134277344, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 4.730581186560652, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.20358338952064514, "logits/rejected": 0.0008606910705566406, "logps/chosen": -2.1794867515563965, "logps/rejected": -3.184910297393799, "loss": 0.6796, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1794867515563965, "rewards/margins": 1.0054237842559814, "rewards/rejected": -3.184910297393799, "sft_loss": 2.1558899879455566, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 4.984279237576151, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.23577456176280975, "logits/rejected": -0.08217061311006546, "logps/chosen": -2.0777785778045654, "logps/rejected": -3.0439350605010986, "loss": 0.667, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0777785778045654, "rewards/margins": 0.9661566019058228, "rewards/rejected": -3.0439350605010986, "sft_loss": 2.0925450325012207, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 6.392868476110788, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.19419506192207336, "logits/rejected": -0.12320198863744736, "logps/chosen": -1.963719367980957, "logps/rejected": -3.0543346405029297, "loss": 0.6625, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.963719367980957, "rewards/margins": 1.090614676475525, "rewards/rejected": -3.0543346405029297, "sft_loss": 1.983407974243164, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 3.660525378634166, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.16067619621753693, "logits/rejected": -0.0643281489610672, "logps/chosen": -2.078392505645752, "logps/rejected": -3.2905898094177246, "loss": 0.654, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.078392505645752, "rewards/margins": 1.2121970653533936, "rewards/rejected": -3.2905898094177246, "sft_loss": 2.1481382846832275, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 4.053343742403941, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.22848236560821533, "logits/rejected": -0.02969544194638729, "logps/chosen": -2.1285107135772705, "logps/rejected": -3.1666715145111084, "loss": 0.6682, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1285107135772705, "rewards/margins": 1.0381609201431274, "rewards/rejected": -3.1666715145111084, "sft_loss": 2.166686773300171, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 3.2692826243416078, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.23465761542320251, "logits/rejected": -0.16107268631458282, "logps/chosen": -2.1159329414367676, "logps/rejected": -2.995014190673828, "loss": 0.6864, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1159329414367676, "rewards/margins": 0.8790813684463501, "rewards/rejected": -2.995014190673828, "sft_loss": 2.008147954940796, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 4.143745884555678, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.08680951595306396, "logits/rejected": 0.007194558624178171, "logps/chosen": -2.0280799865722656, "logps/rejected": -3.1981682777404785, "loss": 0.6716, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0280799865722656, "rewards/margins": 1.1700884103775024, "rewards/rejected": -3.1981682777404785, "sft_loss": 2.09446120262146, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 7.728816132361034, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.16309547424316406, "logits/rejected": -0.045490048825740814, "logps/chosen": -2.087641716003418, "logps/rejected": -3.1135027408599854, "loss": 0.6735, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.087641716003418, "rewards/margins": 1.0258610248565674, "rewards/rejected": -3.1135027408599854, "sft_loss": 2.124109983444214, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 5.368612799469, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.13496272265911102, "logits/rejected": -0.026529857888817787, "logps/chosen": -2.0952439308166504, "logps/rejected": -3.109184503555298, "loss": 0.6837, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0952439308166504, "rewards/margins": 1.013940453529358, "rewards/rejected": -3.109184503555298, "sft_loss": 2.1108901500701904, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 5.578544809423889, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.240474134683609, "logits/rejected": -0.11594425141811371, "logps/chosen": -1.9697179794311523, "logps/rejected": -3.124454975128174, "loss": 0.6646, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9697179794311523, "rewards/margins": 1.154736876487732, "rewards/rejected": -3.124454975128174, "sft_loss": 2.0101077556610107, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 2.894872323011034, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.17339864373207092, "logits/rejected": -0.07853229343891144, "logps/chosen": -2.1969246864318848, "logps/rejected": -3.136164426803589, "loss": 0.6745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1969246864318848, "rewards/margins": 0.939239501953125, "rewards/rejected": -3.136164426803589, "sft_loss": 2.230463743209839, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 4.022297021715591, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.2338290959596634, "logits/rejected": -0.0293536689132452, "logps/chosen": -2.089036226272583, "logps/rejected": -3.1517233848571777, "loss": 0.6603, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.089036226272583, "rewards/margins": 1.0626872777938843, "rewards/rejected": -3.1517233848571777, "sft_loss": 2.0483181476593018, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 6.602413577147545, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.18875204026699066, "logits/rejected": -0.06928505003452301, "logps/chosen": -2.056365489959717, "logps/rejected": -3.2918601036071777, "loss": 0.6662, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.056365489959717, "rewards/margins": 1.2354947328567505, "rewards/rejected": -3.2918601036071777, "sft_loss": 1.9954465627670288, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 4.579347021717172, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.15530741214752197, "logits/rejected": 0.010155891068279743, "logps/chosen": -2.1392271518707275, "logps/rejected": -3.348783493041992, "loss": 0.6705, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1392271518707275, "rewards/margins": 1.2095563411712646, "rewards/rejected": -3.348783493041992, "sft_loss": 2.191584348678589, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 3.485684798487018, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.1419631689786911, "logits/rejected": -0.03920459747314453, "logps/chosen": -2.076598882675171, "logps/rejected": -3.1227006912231445, "loss": 0.6739, "rewards/accuracies": 0.71875, "rewards/chosen": -2.076598882675171, "rewards/margins": 1.0461018085479736, "rewards/rejected": -3.1227006912231445, "sft_loss": 2.107247829437256, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 5.141057227980531, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.1832994967699051, "logits/rejected": -0.12847161293029785, "logps/chosen": -2.0571811199188232, "logps/rejected": -3.0328803062438965, "loss": 0.6719, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.0571811199188232, "rewards/margins": 0.9756988286972046, "rewards/rejected": -3.0328803062438965, "sft_loss": 2.0433449745178223, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 4.358330681533576, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.1776488721370697, "logits/rejected": -0.0038052662275731564, "logps/chosen": -2.1023917198181152, "logps/rejected": -3.284991502761841, "loss": 0.6646, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1023917198181152, "rewards/margins": 1.1825997829437256, "rewards/rejected": -3.284991502761841, "sft_loss": 2.1911160945892334, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 3.771162771811942, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.2590990960597992, "logits/rejected": -0.11647045612335205, "logps/chosen": -2.125926971435547, "logps/rejected": -2.983856201171875, "loss": 0.6686, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.125926971435547, "rewards/margins": 0.8579292297363281, "rewards/rejected": -2.983856201171875, "sft_loss": 2.19528865814209, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 5.391570825028533, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.23013600707054138, "logits/rejected": -0.09508788585662842, "logps/chosen": -1.9805049896240234, "logps/rejected": -3.3649487495422363, "loss": 0.6677, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9805049896240234, "rewards/margins": 1.3844436407089233, "rewards/rejected": -3.3649487495422363, "sft_loss": 2.0095901489257812, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 3.9702939952336394, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.21318873763084412, "logits/rejected": 0.009989452548325062, "logps/chosen": -2.0758414268493652, "logps/rejected": -3.3712425231933594, "loss": 0.6634, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0758414268493652, "rewards/margins": 1.2954013347625732, "rewards/rejected": -3.3712425231933594, "sft_loss": 2.166355848312378, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 4.827633181150157, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.15626654028892517, "logits/rejected": 0.003751030657440424, "logps/chosen": -2.1273741722106934, "logps/rejected": -3.2390360832214355, "loss": 0.6749, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1273741722106934, "rewards/margins": 1.1116619110107422, "rewards/rejected": -3.2390360832214355, "sft_loss": 2.157222270965576, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 3.2688554722893253, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.19700530171394348, "logits/rejected": -0.054456066340208054, "logps/chosen": -2.1807610988616943, "logps/rejected": -3.256981611251831, "loss": 0.68, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1807610988616943, "rewards/margins": 1.076220989227295, "rewards/rejected": -3.256981611251831, "sft_loss": 2.1460068225860596, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 4.060086699739296, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.1563759744167328, "logits/rejected": -0.007863004691898823, "logps/chosen": -2.1849801540374756, "logps/rejected": -3.558262586593628, "loss": 0.6654, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1849801540374756, "rewards/margins": 1.373282551765442, "rewards/rejected": -3.558262586593628, "sft_loss": 2.270066976547241, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 6.246636699435798, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.14790132641792297, "logits/rejected": 0.057421762496232986, "logps/chosen": -2.08307147026062, "logps/rejected": -3.0761396884918213, "loss": 0.6625, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.08307147026062, "rewards/margins": 0.9930680990219116, "rewards/rejected": -3.0761396884918213, "sft_loss": 2.1781067848205566, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 2.882130224247622, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.24506108462810516, "logits/rejected": -0.191410630941391, "logps/chosen": -2.0796127319335938, "logps/rejected": -3.255584716796875, "loss": 0.676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0796127319335938, "rewards/margins": 1.1759722232818604, "rewards/rejected": -3.255584716796875, "sft_loss": 2.0985941886901855, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 8.769729197358377, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.17417939007282257, "logits/rejected": -0.08329921960830688, "logps/chosen": -1.988851547241211, "logps/rejected": -3.1151936054229736, "loss": 0.677, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.988851547241211, "rewards/margins": 1.1263421773910522, "rewards/rejected": -3.1151936054229736, "sft_loss": 1.986951470375061, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 15.640877111904508, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.07740931212902069, "logits/rejected": 0.06860514730215073, "logps/chosen": -2.248150110244751, "logps/rejected": -3.5168323516845703, "loss": 0.6774, "rewards/accuracies": 0.6875, "rewards/chosen": -2.248150110244751, "rewards/margins": 1.2686822414398193, "rewards/rejected": -3.5168323516845703, "sft_loss": 2.257601261138916, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 3.701452759156924, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.12445513904094696, "logits/rejected": -0.004047292284667492, "logps/chosen": -2.089381217956543, "logps/rejected": -3.2651374340057373, "loss": 0.6758, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.089381217956543, "rewards/margins": 1.1757564544677734, "rewards/rejected": -3.2651374340057373, "sft_loss": 2.09254789352417, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 7.154716282601555, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.15927599370479584, "logits/rejected": -0.005575224757194519, "logps/chosen": -1.9210288524627686, "logps/rejected": -3.160552740097046, "loss": 0.6573, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9210288524627686, "rewards/margins": 1.239524006843567, "rewards/rejected": -3.160552740097046, "sft_loss": 1.9605824947357178, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 4.952289202829157, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.1863076090812683, "logits/rejected": 0.06800957769155502, "logps/chosen": -2.0291895866394043, "logps/rejected": -3.1924831867218018, "loss": 0.6632, "rewards/accuracies": 0.75, "rewards/chosen": -2.0291895866394043, "rewards/margins": 1.163293480873108, "rewards/rejected": -3.1924831867218018, "sft_loss": 1.9746549129486084, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 4.648816313592943, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.18942146003246307, "logits/rejected": -0.0317835696041584, "logps/chosen": -2.0402283668518066, "logps/rejected": -2.8962950706481934, "loss": 0.6878, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0402283668518066, "rewards/margins": 0.8560672998428345, "rewards/rejected": -2.8962950706481934, "sft_loss": 2.0892820358276367, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 4.5728506351169385, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.2015286386013031, "logits/rejected": -0.06532806903123856, "logps/chosen": -2.163422107696533, "logps/rejected": -3.1678988933563232, "loss": 0.6709, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.163422107696533, "rewards/margins": 1.0044764280319214, "rewards/rejected": -3.1678988933563232, "sft_loss": 2.1092007160186768, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 4.445299022884345, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.18331186473369598, "logits/rejected": -0.08265326172113419, "logps/chosen": -2.3359968662261963, "logps/rejected": -3.1260948181152344, "loss": 0.6873, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.3359968662261963, "rewards/margins": 0.7900980710983276, "rewards/rejected": -3.1260948181152344, "sft_loss": 2.2022814750671387, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 4.396326992507092, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.17815802991390228, "logits/rejected": -0.045660458505153656, "logps/chosen": -2.120434284210205, "logps/rejected": -2.9672372341156006, "loss": 0.6803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.120434284210205, "rewards/margins": 0.8468027114868164, "rewards/rejected": -2.9672372341156006, "sft_loss": 2.0805301666259766, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 3.536039764346822, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.12677282094955444, "logits/rejected": 0.017463652417063713, "logps/chosen": -2.0878491401672363, "logps/rejected": -3.0715315341949463, "loss": 0.6695, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0878491401672363, "rewards/margins": 0.9836824536323547, "rewards/rejected": -3.0715315341949463, "sft_loss": 2.0803754329681396, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 3.485078084719585, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.13141298294067383, "logits/rejected": -0.019352253526449203, "logps/chosen": -2.1474380493164062, "logps/rejected": -3.40448260307312, "loss": 0.6769, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1474380493164062, "rewards/margins": 1.2570445537567139, "rewards/rejected": -3.40448260307312, "sft_loss": 2.121006965637207, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 4.508835312403918, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.25155094265937805, "logits/rejected": -0.05297283083200455, "logps/chosen": -2.0105624198913574, "logps/rejected": -3.0212528705596924, "loss": 0.6742, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.0105624198913574, "rewards/margins": 1.0106905698776245, "rewards/rejected": -3.0212528705596924, "sft_loss": 2.00258731842041, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 5.373460727947873, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.14757230877876282, "logits/rejected": -0.06751175224781036, "logps/chosen": -1.9183895587921143, "logps/rejected": -3.0663979053497314, "loss": 0.6671, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9183895587921143, "rewards/margins": 1.1480082273483276, "rewards/rejected": -3.0663979053497314, "sft_loss": 1.9006916284561157, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 4.001601812949795, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.14795711636543274, "logits/rejected": 0.02044614776968956, "logps/chosen": -2.209221363067627, "logps/rejected": -3.196859836578369, "loss": 0.6687, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.209221363067627, "rewards/margins": 0.9876389503479004, "rewards/rejected": -3.196859836578369, "sft_loss": 2.2197983264923096, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 3.311482171020216, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.22143828868865967, "logits/rejected": -0.08104579150676727, "logps/chosen": -2.0433874130249023, "logps/rejected": -3.050140380859375, "loss": 0.6741, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.0433874130249023, "rewards/margins": 1.006752848625183, "rewards/rejected": -3.050140380859375, "sft_loss": 2.0471534729003906, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 4.445965037114739, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.11444427073001862, "logits/rejected": -0.059414125978946686, "logps/chosen": -2.22161602973938, "logps/rejected": -3.1719493865966797, "loss": 0.6792, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.22161602973938, "rewards/margins": 0.9503332376480103, "rewards/rejected": -3.1719493865966797, "sft_loss": 2.1201417446136475, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 5.143327483278184, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.20135203003883362, "logits/rejected": -0.02879125438630581, "logps/chosen": -2.1493937969207764, "logps/rejected": -3.2124671936035156, "loss": 0.6746, "rewards/accuracies": 0.75, "rewards/chosen": -2.1493937969207764, "rewards/margins": 1.0630733966827393, "rewards/rejected": -3.2124671936035156, "sft_loss": 2.1464171409606934, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 3.7711345709560766, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.13477244973182678, "logits/rejected": -0.09561160206794739, "logps/chosen": -2.101278781890869, "logps/rejected": -3.401787519454956, "loss": 0.6625, "rewards/accuracies": 0.78125, "rewards/chosen": -2.101278781890869, "rewards/margins": 1.300508975982666, "rewards/rejected": -3.401787519454956, "sft_loss": 2.103086233139038, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 13.488161310983283, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.13736936450004578, "logits/rejected": -0.01585746742784977, "logps/chosen": -1.9863735437393188, "logps/rejected": -3.251328229904175, "loss": 0.6707, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9863735437393188, "rewards/margins": 1.2649548053741455, "rewards/rejected": -3.251328229904175, "sft_loss": 1.9716198444366455, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 4.4555967248871085, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.18262706696987152, "logits/rejected": -0.09311272203922272, "logps/chosen": -2.151057243347168, "logps/rejected": -3.1605875492095947, "loss": 0.6733, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.151057243347168, "rewards/margins": 1.0095303058624268, "rewards/rejected": -3.1605875492095947, "sft_loss": 2.2076568603515625, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 8.275028499451532, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.15065720677375793, "logits/rejected": 0.0015823025023564696, "logps/chosen": -2.06017804145813, "logps/rejected": -3.1844449043273926, "loss": 0.6681, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.06017804145813, "rewards/margins": 1.1242671012878418, "rewards/rejected": -3.1844449043273926, "sft_loss": 2.1298396587371826, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 4.523229173230882, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.13953344523906708, "logits/rejected": -0.16973023116588593, "logps/chosen": -1.9981342554092407, "logps/rejected": -3.045314073562622, "loss": 0.6629, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9981342554092407, "rewards/margins": 1.0471800565719604, "rewards/rejected": -3.045314073562622, "sft_loss": 2.0397486686706543, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 3.7345807108239786, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.20351342856884003, "logits/rejected": -0.029998648911714554, "logps/chosen": -2.1162867546081543, "logps/rejected": -3.1932778358459473, "loss": 0.6769, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1162867546081543, "rewards/margins": 1.076991319656372, "rewards/rejected": -3.1932778358459473, "sft_loss": 2.173251152038574, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 3.9663568034487207, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.1681821644306183, "logits/rejected": -0.10534799098968506, "logps/chosen": -2.140448808670044, "logps/rejected": -3.256242275238037, "loss": 0.6662, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.140448808670044, "rewards/margins": 1.1157935857772827, "rewards/rejected": -3.256242275238037, "sft_loss": 2.1975722312927246, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 3.588701506766935, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.11192403733730316, "logits/rejected": 0.10734760761260986, "logps/chosen": -2.1018166542053223, "logps/rejected": -3.561558961868286, "loss": 0.6669, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.1018166542053223, "rewards/margins": 1.4597421884536743, "rewards/rejected": -3.561558961868286, "sft_loss": 2.1336729526519775, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 3.5016222528436303, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.2518225312232971, "logits/rejected": -0.07085306942462921, "logps/chosen": -2.039267063140869, "logps/rejected": -3.482922315597534, "loss": 0.6665, "rewards/accuracies": 0.75, "rewards/chosen": -2.039267063140869, "rewards/margins": 1.4436554908752441, "rewards/rejected": -3.482922315597534, "sft_loss": 2.106978178024292, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 6.226457792487666, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.253101646900177, "logits/rejected": -0.1483955681324005, "logps/chosen": -2.048860788345337, "logps/rejected": -3.3397223949432373, "loss": 0.667, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.048860788345337, "rewards/margins": 1.2908620834350586, "rewards/rejected": -3.3397223949432373, "sft_loss": 2.101193428039551, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 3.872483505026465, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.23167535662651062, "logits/rejected": -0.042937736958265305, "logps/chosen": -2.325857162475586, "logps/rejected": -3.220263719558716, "loss": 0.6945, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.325857162475586, "rewards/margins": 0.8944064378738403, "rewards/rejected": -3.220263719558716, "sft_loss": 2.371337413787842, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 3.8511845835646175, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.19250306487083435, "logits/rejected": -0.04675256460905075, "logps/chosen": -2.2092864513397217, "logps/rejected": -3.399085521697998, "loss": 0.6698, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2092864513397217, "rewards/margins": 1.1897990703582764, "rewards/rejected": -3.399085521697998, "sft_loss": 2.1123721599578857, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 3.7888654661200563, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.06298299133777618, "logits/rejected": 0.04790624603629112, "logps/chosen": -2.174638032913208, "logps/rejected": -3.4041781425476074, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": -2.174638032913208, "rewards/margins": 1.2295398712158203, "rewards/rejected": -3.4041781425476074, "sft_loss": 2.144359588623047, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 12.481381241745728, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.14639827609062195, "logits/rejected": -0.008682933636009693, "logps/chosen": -2.0565505027770996, "logps/rejected": -3.1733672618865967, "loss": 0.6718, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0565505027770996, "rewards/margins": 1.1168169975280762, "rewards/rejected": -3.1733672618865967, "sft_loss": 2.1009578704833984, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 4.628119591554495, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.1513335406780243, "logits/rejected": -0.08773148059844971, "logps/chosen": -2.12284779548645, "logps/rejected": -3.0121376514434814, "loss": 0.6872, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.12284779548645, "rewards/margins": 0.8892895579338074, "rewards/rejected": -3.0121376514434814, "sft_loss": 2.0487465858459473, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 3.281045074150346, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.20077288150787354, "logits/rejected": -0.06432090699672699, "logps/chosen": -2.0878376960754395, "logps/rejected": -3.4198544025421143, "loss": 0.6687, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0878376960754395, "rewards/margins": 1.3320167064666748, "rewards/rejected": -3.4198544025421143, "sft_loss": 2.0510482788085938, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 4.126740038960389, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.1603049337863922, "logits/rejected": -0.05132744461297989, "logps/chosen": -2.1984238624572754, "logps/rejected": -3.1398496627807617, "loss": 0.6756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1984238624572754, "rewards/margins": 0.9414256811141968, "rewards/rejected": -3.1398496627807617, "sft_loss": 2.188628911972046, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 5.464558142197646, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.1908133327960968, "logits/rejected": -0.06753195822238922, "logps/chosen": -2.2839598655700684, "logps/rejected": -3.1272454261779785, "loss": 0.6722, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.2839598655700684, "rewards/margins": 0.843285083770752, "rewards/rejected": -3.1272454261779785, "sft_loss": 2.335068464279175, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.12577809393405914, "eval_logits/rejected": 0.22317178547382355, "eval_logps/chosen": -2.161029815673828, "eval_logps/rejected": -3.202573776245117, "eval_loss": 0.6901692152023315, "eval_rewards/accuracies": 0.6906528472900391, "eval_rewards/chosen": -2.161029815673828, "eval_rewards/margins": 1.04154372215271, "eval_rewards/rejected": -3.202573776245117, "eval_runtime": 43.7133, "eval_samples_per_second": 30.769, "eval_sft_loss": 2.12894344329834, "eval_steps_per_second": 7.709, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 4.646692370921347, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.24269111454486847, "logits/rejected": -0.12880149483680725, "logps/chosen": -2.109504222869873, "logps/rejected": -3.1083571910858154, "loss": 0.6718, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.109504222869873, "rewards/margins": 0.9988533854484558, "rewards/rejected": -3.1083571910858154, "sft_loss": 2.1038758754730225, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 3.3794381952292922, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.1317947655916214, "logits/rejected": -0.016943860799074173, "logps/chosen": -2.104414939880371, "logps/rejected": -3.1304614543914795, "loss": 0.6772, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.104414939880371, "rewards/margins": 1.026046633720398, "rewards/rejected": -3.1304614543914795, "sft_loss": 2.0895590782165527, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 6.238982280162222, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.12169364839792252, "logits/rejected": 0.004199688322842121, "logps/chosen": -2.032968044281006, "logps/rejected": -3.2582428455352783, "loss": 0.6545, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.032968044281006, "rewards/margins": 1.225274682044983, "rewards/rejected": -3.2582428455352783, "sft_loss": 2.0511133670806885, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 3.549271865321555, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.11396322399377823, "logits/rejected": -0.01782546192407608, "logps/chosen": -2.1708621978759766, "logps/rejected": -2.92978572845459, "loss": 0.6774, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1708621978759766, "rewards/margins": 0.7589234113693237, "rewards/rejected": -2.92978572845459, "sft_loss": 2.1328911781311035, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 7.253675071341021, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.09767025709152222, "logits/rejected": 0.04703503102064133, "logps/chosen": -2.0461907386779785, "logps/rejected": -3.0559167861938477, "loss": 0.6645, "rewards/accuracies": 0.75, "rewards/chosen": -2.0461907386779785, "rewards/margins": 1.0097261667251587, "rewards/rejected": -3.0559167861938477, "sft_loss": 1.985823631286621, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 3.555662353812927, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.18815520405769348, "logits/rejected": -0.017928075045347214, "logps/chosen": -2.1906285285949707, "logps/rejected": -3.220522403717041, "loss": 0.6781, "rewards/accuracies": 0.75, "rewards/chosen": -2.1906285285949707, "rewards/margins": 1.0298939943313599, "rewards/rejected": -3.220522403717041, "sft_loss": 2.140667676925659, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 8.073554424832016, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.26414138078689575, "logits/rejected": -0.042422693222761154, "logps/chosen": -2.005246639251709, "logps/rejected": -3.2597591876983643, "loss": 0.6764, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.005246639251709, "rewards/margins": 1.2545123100280762, "rewards/rejected": -3.2597591876983643, "sft_loss": 2.057586669921875, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 5.1803684543611865, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.22406511008739471, "logits/rejected": 0.005454069469124079, "logps/chosen": -2.034522294998169, "logps/rejected": -3.4171080589294434, "loss": 0.6503, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.034522294998169, "rewards/margins": 1.382585883140564, "rewards/rejected": -3.4171080589294434, "sft_loss": 1.9969648122787476, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 2.955799743598282, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.12224028259515762, "logits/rejected": -0.038930658251047134, "logps/chosen": -2.140636920928955, "logps/rejected": -3.0933098793029785, "loss": 0.6755, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.140636920928955, "rewards/margins": 0.9526728391647339, "rewards/rejected": -3.0933098793029785, "sft_loss": 2.181284189224243, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 12.830745078811939, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.28581708669662476, "logits/rejected": -0.10724548995494843, "logps/chosen": -2.1298556327819824, "logps/rejected": -3.379368305206299, "loss": 0.676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1298556327819824, "rewards/margins": 1.2495124340057373, "rewards/rejected": -3.379368305206299, "sft_loss": 2.1332108974456787, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 10.47629999400417, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.10475417226552963, "logits/rejected": 0.034112442284822464, "logps/chosen": -2.120316505432129, "logps/rejected": -3.2288997173309326, "loss": 0.684, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.120316505432129, "rewards/margins": 1.1085835695266724, "rewards/rejected": -3.2288997173309326, "sft_loss": 2.154690980911255, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 4.804057544653204, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.16452094912528992, "logits/rejected": -0.09058734029531479, "logps/chosen": -1.9013557434082031, "logps/rejected": -3.1004385948181152, "loss": 0.6641, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9013557434082031, "rewards/margins": 1.199082612991333, "rewards/rejected": -3.1004385948181152, "sft_loss": 1.9540598392486572, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 3.8392080026324487, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.26380443572998047, "logits/rejected": -0.15460802614688873, "logps/chosen": -2.058915615081787, "logps/rejected": -3.2198386192321777, "loss": 0.6611, "rewards/accuracies": 0.78125, "rewards/chosen": -2.058915615081787, "rewards/margins": 1.1609232425689697, "rewards/rejected": -3.2198386192321777, "sft_loss": 2.057905912399292, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 3.7836159260299085, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.140593022108078, "logits/rejected": -0.01732134446501732, "logps/chosen": -2.0515201091766357, "logps/rejected": -3.2749295234680176, "loss": 0.6645, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0515201091766357, "rewards/margins": 1.2234095335006714, "rewards/rejected": -3.2749295234680176, "sft_loss": 2.0359392166137695, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 4.177419036329663, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.07713289558887482, "logits/rejected": 0.010256970301270485, "logps/chosen": -2.0393309593200684, "logps/rejected": -3.295485019683838, "loss": 0.6706, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0393309593200684, "rewards/margins": 1.2561534643173218, "rewards/rejected": -3.295485019683838, "sft_loss": 2.0710060596466064, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 7.009060968874998, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.11068640649318695, "logits/rejected": 0.017202546820044518, "logps/chosen": -1.9383538961410522, "logps/rejected": -3.306736469268799, "loss": 0.6492, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9383538961410522, "rewards/margins": 1.368382453918457, "rewards/rejected": -3.306736469268799, "sft_loss": 2.065255641937256, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 4.592285677487404, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.09724752604961395, "logits/rejected": -0.02438260242342949, "logps/chosen": -2.162257671356201, "logps/rejected": -3.315903902053833, "loss": 0.6655, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.162257671356201, "rewards/margins": 1.15364670753479, "rewards/rejected": -3.315903902053833, "sft_loss": 2.1836469173431396, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 4.442056230230379, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.29001709818840027, "logits/rejected": -0.12049128860235214, "logps/chosen": -2.086012363433838, "logps/rejected": -3.252751588821411, "loss": 0.6742, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.086012363433838, "rewards/margins": 1.1667394638061523, "rewards/rejected": -3.252751588821411, "sft_loss": 2.132532835006714, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 3.4297832843382627, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.1994694173336029, "logits/rejected": -0.09064370393753052, "logps/chosen": -2.1102771759033203, "logps/rejected": -3.0843772888183594, "loss": 0.666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1102771759033203, "rewards/margins": 0.9740999937057495, "rewards/rejected": -3.0843772888183594, "sft_loss": 2.186908006668091, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 2.297242655481256, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.1548094004392624, "logits/rejected": 0.023246418684720993, "logps/chosen": -2.0302085876464844, "logps/rejected": -3.1408843994140625, "loss": 0.6657, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0302085876464844, "rewards/margins": 1.1106754541397095, "rewards/rejected": -3.1408843994140625, "sft_loss": 2.12398362159729, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 6.874330951742991, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.17273852229118347, "logits/rejected": -0.015232471749186516, "logps/chosen": -2.0785112380981445, "logps/rejected": -3.3171119689941406, "loss": 0.6711, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0785112380981445, "rewards/margins": 1.238600492477417, "rewards/rejected": -3.3171119689941406, "sft_loss": 2.21134090423584, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 3.3100218930305703, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.15971055626869202, "logits/rejected": -0.1629440039396286, "logps/chosen": -2.245551109313965, "logps/rejected": -3.7228760719299316, "loss": 0.6756, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.245551109313965, "rewards/margins": 1.4773248434066772, "rewards/rejected": -3.7228760719299316, "sft_loss": 2.2304935455322266, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 4.223907754553806, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.12787704169750214, "logits/rejected": 0.016034701839089394, "logps/chosen": -1.905869722366333, "logps/rejected": -3.0717577934265137, "loss": 0.663, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.905869722366333, "rewards/margins": 1.1658880710601807, "rewards/rejected": -3.0717577934265137, "sft_loss": 1.9925057888031006, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 2.4128440736146444, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.17662464082241058, "logits/rejected": -0.08665060997009277, "logps/chosen": -2.156773328781128, "logps/rejected": -3.257002592086792, "loss": 0.6753, "rewards/accuracies": 0.6875, "rewards/chosen": -2.156773328781128, "rewards/margins": 1.100229024887085, "rewards/rejected": -3.257002592086792, "sft_loss": 2.113800048828125, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 4.026826762044653, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.30629870295524597, "logits/rejected": -0.033193111419677734, "logps/chosen": -2.021440029144287, "logps/rejected": -3.25608491897583, "loss": 0.6605, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.021440029144287, "rewards/margins": 1.234644889831543, "rewards/rejected": -3.25608491897583, "sft_loss": 2.0926735401153564, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 3.9647555176436122, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.1415877640247345, "logits/rejected": 0.04813080281019211, "logps/chosen": -2.2339959144592285, "logps/rejected": -3.5858314037323, "loss": 0.6696, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2339959144592285, "rewards/margins": 1.3518354892730713, "rewards/rejected": -3.5858314037323, "sft_loss": 2.1474475860595703, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 10.246014656143823, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.21439452469348907, "logits/rejected": -0.03963133692741394, "logps/chosen": -2.071315050125122, "logps/rejected": -3.20759654045105, "loss": 0.6625, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.071315050125122, "rewards/margins": 1.1362817287445068, "rewards/rejected": -3.20759654045105, "sft_loss": 1.9997138977050781, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 3.1955843244141318, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.18687336146831512, "logits/rejected": -0.11777372658252716, "logps/chosen": -1.9985196590423584, "logps/rejected": -2.8534021377563477, "loss": 0.6598, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9985196590423584, "rewards/margins": 0.8548822402954102, "rewards/rejected": -2.8534021377563477, "sft_loss": 1.9559974670410156, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 3.9897517262786515, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.17536196112632751, "logits/rejected": 0.039307206869125366, "logps/chosen": -2.0607361793518066, "logps/rejected": -3.2855000495910645, "loss": 0.6765, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0607361793518066, "rewards/margins": 1.2247636318206787, "rewards/rejected": -3.2855000495910645, "sft_loss": 2.0301108360290527, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 5.34316199107696, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.16160908341407776, "logits/rejected": -0.0008482426637783647, "logps/chosen": -2.192845582962036, "logps/rejected": -3.215425491333008, "loss": 0.6606, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.192845582962036, "rewards/margins": 1.0225796699523926, "rewards/rejected": -3.215425491333008, "sft_loss": 2.0834009647369385, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 3.9595536843580024, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.14186866581439972, "logits/rejected": -0.09743430465459824, "logps/chosen": -2.1751301288604736, "logps/rejected": -3.1686172485351562, "loss": 0.6676, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1751301288604736, "rewards/margins": 0.9934871792793274, "rewards/rejected": -3.1686172485351562, "sft_loss": 2.1217844486236572, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 3.10446590032169, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.11204180866479874, "logits/rejected": -0.017457684502005577, "logps/chosen": -2.005864381790161, "logps/rejected": -3.0677313804626465, "loss": 0.6677, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.005864381790161, "rewards/margins": 1.061867117881775, "rewards/rejected": -3.0677313804626465, "sft_loss": 2.0568666458129883, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 5.07956589232817, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.22514183819293976, "logits/rejected": -0.07763149589300156, "logps/chosen": -1.9423367977142334, "logps/rejected": -3.1683859825134277, "loss": 0.6573, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9423367977142334, "rewards/margins": 1.2260488271713257, "rewards/rejected": -3.1683859825134277, "sft_loss": 2.053755760192871, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 4.32502942897498, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.08983586728572845, "logits/rejected": -0.0068774139508605, "logps/chosen": -2.064767837524414, "logps/rejected": -3.119546413421631, "loss": 0.6704, "rewards/accuracies": 0.75, "rewards/chosen": -2.064767837524414, "rewards/margins": 1.0547785758972168, "rewards/rejected": -3.119546413421631, "sft_loss": 1.9203109741210938, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 4.413263326536183, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.17833557724952698, "logits/rejected": -0.13192901015281677, "logps/chosen": -2.1899287700653076, "logps/rejected": -3.389059066772461, "loss": 0.6762, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1899287700653076, "rewards/margins": 1.1991302967071533, "rewards/rejected": -3.389059066772461, "sft_loss": 2.1803905963897705, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 3.524914844836635, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.05446472764015198, "logits/rejected": -0.00417535612359643, "logps/chosen": -2.3293063640594482, "logps/rejected": -3.2744858264923096, "loss": 0.6913, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3293063640594482, "rewards/margins": 0.9451791644096375, "rewards/rejected": -3.2744858264923096, "sft_loss": 2.2219605445861816, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 5.762332946004345, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.10473129898309708, "logits/rejected": 0.06362743675708771, "logps/chosen": -2.1023190021514893, "logps/rejected": -3.534928560256958, "loss": 0.6678, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1023190021514893, "rewards/margins": 1.4326095581054688, "rewards/rejected": -3.534928560256958, "sft_loss": 2.063044309616089, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 2.864403064356372, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.23619285225868225, "logits/rejected": -0.06422014534473419, "logps/chosen": -1.9801371097564697, "logps/rejected": -3.2924282550811768, "loss": 0.6569, "rewards/accuracies": 0.75, "rewards/chosen": -1.9801371097564697, "rewards/margins": 1.3122915029525757, "rewards/rejected": -3.2924282550811768, "sft_loss": 2.0358376502990723, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 6.884835339263708, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.2065448760986328, "logits/rejected": -0.18419429659843445, "logps/chosen": -2.1128482818603516, "logps/rejected": -2.9955358505249023, "loss": 0.6805, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1128482818603516, "rewards/margins": 0.8826876878738403, "rewards/rejected": -2.9955358505249023, "sft_loss": 2.17033314704895, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 6.121407517260863, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.17922380566596985, "logits/rejected": -0.11081000417470932, "logps/chosen": -2.2198586463928223, "logps/rejected": -3.308467149734497, "loss": 0.6715, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.2198586463928223, "rewards/margins": 1.0886080265045166, "rewards/rejected": -3.308467149734497, "sft_loss": 2.248330593109131, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 3.846489276009459, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.10410068184137344, "logits/rejected": -0.05124828964471817, "logps/chosen": -2.101375102996826, "logps/rejected": -3.006727695465088, "loss": 0.6836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.101375102996826, "rewards/margins": 0.9053524136543274, "rewards/rejected": -3.006727695465088, "sft_loss": 2.0615711212158203, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 6.171461958311083, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.10195468366146088, "logits/rejected": -0.024979958310723305, "logps/chosen": -2.2979681491851807, "logps/rejected": -3.2801547050476074, "loss": 0.6826, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2979681491851807, "rewards/margins": 0.9821867942810059, "rewards/rejected": -3.2801547050476074, "sft_loss": 2.202646493911743, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 5.20333662410297, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.1650058478116989, "logits/rejected": -0.07016507536172867, "logps/chosen": -2.2075905799865723, "logps/rejected": -3.2428231239318848, "loss": 0.6793, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.2075905799865723, "rewards/margins": 1.035232424736023, "rewards/rejected": -3.2428231239318848, "sft_loss": 2.0438759326934814, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 5.01306261947066, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.22710001468658447, "logits/rejected": -0.014016309753060341, "logps/chosen": -2.091968536376953, "logps/rejected": -3.1881086826324463, "loss": 0.68, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.091968536376953, "rewards/margins": 1.0961401462554932, "rewards/rejected": -3.1881086826324463, "sft_loss": 2.142371892929077, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 4.708937816887113, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.18712188303470612, "logits/rejected": -0.035339899361133575, "logps/chosen": -2.1658287048339844, "logps/rejected": -3.5366687774658203, "loss": 0.6799, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1658287048339844, "rewards/margins": 1.3708401918411255, "rewards/rejected": -3.5366687774658203, "sft_loss": 2.1351940631866455, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 3.8811218292727174, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.1747264415025711, "logits/rejected": -0.08946724981069565, "logps/chosen": -2.160559892654419, "logps/rejected": -3.4428908824920654, "loss": 0.6665, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.160559892654419, "rewards/margins": 1.2823314666748047, "rewards/rejected": -3.4428908824920654, "sft_loss": 2.0684890747070312, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 4.207281726952039, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.18225672841072083, "logits/rejected": -0.04414839297533035, "logps/chosen": -2.2771077156066895, "logps/rejected": -3.3128814697265625, "loss": 0.6761, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2771077156066895, "rewards/margins": 1.0357741117477417, "rewards/rejected": -3.3128814697265625, "sft_loss": 2.263978958129883, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 4.521748523943575, "learning_rate": 3.054553457834053e-08, "logits/chosen": 0.00856291688978672, "logits/rejected": -0.02341640368103981, "logps/chosen": -2.138507843017578, "logps/rejected": -3.337174892425537, "loss": 0.6715, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.138507843017578, "rewards/margins": 1.1986671686172485, "rewards/rejected": -3.337174892425537, "sft_loss": 2.0876195430755615, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 4.552636861595934, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.09765835106372833, "logits/rejected": -0.07042871415615082, "logps/chosen": -2.264831781387329, "logps/rejected": -3.0682289600372314, "loss": 0.6674, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.264831781387329, "rewards/margins": 0.8033971786499023, "rewards/rejected": -3.0682289600372314, "sft_loss": 2.1140804290771484, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 4.856819480189408, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.1874663084745407, "logits/rejected": -0.09940935671329498, "logps/chosen": -1.9742987155914307, "logps/rejected": -2.9669032096862793, "loss": 0.6598, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9742987155914307, "rewards/margins": 0.9926045536994934, "rewards/rejected": -2.9669032096862793, "sft_loss": 2.029134750366211, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 4.615891223624692, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.2509460747241974, "logits/rejected": -0.15617252886295319, "logps/chosen": -1.9862926006317139, "logps/rejected": -3.0963289737701416, "loss": 0.6638, "rewards/accuracies": 0.75, "rewards/chosen": -1.9862926006317139, "rewards/margins": 1.1100363731384277, "rewards/rejected": -3.0963289737701416, "sft_loss": 1.9945812225341797, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 5.761371226735249, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.17955382168293, "logits/rejected": -0.08448338508605957, "logps/chosen": -2.048438310623169, "logps/rejected": -3.0260777473449707, "loss": 0.6678, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.048438310623169, "rewards/margins": 0.9776394963264465, "rewards/rejected": -3.0260777473449707, "sft_loss": 2.1401984691619873, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 6.856236650179769, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.18293629586696625, "logits/rejected": 0.07218710333108902, "logps/chosen": -2.066840648651123, "logps/rejected": -3.114772319793701, "loss": 0.667, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.066840648651123, "rewards/margins": 1.0479316711425781, "rewards/rejected": -3.114772319793701, "sft_loss": 2.051462411880493, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 5.344510378844355, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.08438555151224136, "logits/rejected": 0.05671966075897217, "logps/chosen": -1.9172179698944092, "logps/rejected": -2.8855223655700684, "loss": 0.6601, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9172179698944092, "rewards/margins": 0.9683046340942383, "rewards/rejected": -2.8855223655700684, "sft_loss": 1.979569673538208, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 6.607044123365808, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.10938652604818344, "logits/rejected": -0.02503049001097679, "logps/chosen": -2.0341567993164062, "logps/rejected": -2.8601226806640625, "loss": 0.6792, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0341567993164062, "rewards/margins": 0.8259660005569458, "rewards/rejected": -2.8601226806640625, "sft_loss": 2.113157272338867, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 3.6293731347471696, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.20525026321411133, "logits/rejected": -0.07308445870876312, "logps/chosen": -2.0696821212768555, "logps/rejected": -3.2235043048858643, "loss": 0.6753, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0696821212768555, "rewards/margins": 1.1538223028182983, "rewards/rejected": -3.2235043048858643, "sft_loss": 2.0911519527435303, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 5.723829432538651, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.237153097987175, "logits/rejected": -0.13139821588993073, "logps/chosen": -2.0324325561523438, "logps/rejected": -3.28680157661438, "loss": 0.6517, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0324325561523438, "rewards/margins": 1.254368782043457, "rewards/rejected": -3.28680157661438, "sft_loss": 2.0716185569763184, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 5.024835726782162, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.275637686252594, "logits/rejected": -0.0931491032242775, "logps/chosen": -2.0387635231018066, "logps/rejected": -3.2675564289093018, "loss": 0.6721, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0387635231018066, "rewards/margins": 1.2287932634353638, "rewards/rejected": -3.2675564289093018, "sft_loss": 2.030611991882324, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 4.568697100727387, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.3096490800380707, "logits/rejected": -0.0954984650015831, "logps/chosen": -2.0408742427825928, "logps/rejected": -3.3121063709259033, "loss": 0.6753, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0408742427825928, "rewards/margins": 1.2712323665618896, "rewards/rejected": -3.3121063709259033, "sft_loss": 2.074384927749634, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 8.451808031506655, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.10867484658956528, "logits/rejected": 0.04849391430616379, "logps/chosen": -2.089449405670166, "logps/rejected": -3.361520767211914, "loss": 0.6668, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.089449405670166, "rewards/margins": 1.2720708847045898, "rewards/rejected": -3.361520767211914, "sft_loss": 1.99267578125, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 5.448757879157279, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.1507568210363388, "logits/rejected": 0.02013157121837139, "logps/chosen": -2.142974853515625, "logps/rejected": -3.375558376312256, "loss": 0.6727, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.142974853515625, "rewards/margins": 1.2325836420059204, "rewards/rejected": -3.375558376312256, "sft_loss": 2.1336617469787598, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 4.929875341258377, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.18589094281196594, "logits/rejected": -0.05891326814889908, "logps/chosen": -2.130286455154419, "logps/rejected": -3.1513850688934326, "loss": 0.6705, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.130286455154419, "rewards/margins": 1.0210990905761719, "rewards/rejected": -3.1513850688934326, "sft_loss": 2.0535569190979004, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 3.6774377763388264, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.24028301239013672, "logits/rejected": -0.041994038969278336, "logps/chosen": -2.1880526542663574, "logps/rejected": -3.347641706466675, "loss": 0.6687, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1880526542663574, "rewards/margins": 1.159589171409607, "rewards/rejected": -3.347641706466675, "sft_loss": 2.2079193592071533, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 4.211134532604004, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.17090514302253723, "logits/rejected": -0.08920306712388992, "logps/chosen": -2.0383031368255615, "logps/rejected": -3.322711229324341, "loss": 0.6637, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0383031368255615, "rewards/margins": 1.2844078540802002, "rewards/rejected": -3.322711229324341, "sft_loss": 2.0598537921905518, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 11.983066215876969, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.2979956567287445, "logits/rejected": -0.04316394031047821, "logps/chosen": -2.1104862689971924, "logps/rejected": -3.100170850753784, "loss": 0.662, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1104862689971924, "rewards/margins": 0.9896847009658813, "rewards/rejected": -3.100170850753784, "sft_loss": 2.078278064727783, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 3.8671914495005995, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.2927331328392029, "logits/rejected": -0.07540827989578247, "logps/chosen": -1.9981857538223267, "logps/rejected": -3.385162353515625, "loss": 0.6635, "rewards/accuracies": 0.75, "rewards/chosen": -1.9981857538223267, "rewards/margins": 1.386976957321167, "rewards/rejected": -3.385162353515625, "sft_loss": 2.082742214202881, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 4.398321955741719, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.08732561022043228, "logits/rejected": -0.007392602507025003, "logps/chosen": -2.091261625289917, "logps/rejected": -3.1702945232391357, "loss": 0.6771, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.091261625289917, "rewards/margins": 1.0790328979492188, "rewards/rejected": -3.1702945232391357, "sft_loss": 2.105546712875366, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 4.285963114631182, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.1719670295715332, "logits/rejected": -0.026178916916251183, "logps/chosen": -2.0739388465881348, "logps/rejected": -3.075754165649414, "loss": 0.6792, "rewards/accuracies": 0.75, "rewards/chosen": -2.0739388465881348, "rewards/margins": 1.0018149614334106, "rewards/rejected": -3.075754165649414, "sft_loss": 2.021848678588867, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 5.593728784516396, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.1162596195936203, "logits/rejected": 0.04145939648151398, "logps/chosen": -2.101473331451416, "logps/rejected": -2.9324426651000977, "loss": 0.6853, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.101473331451416, "rewards/margins": 0.8309692144393921, "rewards/rejected": -2.9324426651000977, "sft_loss": 2.1028060913085938, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 2.714625920065556, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.1818411946296692, "logits/rejected": -0.051187075674533844, "logps/chosen": -1.9758121967315674, "logps/rejected": -3.1372430324554443, "loss": 0.6689, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.9758121967315674, "rewards/margins": 1.161431074142456, "rewards/rejected": -3.1372430324554443, "sft_loss": 1.9856704473495483, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 6.621918318043308, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.18781431019306183, "logits/rejected": -0.09680631011724472, "logps/chosen": -2.171119213104248, "logps/rejected": -3.0972466468811035, "loss": 0.6712, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.171119213104248, "rewards/margins": 0.9261270761489868, "rewards/rejected": -3.0972466468811035, "sft_loss": 2.2441534996032715, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 7.4101187711934005, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.1264840066432953, "logits/rejected": -0.06264735013246536, "logps/chosen": -1.9795137643814087, "logps/rejected": -3.176384210586548, "loss": 0.6416, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9795137643814087, "rewards/margins": 1.1968704462051392, "rewards/rejected": -3.176384210586548, "sft_loss": 2.0139267444610596, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 3.6986878836122234, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.1670091450214386, "logits/rejected": 0.02274545654654503, "logps/chosen": -1.9214379787445068, "logps/rejected": -3.264592409133911, "loss": 0.6614, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.9214379787445068, "rewards/margins": 1.3431540727615356, "rewards/rejected": -3.264592409133911, "sft_loss": 1.9861595630645752, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 5.829180454278915, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.1309589147567749, "logits/rejected": 0.01893536187708378, "logps/chosen": -2.15108060836792, "logps/rejected": -3.21575927734375, "loss": 0.6659, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.15108060836792, "rewards/margins": 1.0646789073944092, "rewards/rejected": -3.21575927734375, "sft_loss": 2.08768367767334, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 5.306349322069451, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.17061103880405426, "logits/rejected": -0.07805173099040985, "logps/chosen": -2.105132818222046, "logps/rejected": -3.3097457885742188, "loss": 0.6663, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.105132818222046, "rewards/margins": 1.2046130895614624, "rewards/rejected": -3.3097457885742188, "sft_loss": 2.0536954402923584, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 3.9876647499108, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.09365290403366089, "logits/rejected": -0.04905001074075699, "logps/chosen": -2.103774070739746, "logps/rejected": -3.1053762435913086, "loss": 0.6863, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.103774070739746, "rewards/margins": 1.001602292060852, "rewards/rejected": -3.1053762435913086, "sft_loss": 2.179610252380371, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 5.067724361905486, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.12385039031505585, "logits/rejected": 0.015540236607193947, "logps/chosen": -2.0763049125671387, "logps/rejected": -3.225480318069458, "loss": 0.668, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0763049125671387, "rewards/margins": 1.149175763130188, "rewards/rejected": -3.225480318069458, "sft_loss": 2.1354317665100098, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 6.4925921373426885, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.25209927558898926, "logits/rejected": -0.06769673526287079, "logps/chosen": -1.9998257160186768, "logps/rejected": -3.136584520339966, "loss": 0.6592, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9998257160186768, "rewards/margins": 1.1367586851119995, "rewards/rejected": -3.136584520339966, "sft_loss": 2.1053662300109863, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 9.215554809878828, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.1382247507572174, "logits/rejected": 0.034549959003925323, "logps/chosen": -2.05765962600708, "logps/rejected": -3.0788419246673584, "loss": 0.6662, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.05765962600708, "rewards/margins": 1.0211824178695679, "rewards/rejected": -3.0788419246673584, "sft_loss": 2.08469820022583, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 4.504764705430222, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.22649991512298584, "logits/rejected": -0.06499792635440826, "logps/chosen": -1.9972549676895142, "logps/rejected": -3.181198835372925, "loss": 0.6701, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9972549676895142, "rewards/margins": 1.1839439868927002, "rewards/rejected": -3.181198835372925, "sft_loss": 2.0632407665252686, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.14197993278503418, "eval_logits/rejected": 0.24067111313343048, "eval_logps/chosen": -2.169867753982544, "eval_logps/rejected": -3.226255416870117, "eval_loss": 0.6903512477874756, "eval_rewards/accuracies": 0.6928783655166626, "eval_rewards/chosen": -2.169867753982544, "eval_rewards/margins": 1.0563876628875732, "eval_rewards/rejected": -3.226255416870117, "eval_runtime": 43.3586, "eval_samples_per_second": 31.02, "eval_sft_loss": 2.132870674133301, "eval_steps_per_second": 7.772, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 5.01427272437642, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.12897427380084991, "logits/rejected": -0.10188676416873932, "logps/chosen": -1.9897922277450562, "logps/rejected": -3.2146172523498535, "loss": 0.6593, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.9897922277450562, "rewards/margins": 1.224825143814087, "rewards/rejected": -3.2146172523498535, "sft_loss": 1.9856764078140259, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 4.55113254249339, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.17350664734840393, "logits/rejected": -0.0433095246553421, "logps/chosen": -2.182356834411621, "logps/rejected": -3.5557281970977783, "loss": 0.6616, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.182356834411621, "rewards/margins": 1.373370885848999, "rewards/rejected": -3.5557281970977783, "sft_loss": 2.0754942893981934, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 3.6117906709693277, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.14435936510562897, "logits/rejected": -0.05727745220065117, "logps/chosen": -1.96018385887146, "logps/rejected": -3.2339508533477783, "loss": 0.6582, "rewards/accuracies": 0.78125, "rewards/chosen": -1.96018385887146, "rewards/margins": 1.273767113685608, "rewards/rejected": -3.2339508533477783, "sft_loss": 2.011904001235962, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 5.559407975672463, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.1844475120306015, "logits/rejected": -0.08850296586751938, "logps/chosen": -1.9547264575958252, "logps/rejected": -3.431126832962036, "loss": 0.6546, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9547264575958252, "rewards/margins": 1.476400375366211, "rewards/rejected": -3.431126832962036, "sft_loss": 2.0375938415527344, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 4.223608667170843, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.11957116425037384, "logits/rejected": 0.04231054708361626, "logps/chosen": -2.1211135387420654, "logps/rejected": -3.2857651710510254, "loss": 0.6721, "rewards/accuracies": 0.75, "rewards/chosen": -2.1211135387420654, "rewards/margins": 1.1646511554718018, "rewards/rejected": -3.2857651710510254, "sft_loss": 2.0883028507232666, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 3.438797614567151, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.25931066274642944, "logits/rejected": -0.09509231150150299, "logps/chosen": -2.122140884399414, "logps/rejected": -3.284405469894409, "loss": 0.6714, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.122140884399414, "rewards/margins": 1.1622647047042847, "rewards/rejected": -3.284405469894409, "sft_loss": 2.0123064517974854, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 2.921720735308949, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.18616695702075958, "logits/rejected": -0.047292523086071014, "logps/chosen": -2.049342393875122, "logps/rejected": -3.173543930053711, "loss": 0.6723, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.049342393875122, "rewards/margins": 1.124201774597168, "rewards/rejected": -3.173543930053711, "sft_loss": 2.0238633155822754, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 5.980546682249523, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.19059044122695923, "logits/rejected": 0.021953938528895378, "logps/chosen": -2.1133408546447754, "logps/rejected": -3.2749500274658203, "loss": 0.6673, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1133408546447754, "rewards/margins": 1.1616089344024658, "rewards/rejected": -3.2749500274658203, "sft_loss": 2.0945544242858887, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 4.140174426760411, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.13396288454532623, "logits/rejected": -0.058636974543333054, "logps/chosen": -2.233964204788208, "logps/rejected": -3.0428805351257324, "loss": 0.6813, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.233964204788208, "rewards/margins": 0.8089162111282349, "rewards/rejected": -3.0428805351257324, "sft_loss": 2.2309727668762207, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 3.654201317344783, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.2601509094238281, "logits/rejected": -0.11655745655298233, "logps/chosen": -2.1270840167999268, "logps/rejected": -3.2105839252471924, "loss": 0.6753, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1270840167999268, "rewards/margins": 1.0835001468658447, "rewards/rejected": -3.2105839252471924, "sft_loss": 2.140364170074463, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 4.393285336057422, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.19051814079284668, "logits/rejected": 0.030599230900406837, "logps/chosen": -2.1062216758728027, "logps/rejected": -3.3283398151397705, "loss": 0.675, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1062216758728027, "rewards/margins": 1.2221177816390991, "rewards/rejected": -3.3283398151397705, "sft_loss": 2.1199615001678467, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 6.197558219138, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.17727169394493103, "logits/rejected": -0.10792503505945206, "logps/chosen": -2.0642919540405273, "logps/rejected": -3.2012767791748047, "loss": 0.6715, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0642919540405273, "rewards/margins": 1.1369847059249878, "rewards/rejected": -3.2012767791748047, "sft_loss": 2.1764206886291504, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 4.821366071225293, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.2428952008485794, "logits/rejected": -0.015482127666473389, "logps/chosen": -2.207315683364868, "logps/rejected": -3.2127952575683594, "loss": 0.6831, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.207315683364868, "rewards/margins": 1.0054795742034912, "rewards/rejected": -3.2127952575683594, "sft_loss": 2.2028324604034424, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 4.326387223529835, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.15027108788490295, "logits/rejected": 0.008444221690297127, "logps/chosen": -2.0687239170074463, "logps/rejected": -3.254166841506958, "loss": 0.6625, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.0687239170074463, "rewards/margins": 1.1854428052902222, "rewards/rejected": -3.254166841506958, "sft_loss": 2.0933444499969482, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 4.647620075385404, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.12983162701129913, "logits/rejected": 0.013550333678722382, "logps/chosen": -2.0212950706481934, "logps/rejected": -3.0738463401794434, "loss": 0.6721, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0212950706481934, "rewards/margins": 1.0525516271591187, "rewards/rejected": -3.0738463401794434, "sft_loss": 2.0059714317321777, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 4.388856429894361, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.1318751871585846, "logits/rejected": 0.022982800379395485, "logps/chosen": -2.2139945030212402, "logps/rejected": -3.544062376022339, "loss": 0.6743, "rewards/accuracies": 0.75, "rewards/chosen": -2.2139945030212402, "rewards/margins": 1.3300679922103882, "rewards/rejected": -3.544062376022339, "sft_loss": 2.1635942459106445, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 3.275829245543984, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.19172796607017517, "logits/rejected": -0.0724857896566391, "logps/chosen": -2.0069081783294678, "logps/rejected": -3.423060655593872, "loss": 0.6649, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.0069081783294678, "rewards/margins": 1.4161527156829834, "rewards/rejected": -3.423060655593872, "sft_loss": 2.022608518600464, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 6.76724206800637, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.13732025027275085, "logits/rejected": -0.0019424870843067765, "logps/chosen": -2.0925679206848145, "logps/rejected": -3.043700695037842, "loss": 0.6773, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0925679206848145, "rewards/margins": 0.951132595539093, "rewards/rejected": -3.043700695037842, "sft_loss": 2.095527172088623, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 3.6575800369627136, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.2242976427078247, "logits/rejected": -0.010512808337807655, "logps/chosen": -2.2502903938293457, "logps/rejected": -3.3423380851745605, "loss": 0.6925, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.2502903938293457, "rewards/margins": 1.0920478105545044, "rewards/rejected": -3.3423380851745605, "sft_loss": 2.207596778869629, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 3.4687013378185605, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.1519078016281128, "logits/rejected": -0.08015980571508408, "logps/chosen": -2.031989097595215, "logps/rejected": -3.466524124145508, "loss": 0.6633, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.031989097595215, "rewards/margins": 1.4345349073410034, "rewards/rejected": -3.466524124145508, "sft_loss": 1.9486396312713623, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 6.424838703496807, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.1546718329191208, "logits/rejected": -0.04978444427251816, "logps/chosen": -2.0906074047088623, "logps/rejected": -3.186412811279297, "loss": 0.6762, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0906074047088623, "rewards/margins": 1.0958056449890137, "rewards/rejected": -3.186412811279297, "sft_loss": 2.111370086669922, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 4.525221256770394, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.2863996624946594, "logits/rejected": -0.04077654331922531, "logps/chosen": -2.080845355987549, "logps/rejected": -3.4454243183135986, "loss": 0.6739, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.080845355987549, "rewards/margins": 1.3645789623260498, "rewards/rejected": -3.4454243183135986, "sft_loss": 2.1444432735443115, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 7.202187239597557, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.09252846240997314, "logits/rejected": 0.09165690839290619, "logps/chosen": -2.1996304988861084, "logps/rejected": -3.0849857330322266, "loss": 0.6769, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1996304988861084, "rewards/margins": 0.8853553533554077, "rewards/rejected": -3.0849857330322266, "sft_loss": 2.258697509765625, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 6.146370444393983, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.1276165395975113, "logits/rejected": -0.04356785863637924, "logps/chosen": -2.0275347232818604, "logps/rejected": -3.036647319793701, "loss": 0.6682, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0275347232818604, "rewards/margins": 1.0091129541397095, "rewards/rejected": -3.036647319793701, "sft_loss": 1.960352897644043, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 5.173531913065765, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.12462538480758667, "logits/rejected": 0.04870520159602165, "logps/chosen": -2.121428966522217, "logps/rejected": -3.3238697052001953, "loss": 0.6681, "rewards/accuracies": 0.71875, "rewards/chosen": -2.121428966522217, "rewards/margins": 1.2024410963058472, "rewards/rejected": -3.3238697052001953, "sft_loss": 2.0240511894226074, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 6.456068348454724, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.055983882397413254, "logits/rejected": 0.03901743143796921, "logps/chosen": -2.1341800689697266, "logps/rejected": -3.2898013591766357, "loss": 0.6686, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1341800689697266, "rewards/margins": 1.1556214094161987, "rewards/rejected": -3.2898013591766357, "sft_loss": 2.1365959644317627, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 4.565655643066571, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.08348926156759262, "logits/rejected": -0.09771665930747986, "logps/chosen": -2.1372008323669434, "logps/rejected": -3.0167839527130127, "loss": 0.672, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1372008323669434, "rewards/margins": 0.8795830011367798, "rewards/rejected": -3.0167839527130127, "sft_loss": 2.1172802448272705, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 3.475641530879466, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.12629814445972443, "logits/rejected": -0.07990659773349762, "logps/chosen": -1.9846584796905518, "logps/rejected": -3.060459613800049, "loss": 0.6755, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.9846584796905518, "rewards/margins": 1.0758014917373657, "rewards/rejected": -3.060459613800049, "sft_loss": 1.912102460861206, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 4.152446641844467, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.25416839122772217, "logits/rejected": -0.05416093021631241, "logps/chosen": -1.9099677801132202, "logps/rejected": -3.2682998180389404, "loss": 0.6565, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9099677801132202, "rewards/margins": 1.3583317995071411, "rewards/rejected": -3.2682998180389404, "sft_loss": 1.9788919687271118, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 4.1051953454734, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.050106167793273926, "logits/rejected": -0.03766874969005585, "logps/chosen": -2.2540361881256104, "logps/rejected": -3.149855375289917, "loss": 0.6822, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2540361881256104, "rewards/margins": 0.8958193063735962, "rewards/rejected": -3.149855375289917, "sft_loss": 2.156536102294922, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 4.7264036002658925, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.11822198331356049, "logits/rejected": -0.03527585044503212, "logps/chosen": -2.1379916667938232, "logps/rejected": -3.0635056495666504, "loss": 0.6881, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1379916667938232, "rewards/margins": 0.9255143404006958, "rewards/rejected": -3.0635056495666504, "sft_loss": 2.182905435562134, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 9.642319285917505, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.15814802050590515, "logits/rejected": 0.04452654719352722, "logps/chosen": -2.0034031867980957, "logps/rejected": -3.0512545108795166, "loss": 0.6707, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0034031867980957, "rewards/margins": 1.047851324081421, "rewards/rejected": -3.0512545108795166, "sft_loss": 2.0460562705993652, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 4.819242088785797, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.22122152149677277, "logits/rejected": -0.08308999240398407, "logps/chosen": -2.0886685848236084, "logps/rejected": -3.479623317718506, "loss": 0.6608, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0886685848236084, "rewards/margins": 1.3909549713134766, "rewards/rejected": -3.479623317718506, "sft_loss": 2.0830681324005127, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 4.6328032363780505, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.06698968261480331, "logits/rejected": -0.04075399041175842, "logps/chosen": -2.1402573585510254, "logps/rejected": -2.9639697074890137, "loss": 0.6837, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1402573585510254, "rewards/margins": 0.823712170124054, "rewards/rejected": -2.9639697074890137, "sft_loss": 2.119158983230591, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 4.904844953180419, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.14949668943881989, "logits/rejected": -0.03697948530316353, "logps/chosen": -1.9812753200531006, "logps/rejected": -3.7004337310791016, "loss": 0.6552, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9812753200531006, "rewards/margins": 1.7191585302352905, "rewards/rejected": -3.7004337310791016, "sft_loss": 2.085773468017578, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 3.29372844625657, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.1672995388507843, "logits/rejected": 0.05775245279073715, "logps/chosen": -2.269740581512451, "logps/rejected": -3.2039215564727783, "loss": 0.6773, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.269740581512451, "rewards/margins": 0.9341810941696167, "rewards/rejected": -3.2039215564727783, "sft_loss": 2.272864818572998, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 4.399064487472108, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.09647464752197266, "logits/rejected": -0.009688240475952625, "logps/chosen": -2.0818636417388916, "logps/rejected": -3.0700161457061768, "loss": 0.6674, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0818636417388916, "rewards/margins": 0.9881525039672852, "rewards/rejected": -3.0700161457061768, "sft_loss": 2.078864097595215, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 6.882650979565821, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.1049887165427208, "logits/rejected": 0.04000038653612137, "logps/chosen": -2.214542865753174, "logps/rejected": -3.1506035327911377, "loss": 0.6652, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.214542865753174, "rewards/margins": 0.9360604286193848, "rewards/rejected": -3.1506035327911377, "sft_loss": 2.2201972007751465, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 3.8787737835404297, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.32012152671813965, "logits/rejected": -0.09241650998592377, "logps/chosen": -1.9448812007904053, "logps/rejected": -3.220184803009033, "loss": 0.6505, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9448812007904053, "rewards/margins": 1.2753032445907593, "rewards/rejected": -3.220184803009033, "sft_loss": 2.0128612518310547, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 4.9661532860476445, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.20304766297340393, "logits/rejected": 0.044023532420396805, "logps/chosen": -1.974930763244629, "logps/rejected": -3.5096993446350098, "loss": 0.6527, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.974930763244629, "rewards/margins": 1.5347683429718018, "rewards/rejected": -3.5096993446350098, "sft_loss": 1.9161895513534546, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 2.4060071689912363, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.08438242971897125, "logits/rejected": -0.020516935735940933, "logps/chosen": -2.2892978191375732, "logps/rejected": -3.325204849243164, "loss": 0.6724, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2892978191375732, "rewards/margins": 1.0359070301055908, "rewards/rejected": -3.325204849243164, "sft_loss": 2.2003917694091797, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 4.938144531154734, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.14719071984291077, "logits/rejected": -0.014941292814910412, "logps/chosen": -2.0267221927642822, "logps/rejected": -3.362051486968994, "loss": 0.6662, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.0267221927642822, "rewards/margins": 1.3353294134140015, "rewards/rejected": -3.362051486968994, "sft_loss": 2.0396056175231934, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 3.9310444258559, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.15167245268821716, "logits/rejected": 0.030614938586950302, "logps/chosen": -2.124218463897705, "logps/rejected": -3.1093029975891113, "loss": 0.6828, "rewards/accuracies": 0.71875, "rewards/chosen": -2.124218463897705, "rewards/margins": 0.9850847125053406, "rewards/rejected": -3.1093029975891113, "sft_loss": 2.323394775390625, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 5.9589444266685625, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.10924432426691055, "logits/rejected": -0.024333816021680832, "logps/chosen": -2.1157054901123047, "logps/rejected": -3.0793604850769043, "loss": 0.6817, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1157054901123047, "rewards/margins": 0.9636548757553101, "rewards/rejected": -3.0793604850769043, "sft_loss": 2.05177640914917, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 5.2961624525344515, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.14075425267219543, "logits/rejected": 0.005097964312881231, "logps/chosen": -2.076080799102783, "logps/rejected": -3.2447776794433594, "loss": 0.6707, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.076080799102783, "rewards/margins": 1.1686967611312866, "rewards/rejected": -3.2447776794433594, "sft_loss": 1.9993696212768555, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 7.863653320435026, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.15427440404891968, "logits/rejected": 0.011923110112547874, "logps/chosen": -2.1327626705169678, "logps/rejected": -3.1934449672698975, "loss": 0.6703, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1327626705169678, "rewards/margins": 1.0606820583343506, "rewards/rejected": -3.1934449672698975, "sft_loss": 2.166590690612793, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 6.810870585680136, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.2445758581161499, "logits/rejected": -0.15394611656665802, "logps/chosen": -2.049255847930908, "logps/rejected": -3.2269885540008545, "loss": 0.6643, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.049255847930908, "rewards/margins": 1.1777327060699463, "rewards/rejected": -3.2269885540008545, "sft_loss": 2.133906602859497, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 4.228931285194542, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.2038041353225708, "logits/rejected": -0.07454162836074829, "logps/chosen": -2.1540584564208984, "logps/rejected": -3.112555742263794, "loss": 0.6717, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1540584564208984, "rewards/margins": 0.9584974050521851, "rewards/rejected": -3.112555742263794, "sft_loss": 2.1838250160217285, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 3.925845892222441, "learning_rate": 2.450761014337888e-09, "logits/chosen": 0.043465353548526764, "logits/rejected": 0.08534816652536392, "logps/chosen": -2.2249526977539062, "logps/rejected": -3.5208001136779785, "loss": 0.6791, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.2249526977539062, "rewards/margins": 1.2958471775054932, "rewards/rejected": -3.5208001136779785, "sft_loss": 2.1956400871276855, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 5.611947431725603, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.005017808172851801, "logits/rejected": 0.09742090851068497, "logps/chosen": -2.0905654430389404, "logps/rejected": -3.3462531566619873, "loss": 0.6686, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0905654430389404, "rewards/margins": 1.2556875944137573, "rewards/rejected": -3.3462531566619873, "sft_loss": 2.049487829208374, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 4.692837943530718, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.18358221650123596, "logits/rejected": -0.006678286008536816, "logps/chosen": -2.1434359550476074, "logps/rejected": -3.0037713050842285, "loss": 0.6896, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1434359550476074, "rewards/margins": 0.8603354692459106, "rewards/rejected": -3.0037713050842285, "sft_loss": 2.1421120166778564, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 6.715649661958594, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.12474910169839859, "logits/rejected": 0.004342988133430481, "logps/chosen": -1.9698402881622314, "logps/rejected": -3.2126338481903076, "loss": 0.6534, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9698402881622314, "rewards/margins": 1.2427937984466553, "rewards/rejected": -3.2126338481903076, "sft_loss": 1.9961681365966797, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 5.025959003124493, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.21705186367034912, "logits/rejected": 0.008288288488984108, "logps/chosen": -2.236253499984741, "logps/rejected": -3.270078659057617, "loss": 0.683, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.236253499984741, "rewards/margins": 1.033825159072876, "rewards/rejected": -3.270078659057617, "sft_loss": 2.1473069190979004, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 4.333369522800217, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.13449707627296448, "logits/rejected": -0.05470636487007141, "logps/chosen": -2.23810076713562, "logps/rejected": -3.125683307647705, "loss": 0.685, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.23810076713562, "rewards/margins": 0.8875824809074402, "rewards/rejected": -3.125683307647705, "sft_loss": 2.2447752952575684, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 6.322699130876288, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.11728037893772125, "logits/rejected": -0.1461258977651596, "logps/chosen": -2.0434460639953613, "logps/rejected": -3.02321720123291, "loss": 0.6741, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0434460639953613, "rewards/margins": 0.9797712564468384, "rewards/rejected": -3.02321720123291, "sft_loss": 2.0746424198150635, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 5.5318695238993, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.1640905737876892, "logits/rejected": -0.03388802334666252, "logps/chosen": -2.0576159954071045, "logps/rejected": -3.136462926864624, "loss": 0.6751, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0576159954071045, "rewards/margins": 1.078847050666809, "rewards/rejected": -3.136462926864624, "sft_loss": 2.081251382827759, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 3.403580517089844, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.09311430156230927, "logits/rejected": 0.1041082963347435, "logps/chosen": -1.955439567565918, "logps/rejected": -3.12667179107666, "loss": 0.6591, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.955439567565918, "rewards/margins": 1.1712322235107422, "rewards/rejected": -3.12667179107666, "sft_loss": 1.9776054620742798, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 6.916351334848274, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.15566611289978027, "logits/rejected": -0.10274732112884521, "logps/chosen": -2.046116352081299, "logps/rejected": -3.0021262168884277, "loss": 0.6862, "rewards/accuracies": 0.6875, "rewards/chosen": -2.046116352081299, "rewards/margins": 0.9560097455978394, "rewards/rejected": -3.0021262168884277, "sft_loss": 2.062603712081909, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 6.230606417477509, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.2095448225736618, "logits/rejected": 0.016530293971300125, "logps/chosen": -2.0500950813293457, "logps/rejected": -3.422104597091675, "loss": 0.6496, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.0500950813293457, "rewards/margins": 1.3720093965530396, "rewards/rejected": -3.422104597091675, "sft_loss": 2.092311143875122, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 2.987181490327306, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.10499457269906998, "logits/rejected": 0.008853035047650337, "logps/chosen": -2.199967861175537, "logps/rejected": -3.3355917930603027, "loss": 0.6735, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.199967861175537, "rewards/margins": 1.1356239318847656, "rewards/rejected": -3.3355917930603027, "sft_loss": 2.1222336292266846, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 5.246174523496143, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.021180534735322, "logits/rejected": 0.01215188205242157, "logps/chosen": -2.043039560317993, "logps/rejected": -3.1634812355041504, "loss": 0.6515, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.043039560317993, "rewards/margins": 1.1204414367675781, "rewards/rejected": -3.1634812355041504, "sft_loss": 2.107445240020752, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 4.011491700835256, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.1461961269378662, "logits/rejected": 0.027207564562559128, "logps/chosen": -2.150327205657959, "logps/rejected": -3.621079683303833, "loss": 0.6689, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.150327205657959, "rewards/margins": 1.4707525968551636, "rewards/rejected": -3.621079683303833, "sft_loss": 2.1148831844329834, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 7.2017527904981575, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.21892747282981873, "logits/rejected": -0.0924294963479042, "logps/chosen": -2.291236400604248, "logps/rejected": -3.4094910621643066, "loss": 0.6794, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.291236400604248, "rewards/margins": 1.1182541847229004, "rewards/rejected": -3.4094910621643066, "sft_loss": 2.2477917671203613, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 6.904277229174928, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.12460234016180038, "logits/rejected": -0.04724789783358574, "logps/chosen": -2.045562267303467, "logps/rejected": -3.1144003868103027, "loss": 0.6767, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.045562267303467, "rewards/margins": 1.068838119506836, "rewards/rejected": -3.1144003868103027, "sft_loss": 2.1038308143615723, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 3.18398188788031, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.14302311837673187, "logits/rejected": 0.02467949315905571, "logps/chosen": -2.2179653644561768, "logps/rejected": -3.471297025680542, "loss": 0.6761, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2179653644561768, "rewards/margins": 1.2533316612243652, "rewards/rejected": -3.471297025680542, "sft_loss": 2.2776458263397217, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 13.381147902591191, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.21192388236522675, "logits/rejected": 0.026648789644241333, "logps/chosen": -2.3372108936309814, "logps/rejected": -3.121948719024658, "loss": 0.6974, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3372108936309814, "rewards/margins": 0.7847374677658081, "rewards/rejected": -3.121948719024658, "sft_loss": 2.2250218391418457, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 5.802925161561028, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.16421891748905182, "logits/rejected": -0.09533698856830597, "logps/chosen": -2.1452269554138184, "logps/rejected": -3.2812087535858154, "loss": 0.6774, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1452269554138184, "rewards/margins": 1.1359819173812866, "rewards/rejected": -3.2812087535858154, "sft_loss": 2.2439322471618652, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 4.153869289541625, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.22801952064037323, "logits/rejected": -0.04530780762434006, "logps/chosen": -2.094473123550415, "logps/rejected": -3.0530173778533936, "loss": 0.6762, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.094473123550415, "rewards/margins": 0.9585443735122681, "rewards/rejected": -3.0530173778533936, "sft_loss": 1.990878701210022, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 7.429578513315398, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.1138523668050766, "logits/rejected": -0.013996327295899391, "logps/chosen": -2.0203332901000977, "logps/rejected": -3.066257953643799, "loss": 0.6756, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0203332901000977, "rewards/margins": 1.0459246635437012, "rewards/rejected": -3.066257953643799, "sft_loss": 2.025296449661255, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 4.05501984298505, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.02194799669086933, "logits/rejected": 0.06368036568164825, "logps/chosen": -2.1512372493743896, "logps/rejected": -3.075148105621338, "loss": 0.6706, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1512372493743896, "rewards/margins": 0.9239107370376587, "rewards/rejected": -3.075148105621338, "sft_loss": 2.117356777191162, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 5.813124472257818, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.1359180510044098, "logits/rejected": 0.06921157240867615, "logps/chosen": -2.1639323234558105, "logps/rejected": -3.4337375164031982, "loss": 0.6772, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.1639323234558105, "rewards/margins": 1.2698047161102295, "rewards/rejected": -3.4337375164031982, "sft_loss": 2.156768321990967, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 5.763459794171124, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.09430771321058273, "logits/rejected": -0.06343688070774078, "logps/chosen": -2.088906764984131, "logps/rejected": -2.9498770236968994, "loss": 0.6793, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.088906764984131, "rewards/margins": 0.860970139503479, "rewards/rejected": -2.9498770236968994, "sft_loss": 2.1586833000183105, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 6.0807886966734666, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.1709664762020111, "logits/rejected": -0.07853923738002777, "logps/chosen": -2.068702459335327, "logps/rejected": -3.2836601734161377, "loss": 0.6718, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.068702459335327, "rewards/margins": 1.2149574756622314, "rewards/rejected": -3.2836601734161377, "sft_loss": 2.1059365272521973, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 20.538841095877864, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.13746069371700287, "logits/rejected": -0.000490212463773787, "logps/chosen": -2.0011448860168457, "logps/rejected": -3.4362778663635254, "loss": 0.654, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0011448860168457, "rewards/margins": 1.4351327419281006, "rewards/rejected": -3.4362778663635254, "sft_loss": 2.031506061553955, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 8.268603418051823, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.1851300746202469, "logits/rejected": -0.01682780496776104, "logps/chosen": -2.000291347503662, "logps/rejected": -3.215944766998291, "loss": 0.6696, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.000291347503662, "rewards/margins": 1.2156531810760498, "rewards/rejected": -3.215944766998291, "sft_loss": 2.0130972862243652, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 5.855344973650639, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.2728053331375122, "logits/rejected": -0.005551565438508987, "logps/chosen": -1.9887988567352295, "logps/rejected": -3.1639037132263184, "loss": 0.6661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9887988567352295, "rewards/margins": 1.1751052141189575, "rewards/rejected": -3.1639037132263184, "sft_loss": 1.9876254796981812, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 12.96938514619225, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.23821833729743958, "logits/rejected": -0.057877153158187866, "logps/chosen": -2.0825464725494385, "logps/rejected": -3.262328624725342, "loss": 0.6913, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.0825464725494385, "rewards/margins": 1.1797820329666138, "rewards/rejected": -3.262328624725342, "sft_loss": 2.068385601043701, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 2.678422351463008, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.16784720122814178, "logits/rejected": -0.09556673467159271, "logps/chosen": -2.2567338943481445, "logps/rejected": -3.335172653198242, "loss": 0.6846, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2567338943481445, "rewards/margins": 1.078438639640808, "rewards/rejected": -3.335172653198242, "sft_loss": 2.1680376529693604, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 5.169085447705354, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.17234937846660614, "logits/rejected": -0.006200958043336868, "logps/chosen": -2.0979604721069336, "logps/rejected": -3.125894069671631, "loss": 0.6659, "rewards/accuracies": 0.75, "rewards/chosen": -2.0979604721069336, "rewards/margins": 1.0279338359832764, "rewards/rejected": -3.125894069671631, "sft_loss": 2.0555245876312256, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 5.833725919683387, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.1850179135799408, "logits/rejected": -0.0650128573179245, "logps/chosen": -1.9984537363052368, "logps/rejected": -3.5584564208984375, "loss": 0.659, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9984537363052368, "rewards/margins": 1.5600025653839111, "rewards/rejected": -3.5584564208984375, "sft_loss": 2.0812668800354004, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.11851444095373154, "eval_logits/rejected": 0.21514485776424408, "eval_logps/chosen": -2.170725107192993, "eval_logps/rejected": -3.2270584106445312, "eval_loss": 0.6901265978813171, "eval_rewards/accuracies": 0.6913946866989136, "eval_rewards/chosen": -2.170725107192993, "eval_rewards/margins": 1.0563328266143799, "eval_rewards/rejected": -3.2270584106445312, "eval_runtime": 43.405, "eval_samples_per_second": 30.987, "eval_sft_loss": 2.133068323135376, "eval_steps_per_second": 7.764, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.6914210148065283, "train_runtime": 33926.4655, "train_samples_per_second": 5.287, "train_steps_per_second": 0.165 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }