diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 2.5307798952804768, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.06028543785214424, + "logits/rejected": 0.15203383564949036, + "logps/chosen": -1.716343641281128, + "logps/rejected": -1.8897250890731812, + "loss": 0.7351, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.716343641281128, + "rewards/margins": 0.17338162660598755, + "rewards/rejected": -1.8897250890731812, + "sft_loss": 1.4684785604476929, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 1.3045011604582515, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": 0.006154696457087994, + "logits/rejected": 0.12953761219978333, + "logps/chosen": -1.8027820587158203, + "logps/rejected": -1.8465938568115234, + "loss": 0.7432, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8027820587158203, + "rewards/margins": 0.043811749666929245, + "rewards/rejected": -1.8465938568115234, + "sft_loss": 1.5083377361297607, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 1.3049558006137096, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.038712628185749054, + "logits/rejected": 0.06149368733167648, + "logps/chosen": -1.6352713108062744, + "logps/rejected": -1.7654964923858643, + "loss": 0.7566, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6352713108062744, + "rewards/margins": 0.13022512197494507, + "rewards/rejected": -1.7654964923858643, + "sft_loss": 1.5005992650985718, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 1.7589380784200994, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.039089519530534744, + "logits/rejected": 0.049571335315704346, + "logps/chosen": -1.7260818481445312, + "logps/rejected": -1.807185411453247, + "loss": 0.7573, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7260818481445312, + "rewards/margins": 0.08110358566045761, + "rewards/rejected": -1.807185411453247, + "sft_loss": 1.5007972717285156, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 1.6148810466509307, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.056121088564395905, + "logits/rejected": 0.031073397025465965, + "logps/chosen": -1.8696680068969727, + "logps/rejected": -1.7794153690338135, + "loss": 0.788, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.8696680068969727, + "rewards/margins": -0.09025251865386963, + "rewards/rejected": -1.7794153690338135, + "sft_loss": 1.5459704399108887, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 1.134138477310001, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.0943167433142662, + "logits/rejected": 0.0011650652159005404, + "logps/chosen": -1.9078174829483032, + "logps/rejected": -1.8316199779510498, + "loss": 0.7459, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.9078174829483032, + "rewards/margins": -0.0761973112821579, + "rewards/rejected": -1.8316199779510498, + "sft_loss": 1.646023154258728, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 1.6619700873288943, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.05790115147829056, + "logits/rejected": 0.10320062935352325, + "logps/chosen": -1.8472083806991577, + "logps/rejected": -1.9982259273529053, + "loss": 0.7611, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8472083806991577, + "rewards/margins": 0.15101750195026398, + "rewards/rejected": -1.9982259273529053, + "sft_loss": 1.5622081756591797, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 1.464719715628282, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.02323739603161812, + "logits/rejected": 0.19785355031490326, + "logps/chosen": -1.8846538066864014, + "logps/rejected": -1.7455508708953857, + "loss": 0.7666, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.8846538066864014, + "rewards/margins": -0.13910314440727234, + "rewards/rejected": -1.7455508708953857, + "sft_loss": 1.5199060440063477, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 1.5585864902929252, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.021629726514220238, + "logits/rejected": 0.22333423793315887, + "logps/chosen": -1.8406168222427368, + "logps/rejected": -1.8752384185791016, + "loss": 0.7563, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8406168222427368, + "rewards/margins": 0.034621547907590866, + "rewards/rejected": -1.8752384185791016, + "sft_loss": 1.5375287532806396, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 1.3244054200487152, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.05435756966471672, + "logits/rejected": 0.09777506440877914, + "logps/chosen": -1.9022912979125977, + "logps/rejected": -1.7811915874481201, + "loss": 0.7565, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.9022912979125977, + "rewards/margins": -0.12109962850809097, + "rewards/rejected": -1.7811915874481201, + "sft_loss": 1.5845015048980713, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 1.4183844835158061, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.10434381663799286, + "logits/rejected": 0.1210247278213501, + "logps/chosen": -1.8414385318756104, + "logps/rejected": -1.875012755393982, + "loss": 0.7441, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.8414385318756104, + "rewards/margins": 0.033574365079402924, + "rewards/rejected": -1.875012755393982, + "sft_loss": 1.5868983268737793, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 1.4194354948659196, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.08948967605829239, + "logits/rejected": 0.10288163274526596, + "logps/chosen": -1.8002837896347046, + "logps/rejected": -1.9058860540390015, + "loss": 0.7422, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.8002837896347046, + "rewards/margins": 0.10560242086648941, + "rewards/rejected": -1.9058860540390015, + "sft_loss": 1.5473297834396362, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 1.3481178192361205, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.04388038441538811, + "logits/rejected": 0.10528695583343506, + "logps/chosen": -1.6494039297103882, + "logps/rejected": -1.78204345703125, + "loss": 0.7463, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.6494039297103882, + "rewards/margins": 0.1326395571231842, + "rewards/rejected": -1.78204345703125, + "sft_loss": 1.4804751873016357, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 2.16778718518737, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.07954644411802292, + "logits/rejected": 0.07445457577705383, + "logps/chosen": -1.7811418771743774, + "logps/rejected": -1.829134225845337, + "loss": 0.761, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -1.7811418771743774, + "rewards/margins": 0.04799215868115425, + "rewards/rejected": -1.829134225845337, + "sft_loss": 1.638705849647522, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 1.2302737702389388, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.05034886673092842, + "logits/rejected": 0.1335323303937912, + "logps/chosen": -1.8009684085845947, + "logps/rejected": -2.0643763542175293, + "loss": 0.7454, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.8009684085845947, + "rewards/margins": 0.26340797543525696, + "rewards/rejected": -2.0643763542175293, + "sft_loss": 1.5755311250686646, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 1.3121530039846816, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": 0.0018813014030456543, + "logits/rejected": 0.10915534198284149, + "logps/chosen": -1.7485237121582031, + "logps/rejected": -1.7818000316619873, + "loss": 0.7563, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7485237121582031, + "rewards/margins": 0.03327634930610657, + "rewards/rejected": -1.7818000316619873, + "sft_loss": 1.5394573211669922, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 1.4021077740070411, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.1436150074005127, + "logits/rejected": 0.10774292796850204, + "logps/chosen": -1.829763412475586, + "logps/rejected": -2.016918659210205, + "loss": 0.7571, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.829763412475586, + "rewards/margins": 0.1871553659439087, + "rewards/rejected": -2.016918659210205, + "sft_loss": 1.5105193853378296, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 1.086482718318956, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.09334637224674225, + "logits/rejected": 0.05624980852007866, + "logps/chosen": -1.8011209964752197, + "logps/rejected": -1.8162376880645752, + "loss": 0.7581, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.8011209964752197, + "rewards/margins": 0.015116686001420021, + "rewards/rejected": -1.8162376880645752, + "sft_loss": 1.4740904569625854, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 1.2496592116011893, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.06257010996341705, + "logits/rejected": 0.09181664139032364, + "logps/chosen": -1.8811146020889282, + "logps/rejected": -1.9701454639434814, + "loss": 0.7473, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.8811146020889282, + "rewards/margins": 0.08903058618307114, + "rewards/rejected": -1.9701454639434814, + "sft_loss": 1.5524765253067017, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 1.1358051297964684, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.032475076615810394, + "logits/rejected": 0.03443972021341324, + "logps/chosen": -1.7509632110595703, + "logps/rejected": -1.8610137701034546, + "loss": 0.7443, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.7509632110595703, + "rewards/margins": 0.11005053669214249, + "rewards/rejected": -1.8610137701034546, + "sft_loss": 1.5149767398834229, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 1.1498117979009437, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.07438740134239197, + "logits/rejected": 0.10223875194787979, + "logps/chosen": -1.7091327905654907, + "logps/rejected": -1.874015212059021, + "loss": 0.742, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.7091327905654907, + "rewards/margins": 0.16488228738307953, + "rewards/rejected": -1.874015212059021, + "sft_loss": 1.463929295539856, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 1.2631632504843453, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": 0.020029287785291672, + "logits/rejected": 0.11946004629135132, + "logps/chosen": -1.7755727767944336, + "logps/rejected": -1.8332843780517578, + "loss": 0.7567, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7755727767944336, + "rewards/margins": 0.0577116496860981, + "rewards/rejected": -1.8332843780517578, + "sft_loss": 1.5115814208984375, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 1.3979503483361755, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.019500691443681717, + "logits/rejected": 0.2292514592409134, + "logps/chosen": -1.7710542678833008, + "logps/rejected": -2.0842669010162354, + "loss": 0.7296, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.7710542678833008, + "rewards/margins": 0.3132126033306122, + "rewards/rejected": -2.0842669010162354, + "sft_loss": 1.6256048679351807, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 0.9471896408017881, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.06396186351776123, + "logits/rejected": 0.11650919914245605, + "logps/chosen": -1.8994373083114624, + "logps/rejected": -2.0369832515716553, + "loss": 0.7397, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8994373083114624, + "rewards/margins": 0.1375458985567093, + "rewards/rejected": -2.0369832515716553, + "sft_loss": 1.6425449848175049, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 1.501264995813309, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.0704985037446022, + "logits/rejected": 0.06473371386528015, + "logps/chosen": -1.814263939857483, + "logps/rejected": -1.7392761707305908, + "loss": 0.7617, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.814263939857483, + "rewards/margins": -0.07498808205127716, + "rewards/rejected": -1.7392761707305908, + "sft_loss": 1.5751149654388428, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 1.8340291888002895, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.0581393726170063, + "logits/rejected": 0.20125238597393036, + "logps/chosen": -1.877229928970337, + "logps/rejected": -1.9977684020996094, + "loss": 0.7481, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.877229928970337, + "rewards/margins": 0.12053883075714111, + "rewards/rejected": -1.9977684020996094, + "sft_loss": 1.6528816223144531, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 1.0603454056081036, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.014934045262634754, + "logits/rejected": 0.1119217649102211, + "logps/chosen": -1.9535691738128662, + "logps/rejected": -1.9393174648284912, + "loss": 0.7532, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9535691738128662, + "rewards/margins": -0.014251927845180035, + "rewards/rejected": -1.9393174648284912, + "sft_loss": 1.6135581731796265, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 1.70352983220921, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.03259888291358948, + "logits/rejected": 0.14035694301128387, + "logps/chosen": -1.9172674417495728, + "logps/rejected": -2.143113613128662, + "loss": 0.7388, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9172674417495728, + "rewards/margins": 0.22584640979766846, + "rewards/rejected": -2.143113613128662, + "sft_loss": 1.6537431478500366, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 1.1788956191636877, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": -0.008710386231541634, + "logits/rejected": 0.1506728231906891, + "logps/chosen": -1.8602443933486938, + "logps/rejected": -1.9983711242675781, + "loss": 0.7427, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.8602443933486938, + "rewards/margins": 0.13812680542469025, + "rewards/rejected": -1.9983711242675781, + "sft_loss": 1.5861378908157349, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 1.611957958433947, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.047285713255405426, + "logits/rejected": 0.1237465962767601, + "logps/chosen": -1.8235307931900024, + "logps/rejected": -1.8251911401748657, + "loss": 0.7505, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.8235307931900024, + "rewards/margins": 0.0016601771349087358, + "rewards/rejected": -1.8251911401748657, + "sft_loss": 1.445634126663208, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 1.4133899671490275, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.028733108192682266, + "logits/rejected": 0.02427602931857109, + "logps/chosen": -1.8935750722885132, + "logps/rejected": -1.9562524557113647, + "loss": 0.7509, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.8935750722885132, + "rewards/margins": 0.0626775249838829, + "rewards/rejected": -1.9562524557113647, + "sft_loss": 1.5830233097076416, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 1.177019125859732, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.12482740730047226, + "logits/rejected": 0.0246548131108284, + "logps/chosen": -2.0720441341400146, + "logps/rejected": -2.0472512245178223, + "loss": 0.7583, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -2.0720441341400146, + "rewards/margins": -0.024792974814772606, + "rewards/rejected": -2.0472512245178223, + "sft_loss": 1.6636087894439697, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 1.2937098303864403, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.025919023901224136, + "logits/rejected": 0.16027899086475372, + "logps/chosen": -1.853724718093872, + "logps/rejected": -2.1069750785827637, + "loss": 0.745, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.853724718093872, + "rewards/margins": 0.2532506287097931, + "rewards/rejected": -2.1069750785827637, + "sft_loss": 1.5339034795761108, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 1.1908623116367458, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.06941623985767365, + "logits/rejected": -0.014124035835266113, + "logps/chosen": -2.095472812652588, + "logps/rejected": -2.0944128036499023, + "loss": 0.7431, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -2.095472812652588, + "rewards/margins": -0.001059868955053389, + "rewards/rejected": -2.0944128036499023, + "sft_loss": 1.6449720859527588, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 1.1539069879301553, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": 0.05122692137956619, + "logits/rejected": 0.050775568932294846, + "logps/chosen": -1.9801479578018188, + "logps/rejected": -2.052694797515869, + "loss": 0.7693, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.9801479578018188, + "rewards/margins": 0.0725465789437294, + "rewards/rejected": -2.052694797515869, + "sft_loss": 1.6534878015518188, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 1.0008286652422143, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": 0.050390541553497314, + "logits/rejected": 0.055344052612781525, + "logps/chosen": -2.0598304271698, + "logps/rejected": -2.077967643737793, + "loss": 0.7446, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.0598304271698, + "rewards/margins": 0.01813710294663906, + "rewards/rejected": -2.077967643737793, + "sft_loss": 1.6423956155776978, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 1.3549860343592595, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.11505118757486343, + "logits/rejected": -0.019612614065408707, + "logps/chosen": -2.0028810501098633, + "logps/rejected": -2.0836894512176514, + "loss": 0.7561, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.0028810501098633, + "rewards/margins": 0.0808083564043045, + "rewards/rejected": -2.0836894512176514, + "sft_loss": 1.6119740009307861, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 1.7311733616395397, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.036365706473588943, + "logits/rejected": 0.09691213071346283, + "logps/chosen": -2.3790314197540283, + "logps/rejected": -2.2984354496002197, + "loss": 0.7477, + "rewards/accuracies": 0.46875, + "rewards/chosen": -2.3790314197540283, + "rewards/margins": -0.08059573173522949, + "rewards/rejected": -2.2984354496002197, + "sft_loss": 1.7958142757415771, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 0.9496432892268047, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": 0.04074074327945709, + "logits/rejected": 0.20683078467845917, + "logps/chosen": -1.8889601230621338, + "logps/rejected": -1.9778417348861694, + "loss": 0.7422, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.8889601230621338, + "rewards/margins": 0.08888188749551773, + "rewards/rejected": -1.9778417348861694, + "sft_loss": 1.5175426006317139, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 0.9622038848794177, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.045106835663318634, + "logits/rejected": 0.10371474921703339, + "logps/chosen": -2.222414493560791, + "logps/rejected": -2.04349684715271, + "loss": 0.7524, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.222414493560791, + "rewards/margins": -0.17891743779182434, + "rewards/rejected": -2.04349684715271, + "sft_loss": 1.725873589515686, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 1.201290427678159, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": 0.0018949396908283234, + "logits/rejected": 0.16811925172805786, + "logps/chosen": -2.366450548171997, + "logps/rejected": -2.164440631866455, + "loss": 0.7477, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.366450548171997, + "rewards/margins": -0.20200976729393005, + "rewards/rejected": -2.164440631866455, + "sft_loss": 1.6500478982925415, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 0.9519143647498421, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.1441155970096588, + "logits/rejected": 0.054268885403871536, + "logps/chosen": -2.2122931480407715, + "logps/rejected": -2.50825834274292, + "loss": 0.727, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.2122931480407715, + "rewards/margins": 0.2959652543067932, + "rewards/rejected": -2.50825834274292, + "sft_loss": 1.7047412395477295, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 0.9147143276129032, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.13151441514492035, + "logits/rejected": 0.13415369391441345, + "logps/chosen": -2.0408544540405273, + "logps/rejected": -2.140831708908081, + "loss": 0.7324, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.0408544540405273, + "rewards/margins": 0.09997712820768356, + "rewards/rejected": -2.140831708908081, + "sft_loss": 1.689650535583496, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 0.9340469402771395, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": 0.09782375395298004, + "logits/rejected": 0.21198837459087372, + "logps/chosen": -2.1927573680877686, + "logps/rejected": -2.5048155784606934, + "loss": 0.7261, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1927573680877686, + "rewards/margins": 0.31205856800079346, + "rewards/rejected": -2.5048155784606934, + "sft_loss": 1.7473742961883545, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 1.1940860708131598, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.06439894437789917, + "logits/rejected": 0.12158125638961792, + "logps/chosen": -2.0465662479400635, + "logps/rejected": -2.1566920280456543, + "loss": 0.7288, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.0465662479400635, + "rewards/margins": 0.11012595891952515, + "rewards/rejected": -2.1566920280456543, + "sft_loss": 1.579954743385315, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 1.1570769692704002, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": 0.024921538308262825, + "logits/rejected": 0.11274313926696777, + "logps/chosen": -2.291510820388794, + "logps/rejected": -2.380722761154175, + "loss": 0.7372, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.291510820388794, + "rewards/margins": 0.08921203017234802, + "rewards/rejected": -2.380722761154175, + "sft_loss": 1.6209876537322998, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 0.9947029798361241, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": 0.052441079169511795, + "logits/rejected": 0.2144056260585785, + "logps/chosen": -2.117339611053467, + "logps/rejected": -2.3475327491760254, + "loss": 0.7204, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.117339611053467, + "rewards/margins": 0.23019298911094666, + "rewards/rejected": -2.3475327491760254, + "sft_loss": 1.594679594039917, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 1.0455265087391556, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": -0.0020845234394073486, + "logits/rejected": 0.1302955150604248, + "logps/chosen": -2.195319414138794, + "logps/rejected": -2.3318018913269043, + "loss": 0.7474, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.195319414138794, + "rewards/margins": 0.13648256659507751, + "rewards/rejected": -2.3318018913269043, + "sft_loss": 1.7364375591278076, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 1.0013964505950739, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": 0.053445588797330856, + "logits/rejected": 0.18915502727031708, + "logps/chosen": -2.10256028175354, + "logps/rejected": -2.3874642848968506, + "loss": 0.7297, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.10256028175354, + "rewards/margins": 0.2849038243293762, + "rewards/rejected": -2.3874642848968506, + "sft_loss": 1.7636394500732422, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 1.1313050273773242, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": -0.025862867012619972, + "logits/rejected": 0.15776577591896057, + "logps/chosen": -2.5357606410980225, + "logps/rejected": -2.5316543579101562, + "loss": 0.745, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.5357606410980225, + "rewards/margins": -0.004106348846107721, + "rewards/rejected": -2.5316543579101562, + "sft_loss": 1.7009462118148804, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 1.198688397864217, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": 0.027461037039756775, + "logits/rejected": 0.20019881427288055, + "logps/chosen": -1.969020128250122, + "logps/rejected": -2.2718300819396973, + "loss": 0.7298, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.969020128250122, + "rewards/margins": 0.3028102517127991, + "rewards/rejected": -2.2718300819396973, + "sft_loss": 1.5253115892410278, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 0.9576003040336463, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.217337965965271, + "logits/rejected": -0.09969816356897354, + "logps/chosen": -2.3528895378112793, + "logps/rejected": -2.2599847316741943, + "loss": 0.7258, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.3528895378112793, + "rewards/margins": -0.09290491044521332, + "rewards/rejected": -2.2599847316741943, + "sft_loss": 1.7216367721557617, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 0.8083878838357342, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": -0.04917464405298233, + "logits/rejected": 0.04295088350772858, + "logps/chosen": -2.5756170749664307, + "logps/rejected": -2.369469404220581, + "loss": 0.7435, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.5756170749664307, + "rewards/margins": -0.2061474323272705, + "rewards/rejected": -2.369469404220581, + "sft_loss": 1.998988389968872, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 0.8439805417282846, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": -0.06389988213777542, + "logits/rejected": 0.09119514375925064, + "logps/chosen": -2.194108486175537, + "logps/rejected": -2.3751087188720703, + "loss": 0.7411, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.194108486175537, + "rewards/margins": 0.18100018799304962, + "rewards/rejected": -2.3751087188720703, + "sft_loss": 1.682721734046936, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 1.2479379665264587, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": 0.014311921782791615, + "logits/rejected": 0.12128366529941559, + "logps/chosen": -2.3143210411071777, + "logps/rejected": -2.603527307510376, + "loss": 0.7489, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.3143210411071777, + "rewards/margins": 0.28920650482177734, + "rewards/rejected": -2.603527307510376, + "sft_loss": 1.7431834936141968, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 0.8807325016661434, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": -0.07187201082706451, + "logits/rejected": 0.1185765266418457, + "logps/chosen": -2.59013295173645, + "logps/rejected": -2.569979190826416, + "loss": 0.7275, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.59013295173645, + "rewards/margins": -0.02015404775738716, + "rewards/rejected": -2.569979190826416, + "sft_loss": 2.0005898475646973, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 1.262434366720682, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.0466768741607666, + "logits/rejected": 0.11643379926681519, + "logps/chosen": -2.5148816108703613, + "logps/rejected": -2.536932945251465, + "loss": 0.7479, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -2.5148816108703613, + "rewards/margins": 0.022051483392715454, + "rewards/rejected": -2.536932945251465, + "sft_loss": 1.7881135940551758, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 0.7384226605508317, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": -0.10155355930328369, + "logits/rejected": 0.2345721274614334, + "logps/chosen": -2.307314157485962, + "logps/rejected": -2.776353120803833, + "loss": 0.7086, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.307314157485962, + "rewards/margins": 0.46903902292251587, + "rewards/rejected": -2.776353120803833, + "sft_loss": 1.829089879989624, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 0.7690015544470431, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": -0.04389738291501999, + "logits/rejected": 0.02246524766087532, + "logps/chosen": -2.7992782592773438, + "logps/rejected": -2.68961763381958, + "loss": 0.7361, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.7992782592773438, + "rewards/margins": -0.10966069996356964, + "rewards/rejected": -2.68961763381958, + "sft_loss": 2.0393753051757812, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 0.9284960969608202, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.07447122037410736, + "logits/rejected": 0.11382939666509628, + "logps/chosen": -3.0339980125427246, + "logps/rejected": -3.1267342567443848, + "loss": 0.7372, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.0339980125427246, + "rewards/margins": 0.09273599088191986, + "rewards/rejected": -3.1267342567443848, + "sft_loss": 1.9921964406967163, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 0.8614208468355983, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": -0.0066381096839904785, + "logits/rejected": 0.07297073304653168, + "logps/chosen": -2.851527690887451, + "logps/rejected": -2.8573780059814453, + "loss": 0.739, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -2.851527690887451, + "rewards/margins": 0.005850297398865223, + "rewards/rejected": -2.8573780059814453, + "sft_loss": 2.166574239730835, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 0.6501720560581197, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.18907728791236877, + "logits/rejected": -0.0829610675573349, + "logps/chosen": -3.2818069458007812, + "logps/rejected": -3.3691933155059814, + "loss": 0.728, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -3.2818069458007812, + "rewards/margins": 0.08738609403371811, + "rewards/rejected": -3.3691933155059814, + "sft_loss": 2.39288592338562, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 0.6372708648025124, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.006549348589032888, + "logits/rejected": 0.17332328855991364, + "logps/chosen": -3.1714937686920166, + "logps/rejected": -3.2386155128479004, + "loss": 0.7312, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.1714937686920166, + "rewards/margins": 0.0671217292547226, + "rewards/rejected": -3.2386155128479004, + "sft_loss": 2.2824227809906006, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 0.8740635051818171, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": -0.03601797670125961, + "logits/rejected": 0.11112309992313385, + "logps/chosen": -3.0045642852783203, + "logps/rejected": -2.9035542011260986, + "loss": 0.7351, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -3.0045642852783203, + "rewards/margins": -0.101010262966156, + "rewards/rejected": -2.9035542011260986, + "sft_loss": 2.2101263999938965, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 0.5771610112338228, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.11896149069070816, + "logits/rejected": 0.018192211166024208, + "logps/chosen": -3.610837459564209, + "logps/rejected": -3.8876991271972656, + "loss": 0.725, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.610837459564209, + "rewards/margins": 0.27686089277267456, + "rewards/rejected": -3.8876991271972656, + "sft_loss": 2.300236225128174, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 0.6229477054686579, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": 0.005486331880092621, + "logits/rejected": 0.177880197763443, + "logps/chosen": -2.7989001274108887, + "logps/rejected": -3.7798194885253906, + "loss": 0.7251, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.7989001274108887, + "rewards/margins": 0.980919361114502, + "rewards/rejected": -3.7798194885253906, + "sft_loss": 2.2227303981781006, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 0.7181549721950545, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": 0.053232062608003616, + "logits/rejected": 0.1841447651386261, + "logps/chosen": -3.7734217643737793, + "logps/rejected": -3.7936007976531982, + "loss": 0.7314, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -3.7734217643737793, + "rewards/margins": 0.0201789028942585, + "rewards/rejected": -3.7936007976531982, + "sft_loss": 2.1736080646514893, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 0.70309398793489, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": -0.006882413290441036, + "logits/rejected": 0.1695895940065384, + "logps/chosen": -3.509596347808838, + "logps/rejected": -3.7672221660614014, + "loss": 0.7219, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.509596347808838, + "rewards/margins": 0.25762563943862915, + "rewards/rejected": -3.7672221660614014, + "sft_loss": 2.3050999641418457, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 1.0684204512933249, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": 0.06146972253918648, + "logits/rejected": 0.1032826155424118, + "logps/chosen": -3.7491230964660645, + "logps/rejected": -3.746976375579834, + "loss": 0.733, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.7491230964660645, + "rewards/margins": -0.00214635138399899, + "rewards/rejected": -3.746976375579834, + "sft_loss": 2.372309446334839, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 0.674534244813876, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": 0.04469449073076248, + "logits/rejected": 0.15989665687084198, + "logps/chosen": -3.837616443634033, + "logps/rejected": -3.3015296459198, + "loss": 0.729, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -3.837616443634033, + "rewards/margins": -0.5360864400863647, + "rewards/rejected": -3.3015296459198, + "sft_loss": 2.3684537410736084, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 0.4798276107270202, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": -0.03864552825689316, + "logits/rejected": 0.21794693171977997, + "logps/chosen": -3.891691207885742, + "logps/rejected": -3.7515900135040283, + "loss": 0.7264, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -3.891691207885742, + "rewards/margins": -0.14010128378868103, + "rewards/rejected": -3.7515900135040283, + "sft_loss": 2.518442153930664, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 1.1965823226283256, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": 0.004572421312332153, + "logits/rejected": 0.10303233563899994, + "logps/chosen": -4.1401591300964355, + "logps/rejected": -4.2200188636779785, + "loss": 0.7301, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -4.1401591300964355, + "rewards/margins": 0.07985991984605789, + "rewards/rejected": -4.2200188636779785, + "sft_loss": 2.446254253387451, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 1.0181453679763794, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": 0.06938910484313965, + "logits/rejected": 0.1773306131362915, + "logps/chosen": -3.9401297569274902, + "logps/rejected": -3.458475112915039, + "loss": 0.727, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.9401297569274902, + "rewards/margins": -0.48165464401245117, + "rewards/rejected": -3.458475112915039, + "sft_loss": 2.5714945793151855, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 0.7034205710786969, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": 0.04681248217821121, + "logits/rejected": 0.16646014153957367, + "logps/chosen": -3.4816317558288574, + "logps/rejected": -3.414193630218506, + "loss": 0.7407, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.4816317558288574, + "rewards/margins": -0.0674385279417038, + "rewards/rejected": -3.414193630218506, + "sft_loss": 2.5620157718658447, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 0.9349621887687155, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": 0.029973220080137253, + "logits/rejected": 0.22433066368103027, + "logps/chosen": -3.6492698192596436, + "logps/rejected": -4.301482677459717, + "loss": 0.7198, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.6492698192596436, + "rewards/margins": 0.6522127389907837, + "rewards/rejected": -4.301482677459717, + "sft_loss": 2.334585666656494, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 0.6534594197266338, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": 0.04294043034315109, + "logits/rejected": 0.15177340805530548, + "logps/chosen": -3.961987257003784, + "logps/rejected": -4.119868278503418, + "loss": 0.7196, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.961987257003784, + "rewards/margins": 0.15788108110427856, + "rewards/rejected": -4.119868278503418, + "sft_loss": 2.639615058898926, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 0.8195329529138362, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": 0.05825083702802658, + "logits/rejected": 0.1578684002161026, + "logps/chosen": -3.07454514503479, + "logps/rejected": -3.358675479888916, + "loss": 0.7206, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -3.07454514503479, + "rewards/margins": 0.28413036465644836, + "rewards/rejected": -3.358675479888916, + "sft_loss": 2.2494893074035645, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 0.8015314543916019, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": 0.1552068293094635, + "logits/rejected": 0.3621278405189514, + "logps/chosen": -3.0863308906555176, + "logps/rejected": -3.399913787841797, + "loss": 0.7198, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -3.0863308906555176, + "rewards/margins": 0.3135828375816345, + "rewards/rejected": -3.399913787841797, + "sft_loss": 2.292431354522705, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 0.8421949387057387, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": 0.031766343861818314, + "logits/rejected": 0.23609808087348938, + "logps/chosen": -2.7795357704162598, + "logps/rejected": -3.061073064804077, + "loss": 0.7247, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.7795357704162598, + "rewards/margins": 0.2815372347831726, + "rewards/rejected": -3.061073064804077, + "sft_loss": 2.0890800952911377, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 1.070064093059005, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": 0.14136911928653717, + "logits/rejected": 0.2708422541618347, + "logps/chosen": -3.2854278087615967, + "logps/rejected": -3.46429705619812, + "loss": 0.7149, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -3.2854278087615967, + "rewards/margins": 0.17886866629123688, + "rewards/rejected": -3.46429705619812, + "sft_loss": 2.1866886615753174, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.4407428205013275, + "eval_logits/rejected": 0.5533545613288879, + "eval_logps/chosen": -3.312464952468872, + "eval_logps/rejected": -3.568161964416504, + "eval_loss": 0.7231929898262024, + "eval_rewards/accuracies": 0.5200296640396118, + "eval_rewards/chosen": -3.312464952468872, + "eval_rewards/margins": 0.25569629669189453, + "eval_rewards/rejected": -3.568161964416504, + "eval_runtime": 44.1302, + "eval_samples_per_second": 30.478, + "eval_sft_loss": 2.1337332725524902, + "eval_steps_per_second": 7.636, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 0.8380463757471741, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": 0.12377011775970459, + "logits/rejected": 0.23273494839668274, + "logps/chosen": -3.467702865600586, + "logps/rejected": -3.855557680130005, + "loss": 0.7318, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -3.467702865600586, + "rewards/margins": 0.3878548741340637, + "rewards/rejected": -3.855557680130005, + "sft_loss": 2.629897356033325, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 0.8015436429519107, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": 0.15863196551799774, + "logits/rejected": 0.3188668489456177, + "logps/chosen": -3.4703338146209717, + "logps/rejected": -3.9953055381774902, + "loss": 0.7181, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -3.4703338146209717, + "rewards/margins": 0.5249720215797424, + "rewards/rejected": -3.9953055381774902, + "sft_loss": 2.4155757427215576, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 1.183682024326887, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": 0.14013604819774628, + "logits/rejected": 0.21396835148334503, + "logps/chosen": -3.6671295166015625, + "logps/rejected": -3.4590630531311035, + "loss": 0.7319, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -3.6671295166015625, + "rewards/margins": -0.20806679129600525, + "rewards/rejected": -3.4590630531311035, + "sft_loss": 2.605466365814209, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 1.3858150606864232, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": 0.06939506530761719, + "logits/rejected": 0.3356233537197113, + "logps/chosen": -2.94575572013855, + "logps/rejected": -3.125661849975586, + "loss": 0.7168, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.94575572013855, + "rewards/margins": 0.17990592122077942, + "rewards/rejected": -3.125661849975586, + "sft_loss": 2.1718907356262207, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 1.0438361542866905, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": 0.058596737682819366, + "logits/rejected": 0.30125856399536133, + "logps/chosen": -3.111341714859009, + "logps/rejected": -3.4133172035217285, + "loss": 0.7194, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.111341714859009, + "rewards/margins": 0.3019755184650421, + "rewards/rejected": -3.4133172035217285, + "sft_loss": 2.271104335784912, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 0.9585500950106343, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": 0.012262892909348011, + "logits/rejected": 0.2687227129936218, + "logps/chosen": -2.8309836387634277, + "logps/rejected": -3.7098381519317627, + "loss": 0.7209, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.8309836387634277, + "rewards/margins": 0.8788547515869141, + "rewards/rejected": -3.7098381519317627, + "sft_loss": 2.2115042209625244, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 0.7878172279611957, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": 0.11982943117618561, + "logits/rejected": 0.2392585277557373, + "logps/chosen": -3.093507766723633, + "logps/rejected": -3.1052567958831787, + "loss": 0.7209, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -3.093507766723633, + "rewards/margins": 0.011749285273253918, + "rewards/rejected": -3.1052567958831787, + "sft_loss": 2.3222641944885254, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 1.1063814156047223, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": 0.08916598558425903, + "logits/rejected": 0.21588215231895447, + "logps/chosen": -2.7143657207489014, + "logps/rejected": -3.1528379917144775, + "loss": 0.724, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.7143657207489014, + "rewards/margins": 0.4384719729423523, + "rewards/rejected": -3.1528379917144775, + "sft_loss": 2.295525074005127, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 0.883130703007747, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": 0.07394398748874664, + "logits/rejected": 0.21995148062705994, + "logps/chosen": -2.920626163482666, + "logps/rejected": -3.506260395050049, + "loss": 0.7206, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.920626163482666, + "rewards/margins": 0.5856344103813171, + "rewards/rejected": -3.506260395050049, + "sft_loss": 2.224306106567383, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 0.8695281019667966, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": 0.09777168929576874, + "logits/rejected": 0.2562565207481384, + "logps/chosen": -2.7872490882873535, + "logps/rejected": -3.0836105346679688, + "loss": 0.7174, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.7872490882873535, + "rewards/margins": 0.29636120796203613, + "rewards/rejected": -3.0836105346679688, + "sft_loss": 2.252509117126465, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 1.1522578168512165, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": 0.10990069806575775, + "logits/rejected": 0.21983602643013, + "logps/chosen": -2.5689430236816406, + "logps/rejected": -2.998944044113159, + "loss": 0.7104, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.5689430236816406, + "rewards/margins": 0.4300007224082947, + "rewards/rejected": -2.998944044113159, + "sft_loss": 2.086578845977783, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 2.435980412813483, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.014444696716964245, + "logits/rejected": 0.15174110233783722, + "logps/chosen": -2.6563944816589355, + "logps/rejected": -2.8838610649108887, + "loss": 0.7268, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.6563944816589355, + "rewards/margins": 0.2274663895368576, + "rewards/rejected": -2.8838610649108887, + "sft_loss": 2.333906888961792, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 2.2069324871389653, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": 0.22611455619335175, + "logits/rejected": 0.2659430503845215, + "logps/chosen": -2.8453681468963623, + "logps/rejected": -3.1539711952209473, + "loss": 0.7222, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.8453681468963623, + "rewards/margins": 0.3086031377315521, + "rewards/rejected": -3.1539711952209473, + "sft_loss": 2.4192214012145996, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 3.704990416624573, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": 0.2801414728164673, + "logits/rejected": 0.22712154686450958, + "logps/chosen": -2.801309108734131, + "logps/rejected": -2.847769260406494, + "loss": 0.733, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -2.801309108734131, + "rewards/margins": 0.04645988345146179, + "rewards/rejected": -2.847769260406494, + "sft_loss": 2.284790515899658, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 1.068048481080084, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": -8.206814527511597e-05, + "logits/rejected": 0.1875239610671997, + "logps/chosen": -2.40893816947937, + "logps/rejected": -3.194272518157959, + "loss": 0.7097, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.40893816947937, + "rewards/margins": 0.7853342890739441, + "rewards/rejected": -3.194272518157959, + "sft_loss": 2.097153425216675, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 2.1566987984823514, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": 0.04935279116034508, + "logits/rejected": 0.28145831823349, + "logps/chosen": -2.404567241668701, + "logps/rejected": -2.732786178588867, + "loss": 0.7217, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.404567241668701, + "rewards/margins": 0.32821884751319885, + "rewards/rejected": -2.732786178588867, + "sft_loss": 2.01057505607605, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 0.9623812340325298, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": 0.08504694700241089, + "logits/rejected": 0.14895778894424438, + "logps/chosen": -2.759326457977295, + "logps/rejected": -2.824343681335449, + "loss": 0.7263, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.759326457977295, + "rewards/margins": 0.06501699984073639, + "rewards/rejected": -2.824343681335449, + "sft_loss": 2.2212133407592773, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 1.569603368409317, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": 0.14935262501239777, + "logits/rejected": 0.23176102340221405, + "logps/chosen": -2.831303358078003, + "logps/rejected": -3.133817195892334, + "loss": 0.7248, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.831303358078003, + "rewards/margins": 0.30251362919807434, + "rewards/rejected": -3.133817195892334, + "sft_loss": 2.3560450077056885, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 1.4572523265368869, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": 0.0733940601348877, + "logits/rejected": 0.11172135919332504, + "logps/chosen": -2.8521978855133057, + "logps/rejected": -3.004929304122925, + "loss": 0.7243, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.8521978855133057, + "rewards/margins": 0.15273188054561615, + "rewards/rejected": -3.004929304122925, + "sft_loss": 2.426811695098877, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 1.5609730411038651, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": 0.04248486086726189, + "logits/rejected": 0.1631307303905487, + "logps/chosen": -2.8464436531066895, + "logps/rejected": -3.2533068656921387, + "loss": 0.7231, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.8464436531066895, + "rewards/margins": 0.4068628251552582, + "rewards/rejected": -3.2533068656921387, + "sft_loss": 2.2971839904785156, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 1.4689783802748066, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": 0.003608876373618841, + "logits/rejected": 0.17241023480892181, + "logps/chosen": -2.7894625663757324, + "logps/rejected": -2.8479764461517334, + "loss": 0.7173, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.7894625663757324, + "rewards/margins": 0.05851361155509949, + "rewards/rejected": -2.8479764461517334, + "sft_loss": 2.290825843811035, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 1.250742204693819, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": 0.152287095785141, + "logits/rejected": 0.2180653065443039, + "logps/chosen": -2.5529842376708984, + "logps/rejected": -3.0081677436828613, + "loss": 0.7178, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.5529842376708984, + "rewards/margins": 0.45518356561660767, + "rewards/rejected": -3.0081677436828613, + "sft_loss": 2.1848416328430176, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 1.1650240017903066, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": 0.07927284389734268, + "logits/rejected": 0.20023027062416077, + "logps/chosen": -2.3548150062561035, + "logps/rejected": -2.520425319671631, + "loss": 0.7125, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.3548150062561035, + "rewards/margins": 0.16561046242713928, + "rewards/rejected": -2.520425319671631, + "sft_loss": 2.0264971256256104, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 1.4028935287772106, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": 0.00641799857839942, + "logits/rejected": 0.16784623265266418, + "logps/chosen": -2.4290060997009277, + "logps/rejected": -2.6969335079193115, + "loss": 0.7192, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.4290060997009277, + "rewards/margins": 0.2679271101951599, + "rewards/rejected": -2.6969335079193115, + "sft_loss": 2.2241756916046143, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 1.5865055700177921, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": 0.15192261338233948, + "logits/rejected": 0.2448718100786209, + "logps/chosen": -2.57245135307312, + "logps/rejected": -2.901125192642212, + "loss": 0.7168, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.57245135307312, + "rewards/margins": 0.3286738395690918, + "rewards/rejected": -2.901125192642212, + "sft_loss": 2.272670269012451, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 1.6613335130368763, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": 0.09941687434911728, + "logits/rejected": 0.18588587641716003, + "logps/chosen": -2.3438611030578613, + "logps/rejected": -2.6406006813049316, + "loss": 0.7257, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.3438611030578613, + "rewards/margins": 0.296739399433136, + "rewards/rejected": -2.6406006813049316, + "sft_loss": 2.1210086345672607, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 1.5684653787288134, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": -0.08004043996334076, + "logits/rejected": 0.21586282551288605, + "logps/chosen": -2.391448974609375, + "logps/rejected": -2.769453525543213, + "loss": 0.7077, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.391448974609375, + "rewards/margins": 0.3780044615268707, + "rewards/rejected": -2.769453525543213, + "sft_loss": 2.076185464859009, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 1.3887045431816825, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": 0.04192466661334038, + "logits/rejected": 0.13054180145263672, + "logps/chosen": -2.6236674785614014, + "logps/rejected": -2.9908204078674316, + "loss": 0.7208, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.6236674785614014, + "rewards/margins": 0.3671533465385437, + "rewards/rejected": -2.9908204078674316, + "sft_loss": 2.410619020462036, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 1.3678970419412904, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": -0.06351624429225922, + "logits/rejected": 0.15162459015846252, + "logps/chosen": -2.6514830589294434, + "logps/rejected": -2.9466261863708496, + "loss": 0.7095, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.6514830589294434, + "rewards/margins": 0.29514291882514954, + "rewards/rejected": -2.9466261863708496, + "sft_loss": 2.2425341606140137, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 1.1688094755564253, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": 0.06675631552934647, + "logits/rejected": 0.14489200711250305, + "logps/chosen": -2.3897202014923096, + "logps/rejected": -2.7485108375549316, + "loss": 0.7146, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.3897202014923096, + "rewards/margins": 0.3587908148765564, + "rewards/rejected": -2.7485108375549316, + "sft_loss": 2.1949100494384766, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 1.2334375243791638, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": -0.040539491921663284, + "logits/rejected": 0.09747248142957687, + "logps/chosen": -2.6100172996520996, + "logps/rejected": -2.8595051765441895, + "loss": 0.7208, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.6100172996520996, + "rewards/margins": 0.24948802590370178, + "rewards/rejected": -2.8595051765441895, + "sft_loss": 2.3463802337646484, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 1.2178582513233482, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": 0.05656442791223526, + "logits/rejected": 0.08870618045330048, + "logps/chosen": -2.4351911544799805, + "logps/rejected": -2.8274285793304443, + "loss": 0.7146, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.4351911544799805, + "rewards/margins": 0.3922370970249176, + "rewards/rejected": -2.8274285793304443, + "sft_loss": 2.3474295139312744, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 1.4646884322093898, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": 0.016786161810159683, + "logits/rejected": 0.07795722037553787, + "logps/chosen": -2.3994269371032715, + "logps/rejected": -2.7498245239257812, + "loss": 0.7173, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3994269371032715, + "rewards/margins": 0.35039767622947693, + "rewards/rejected": -2.7498245239257812, + "sft_loss": 2.2927894592285156, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 2.280916772571687, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": -0.07725201547145844, + "logits/rejected": 0.14675331115722656, + "logps/chosen": -2.3151724338531494, + "logps/rejected": -2.637024402618408, + "loss": 0.714, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3151724338531494, + "rewards/margins": 0.3218519687652588, + "rewards/rejected": -2.637024402618408, + "sft_loss": 2.2025227546691895, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 1.1544623408371042, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": -0.03033895418047905, + "logits/rejected": 0.02668929658830166, + "logps/chosen": -2.313297748565674, + "logps/rejected": -2.608497381210327, + "loss": 0.7186, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.313297748565674, + "rewards/margins": 0.29519957304000854, + "rewards/rejected": -2.608497381210327, + "sft_loss": 2.221764326095581, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 1.454393332211814, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": -0.060431670397520065, + "logits/rejected": 0.11492051184177399, + "logps/chosen": -2.400331735610962, + "logps/rejected": -2.877683162689209, + "loss": 0.7171, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.400331735610962, + "rewards/margins": 0.47735172510147095, + "rewards/rejected": -2.877683162689209, + "sft_loss": 2.3011434078216553, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 1.9403262233624872, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": -0.06489763408899307, + "logits/rejected": 0.027169320732355118, + "logps/chosen": -2.6530721187591553, + "logps/rejected": -2.9842185974121094, + "loss": 0.7182, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.6530721187591553, + "rewards/margins": 0.331146240234375, + "rewards/rejected": -2.9842185974121094, + "sft_loss": 2.4599719047546387, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 7.880831844211117, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": -0.1074734479188919, + "logits/rejected": 0.011968502774834633, + "logps/chosen": -2.530820608139038, + "logps/rejected": -2.730072498321533, + "loss": 0.7164, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.530820608139038, + "rewards/margins": 0.1992516666650772, + "rewards/rejected": -2.730072498321533, + "sft_loss": 2.421886920928955, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 1.6012133474306431, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": -0.0028941601049154997, + "logits/rejected": 0.1417306363582611, + "logps/chosen": -2.3665213584899902, + "logps/rejected": -2.799224853515625, + "loss": 0.7165, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3665213584899902, + "rewards/margins": 0.43270349502563477, + "rewards/rejected": -2.799224853515625, + "sft_loss": 2.3110907077789307, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 1.6966709570248353, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": -0.0750584825873375, + "logits/rejected": 0.08908344805240631, + "logps/chosen": -2.4441514015197754, + "logps/rejected": -2.7474427223205566, + "loss": 0.7173, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.4441514015197754, + "rewards/margins": 0.3032917380332947, + "rewards/rejected": -2.7474427223205566, + "sft_loss": 2.3465425968170166, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 1.7054325905287304, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": -0.07087988406419754, + "logits/rejected": 0.022867068648338318, + "logps/chosen": -2.6355044841766357, + "logps/rejected": -2.976886034011841, + "loss": 0.7131, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.6355044841766357, + "rewards/margins": 0.34138113260269165, + "rewards/rejected": -2.976886034011841, + "sft_loss": 2.448537826538086, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 1.949138520574182, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": -0.034800779074430466, + "logits/rejected": 0.15406827628612518, + "logps/chosen": -2.5420567989349365, + "logps/rejected": -2.9066779613494873, + "loss": 0.7156, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.5420567989349365, + "rewards/margins": 0.3646214008331299, + "rewards/rejected": -2.9066779613494873, + "sft_loss": 2.3323192596435547, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 1.092636916919118, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": -0.014688342809677124, + "logits/rejected": 0.09319359064102173, + "logps/chosen": -2.4575400352478027, + "logps/rejected": -2.952061891555786, + "loss": 0.7115, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.4575400352478027, + "rewards/margins": 0.4945217967033386, + "rewards/rejected": -2.952061891555786, + "sft_loss": 2.2898201942443848, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 1.2412022266614404, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": 0.044067852199077606, + "logits/rejected": 0.1323404163122177, + "logps/chosen": -2.5102686882019043, + "logps/rejected": -2.789283275604248, + "loss": 0.7118, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.5102686882019043, + "rewards/margins": 0.2790144383907318, + "rewards/rejected": -2.789283275604248, + "sft_loss": 2.3239073753356934, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 1.7171630474485193, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": -0.002364107873290777, + "logits/rejected": 0.15507212281227112, + "logps/chosen": -2.3736510276794434, + "logps/rejected": -2.9077327251434326, + "loss": 0.7087, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3736510276794434, + "rewards/margins": 0.5340819358825684, + "rewards/rejected": -2.9077327251434326, + "sft_loss": 2.198334217071533, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 1.2450982262768642, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": -0.032905466854572296, + "logits/rejected": 0.09662505239248276, + "logps/chosen": -2.36788272857666, + "logps/rejected": -2.794607162475586, + "loss": 0.71, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.36788272857666, + "rewards/margins": 0.42672476172447205, + "rewards/rejected": -2.794607162475586, + "sft_loss": 2.192002534866333, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 3.122728313345638, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": 0.021928105503320694, + "logits/rejected": 0.299724280834198, + "logps/chosen": -2.40183687210083, + "logps/rejected": -2.842911720275879, + "loss": 0.7159, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.40183687210083, + "rewards/margins": 0.44107475876808167, + "rewards/rejected": -2.842911720275879, + "sft_loss": 2.27349591255188, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 1.3092436802370593, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": -0.03172747418284416, + "logits/rejected": 0.1991158276796341, + "logps/chosen": -2.472367763519287, + "logps/rejected": -2.981828212738037, + "loss": 0.7139, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.472367763519287, + "rewards/margins": 0.5094602704048157, + "rewards/rejected": -2.981828212738037, + "sft_loss": 2.336221218109131, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 1.6621210076695574, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": 0.03128929063677788, + "logits/rejected": 0.11202099174261093, + "logps/chosen": -2.5021328926086426, + "logps/rejected": -3.0028514862060547, + "loss": 0.7124, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.5021328926086426, + "rewards/margins": 0.5007185339927673, + "rewards/rejected": -3.0028514862060547, + "sft_loss": 2.308013439178467, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 1.3392167687694965, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": 0.06702680140733719, + "logits/rejected": 0.19247296452522278, + "logps/chosen": -2.5227482318878174, + "logps/rejected": -3.011077642440796, + "loss": 0.7163, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.5227482318878174, + "rewards/margins": 0.4883296489715576, + "rewards/rejected": -3.011077642440796, + "sft_loss": 2.4135658740997314, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 1.8709007126693438, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": 0.07573200017213821, + "logits/rejected": 0.10133898258209229, + "logps/chosen": -2.5253071784973145, + "logps/rejected": -2.9615590572357178, + "loss": 0.7166, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.5253071784973145, + "rewards/margins": 0.43625155091285706, + "rewards/rejected": -2.9615590572357178, + "sft_loss": 2.3898026943206787, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 2.3425277167206215, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": -0.012026980519294739, + "logits/rejected": 0.13976624608039856, + "logps/chosen": -2.443476915359497, + "logps/rejected": -2.661541700363159, + "loss": 0.7152, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.443476915359497, + "rewards/margins": 0.2180650681257248, + "rewards/rejected": -2.661541700363159, + "sft_loss": 2.2523090839385986, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 2.129722647662276, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": -0.006276947446167469, + "logits/rejected": 0.15791794657707214, + "logps/chosen": -2.2242307662963867, + "logps/rejected": -2.573673963546753, + "loss": 0.7116, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.2242307662963867, + "rewards/margins": 0.349443256855011, + "rewards/rejected": -2.573673963546753, + "sft_loss": 2.207887887954712, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 2.3242009017910963, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": -0.041744984686374664, + "logits/rejected": 0.14908739924430847, + "logps/chosen": -2.3040108680725098, + "logps/rejected": -2.683948516845703, + "loss": 0.7153, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.3040108680725098, + "rewards/margins": 0.37993746995925903, + "rewards/rejected": -2.683948516845703, + "sft_loss": 2.2961134910583496, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 1.6993879707708788, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": -0.037251681089401245, + "logits/rejected": 0.058357805013656616, + "logps/chosen": -2.2798972129821777, + "logps/rejected": -2.778377056121826, + "loss": 0.7093, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2798972129821777, + "rewards/margins": 0.4984796643257141, + "rewards/rejected": -2.778377056121826, + "sft_loss": 2.2684168815612793, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 4.211312152650652, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": -0.0022398352157324553, + "logits/rejected": 0.15500149130821228, + "logps/chosen": -2.3524091243743896, + "logps/rejected": -2.7513551712036133, + "loss": 0.7179, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.3524091243743896, + "rewards/margins": 0.39894604682922363, + "rewards/rejected": -2.7513551712036133, + "sft_loss": 2.2516465187072754, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 1.6333954397076826, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": -0.022715812548995018, + "logits/rejected": 0.1583533138036728, + "logps/chosen": -2.3316569328308105, + "logps/rejected": -2.73073148727417, + "loss": 0.7089, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.3316569328308105, + "rewards/margins": 0.3990745544433594, + "rewards/rejected": -2.73073148727417, + "sft_loss": 2.2660598754882812, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 2.8513640368794304, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": 0.06075746566057205, + "logits/rejected": 0.09779484570026398, + "logps/chosen": -2.4610791206359863, + "logps/rejected": -2.9051265716552734, + "loss": 0.7085, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.4610791206359863, + "rewards/margins": 0.44404739141464233, + "rewards/rejected": -2.9051265716552734, + "sft_loss": 2.4020164012908936, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 2.353937482827338, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": -0.026162762194871902, + "logits/rejected": 0.06587730348110199, + "logps/chosen": -2.3094372749328613, + "logps/rejected": -2.7674665451049805, + "loss": 0.7063, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3094372749328613, + "rewards/margins": 0.45802921056747437, + "rewards/rejected": -2.7674665451049805, + "sft_loss": 2.2600460052490234, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 2.3243908024294724, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": -0.10598037391901016, + "logits/rejected": 0.03518640622496605, + "logps/chosen": -2.3793280124664307, + "logps/rejected": -2.6736900806427, + "loss": 0.7147, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.3793280124664307, + "rewards/margins": 0.2943619191646576, + "rewards/rejected": -2.6736900806427, + "sft_loss": 2.2658851146698, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 1.4643334623964388, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": -0.05929265171289444, + "logits/rejected": 0.14080263674259186, + "logps/chosen": -2.3748557567596436, + "logps/rejected": -2.955979824066162, + "loss": 0.7055, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3748557567596436, + "rewards/margins": 0.5811238288879395, + "rewards/rejected": -2.955979824066162, + "sft_loss": 2.2898504734039307, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 2.667905722196229, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": -0.010344207286834717, + "logits/rejected": 0.10175009071826935, + "logps/chosen": -2.370831251144409, + "logps/rejected": -2.8821828365325928, + "loss": 0.7045, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.370831251144409, + "rewards/margins": 0.5113516449928284, + "rewards/rejected": -2.8821828365325928, + "sft_loss": 2.232008457183838, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 1.3278072182397616, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": -0.10242640972137451, + "logits/rejected": 0.06512792408466339, + "logps/chosen": -2.4747941493988037, + "logps/rejected": -2.8564939498901367, + "loss": 0.7109, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.4747941493988037, + "rewards/margins": 0.38169991970062256, + "rewards/rejected": -2.8564939498901367, + "sft_loss": 2.218146800994873, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 2.136760100983307, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": -0.02182629704475403, + "logits/rejected": 0.16530433297157288, + "logps/chosen": -2.430757999420166, + "logps/rejected": -2.852879524230957, + "loss": 0.713, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.430757999420166, + "rewards/margins": 0.4221215844154358, + "rewards/rejected": -2.852879524230957, + "sft_loss": 2.345231533050537, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 4.6770537298554204, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": -0.09692031145095825, + "logits/rejected": 0.055900346487760544, + "logps/chosen": -2.342827320098877, + "logps/rejected": -2.8272900581359863, + "loss": 0.7095, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.342827320098877, + "rewards/margins": 0.48446279764175415, + "rewards/rejected": -2.8272900581359863, + "sft_loss": 2.2327513694763184, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 2.2551930524603083, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": -0.15884767472743988, + "logits/rejected": -0.019425716251134872, + "logps/chosen": -2.444471836090088, + "logps/rejected": -2.7931387424468994, + "loss": 0.7094, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.444471836090088, + "rewards/margins": 0.3486667275428772, + "rewards/rejected": -2.7931387424468994, + "sft_loss": 2.2840495109558105, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 1.446991465344738, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": -0.1402565836906433, + "logits/rejected": -0.01722204126417637, + "logps/chosen": -2.2898192405700684, + "logps/rejected": -2.594698190689087, + "loss": 0.7104, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.2898192405700684, + "rewards/margins": 0.30487877130508423, + "rewards/rejected": -2.594698190689087, + "sft_loss": 2.2371888160705566, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 2.254533551657452, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": -0.17079803347587585, + "logits/rejected": -0.001663824892602861, + "logps/chosen": -2.373328685760498, + "logps/rejected": -2.7454586029052734, + "loss": 0.7082, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.373328685760498, + "rewards/margins": 0.3721300959587097, + "rewards/rejected": -2.7454586029052734, + "sft_loss": 2.2215752601623535, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 2.899391862149443, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": -0.05076109245419502, + "logits/rejected": 0.045064955949783325, + "logps/chosen": -2.3542187213897705, + "logps/rejected": -2.8510525226593018, + "loss": 0.7089, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.3542187213897705, + "rewards/margins": 0.4968341886997223, + "rewards/rejected": -2.8510525226593018, + "sft_loss": 2.2298872470855713, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 1.8550090633215264, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": -0.09492182731628418, + "logits/rejected": 0.08217965066432953, + "logps/chosen": -2.2518415451049805, + "logps/rejected": -2.728621244430542, + "loss": 0.7047, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2518415451049805, + "rewards/margins": 0.47677963972091675, + "rewards/rejected": -2.728621244430542, + "sft_loss": 2.1824424266815186, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 2.039586623800378, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": -0.01995522901415825, + "logits/rejected": 0.12563300132751465, + "logps/chosen": -2.253056049346924, + "logps/rejected": -2.7270843982696533, + "loss": 0.7117, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.253056049346924, + "rewards/margins": 0.47402825951576233, + "rewards/rejected": -2.7270843982696533, + "sft_loss": 2.203734874725342, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 1.6289697484565457, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": -0.10875172913074493, + "logits/rejected": -0.02619188465178013, + "logps/chosen": -2.293623924255371, + "logps/rejected": -2.7733473777770996, + "loss": 0.7158, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.293623924255371, + "rewards/margins": 0.47972339391708374, + "rewards/rejected": -2.7733473777770996, + "sft_loss": 2.2108073234558105, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 2.0602281192051457, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": -0.06123873591423035, + "logits/rejected": 0.04782121628522873, + "logps/chosen": -2.481053352355957, + "logps/rejected": -2.871842384338379, + "loss": 0.7139, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.481053352355957, + "rewards/margins": 0.3907889425754547, + "rewards/rejected": -2.871842384338379, + "sft_loss": 2.2680141925811768, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 1.621361099423288, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": 0.03413773700594902, + "logits/rejected": 0.18661415576934814, + "logps/chosen": -2.510728597640991, + "logps/rejected": -3.1267201900482178, + "loss": 0.7073, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.510728597640991, + "rewards/margins": 0.6159918904304504, + "rewards/rejected": -3.1267201900482178, + "sft_loss": 2.2942333221435547, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 1.2629751944575442, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": -0.11486305296421051, + "logits/rejected": 0.08284200727939606, + "logps/chosen": -2.491071939468384, + "logps/rejected": -2.922267436981201, + "loss": 0.7097, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.491071939468384, + "rewards/margins": 0.43119579553604126, + "rewards/rejected": -2.922267436981201, + "sft_loss": 2.2405238151550293, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 2.3807379465238987, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": -0.15121929347515106, + "logits/rejected": 0.02654470130801201, + "logps/chosen": -2.372063636779785, + "logps/rejected": -2.7418456077575684, + "loss": 0.7143, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.372063636779785, + "rewards/margins": 0.3697819113731384, + "rewards/rejected": -2.7418456077575684, + "sft_loss": 2.2500219345092773, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 2.5285997561445415, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": -0.04930179938673973, + "logits/rejected": 0.08506530523300171, + "logps/chosen": -2.4192235469818115, + "logps/rejected": -2.856471538543701, + "loss": 0.7154, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.4192235469818115, + "rewards/margins": 0.43724822998046875, + "rewards/rejected": -2.856471538543701, + "sft_loss": 2.2640137672424316, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 1.7388598414744387, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": -0.08470118045806885, + "logits/rejected": 0.04234440252184868, + "logps/chosen": -2.269394874572754, + "logps/rejected": -2.7940497398376465, + "loss": 0.7059, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.269394874572754, + "rewards/margins": 0.5246550440788269, + "rewards/rejected": -2.7940497398376465, + "sft_loss": 2.1549248695373535, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 1.2915713925072447, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": -0.11627205461263657, + "logits/rejected": 0.08593938499689102, + "logps/chosen": -2.153496503829956, + "logps/rejected": -2.681713581085205, + "loss": 0.7077, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.153496503829956, + "rewards/margins": 0.5282168388366699, + "rewards/rejected": -2.681713581085205, + "sft_loss": 2.069179058074951, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 2.743951187418527, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": -0.0455852746963501, + "logits/rejected": 0.04003704711794853, + "logps/chosen": -2.217771530151367, + "logps/rejected": -2.646599769592285, + "loss": 0.7105, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.217771530151367, + "rewards/margins": 0.4288281500339508, + "rewards/rejected": -2.646599769592285, + "sft_loss": 2.1145291328430176, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.2856700122356415, + "eval_logits/rejected": 0.3870464861392975, + "eval_logps/chosen": -2.235349655151367, + "eval_logps/rejected": -2.7243196964263916, + "eval_loss": 0.7055234313011169, + "eval_rewards/accuracies": 0.6446587443351746, + "eval_rewards/chosen": -2.235349655151367, + "eval_rewards/margins": 0.4889698326587677, + "eval_rewards/rejected": -2.7243196964263916, + "eval_runtime": 43.3921, + "eval_samples_per_second": 30.996, + "eval_sft_loss": 2.1066486835479736, + "eval_steps_per_second": 7.766, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 1.6950654692711624, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": -0.13364177942276, + "logits/rejected": 0.04280375689268112, + "logps/chosen": -2.181466579437256, + "logps/rejected": -2.7770562171936035, + "loss": 0.6995, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.181466579437256, + "rewards/margins": 0.595589280128479, + "rewards/rejected": -2.7770562171936035, + "sft_loss": 2.1175734996795654, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 3.107157954891758, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": -0.0517873540520668, + "logits/rejected": 0.02858349122107029, + "logps/chosen": -2.2146787643432617, + "logps/rejected": -2.631208896636963, + "loss": 0.7107, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.2146787643432617, + "rewards/margins": 0.41653013229370117, + "rewards/rejected": -2.631208896636963, + "sft_loss": 2.177154064178467, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 3.62624736111432, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": 0.0011221707100048661, + "logits/rejected": 0.12757374346256256, + "logps/chosen": -2.143000602722168, + "logps/rejected": -2.7792890071868896, + "loss": 0.6987, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.143000602722168, + "rewards/margins": 0.6362886428833008, + "rewards/rejected": -2.7792890071868896, + "sft_loss": 2.0995757579803467, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 4.632145814822578, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": 0.02935582958161831, + "logits/rejected": 0.08219500631093979, + "logps/chosen": -2.1796677112579346, + "logps/rejected": -2.678680658340454, + "loss": 0.7122, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1796677112579346, + "rewards/margins": 0.4990130364894867, + "rewards/rejected": -2.678680658340454, + "sft_loss": 2.1346020698547363, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 2.5174642262048366, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": 0.019097473472356796, + "logits/rejected": 0.16284245252609253, + "logps/chosen": -2.130647659301758, + "logps/rejected": -2.58423113822937, + "loss": 0.7126, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.130647659301758, + "rewards/margins": 0.4535837173461914, + "rewards/rejected": -2.58423113822937, + "sft_loss": 2.1586554050445557, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 2.555830574006697, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": 0.1632825881242752, + "logits/rejected": 0.3447878956794739, + "logps/chosen": -2.2275099754333496, + "logps/rejected": -2.6180667877197266, + "loss": 0.7146, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2275099754333496, + "rewards/margins": 0.39055711030960083, + "rewards/rejected": -2.6180667877197266, + "sft_loss": 2.1432220935821533, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 3.672400135778624, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": 0.08189113438129425, + "logits/rejected": 0.22178630530834198, + "logps/chosen": -2.357943058013916, + "logps/rejected": -2.7608790397644043, + "loss": 0.7142, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.357943058013916, + "rewards/margins": 0.4029361605644226, + "rewards/rejected": -2.7608790397644043, + "sft_loss": 2.332587242126465, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 3.5725253771617753, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": 0.06914591789245605, + "logits/rejected": 0.32668501138687134, + "logps/chosen": -2.4014992713928223, + "logps/rejected": -2.8846962451934814, + "loss": 0.7154, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.4014992713928223, + "rewards/margins": 0.48319679498672485, + "rewards/rejected": -2.8846962451934814, + "sft_loss": 2.3815293312072754, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 3.1294455746461116, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": 0.1843930184841156, + "logits/rejected": 0.2454400509595871, + "logps/chosen": -2.3867859840393066, + "logps/rejected": -2.7935075759887695, + "loss": 0.7121, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.3867859840393066, + "rewards/margins": 0.4067217707633972, + "rewards/rejected": -2.7935075759887695, + "sft_loss": 2.3315231800079346, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 3.0367682866830297, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": 0.15431417524814606, + "logits/rejected": 0.2713518738746643, + "logps/chosen": -2.562764883041382, + "logps/rejected": -3.004469394683838, + "loss": 0.71, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.562764883041382, + "rewards/margins": 0.4417042136192322, + "rewards/rejected": -3.004469394683838, + "sft_loss": 2.359578847885132, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 2.2239671562004606, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": 0.17244210839271545, + "logits/rejected": 0.34257280826568604, + "logps/chosen": -2.564682960510254, + "logps/rejected": -3.1213314533233643, + "loss": 0.7148, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.564682960510254, + "rewards/margins": 0.5566484332084656, + "rewards/rejected": -3.1213314533233643, + "sft_loss": 2.3615224361419678, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 1.0920440799644606, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": 0.09747910499572754, + "logits/rejected": 0.3031081259250641, + "logps/chosen": -2.531813144683838, + "logps/rejected": -3.162034034729004, + "loss": 0.7055, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.531813144683838, + "rewards/margins": 0.630220890045166, + "rewards/rejected": -3.162034034729004, + "sft_loss": 2.3049185276031494, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 4.842613450105217, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": 0.18088313937187195, + "logits/rejected": 0.3242112696170807, + "logps/chosen": -2.553947925567627, + "logps/rejected": -3.1162753105163574, + "loss": 0.7099, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.553947925567627, + "rewards/margins": 0.56232750415802, + "rewards/rejected": -3.1162753105163574, + "sft_loss": 2.3149704933166504, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 1.676184646304957, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": 0.1025063768029213, + "logits/rejected": 0.42056649923324585, + "logps/chosen": -2.3593525886535645, + "logps/rejected": -2.967888355255127, + "loss": 0.7043, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.3593525886535645, + "rewards/margins": 0.608535647392273, + "rewards/rejected": -2.967888355255127, + "sft_loss": 2.2593531608581543, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 2.947417875013683, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": 0.17791959643363953, + "logits/rejected": 0.27456963062286377, + "logps/chosen": -2.3868355751037598, + "logps/rejected": -2.758819580078125, + "loss": 0.7185, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.3868355751037598, + "rewards/margins": 0.37198397517204285, + "rewards/rejected": -2.758819580078125, + "sft_loss": 2.237908124923706, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 3.535073328729538, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": 0.129004567861557, + "logits/rejected": 0.3492968678474426, + "logps/chosen": -2.2729153633117676, + "logps/rejected": -2.8254361152648926, + "loss": 0.7008, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2729153633117676, + "rewards/margins": 0.552520751953125, + "rewards/rejected": -2.8254361152648926, + "sft_loss": 2.122039318084717, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 1.9883077785571128, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": 0.08982595801353455, + "logits/rejected": 0.19023357331752777, + "logps/chosen": -2.2362680435180664, + "logps/rejected": -2.804640531539917, + "loss": 0.7015, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2362680435180664, + "rewards/margins": 0.568372905254364, + "rewards/rejected": -2.804640531539917, + "sft_loss": 2.1717028617858887, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 1.7328895899308072, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": -0.07798051089048386, + "logits/rejected": 0.08264918625354767, + "logps/chosen": -2.19826602935791, + "logps/rejected": -2.604795455932617, + "loss": 0.7121, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.19826602935791, + "rewards/margins": 0.4065292775630951, + "rewards/rejected": -2.604795455932617, + "sft_loss": 2.130378246307373, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 2.423264934459615, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": 0.06881462037563324, + "logits/rejected": 0.18817448616027832, + "logps/chosen": -2.142784833908081, + "logps/rejected": -2.5093910694122314, + "loss": 0.7047, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.142784833908081, + "rewards/margins": 0.36660611629486084, + "rewards/rejected": -2.5093910694122314, + "sft_loss": 2.0799660682678223, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 2.4706182493332007, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": 0.026805415749549866, + "logits/rejected": 0.1365644782781601, + "logps/chosen": -2.190398693084717, + "logps/rejected": -2.69905686378479, + "loss": 0.7007, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.190398693084717, + "rewards/margins": 0.508658230304718, + "rewards/rejected": -2.69905686378479, + "sft_loss": 2.1007237434387207, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 2.2734688952192457, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": 0.025261899456381798, + "logits/rejected": 0.23059232532978058, + "logps/chosen": -2.249539852142334, + "logps/rejected": -2.822911024093628, + "loss": 0.7067, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.249539852142334, + "rewards/margins": 0.573371171951294, + "rewards/rejected": -2.822911024093628, + "sft_loss": 2.253406524658203, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 1.905964454067589, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": 0.062464743852615356, + "logits/rejected": 0.18661728501319885, + "logps/chosen": -2.2542724609375, + "logps/rejected": -2.6662635803222656, + "loss": 0.7263, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.2542724609375, + "rewards/margins": 0.41199105978012085, + "rewards/rejected": -2.6662635803222656, + "sft_loss": 2.176217555999756, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 2.078130359969576, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": 0.060682039707899094, + "logits/rejected": 0.2705400288105011, + "logps/chosen": -2.239940881729126, + "logps/rejected": -2.697516441345215, + "loss": 0.7101, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.239940881729126, + "rewards/margins": 0.45757532119750977, + "rewards/rejected": -2.697516441345215, + "sft_loss": 2.1451468467712402, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 2.0532116326739382, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": 0.14619162678718567, + "logits/rejected": 0.3306914269924164, + "logps/chosen": -2.332319498062134, + "logps/rejected": -2.874541759490967, + "loss": 0.7052, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.332319498062134, + "rewards/margins": 0.5422223210334778, + "rewards/rejected": -2.874541759490967, + "sft_loss": 2.18951153755188, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 1.3305226834529542, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": 0.07990497350692749, + "logits/rejected": 0.30456340312957764, + "logps/chosen": -2.451970338821411, + "logps/rejected": -2.8729283809661865, + "loss": 0.7083, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.451970338821411, + "rewards/margins": 0.4209579527378082, + "rewards/rejected": -2.8729283809661865, + "sft_loss": 2.1967291831970215, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 2.3836534390528743, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": 0.08301494270563126, + "logits/rejected": 0.18483616411685944, + "logps/chosen": -2.343928337097168, + "logps/rejected": -2.943770170211792, + "loss": 0.7128, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.343928337097168, + "rewards/margins": 0.5998419523239136, + "rewards/rejected": -2.943770170211792, + "sft_loss": 2.2015366554260254, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 1.9175709672468506, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": 0.14284172654151917, + "logits/rejected": 0.2308415174484253, + "logps/chosen": -2.4166016578674316, + "logps/rejected": -2.889622211456299, + "loss": 0.7092, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.4166016578674316, + "rewards/margins": 0.4730204939842224, + "rewards/rejected": -2.889622211456299, + "sft_loss": 2.2566981315612793, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 1.7468529370557997, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": 0.14955595135688782, + "logits/rejected": 0.26237112283706665, + "logps/chosen": -2.4178216457366943, + "logps/rejected": -2.961228609085083, + "loss": 0.7034, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.4178216457366943, + "rewards/margins": 0.54340660572052, + "rewards/rejected": -2.961228609085083, + "sft_loss": 2.302690267562866, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 1.7821999235187895, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": 0.04973750561475754, + "logits/rejected": 0.23481830954551697, + "logps/chosen": -2.339595079421997, + "logps/rejected": -2.958242893218994, + "loss": 0.6998, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.339595079421997, + "rewards/margins": 0.6186478137969971, + "rewards/rejected": -2.958242893218994, + "sft_loss": 2.2271862030029297, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 2.375938544297863, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": 0.03858000785112381, + "logits/rejected": 0.2346847802400589, + "logps/chosen": -2.3574490547180176, + "logps/rejected": -2.7179243564605713, + "loss": 0.714, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.3574490547180176, + "rewards/margins": 0.3604753017425537, + "rewards/rejected": -2.7179243564605713, + "sft_loss": 2.230804204940796, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 1.3797533963570972, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": 0.10901288688182831, + "logits/rejected": 0.3531850278377533, + "logps/chosen": -2.3547780513763428, + "logps/rejected": -2.7378902435302734, + "loss": 0.7157, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.3547780513763428, + "rewards/margins": 0.38311222195625305, + "rewards/rejected": -2.7378902435302734, + "sft_loss": 2.27116322517395, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 2.4267629639987205, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": 0.12777450680732727, + "logits/rejected": 0.3147388696670532, + "logps/chosen": -2.2933754920959473, + "logps/rejected": -2.753551959991455, + "loss": 0.7081, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.2933754920959473, + "rewards/margins": 0.46017637848854065, + "rewards/rejected": -2.753551959991455, + "sft_loss": 2.24653697013855, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 1.876050989117389, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": 0.16586394608020782, + "logits/rejected": 0.19615688920021057, + "logps/chosen": -2.46045184135437, + "logps/rejected": -2.9221224784851074, + "loss": 0.7113, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.46045184135437, + "rewards/margins": 0.46167030930519104, + "rewards/rejected": -2.9221224784851074, + "sft_loss": 2.3795344829559326, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 2.8189772657931726, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": 0.1627335250377655, + "logits/rejected": 0.17865543067455292, + "logps/chosen": -2.2735114097595215, + "logps/rejected": -2.731293201446533, + "loss": 0.7084, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2735114097595215, + "rewards/margins": 0.45778173208236694, + "rewards/rejected": -2.731293201446533, + "sft_loss": 2.200697898864746, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 1.3613854877272462, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": 0.047867465764284134, + "logits/rejected": 0.27975499629974365, + "logps/chosen": -2.2784676551818848, + "logps/rejected": -2.8016064167022705, + "loss": 0.7053, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2784676551818848, + "rewards/margins": 0.5231388807296753, + "rewards/rejected": -2.8016064167022705, + "sft_loss": 2.259875774383545, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 2.045704139911155, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": 0.03475247696042061, + "logits/rejected": 0.26037245988845825, + "logps/chosen": -2.4209530353546143, + "logps/rejected": -2.78556489944458, + "loss": 0.7137, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.4209530353546143, + "rewards/margins": 0.3646118640899658, + "rewards/rejected": -2.78556489944458, + "sft_loss": 2.3162875175476074, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 2.1236606797076285, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": 0.1473628282546997, + "logits/rejected": 0.3909761905670166, + "logps/chosen": -2.32572603225708, + "logps/rejected": -2.7549567222595215, + "loss": 0.7103, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.32572603225708, + "rewards/margins": 0.42923077940940857, + "rewards/rejected": -2.7549567222595215, + "sft_loss": 2.276968002319336, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 2.2436977480189575, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": 0.2726105749607086, + "logits/rejected": 0.2880789637565613, + "logps/chosen": -2.1168220043182373, + "logps/rejected": -2.6466643810272217, + "loss": 0.7027, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1168220043182373, + "rewards/margins": 0.5298421382904053, + "rewards/rejected": -2.6466643810272217, + "sft_loss": 2.1228058338165283, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 2.0435846330257834, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": 0.22745075821876526, + "logits/rejected": 0.3045138418674469, + "logps/chosen": -2.261383295059204, + "logps/rejected": -2.6280386447906494, + "loss": 0.718, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.261383295059204, + "rewards/margins": 0.366655170917511, + "rewards/rejected": -2.6280386447906494, + "sft_loss": 2.2003605365753174, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 1.4535000931898072, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": 0.1563439518213272, + "logits/rejected": 0.34087103605270386, + "logps/chosen": -2.3654720783233643, + "logps/rejected": -2.9726266860961914, + "loss": 0.7015, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.3654720783233643, + "rewards/margins": 0.607154369354248, + "rewards/rejected": -2.9726266860961914, + "sft_loss": 2.3127644062042236, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 2.08251689341471, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": 0.18138954043388367, + "logits/rejected": 0.34113839268684387, + "logps/chosen": -2.468475818634033, + "logps/rejected": -3.0173211097717285, + "loss": 0.7131, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.468475818634033, + "rewards/margins": 0.5488454103469849, + "rewards/rejected": -3.0173211097717285, + "sft_loss": 2.478551149368286, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 1.9358358256624035, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": 0.1209973692893982, + "logits/rejected": 0.3399454951286316, + "logps/chosen": -2.2919278144836426, + "logps/rejected": -2.8712244033813477, + "loss": 0.706, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.2919278144836426, + "rewards/margins": 0.5792968273162842, + "rewards/rejected": -2.8712244033813477, + "sft_loss": 2.263434886932373, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 2.0145294277350794, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": 0.1270841807126999, + "logits/rejected": 0.3247852921485901, + "logps/chosen": -2.1929144859313965, + "logps/rejected": -2.6835200786590576, + "loss": 0.7043, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1929144859313965, + "rewards/margins": 0.49060574173927307, + "rewards/rejected": -2.6835200786590576, + "sft_loss": 2.1795461177825928, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 3.9739830078495095, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": 0.13059914112091064, + "logits/rejected": 0.3214924931526184, + "logps/chosen": -2.224752187728882, + "logps/rejected": -2.8040366172790527, + "loss": 0.7077, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.224752187728882, + "rewards/margins": 0.5792843103408813, + "rewards/rejected": -2.8040366172790527, + "sft_loss": 2.2124037742614746, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 3.151943102651046, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": 0.10221121460199356, + "logits/rejected": 0.2548214793205261, + "logps/chosen": -2.177299976348877, + "logps/rejected": -2.803210496902466, + "loss": 0.6956, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.177299976348877, + "rewards/margins": 0.625910222530365, + "rewards/rejected": -2.803210496902466, + "sft_loss": 2.131620168685913, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 4.302780014141352, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": 0.16803228855133057, + "logits/rejected": 0.27060407400131226, + "logps/chosen": -2.1256794929504395, + "logps/rejected": -2.6368584632873535, + "loss": 0.7032, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1256794929504395, + "rewards/margins": 0.5111792087554932, + "rewards/rejected": -2.6368584632873535, + "sft_loss": 2.1204867362976074, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 1.9088328959701046, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": 0.1321832537651062, + "logits/rejected": 0.27038392424583435, + "logps/chosen": -2.1786580085754395, + "logps/rejected": -2.5679731369018555, + "loss": 0.7235, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.1786580085754395, + "rewards/margins": 0.3893152177333832, + "rewards/rejected": -2.5679731369018555, + "sft_loss": 2.169933319091797, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 1.5228203778779503, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": 0.1220581904053688, + "logits/rejected": 0.3341611623764038, + "logps/chosen": -2.1551034450531006, + "logps/rejected": -2.824923038482666, + "loss": 0.6967, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1551034450531006, + "rewards/margins": 0.6698193550109863, + "rewards/rejected": -2.824923038482666, + "sft_loss": 2.188110828399658, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 1.8722753805757277, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": 0.0969678983092308, + "logits/rejected": 0.2719166874885559, + "logps/chosen": -2.218916654586792, + "logps/rejected": -2.8767879009246826, + "loss": 0.7044, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.218916654586792, + "rewards/margins": 0.657870888710022, + "rewards/rejected": -2.8767879009246826, + "sft_loss": 2.1699187755584717, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 2.524700899597618, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": 0.07900562137365341, + "logits/rejected": 0.2019214928150177, + "logps/chosen": -2.3817086219787598, + "logps/rejected": -2.9759418964385986, + "loss": 0.7069, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.3817086219787598, + "rewards/margins": 0.5942331552505493, + "rewards/rejected": -2.9759418964385986, + "sft_loss": 2.3367366790771484, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 2.778143248389814, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": 0.15386387705802917, + "logits/rejected": 0.22167351841926575, + "logps/chosen": -2.329440116882324, + "logps/rejected": -2.9883041381835938, + "loss": 0.6992, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.329440116882324, + "rewards/margins": 0.6588642001152039, + "rewards/rejected": -2.9883041381835938, + "sft_loss": 2.254359483718872, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 2.163166547109692, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": 0.20119425654411316, + "logits/rejected": 0.4054529070854187, + "logps/chosen": -2.332470655441284, + "logps/rejected": -2.8844902515411377, + "loss": 0.703, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.332470655441284, + "rewards/margins": 0.5520192384719849, + "rewards/rejected": -2.8844902515411377, + "sft_loss": 2.2267251014709473, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 2.572689395806096, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": 0.14655031263828278, + "logits/rejected": 0.2958758473396301, + "logps/chosen": -2.2035248279571533, + "logps/rejected": -2.95106840133667, + "loss": 0.6966, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2035248279571533, + "rewards/margins": 0.7475437521934509, + "rewards/rejected": -2.95106840133667, + "sft_loss": 2.208761692047119, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 2.4396453685753428, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": 0.07241447269916534, + "logits/rejected": 0.3418889343738556, + "logps/chosen": -2.136514902114868, + "logps/rejected": -2.6713004112243652, + "loss": 0.7007, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.136514902114868, + "rewards/margins": 0.5347855687141418, + "rewards/rejected": -2.6713004112243652, + "sft_loss": 2.092672109603882, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 3.0120905040874506, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": 0.18253377079963684, + "logits/rejected": 0.33938705921173096, + "logps/chosen": -2.040200710296631, + "logps/rejected": -2.693777561187744, + "loss": 0.7011, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.040200710296631, + "rewards/margins": 0.6535772085189819, + "rewards/rejected": -2.693777561187744, + "sft_loss": 2.049227237701416, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 4.291489973247969, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": 0.08960796147584915, + "logits/rejected": 0.21009111404418945, + "logps/chosen": -2.233616828918457, + "logps/rejected": -2.7052865028381348, + "loss": 0.7068, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.233616828918457, + "rewards/margins": 0.4716699719429016, + "rewards/rejected": -2.7052865028381348, + "sft_loss": 2.2202603816986084, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 1.6984441890547815, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": 0.14620177447795868, + "logits/rejected": 0.3416219651699066, + "logps/chosen": -2.303943157196045, + "logps/rejected": -2.638916492462158, + "loss": 0.7126, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.303943157196045, + "rewards/margins": 0.3349727988243103, + "rewards/rejected": -2.638916492462158, + "sft_loss": 2.2308120727539062, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 2.0181546365872443, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": 0.16837917268276215, + "logits/rejected": 0.2764458954334259, + "logps/chosen": -2.180739164352417, + "logps/rejected": -2.62367582321167, + "loss": 0.7072, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.180739164352417, + "rewards/margins": 0.44293683767318726, + "rewards/rejected": -2.62367582321167, + "sft_loss": 2.108156204223633, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 2.3660885845943467, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": 0.0687054768204689, + "logits/rejected": 0.2612117528915405, + "logps/chosen": -2.339418888092041, + "logps/rejected": -2.985769748687744, + "loss": 0.7088, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.339418888092041, + "rewards/margins": 0.6463507413864136, + "rewards/rejected": -2.985769748687744, + "sft_loss": 2.275740385055542, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 2.294313228399238, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": 0.18929219245910645, + "logits/rejected": 0.35968270897865295, + "logps/chosen": -2.3253414630889893, + "logps/rejected": -3.0628182888031006, + "loss": 0.7024, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3253414630889893, + "rewards/margins": 0.7374764680862427, + "rewards/rejected": -3.0628182888031006, + "sft_loss": 2.2206921577453613, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 2.4074171680487217, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": 0.10449817031621933, + "logits/rejected": 0.21964387595653534, + "logps/chosen": -2.3780767917633057, + "logps/rejected": -2.781838893890381, + "loss": 0.7068, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.3780767917633057, + "rewards/margins": 0.40376242995262146, + "rewards/rejected": -2.781838893890381, + "sft_loss": 2.1239264011383057, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 1.7207229215774509, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": 0.16424152255058289, + "logits/rejected": 0.33380651473999023, + "logps/chosen": -2.299243927001953, + "logps/rejected": -2.7112672328948975, + "loss": 0.7182, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.299243927001953, + "rewards/margins": 0.41202330589294434, + "rewards/rejected": -2.7112672328948975, + "sft_loss": 2.189572811126709, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 2.233411754559403, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": 0.14826945960521698, + "logits/rejected": 0.3402601182460785, + "logps/chosen": -2.322040557861328, + "logps/rejected": -3.035202980041504, + "loss": 0.704, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.322040557861328, + "rewards/margins": 0.7131628394126892, + "rewards/rejected": -3.035202980041504, + "sft_loss": 2.1495418548583984, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 2.6376907033408323, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": 0.04995732381939888, + "logits/rejected": 0.19983510673046112, + "logps/chosen": -2.2585396766662598, + "logps/rejected": -2.790225028991699, + "loss": 0.7016, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.2585396766662598, + "rewards/margins": 0.5316852927207947, + "rewards/rejected": -2.790225028991699, + "sft_loss": 2.169877290725708, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 2.155798516432301, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": 0.048025552183389664, + "logits/rejected": 0.272845596075058, + "logps/chosen": -2.3054137229919434, + "logps/rejected": -2.8888356685638428, + "loss": 0.7, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3054137229919434, + "rewards/margins": 0.583422064781189, + "rewards/rejected": -2.8888356685638428, + "sft_loss": 2.1739730834960938, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 2.161030120954079, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": 0.19647592306137085, + "logits/rejected": 0.30404067039489746, + "logps/chosen": -2.1804988384246826, + "logps/rejected": -2.6770386695861816, + "loss": 0.7074, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1804988384246826, + "rewards/margins": 0.4965395927429199, + "rewards/rejected": -2.6770386695861816, + "sft_loss": 2.0427918434143066, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 1.7786484279015686, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": 0.10375823080539703, + "logits/rejected": 0.3111613094806671, + "logps/chosen": -2.209398031234741, + "logps/rejected": -2.770206928253174, + "loss": 0.697, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.209398031234741, + "rewards/margins": 0.5608090162277222, + "rewards/rejected": -2.770206928253174, + "sft_loss": 2.001215934753418, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 2.0623461324880252, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": 0.14811334013938904, + "logits/rejected": 0.20558318495750427, + "logps/chosen": -2.205653190612793, + "logps/rejected": -2.572145700454712, + "loss": 0.7099, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.205653190612793, + "rewards/margins": 0.3664925694465637, + "rewards/rejected": -2.572145700454712, + "sft_loss": 2.1566948890686035, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 2.2424158455070224, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": 0.0038683507591485977, + "logits/rejected": 0.1579521894454956, + "logps/chosen": -2.1706321239471436, + "logps/rejected": -2.7472522258758545, + "loss": 0.7053, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1706321239471436, + "rewards/margins": 0.5766201615333557, + "rewards/rejected": -2.7472522258758545, + "sft_loss": 2.0863354206085205, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 1.9617321040832172, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": 0.06456903368234634, + "logits/rejected": 0.24067726731300354, + "logps/chosen": -2.1974761486053467, + "logps/rejected": -2.812023401260376, + "loss": 0.6997, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1974761486053467, + "rewards/margins": 0.6145472526550293, + "rewards/rejected": -2.812023401260376, + "sft_loss": 2.060354709625244, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 2.16126487305779, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": 0.15495900809764862, + "logits/rejected": 0.27409639954566956, + "logps/chosen": -2.3435518741607666, + "logps/rejected": -2.7621870040893555, + "loss": 0.7097, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3435518741607666, + "rewards/margins": 0.4186350405216217, + "rewards/rejected": -2.7621870040893555, + "sft_loss": 2.2268261909484863, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 2.6944212251680324, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": 0.1989332139492035, + "logits/rejected": 0.3290112018585205, + "logps/chosen": -2.212402820587158, + "logps/rejected": -2.9126648902893066, + "loss": 0.6954, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.212402820587158, + "rewards/margins": 0.700262188911438, + "rewards/rejected": -2.9126648902893066, + "sft_loss": 2.144258737564087, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 2.1501122351517363, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": 0.1345038115978241, + "logits/rejected": 0.2468636929988861, + "logps/chosen": -2.232625722885132, + "logps/rejected": -2.700748920440674, + "loss": 0.7052, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.232625722885132, + "rewards/margins": 0.4681231379508972, + "rewards/rejected": -2.700748920440674, + "sft_loss": 2.1846437454223633, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 2.681493179683401, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": 0.15378674864768982, + "logits/rejected": 0.34652647376060486, + "logps/chosen": -2.3802332878112793, + "logps/rejected": -2.8429431915283203, + "loss": 0.7063, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.3802332878112793, + "rewards/margins": 0.46271008253097534, + "rewards/rejected": -2.8429431915283203, + "sft_loss": 2.2455978393554688, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 3.349380471785963, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": 0.12027013301849365, + "logits/rejected": 0.34836727380752563, + "logps/chosen": -2.1819441318511963, + "logps/rejected": -2.6743319034576416, + "loss": 0.7072, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1819441318511963, + "rewards/margins": 0.49238792061805725, + "rewards/rejected": -2.6743319034576416, + "sft_loss": 2.1214489936828613, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 1.7856091716715792, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": 0.12357620894908905, + "logits/rejected": 0.3255251944065094, + "logps/chosen": -2.2828421592712402, + "logps/rejected": -2.861924886703491, + "loss": 0.7083, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2828421592712402, + "rewards/margins": 0.5790826082229614, + "rewards/rejected": -2.861924886703491, + "sft_loss": 2.2045605182647705, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 3.1641412359972074, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": 0.16735532879829407, + "logits/rejected": 0.3422599732875824, + "logps/chosen": -2.2051072120666504, + "logps/rejected": -3.099365234375, + "loss": 0.6991, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2051072120666504, + "rewards/margins": 0.8942579030990601, + "rewards/rejected": -3.099365234375, + "sft_loss": 2.128654956817627, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 1.9470313023305175, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": 0.09066956490278244, + "logits/rejected": 0.2607859671115875, + "logps/chosen": -2.3099732398986816, + "logps/rejected": -3.01261305809021, + "loss": 0.6981, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3099732398986816, + "rewards/margins": 0.7026399970054626, + "rewards/rejected": -3.01261305809021, + "sft_loss": 2.2505602836608887, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 3.184610641159135, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": 0.1539539098739624, + "logits/rejected": 0.30308666825294495, + "logps/chosen": -2.1489720344543457, + "logps/rejected": -2.784179449081421, + "loss": 0.695, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1489720344543457, + "rewards/margins": 0.6352076530456543, + "rewards/rejected": -2.784179449081421, + "sft_loss": 2.13791561126709, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 4.4377039888981775, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": 0.12475017458200455, + "logits/rejected": 0.21696901321411133, + "logps/chosen": -2.19476580619812, + "logps/rejected": -2.6287224292755127, + "loss": 0.7071, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.19476580619812, + "rewards/margins": 0.4339566230773926, + "rewards/rejected": -2.6287224292755127, + "sft_loss": 2.0920956134796143, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.555233359336853, + "eval_logits/rejected": 0.6762645244598389, + "eval_logps/chosen": -2.1362555027008057, + "eval_logps/rejected": -2.7640209197998047, + "eval_loss": 0.6987842917442322, + "eval_rewards/accuracies": 0.6691394448280334, + "eval_rewards/chosen": -2.1362555027008057, + "eval_rewards/margins": 0.6277655363082886, + "eval_rewards/rejected": -2.7640209197998047, + "eval_runtime": 43.2828, + "eval_samples_per_second": 31.075, + "eval_sft_loss": 2.044482946395874, + "eval_steps_per_second": 7.786, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 3.830774189470102, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": 0.00027870089979842305, + "logits/rejected": 0.22381892800331116, + "logps/chosen": -2.0898985862731934, + "logps/rejected": -2.686279773712158, + "loss": 0.6929, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0898985862731934, + "rewards/margins": 0.5963811874389648, + "rewards/rejected": -2.686279773712158, + "sft_loss": 2.0494701862335205, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 4.25157046723289, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": 0.021906504407525063, + "logits/rejected": 0.2690677046775818, + "logps/chosen": -2.0723724365234375, + "logps/rejected": -2.5781683921813965, + "loss": 0.7024, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.0723724365234375, + "rewards/margins": 0.5057964324951172, + "rewards/rejected": -2.5781683921813965, + "sft_loss": 2.0166726112365723, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 1.7915990533349566, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": 0.06573444604873657, + "logits/rejected": 0.22309072315692902, + "logps/chosen": -2.202409029006958, + "logps/rejected": -2.7307677268981934, + "loss": 0.7096, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.202409029006958, + "rewards/margins": 0.528359055519104, + "rewards/rejected": -2.7307677268981934, + "sft_loss": 2.1225969791412354, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 2.306640871794144, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": 0.2040838748216629, + "logits/rejected": 0.36291855573654175, + "logps/chosen": -2.194674015045166, + "logps/rejected": -2.743324041366577, + "loss": 0.7043, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.194674015045166, + "rewards/margins": 0.5486500859260559, + "rewards/rejected": -2.743324041366577, + "sft_loss": 2.0980405807495117, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 2.2545845017591235, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": 0.010583726689219475, + "logits/rejected": 0.27537912130355835, + "logps/chosen": -2.328881025314331, + "logps/rejected": -2.8073489665985107, + "loss": 0.7045, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.328881025314331, + "rewards/margins": 0.4784678816795349, + "rewards/rejected": -2.8073489665985107, + "sft_loss": 2.170609951019287, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 3.663446419735623, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": 0.0909842699766159, + "logits/rejected": 0.22236530482769012, + "logps/chosen": -2.3348231315612793, + "logps/rejected": -2.9920623302459717, + "loss": 0.7037, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.3348231315612793, + "rewards/margins": 0.6572390198707581, + "rewards/rejected": -2.9920623302459717, + "sft_loss": 2.1873583793640137, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 1.4543890342842616, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": 0.1440887749195099, + "logits/rejected": 0.33302026987075806, + "logps/chosen": -2.454163074493408, + "logps/rejected": -3.0908660888671875, + "loss": 0.7083, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.454163074493408, + "rewards/margins": 0.6367031335830688, + "rewards/rejected": -3.0908660888671875, + "sft_loss": 2.254920721054077, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 2.4722340588869978, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": 0.026768425479531288, + "logits/rejected": 0.19142289459705353, + "logps/chosen": -2.4390079975128174, + "logps/rejected": -3.031845808029175, + "loss": 0.7071, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.4390079975128174, + "rewards/margins": 0.592837929725647, + "rewards/rejected": -3.031845808029175, + "sft_loss": 2.2959556579589844, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 1.8974146027052907, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": 0.04718981683254242, + "logits/rejected": 0.26890960335731506, + "logps/chosen": -2.355416774749756, + "logps/rejected": -2.8213212490081787, + "loss": 0.7085, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.355416774749756, + "rewards/margins": 0.46590447425842285, + "rewards/rejected": -2.8213212490081787, + "sft_loss": 2.2121856212615967, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 2.5707161962938048, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": 0.08500464260578156, + "logits/rejected": 0.3310920298099518, + "logps/chosen": -2.398761749267578, + "logps/rejected": -2.832510471343994, + "loss": 0.7127, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.398761749267578, + "rewards/margins": 0.43374842405319214, + "rewards/rejected": -2.832510471343994, + "sft_loss": 2.2532973289489746, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 2.777155725892231, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": 0.10809852927923203, + "logits/rejected": 0.26978474855422974, + "logps/chosen": -2.203864336013794, + "logps/rejected": -2.7843985557556152, + "loss": 0.703, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.203864336013794, + "rewards/margins": 0.5805341005325317, + "rewards/rejected": -2.7843985557556152, + "sft_loss": 2.1802217960357666, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 2.6576584593887658, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": 0.07701893150806427, + "logits/rejected": 0.2485908716917038, + "logps/chosen": -2.1946444511413574, + "logps/rejected": -2.8757450580596924, + "loss": 0.7118, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1946444511413574, + "rewards/margins": 0.681100606918335, + "rewards/rejected": -2.8757450580596924, + "sft_loss": 2.120004177093506, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 2.2163120337499542, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": 0.1574430763721466, + "logits/rejected": 0.3811032474040985, + "logps/chosen": -2.132166862487793, + "logps/rejected": -2.809222459793091, + "loss": 0.7002, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.132166862487793, + "rewards/margins": 0.6770555973052979, + "rewards/rejected": -2.809222459793091, + "sft_loss": 2.09527850151062, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 2.0634743227388324, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": 0.061243295669555664, + "logits/rejected": 0.16142085194587708, + "logps/chosen": -2.1922335624694824, + "logps/rejected": -2.8089139461517334, + "loss": 0.6999, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1922335624694824, + "rewards/margins": 0.6166807413101196, + "rewards/rejected": -2.8089139461517334, + "sft_loss": 2.1539809703826904, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 1.340766627146057, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": 0.1113109439611435, + "logits/rejected": 0.21860043704509735, + "logps/chosen": -2.2608723640441895, + "logps/rejected": -2.8428518772125244, + "loss": 0.7003, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.2608723640441895, + "rewards/margins": 0.5819796323776245, + "rewards/rejected": -2.8428518772125244, + "sft_loss": 2.1284279823303223, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 5.504206460094653, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": 0.10970310121774673, + "logits/rejected": 0.3257525861263275, + "logps/chosen": -2.2973437309265137, + "logps/rejected": -2.927894115447998, + "loss": 0.7095, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2973437309265137, + "rewards/margins": 0.6305506229400635, + "rewards/rejected": -2.927894115447998, + "sft_loss": 2.231661319732666, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 2.9750399204385043, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": 0.1559508889913559, + "logits/rejected": 0.3778040409088135, + "logps/chosen": -2.1892244815826416, + "logps/rejected": -2.6358699798583984, + "loss": 0.7002, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1892244815826416, + "rewards/margins": 0.4466456472873688, + "rewards/rejected": -2.6358699798583984, + "sft_loss": 2.093132972717285, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 2.4309268153225263, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": 0.05222257971763611, + "logits/rejected": 0.2760353684425354, + "logps/chosen": -2.3155436515808105, + "logps/rejected": -2.7205491065979004, + "loss": 0.7059, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.3155436515808105, + "rewards/margins": 0.4050050377845764, + "rewards/rejected": -2.7205491065979004, + "sft_loss": 2.2433018684387207, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 3.117488409175474, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": 0.16181819140911102, + "logits/rejected": 0.15442785620689392, + "logps/chosen": -2.341973066329956, + "logps/rejected": -3.1276779174804688, + "loss": 0.701, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.341973066329956, + "rewards/margins": 0.7857049703598022, + "rewards/rejected": -3.1276779174804688, + "sft_loss": 2.221696138381958, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 2.080673352225228, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": 0.13009627163410187, + "logits/rejected": 0.4935070872306824, + "logps/chosen": -2.3354337215423584, + "logps/rejected": -3.1901021003723145, + "loss": 0.6983, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3354337215423584, + "rewards/margins": 0.8546679615974426, + "rewards/rejected": -3.1901021003723145, + "sft_loss": 2.2007999420166016, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 3.078906430649519, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": 0.33531293272972107, + "logits/rejected": 0.4072476327419281, + "logps/chosen": -2.3676962852478027, + "logps/rejected": -2.9815027713775635, + "loss": 0.7061, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.3676962852478027, + "rewards/margins": 0.6138062477111816, + "rewards/rejected": -2.9815027713775635, + "sft_loss": 2.1546168327331543, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 2.5677069132991104, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": 0.29916244745254517, + "logits/rejected": 0.40381136536598206, + "logps/chosen": -2.371689796447754, + "logps/rejected": -2.9242143630981445, + "loss": 0.7138, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.371689796447754, + "rewards/margins": 0.5525246262550354, + "rewards/rejected": -2.9242143630981445, + "sft_loss": 2.1695141792297363, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 1.7137276143457714, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": 0.16459494829177856, + "logits/rejected": 0.4160476624965668, + "logps/chosen": -2.3387157917022705, + "logps/rejected": -3.0234005451202393, + "loss": 0.6976, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3387157917022705, + "rewards/margins": 0.6846847534179688, + "rewards/rejected": -3.0234005451202393, + "sft_loss": 2.14802885055542, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 2.263175751819887, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": 0.005195315927267075, + "logits/rejected": 0.2853628098964691, + "logps/chosen": -2.2528247833251953, + "logps/rejected": -2.944413661956787, + "loss": 0.6995, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.2528247833251953, + "rewards/margins": 0.6915886998176575, + "rewards/rejected": -2.944413661956787, + "sft_loss": 2.1431026458740234, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 1.9875831630039975, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": 0.22154255211353302, + "logits/rejected": 0.3978469967842102, + "logps/chosen": -2.3452460765838623, + "logps/rejected": -2.894991397857666, + "loss": 0.7071, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.3452460765838623, + "rewards/margins": 0.5497456789016724, + "rewards/rejected": -2.894991397857666, + "sft_loss": 2.125516891479492, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 2.534540663345523, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": 0.18094339966773987, + "logits/rejected": 0.34638968110084534, + "logps/chosen": -2.3127498626708984, + "logps/rejected": -2.7011842727661133, + "loss": 0.7069, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.3127498626708984, + "rewards/margins": 0.38843435049057007, + "rewards/rejected": -2.7011842727661133, + "sft_loss": 2.1503615379333496, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 2.0331521044506142, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": 0.21482765674591064, + "logits/rejected": 0.348909854888916, + "logps/chosen": -2.3520545959472656, + "logps/rejected": -2.8570454120635986, + "loss": 0.7012, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.3520545959472656, + "rewards/margins": 0.5049908757209778, + "rewards/rejected": -2.8570454120635986, + "sft_loss": 2.2471046447753906, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 2.2432607952668615, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": 0.20482811331748962, + "logits/rejected": 0.276005357503891, + "logps/chosen": -2.2051875591278076, + "logps/rejected": -2.8122718334198, + "loss": 0.7017, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2051875591278076, + "rewards/margins": 0.6070839166641235, + "rewards/rejected": -2.8122718334198, + "sft_loss": 2.1116836071014404, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 2.1432552849114233, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": 0.20497314631938934, + "logits/rejected": 0.47388821840286255, + "logps/chosen": -2.1197383403778076, + "logps/rejected": -2.9532077312469482, + "loss": 0.6895, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.1197383403778076, + "rewards/margins": 0.8334692716598511, + "rewards/rejected": -2.9532077312469482, + "sft_loss": 2.102599620819092, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 2.5423288896269143, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": 0.19564411044120789, + "logits/rejected": 0.3067219853401184, + "logps/chosen": -2.1597135066986084, + "logps/rejected": -2.5702147483825684, + "loss": 0.7134, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.1597135066986084, + "rewards/margins": 0.4105011522769928, + "rewards/rejected": -2.5702147483825684, + "sft_loss": 2.123806953430176, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 1.8893484799831797, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": 0.1256810575723648, + "logits/rejected": 0.49158114194869995, + "logps/chosen": -2.386765956878662, + "logps/rejected": -2.8819260597229004, + "loss": 0.7046, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.386765956878662, + "rewards/margins": 0.49516019225120544, + "rewards/rejected": -2.8819260597229004, + "sft_loss": 2.235278606414795, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 1.9988518198555865, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": 0.03835318237543106, + "logits/rejected": 0.19741104543209076, + "logps/chosen": -2.27360463142395, + "logps/rejected": -2.9089667797088623, + "loss": 0.702, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.27360463142395, + "rewards/margins": 0.6353622674942017, + "rewards/rejected": -2.9089667797088623, + "sft_loss": 2.2150821685791016, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 1.5629991357488582, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": 0.03389846533536911, + "logits/rejected": 0.19717106223106384, + "logps/chosen": -2.255682945251465, + "logps/rejected": -2.724970579147339, + "loss": 0.7011, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.255682945251465, + "rewards/margins": 0.4692877233028412, + "rewards/rejected": -2.724970579147339, + "sft_loss": 2.155445098876953, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 1.8213600131915777, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": 0.07529856264591217, + "logits/rejected": 0.2234765738248825, + "logps/chosen": -2.281263828277588, + "logps/rejected": -3.119943141937256, + "loss": 0.6933, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.281263828277588, + "rewards/margins": 0.8386794328689575, + "rewards/rejected": -3.119943141937256, + "sft_loss": 2.2313129901885986, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 2.9933299216173577, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": -0.016869869083166122, + "logits/rejected": 0.16321472823619843, + "logps/chosen": -2.199903964996338, + "logps/rejected": -2.920362949371338, + "loss": 0.6944, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.199903964996338, + "rewards/margins": 0.720458984375, + "rewards/rejected": -2.920362949371338, + "sft_loss": 2.1823477745056152, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 5.500730072855545, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": -0.075008325278759, + "logits/rejected": 0.0719195008277893, + "logps/chosen": -2.2905290126800537, + "logps/rejected": -2.7570128440856934, + "loss": 0.7169, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.2905290126800537, + "rewards/margins": 0.4664839208126068, + "rewards/rejected": -2.7570128440856934, + "sft_loss": 2.2204031944274902, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 5.900709315702155, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": -0.041843343526124954, + "logits/rejected": 0.1123179942369461, + "logps/chosen": -2.1659350395202637, + "logps/rejected": -2.8170411586761475, + "loss": 0.7046, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1659350395202637, + "rewards/margins": 0.6511061787605286, + "rewards/rejected": -2.8170411586761475, + "sft_loss": 2.1369853019714355, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 3.0244073453127034, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": -0.009002335369586945, + "logits/rejected": 0.1299920678138733, + "logps/chosen": -2.429741382598877, + "logps/rejected": -2.960287570953369, + "loss": 0.7078, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.429741382598877, + "rewards/margins": 0.5305465459823608, + "rewards/rejected": -2.960287570953369, + "sft_loss": 2.339808225631714, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 2.5131062987246704, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": 0.0465545579791069, + "logits/rejected": 0.15400944650173187, + "logps/chosen": -2.3714404106140137, + "logps/rejected": -2.8975820541381836, + "loss": 0.712, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3714404106140137, + "rewards/margins": 0.5261418223381042, + "rewards/rejected": -2.8975820541381836, + "sft_loss": 2.300178050994873, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 1.4429172448843048, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": -0.11528120189905167, + "logits/rejected": 0.02020915411412716, + "logps/chosen": -2.4631452560424805, + "logps/rejected": -3.0314419269561768, + "loss": 0.7063, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.4631452560424805, + "rewards/margins": 0.5682967901229858, + "rewards/rejected": -3.0314419269561768, + "sft_loss": 2.417720317840576, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 2.038326194928855, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": -0.07167818397283554, + "logits/rejected": 0.14383158087730408, + "logps/chosen": -2.2117011547088623, + "logps/rejected": -3.1124587059020996, + "loss": 0.7104, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.2117011547088623, + "rewards/margins": 0.9007574319839478, + "rewards/rejected": -3.1124587059020996, + "sft_loss": 2.210836887359619, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 2.578398933156557, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": -0.05414128303527832, + "logits/rejected": 0.14086315035820007, + "logps/chosen": -2.2781615257263184, + "logps/rejected": -2.977079391479492, + "loss": 0.6949, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2781615257263184, + "rewards/margins": 0.6989179253578186, + "rewards/rejected": -2.977079391479492, + "sft_loss": 2.234966278076172, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 2.6551004124751256, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": 0.03887999802827835, + "logits/rejected": 0.17733149230480194, + "logps/chosen": -2.222642183303833, + "logps/rejected": -2.6063666343688965, + "loss": 0.7132, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.222642183303833, + "rewards/margins": 0.3837243914604187, + "rewards/rejected": -2.6063666343688965, + "sft_loss": 2.07389497756958, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 2.1780359850681617, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": 0.01445702277123928, + "logits/rejected": 0.16482463479042053, + "logps/chosen": -2.2106032371520996, + "logps/rejected": -2.9935824871063232, + "loss": 0.7007, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.2106032371520996, + "rewards/margins": 0.7829793691635132, + "rewards/rejected": -2.9935824871063232, + "sft_loss": 2.214637517929077, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 3.254513497125433, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": 0.003981569316238165, + "logits/rejected": 0.13027504086494446, + "logps/chosen": -2.1432437896728516, + "logps/rejected": -2.844395637512207, + "loss": 0.6936, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1432437896728516, + "rewards/margins": 0.7011516690254211, + "rewards/rejected": -2.844395637512207, + "sft_loss": 2.105501174926758, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 2.192853514273409, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": -0.037188876420259476, + "logits/rejected": 0.09554673731327057, + "logps/chosen": -2.263479709625244, + "logps/rejected": -2.7253289222717285, + "loss": 0.7029, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.263479709625244, + "rewards/margins": 0.4618498682975769, + "rewards/rejected": -2.7253289222717285, + "sft_loss": 2.08933687210083, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 1.81295648344349, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": 0.06525443494319916, + "logits/rejected": 0.12765257060527802, + "logps/chosen": -2.149930715560913, + "logps/rejected": -2.93646502494812, + "loss": 0.6994, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.149930715560913, + "rewards/margins": 0.786534309387207, + "rewards/rejected": -2.93646502494812, + "sft_loss": 2.0745248794555664, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 1.915371902448545, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": -0.06192860007286072, + "logits/rejected": 0.05368726700544357, + "logps/chosen": -2.1278603076934814, + "logps/rejected": -2.6336026191711426, + "loss": 0.7073, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1278603076934814, + "rewards/margins": 0.5057421922683716, + "rewards/rejected": -2.6336026191711426, + "sft_loss": 2.1015305519104004, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 2.198257788150926, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": -0.11924233287572861, + "logits/rejected": 0.10633299499750137, + "logps/chosen": -2.256054401397705, + "logps/rejected": -2.803920030593872, + "loss": 0.7021, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.256054401397705, + "rewards/margins": 0.5478653311729431, + "rewards/rejected": -2.803920030593872, + "sft_loss": 2.262547254562378, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 7.705182659305554, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": -0.03130624443292618, + "logits/rejected": 0.09935715794563293, + "logps/chosen": -2.404046058654785, + "logps/rejected": -2.8623976707458496, + "loss": 0.7054, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.404046058654785, + "rewards/margins": 0.4583517611026764, + "rewards/rejected": -2.8623976707458496, + "sft_loss": 2.363663673400879, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 2.4972398291785893, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": -0.08602894097566605, + "logits/rejected": 0.10091432183980942, + "logps/chosen": -2.3144848346710205, + "logps/rejected": -2.9508140087127686, + "loss": 0.6973, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.3144848346710205, + "rewards/margins": 0.6363292932510376, + "rewards/rejected": -2.9508140087127686, + "sft_loss": 2.3109982013702393, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 2.2190636962447945, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": -0.05645718052983284, + "logits/rejected": 0.1200685054063797, + "logps/chosen": -2.2083747386932373, + "logps/rejected": -2.9030609130859375, + "loss": 0.6911, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.2083747386932373, + "rewards/margins": 0.6946861147880554, + "rewards/rejected": -2.9030609130859375, + "sft_loss": 2.1226391792297363, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 1.5901442821916314, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": -0.034431345760822296, + "logits/rejected": 0.10250719636678696, + "logps/chosen": -2.251410961151123, + "logps/rejected": -2.75331711769104, + "loss": 0.7042, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.251410961151123, + "rewards/margins": 0.5019063353538513, + "rewards/rejected": -2.75331711769104, + "sft_loss": 2.1372828483581543, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 1.9633230437398599, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": -0.11807402223348618, + "logits/rejected": -0.08948754519224167, + "logps/chosen": -2.2486908435821533, + "logps/rejected": -2.7955987453460693, + "loss": 0.7079, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.2486908435821533, + "rewards/margins": 0.5469079613685608, + "rewards/rejected": -2.7955987453460693, + "sft_loss": 2.15276837348938, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 2.2725449419839645, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": -0.1662970334291458, + "logits/rejected": 0.02462906390428543, + "logps/chosen": -2.1715939044952393, + "logps/rejected": -2.8786914348602295, + "loss": 0.7046, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1715939044952393, + "rewards/margins": 0.7070977091789246, + "rewards/rejected": -2.8786914348602295, + "sft_loss": 2.0866100788116455, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 1.8485713742971077, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": -0.04839160293340683, + "logits/rejected": 0.05790669471025467, + "logps/chosen": -2.131850004196167, + "logps/rejected": -2.935936450958252, + "loss": 0.7021, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.131850004196167, + "rewards/margins": 0.8040862083435059, + "rewards/rejected": -2.935936450958252, + "sft_loss": 2.0599653720855713, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 2.894647340821409, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": -0.09339950978755951, + "logits/rejected": 0.06900927424430847, + "logps/chosen": -2.15094256401062, + "logps/rejected": -2.8568568229675293, + "loss": 0.6949, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.15094256401062, + "rewards/margins": 0.7059140205383301, + "rewards/rejected": -2.8568568229675293, + "sft_loss": 2.0355353355407715, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 4.426364323187453, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": -0.11539479345083237, + "logits/rejected": -0.005151323974132538, + "logps/chosen": -2.1997592449188232, + "logps/rejected": -2.8636884689331055, + "loss": 0.7035, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1997592449188232, + "rewards/margins": 0.6639290452003479, + "rewards/rejected": -2.8636884689331055, + "sft_loss": 2.1363730430603027, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 3.428026695044482, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": -0.08635888993740082, + "logits/rejected": 0.03695956617593765, + "logps/chosen": -2.423877000808716, + "logps/rejected": -2.8618521690368652, + "loss": 0.7105, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.423877000808716, + "rewards/margins": 0.43797531723976135, + "rewards/rejected": -2.8618521690368652, + "sft_loss": 2.333068370819092, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 3.0184449422286423, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": -0.09940381348133087, + "logits/rejected": 0.029827887192368507, + "logps/chosen": -2.283815622329712, + "logps/rejected": -2.930316686630249, + "loss": 0.7062, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.283815622329712, + "rewards/margins": 0.6465011835098267, + "rewards/rejected": -2.930316686630249, + "sft_loss": 2.2652838230133057, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 2.8768669638127267, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": -0.03785894066095352, + "logits/rejected": 0.05646789073944092, + "logps/chosen": -2.2821617126464844, + "logps/rejected": -2.957378625869751, + "loss": 0.6949, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2821617126464844, + "rewards/margins": 0.6752170324325562, + "rewards/rejected": -2.957378625869751, + "sft_loss": 2.1811747550964355, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 3.142060880674868, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": -0.14265461266040802, + "logits/rejected": 0.1681184321641922, + "logps/chosen": -2.211472511291504, + "logps/rejected": -3.15592360496521, + "loss": 0.6955, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.211472511291504, + "rewards/margins": 0.9444509744644165, + "rewards/rejected": -3.15592360496521, + "sft_loss": 2.178743362426758, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 2.8514252011165087, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": -0.05743175745010376, + "logits/rejected": 0.003899590577930212, + "logps/chosen": -2.146831750869751, + "logps/rejected": -2.847151279449463, + "loss": 0.6963, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.146831750869751, + "rewards/margins": 0.7003197073936462, + "rewards/rejected": -2.847151279449463, + "sft_loss": 2.150350570678711, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 4.351890597339401, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": -0.07250859588384628, + "logits/rejected": 0.03235817700624466, + "logps/chosen": -2.158828020095825, + "logps/rejected": -2.7288763523101807, + "loss": 0.6976, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.158828020095825, + "rewards/margins": 0.5700482130050659, + "rewards/rejected": -2.7288763523101807, + "sft_loss": 2.229212522506714, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 1.8152384973896596, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": -0.16011783480644226, + "logits/rejected": 0.024740198627114296, + "logps/chosen": -2.182220458984375, + "logps/rejected": -2.8084864616394043, + "loss": 0.7102, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.182220458984375, + "rewards/margins": 0.626266360282898, + "rewards/rejected": -2.8084864616394043, + "sft_loss": 2.122053623199463, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 1.713996615511991, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": -0.13271503150463104, + "logits/rejected": -0.0041678594425320625, + "logps/chosen": -2.298312187194824, + "logps/rejected": -2.7899975776672363, + "loss": 0.7086, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.298312187194824, + "rewards/margins": 0.4916854798793793, + "rewards/rejected": -2.7899975776672363, + "sft_loss": 2.1895220279693604, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 2.8142124312312458, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": -0.15361139178276062, + "logits/rejected": 0.03482117876410484, + "logps/chosen": -2.194044351577759, + "logps/rejected": -2.812548875808716, + "loss": 0.6983, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.194044351577759, + "rewards/margins": 0.618504524230957, + "rewards/rejected": -2.812548875808716, + "sft_loss": 2.105254650115967, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 2.503275869310201, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": -0.13203874230384827, + "logits/rejected": 0.0336499884724617, + "logps/chosen": -2.146395444869995, + "logps/rejected": -2.9748904705047607, + "loss": 0.6888, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.146395444869995, + "rewards/margins": 0.8284950256347656, + "rewards/rejected": -2.9748904705047607, + "sft_loss": 2.065767765045166, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 2.2382958133005126, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": -0.16713622212409973, + "logits/rejected": -0.03363850340247154, + "logps/chosen": -2.1303060054779053, + "logps/rejected": -2.7613275051116943, + "loss": 0.7082, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1303060054779053, + "rewards/margins": 0.6310214996337891, + "rewards/rejected": -2.7613275051116943, + "sft_loss": 2.1318306922912598, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 1.94318796170751, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": -0.3367980122566223, + "logits/rejected": -0.08581961691379547, + "logps/chosen": -2.160841226577759, + "logps/rejected": -2.9789345264434814, + "loss": 0.6953, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.160841226577759, + "rewards/margins": 0.8180931806564331, + "rewards/rejected": -2.9789345264434814, + "sft_loss": 2.1677775382995605, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 1.526861078294158, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": -0.2882223427295685, + "logits/rejected": -0.060058873146772385, + "logps/chosen": -2.2199759483337402, + "logps/rejected": -2.96525239944458, + "loss": 0.7042, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2199759483337402, + "rewards/margins": 0.7452765107154846, + "rewards/rejected": -2.96525239944458, + "sft_loss": 2.131960391998291, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 2.3404760802143896, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": -0.1957237422466278, + "logits/rejected": -0.13356651365756989, + "logps/chosen": -2.247565746307373, + "logps/rejected": -2.748612880706787, + "loss": 0.7003, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.247565746307373, + "rewards/margins": 0.5010470151901245, + "rewards/rejected": -2.748612880706787, + "sft_loss": 2.2203712463378906, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 1.9202765729064233, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": -0.1572050154209137, + "logits/rejected": -0.0726763978600502, + "logps/chosen": -2.1353588104248047, + "logps/rejected": -2.6677019596099854, + "loss": 0.6987, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1353588104248047, + "rewards/margins": 0.5323430299758911, + "rewards/rejected": -2.6677019596099854, + "sft_loss": 2.1538896560668945, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 3.2305496709675805, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": -0.19083921611309052, + "logits/rejected": -0.14574851095676422, + "logps/chosen": -2.2574779987335205, + "logps/rejected": -2.750894546508789, + "loss": 0.7094, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.2574779987335205, + "rewards/margins": 0.4934166967868805, + "rewards/rejected": -2.750894546508789, + "sft_loss": 2.267965793609619, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 3.084250669754339, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": -0.20070838928222656, + "logits/rejected": -0.11343379318714142, + "logps/chosen": -2.117205858230591, + "logps/rejected": -2.580235481262207, + "loss": 0.7087, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.117205858230591, + "rewards/margins": 0.46302956342697144, + "rewards/rejected": -2.580235481262207, + "sft_loss": 2.0681068897247314, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 2.7526441438895723, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": -0.1787761151790619, + "logits/rejected": 0.023528896272182465, + "logps/chosen": -2.194460391998291, + "logps/rejected": -2.870157241821289, + "loss": 0.7, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.194460391998291, + "rewards/margins": 0.6756970882415771, + "rewards/rejected": -2.870157241821289, + "sft_loss": 2.1022377014160156, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 2.5042614370667486, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": -0.20925016701221466, + "logits/rejected": -0.05993008613586426, + "logps/chosen": -2.0619091987609863, + "logps/rejected": -2.9169676303863525, + "loss": 0.6934, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0619091987609863, + "rewards/margins": 0.8550586700439453, + "rewards/rejected": -2.9169676303863525, + "sft_loss": 2.0523006916046143, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 3.3434727474771644, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": -0.27181947231292725, + "logits/rejected": -0.2028999626636505, + "logps/chosen": -2.173649311065674, + "logps/rejected": -2.6213297843933105, + "loss": 0.7035, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.173649311065674, + "rewards/margins": 0.4476805627346039, + "rewards/rejected": -2.6213297843933105, + "sft_loss": 2.2011539936065674, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 2.2836497429851823, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": -0.1945187747478485, + "logits/rejected": -0.1373414248228073, + "logps/chosen": -2.3528316020965576, + "logps/rejected": -2.9335505962371826, + "loss": 0.7054, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.3528316020965576, + "rewards/margins": 0.5807192921638489, + "rewards/rejected": -2.9335505962371826, + "sft_loss": 2.2219395637512207, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 2.070645749062221, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": -0.2843402922153473, + "logits/rejected": -0.12146928161382675, + "logps/chosen": -2.226685047149658, + "logps/rejected": -3.1664788722991943, + "loss": 0.6909, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.226685047149658, + "rewards/margins": 0.9397938847541809, + "rewards/rejected": -3.1664788722991943, + "sft_loss": 2.2412779331207275, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": -0.03452831879258156, + "eval_logits/rejected": 0.041437409818172455, + "eval_logps/chosen": -2.3066861629486084, + "eval_logps/rejected": -3.0784785747528076, + "eval_loss": 0.6950539350509644, + "eval_rewards/accuracies": 0.6824925541877747, + "eval_rewards/chosen": -2.3066861629486084, + "eval_rewards/margins": 0.7717921137809753, + "eval_rewards/rejected": -3.0784785747528076, + "eval_runtime": 47.6551, + "eval_samples_per_second": 28.224, + "eval_sft_loss": 2.231623888015747, + "eval_steps_per_second": 7.072, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 5.5586703197383605, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": -0.295482873916626, + "logits/rejected": -0.09522996842861176, + "logps/chosen": -2.3428032398223877, + "logps/rejected": -3.139549493789673, + "loss": 0.6925, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3428032398223877, + "rewards/margins": 0.7967461347579956, + "rewards/rejected": -3.139549493789673, + "sft_loss": 2.2508833408355713, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 2.2505753653004286, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": -0.2747432589530945, + "logits/rejected": -0.12129826843738556, + "logps/chosen": -2.196272134780884, + "logps/rejected": -3.2144463062286377, + "loss": 0.692, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.196272134780884, + "rewards/margins": 1.018174171447754, + "rewards/rejected": -3.2144463062286377, + "sft_loss": 2.209385395050049, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 8.156452408256476, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": -0.20282170176506042, + "logits/rejected": -0.11459418386220932, + "logps/chosen": -2.337512493133545, + "logps/rejected": -3.029510974884033, + "loss": 0.7022, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.337512493133545, + "rewards/margins": 0.6919983625411987, + "rewards/rejected": -3.029510974884033, + "sft_loss": 2.3106839656829834, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 2.748174363839319, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": -0.2138424813747406, + "logits/rejected": -0.09318797290325165, + "logps/chosen": -2.16756010055542, + "logps/rejected": -2.8914496898651123, + "loss": 0.7034, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.16756010055542, + "rewards/margins": 0.7238895297050476, + "rewards/rejected": -2.8914496898651123, + "sft_loss": 2.1276867389678955, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 2.239202934001568, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": -0.1431226283311844, + "logits/rejected": -0.10495742410421371, + "logps/chosen": -2.298262119293213, + "logps/rejected": -2.645540475845337, + "loss": 0.7115, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.298262119293213, + "rewards/margins": 0.3472786545753479, + "rewards/rejected": -2.645540475845337, + "sft_loss": 2.180662155151367, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 2.4065553816037566, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": -0.15369094908237457, + "logits/rejected": -0.022520026192069054, + "logps/chosen": -2.1215758323669434, + "logps/rejected": -2.918578624725342, + "loss": 0.6991, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1215758323669434, + "rewards/margins": 0.7970027923583984, + "rewards/rejected": -2.918578624725342, + "sft_loss": 2.0732903480529785, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 3.1905090140746006, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": -0.2666686177253723, + "logits/rejected": -0.1523967683315277, + "logps/chosen": -2.1386024951934814, + "logps/rejected": -2.9105467796325684, + "loss": 0.7043, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1386024951934814, + "rewards/margins": 0.7719441652297974, + "rewards/rejected": -2.9105467796325684, + "sft_loss": 2.1258816719055176, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 2.1615218808447074, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": -0.21783633530139923, + "logits/rejected": -0.1169201135635376, + "logps/chosen": -2.1727309226989746, + "logps/rejected": -3.094940662384033, + "loss": 0.7014, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1727309226989746, + "rewards/margins": 0.9222098588943481, + "rewards/rejected": -3.094940662384033, + "sft_loss": 2.1299543380737305, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 2.867818877426616, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": -0.2079753428697586, + "logits/rejected": -0.15814396739006042, + "logps/chosen": -2.1862399578094482, + "logps/rejected": -2.729067802429199, + "loss": 0.7032, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1862399578094482, + "rewards/margins": 0.5428280830383301, + "rewards/rejected": -2.729067802429199, + "sft_loss": 2.2186481952667236, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 3.0966118330484886, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": -0.14018157124519348, + "logits/rejected": -0.09165972471237183, + "logps/chosen": -2.090341329574585, + "logps/rejected": -2.7218966484069824, + "loss": 0.7045, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.090341329574585, + "rewards/margins": 0.631555438041687, + "rewards/rejected": -2.7218966484069824, + "sft_loss": 1.9962527751922607, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 3.3357022059764745, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": -0.26899534463882446, + "logits/rejected": -0.16346599161624908, + "logps/chosen": -2.192856550216675, + "logps/rejected": -2.9463510513305664, + "loss": 0.698, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.192856550216675, + "rewards/margins": 0.753494381904602, + "rewards/rejected": -2.9463510513305664, + "sft_loss": 2.127333402633667, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 2.332298426755994, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": -0.2414066046476364, + "logits/rejected": -0.20738832652568817, + "logps/chosen": -2.234240770339966, + "logps/rejected": -2.9939560890197754, + "loss": 0.7017, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.234240770339966, + "rewards/margins": 0.7597158551216125, + "rewards/rejected": -2.9939560890197754, + "sft_loss": 2.1499571800231934, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 2.7484518511406284, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": -0.24828548729419708, + "logits/rejected": -0.18916873633861542, + "logps/chosen": -2.4132347106933594, + "logps/rejected": -2.844573497772217, + "loss": 0.7114, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.4132347106933594, + "rewards/margins": 0.4313390254974365, + "rewards/rejected": -2.844573497772217, + "sft_loss": 2.32940411567688, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 2.7554671577395555, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": -0.18388070166110992, + "logits/rejected": -0.037655822932720184, + "logps/chosen": -2.497715711593628, + "logps/rejected": -3.0130228996276855, + "loss": 0.721, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.497715711593628, + "rewards/margins": 0.5153070688247681, + "rewards/rejected": -3.0130228996276855, + "sft_loss": 2.3574628829956055, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 3.1412922575918687, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": -0.19547554850578308, + "logits/rejected": -0.030999530106782913, + "logps/chosen": -2.3352713584899902, + "logps/rejected": -3.2001075744628906, + "loss": 0.7033, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3352713584899902, + "rewards/margins": 0.8648357391357422, + "rewards/rejected": -3.2001075744628906, + "sft_loss": 2.3229193687438965, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 2.6452804771537854, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": -0.288478285074234, + "logits/rejected": -0.19373884797096252, + "logps/chosen": -2.3482749462127686, + "logps/rejected": -2.9911880493164062, + "loss": 0.703, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3482749462127686, + "rewards/margins": 0.6429128646850586, + "rewards/rejected": -2.9911880493164062, + "sft_loss": 2.2894492149353027, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 2.6461804264836823, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": -0.19792509078979492, + "logits/rejected": -0.007888046093285084, + "logps/chosen": -2.3069212436676025, + "logps/rejected": -3.104051113128662, + "loss": 0.7007, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3069212436676025, + "rewards/margins": 0.7971299290657043, + "rewards/rejected": -3.104051113128662, + "sft_loss": 2.2453808784484863, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 2.5770833660332007, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": -0.23112145066261292, + "logits/rejected": -0.05248458310961723, + "logps/chosen": -2.1011803150177, + "logps/rejected": -3.042440414428711, + "loss": 0.6902, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1011803150177, + "rewards/margins": 0.9412603378295898, + "rewards/rejected": -3.042440414428711, + "sft_loss": 2.0605461597442627, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 4.86659798854765, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": -0.3249500095844269, + "logits/rejected": -0.16356393694877625, + "logps/chosen": -2.191807270050049, + "logps/rejected": -2.9690449237823486, + "loss": 0.7006, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.191807270050049, + "rewards/margins": 0.7772378325462341, + "rewards/rejected": -2.9690449237823486, + "sft_loss": 2.2248458862304688, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 3.1252711270501443, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": -0.18225380778312683, + "logits/rejected": -0.23605379462242126, + "logps/chosen": -2.2445120811462402, + "logps/rejected": -2.6155567169189453, + "loss": 0.7126, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2445120811462402, + "rewards/margins": 0.37104448676109314, + "rewards/rejected": -2.6155567169189453, + "sft_loss": 2.242246150970459, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 2.1193829188326876, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": -0.18783244490623474, + "logits/rejected": -0.06622517108917236, + "logps/chosen": -2.0862059593200684, + "logps/rejected": -2.738128900527954, + "loss": 0.7036, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.0862059593200684, + "rewards/margins": 0.6519228219985962, + "rewards/rejected": -2.738128900527954, + "sft_loss": 2.0497243404388428, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 2.80200570752565, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": -0.20885543525218964, + "logits/rejected": -0.13907435536384583, + "logps/chosen": -2.239652633666992, + "logps/rejected": -2.8518741130828857, + "loss": 0.6982, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.239652633666992, + "rewards/margins": 0.6122216582298279, + "rewards/rejected": -2.8518741130828857, + "sft_loss": 2.1831889152526855, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 2.6772905972911647, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": -0.24115291237831116, + "logits/rejected": -0.070436030626297, + "logps/chosen": -2.3881723880767822, + "logps/rejected": -3.046025037765503, + "loss": 0.7064, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3881723880767822, + "rewards/margins": 0.6578529477119446, + "rewards/rejected": -3.046025037765503, + "sft_loss": 2.255465030670166, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 2.452321438573616, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": -0.19311857223510742, + "logits/rejected": -0.13626573979854584, + "logps/chosen": -2.209670066833496, + "logps/rejected": -3.022451400756836, + "loss": 0.6979, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.209670066833496, + "rewards/margins": 0.8127814531326294, + "rewards/rejected": -3.022451400756836, + "sft_loss": 2.1349611282348633, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 2.472216839769144, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": -0.18031716346740723, + "logits/rejected": -0.0725378543138504, + "logps/chosen": -2.1493165493011475, + "logps/rejected": -3.2853176593780518, + "loss": 0.6855, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1493165493011475, + "rewards/margins": 1.1360008716583252, + "rewards/rejected": -3.2853176593780518, + "sft_loss": 2.1569228172302246, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 4.870408947854243, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": -0.19539253413677216, + "logits/rejected": -0.1260274350643158, + "logps/chosen": -2.1418533325195312, + "logps/rejected": -2.6719813346862793, + "loss": 0.7017, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1418533325195312, + "rewards/margins": 0.530128002166748, + "rewards/rejected": -2.6719813346862793, + "sft_loss": 2.1912734508514404, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 2.734847887314104, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": -0.2868293821811676, + "logits/rejected": -0.13878083229064941, + "logps/chosen": -2.084522008895874, + "logps/rejected": -2.909510374069214, + "loss": 0.692, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.084522008895874, + "rewards/margins": 0.824988067150116, + "rewards/rejected": -2.909510374069214, + "sft_loss": 2.0666072368621826, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 1.8351539850699354, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": -0.22179599106311798, + "logits/rejected": -0.0967031866312027, + "logps/chosen": -2.1067683696746826, + "logps/rejected": -2.756479501724243, + "loss": 0.6962, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1067683696746826, + "rewards/margins": 0.649711012840271, + "rewards/rejected": -2.756479501724243, + "sft_loss": 2.08931040763855, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 2.9730415144289664, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": -0.193211168050766, + "logits/rejected": -0.1608760505914688, + "logps/chosen": -2.051504373550415, + "logps/rejected": -2.6706979274749756, + "loss": 0.6959, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.051504373550415, + "rewards/margins": 0.6191936731338501, + "rewards/rejected": -2.6706979274749756, + "sft_loss": 2.0490097999572754, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 2.540363578802009, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": -0.1945440024137497, + "logits/rejected": -0.14428547024726868, + "logps/chosen": -2.2550501823425293, + "logps/rejected": -2.7136588096618652, + "loss": 0.7086, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.2550501823425293, + "rewards/margins": 0.45860838890075684, + "rewards/rejected": -2.7136588096618652, + "sft_loss": 2.216766357421875, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 1.8463985419387632, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": -0.22097475826740265, + "logits/rejected": -0.12107206881046295, + "logps/chosen": -2.1033809185028076, + "logps/rejected": -2.739773988723755, + "loss": 0.6929, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1033809185028076, + "rewards/margins": 0.6363930702209473, + "rewards/rejected": -2.739773988723755, + "sft_loss": 2.14668607711792, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 3.4086278612305723, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": -0.2078239619731903, + "logits/rejected": -0.08686795830726624, + "logps/chosen": -2.072483539581299, + "logps/rejected": -2.6630349159240723, + "loss": 0.7048, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.072483539581299, + "rewards/margins": 0.5905511975288391, + "rewards/rejected": -2.6630349159240723, + "sft_loss": 2.0812599658966064, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 7.215629129560617, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": -0.1843900978565216, + "logits/rejected": -0.1560697853565216, + "logps/chosen": -2.170839309692383, + "logps/rejected": -2.9429659843444824, + "loss": 0.7124, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.170839309692383, + "rewards/margins": 0.7721266150474548, + "rewards/rejected": -2.9429659843444824, + "sft_loss": 2.1287245750427246, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 2.0259578444564186, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": -0.20086932182312012, + "logits/rejected": -0.031804632395505905, + "logps/chosen": -2.117952823638916, + "logps/rejected": -2.6687893867492676, + "loss": 0.6943, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.117952823638916, + "rewards/margins": 0.5508365035057068, + "rewards/rejected": -2.6687893867492676, + "sft_loss": 2.1530754566192627, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 3.40133907473232, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": -0.2689778208732605, + "logits/rejected": -0.16882191598415375, + "logps/chosen": -2.109886407852173, + "logps/rejected": -2.7701923847198486, + "loss": 0.6897, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.109886407852173, + "rewards/margins": 0.6603060364723206, + "rewards/rejected": -2.7701923847198486, + "sft_loss": 2.1185245513916016, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 6.154439580864391, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": -0.2206428498029709, + "logits/rejected": -0.11337701231241226, + "logps/chosen": -2.1953907012939453, + "logps/rejected": -2.889822244644165, + "loss": 0.6981, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1953907012939453, + "rewards/margins": 0.6944314241409302, + "rewards/rejected": -2.889822244644165, + "sft_loss": 2.189736843109131, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 2.35738290621667, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": -0.1755184829235077, + "logits/rejected": -0.13018499314785004, + "logps/chosen": -2.3339505195617676, + "logps/rejected": -2.749356746673584, + "loss": 0.7126, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.3339505195617676, + "rewards/margins": 0.4154065251350403, + "rewards/rejected": -2.749356746673584, + "sft_loss": 2.2870373725891113, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 3.3112070687276605, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": -0.12363386154174805, + "logits/rejected": -0.03265656903386116, + "logps/chosen": -2.342581033706665, + "logps/rejected": -2.935011386871338, + "loss": 0.7009, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.342581033706665, + "rewards/margins": 0.5924302935600281, + "rewards/rejected": -2.935011386871338, + "sft_loss": 2.305471658706665, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 4.891263450981831, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": -0.24133631587028503, + "logits/rejected": -0.11299635469913483, + "logps/chosen": -2.2154290676116943, + "logps/rejected": -3.2079403400421143, + "loss": 0.682, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2154290676116943, + "rewards/margins": 0.9925110936164856, + "rewards/rejected": -3.2079403400421143, + "sft_loss": 2.215498685836792, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 2.0208424129718083, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": -0.15862031280994415, + "logits/rejected": -0.08435139060020447, + "logps/chosen": -2.3127713203430176, + "logps/rejected": -2.894026041030884, + "loss": 0.7017, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3127713203430176, + "rewards/margins": 0.581254780292511, + "rewards/rejected": -2.894026041030884, + "sft_loss": 2.3109865188598633, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 2.8507836505925166, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": -0.24791434407234192, + "logits/rejected": -0.13634294271469116, + "logps/chosen": -2.1066908836364746, + "logps/rejected": -3.230231761932373, + "loss": 0.6909, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1066908836364746, + "rewards/margins": 1.1235411167144775, + "rewards/rejected": -3.230231761932373, + "sft_loss": 2.103177547454834, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 2.280443520636504, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": -0.15570509433746338, + "logits/rejected": -0.02451663836836815, + "logps/chosen": -2.162179470062256, + "logps/rejected": -2.8802807331085205, + "loss": 0.6978, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.162179470062256, + "rewards/margins": 0.7181010246276855, + "rewards/rejected": -2.8802807331085205, + "sft_loss": 2.0782687664031982, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 2.6339671478110964, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": -0.17519035935401917, + "logits/rejected": -0.06617375463247299, + "logps/chosen": -2.117783784866333, + "logps/rejected": -2.8188655376434326, + "loss": 0.6921, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.117783784866333, + "rewards/margins": 0.7010820508003235, + "rewards/rejected": -2.8188655376434326, + "sft_loss": 2.0838098526000977, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 2.1942284258327236, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": -0.17862842977046967, + "logits/rejected": -0.12102198600769043, + "logps/chosen": -2.0949339866638184, + "logps/rejected": -2.6766610145568848, + "loss": 0.6962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0949339866638184, + "rewards/margins": 0.5817269086837769, + "rewards/rejected": -2.6766610145568848, + "sft_loss": 1.989256501197815, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 2.0531383973558808, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": -0.2294219732284546, + "logits/rejected": -0.09604024887084961, + "logps/chosen": -2.1103386878967285, + "logps/rejected": -3.1935291290283203, + "loss": 0.6921, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1103386878967285, + "rewards/margins": 1.0831904411315918, + "rewards/rejected": -3.1935291290283203, + "sft_loss": 2.0963447093963623, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 2.1561448351403474, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": -0.10031883418560028, + "logits/rejected": -0.06307245790958405, + "logps/chosen": -2.1303870677948, + "logps/rejected": -2.627593755722046, + "loss": 0.7075, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1303870677948, + "rewards/margins": 0.4972063899040222, + "rewards/rejected": -2.627593755722046, + "sft_loss": 2.038759708404541, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 1.8915094325573296, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": -0.24092257022857666, + "logits/rejected": -0.18085786700248718, + "logps/chosen": -2.1491005420684814, + "logps/rejected": -2.653780698776245, + "loss": 0.7051, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1491005420684814, + "rewards/margins": 0.5046799182891846, + "rewards/rejected": -2.653780698776245, + "sft_loss": 2.12119722366333, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 1.3368718107290438, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": -0.20756366848945618, + "logits/rejected": -0.0775209441781044, + "logps/chosen": -2.227262258529663, + "logps/rejected": -2.9371085166931152, + "loss": 0.6965, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.227262258529663, + "rewards/margins": 0.7098467946052551, + "rewards/rejected": -2.9371085166931152, + "sft_loss": 2.236367702484131, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 3.199698485846893, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": -0.18606507778167725, + "logits/rejected": -0.11772701889276505, + "logps/chosen": -2.1976842880249023, + "logps/rejected": -2.954611301422119, + "loss": 0.6986, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1976842880249023, + "rewards/margins": 0.7569268941879272, + "rewards/rejected": -2.954611301422119, + "sft_loss": 2.1414358615875244, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 1.7084071257906217, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": -0.3265012800693512, + "logits/rejected": -0.20926764607429504, + "logps/chosen": -2.269092082977295, + "logps/rejected": -2.8173460960388184, + "loss": 0.7033, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.269092082977295, + "rewards/margins": 0.5482543110847473, + "rewards/rejected": -2.8173460960388184, + "sft_loss": 2.1899359226226807, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 2.617818345800502, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": -0.23281869292259216, + "logits/rejected": -0.1132984310388565, + "logps/chosen": -2.226372241973877, + "logps/rejected": -3.053133726119995, + "loss": 0.6979, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.226372241973877, + "rewards/margins": 0.8267615437507629, + "rewards/rejected": -3.053133726119995, + "sft_loss": 2.1588146686553955, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 2.2899406517874445, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": -0.2197950780391693, + "logits/rejected": -0.12617357075214386, + "logps/chosen": -2.483006715774536, + "logps/rejected": -3.114408254623413, + "loss": 0.699, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.483006715774536, + "rewards/margins": 0.631401538848877, + "rewards/rejected": -3.114408254623413, + "sft_loss": 2.2906270027160645, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 2.758295228529619, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": -0.2070378065109253, + "logits/rejected": -0.13232679665088654, + "logps/chosen": -2.27894926071167, + "logps/rejected": -3.149691104888916, + "loss": 0.688, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.27894926071167, + "rewards/margins": 0.8707423210144043, + "rewards/rejected": -3.149691104888916, + "sft_loss": 2.22861647605896, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 2.265093438441994, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": -0.23553189635276794, + "logits/rejected": -0.09476248174905777, + "logps/chosen": -2.338883876800537, + "logps/rejected": -3.0769600868225098, + "loss": 0.7056, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.338883876800537, + "rewards/margins": 0.7380759119987488, + "rewards/rejected": -3.0769600868225098, + "sft_loss": 2.221405029296875, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 2.4026215375782627, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": -0.2625536322593689, + "logits/rejected": -0.12275469303131104, + "logps/chosen": -2.4061732292175293, + "logps/rejected": -2.9844717979431152, + "loss": 0.7021, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.4061732292175293, + "rewards/margins": 0.5782989263534546, + "rewards/rejected": -2.9844717979431152, + "sft_loss": 2.265167713165283, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 2.8656083212996326, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": -0.13358765840530396, + "logits/rejected": -0.01683134213089943, + "logps/chosen": -2.2469773292541504, + "logps/rejected": -3.156306028366089, + "loss": 0.6982, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2469773292541504, + "rewards/margins": 0.909328818321228, + "rewards/rejected": -3.156306028366089, + "sft_loss": 2.1257612705230713, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 1.8757342641313572, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": -0.19678595662117004, + "logits/rejected": -0.12138146162033081, + "logps/chosen": -2.1376781463623047, + "logps/rejected": -2.8857154846191406, + "loss": 0.6857, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1376781463623047, + "rewards/margins": 0.7480372190475464, + "rewards/rejected": -2.8857154846191406, + "sft_loss": 2.0610835552215576, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 2.4086283181485046, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": -0.20785903930664062, + "logits/rejected": 0.004422978963702917, + "logps/chosen": -2.1714940071105957, + "logps/rejected": -2.8997912406921387, + "loss": 0.696, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1714940071105957, + "rewards/margins": 0.7282973527908325, + "rewards/rejected": -2.8997912406921387, + "sft_loss": 2.206144094467163, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 2.9374003746764936, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": -0.1559458076953888, + "logits/rejected": -0.1223495751619339, + "logps/chosen": -2.1147940158843994, + "logps/rejected": -2.761025905609131, + "loss": 0.6979, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1147940158843994, + "rewards/margins": 0.6462318301200867, + "rewards/rejected": -2.761025905609131, + "sft_loss": 2.0338943004608154, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 2.4265583317054165, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": -0.2967333197593689, + "logits/rejected": -0.06024886295199394, + "logps/chosen": -2.209847927093506, + "logps/rejected": -2.8883144855499268, + "loss": 0.6918, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.209847927093506, + "rewards/margins": 0.6784664392471313, + "rewards/rejected": -2.8883144855499268, + "sft_loss": 2.218695878982544, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 2.3172804009040386, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": -0.21461844444274902, + "logits/rejected": -0.08843618631362915, + "logps/chosen": -2.2572195529937744, + "logps/rejected": -2.836730480194092, + "loss": 0.7036, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2572195529937744, + "rewards/margins": 0.5795107483863831, + "rewards/rejected": -2.836730480194092, + "sft_loss": 2.1930465698242188, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 2.3140340280324105, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": -0.2957096993923187, + "logits/rejected": -0.19099418818950653, + "logps/chosen": -2.1492276191711426, + "logps/rejected": -2.731222629547119, + "loss": 0.703, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1492276191711426, + "rewards/margins": 0.5819951295852661, + "rewards/rejected": -2.731222629547119, + "sft_loss": 2.0989179611206055, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 2.0832491114899745, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": -0.24538812041282654, + "logits/rejected": -0.12193469703197479, + "logps/chosen": -2.225022315979004, + "logps/rejected": -2.7711994647979736, + "loss": 0.6912, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.225022315979004, + "rewards/margins": 0.5461770296096802, + "rewards/rejected": -2.7711994647979736, + "sft_loss": 2.191540002822876, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 2.838280581924798, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": -0.2832343578338623, + "logits/rejected": -0.20158131420612335, + "logps/chosen": -2.139833688735962, + "logps/rejected": -3.0244996547698975, + "loss": 0.6821, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.139833688735962, + "rewards/margins": 0.8846660852432251, + "rewards/rejected": -3.0244996547698975, + "sft_loss": 2.187866687774658, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 2.8720659961289083, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": -0.36418408155441284, + "logits/rejected": -0.18453466892242432, + "logps/chosen": -2.2548022270202637, + "logps/rejected": -2.8945319652557373, + "loss": 0.6983, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2548022270202637, + "rewards/margins": 0.6397296190261841, + "rewards/rejected": -2.8945319652557373, + "sft_loss": 2.2566208839416504, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 3.008003242968152, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": -0.3070002496242523, + "logits/rejected": -0.18967430293560028, + "logps/chosen": -2.070561408996582, + "logps/rejected": -2.795137882232666, + "loss": 0.6882, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.070561408996582, + "rewards/margins": 0.7245765328407288, + "rewards/rejected": -2.795137882232666, + "sft_loss": 2.110377073287964, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 2.6165536235285054, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": -0.344722181558609, + "logits/rejected": -0.1587430238723755, + "logps/chosen": -2.2222800254821777, + "logps/rejected": -2.9637227058410645, + "loss": 0.6992, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2222800254821777, + "rewards/margins": 0.7414425611495972, + "rewards/rejected": -2.9637227058410645, + "sft_loss": 2.1827409267425537, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 2.543549696020325, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": -0.27377718687057495, + "logits/rejected": -0.18468934297561646, + "logps/chosen": -2.0899338722229004, + "logps/rejected": -2.9180965423583984, + "loss": 0.689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0899338722229004, + "rewards/margins": 0.8281623721122742, + "rewards/rejected": -2.9180965423583984, + "sft_loss": 2.0393636226654053, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 2.932102047024119, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": -0.3700665235519409, + "logits/rejected": -0.23408885300159454, + "logps/chosen": -2.2393529415130615, + "logps/rejected": -3.1915836334228516, + "loss": 0.6907, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2393529415130615, + "rewards/margins": 0.9522306323051453, + "rewards/rejected": -3.1915836334228516, + "sft_loss": 2.2528598308563232, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 1.5721902382966666, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": -0.23069548606872559, + "logits/rejected": -0.05136293172836304, + "logps/chosen": -2.2052998542785645, + "logps/rejected": -2.8688933849334717, + "loss": 0.6997, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2052998542785645, + "rewards/margins": 0.6635935306549072, + "rewards/rejected": -2.8688933849334717, + "sft_loss": 2.2509541511535645, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 2.9671467167640593, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": -0.2968910038471222, + "logits/rejected": -0.21758243441581726, + "logps/chosen": -2.156068801879883, + "logps/rejected": -2.8106884956359863, + "loss": 0.6953, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.156068801879883, + "rewards/margins": 0.6546195149421692, + "rewards/rejected": -2.8106884956359863, + "sft_loss": 2.221719741821289, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 1.8944266269742676, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": -0.2681064009666443, + "logits/rejected": -0.2201647311449051, + "logps/chosen": -2.1123995780944824, + "logps/rejected": -2.6658949851989746, + "loss": 0.7007, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1123995780944824, + "rewards/margins": 0.5534952282905579, + "rewards/rejected": -2.6658949851989746, + "sft_loss": 2.088008403778076, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 3.797915875820107, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": -0.2441168576478958, + "logits/rejected": -0.16347715258598328, + "logps/chosen": -2.0733633041381836, + "logps/rejected": -3.0631089210510254, + "loss": 0.6884, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0733633041381836, + "rewards/margins": 0.9897457361221313, + "rewards/rejected": -3.0631089210510254, + "sft_loss": 2.0929739475250244, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 2.693248674578846, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": -0.29714250564575195, + "logits/rejected": -0.1393791288137436, + "logps/chosen": -2.1041672229766846, + "logps/rejected": -2.7866697311401367, + "loss": 0.6924, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1041672229766846, + "rewards/margins": 0.6825023293495178, + "rewards/rejected": -2.7866697311401367, + "sft_loss": 2.0851259231567383, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 3.095263291230212, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": -0.1551038771867752, + "logits/rejected": -0.08851303160190582, + "logps/chosen": -2.1007044315338135, + "logps/rejected": -2.8332302570343018, + "loss": 0.6959, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1007044315338135, + "rewards/margins": 0.7325260043144226, + "rewards/rejected": -2.8332302570343018, + "sft_loss": 2.104759931564331, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 2.697160940766407, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": -0.16222915053367615, + "logits/rejected": -0.05635574460029602, + "logps/chosen": -2.30318546295166, + "logps/rejected": -3.0286173820495605, + "loss": 0.709, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.30318546295166, + "rewards/margins": 0.7254319190979004, + "rewards/rejected": -3.0286173820495605, + "sft_loss": 2.258232593536377, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 2.5573208863556633, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": -0.20433588325977325, + "logits/rejected": -0.06622826308012009, + "logps/chosen": -2.348832607269287, + "logps/rejected": -2.923677921295166, + "loss": 0.7034, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.348832607269287, + "rewards/margins": 0.5748453140258789, + "rewards/rejected": -2.923677921295166, + "sft_loss": 2.2062716484069824, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 2.3634203813791435, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": -0.22017395496368408, + "logits/rejected": -0.11190620809793472, + "logps/chosen": -2.3437857627868652, + "logps/rejected": -2.913809299468994, + "loss": 0.7014, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.3437857627868652, + "rewards/margins": 0.5700234770774841, + "rewards/rejected": -2.913809299468994, + "sft_loss": 2.219799757003784, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 3.1518573609297094, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": -0.117152139544487, + "logits/rejected": 0.03829234093427658, + "logps/chosen": -2.151608467102051, + "logps/rejected": -3.1825308799743652, + "loss": 0.6873, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.151608467102051, + "rewards/margins": 1.030922532081604, + "rewards/rejected": -3.1825308799743652, + "sft_loss": 2.1608572006225586, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 3.0095602307906972, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": -0.2610512375831604, + "logits/rejected": -0.09405355155467987, + "logps/chosen": -2.3147358894348145, + "logps/rejected": -3.0925610065460205, + "loss": 0.6992, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3147358894348145, + "rewards/margins": 0.7778247594833374, + "rewards/rejected": -3.0925610065460205, + "sft_loss": 2.1939456462860107, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": 0.03744465112686157, + "eval_logits/rejected": 0.1252625286579132, + "eval_logps/chosen": -2.138370990753174, + "eval_logps/rejected": -2.963397264480591, + "eval_loss": 0.6926766633987427, + "eval_rewards/accuracies": 0.6765578389167786, + "eval_rewards/chosen": -2.138370990753174, + "eval_rewards/margins": 0.8250265121459961, + "eval_rewards/rejected": -2.963397264480591, + "eval_runtime": 47.8648, + "eval_samples_per_second": 28.1, + "eval_sft_loss": 2.067228317260742, + "eval_steps_per_second": 7.041, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 3.4557100064183324, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": -0.15685082972049713, + "logits/rejected": -0.08967246115207672, + "logps/chosen": -2.15382719039917, + "logps/rejected": -2.877662420272827, + "loss": 0.689, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.15382719039917, + "rewards/margins": 0.7238351106643677, + "rewards/rejected": -2.877662420272827, + "sft_loss": 2.0683882236480713, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 2.971452618858938, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": -0.15677562355995178, + "logits/rejected": -0.05232198163866997, + "logps/chosen": -2.0423381328582764, + "logps/rejected": -2.7903552055358887, + "loss": 0.6874, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0423381328582764, + "rewards/margins": 0.7480170726776123, + "rewards/rejected": -2.7903552055358887, + "sft_loss": 2.012303113937378, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 3.295583092430273, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": -0.169949471950531, + "logits/rejected": -0.023434199392795563, + "logps/chosen": -2.22428035736084, + "logps/rejected": -2.977260112762451, + "loss": 0.6924, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.22428035736084, + "rewards/margins": 0.7529802918434143, + "rewards/rejected": -2.977260112762451, + "sft_loss": 2.100477695465088, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 2.7415581486831146, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": -0.28308627009391785, + "logits/rejected": -0.0862099900841713, + "logps/chosen": -2.241190195083618, + "logps/rejected": -3.168811321258545, + "loss": 0.6876, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.241190195083618, + "rewards/margins": 0.9276212453842163, + "rewards/rejected": -3.168811321258545, + "sft_loss": 2.171774387359619, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 3.9811990236395136, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": -0.13680367171764374, + "logits/rejected": 0.019465472549200058, + "logps/chosen": -2.2143447399139404, + "logps/rejected": -3.1631412506103516, + "loss": 0.7011, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2143447399139404, + "rewards/margins": 0.9487963914871216, + "rewards/rejected": -3.1631412506103516, + "sft_loss": 2.1971354484558105, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 2.8975017177302185, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": -0.22913579642772675, + "logits/rejected": -0.1947326809167862, + "logps/chosen": -2.102335214614868, + "logps/rejected": -2.733572006225586, + "loss": 0.6979, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.102335214614868, + "rewards/margins": 0.6312370300292969, + "rewards/rejected": -2.733572006225586, + "sft_loss": 2.0779881477355957, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 3.014099252817148, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": -0.22015182673931122, + "logits/rejected": -0.05069545656442642, + "logps/chosen": -2.243253469467163, + "logps/rejected": -2.77600359916687, + "loss": 0.702, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.243253469467163, + "rewards/margins": 0.5327504277229309, + "rewards/rejected": -2.77600359916687, + "sft_loss": 2.220588445663452, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 3.498294436246651, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": -0.28015822172164917, + "logits/rejected": -0.210425466299057, + "logps/chosen": -2.179668426513672, + "logps/rejected": -2.826927423477173, + "loss": 0.6961, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.179668426513672, + "rewards/margins": 0.6472587585449219, + "rewards/rejected": -2.826927423477173, + "sft_loss": 2.0574076175689697, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 2.6428402943889853, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": -0.16431409120559692, + "logits/rejected": -0.11079733073711395, + "logps/chosen": -2.178668737411499, + "logps/rejected": -2.9302544593811035, + "loss": 0.6863, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.178668737411499, + "rewards/margins": 0.7515857219696045, + "rewards/rejected": -2.9302544593811035, + "sft_loss": 2.0807926654815674, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 5.6146039853408105, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": -0.19174602627754211, + "logits/rejected": -0.03761805593967438, + "logps/chosen": -2.1677002906799316, + "logps/rejected": -3.0381433963775635, + "loss": 0.7032, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1677002906799316, + "rewards/margins": 0.8704432249069214, + "rewards/rejected": -3.0381433963775635, + "sft_loss": 2.1944992542266846, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 4.095182368933817, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": -0.14897744357585907, + "logits/rejected": 0.013398826122283936, + "logps/chosen": -2.1722137928009033, + "logps/rejected": -3.0760350227355957, + "loss": 0.688, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1722137928009033, + "rewards/margins": 0.903821587562561, + "rewards/rejected": -3.0760350227355957, + "sft_loss": 2.233502149581909, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 2.3074695699932373, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": -0.07054503262042999, + "logits/rejected": -0.003586306469514966, + "logps/chosen": -2.15727162361145, + "logps/rejected": -3.1748225688934326, + "loss": 0.6837, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.15727162361145, + "rewards/margins": 1.017551302909851, + "rewards/rejected": -3.1748225688934326, + "sft_loss": 2.1737217903137207, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 3.6905084049689956, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": -0.12484552711248398, + "logits/rejected": 0.03302082419395447, + "logps/chosen": -2.2111077308654785, + "logps/rejected": -2.932865619659424, + "loss": 0.6925, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.2111077308654785, + "rewards/margins": 0.7217577695846558, + "rewards/rejected": -2.932865619659424, + "sft_loss": 2.117246389389038, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 3.7559632713951587, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": -0.09252647310495377, + "logits/rejected": 0.08411063998937607, + "logps/chosen": -2.143631935119629, + "logps/rejected": -3.171995162963867, + "loss": 0.681, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.143631935119629, + "rewards/margins": 1.0283634662628174, + "rewards/rejected": -3.171995162963867, + "sft_loss": 2.054680585861206, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 3.785218662060919, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": -0.1127227172255516, + "logits/rejected": -0.006171461194753647, + "logps/chosen": -2.428866147994995, + "logps/rejected": -3.1094393730163574, + "loss": 0.6954, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.428866147994995, + "rewards/margins": 0.6805731654167175, + "rewards/rejected": -3.1094393730163574, + "sft_loss": 2.4352684020996094, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 2.9886125247810438, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": -0.16224198043346405, + "logits/rejected": 0.03348865360021591, + "logps/chosen": -2.052062511444092, + "logps/rejected": -3.012722969055176, + "loss": 0.6824, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.052062511444092, + "rewards/margins": 0.9606603384017944, + "rewards/rejected": -3.012722969055176, + "sft_loss": 2.0672218799591064, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 7.474193348670629, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": -0.11525171995162964, + "logits/rejected": -0.021411413326859474, + "logps/chosen": -2.1601333618164062, + "logps/rejected": -3.1377646923065186, + "loss": 0.6911, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1601333618164062, + "rewards/margins": 0.9776315689086914, + "rewards/rejected": -3.1377646923065186, + "sft_loss": 2.1464765071868896, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 3.675852847011412, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": -0.14804470539093018, + "logits/rejected": -0.12396585941314697, + "logps/chosen": -2.142313241958618, + "logps/rejected": -2.747668743133545, + "loss": 0.696, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.142313241958618, + "rewards/margins": 0.605355978012085, + "rewards/rejected": -2.747668743133545, + "sft_loss": 2.142691135406494, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 1.827307395063867, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": -0.144433856010437, + "logits/rejected": 0.03242563083767891, + "logps/chosen": -2.1680703163146973, + "logps/rejected": -2.841989517211914, + "loss": 0.705, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1680703163146973, + "rewards/margins": 0.6739190816879272, + "rewards/rejected": -2.841989517211914, + "sft_loss": 2.108734369277954, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 5.850728721372914, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": -0.02978910505771637, + "logits/rejected": 0.028155144304037094, + "logps/chosen": -2.1252212524414062, + "logps/rejected": -2.815303325653076, + "loss": 0.6879, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1252212524414062, + "rewards/margins": 0.6900821924209595, + "rewards/rejected": -2.815303325653076, + "sft_loss": 2.041010856628418, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 2.2595190183874685, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": 0.035829830914735794, + "logits/rejected": 0.18875843286514282, + "logps/chosen": -2.093374729156494, + "logps/rejected": -3.026752233505249, + "loss": 0.6914, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.093374729156494, + "rewards/margins": 0.933377742767334, + "rewards/rejected": -3.026752233505249, + "sft_loss": 2.0471291542053223, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 2.895533679660134, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": 0.055860865861177444, + "logits/rejected": 0.2418518364429474, + "logps/chosen": -2.094313859939575, + "logps/rejected": -3.223344087600708, + "loss": 0.6916, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.094313859939575, + "rewards/margins": 1.1290298700332642, + "rewards/rejected": -3.223344087600708, + "sft_loss": 2.103170156478882, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 4.157992270118887, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": 0.1447199285030365, + "logits/rejected": 0.21992520987987518, + "logps/chosen": -2.229947566986084, + "logps/rejected": -2.9657130241394043, + "loss": 0.6907, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.229947566986084, + "rewards/margins": 0.7357650995254517, + "rewards/rejected": -2.9657130241394043, + "sft_loss": 2.187542676925659, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 4.614746209370739, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": 0.028188159689307213, + "logits/rejected": 0.07980314642190933, + "logps/chosen": -2.128232955932617, + "logps/rejected": -2.8294498920440674, + "loss": 0.6957, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.128232955932617, + "rewards/margins": 0.7012170553207397, + "rewards/rejected": -2.8294498920440674, + "sft_loss": 2.120162010192871, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 2.6420371183347204, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": 0.020165940746665, + "logits/rejected": 0.1338595598936081, + "logps/chosen": -2.051849603652954, + "logps/rejected": -3.0664772987365723, + "loss": 0.6796, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.051849603652954, + "rewards/margins": 1.0146278142929077, + "rewards/rejected": -3.0664772987365723, + "sft_loss": 2.0553438663482666, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 3.764557498320856, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": 0.07896244525909424, + "logits/rejected": 0.3769746422767639, + "logps/chosen": -2.0657172203063965, + "logps/rejected": -3.0718822479248047, + "loss": 0.6851, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0657172203063965, + "rewards/margins": 1.006164789199829, + "rewards/rejected": -3.0718822479248047, + "sft_loss": 2.0536303520202637, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 3.9943672104879697, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": 0.03182698413729668, + "logits/rejected": 0.26468613743782043, + "logps/chosen": -2.0125889778137207, + "logps/rejected": -2.906167507171631, + "loss": 0.6767, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0125889778137207, + "rewards/margins": 0.8935783505439758, + "rewards/rejected": -2.906167507171631, + "sft_loss": 2.0478873252868652, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 4.330009826251876, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": 0.061708033084869385, + "logits/rejected": 0.2412114441394806, + "logps/chosen": -2.091811180114746, + "logps/rejected": -2.896186351776123, + "loss": 0.6874, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.091811180114746, + "rewards/margins": 0.8043753504753113, + "rewards/rejected": -2.896186351776123, + "sft_loss": 2.1207423210144043, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 9.02744531612043, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": 0.034645337611436844, + "logits/rejected": 0.1325272023677826, + "logps/chosen": -2.0987558364868164, + "logps/rejected": -3.0625805854797363, + "loss": 0.6892, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0987558364868164, + "rewards/margins": 0.9638249278068542, + "rewards/rejected": -3.0625805854797363, + "sft_loss": 2.1816165447235107, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 1.866573424200562, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": -0.0053394995629787445, + "logits/rejected": 0.0790596604347229, + "logps/chosen": -2.267483711242676, + "logps/rejected": -3.1621835231781006, + "loss": 0.6956, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.267483711242676, + "rewards/margins": 0.8946998715400696, + "rewards/rejected": -3.1621835231781006, + "sft_loss": 2.173220157623291, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 2.2911175585506394, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": 0.014605918899178505, + "logits/rejected": 0.10750974714756012, + "logps/chosen": -2.3764493465423584, + "logps/rejected": -3.1504592895507812, + "loss": 0.6958, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3764493465423584, + "rewards/margins": 0.7740097045898438, + "rewards/rejected": -3.1504592895507812, + "sft_loss": 2.3708794116973877, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 2.1480728234079156, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": -0.03691485524177551, + "logits/rejected": 0.07787419110536575, + "logps/chosen": -2.2911763191223145, + "logps/rejected": -3.2916271686553955, + "loss": 0.6771, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2911763191223145, + "rewards/margins": 1.000450611114502, + "rewards/rejected": -3.2916271686553955, + "sft_loss": 2.260286331176758, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 1.8362749505910247, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": 0.013022062368690968, + "logits/rejected": 0.22734710574150085, + "logps/chosen": -2.223485231399536, + "logps/rejected": -3.106647491455078, + "loss": 0.6952, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.223485231399536, + "rewards/margins": 0.8831623792648315, + "rewards/rejected": -3.106647491455078, + "sft_loss": 2.1939940452575684, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 3.314984005450456, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": 0.0009105164790526032, + "logits/rejected": 0.19259147346019745, + "logps/chosen": -2.1488661766052246, + "logps/rejected": -3.095223903656006, + "loss": 0.684, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1488661766052246, + "rewards/margins": 0.9463576078414917, + "rewards/rejected": -3.095223903656006, + "sft_loss": 2.160351037979126, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 2.426602568608894, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": -0.05211140587925911, + "logits/rejected": 0.16416163742542267, + "logps/chosen": -2.1900627613067627, + "logps/rejected": -2.8962035179138184, + "loss": 0.6942, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1900627613067627, + "rewards/margins": 0.7061406970024109, + "rewards/rejected": -2.8962035179138184, + "sft_loss": 2.170743465423584, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 3.4497849360982396, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": 0.022833820432424545, + "logits/rejected": 0.14317765831947327, + "logps/chosen": -2.170104742050171, + "logps/rejected": -3.0590224266052246, + "loss": 0.6728, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.170104742050171, + "rewards/margins": 0.8889178037643433, + "rewards/rejected": -3.0590224266052246, + "sft_loss": 2.172898769378662, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 2.939900637807476, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": 0.10830320417881012, + "logits/rejected": 0.29697293043136597, + "logps/chosen": -2.2700865268707275, + "logps/rejected": -2.937676429748535, + "loss": 0.6844, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2700865268707275, + "rewards/margins": 0.6675900816917419, + "rewards/rejected": -2.937676429748535, + "sft_loss": 2.2042338848114014, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 3.276701370462819, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": 0.16689571738243103, + "logits/rejected": 0.2072555124759674, + "logps/chosen": -2.110347032546997, + "logps/rejected": -3.064884901046753, + "loss": 0.6828, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.110347032546997, + "rewards/margins": 0.9545377492904663, + "rewards/rejected": -3.064884901046753, + "sft_loss": 2.2188868522644043, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 3.985887115424679, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": 0.06006260961294174, + "logits/rejected": 0.18580766022205353, + "logps/chosen": -2.265280246734619, + "logps/rejected": -3.1473121643066406, + "loss": 0.6864, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.265280246734619, + "rewards/margins": 0.8820323944091797, + "rewards/rejected": -3.1473121643066406, + "sft_loss": 2.2683613300323486, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 3.1283800272101887, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": 0.05172146484255791, + "logits/rejected": 0.04666357487440109, + "logps/chosen": -2.2359461784362793, + "logps/rejected": -2.916210889816284, + "loss": 0.6922, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.2359461784362793, + "rewards/margins": 0.6802645325660706, + "rewards/rejected": -2.916210889816284, + "sft_loss": 2.3265719413757324, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 2.438764303944812, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": -0.08242259919643402, + "logits/rejected": 0.1345296949148178, + "logps/chosen": -2.2915401458740234, + "logps/rejected": -2.9627060890197754, + "loss": 0.7024, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.2915401458740234, + "rewards/margins": 0.6711658239364624, + "rewards/rejected": -2.9627060890197754, + "sft_loss": 2.277489423751831, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 3.011823113421532, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": -0.07217409461736679, + "logits/rejected": 0.06581093370914459, + "logps/chosen": -2.0234055519104004, + "logps/rejected": -3.072521686553955, + "loss": 0.6714, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.0234055519104004, + "rewards/margins": 1.0491163730621338, + "rewards/rejected": -3.072521686553955, + "sft_loss": 2.065383195877075, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 2.8564716043679446, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": -0.02109135314822197, + "logits/rejected": 0.08697710931301117, + "logps/chosen": -2.264143705368042, + "logps/rejected": -3.120563268661499, + "loss": 0.6907, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.264143705368042, + "rewards/margins": 0.856419563293457, + "rewards/rejected": -3.120563268661499, + "sft_loss": 2.190354347229004, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 2.7426036225809076, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": -0.07922705262899399, + "logits/rejected": 0.06583412736654282, + "logps/chosen": -2.1012039184570312, + "logps/rejected": -2.7338175773620605, + "loss": 0.7093, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1012039184570312, + "rewards/margins": 0.6326137781143188, + "rewards/rejected": -2.7338175773620605, + "sft_loss": 2.1062769889831543, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 4.963715932295364, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": -0.08481200039386749, + "logits/rejected": 0.1758030205965042, + "logps/chosen": -2.024156093597412, + "logps/rejected": -3.14689564704895, + "loss": 0.6812, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.024156093597412, + "rewards/margins": 1.1227390766143799, + "rewards/rejected": -3.14689564704895, + "sft_loss": 2.032785415649414, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 10.948141582318529, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": -0.06958422064781189, + "logits/rejected": 0.13180723786354065, + "logps/chosen": -2.123347043991089, + "logps/rejected": -3.3177521228790283, + "loss": 0.6991, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.123347043991089, + "rewards/margins": 1.194405198097229, + "rewards/rejected": -3.3177521228790283, + "sft_loss": 2.058178663253784, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 2.5259745067937915, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": 0.00844159722328186, + "logits/rejected": 0.24382808804512024, + "logps/chosen": -2.0869460105895996, + "logps/rejected": -2.9912471771240234, + "loss": 0.6802, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.0869460105895996, + "rewards/margins": 0.9043010473251343, + "rewards/rejected": -2.9912471771240234, + "sft_loss": 2.063582420349121, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 3.379119975688467, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": -0.015473166480660439, + "logits/rejected": 0.1876002997159958, + "logps/chosen": -2.170123815536499, + "logps/rejected": -3.030186891555786, + "loss": 0.6819, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.170123815536499, + "rewards/margins": 0.860063374042511, + "rewards/rejected": -3.030186891555786, + "sft_loss": 2.189866781234741, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 3.51519766181329, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": 0.052792083472013474, + "logits/rejected": 0.10203012079000473, + "logps/chosen": -2.100395679473877, + "logps/rejected": -2.775660991668701, + "loss": 0.6966, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.100395679473877, + "rewards/margins": 0.6752654314041138, + "rewards/rejected": -2.775660991668701, + "sft_loss": 2.113642454147339, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 3.8963991140526897, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": 0.10614132881164551, + "logits/rejected": 0.15122470259666443, + "logps/chosen": -2.0579097270965576, + "logps/rejected": -2.9311022758483887, + "loss": 0.6794, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0579097270965576, + "rewards/margins": 0.8731926679611206, + "rewards/rejected": -2.9311022758483887, + "sft_loss": 2.034806728363037, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 2.5495966112877837, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": -0.05587650090456009, + "logits/rejected": 0.05207020044326782, + "logps/chosen": -2.127136468887329, + "logps/rejected": -2.8834292888641357, + "loss": 0.6947, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.127136468887329, + "rewards/margins": 0.7562929391860962, + "rewards/rejected": -2.8834292888641357, + "sft_loss": 2.09360408782959, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 2.5223855716345147, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": -0.001058913767337799, + "logits/rejected": 0.004094363190233707, + "logps/chosen": -2.078620433807373, + "logps/rejected": -2.7044858932495117, + "loss": 0.6921, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.078620433807373, + "rewards/margins": 0.6258653998374939, + "rewards/rejected": -2.7044858932495117, + "sft_loss": 2.1078619956970215, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 3.504723988742914, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": -0.04328620433807373, + "logits/rejected": 0.06203915923833847, + "logps/chosen": -2.0743579864501953, + "logps/rejected": -2.8254854679107666, + "loss": 0.6925, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.0743579864501953, + "rewards/margins": 0.7511274814605713, + "rewards/rejected": -2.8254854679107666, + "sft_loss": 2.123765468597412, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 4.403498456561451, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": -0.18621978163719177, + "logits/rejected": 0.007757553365081549, + "logps/chosen": -1.9891008138656616, + "logps/rejected": -3.0335018634796143, + "loss": 0.6723, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9891008138656616, + "rewards/margins": 1.044400930404663, + "rewards/rejected": -3.0335018634796143, + "sft_loss": 1.9587218761444092, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 3.4931091756307078, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": -0.1046561747789383, + "logits/rejected": 0.038362883031368256, + "logps/chosen": -2.0162036418914795, + "logps/rejected": -3.0622410774230957, + "loss": 0.6688, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0162036418914795, + "rewards/margins": 1.0460376739501953, + "rewards/rejected": -3.0622410774230957, + "sft_loss": 1.9905481338500977, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 13.147933058773736, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": -0.16363480687141418, + "logits/rejected": 0.05883455276489258, + "logps/chosen": -1.9224354028701782, + "logps/rejected": -2.836124897003174, + "loss": 0.6824, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.9224354028701782, + "rewards/margins": 0.9136892557144165, + "rewards/rejected": -2.836124897003174, + "sft_loss": 1.9475923776626587, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 4.1661904330642665, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": -0.08059743791818619, + "logits/rejected": 0.12536141276359558, + "logps/chosen": -2.1742968559265137, + "logps/rejected": -3.200460433959961, + "loss": 0.6847, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1742968559265137, + "rewards/margins": 1.0261633396148682, + "rewards/rejected": -3.200460433959961, + "sft_loss": 2.1663155555725098, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 4.642258363325745, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": -0.18666820228099823, + "logits/rejected": -0.010999524965882301, + "logps/chosen": -2.0745177268981934, + "logps/rejected": -3.116467237472534, + "loss": 0.7008, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.0745177268981934, + "rewards/margins": 1.0419495105743408, + "rewards/rejected": -3.116467237472534, + "sft_loss": 2.03446888923645, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 3.6794418828233146, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": -0.14328600466251373, + "logits/rejected": -0.031096193939447403, + "logps/chosen": -2.2875900268554688, + "logps/rejected": -3.104003429412842, + "loss": 0.7005, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2875900268554688, + "rewards/margins": 0.8164132833480835, + "rewards/rejected": -3.104003429412842, + "sft_loss": 2.327225923538208, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 3.7502568359924355, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": -0.1312979757785797, + "logits/rejected": 0.020264511927962303, + "logps/chosen": -2.2415642738342285, + "logps/rejected": -3.072537422180176, + "loss": 0.6959, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.2415642738342285, + "rewards/margins": 0.8309730291366577, + "rewards/rejected": -3.072537422180176, + "sft_loss": 2.2999441623687744, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 2.6776205141460423, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": -0.1437736451625824, + "logits/rejected": 0.004277849104255438, + "logps/chosen": -2.3677988052368164, + "logps/rejected": -3.3778228759765625, + "loss": 0.6894, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3677988052368164, + "rewards/margins": 1.0100243091583252, + "rewards/rejected": -3.3778228759765625, + "sft_loss": 2.324275493621826, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 4.915817636495881, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": -0.1395511031150818, + "logits/rejected": 0.1314028948545456, + "logps/chosen": -2.116849899291992, + "logps/rejected": -3.1788382530212402, + "loss": 0.6852, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.116849899291992, + "rewards/margins": 1.0619887113571167, + "rewards/rejected": -3.1788382530212402, + "sft_loss": 2.091264247894287, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 2.9800217980885235, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": -0.15477004647254944, + "logits/rejected": 0.0680699497461319, + "logps/chosen": -2.162829875946045, + "logps/rejected": -2.9792914390563965, + "loss": 0.6822, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.162829875946045, + "rewards/margins": 0.8164618611335754, + "rewards/rejected": -2.9792914390563965, + "sft_loss": 2.13439679145813, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 3.16895488754896, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": -0.05908023193478584, + "logits/rejected": -0.02534184232354164, + "logps/chosen": -2.12937593460083, + "logps/rejected": -2.8794002532958984, + "loss": 0.6953, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.12937593460083, + "rewards/margins": 0.7500244379043579, + "rewards/rejected": -2.8794002532958984, + "sft_loss": 2.089693784713745, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 4.977184632311022, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": -0.01070446241647005, + "logits/rejected": 0.08270622789859772, + "logps/chosen": -2.0935487747192383, + "logps/rejected": -2.8940985202789307, + "loss": 0.6912, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0935487747192383, + "rewards/margins": 0.8005493879318237, + "rewards/rejected": -2.8940985202789307, + "sft_loss": 2.0332162380218506, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 3.0684148312667245, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": -0.1379140168428421, + "logits/rejected": -0.010696396231651306, + "logps/chosen": -2.1980350017547607, + "logps/rejected": -3.0138888359069824, + "loss": 0.6841, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1980350017547607, + "rewards/margins": 0.8158538937568665, + "rewards/rejected": -3.0138888359069824, + "sft_loss": 2.1696524620056152, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 3.6399155538573518, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": -0.052761245518922806, + "logits/rejected": 0.0353345051407814, + "logps/chosen": -2.1806437969207764, + "logps/rejected": -3.1056928634643555, + "loss": 0.6827, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1806437969207764, + "rewards/margins": 0.9250493049621582, + "rewards/rejected": -3.1056928634643555, + "sft_loss": 2.1760284900665283, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 2.599231904829798, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": -0.22138762474060059, + "logits/rejected": -0.01419213879853487, + "logps/chosen": -2.241899013519287, + "logps/rejected": -3.0917603969573975, + "loss": 0.6936, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.241899013519287, + "rewards/margins": 0.8498618006706238, + "rewards/rejected": -3.0917603969573975, + "sft_loss": 2.1691365242004395, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 2.7305141878885832, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": -0.13783904910087585, + "logits/rejected": 0.042418282479047775, + "logps/chosen": -2.161641836166382, + "logps/rejected": -3.363574266433716, + "loss": 0.6809, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.161641836166382, + "rewards/margins": 1.2019329071044922, + "rewards/rejected": -3.363574266433716, + "sft_loss": 2.0902111530303955, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 2.878417908727133, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": -0.09307009726762772, + "logits/rejected": 0.06742610782384872, + "logps/chosen": -2.1318047046661377, + "logps/rejected": -2.8575503826141357, + "loss": 0.6898, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1318047046661377, + "rewards/margins": 0.7257457375526428, + "rewards/rejected": -2.8575503826141357, + "sft_loss": 2.0882675647735596, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 4.200935469789636, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": -0.08697251975536346, + "logits/rejected": 0.056865572929382324, + "logps/chosen": -2.0718178749084473, + "logps/rejected": -3.113229751586914, + "loss": 0.6907, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0718178749084473, + "rewards/margins": 1.0414116382598877, + "rewards/rejected": -3.113229751586914, + "sft_loss": 2.051907539367676, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 4.0478106630832515, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": -0.09301309287548065, + "logits/rejected": 0.0913057029247284, + "logps/chosen": -2.084655284881592, + "logps/rejected": -3.1130805015563965, + "loss": 0.6852, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.084655284881592, + "rewards/margins": 1.0284250974655151, + "rewards/rejected": -3.1130805015563965, + "sft_loss": 2.156642436981201, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 2.4032541950503625, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": -0.07560623437166214, + "logits/rejected": 0.08645117282867432, + "logps/chosen": -2.083491802215576, + "logps/rejected": -3.286257266998291, + "loss": 0.6805, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.083491802215576, + "rewards/margins": 1.2027655839920044, + "rewards/rejected": -3.286257266998291, + "sft_loss": 2.120220184326172, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 4.0463490987662265, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": -0.011971333995461464, + "logits/rejected": 0.16497957706451416, + "logps/chosen": -2.0660626888275146, + "logps/rejected": -2.881598711013794, + "loss": 0.6863, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.0660626888275146, + "rewards/margins": 0.815536379814148, + "rewards/rejected": -2.881598711013794, + "sft_loss": 2.107175827026367, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 4.071360410507156, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": -0.18575963377952576, + "logits/rejected": 0.02247786521911621, + "logps/chosen": -1.9347765445709229, + "logps/rejected": -2.760807514190674, + "loss": 0.6917, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9347765445709229, + "rewards/margins": 0.8260312080383301, + "rewards/rejected": -2.760807514190674, + "sft_loss": 1.933650016784668, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 3.135509516176366, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": -0.09797381609678268, + "logits/rejected": 0.10881340503692627, + "logps/chosen": -2.189521312713623, + "logps/rejected": -3.2549827098846436, + "loss": 0.6822, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.189521312713623, + "rewards/margins": 1.06546151638031, + "rewards/rejected": -3.2549827098846436, + "sft_loss": 2.204078197479248, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 2.486331336437711, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": -0.00970968697220087, + "logits/rejected": 0.11408871412277222, + "logps/chosen": -2.2408223152160645, + "logps/rejected": -3.1621487140655518, + "loss": 0.6857, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2408223152160645, + "rewards/margins": 0.9213263392448425, + "rewards/rejected": -3.1621487140655518, + "sft_loss": 2.1868247985839844, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 2.688927072409448, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": 0.014814998023211956, + "logits/rejected": 0.10555567592382431, + "logps/chosen": -2.166475534439087, + "logps/rejected": -3.28424072265625, + "loss": 0.6846, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.166475534439087, + "rewards/margins": 1.117765188217163, + "rewards/rejected": -3.28424072265625, + "sft_loss": 2.1495277881622314, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 7.139311582165276, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": -0.17499220371246338, + "logits/rejected": -0.010774696245789528, + "logps/chosen": -2.0613772869110107, + "logps/rejected": -2.8371224403381348, + "loss": 0.707, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0613772869110107, + "rewards/margins": 0.7757450938224792, + "rewards/rejected": -2.8371224403381348, + "sft_loss": 2.0831761360168457, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 3.6126041542575633, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": -0.09352283179759979, + "logits/rejected": 0.05490085482597351, + "logps/chosen": -2.0558700561523438, + "logps/rejected": -2.79360032081604, + "loss": 0.6894, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.0558700561523438, + "rewards/margins": 0.7377304434776306, + "rewards/rejected": -2.79360032081604, + "sft_loss": 2.055211305618286, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": 0.24242374300956726, + "eval_logits/rejected": 0.34696489572525024, + "eval_logps/chosen": -2.1527044773101807, + "eval_logps/rejected": -3.098733901977539, + "eval_loss": 0.6908385753631592, + "eval_rewards/accuracies": 0.6810088753700256, + "eval_rewards/chosen": -2.1527044773101807, + "eval_rewards/margins": 0.9460291862487793, + "eval_rewards/rejected": -3.098733901977539, + "eval_runtime": 46.6607, + "eval_samples_per_second": 28.825, + "eval_sft_loss": 2.1131699085235596, + "eval_steps_per_second": 7.222, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 3.0229337105586116, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": -0.04934023693203926, + "logits/rejected": 0.10366056859493256, + "logps/chosen": -2.0795083045959473, + "logps/rejected": -3.3166897296905518, + "loss": 0.6839, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0795083045959473, + "rewards/margins": 1.2371810674667358, + "rewards/rejected": -3.3166897296905518, + "sft_loss": 2.0751585960388184, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 2.077199559958702, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": -0.12134470790624619, + "logits/rejected": 0.07075067609548569, + "logps/chosen": -2.217790365219116, + "logps/rejected": -2.803415298461914, + "loss": 0.6975, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.217790365219116, + "rewards/margins": 0.5856245756149292, + "rewards/rejected": -2.803415298461914, + "sft_loss": 2.1529674530029297, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 10.27680753319747, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": 0.008393806405365467, + "logits/rejected": 0.11342382431030273, + "logps/chosen": -2.071897268295288, + "logps/rejected": -3.249005079269409, + "loss": 0.675, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.071897268295288, + "rewards/margins": 1.177107572555542, + "rewards/rejected": -3.249005079269409, + "sft_loss": 2.075561046600342, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 4.082283072340448, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": -0.12556777894496918, + "logits/rejected": 0.016571324318647385, + "logps/chosen": -2.1948039531707764, + "logps/rejected": -3.23891019821167, + "loss": 0.6776, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1948039531707764, + "rewards/margins": 1.0441062450408936, + "rewards/rejected": -3.23891019821167, + "sft_loss": 2.202822208404541, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 3.0767883445658026, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": -0.051589228212833405, + "logits/rejected": 0.05762894079089165, + "logps/chosen": -1.9911400079727173, + "logps/rejected": -2.8439207077026367, + "loss": 0.681, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9911400079727173, + "rewards/margins": 0.8527809381484985, + "rewards/rejected": -2.8439207077026367, + "sft_loss": 2.1229031085968018, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 1.8886738530903202, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": -0.046624403446912766, + "logits/rejected": 0.036238282918930054, + "logps/chosen": -2.253139019012451, + "logps/rejected": -3.0171914100646973, + "loss": 0.7037, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.253139019012451, + "rewards/margins": 0.7640522718429565, + "rewards/rejected": -3.0171914100646973, + "sft_loss": 2.2068352699279785, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 2.411245095502254, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": -0.0738714337348938, + "logits/rejected": 0.08386780321598053, + "logps/chosen": -2.155534505844116, + "logps/rejected": -2.863119125366211, + "loss": 0.6965, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.155534505844116, + "rewards/margins": 0.7075840830802917, + "rewards/rejected": -2.863119125366211, + "sft_loss": 2.1776299476623535, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 3.537028745158427, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": -0.12377981841564178, + "logits/rejected": 0.02569674327969551, + "logps/chosen": -2.1482458114624023, + "logps/rejected": -3.1123929023742676, + "loss": 0.6931, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1482458114624023, + "rewards/margins": 0.9641472101211548, + "rewards/rejected": -3.1123929023742676, + "sft_loss": 2.0953750610351562, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 2.2661497164287674, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": -0.1352919489145279, + "logits/rejected": 0.03508482128381729, + "logps/chosen": -2.173128604888916, + "logps/rejected": -2.9652352333068848, + "loss": 0.6932, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.173128604888916, + "rewards/margins": 0.7921067476272583, + "rewards/rejected": -2.9652352333068848, + "sft_loss": 2.1787595748901367, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 2.8400594818591367, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": -0.08295096457004547, + "logits/rejected": 0.07789567112922668, + "logps/chosen": -2.1355133056640625, + "logps/rejected": -3.1785805225372314, + "loss": 0.6797, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1355133056640625, + "rewards/margins": 1.0430670976638794, + "rewards/rejected": -3.1785805225372314, + "sft_loss": 2.096576452255249, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 1.7924089901573266, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": -0.1479923576116562, + "logits/rejected": 0.13825397193431854, + "logps/chosen": -2.13856840133667, + "logps/rejected": -3.0264368057250977, + "loss": 0.6844, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.13856840133667, + "rewards/margins": 0.8878685832023621, + "rewards/rejected": -3.0264368057250977, + "sft_loss": 2.0962822437286377, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 2.7934562089784354, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": -0.08230151981115341, + "logits/rejected": 0.0383109524846077, + "logps/chosen": -2.1695401668548584, + "logps/rejected": -2.992586612701416, + "loss": 0.6834, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1695401668548584, + "rewards/margins": 0.8230465054512024, + "rewards/rejected": -2.992586612701416, + "sft_loss": 2.1208584308624268, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 2.4992385655272455, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": -0.08335469663143158, + "logits/rejected": 0.017959536984562874, + "logps/chosen": -2.0857455730438232, + "logps/rejected": -3.0713672637939453, + "loss": 0.6792, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0857455730438232, + "rewards/margins": 0.9856218099594116, + "rewards/rejected": -3.0713672637939453, + "sft_loss": 2.0089364051818848, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 10.105793611988156, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": -0.21285517513751984, + "logits/rejected": -0.1051969975233078, + "logps/chosen": -2.1936020851135254, + "logps/rejected": -2.995419979095459, + "loss": 0.701, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1936020851135254, + "rewards/margins": 0.8018182516098022, + "rewards/rejected": -2.995419979095459, + "sft_loss": 2.244130849838257, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 2.575060081317489, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": -0.07824737578630447, + "logits/rejected": -0.01782209798693657, + "logps/chosen": -2.265883445739746, + "logps/rejected": -3.1407861709594727, + "loss": 0.6982, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.265883445739746, + "rewards/margins": 0.8749030232429504, + "rewards/rejected": -3.1407861709594727, + "sft_loss": 2.172591209411621, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 3.530855461527871, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": -0.20899859070777893, + "logits/rejected": -0.006174634210765362, + "logps/chosen": -2.1980414390563965, + "logps/rejected": -3.2681076526641846, + "loss": 0.6907, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1980414390563965, + "rewards/margins": 1.070065975189209, + "rewards/rejected": -3.2681076526641846, + "sft_loss": 2.2746310234069824, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 4.209542371097667, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": 0.0029031604062765837, + "logits/rejected": 0.18295882642269135, + "logps/chosen": -2.1312851905822754, + "logps/rejected": -3.35154390335083, + "loss": 0.6764, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1312851905822754, + "rewards/margins": 1.2202587127685547, + "rewards/rejected": -3.35154390335083, + "sft_loss": 2.1908926963806152, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 2.804247444623222, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": -0.11487329006195068, + "logits/rejected": -0.03248930722475052, + "logps/chosen": -2.2412500381469727, + "logps/rejected": -3.1712937355041504, + "loss": 0.6925, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2412500381469727, + "rewards/margins": 0.9300435185432434, + "rewards/rejected": -3.1712937355041504, + "sft_loss": 2.2203731536865234, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 3.353038019020451, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": -0.06891624629497528, + "logits/rejected": -0.036821819841861725, + "logps/chosen": -2.2421700954437256, + "logps/rejected": -3.067368984222412, + "loss": 0.6936, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2421700954437256, + "rewards/margins": 0.8251991271972656, + "rewards/rejected": -3.067368984222412, + "sft_loss": 2.243100643157959, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 3.458953722038985, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": -0.057009391486644745, + "logits/rejected": 0.11005090177059174, + "logps/chosen": -2.0556159019470215, + "logps/rejected": -2.845609188079834, + "loss": 0.687, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0556159019470215, + "rewards/margins": 0.7899934649467468, + "rewards/rejected": -2.845609188079834, + "sft_loss": 2.098008632659912, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 3.94927382740404, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": -0.06724351644515991, + "logits/rejected": 0.1374901980161667, + "logps/chosen": -2.1209895610809326, + "logps/rejected": -3.237943172454834, + "loss": 0.6795, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1209895610809326, + "rewards/margins": 1.116953730583191, + "rewards/rejected": -3.237943172454834, + "sft_loss": 2.0984835624694824, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 5.166669704979176, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": -0.0967196449637413, + "logits/rejected": 0.0657767504453659, + "logps/chosen": -2.0116353034973145, + "logps/rejected": -3.0105011463165283, + "loss": 0.6755, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0116353034973145, + "rewards/margins": 0.9988659024238586, + "rewards/rejected": -3.0105011463165283, + "sft_loss": 2.0762581825256348, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 2.1962659992003672, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": 0.0395740307867527, + "logits/rejected": 0.173264741897583, + "logps/chosen": -2.1408400535583496, + "logps/rejected": -3.083125114440918, + "loss": 0.6806, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1408400535583496, + "rewards/margins": 0.9422849416732788, + "rewards/rejected": -3.083125114440918, + "sft_loss": 2.105454206466675, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 1.9316971958150804, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": -0.04250287264585495, + "logits/rejected": 0.05922934412956238, + "logps/chosen": -2.09122633934021, + "logps/rejected": -2.858992338180542, + "loss": 0.6944, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.09122633934021, + "rewards/margins": 0.7677661180496216, + "rewards/rejected": -2.858992338180542, + "sft_loss": 1.9963337182998657, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 4.502744430309704, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": -0.02525998279452324, + "logits/rejected": 0.0633404478430748, + "logps/chosen": -2.0883584022521973, + "logps/rejected": -2.9223198890686035, + "loss": 0.6841, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0883584022521973, + "rewards/margins": 0.8339619636535645, + "rewards/rejected": -2.9223198890686035, + "sft_loss": 2.147000789642334, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 17.42620499120433, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": -0.04304816573858261, + "logits/rejected": 0.05292002111673355, + "logps/chosen": -2.245084285736084, + "logps/rejected": -2.8690338134765625, + "loss": 0.7071, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.245084285736084, + "rewards/margins": 0.6239495873451233, + "rewards/rejected": -2.8690338134765625, + "sft_loss": 2.1775736808776855, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 2.675042367787873, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": -0.12775865197181702, + "logits/rejected": 0.05053830146789551, + "logps/chosen": -2.1142070293426514, + "logps/rejected": -3.1125524044036865, + "loss": 0.6889, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1142070293426514, + "rewards/margins": 0.9983454942703247, + "rewards/rejected": -3.1125524044036865, + "sft_loss": 2.073481559753418, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 9.224772976262004, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": -0.04525815695524216, + "logits/rejected": 0.15498168766498566, + "logps/chosen": -2.4505741596221924, + "logps/rejected": -3.086620807647705, + "loss": 0.698, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.4505741596221924, + "rewards/margins": 0.6360467076301575, + "rewards/rejected": -3.086620807647705, + "sft_loss": 2.2530603408813477, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 2.7095060842080856, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": -0.05475207418203354, + "logits/rejected": 0.17921461164951324, + "logps/chosen": -2.172621965408325, + "logps/rejected": -3.311145782470703, + "loss": 0.6835, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.172621965408325, + "rewards/margins": 1.138523817062378, + "rewards/rejected": -3.311145782470703, + "sft_loss": 2.169924259185791, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 2.836953925052455, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": -0.042983829975128174, + "logits/rejected": 0.14592500030994415, + "logps/chosen": -2.232512950897217, + "logps/rejected": -2.960404634475708, + "loss": 0.6976, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.232512950897217, + "rewards/margins": 0.7278915643692017, + "rewards/rejected": -2.960404634475708, + "sft_loss": 2.1754684448242188, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 4.065292751766514, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": -0.020567212253808975, + "logits/rejected": 0.1053391695022583, + "logps/chosen": -2.131559133529663, + "logps/rejected": -3.206552505493164, + "loss": 0.6834, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.131559133529663, + "rewards/margins": 1.074993371963501, + "rewards/rejected": -3.206552505493164, + "sft_loss": 2.1917171478271484, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 5.133914146344989, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": -0.00458473339676857, + "logits/rejected": 0.14149120450019836, + "logps/chosen": -2.0979244709014893, + "logps/rejected": -3.1505038738250732, + "loss": 0.6871, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.0979244709014893, + "rewards/margins": 1.0525795221328735, + "rewards/rejected": -3.1505038738250732, + "sft_loss": 2.0719380378723145, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 5.66680961721883, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": -0.03058113530278206, + "logits/rejected": 0.11164456605911255, + "logps/chosen": -2.146927833557129, + "logps/rejected": -3.096108913421631, + "loss": 0.6887, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.146927833557129, + "rewards/margins": 0.9491811990737915, + "rewards/rejected": -3.096108913421631, + "sft_loss": 2.147765874862671, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 6.270360510361445, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": 0.0009026816114783287, + "logits/rejected": 0.06363946944475174, + "logps/chosen": -2.144469738006592, + "logps/rejected": -2.9771530628204346, + "loss": 0.681, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.144469738006592, + "rewards/margins": 0.832683265209198, + "rewards/rejected": -2.9771530628204346, + "sft_loss": 2.064734935760498, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 8.729703440563144, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": -0.12333973497152328, + "logits/rejected": 0.06011788919568062, + "logps/chosen": -2.118159770965576, + "logps/rejected": -3.038071393966675, + "loss": 0.6885, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.118159770965576, + "rewards/margins": 0.9199115037918091, + "rewards/rejected": -3.038071393966675, + "sft_loss": 2.1175942420959473, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 4.973931538666568, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": -0.07782983034849167, + "logits/rejected": 0.012289203703403473, + "logps/chosen": -2.131692409515381, + "logps/rejected": -2.7401137351989746, + "loss": 0.6987, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.131692409515381, + "rewards/margins": 0.6084216833114624, + "rewards/rejected": -2.7401137351989746, + "sft_loss": 2.1399919986724854, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 3.318965976930725, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": -0.0142856789752841, + "logits/rejected": 0.15498195588588715, + "logps/chosen": -2.137073516845703, + "logps/rejected": -3.057272434234619, + "loss": 0.6765, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.137073516845703, + "rewards/margins": 0.9201983213424683, + "rewards/rejected": -3.057272434234619, + "sft_loss": 2.040189027786255, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 5.2308567106999595, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": -0.09322819858789444, + "logits/rejected": 0.04602864384651184, + "logps/chosen": -2.081974983215332, + "logps/rejected": -3.0379087924957275, + "loss": 0.6917, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.081974983215332, + "rewards/margins": 0.9559333920478821, + "rewards/rejected": -3.0379087924957275, + "sft_loss": 2.123471736907959, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 4.1952284473055075, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": -0.1261480301618576, + "logits/rejected": -0.0629156082868576, + "logps/chosen": -2.166938304901123, + "logps/rejected": -2.9580254554748535, + "loss": 0.6908, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.166938304901123, + "rewards/margins": 0.7910870313644409, + "rewards/rejected": -2.9580254554748535, + "sft_loss": 2.124786853790283, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 4.768424623304929, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": -0.014707823283970356, + "logits/rejected": 0.1322084218263626, + "logps/chosen": -2.1736626625061035, + "logps/rejected": -2.9306247234344482, + "loss": 0.68, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1736626625061035, + "rewards/margins": 0.7569620013237, + "rewards/rejected": -2.9306247234344482, + "sft_loss": 2.120302200317383, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 2.7539073753290637, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": 0.013084961101412773, + "logits/rejected": 0.0828789547085762, + "logps/chosen": -2.197340250015259, + "logps/rejected": -2.9716684818267822, + "loss": 0.6936, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.197340250015259, + "rewards/margins": 0.774328351020813, + "rewards/rejected": -2.9716684818267822, + "sft_loss": 2.27701735496521, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 4.566894831208481, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": 0.021959930658340454, + "logits/rejected": 0.07701905816793442, + "logps/chosen": -2.2421679496765137, + "logps/rejected": -3.062756061553955, + "loss": 0.69, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.2421679496765137, + "rewards/margins": 0.8205882906913757, + "rewards/rejected": -3.062756061553955, + "sft_loss": 2.292219638824463, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 4.95882769400696, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": -0.07913017272949219, + "logits/rejected": 0.06965837627649307, + "logps/chosen": -2.2385611534118652, + "logps/rejected": -3.219817638397217, + "loss": 0.6896, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2385611534118652, + "rewards/margins": 0.9812566041946411, + "rewards/rejected": -3.219817638397217, + "sft_loss": 2.2946548461914062, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 9.493816800187712, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": -0.06189330667257309, + "logits/rejected": -0.11217920482158661, + "logps/chosen": -2.3079352378845215, + "logps/rejected": -2.866248369216919, + "loss": 0.7073, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.3079352378845215, + "rewards/margins": 0.5583130121231079, + "rewards/rejected": -2.866248369216919, + "sft_loss": 2.3487675189971924, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 2.054016193827397, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": -0.14082583785057068, + "logits/rejected": 0.02557968534529209, + "logps/chosen": -2.4099087715148926, + "logps/rejected": -3.370522975921631, + "loss": 0.6846, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.4099087715148926, + "rewards/margins": 0.9606143236160278, + "rewards/rejected": -3.370522975921631, + "sft_loss": 2.304067850112915, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 4.057502087752727, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": -0.22423699498176575, + "logits/rejected": -0.01131142396479845, + "logps/chosen": -2.3235392570495605, + "logps/rejected": -3.3511900901794434, + "loss": 0.6996, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3235392570495605, + "rewards/margins": 1.0276509523391724, + "rewards/rejected": -3.3511900901794434, + "sft_loss": 2.3187010288238525, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 2.2240564234525264, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": -0.2010897397994995, + "logits/rejected": -0.10360528528690338, + "logps/chosen": -2.27347993850708, + "logps/rejected": -3.3287010192871094, + "loss": 0.6869, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.27347993850708, + "rewards/margins": 1.0552215576171875, + "rewards/rejected": -3.3287010192871094, + "sft_loss": 2.262712001800537, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 3.3316564064774665, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": -0.0734337568283081, + "logits/rejected": 0.030337577685713768, + "logps/chosen": -2.405501127243042, + "logps/rejected": -3.0734591484069824, + "loss": 0.6951, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.405501127243042, + "rewards/margins": 0.6679580211639404, + "rewards/rejected": -3.0734591484069824, + "sft_loss": 2.271773338317871, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 3.1025024967632397, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": -0.11891385167837143, + "logits/rejected": -0.03588943928480148, + "logps/chosen": -2.1229982376098633, + "logps/rejected": -3.171241283416748, + "loss": 0.674, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1229982376098633, + "rewards/margins": 1.0482432842254639, + "rewards/rejected": -3.171241283416748, + "sft_loss": 2.0762438774108887, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 2.418702025653042, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": -0.12587347626686096, + "logits/rejected": 0.03255102410912514, + "logps/chosen": -2.1877565383911133, + "logps/rejected": -3.266651153564453, + "loss": 0.7039, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1877565383911133, + "rewards/margins": 1.0788941383361816, + "rewards/rejected": -3.266651153564453, + "sft_loss": 2.1533803939819336, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 4.455130732749031, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": -0.11404607445001602, + "logits/rejected": 0.067964106798172, + "logps/chosen": -2.112873077392578, + "logps/rejected": -3.0061681270599365, + "loss": 0.6883, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.112873077392578, + "rewards/margins": 0.8932951092720032, + "rewards/rejected": -3.0061681270599365, + "sft_loss": 2.1034371852874756, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 3.2683981759932887, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": -0.05666341260075569, + "logits/rejected": 0.016394445672631264, + "logps/chosen": -2.1205034255981445, + "logps/rejected": -3.087794542312622, + "loss": 0.6923, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1205034255981445, + "rewards/margins": 0.9672911763191223, + "rewards/rejected": -3.087794542312622, + "sft_loss": 2.069809913635254, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 3.5313129911679066, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": -0.21050508320331573, + "logits/rejected": -0.06813536584377289, + "logps/chosen": -2.082973003387451, + "logps/rejected": -3.1073174476623535, + "loss": 0.6829, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.082973003387451, + "rewards/margins": 1.0243443250656128, + "rewards/rejected": -3.1073174476623535, + "sft_loss": 2.097914934158325, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 3.284569886518826, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": -0.054897792637348175, + "logits/rejected": 0.06013824790716171, + "logps/chosen": -2.1149497032165527, + "logps/rejected": -2.877192974090576, + "loss": 0.6872, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1149497032165527, + "rewards/margins": 0.7622435688972473, + "rewards/rejected": -2.877192974090576, + "sft_loss": 2.0054376125335693, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 3.659317295834073, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": -0.15437868237495422, + "logits/rejected": 0.0225119199603796, + "logps/chosen": -2.2297348976135254, + "logps/rejected": -2.859705924987793, + "loss": 0.6925, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2297348976135254, + "rewards/margins": 0.6299708485603333, + "rewards/rejected": -2.859705924987793, + "sft_loss": 1.9377400875091553, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 19.207448858430492, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": -0.1754283607006073, + "logits/rejected": 0.010037758387625217, + "logps/chosen": -2.1164345741271973, + "logps/rejected": -3.0492329597473145, + "loss": 0.7039, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1164345741271973, + "rewards/margins": 0.9327982068061829, + "rewards/rejected": -3.0492329597473145, + "sft_loss": 2.0898313522338867, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 5.091046328651422, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": -0.1769765168428421, + "logits/rejected": 0.059815000742673874, + "logps/chosen": -2.1104233264923096, + "logps/rejected": -3.0075087547302246, + "loss": 0.6808, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1104233264923096, + "rewards/margins": 0.8970853686332703, + "rewards/rejected": -3.0075087547302246, + "sft_loss": 2.111417293548584, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 3.8557371204085675, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": -0.2643243670463562, + "logits/rejected": 0.016842365264892578, + "logps/chosen": -2.2394261360168457, + "logps/rejected": -3.551403760910034, + "loss": 0.687, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.2394261360168457, + "rewards/margins": 1.3119779825210571, + "rewards/rejected": -3.551403760910034, + "sft_loss": 2.1109273433685303, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 3.2034004384790236, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": -0.04526636004447937, + "logits/rejected": -0.016321176663041115, + "logps/chosen": -2.2352993488311768, + "logps/rejected": -3.2442870140075684, + "loss": 0.6926, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2352993488311768, + "rewards/margins": 1.0089879035949707, + "rewards/rejected": -3.2442870140075684, + "sft_loss": 2.1991376876831055, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 2.9629505412965678, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": -0.10669227689504623, + "logits/rejected": 0.10436113178730011, + "logps/chosen": -2.392090082168579, + "logps/rejected": -3.246337890625, + "loss": 0.7042, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.392090082168579, + "rewards/margins": 0.8542478680610657, + "rewards/rejected": -3.246337890625, + "sft_loss": 2.3559975624084473, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 2.4741411627768453, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": -0.14265497028827667, + "logits/rejected": 0.1273798644542694, + "logps/chosen": -2.1982200145721436, + "logps/rejected": -3.619431257247925, + "loss": 0.6771, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1982200145721436, + "rewards/margins": 1.4212112426757812, + "rewards/rejected": -3.619431257247925, + "sft_loss": 2.028993844985962, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 3.10751717393839, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": -0.0072515071369707584, + "logits/rejected": 0.017031148076057434, + "logps/chosen": -2.2495055198669434, + "logps/rejected": -3.25390625, + "loss": 0.6831, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2495055198669434, + "rewards/margins": 1.0044009685516357, + "rewards/rejected": -3.25390625, + "sft_loss": 2.162059783935547, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 6.7866290750320655, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": -0.1543581187725067, + "logits/rejected": -0.029458314180374146, + "logps/chosen": -2.0542521476745605, + "logps/rejected": -2.9937496185302734, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0542521476745605, + "rewards/margins": 0.939497172832489, + "rewards/rejected": -2.9937496185302734, + "sft_loss": 2.108109951019287, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 4.616865662276508, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": -0.0960715264081955, + "logits/rejected": 0.04589344188570976, + "logps/chosen": -2.112891912460327, + "logps/rejected": -2.9276225566864014, + "loss": 0.6929, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.112891912460327, + "rewards/margins": 0.8147305250167847, + "rewards/rejected": -2.9276225566864014, + "sft_loss": 1.950931191444397, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 4.386497894315425, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": -0.06804818660020828, + "logits/rejected": 0.04135305806994438, + "logps/chosen": -2.1073989868164062, + "logps/rejected": -2.949591875076294, + "loss": 0.6957, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1073989868164062, + "rewards/margins": 0.8421930074691772, + "rewards/rejected": -2.949591875076294, + "sft_loss": 2.000044822692871, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 5.891030771344286, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": -0.08829103410243988, + "logits/rejected": 0.1070912703871727, + "logps/chosen": -1.9999017715454102, + "logps/rejected": -3.1407268047332764, + "loss": 0.6803, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9999017715454102, + "rewards/margins": 1.1408249139785767, + "rewards/rejected": -3.1407268047332764, + "sft_loss": 1.9781440496444702, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 3.2961844654332677, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": -0.09192325174808502, + "logits/rejected": 0.13550327718257904, + "logps/chosen": -1.9177381992340088, + "logps/rejected": -2.9311869144439697, + "loss": 0.6763, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9177381992340088, + "rewards/margins": 1.01344895362854, + "rewards/rejected": -2.9311869144439697, + "sft_loss": 1.8968242406845093, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 4.013646920419382, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": -0.12752971053123474, + "logits/rejected": -0.021973803639411926, + "logps/chosen": -2.0632452964782715, + "logps/rejected": -3.4493393898010254, + "loss": 0.6846, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0632452964782715, + "rewards/margins": 1.3860942125320435, + "rewards/rejected": -3.4493393898010254, + "sft_loss": 2.0599193572998047, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 3.427883733138985, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": -0.08913668245077133, + "logits/rejected": 0.06239492446184158, + "logps/chosen": -2.0854339599609375, + "logps/rejected": -2.9506967067718506, + "loss": 0.6901, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.0854339599609375, + "rewards/margins": 0.8652628064155579, + "rewards/rejected": -2.9506967067718506, + "sft_loss": 1.9990270137786865, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 3.744537290651893, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": -0.05436503887176514, + "logits/rejected": 0.09435670077800751, + "logps/chosen": -2.314728260040283, + "logps/rejected": -3.2752997875213623, + "loss": 0.687, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.314728260040283, + "rewards/margins": 0.9605711698532104, + "rewards/rejected": -3.2752997875213623, + "sft_loss": 2.152890205383301, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 3.8042713205231786, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": -0.04366375878453255, + "logits/rejected": 0.09801409393548965, + "logps/chosen": -2.060361862182617, + "logps/rejected": -3.076183319091797, + "loss": 0.6847, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.060361862182617, + "rewards/margins": 1.0158214569091797, + "rewards/rejected": -3.076183319091797, + "sft_loss": 2.054492473602295, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 3.5111005283603847, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": -0.13682511448860168, + "logits/rejected": 0.10204527527093887, + "logps/chosen": -2.1179380416870117, + "logps/rejected": -3.244584560394287, + "loss": 0.6811, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1179380416870117, + "rewards/margins": 1.1266463994979858, + "rewards/rejected": -3.244584560394287, + "sft_loss": 2.091881513595581, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 4.3374137874837295, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": -0.006934487726539373, + "logits/rejected": 0.15117433667182922, + "logps/chosen": -2.162091016769409, + "logps/rejected": -3.2057461738586426, + "loss": 0.6782, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.162091016769409, + "rewards/margins": 1.0436547994613647, + "rewards/rejected": -3.2057461738586426, + "sft_loss": 2.1595187187194824, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 4.683417026061879, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": -0.1065053790807724, + "logits/rejected": -0.020374376326799393, + "logps/chosen": -2.004215955734253, + "logps/rejected": -2.893080234527588, + "loss": 0.6834, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.004215955734253, + "rewards/margins": 0.8888643383979797, + "rewards/rejected": -2.893080234527588, + "sft_loss": 2.09393572807312, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 3.037548868110086, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": -0.02161308005452156, + "logits/rejected": 0.0724690780043602, + "logps/chosen": -2.0727245807647705, + "logps/rejected": -3.1097216606140137, + "loss": 0.6877, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0727245807647705, + "rewards/margins": 1.0369970798492432, + "rewards/rejected": -3.1097216606140137, + "sft_loss": 2.0997555255889893, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 3.8869330057851443, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": -0.022613149136304855, + "logits/rejected": 0.11061491817235947, + "logps/chosen": -2.1283986568450928, + "logps/rejected": -2.840240955352783, + "loss": 0.7006, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1283986568450928, + "rewards/margins": 0.7118419408798218, + "rewards/rejected": -2.840240955352783, + "sft_loss": 2.047914505004883, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 2.2609095311694754, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": -0.0316866859793663, + "logits/rejected": 0.14357516169548035, + "logps/chosen": -2.233394145965576, + "logps/rejected": -3.324572801589966, + "loss": 0.6881, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.233394145965576, + "rewards/margins": 1.0911785364151, + "rewards/rejected": -3.324572801589966, + "sft_loss": 2.155008316040039, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 3.6441905285982346, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": 0.05673060566186905, + "logits/rejected": 0.051228396594524384, + "logps/chosen": -2.1823010444641113, + "logps/rejected": -3.0657646656036377, + "loss": 0.6946, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1823010444641113, + "rewards/margins": 0.8834635019302368, + "rewards/rejected": -3.0657646656036377, + "sft_loss": 2.2928757667541504, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 3.314604897832424, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": 0.14910665154457092, + "logits/rejected": 0.16638293862342834, + "logps/chosen": -2.0749332904815674, + "logps/rejected": -2.994952917098999, + "loss": 0.6884, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.0749332904815674, + "rewards/margins": 0.9200199246406555, + "rewards/rejected": -2.994952917098999, + "sft_loss": 1.9762952327728271, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 11.038593057821188, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": -0.007317324168980122, + "logits/rejected": 0.09911766648292542, + "logps/chosen": -2.163130044937134, + "logps/rejected": -3.0095102787017822, + "loss": 0.6881, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.163130044937134, + "rewards/margins": 0.8463799357414246, + "rewards/rejected": -3.0095102787017822, + "sft_loss": 2.1841933727264404, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": 0.4063626527786255, + "eval_logits/rejected": 0.5238215327262878, + "eval_logps/chosen": -2.230652332305908, + "eval_logps/rejected": -3.1887776851654053, + "eval_loss": 0.6908450126647949, + "eval_rewards/accuracies": 0.6862017512321472, + "eval_rewards/chosen": -2.230652332305908, + "eval_rewards/margins": 0.9581254720687866, + "eval_rewards/rejected": -3.1887776851654053, + "eval_runtime": 46.198, + "eval_samples_per_second": 29.114, + "eval_sft_loss": 2.1384212970733643, + "eval_steps_per_second": 7.295, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 5.249129536525488, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": -0.16110579669475555, + "logits/rejected": 0.025376638397574425, + "logps/chosen": -2.0280165672302246, + "logps/rejected": -3.126965045928955, + "loss": 0.6871, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0280165672302246, + "rewards/margins": 1.098948359489441, + "rewards/rejected": -3.126965045928955, + "sft_loss": 1.9881798028945923, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 6.613956597776934, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": -0.003742316272109747, + "logits/rejected": 0.08696229755878448, + "logps/chosen": -2.16291880607605, + "logps/rejected": -3.104762554168701, + "loss": 0.6848, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.16291880607605, + "rewards/margins": 0.9418438076972961, + "rewards/rejected": -3.104762554168701, + "sft_loss": 2.196422815322876, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 3.279375142433588, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": 0.0026434913743287325, + "logits/rejected": 0.14492645859718323, + "logps/chosen": -2.1458747386932373, + "logps/rejected": -3.288313388824463, + "loss": 0.6775, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1458747386932373, + "rewards/margins": 1.142438530921936, + "rewards/rejected": -3.288313388824463, + "sft_loss": 2.1363162994384766, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 4.937155891046909, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": -0.04307904094457626, + "logits/rejected": 0.17259356379508972, + "logps/chosen": -2.2710137367248535, + "logps/rejected": -3.0655980110168457, + "loss": 0.6946, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2710137367248535, + "rewards/margins": 0.7945840358734131, + "rewards/rejected": -3.0655980110168457, + "sft_loss": 2.290271520614624, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 4.550612774789899, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": -0.13909904658794403, + "logits/rejected": 0.06880120187997818, + "logps/chosen": -2.2514679431915283, + "logps/rejected": -3.4120922088623047, + "loss": 0.6979, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2514679431915283, + "rewards/margins": 1.1606245040893555, + "rewards/rejected": -3.4120922088623047, + "sft_loss": 2.19282865524292, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 6.180464443390251, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": -0.031425271183252335, + "logits/rejected": 0.10157237946987152, + "logps/chosen": -2.109801769256592, + "logps/rejected": -2.9821524620056152, + "loss": 0.6971, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.109801769256592, + "rewards/margins": 0.872350811958313, + "rewards/rejected": -2.9821524620056152, + "sft_loss": 2.1158018112182617, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 5.626113358092249, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": -0.07793781906366348, + "logits/rejected": 0.08719579130411148, + "logps/chosen": -2.122753381729126, + "logps/rejected": -3.166059970855713, + "loss": 0.683, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.122753381729126, + "rewards/margins": 1.0433070659637451, + "rewards/rejected": -3.166059970855713, + "sft_loss": 2.135249614715576, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 5.6941103393042685, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": -0.05138932541012764, + "logits/rejected": 0.03822798654437065, + "logps/chosen": -2.108766794204712, + "logps/rejected": -2.8018271923065186, + "loss": 0.6886, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.108766794204712, + "rewards/margins": 0.693060576915741, + "rewards/rejected": -2.8018271923065186, + "sft_loss": 2.0910696983337402, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 4.607030383324202, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": -0.017714444547891617, + "logits/rejected": 0.08581845462322235, + "logps/chosen": -2.2201247215270996, + "logps/rejected": -2.962507724761963, + "loss": 0.7, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.2201247215270996, + "rewards/margins": 0.7423833608627319, + "rewards/rejected": -2.962507724761963, + "sft_loss": 2.1716628074645996, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 4.468149326200883, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": -0.09650838375091553, + "logits/rejected": 0.023339275270700455, + "logps/chosen": -2.1109557151794434, + "logps/rejected": -3.225125551223755, + "loss": 0.6867, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1109557151794434, + "rewards/margins": 1.1141700744628906, + "rewards/rejected": -3.225125551223755, + "sft_loss": 2.039635419845581, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 2.453265911287917, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": -0.14216859638690948, + "logits/rejected": 0.0720430314540863, + "logps/chosen": -2.2796530723571777, + "logps/rejected": -3.128239631652832, + "loss": 0.7017, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2796530723571777, + "rewards/margins": 0.8485862612724304, + "rewards/rejected": -3.128239631652832, + "sft_loss": 2.2780544757843018, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 2.607407849995168, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": -0.1372571736574173, + "logits/rejected": 0.084715835750103, + "logps/chosen": -2.3706908226013184, + "logps/rejected": -3.081747531890869, + "loss": 0.7004, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.3706908226013184, + "rewards/margins": 0.7110565304756165, + "rewards/rejected": -3.081747531890869, + "sft_loss": 2.333536148071289, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 8.064558298484231, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": -0.1745007485151291, + "logits/rejected": -0.011042768135666847, + "logps/chosen": -2.287440061569214, + "logps/rejected": -3.0668773651123047, + "loss": 0.7036, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.287440061569214, + "rewards/margins": 0.7794371247291565, + "rewards/rejected": -3.0668773651123047, + "sft_loss": 2.3433339595794678, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 4.2037700192625325, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": -0.11125187575817108, + "logits/rejected": 0.031043073162436485, + "logps/chosen": -2.205533981323242, + "logps/rejected": -2.9975638389587402, + "loss": 0.6995, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.205533981323242, + "rewards/margins": 0.7920295000076294, + "rewards/rejected": -2.9975638389587402, + "sft_loss": 2.1208176612854004, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 2.7759824802719097, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": -0.1375740021467209, + "logits/rejected": -0.0133648831397295, + "logps/chosen": -2.174509048461914, + "logps/rejected": -3.10461163520813, + "loss": 0.6828, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.174509048461914, + "rewards/margins": 0.9301024675369263, + "rewards/rejected": -3.10461163520813, + "sft_loss": 2.068380117416382, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 4.088577632674356, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": 0.006173181347548962, + "logits/rejected": 0.1368529349565506, + "logps/chosen": -2.194118022918701, + "logps/rejected": -3.136410713195801, + "loss": 0.6821, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.194118022918701, + "rewards/margins": 0.9422923922538757, + "rewards/rejected": -3.136410713195801, + "sft_loss": 2.1732704639434814, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 4.426292415082979, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": -0.09883008897304535, + "logits/rejected": 0.051213592290878296, + "logps/chosen": -2.1624035835266113, + "logps/rejected": -3.1205196380615234, + "loss": 0.6886, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1624035835266113, + "rewards/margins": 0.9581155776977539, + "rewards/rejected": -3.1205196380615234, + "sft_loss": 2.1758995056152344, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 2.4395152212735476, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": -0.16651707887649536, + "logits/rejected": 0.04488401114940643, + "logps/chosen": -2.1542766094207764, + "logps/rejected": -3.1055610179901123, + "loss": 0.6897, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1542766094207764, + "rewards/margins": 0.9512848854064941, + "rewards/rejected": -3.1055610179901123, + "sft_loss": 2.0944085121154785, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 3.603265268828746, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": -0.17208468914031982, + "logits/rejected": 0.0010690949857234955, + "logps/chosen": -2.142911672592163, + "logps/rejected": -3.1632254123687744, + "loss": 0.681, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.142911672592163, + "rewards/margins": 1.0203137397766113, + "rewards/rejected": -3.1632254123687744, + "sft_loss": 2.0967581272125244, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 6.547466770834866, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": -0.07524513453245163, + "logits/rejected": 0.041724175214767456, + "logps/chosen": -1.978674292564392, + "logps/rejected": -3.111820936203003, + "loss": 0.684, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.978674292564392, + "rewards/margins": 1.1331464052200317, + "rewards/rejected": -3.111820936203003, + "sft_loss": 1.9212112426757812, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 3.2321118214909372, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": -0.03188861533999443, + "logits/rejected": -0.00413927435874939, + "logps/chosen": -2.118699312210083, + "logps/rejected": -2.957021951675415, + "loss": 0.6808, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.118699312210083, + "rewards/margins": 0.8383227586746216, + "rewards/rejected": -2.957021951675415, + "sft_loss": 2.168797254562378, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 2.8278055693680786, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": -0.14892300963401794, + "logits/rejected": -0.030514398589730263, + "logps/chosen": -2.0119025707244873, + "logps/rejected": -2.8636975288391113, + "loss": 0.6799, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0119025707244873, + "rewards/margins": 0.8517950177192688, + "rewards/rejected": -2.8636975288391113, + "sft_loss": 2.0259180068969727, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 2.988004895135199, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": -0.08654950559139252, + "logits/rejected": 0.1146860346198082, + "logps/chosen": -2.184629201889038, + "logps/rejected": -3.116959810256958, + "loss": 0.6849, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.184629201889038, + "rewards/margins": 0.9323304891586304, + "rewards/rejected": -3.116959810256958, + "sft_loss": 2.261133909225464, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 1.9290604318268172, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": -0.09083691239356995, + "logits/rejected": 0.026575163006782532, + "logps/chosen": -2.237321376800537, + "logps/rejected": -3.0823521614074707, + "loss": 0.6958, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.237321376800537, + "rewards/margins": 0.8450304865837097, + "rewards/rejected": -3.0823521614074707, + "sft_loss": 2.248321294784546, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 3.790374916646817, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": -0.07825301587581635, + "logits/rejected": 0.017550267279148102, + "logps/chosen": -2.110146999359131, + "logps/rejected": -2.972914218902588, + "loss": 0.676, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.110146999359131, + "rewards/margins": 0.8627673387527466, + "rewards/rejected": -2.972914218902588, + "sft_loss": 2.1116256713867188, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 2.9220937274086882, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": -0.14792025089263916, + "logits/rejected": 0.03304235264658928, + "logps/chosen": -2.1069369316101074, + "logps/rejected": -3.6364331245422363, + "loss": 0.6804, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1069369316101074, + "rewards/margins": 1.529496192932129, + "rewards/rejected": -3.6364331245422363, + "sft_loss": 2.1745944023132324, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 3.72442445552988, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": -0.15431484580039978, + "logits/rejected": -0.0035203725565224886, + "logps/chosen": -2.216923236846924, + "logps/rejected": -3.1889290809631348, + "loss": 0.6908, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.216923236846924, + "rewards/margins": 0.9720057249069214, + "rewards/rejected": -3.1889290809631348, + "sft_loss": 2.2095744609832764, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 5.576256203094089, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": -0.18151506781578064, + "logits/rejected": -0.02000053972005844, + "logps/chosen": -2.1487581729888916, + "logps/rejected": -3.3196964263916016, + "loss": 0.6884, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1487581729888916, + "rewards/margins": 1.17093825340271, + "rewards/rejected": -3.3196964263916016, + "sft_loss": 2.141321897506714, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 7.924339527105727, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": -0.1652074158191681, + "logits/rejected": -0.029881944879889488, + "logps/chosen": -2.250629425048828, + "logps/rejected": -3.0189013481140137, + "loss": 0.696, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.250629425048828, + "rewards/margins": 0.7682719230651855, + "rewards/rejected": -3.0189013481140137, + "sft_loss": 2.1879518032073975, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 3.9403049633715552, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": -0.14509066939353943, + "logits/rejected": 0.10351689159870148, + "logps/chosen": -2.1401453018188477, + "logps/rejected": -3.30322003364563, + "loss": 0.6795, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1401453018188477, + "rewards/margins": 1.1630749702453613, + "rewards/rejected": -3.30322003364563, + "sft_loss": 2.1067137718200684, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 4.295090476521195, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": -0.06472639739513397, + "logits/rejected": 0.06039848178625107, + "logps/chosen": -2.192394971847534, + "logps/rejected": -3.0823774337768555, + "loss": 0.6897, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.192394971847534, + "rewards/margins": 0.8899825811386108, + "rewards/rejected": -3.0823774337768555, + "sft_loss": 2.183534860610962, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 4.286368424165502, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": -0.18022814393043518, + "logits/rejected": -0.02799808979034424, + "logps/chosen": -2.1506359577178955, + "logps/rejected": -3.170880079269409, + "loss": 0.6758, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1506359577178955, + "rewards/margins": 1.0202442407608032, + "rewards/rejected": -3.170880079269409, + "sft_loss": 2.0969057083129883, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 4.374127775780103, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": -0.19103659689426422, + "logits/rejected": -0.04271901398897171, + "logps/chosen": -2.113696575164795, + "logps/rejected": -3.0879569053649902, + "loss": 0.6918, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.113696575164795, + "rewards/margins": 0.9742606282234192, + "rewards/rejected": -3.0879569053649902, + "sft_loss": 2.064755916595459, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 5.628471622661212, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": -0.15977461636066437, + "logits/rejected": 0.03489946201443672, + "logps/chosen": -2.130821943283081, + "logps/rejected": -3.102430820465088, + "loss": 0.6887, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.130821943283081, + "rewards/margins": 0.9716089367866516, + "rewards/rejected": -3.102430820465088, + "sft_loss": 2.1123976707458496, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 3.6802442032569367, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": -0.10645937919616699, + "logits/rejected": 0.058650027960538864, + "logps/chosen": -2.086397409439087, + "logps/rejected": -3.0651001930236816, + "loss": 0.6836, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.086397409439087, + "rewards/margins": 0.9787028431892395, + "rewards/rejected": -3.0651001930236816, + "sft_loss": 2.1481781005859375, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 4.276156693995655, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": -0.15901954472064972, + "logits/rejected": 0.015384090133011341, + "logps/chosen": -2.1815383434295654, + "logps/rejected": -2.9221997261047363, + "loss": 0.6974, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1815383434295654, + "rewards/margins": 0.7406615018844604, + "rewards/rejected": -2.9221997261047363, + "sft_loss": 2.1552226543426514, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 3.0153592254368684, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": -0.16179385781288147, + "logits/rejected": 0.04337051510810852, + "logps/chosen": -2.2223448753356934, + "logps/rejected": -3.1061816215515137, + "loss": 0.6933, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2223448753356934, + "rewards/margins": 0.8838367462158203, + "rewards/rejected": -3.1061816215515137, + "sft_loss": 2.18412709236145, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 4.186038813179471, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": -0.1903652846813202, + "logits/rejected": 0.03294439986348152, + "logps/chosen": -2.03208589553833, + "logps/rejected": -2.955350399017334, + "loss": 0.6845, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.03208589553833, + "rewards/margins": 0.9232648611068726, + "rewards/rejected": -2.955350399017334, + "sft_loss": 2.0463991165161133, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 3.66438788945719, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": -0.07795457541942596, + "logits/rejected": 0.044455841183662415, + "logps/chosen": -2.093140125274658, + "logps/rejected": -3.4263968467712402, + "loss": 0.6704, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.093140125274658, + "rewards/margins": 1.3332566022872925, + "rewards/rejected": -3.4263968467712402, + "sft_loss": 2.0226621627807617, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 3.151988106221695, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": -0.15561290085315704, + "logits/rejected": 0.033493004739284515, + "logps/chosen": -2.302936553955078, + "logps/rejected": -3.164163589477539, + "loss": 0.7015, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.302936553955078, + "rewards/margins": 0.8612270355224609, + "rewards/rejected": -3.164163589477539, + "sft_loss": 2.196629762649536, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 2.8628032654728264, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": -0.2185770720243454, + "logits/rejected": -0.029982399195432663, + "logps/chosen": -2.2461745738983154, + "logps/rejected": -2.9426379203796387, + "loss": 0.6946, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2461745738983154, + "rewards/margins": 0.6964629888534546, + "rewards/rejected": -2.9426379203796387, + "sft_loss": 2.1740994453430176, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 3.1255261422064433, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": -0.20517203211784363, + "logits/rejected": -0.06608210504055023, + "logps/chosen": -2.1317734718322754, + "logps/rejected": -2.9222843647003174, + "loss": 0.6975, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1317734718322754, + "rewards/margins": 0.7905106544494629, + "rewards/rejected": -2.9222843647003174, + "sft_loss": 2.0713953971862793, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 5.768165969138451, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": 0.0029327759984880686, + "logits/rejected": 0.11774277687072754, + "logps/chosen": -2.297055244445801, + "logps/rejected": -3.0728812217712402, + "loss": 0.685, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.297055244445801, + "rewards/margins": 0.7758262157440186, + "rewards/rejected": -3.0728812217712402, + "sft_loss": 2.2605652809143066, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 2.5702006766937258, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": -0.15413738787174225, + "logits/rejected": 0.0005656067514792085, + "logps/chosen": -2.1795623302459717, + "logps/rejected": -2.9262468814849854, + "loss": 0.6957, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1795623302459717, + "rewards/margins": 0.7466843128204346, + "rewards/rejected": -2.9262468814849854, + "sft_loss": 2.213491439819336, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 5.271967894864706, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": -0.1758485734462738, + "logits/rejected": 0.023218151181936264, + "logps/chosen": -2.1578831672668457, + "logps/rejected": -2.9815473556518555, + "loss": 0.7062, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1578831672668457, + "rewards/margins": 0.8236640095710754, + "rewards/rejected": -2.9815473556518555, + "sft_loss": 2.1854913234710693, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 2.075048894960246, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": -0.11611847579479218, + "logits/rejected": 0.05111664533615112, + "logps/chosen": -2.231825351715088, + "logps/rejected": -3.0683205127716064, + "loss": 0.6903, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.231825351715088, + "rewards/margins": 0.8364952206611633, + "rewards/rejected": -3.0683205127716064, + "sft_loss": 2.1659626960754395, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 3.5925567745372113, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": -0.02528810128569603, + "logits/rejected": 0.12856647372245789, + "logps/chosen": -2.1949684619903564, + "logps/rejected": -2.9283640384674072, + "loss": 0.6903, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1949684619903564, + "rewards/margins": 0.7333954572677612, + "rewards/rejected": -2.9283640384674072, + "sft_loss": 2.0562968254089355, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 4.432527418254685, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": -0.1314159482717514, + "logits/rejected": -0.04454684257507324, + "logps/chosen": -1.9996497631072998, + "logps/rejected": -3.1647861003875732, + "loss": 0.6715, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9996497631072998, + "rewards/margins": 1.1651363372802734, + "rewards/rejected": -3.1647861003875732, + "sft_loss": 1.9506438970565796, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 3.3377921145216276, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": -0.09291049093008041, + "logits/rejected": 0.02424928918480873, + "logps/chosen": -2.110121250152588, + "logps/rejected": -2.952497720718384, + "loss": 0.6783, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.110121250152588, + "rewards/margins": 0.8423765897750854, + "rewards/rejected": -2.952497720718384, + "sft_loss": 2.107722043991089, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 3.1045058013576843, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": -0.10040037333965302, + "logits/rejected": 0.06632226705551147, + "logps/chosen": -2.252039909362793, + "logps/rejected": -3.2612273693084717, + "loss": 0.6813, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.252039909362793, + "rewards/margins": 1.0091878175735474, + "rewards/rejected": -3.2612273693084717, + "sft_loss": 2.16507625579834, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 3.153065379729183, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": -0.16843988001346588, + "logits/rejected": 0.000528356060385704, + "logps/chosen": -2.3047170639038086, + "logps/rejected": -3.4012997150421143, + "loss": 0.6845, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3047170639038086, + "rewards/margins": 1.0965824127197266, + "rewards/rejected": -3.4012997150421143, + "sft_loss": 2.2305407524108887, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 3.9368702121084267, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": -0.147186279296875, + "logits/rejected": 0.05198676139116287, + "logps/chosen": -2.1867175102233887, + "logps/rejected": -2.9347281455993652, + "loss": 0.703, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1867175102233887, + "rewards/margins": 0.7480108141899109, + "rewards/rejected": -2.9347281455993652, + "sft_loss": 2.173590660095215, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 3.785919690362723, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": -0.07448790222406387, + "logits/rejected": 0.0009125359356403351, + "logps/chosen": -2.205174684524536, + "logps/rejected": -3.205883026123047, + "loss": 0.6793, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.205174684524536, + "rewards/margins": 1.0007085800170898, + "rewards/rejected": -3.205883026123047, + "sft_loss": 2.1745455265045166, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 3.4168904526951103, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": -0.1712881475687027, + "logits/rejected": -0.04074662923812866, + "logps/chosen": -2.1688315868377686, + "logps/rejected": -3.120831251144409, + "loss": 0.6868, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.1688315868377686, + "rewards/margins": 0.9519997835159302, + "rewards/rejected": -3.120831251144409, + "sft_loss": 2.1889805793762207, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 6.0006636293693365, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": -0.18512877821922302, + "logits/rejected": -0.08922187983989716, + "logps/chosen": -2.178508758544922, + "logps/rejected": -3.1332712173461914, + "loss": 0.6942, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.178508758544922, + "rewards/margins": 0.9547624588012695, + "rewards/rejected": -3.1332712173461914, + "sft_loss": 2.1173133850097656, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 4.0880950992063525, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": -0.22838346660137177, + "logits/rejected": -0.04794811084866524, + "logps/chosen": -1.9763202667236328, + "logps/rejected": -3.038149118423462, + "loss": 0.6844, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9763202667236328, + "rewards/margins": 1.06182861328125, + "rewards/rejected": -3.038149118423462, + "sft_loss": 2.046569347381592, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 5.322380708277623, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": -0.03665539249777794, + "logits/rejected": -0.000216527289012447, + "logps/chosen": -2.0814623832702637, + "logps/rejected": -3.04225492477417, + "loss": 0.6778, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0814623832702637, + "rewards/margins": 0.9607928395271301, + "rewards/rejected": -3.04225492477417, + "sft_loss": 2.105666399002075, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 5.2965558466896585, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": -0.06266290694475174, + "logits/rejected": 0.02715703286230564, + "logps/chosen": -2.153308391571045, + "logps/rejected": -2.997816324234009, + "loss": 0.7017, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.153308391571045, + "rewards/margins": 0.8445073366165161, + "rewards/rejected": -2.997816324234009, + "sft_loss": 2.1758735179901123, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 5.050427817653276, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": -0.11321164667606354, + "logits/rejected": 0.04749942943453789, + "logps/chosen": -2.0724129676818848, + "logps/rejected": -3.036367416381836, + "loss": 0.6885, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0724129676818848, + "rewards/margins": 0.9639546275138855, + "rewards/rejected": -3.036367416381836, + "sft_loss": 2.055748701095581, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 2.8593400384428778, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": -0.21733930706977844, + "logits/rejected": -0.06262455135583878, + "logps/chosen": -2.0864243507385254, + "logps/rejected": -2.8974545001983643, + "loss": 0.7004, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0864243507385254, + "rewards/margins": 0.8110300302505493, + "rewards/rejected": -2.8974545001983643, + "sft_loss": 2.1405720710754395, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 4.075564805059594, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": -0.162893608212471, + "logits/rejected": 0.03598882630467415, + "logps/chosen": -2.1391289234161377, + "logps/rejected": -2.9104714393615723, + "loss": 0.6947, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1391289234161377, + "rewards/margins": 0.7713426351547241, + "rewards/rejected": -2.9104714393615723, + "sft_loss": 2.18563175201416, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 2.585986685688883, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": -0.13090942800045013, + "logits/rejected": 0.01913970336318016, + "logps/chosen": -2.203361749649048, + "logps/rejected": -3.140873432159424, + "loss": 0.6839, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.203361749649048, + "rewards/margins": 0.9375116229057312, + "rewards/rejected": -3.140873432159424, + "sft_loss": 2.044311285018921, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 2.7794459362126056, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": -0.05823718383908272, + "logits/rejected": 0.07972903549671173, + "logps/chosen": -2.183250904083252, + "logps/rejected": -3.225594997406006, + "loss": 0.674, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.183250904083252, + "rewards/margins": 1.0423442125320435, + "rewards/rejected": -3.225594997406006, + "sft_loss": 2.2276806831359863, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 2.3950145551511017, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": -0.14604242146015167, + "logits/rejected": 0.03321395069360733, + "logps/chosen": -2.200139284133911, + "logps/rejected": -3.2287135124206543, + "loss": 0.6935, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.200139284133911, + "rewards/margins": 1.0285742282867432, + "rewards/rejected": -3.2287135124206543, + "sft_loss": 2.1648683547973633, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 2.6956670959006748, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": -0.07609611749649048, + "logits/rejected": 0.061819422990083694, + "logps/chosen": -2.2954039573669434, + "logps/rejected": -3.36649751663208, + "loss": 0.6823, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2954039573669434, + "rewards/margins": 1.0710933208465576, + "rewards/rejected": -3.36649751663208, + "sft_loss": 2.121478319168091, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 3.976046023660791, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": -0.04199652001261711, + "logits/rejected": 0.13352611660957336, + "logps/chosen": -2.2372660636901855, + "logps/rejected": -3.020014524459839, + "loss": 0.6982, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.2372660636901855, + "rewards/margins": 0.7827486991882324, + "rewards/rejected": -3.020014524459839, + "sft_loss": 2.188228130340576, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 3.9131279165094113, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": -0.0072118318639695644, + "logits/rejected": 0.11471160501241684, + "logps/chosen": -2.3043265342712402, + "logps/rejected": -3.1066927909851074, + "loss": 0.6893, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.3043265342712402, + "rewards/margins": 0.8023663759231567, + "rewards/rejected": -3.1066927909851074, + "sft_loss": 2.17824387550354, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 2.6934657257650065, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": -0.23482973873615265, + "logits/rejected": -0.0825137048959732, + "logps/chosen": -1.9831600189208984, + "logps/rejected": -2.984605312347412, + "loss": 0.6748, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9831600189208984, + "rewards/margins": 1.0014454126358032, + "rewards/rejected": -2.984605312347412, + "sft_loss": 1.999122977256775, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 9.744426748480588, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": -0.03684605285525322, + "logits/rejected": 0.11848436295986176, + "logps/chosen": -2.0876357555389404, + "logps/rejected": -3.0805702209472656, + "loss": 0.6917, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.0876357555389404, + "rewards/margins": 0.9929342269897461, + "rewards/rejected": -3.0805702209472656, + "sft_loss": 1.9902302026748657, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 2.702808170076698, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": -0.16841986775398254, + "logits/rejected": -0.032748930156230927, + "logps/chosen": -2.0487492084503174, + "logps/rejected": -2.90828275680542, + "loss": 0.6772, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0487492084503174, + "rewards/margins": 0.8595331311225891, + "rewards/rejected": -2.90828275680542, + "sft_loss": 1.9349241256713867, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 5.340687027874895, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": -0.1761549711227417, + "logits/rejected": -0.05665317177772522, + "logps/chosen": -1.9597280025482178, + "logps/rejected": -2.958338975906372, + "loss": 0.6794, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9597280025482178, + "rewards/margins": 0.9986109733581543, + "rewards/rejected": -2.958338975906372, + "sft_loss": 1.918230652809143, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 5.223228830052162, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": -0.06425870954990387, + "logits/rejected": -0.08404186367988586, + "logps/chosen": -2.114051342010498, + "logps/rejected": -3.1076130867004395, + "loss": 0.6814, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.114051342010498, + "rewards/margins": 0.9935620427131653, + "rewards/rejected": -3.1076130867004395, + "sft_loss": 2.173743724822998, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 2.937341760162775, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": -0.21452081203460693, + "logits/rejected": -0.0759705901145935, + "logps/chosen": -2.1878199577331543, + "logps/rejected": -2.7924647331237793, + "loss": 0.7066, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.1878199577331543, + "rewards/margins": 0.6046445965766907, + "rewards/rejected": -2.7924647331237793, + "sft_loss": 2.2030882835388184, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 2.172738048264022, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": -0.1627374291419983, + "logits/rejected": -0.03917828947305679, + "logps/chosen": -2.130401611328125, + "logps/rejected": -3.2156753540039062, + "loss": 0.69, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.130401611328125, + "rewards/margins": 1.0852737426757812, + "rewards/rejected": -3.2156753540039062, + "sft_loss": 2.1349804401397705, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 4.130207287016142, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": -0.1787964105606079, + "logits/rejected": -0.02974070981144905, + "logps/chosen": -2.211841583251953, + "logps/rejected": -2.8528695106506348, + "loss": 0.7068, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.211841583251953, + "rewards/margins": 0.6410278677940369, + "rewards/rejected": -2.8528695106506348, + "sft_loss": 2.191458225250244, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 2.1139808890437104, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": -0.015827927738428116, + "logits/rejected": 0.04487539082765579, + "logps/chosen": -2.256462335586548, + "logps/rejected": -3.1642420291900635, + "loss": 0.6976, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.256462335586548, + "rewards/margins": 0.9077796936035156, + "rewards/rejected": -3.1642420291900635, + "sft_loss": 2.2019569873809814, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 4.308994349875126, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": -0.09209471940994263, + "logits/rejected": -0.031523577868938446, + "logps/chosen": -2.2957634925842285, + "logps/rejected": -2.847827672958374, + "loss": 0.6975, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2957634925842285, + "rewards/margins": 0.5520642995834351, + "rewards/rejected": -2.847827672958374, + "sft_loss": 2.2830893993377686, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 3.0778188600677643, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": -0.18241338431835175, + "logits/rejected": 0.012217411771416664, + "logps/chosen": -2.1110739707946777, + "logps/rejected": -3.1337497234344482, + "loss": 0.6808, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1110739707946777, + "rewards/margins": 1.0226755142211914, + "rewards/rejected": -3.1337497234344482, + "sft_loss": 2.089545965194702, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 3.611694227509719, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": -0.17740394175052643, + "logits/rejected": -0.041568391025066376, + "logps/chosen": -2.113341808319092, + "logps/rejected": -3.0952255725860596, + "loss": 0.6859, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.113341808319092, + "rewards/margins": 0.9818838238716125, + "rewards/rejected": -3.0952255725860596, + "sft_loss": 2.0244381427764893, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 3.5878150063322045, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": -0.04258178547024727, + "logits/rejected": 0.125771164894104, + "logps/chosen": -2.2499518394470215, + "logps/rejected": -3.1202502250671387, + "loss": 0.6998, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.2499518394470215, + "rewards/margins": 0.8702983856201172, + "rewards/rejected": -3.1202502250671387, + "sft_loss": 2.2162721157073975, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": 0.16938525438308716, + "eval_logits/rejected": 0.26880332827568054, + "eval_logps/chosen": -2.171912670135498, + "eval_logps/rejected": -3.1258187294006348, + "eval_loss": 0.6900331377983093, + "eval_rewards/accuracies": 0.6936202049255371, + "eval_rewards/chosen": -2.171912670135498, + "eval_rewards/margins": 0.9539060592651367, + "eval_rewards/rejected": -3.1258187294006348, + "eval_runtime": 44.4262, + "eval_samples_per_second": 30.275, + "eval_sft_loss": 2.1093075275421143, + "eval_steps_per_second": 7.586, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 2.5065271531069038, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": -0.21862120926380157, + "logits/rejected": -0.10379710048437119, + "logps/chosen": -2.1867823600769043, + "logps/rejected": -3.1316921710968018, + "loss": 0.6825, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1867823600769043, + "rewards/margins": 0.9449104070663452, + "rewards/rejected": -3.1316921710968018, + "sft_loss": 2.16475248336792, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 3.89401502233373, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": -0.19211474061012268, + "logits/rejected": -0.04500243440270424, + "logps/chosen": -2.1232149600982666, + "logps/rejected": -2.9413015842437744, + "loss": 0.6963, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.1232149600982666, + "rewards/margins": 0.8180867433547974, + "rewards/rejected": -2.9413015842437744, + "sft_loss": 2.0833516120910645, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 2.8031214070077213, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": -0.058820150792598724, + "logits/rejected": 0.07056679576635361, + "logps/chosen": -2.1051268577575684, + "logps/rejected": -3.0708067417144775, + "loss": 0.6871, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1051268577575684, + "rewards/margins": 0.9656797647476196, + "rewards/rejected": -3.0708067417144775, + "sft_loss": 2.090538501739502, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 2.4297870549412246, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": -0.11713247001171112, + "logits/rejected": 0.16286148130893707, + "logps/chosen": -2.40877366065979, + "logps/rejected": -3.234921932220459, + "loss": 0.6931, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.40877366065979, + "rewards/margins": 0.8261480331420898, + "rewards/rejected": -3.234921932220459, + "sft_loss": 2.234438419342041, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 4.840884673603746, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": -0.0701625794172287, + "logits/rejected": 0.08214187622070312, + "logps/chosen": -1.9803073406219482, + "logps/rejected": -2.982759952545166, + "loss": 0.6848, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.9803073406219482, + "rewards/margins": 1.0024524927139282, + "rewards/rejected": -2.982759952545166, + "sft_loss": 1.9953334331512451, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 5.248148076110207, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": -0.09757836908102036, + "logits/rejected": 0.09544403105974197, + "logps/chosen": -2.1850745677948, + "logps/rejected": -3.2705535888671875, + "loss": 0.683, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1850745677948, + "rewards/margins": 1.0854787826538086, + "rewards/rejected": -3.2705535888671875, + "sft_loss": 2.143946886062622, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 3.887898220382095, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": -0.07574521005153656, + "logits/rejected": 0.03172556310892105, + "logps/chosen": -2.0378127098083496, + "logps/rejected": -2.9185850620269775, + "loss": 0.6867, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0378127098083496, + "rewards/margins": 0.8807722926139832, + "rewards/rejected": -2.9185850620269775, + "sft_loss": 2.0330047607421875, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 2.682458799901896, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": 0.013766959309577942, + "logits/rejected": 0.13985109329223633, + "logps/chosen": -2.1408016681671143, + "logps/rejected": -3.178284168243408, + "loss": 0.672, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1408016681671143, + "rewards/margins": 1.037482500076294, + "rewards/rejected": -3.178284168243408, + "sft_loss": 1.9422342777252197, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 4.050477821888417, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": -0.19383029639720917, + "logits/rejected": -0.046612389385700226, + "logps/chosen": -2.194392681121826, + "logps/rejected": -2.949484348297119, + "loss": 0.6904, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.194392681121826, + "rewards/margins": 0.7550913095474243, + "rewards/rejected": -2.949484348297119, + "sft_loss": 2.1109633445739746, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 4.141531108529457, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": 0.006335936486721039, + "logits/rejected": 0.19332371652126312, + "logps/chosen": -1.9818389415740967, + "logps/rejected": -2.979146718978882, + "loss": 0.6811, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.9818389415740967, + "rewards/margins": 0.9973075985908508, + "rewards/rejected": -2.979146718978882, + "sft_loss": 1.9491941928863525, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 2.906960024292214, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": -0.039695125073194504, + "logits/rejected": 0.06806042045354843, + "logps/chosen": -2.106238603591919, + "logps/rejected": -2.9520974159240723, + "loss": 0.6843, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.106238603591919, + "rewards/margins": 0.8458584547042847, + "rewards/rejected": -2.9520974159240723, + "sft_loss": 2.095874309539795, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 2.5598016246844804, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": -0.12927773594856262, + "logits/rejected": 0.04138825461268425, + "logps/chosen": -2.0338780879974365, + "logps/rejected": -3.0352959632873535, + "loss": 0.6752, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0338780879974365, + "rewards/margins": 1.001417875289917, + "rewards/rejected": -3.0352959632873535, + "sft_loss": 2.0644609928131104, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 5.3129089347850345, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": -0.026618679985404015, + "logits/rejected": 0.12305717170238495, + "logps/chosen": -2.0354652404785156, + "logps/rejected": -2.9555282592773438, + "loss": 0.6874, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.0354652404785156, + "rewards/margins": 0.9200627207756042, + "rewards/rejected": -2.9555282592773438, + "sft_loss": 2.086569309234619, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 17.067815849238478, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": -0.09709037095308304, + "logits/rejected": 0.05840582400560379, + "logps/chosen": -1.999385118484497, + "logps/rejected": -2.9995360374450684, + "loss": 0.6862, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.999385118484497, + "rewards/margins": 1.0001510381698608, + "rewards/rejected": -2.9995360374450684, + "sft_loss": 2.018332004547119, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 4.184862860164015, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": -0.154140904545784, + "logits/rejected": 0.15783226490020752, + "logps/chosen": -2.037508487701416, + "logps/rejected": -2.9666733741760254, + "loss": 0.6768, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.037508487701416, + "rewards/margins": 0.9291653633117676, + "rewards/rejected": -2.9666733741760254, + "sft_loss": 2.0432841777801514, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 4.83664371080636, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": -0.12879234552383423, + "logits/rejected": -0.055740825831890106, + "logps/chosen": -2.003026247024536, + "logps/rejected": -3.0168797969818115, + "loss": 0.6803, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.003026247024536, + "rewards/margins": 1.013853907585144, + "rewards/rejected": -3.0168797969818115, + "sft_loss": 2.0483086109161377, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 3.6468834080002828, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": -0.11835076659917831, + "logits/rejected": 0.12381935119628906, + "logps/chosen": -2.1517369747161865, + "logps/rejected": -3.050192356109619, + "loss": 0.6867, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1517369747161865, + "rewards/margins": 0.8984552621841431, + "rewards/rejected": -3.050192356109619, + "sft_loss": 2.141003370285034, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 3.8191587634646424, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": -0.07299528270959854, + "logits/rejected": 0.04519936814904213, + "logps/chosen": -2.209841012954712, + "logps/rejected": -3.1058108806610107, + "loss": 0.6919, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.209841012954712, + "rewards/margins": 0.8959699869155884, + "rewards/rejected": -3.1058108806610107, + "sft_loss": 2.2350947856903076, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 4.917189460703717, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": -0.09429631382226944, + "logits/rejected": 0.01863674819469452, + "logps/chosen": -2.3670341968536377, + "logps/rejected": -3.198622941970825, + "loss": 0.7021, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.3670341968536377, + "rewards/margins": 0.8315887451171875, + "rewards/rejected": -3.198622941970825, + "sft_loss": 2.329556941986084, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 6.2948407219727445, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": -0.20224657654762268, + "logits/rejected": -0.026348227635025978, + "logps/chosen": -2.357205390930176, + "logps/rejected": -3.389937162399292, + "loss": 0.6903, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.357205390930176, + "rewards/margins": 1.0327322483062744, + "rewards/rejected": -3.389937162399292, + "sft_loss": 2.300279378890991, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 3.535278360715077, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": -0.19258543848991394, + "logits/rejected": 0.0005183167522773147, + "logps/chosen": -2.2726645469665527, + "logps/rejected": -3.235269546508789, + "loss": 0.6941, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2726645469665527, + "rewards/margins": 0.9626048803329468, + "rewards/rejected": -3.235269546508789, + "sft_loss": 2.2232964038848877, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 3.821952745409955, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": -0.022418078035116196, + "logits/rejected": 0.03682177886366844, + "logps/chosen": -2.3169894218444824, + "logps/rejected": -3.344888687133789, + "loss": 0.6867, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.3169894218444824, + "rewards/margins": 1.0278997421264648, + "rewards/rejected": -3.344888687133789, + "sft_loss": 2.3466386795043945, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 5.968262115040822, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": -0.17735466361045837, + "logits/rejected": 0.028507202863693237, + "logps/chosen": -2.2298970222473145, + "logps/rejected": -3.1068997383117676, + "loss": 0.6935, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2298970222473145, + "rewards/margins": 0.8770028948783875, + "rewards/rejected": -3.1068997383117676, + "sft_loss": 2.2399425506591797, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 5.369649985568279, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": -0.11486073583364487, + "logits/rejected": 0.12659046053886414, + "logps/chosen": -2.2763190269470215, + "logps/rejected": -3.341658115386963, + "loss": 0.698, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2763190269470215, + "rewards/margins": 1.065339207649231, + "rewards/rejected": -3.341658115386963, + "sft_loss": 2.2652580738067627, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 3.865119586160214, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": -0.1526700258255005, + "logits/rejected": 0.07134034484624863, + "logps/chosen": -2.319143295288086, + "logps/rejected": -3.345414400100708, + "loss": 0.6854, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.319143295288086, + "rewards/margins": 1.0262712240219116, + "rewards/rejected": -3.345414400100708, + "sft_loss": 2.3606820106506348, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 6.205044855109865, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": -0.037674445658922195, + "logits/rejected": 0.0866425484418869, + "logps/chosen": -2.041942834854126, + "logps/rejected": -3.265056610107422, + "loss": 0.6686, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.041942834854126, + "rewards/margins": 1.223113775253296, + "rewards/rejected": -3.265056610107422, + "sft_loss": 2.0743894577026367, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 3.912801244648953, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": -0.11040419340133667, + "logits/rejected": 0.08194796741008759, + "logps/chosen": -2.2899370193481445, + "logps/rejected": -3.1266794204711914, + "loss": 0.6925, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.2899370193481445, + "rewards/margins": 0.8367422223091125, + "rewards/rejected": -3.1266794204711914, + "sft_loss": 2.195441722869873, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 3.7753776596782953, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": -0.05143510550260544, + "logits/rejected": 0.11683867126703262, + "logps/chosen": -1.96332585811615, + "logps/rejected": -3.1719508171081543, + "loss": 0.6688, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.96332585811615, + "rewards/margins": 1.2086249589920044, + "rewards/rejected": -3.1719508171081543, + "sft_loss": 1.982080101966858, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 4.414998388539221, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": -0.08478949964046478, + "logits/rejected": 0.1309826523065567, + "logps/chosen": -1.9758306741714478, + "logps/rejected": -2.9565446376800537, + "loss": 0.6721, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9758306741714478, + "rewards/margins": 0.9807138442993164, + "rewards/rejected": -2.9565446376800537, + "sft_loss": 2.01054310798645, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 3.5291379444260276, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": -0.1986711025238037, + "logits/rejected": -0.03366810828447342, + "logps/chosen": -2.1853854656219482, + "logps/rejected": -3.0746588706970215, + "loss": 0.6907, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1853854656219482, + "rewards/margins": 0.889273464679718, + "rewards/rejected": -3.0746588706970215, + "sft_loss": 2.1545419692993164, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 14.564448690812977, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": 0.016578923910856247, + "logits/rejected": 0.09158939123153687, + "logps/chosen": -2.2663533687591553, + "logps/rejected": -3.0901200771331787, + "loss": 0.7024, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2663533687591553, + "rewards/margins": 0.8237667083740234, + "rewards/rejected": -3.0901200771331787, + "sft_loss": 2.151644229888916, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 7.208621192726958, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": -0.09454993903636932, + "logits/rejected": 0.11722595989704132, + "logps/chosen": -2.0083000659942627, + "logps/rejected": -2.8931920528411865, + "loss": 0.6947, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0083000659942627, + "rewards/margins": 0.8848922848701477, + "rewards/rejected": -2.8931920528411865, + "sft_loss": 1.980495810508728, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 4.309712271139695, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": -0.10693303495645523, + "logits/rejected": 0.0911029800772667, + "logps/chosen": -2.194310426712036, + "logps/rejected": -3.086164951324463, + "loss": 0.6874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.194310426712036, + "rewards/margins": 0.8918546438217163, + "rewards/rejected": -3.086164951324463, + "sft_loss": 2.226395606994629, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 3.0636279711589687, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": -0.10850661993026733, + "logits/rejected": 0.06401662528514862, + "logps/chosen": -2.139122724533081, + "logps/rejected": -2.977569103240967, + "loss": 0.6881, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.139122724533081, + "rewards/margins": 0.8384467363357544, + "rewards/rejected": -2.977569103240967, + "sft_loss": 2.096174478530884, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 5.109255566645895, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": 0.06034206226468086, + "logits/rejected": 0.14231623709201813, + "logps/chosen": -2.2905266284942627, + "logps/rejected": -3.098337173461914, + "loss": 0.704, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2905266284942627, + "rewards/margins": 0.8078103065490723, + "rewards/rejected": -3.098337173461914, + "sft_loss": 2.2123026847839355, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 2.453292045111366, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": -0.16326487064361572, + "logits/rejected": 0.026889164000749588, + "logps/chosen": -2.2471578121185303, + "logps/rejected": -3.129439353942871, + "loss": 0.6912, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2471578121185303, + "rewards/margins": 0.8822811245918274, + "rewards/rejected": -3.129439353942871, + "sft_loss": 2.154139757156372, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 5.551184128221047, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": -0.11775940656661987, + "logits/rejected": -0.075945183634758, + "logps/chosen": -2.2515721321105957, + "logps/rejected": -2.957139015197754, + "loss": 0.6936, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2515721321105957, + "rewards/margins": 0.7055668830871582, + "rewards/rejected": -2.957139015197754, + "sft_loss": 2.2185423374176025, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 4.7354448751058325, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": -0.07824783027172089, + "logits/rejected": 0.07530321180820465, + "logps/chosen": -2.460838556289673, + "logps/rejected": -3.3306336402893066, + "loss": 0.6956, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.460838556289673, + "rewards/margins": 0.8697946667671204, + "rewards/rejected": -3.3306336402893066, + "sft_loss": 2.426344394683838, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 3.179067217939218, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": -0.12580683827400208, + "logits/rejected": 0.03872048109769821, + "logps/chosen": -2.021254777908325, + "logps/rejected": -3.144026279449463, + "loss": 0.6864, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.021254777908325, + "rewards/margins": 1.1227715015411377, + "rewards/rejected": -3.144026279449463, + "sft_loss": 2.0137553215026855, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 8.507807005568804, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": -0.07757692039012909, + "logits/rejected": 0.06929562985897064, + "logps/chosen": -2.216972827911377, + "logps/rejected": -3.2000606060028076, + "loss": 0.6837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.216972827911377, + "rewards/margins": 0.9830881357192993, + "rewards/rejected": -3.2000606060028076, + "sft_loss": 2.1666877269744873, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 4.622987862680945, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": -0.12145348638296127, + "logits/rejected": 0.05669483542442322, + "logps/chosen": -2.1411659717559814, + "logps/rejected": -3.0339109897613525, + "loss": 0.6954, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1411659717559814, + "rewards/margins": 0.8927448391914368, + "rewards/rejected": -3.0339109897613525, + "sft_loss": 2.104557752609253, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 2.1985941281016737, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": -0.12451644241809845, + "logits/rejected": 0.11840645968914032, + "logps/chosen": -2.269446849822998, + "logps/rejected": -3.09425687789917, + "loss": 0.6923, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.269446849822998, + "rewards/margins": 0.824809730052948, + "rewards/rejected": -3.09425687789917, + "sft_loss": 2.1708335876464844, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 5.604133301731, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": -0.13375402987003326, + "logits/rejected": 0.01883949711918831, + "logps/chosen": -2.252318859100342, + "logps/rejected": -3.217984437942505, + "loss": 0.6836, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.252318859100342, + "rewards/margins": 0.9656656384468079, + "rewards/rejected": -3.217984437942505, + "sft_loss": 2.30187726020813, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 4.047489800275802, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": -0.0439617820084095, + "logits/rejected": 0.08844329416751862, + "logps/chosen": -2.4110324382781982, + "logps/rejected": -3.316443920135498, + "loss": 0.6971, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.4110324382781982, + "rewards/margins": 0.905411422252655, + "rewards/rejected": -3.316443920135498, + "sft_loss": 2.425555467605591, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 2.617352281279491, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": -0.15715794265270233, + "logits/rejected": -0.09440397471189499, + "logps/chosen": -2.112018585205078, + "logps/rejected": -3.1406939029693604, + "loss": 0.6917, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.112018585205078, + "rewards/margins": 1.0286751985549927, + "rewards/rejected": -3.1406939029693604, + "sft_loss": 2.0670018196105957, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 3.4875857107376875, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": -0.1380632072687149, + "logits/rejected": 0.014524638652801514, + "logps/chosen": -2.0882363319396973, + "logps/rejected": -3.4028327465057373, + "loss": 0.6773, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0882363319396973, + "rewards/margins": 1.314596176147461, + "rewards/rejected": -3.4028327465057373, + "sft_loss": 2.0673441886901855, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 4.008539983512021, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": -0.11774700880050659, + "logits/rejected": 0.0142317283898592, + "logps/chosen": -2.068218946456909, + "logps/rejected": -3.1259474754333496, + "loss": 0.6774, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.068218946456909, + "rewards/margins": 1.0577284097671509, + "rewards/rejected": -3.1259474754333496, + "sft_loss": 2.041489362716675, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 3.2923612802327025, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": -0.16679182648658752, + "logits/rejected": -0.05302317813038826, + "logps/chosen": -2.1182868480682373, + "logps/rejected": -3.375195264816284, + "loss": 0.698, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1182868480682373, + "rewards/margins": 1.2569081783294678, + "rewards/rejected": -3.375195264816284, + "sft_loss": 2.1025192737579346, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 3.9802507216073133, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": -0.15935611724853516, + "logits/rejected": 0.06508435308933258, + "logps/chosen": -2.0967845916748047, + "logps/rejected": -3.107313632965088, + "loss": 0.6773, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0967845916748047, + "rewards/margins": 1.0105292797088623, + "rewards/rejected": -3.107313632965088, + "sft_loss": 2.0953187942504883, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 4.551116828204739, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": -0.18554912507534027, + "logits/rejected": 0.003855127142742276, + "logps/chosen": -2.1789658069610596, + "logps/rejected": -3.0571799278259277, + "loss": 0.685, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1789658069610596, + "rewards/margins": 0.8782145380973816, + "rewards/rejected": -3.0571799278259277, + "sft_loss": 2.1960291862487793, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 3.8012645362562836, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": -0.11601553112268448, + "logits/rejected": 0.09878197312355042, + "logps/chosen": -2.219163179397583, + "logps/rejected": -3.2052559852600098, + "loss": 0.6935, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.219163179397583, + "rewards/margins": 0.9860928654670715, + "rewards/rejected": -3.2052559852600098, + "sft_loss": 2.1843903064727783, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 3.2793922911834215, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": -0.0153552470728755, + "logits/rejected": 0.11092513799667358, + "logps/chosen": -2.2136993408203125, + "logps/rejected": -3.1592071056365967, + "loss": 0.6989, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2136993408203125, + "rewards/margins": 0.9455081820487976, + "rewards/rejected": -3.1592071056365967, + "sft_loss": 2.1067147254943848, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 2.3731437209575814, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": -0.14444135129451752, + "logits/rejected": -0.005835582502186298, + "logps/chosen": -1.9672229290008545, + "logps/rejected": -2.7969489097595215, + "loss": 0.6861, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9672229290008545, + "rewards/margins": 0.8297258615493774, + "rewards/rejected": -2.7969489097595215, + "sft_loss": 1.9027982950210571, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 3.2927142582400855, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": -0.18615327775478363, + "logits/rejected": -0.06812240928411484, + "logps/chosen": -2.227973699569702, + "logps/rejected": -3.227752685546875, + "loss": 0.6889, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.227973699569702, + "rewards/margins": 0.9997787475585938, + "rewards/rejected": -3.227752685546875, + "sft_loss": 2.143904447555542, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 3.9052609817318227, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": -0.186161071062088, + "logits/rejected": -0.10436218976974487, + "logps/chosen": -2.1326229572296143, + "logps/rejected": -3.1862387657165527, + "loss": 0.673, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1326229572296143, + "rewards/margins": 1.0536160469055176, + "rewards/rejected": -3.1862387657165527, + "sft_loss": 2.1006031036376953, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 3.9607780631187084, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": -0.047240983694791794, + "logits/rejected": -0.005119943525642157, + "logps/chosen": -2.162609815597534, + "logps/rejected": -3.0216145515441895, + "loss": 0.6888, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.162609815597534, + "rewards/margins": 0.8590046763420105, + "rewards/rejected": -3.0216145515441895, + "sft_loss": 2.1431949138641357, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 8.937980779204771, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": -0.1427825391292572, + "logits/rejected": 0.022900383919477463, + "logps/chosen": -2.215710163116455, + "logps/rejected": -3.103562116622925, + "loss": 0.7099, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.215710163116455, + "rewards/margins": 0.8878521919250488, + "rewards/rejected": -3.103562116622925, + "sft_loss": 2.205852746963501, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 3.5475971310646393, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": -0.09783361852169037, + "logits/rejected": 0.010669758543372154, + "logps/chosen": -1.9912612438201904, + "logps/rejected": -2.944227695465088, + "loss": 0.6821, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9912612438201904, + "rewards/margins": 0.9529666900634766, + "rewards/rejected": -2.944227695465088, + "sft_loss": 2.081857681274414, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 4.171379109244644, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": -0.19658830761909485, + "logits/rejected": -0.015646522864699364, + "logps/chosen": -2.268094301223755, + "logps/rejected": -3.0871119499206543, + "loss": 0.6827, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.268094301223755, + "rewards/margins": 0.8190175294876099, + "rewards/rejected": -3.0871119499206543, + "sft_loss": 2.2411656379699707, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 3.6592301505173337, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": -0.10057705640792847, + "logits/rejected": 0.07258979976177216, + "logps/chosen": -2.230692148208618, + "logps/rejected": -3.1516542434692383, + "loss": 0.6814, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.230692148208618, + "rewards/margins": 0.9209620356559753, + "rewards/rejected": -3.1516542434692383, + "sft_loss": 2.102454900741577, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 13.970667089680054, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": -0.1707434505224228, + "logits/rejected": -0.012985003180801868, + "logps/chosen": -2.1211893558502197, + "logps/rejected": -3.187798500061035, + "loss": 0.6924, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1211893558502197, + "rewards/margins": 1.0666093826293945, + "rewards/rejected": -3.187798500061035, + "sft_loss": 2.0635311603546143, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 3.938830189364275, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": -0.16227789223194122, + "logits/rejected": -0.06405164301395416, + "logps/chosen": -1.9964349269866943, + "logps/rejected": -2.9089252948760986, + "loss": 0.6764, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9964349269866943, + "rewards/margins": 0.9124904870986938, + "rewards/rejected": -2.9089252948760986, + "sft_loss": 2.015360116958618, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 3.3631850849437015, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": -0.16847261786460876, + "logits/rejected": 0.046769242733716965, + "logps/chosen": -2.1035525798797607, + "logps/rejected": -3.0158162117004395, + "loss": 0.7048, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1035525798797607, + "rewards/margins": 0.9122636914253235, + "rewards/rejected": -3.0158162117004395, + "sft_loss": 2.0737602710723877, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 3.04163797970243, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": -0.20176240801811218, + "logits/rejected": -0.026735246181488037, + "logps/chosen": -2.0932791233062744, + "logps/rejected": -3.039994955062866, + "loss": 0.6867, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0932791233062744, + "rewards/margins": 0.9467160105705261, + "rewards/rejected": -3.039994955062866, + "sft_loss": 2.1171672344207764, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 4.348264134637991, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": -0.1465270072221756, + "logits/rejected": 0.055618755519390106, + "logps/chosen": -2.162325382232666, + "logps/rejected": -3.1199283599853516, + "loss": 0.6885, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.162325382232666, + "rewards/margins": 0.957602858543396, + "rewards/rejected": -3.1199283599853516, + "sft_loss": 2.1590447425842285, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 2.7569866499321365, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": -0.1441269963979721, + "logits/rejected": -0.05600646883249283, + "logps/chosen": -2.1244869232177734, + "logps/rejected": -2.8845534324645996, + "loss": 0.6818, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1244869232177734, + "rewards/margins": 0.760066568851471, + "rewards/rejected": -2.8845534324645996, + "sft_loss": 2.1846280097961426, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 4.013388373110339, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": -0.2562624216079712, + "logits/rejected": -0.10145537555217743, + "logps/chosen": -1.9844735860824585, + "logps/rejected": -2.9652676582336426, + "loss": 0.6852, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.9844735860824585, + "rewards/margins": 0.9807940721511841, + "rewards/rejected": -2.9652676582336426, + "sft_loss": 2.021907329559326, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 4.317218275979374, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": -0.21493549644947052, + "logits/rejected": 0.025484537705779076, + "logps/chosen": -2.120227336883545, + "logps/rejected": -2.944889545440674, + "loss": 0.6887, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.120227336883545, + "rewards/margins": 0.8246625661849976, + "rewards/rejected": -2.944889545440674, + "sft_loss": 2.1267037391662598, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 3.918133212884738, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": -0.30855846405029297, + "logits/rejected": -0.0393378920853138, + "logps/chosen": -2.0020647048950195, + "logps/rejected": -2.9337689876556396, + "loss": 0.6859, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0020647048950195, + "rewards/margins": 0.9317046403884888, + "rewards/rejected": -2.9337689876556396, + "sft_loss": 1.9564940929412842, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 4.687343497306164, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": -0.12303660809993744, + "logits/rejected": -0.012298181653022766, + "logps/chosen": -2.117335557937622, + "logps/rejected": -2.9036712646484375, + "loss": 0.693, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.117335557937622, + "rewards/margins": 0.7863359451293945, + "rewards/rejected": -2.9036712646484375, + "sft_loss": 1.9888889789581299, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 2.6110618995856543, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": -0.23396828770637512, + "logits/rejected": -0.10798802226781845, + "logps/chosen": -2.1209850311279297, + "logps/rejected": -2.962287187576294, + "loss": 0.7036, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1209850311279297, + "rewards/margins": 0.8413020372390747, + "rewards/rejected": -2.962287187576294, + "sft_loss": 2.107417345046997, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 3.61417843766057, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": -0.17204193770885468, + "logits/rejected": -0.059635408222675323, + "logps/chosen": -2.118635416030884, + "logps/rejected": -2.945610761642456, + "loss": 0.692, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.118635416030884, + "rewards/margins": 0.8269752264022827, + "rewards/rejected": -2.945610761642456, + "sft_loss": 2.101919412612915, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 3.291048566978385, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": -0.12395143508911133, + "logits/rejected": -0.029307598248124123, + "logps/chosen": -2.0599448680877686, + "logps/rejected": -3.3115997314453125, + "loss": 0.6826, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0599448680877686, + "rewards/margins": 1.2516547441482544, + "rewards/rejected": -3.3115997314453125, + "sft_loss": 2.1145453453063965, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 3.3791336666800333, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": -0.1779370754957199, + "logits/rejected": 0.0018944144248962402, + "logps/chosen": -2.1011738777160645, + "logps/rejected": -2.851292133331299, + "loss": 0.6844, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1011738777160645, + "rewards/margins": 0.7501183748245239, + "rewards/rejected": -2.851292133331299, + "sft_loss": 1.9948400259017944, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 5.853961944038886, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": -0.08530310541391373, + "logits/rejected": 0.05743807554244995, + "logps/chosen": -2.105227470397949, + "logps/rejected": -3.1187050342559814, + "loss": 0.6814, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.105227470397949, + "rewards/margins": 1.0134776830673218, + "rewards/rejected": -3.1187050342559814, + "sft_loss": 2.0023372173309326, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 5.6431451193262046, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": -0.13398191332817078, + "logits/rejected": -0.0687723383307457, + "logps/chosen": -2.034705638885498, + "logps/rejected": -3.189368486404419, + "loss": 0.6815, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.034705638885498, + "rewards/margins": 1.1546627283096313, + "rewards/rejected": -3.189368486404419, + "sft_loss": 2.022989273071289, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 2.9075238752199537, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": -0.051149677485227585, + "logits/rejected": 0.013508930802345276, + "logps/chosen": -2.290102481842041, + "logps/rejected": -3.2551655769348145, + "loss": 0.6793, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.290102481842041, + "rewards/margins": 0.9650629758834839, + "rewards/rejected": -3.2551655769348145, + "sft_loss": 2.3164639472961426, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 2.749266267504163, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": -0.05810176208615303, + "logits/rejected": 0.024312706664204597, + "logps/chosen": -2.317183017730713, + "logps/rejected": -3.0569870471954346, + "loss": 0.6871, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.317183017730713, + "rewards/margins": 0.7398041486740112, + "rewards/rejected": -3.0569870471954346, + "sft_loss": 2.2860019207000732, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 5.114260269298789, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": -0.21410679817199707, + "logits/rejected": 0.03596454858779907, + "logps/chosen": -2.273670196533203, + "logps/rejected": -3.280043840408325, + "loss": 0.6887, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.273670196533203, + "rewards/margins": 1.0063737630844116, + "rewards/rejected": -3.280043840408325, + "sft_loss": 2.1528635025024414, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 5.059702714512347, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": -0.13041231036186218, + "logits/rejected": 0.07254637032747269, + "logps/chosen": -2.240907669067383, + "logps/rejected": -3.2653861045837402, + "loss": 0.6837, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.240907669067383, + "rewards/margins": 1.0244789123535156, + "rewards/rejected": -3.2653861045837402, + "sft_loss": 2.2703652381896973, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": 0.19961495697498322, + "eval_logits/rejected": 0.3035580515861511, + "eval_logps/chosen": -2.2075111865997314, + "eval_logps/rejected": -3.2094106674194336, + "eval_loss": 0.6897660493850708, + "eval_rewards/accuracies": 0.6965875625610352, + "eval_rewards/chosen": -2.2075111865997314, + "eval_rewards/margins": 1.0018997192382812, + "eval_rewards/rejected": -3.2094106674194336, + "eval_runtime": 64.7993, + "eval_samples_per_second": 20.756, + "eval_sft_loss": 2.142155885696411, + "eval_steps_per_second": 5.201, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 5.684340495868352, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": -0.15685749053955078, + "logits/rejected": -0.12646745145320892, + "logps/chosen": -2.1684701442718506, + "logps/rejected": -2.9581832885742188, + "loss": 0.6845, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1684701442718506, + "rewards/margins": 0.7897127866744995, + "rewards/rejected": -2.9581832885742188, + "sft_loss": 2.157496213912964, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 3.709795179506504, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": -0.15225179493427277, + "logits/rejected": 0.003287592204287648, + "logps/chosen": -2.1358022689819336, + "logps/rejected": -3.234461545944214, + "loss": 0.6841, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1358022689819336, + "rewards/margins": 1.0986593961715698, + "rewards/rejected": -3.234461545944214, + "sft_loss": 2.13959002494812, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 2.028879346077501, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": -0.06895577162504196, + "logits/rejected": 0.06460914760828018, + "logps/chosen": -2.071643829345703, + "logps/rejected": -2.862349033355713, + "loss": 0.6955, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.071643829345703, + "rewards/margins": 0.7907050848007202, + "rewards/rejected": -2.862349033355713, + "sft_loss": 1.998337984085083, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 3.7319291071719776, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": -0.15587207674980164, + "logits/rejected": -0.04816683381795883, + "logps/chosen": -2.1375787258148193, + "logps/rejected": -3.0189738273620605, + "loss": 0.688, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1375787258148193, + "rewards/margins": 0.8813952207565308, + "rewards/rejected": -3.0189738273620605, + "sft_loss": 2.1859066486358643, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 4.087648916223257, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": -0.05187777429819107, + "logits/rejected": -0.058933060616254807, + "logps/chosen": -2.1178596019744873, + "logps/rejected": -2.950225353240967, + "loss": 0.6814, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1178596019744873, + "rewards/margins": 0.8323656916618347, + "rewards/rejected": -2.950225353240967, + "sft_loss": 2.124027729034424, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 3.6139847946934567, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": -0.1192985326051712, + "logits/rejected": -0.07342572510242462, + "logps/chosen": -2.1341209411621094, + "logps/rejected": -3.061249017715454, + "loss": 0.6863, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1341209411621094, + "rewards/margins": 0.9271281361579895, + "rewards/rejected": -3.061249017715454, + "sft_loss": 2.2415740489959717, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 4.746970107460784, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": -0.19757425785064697, + "logits/rejected": -0.09125449508428574, + "logps/chosen": -2.0948214530944824, + "logps/rejected": -3.0264580249786377, + "loss": 0.6848, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0948214530944824, + "rewards/margins": 0.9316363334655762, + "rewards/rejected": -3.0264580249786377, + "sft_loss": 2.099250078201294, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 4.281285407499179, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": -0.16194570064544678, + "logits/rejected": -0.05726348236203194, + "logps/chosen": -2.2279863357543945, + "logps/rejected": -2.8229076862335205, + "loss": 0.6964, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.2279863357543945, + "rewards/margins": 0.594921350479126, + "rewards/rejected": -2.8229076862335205, + "sft_loss": 2.206334114074707, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 5.525223650970297, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": -0.1582116037607193, + "logits/rejected": -0.0008633792167529464, + "logps/chosen": -2.1605565547943115, + "logps/rejected": -2.8597676753997803, + "loss": 0.6914, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1605565547943115, + "rewards/margins": 0.6992112398147583, + "rewards/rejected": -2.8597676753997803, + "sft_loss": 2.1679415702819824, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 6.5294785588226345, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": -0.21969719231128693, + "logits/rejected": -0.11124607175588608, + "logps/chosen": -2.1147501468658447, + "logps/rejected": -3.0070154666900635, + "loss": 0.6848, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1147501468658447, + "rewards/margins": 0.8922654986381531, + "rewards/rejected": -3.0070154666900635, + "sft_loss": 2.1740405559539795, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 6.510859827780001, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": -0.14866802096366882, + "logits/rejected": -0.011895375326275826, + "logps/chosen": -2.295565366744995, + "logps/rejected": -3.2828688621520996, + "loss": 0.6939, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.295565366744995, + "rewards/margins": 0.9873035550117493, + "rewards/rejected": -3.2828688621520996, + "sft_loss": 2.1841020584106445, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 5.159193596048617, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": -0.17693455517292023, + "logits/rejected": -0.02780548296868801, + "logps/chosen": -2.0335822105407715, + "logps/rejected": -2.9066224098205566, + "loss": 0.6871, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.0335822105407715, + "rewards/margins": 0.8730396032333374, + "rewards/rejected": -2.9066224098205566, + "sft_loss": 2.043599843978882, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 7.022194499754583, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": -0.16167587041854858, + "logits/rejected": -0.033538367599248886, + "logps/chosen": -2.103761672973633, + "logps/rejected": -3.2121028900146484, + "loss": 0.6846, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.103761672973633, + "rewards/margins": 1.1083409786224365, + "rewards/rejected": -3.2121028900146484, + "sft_loss": 2.0176119804382324, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 2.7854409335237453, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": -0.18212896585464478, + "logits/rejected": -0.04557342082262039, + "logps/chosen": -2.0921874046325684, + "logps/rejected": -3.319629669189453, + "loss": 0.671, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0921874046325684, + "rewards/margins": 1.2274422645568848, + "rewards/rejected": -3.319629669189453, + "sft_loss": 2.175380229949951, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 7.211839596635454, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": -0.18180282413959503, + "logits/rejected": -0.031157314777374268, + "logps/chosen": -2.2092483043670654, + "logps/rejected": -3.370548725128174, + "loss": 0.6885, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2092483043670654, + "rewards/margins": 1.1612999439239502, + "rewards/rejected": -3.370548725128174, + "sft_loss": 2.266460418701172, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 5.69334372057066, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": -0.18586108088493347, + "logits/rejected": -0.03296568989753723, + "logps/chosen": -1.992720365524292, + "logps/rejected": -3.099524974822998, + "loss": 0.6764, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.992720365524292, + "rewards/margins": 1.1068050861358643, + "rewards/rejected": -3.099524974822998, + "sft_loss": 2.023709297180176, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 7.694932668337779, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": -0.18205882608890533, + "logits/rejected": -0.08366916328668594, + "logps/chosen": -2.1995675563812256, + "logps/rejected": -3.248079776763916, + "loss": 0.673, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1995675563812256, + "rewards/margins": 1.0485122203826904, + "rewards/rejected": -3.248079776763916, + "sft_loss": 2.2780404090881348, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 3.373708554633476, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": -0.14092735946178436, + "logits/rejected": 0.03805799037218094, + "logps/chosen": -2.1809241771698, + "logps/rejected": -3.150590181350708, + "loss": 0.6913, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1809241771698, + "rewards/margins": 0.9696657061576843, + "rewards/rejected": -3.150590181350708, + "sft_loss": 2.0995025634765625, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 3.9354177004042206, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": -0.1317126303911209, + "logits/rejected": -0.03440633416175842, + "logps/chosen": -2.1461985111236572, + "logps/rejected": -3.2417449951171875, + "loss": 0.6794, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1461985111236572, + "rewards/margins": 1.0955464839935303, + "rewards/rejected": -3.2417449951171875, + "sft_loss": 2.2087178230285645, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 3.3353589299966653, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": -0.11530756950378418, + "logits/rejected": 0.01841713674366474, + "logps/chosen": -2.188054323196411, + "logps/rejected": -3.0514755249023438, + "loss": 0.6847, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.188054323196411, + "rewards/margins": 0.8634212613105774, + "rewards/rejected": -3.0514755249023438, + "sft_loss": 2.1960723400115967, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 5.582960140846795, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": -0.1665990799665451, + "logits/rejected": -0.07521601766347885, + "logps/chosen": -2.076925039291382, + "logps/rejected": -3.199763536453247, + "loss": 0.6767, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.076925039291382, + "rewards/margins": 1.1228384971618652, + "rewards/rejected": -3.199763536453247, + "sft_loss": 2.0471179485321045, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 3.3886986437836453, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": -0.024591270834207535, + "logits/rejected": 0.020447324961423874, + "logps/chosen": -2.1345932483673096, + "logps/rejected": -3.0656065940856934, + "loss": 0.6925, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1345932483673096, + "rewards/margins": 0.9310134053230286, + "rewards/rejected": -3.0656065940856934, + "sft_loss": 2.1308083534240723, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 3.4491897887450347, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": -0.10499121248722076, + "logits/rejected": -0.03471050411462784, + "logps/chosen": -2.078094482421875, + "logps/rejected": -3.24504017829895, + "loss": 0.6731, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.078094482421875, + "rewards/margins": 1.1669456958770752, + "rewards/rejected": -3.24504017829895, + "sft_loss": 2.100672960281372, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 4.077241980257639, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": -0.17399199306964874, + "logits/rejected": -0.06918822228908539, + "logps/chosen": -2.1585144996643066, + "logps/rejected": -3.043644428253174, + "loss": 0.6944, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1585144996643066, + "rewards/margins": 0.8851300477981567, + "rewards/rejected": -3.043644428253174, + "sft_loss": 2.036146879196167, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 3.0355512441322587, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": -0.231519415974617, + "logits/rejected": -0.12187705188989639, + "logps/chosen": -2.1352591514587402, + "logps/rejected": -3.182976007461548, + "loss": 0.6935, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1352591514587402, + "rewards/margins": 1.0477169752120972, + "rewards/rejected": -3.182976007461548, + "sft_loss": 2.189821720123291, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 3.9927289170007514, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": -0.16165432333946228, + "logits/rejected": -0.0339629128575325, + "logps/chosen": -2.1408815383911133, + "logps/rejected": -3.0898704528808594, + "loss": 0.6771, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.1408815383911133, + "rewards/margins": 0.9489887952804565, + "rewards/rejected": -3.0898704528808594, + "sft_loss": 2.140064239501953, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 3.471098137156809, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": -0.142087921500206, + "logits/rejected": -0.038915760815143585, + "logps/chosen": -2.316383123397827, + "logps/rejected": -3.0058162212371826, + "loss": 0.7038, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.316383123397827, + "rewards/margins": 0.6894328594207764, + "rewards/rejected": -3.0058162212371826, + "sft_loss": 2.2604458332061768, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 2.6153338498303493, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": -0.08909028023481369, + "logits/rejected": -0.00841111596673727, + "logps/chosen": -2.352787494659424, + "logps/rejected": -3.154552936553955, + "loss": 0.6918, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.352787494659424, + "rewards/margins": 0.8017654418945312, + "rewards/rejected": -3.154552936553955, + "sft_loss": 2.252534866333008, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 2.975653477681364, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": -0.18091581761837006, + "logits/rejected": -0.08059791475534439, + "logps/chosen": -2.198948383331299, + "logps/rejected": -3.2791762351989746, + "loss": 0.6726, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.198948383331299, + "rewards/margins": 1.0802279710769653, + "rewards/rejected": -3.2791762351989746, + "sft_loss": 2.107489824295044, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 3.7792912907628646, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": -0.21311037242412567, + "logits/rejected": -0.11145709455013275, + "logps/chosen": -2.069913387298584, + "logps/rejected": -3.192847728729248, + "loss": 0.6641, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.069913387298584, + "rewards/margins": 1.122934103012085, + "rewards/rejected": -3.192847728729248, + "sft_loss": 2.018747329711914, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 4.791967935712868, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": -0.14493045210838318, + "logits/rejected": 0.007240000180900097, + "logps/chosen": -1.9749542474746704, + "logps/rejected": -3.348956346511841, + "loss": 0.6626, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9749542474746704, + "rewards/margins": 1.3740019798278809, + "rewards/rejected": -3.348956346511841, + "sft_loss": 2.058170795440674, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 2.782908308940945, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": -0.23584513366222382, + "logits/rejected": -0.1729080229997635, + "logps/chosen": -2.0240962505340576, + "logps/rejected": -3.370525360107422, + "loss": 0.6659, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0240962505340576, + "rewards/margins": 1.3464288711547852, + "rewards/rejected": -3.370525360107422, + "sft_loss": 2.124987840652466, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 4.361969659880761, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": -0.1731630265712738, + "logits/rejected": 0.024024654179811478, + "logps/chosen": -2.1874585151672363, + "logps/rejected": -3.3264641761779785, + "loss": 0.6743, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1874585151672363, + "rewards/margins": 1.1390055418014526, + "rewards/rejected": -3.3264641761779785, + "sft_loss": 2.13765287399292, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 5.451218204119892, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": -0.2533346712589264, + "logits/rejected": -0.09153694659471512, + "logps/chosen": -1.9931917190551758, + "logps/rejected": -3.301081895828247, + "loss": 0.651, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9931917190551758, + "rewards/margins": 1.3078901767730713, + "rewards/rejected": -3.301081895828247, + "sft_loss": 2.0416078567504883, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 5.230901570538194, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": -0.16264775395393372, + "logits/rejected": -0.11411754041910172, + "logps/chosen": -2.0707695484161377, + "logps/rejected": -3.1301207542419434, + "loss": 0.6779, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0707695484161377, + "rewards/margins": 1.0593516826629639, + "rewards/rejected": -3.1301207542419434, + "sft_loss": 2.0640697479248047, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 5.682295649586755, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": -0.13695378601551056, + "logits/rejected": -0.08521612733602524, + "logps/chosen": -2.0727250576019287, + "logps/rejected": -3.0686042308807373, + "loss": 0.6753, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0727250576019287, + "rewards/margins": 0.9958791732788086, + "rewards/rejected": -3.0686042308807373, + "sft_loss": 2.040381908416748, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 5.444971493009763, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": -0.20039144158363342, + "logits/rejected": -0.03169285133481026, + "logps/chosen": -2.0249133110046387, + "logps/rejected": -3.1004390716552734, + "loss": 0.666, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0249133110046387, + "rewards/margins": 1.0755256414413452, + "rewards/rejected": -3.1004390716552734, + "sft_loss": 1.98683762550354, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 3.2688059376125125, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": -0.23091144859790802, + "logits/rejected": -0.05502986162900925, + "logps/chosen": -1.965471625328064, + "logps/rejected": -3.2271828651428223, + "loss": 0.6567, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.965471625328064, + "rewards/margins": 1.2617111206054688, + "rewards/rejected": -3.2271828651428223, + "sft_loss": 2.0017428398132324, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 3.3719898656196956, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": -0.10328583419322968, + "logits/rejected": -0.06386563926935196, + "logps/chosen": -2.1545088291168213, + "logps/rejected": -2.9280734062194824, + "loss": 0.681, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1545088291168213, + "rewards/margins": 0.7735646963119507, + "rewards/rejected": -2.9280734062194824, + "sft_loss": 2.226701498031616, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 6.9516884717445, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": -0.13225576281547546, + "logits/rejected": 0.003281575394794345, + "logps/chosen": -2.168238639831543, + "logps/rejected": -3.1461145877838135, + "loss": 0.6668, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.168238639831543, + "rewards/margins": 0.9778760671615601, + "rewards/rejected": -3.1461145877838135, + "sft_loss": 2.202183246612549, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 9.636546332290473, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": -0.10065107047557831, + "logits/rejected": 0.04348466545343399, + "logps/chosen": -2.024111032485962, + "logps/rejected": -2.97385835647583, + "loss": 0.6713, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.024111032485962, + "rewards/margins": 0.9497473835945129, + "rewards/rejected": -2.97385835647583, + "sft_loss": 2.0276641845703125, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 3.6337683737206623, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": -0.025926511734724045, + "logits/rejected": 0.07824753224849701, + "logps/chosen": -2.019010066986084, + "logps/rejected": -3.3775219917297363, + "loss": 0.6536, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.019010066986084, + "rewards/margins": 1.3585115671157837, + "rewards/rejected": -3.3775219917297363, + "sft_loss": 1.9829899072647095, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 3.4543113118103754, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": -0.221477672457695, + "logits/rejected": -0.0890105590224266, + "logps/chosen": -2.1813979148864746, + "logps/rejected": -3.1783928871154785, + "loss": 0.6884, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1813979148864746, + "rewards/margins": 0.9969952702522278, + "rewards/rejected": -3.1783928871154785, + "sft_loss": 2.1206278800964355, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 2.225127207591837, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": -0.2266666144132614, + "logits/rejected": 0.025529295206069946, + "logps/chosen": -2.2012128829956055, + "logps/rejected": -3.4361069202423096, + "loss": 0.6803, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.2012128829956055, + "rewards/margins": 1.2348941564559937, + "rewards/rejected": -3.4361069202423096, + "sft_loss": 2.183763265609741, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 5.048394544503118, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": -0.15154269337654114, + "logits/rejected": -0.07468608021736145, + "logps/chosen": -2.091416120529175, + "logps/rejected": -3.096597671508789, + "loss": 0.672, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.091416120529175, + "rewards/margins": 1.0051820278167725, + "rewards/rejected": -3.096597671508789, + "sft_loss": 2.18349552154541, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 4.868925838328225, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": -0.23196709156036377, + "logits/rejected": -0.06467144191265106, + "logps/chosen": -2.288769483566284, + "logps/rejected": -3.4297847747802734, + "loss": 0.6714, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.288769483566284, + "rewards/margins": 1.1410152912139893, + "rewards/rejected": -3.4297847747802734, + "sft_loss": 2.199639081954956, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 3.7753382928988053, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": -0.15316972136497498, + "logits/rejected": 0.026164349168539047, + "logps/chosen": -2.1056418418884277, + "logps/rejected": -3.047377347946167, + "loss": 0.6807, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1056418418884277, + "rewards/margins": 0.941735565662384, + "rewards/rejected": -3.047377347946167, + "sft_loss": 2.1292455196380615, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 5.316549877681543, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": -0.10193123668432236, + "logits/rejected": 0.05660524219274521, + "logps/chosen": -2.0485825538635254, + "logps/rejected": -3.189666748046875, + "loss": 0.6608, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0485825538635254, + "rewards/margins": 1.1410835981369019, + "rewards/rejected": -3.189666748046875, + "sft_loss": 2.0899040699005127, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 4.1783885206300715, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": -0.25660255551338196, + "logits/rejected": -0.02843310870230198, + "logps/chosen": -2.236860513687134, + "logps/rejected": -3.3940634727478027, + "loss": 0.6791, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.236860513687134, + "rewards/margins": 1.1572033166885376, + "rewards/rejected": -3.3940634727478027, + "sft_loss": 2.1988136768341064, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 11.66407269024, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": -0.1540280282497406, + "logits/rejected": 0.03630285710096359, + "logps/chosen": -2.119597911834717, + "logps/rejected": -2.970017910003662, + "loss": 0.6864, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.119597911834717, + "rewards/margins": 0.8504198789596558, + "rewards/rejected": -2.970017910003662, + "sft_loss": 2.150630474090576, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 5.514295095964714, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": -0.12381670624017715, + "logits/rejected": 0.08592347055673599, + "logps/chosen": -2.0796501636505127, + "logps/rejected": -3.2541632652282715, + "loss": 0.6813, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.0796501636505127, + "rewards/margins": 1.1745132207870483, + "rewards/rejected": -3.2541632652282715, + "sft_loss": 2.063962697982788, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 3.2622503609392655, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": -0.1520196497440338, + "logits/rejected": 0.003301681485027075, + "logps/chosen": -2.108696460723877, + "logps/rejected": -3.3694000244140625, + "loss": 0.6808, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.108696460723877, + "rewards/margins": 1.2607039213180542, + "rewards/rejected": -3.3694000244140625, + "sft_loss": 2.0896289348602295, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 5.528824956645526, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": -0.23660194873809814, + "logits/rejected": -0.08224891126155853, + "logps/chosen": -2.1784911155700684, + "logps/rejected": -3.1277122497558594, + "loss": 0.6737, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1784911155700684, + "rewards/margins": 0.9492212533950806, + "rewards/rejected": -3.1277122497558594, + "sft_loss": 2.115309953689575, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 4.300814713228508, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": -0.24351458251476288, + "logits/rejected": -0.14606614410877228, + "logps/chosen": -1.8937175273895264, + "logps/rejected": -3.051975727081299, + "loss": 0.6562, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8937175273895264, + "rewards/margins": 1.1582581996917725, + "rewards/rejected": -3.051975727081299, + "sft_loss": 1.9756580591201782, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 4.5057064274808845, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": -0.18646974861621857, + "logits/rejected": -0.06458055973052979, + "logps/chosen": -2.1127471923828125, + "logps/rejected": -3.1759142875671387, + "loss": 0.6719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1127471923828125, + "rewards/margins": 1.0631673336029053, + "rewards/rejected": -3.1759142875671387, + "sft_loss": 2.1598129272460938, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 3.146074222965387, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": -0.19290082156658173, + "logits/rejected": -0.03169599175453186, + "logps/chosen": -1.9636176824569702, + "logps/rejected": -3.1199774742126465, + "loss": 0.6533, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.9636176824569702, + "rewards/margins": 1.1563596725463867, + "rewards/rejected": -3.1199774742126465, + "sft_loss": 1.9573945999145508, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 3.8083854291835872, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": -0.1969573199748993, + "logits/rejected": -0.06371529400348663, + "logps/chosen": -1.9405794143676758, + "logps/rejected": -3.257289409637451, + "loss": 0.6626, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9405794143676758, + "rewards/margins": 1.316710114479065, + "rewards/rejected": -3.257289409637451, + "sft_loss": 1.9770195484161377, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 9.200591844421464, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": -0.14464333653450012, + "logits/rejected": -0.02460755966603756, + "logps/chosen": -2.0023107528686523, + "logps/rejected": -3.093677043914795, + "loss": 0.6801, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.0023107528686523, + "rewards/margins": 1.0913660526275635, + "rewards/rejected": -3.093677043914795, + "sft_loss": 2.0361132621765137, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 13.13456351467358, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": -0.21079924702644348, + "logits/rejected": 0.017463264986872673, + "logps/chosen": -2.0185232162475586, + "logps/rejected": -2.9785799980163574, + "loss": 0.6773, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0185232162475586, + "rewards/margins": 0.9600569605827332, + "rewards/rejected": -2.9785799980163574, + "sft_loss": 1.9870984554290771, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 5.554726366552014, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": -0.06970573216676712, + "logits/rejected": -0.04162493348121643, + "logps/chosen": -1.9545644521713257, + "logps/rejected": -2.937133550643921, + "loss": 0.663, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9545644521713257, + "rewards/margins": 0.9825690984725952, + "rewards/rejected": -2.937133550643921, + "sft_loss": 2.006101608276367, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 3.8188131833797123, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": -0.13708598911762238, + "logits/rejected": -0.02338588610291481, + "logps/chosen": -2.116072654724121, + "logps/rejected": -2.9913673400878906, + "loss": 0.6832, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.116072654724121, + "rewards/margins": 0.8752948641777039, + "rewards/rejected": -2.9913673400878906, + "sft_loss": 2.0157470703125, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 3.9159541413812673, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": -0.14079007506370544, + "logits/rejected": -0.014591905288398266, + "logps/chosen": -1.8882776498794556, + "logps/rejected": -3.012269973754883, + "loss": 0.6634, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.8882776498794556, + "rewards/margins": 1.1239923238754272, + "rewards/rejected": -3.012269973754883, + "sft_loss": 1.9577144384384155, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 2.949276861968296, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": -0.19639217853546143, + "logits/rejected": -0.03421352431178093, + "logps/chosen": -2.181985855102539, + "logps/rejected": -3.15867280960083, + "loss": 0.6872, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.181985855102539, + "rewards/margins": 0.976686954498291, + "rewards/rejected": -3.15867280960083, + "sft_loss": 2.2240633964538574, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 2.795612284396159, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": -0.20567235350608826, + "logits/rejected": -0.016475200653076172, + "logps/chosen": -2.110926389694214, + "logps/rejected": -3.4056496620178223, + "loss": 0.6727, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.110926389694214, + "rewards/margins": 1.2947235107421875, + "rewards/rejected": -3.4056496620178223, + "sft_loss": 2.080010175704956, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 5.9208482659550254, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": -0.1726199984550476, + "logits/rejected": -0.077762171626091, + "logps/chosen": -1.995126485824585, + "logps/rejected": -3.295370578765869, + "loss": 0.647, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.995126485824585, + "rewards/margins": 1.3002442121505737, + "rewards/rejected": -3.295370578765869, + "sft_loss": 1.9892040491104126, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 3.6624647106926527, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": -0.26526567339897156, + "logits/rejected": -0.10189725458621979, + "logps/chosen": -2.0908007621765137, + "logps/rejected": -3.1439366340637207, + "loss": 0.6734, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0908007621765137, + "rewards/margins": 1.0531362295150757, + "rewards/rejected": -3.1439366340637207, + "sft_loss": 2.0779733657836914, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 4.246459738842489, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": -0.1405525803565979, + "logits/rejected": -0.05778896063566208, + "logps/chosen": -2.1469337940216064, + "logps/rejected": -3.331935405731201, + "loss": 0.6666, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1469337940216064, + "rewards/margins": 1.1850017309188843, + "rewards/rejected": -3.331935405731201, + "sft_loss": 2.1456828117370605, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 4.555051792056545, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": -0.18042424321174622, + "logits/rejected": -0.11148138344287872, + "logps/chosen": -2.0728728771209717, + "logps/rejected": -3.2742888927459717, + "loss": 0.6619, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0728728771209717, + "rewards/margins": 1.201416015625, + "rewards/rejected": -3.2742888927459717, + "sft_loss": 1.9689483642578125, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 4.193908772515439, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": -0.24373853206634521, + "logits/rejected": -0.06800062954425812, + "logps/chosen": -2.1934638023376465, + "logps/rejected": -3.0918126106262207, + "loss": 0.6948, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1934638023376465, + "rewards/margins": 0.8983484506607056, + "rewards/rejected": -3.0918126106262207, + "sft_loss": 2.211327075958252, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 5.861821260358606, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": -0.10776664316654205, + "logits/rejected": 0.05332046002149582, + "logps/chosen": -2.101513385772705, + "logps/rejected": -3.1856369972229004, + "loss": 0.6712, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.101513385772705, + "rewards/margins": 1.0841234922409058, + "rewards/rejected": -3.1856369972229004, + "sft_loss": 2.065294027328491, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 3.6790955676732113, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": -0.2536991238594055, + "logits/rejected": -0.16523298621177673, + "logps/chosen": -2.126446485519409, + "logps/rejected": -3.386033296585083, + "loss": 0.6738, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.126446485519409, + "rewards/margins": 1.2595865726470947, + "rewards/rejected": -3.386033296585083, + "sft_loss": 2.1513524055480957, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 5.781208465642567, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": -0.1769980937242508, + "logits/rejected": -0.07903625816106796, + "logps/chosen": -2.05415940284729, + "logps/rejected": -3.569599151611328, + "loss": 0.6588, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.05415940284729, + "rewards/margins": 1.5154398679733276, + "rewards/rejected": -3.569599151611328, + "sft_loss": 2.114835262298584, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 4.243018773470032, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": -0.1651601493358612, + "logits/rejected": -0.03835087642073631, + "logps/chosen": -2.100426435470581, + "logps/rejected": -3.3876376152038574, + "loss": 0.6671, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.100426435470581, + "rewards/margins": 1.2872109413146973, + "rewards/rejected": -3.3876376152038574, + "sft_loss": 2.1940059661865234, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 3.40614489973044, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": -0.12180168926715851, + "logits/rejected": -0.016960904002189636, + "logps/chosen": -2.2647228240966797, + "logps/rejected": -3.349940538406372, + "loss": 0.6672, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.2647228240966797, + "rewards/margins": 1.0852177143096924, + "rewards/rejected": -3.349940538406372, + "sft_loss": 2.2234692573547363, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 5.217256447022118, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": -0.11785624921321869, + "logits/rejected": -0.009331837296485901, + "logps/chosen": -1.9256277084350586, + "logps/rejected": -2.9111785888671875, + "loss": 0.6615, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9256277084350586, + "rewards/margins": 0.9855508804321289, + "rewards/rejected": -2.9111785888671875, + "sft_loss": 1.9216800928115845, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 5.574292090869096, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": -0.15140271186828613, + "logits/rejected": -0.06172577291727066, + "logps/chosen": -2.0302653312683105, + "logps/rejected": -3.386890411376953, + "loss": 0.6488, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0302653312683105, + "rewards/margins": 1.3566254377365112, + "rewards/rejected": -3.386890411376953, + "sft_loss": 2.1113438606262207, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 3.8311914809355456, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": -0.2237207591533661, + "logits/rejected": -0.09334935247898102, + "logps/chosen": -2.256551742553711, + "logps/rejected": -3.352893352508545, + "loss": 0.6682, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.256551742553711, + "rewards/margins": 1.0963413715362549, + "rewards/rejected": -3.352893352508545, + "sft_loss": 2.2960710525512695, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 3.673517695626827, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": -0.2837563455104828, + "logits/rejected": -0.11372779309749603, + "logps/chosen": -1.8886947631835938, + "logps/rejected": -3.2296433448791504, + "loss": 0.6524, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.8886947631835938, + "rewards/margins": 1.3409483432769775, + "rewards/rejected": -3.2296433448791504, + "sft_loss": 1.920108437538147, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 3.627622935982085, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": -0.2605969309806824, + "logits/rejected": 0.02325456589460373, + "logps/chosen": -2.215508460998535, + "logps/rejected": -3.2646713256835938, + "loss": 0.676, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.215508460998535, + "rewards/margins": 1.0491631031036377, + "rewards/rejected": -3.2646713256835938, + "sft_loss": 2.1782376766204834, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 5.5650806107361825, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": -0.16451916098594666, + "logits/rejected": -0.05491523817181587, + "logps/chosen": -1.9280630350112915, + "logps/rejected": -3.411128282546997, + "loss": 0.6446, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9280630350112915, + "rewards/margins": 1.4830653667449951, + "rewards/rejected": -3.411128282546997, + "sft_loss": 1.9921998977661133, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": 0.12216214835643768, + "eval_logits/rejected": 0.22047200798988342, + "eval_logps/chosen": -2.1866872310638428, + "eval_logps/rejected": -3.2140347957611084, + "eval_loss": 0.6902045607566833, + "eval_rewards/accuracies": 0.6854599118232727, + "eval_rewards/chosen": -2.1866872310638428, + "eval_rewards/margins": 1.0273479223251343, + "eval_rewards/rejected": -3.2140347957611084, + "eval_runtime": 45.1516, + "eval_samples_per_second": 29.789, + "eval_sft_loss": 2.16142201423645, + "eval_steps_per_second": 7.464, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 2.3070059749730167, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": -0.16050508618354797, + "logits/rejected": 0.05246647074818611, + "logps/chosen": -2.288259983062744, + "logps/rejected": -3.2600135803222656, + "loss": 0.6917, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.288259983062744, + "rewards/margins": 0.9717534184455872, + "rewards/rejected": -3.2600135803222656, + "sft_loss": 2.1639692783355713, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 4.274171833193289, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": -0.21428146958351135, + "logits/rejected": -0.1049589067697525, + "logps/chosen": -2.0670275688171387, + "logps/rejected": -3.170724868774414, + "loss": 0.6748, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0670275688171387, + "rewards/margins": 1.103697419166565, + "rewards/rejected": -3.170724868774414, + "sft_loss": 2.0204029083251953, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 5.09358664314769, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": -0.2076890915632248, + "logits/rejected": 0.05643507093191147, + "logps/chosen": -2.120525360107422, + "logps/rejected": -3.1633400917053223, + "loss": 0.6826, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.120525360107422, + "rewards/margins": 1.0428144931793213, + "rewards/rejected": -3.1633400917053223, + "sft_loss": 2.194258451461792, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 4.215634181369833, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": -0.27006030082702637, + "logits/rejected": -0.10320593416690826, + "logps/chosen": -2.1332249641418457, + "logps/rejected": -3.0910048484802246, + "loss": 0.6764, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1332249641418457, + "rewards/margins": 0.9577800035476685, + "rewards/rejected": -3.0910048484802246, + "sft_loss": 2.1724190711975098, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 3.5195337375890996, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": -0.1685423105955124, + "logits/rejected": -0.056601088494062424, + "logps/chosen": -2.1004130840301514, + "logps/rejected": -3.147625684738159, + "loss": 0.6908, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1004130840301514, + "rewards/margins": 1.0472123622894287, + "rewards/rejected": -3.147625684738159, + "sft_loss": 2.126330614089966, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 5.293014240915332, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": -0.2239580899477005, + "logits/rejected": -0.13606879115104675, + "logps/chosen": -2.025062084197998, + "logps/rejected": -3.244068145751953, + "loss": 0.6582, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.025062084197998, + "rewards/margins": 1.2190057039260864, + "rewards/rejected": -3.244068145751953, + "sft_loss": 2.1253952980041504, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 10.634182633611681, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": -0.19350138306617737, + "logits/rejected": -0.1688581109046936, + "logps/chosen": -1.9272750616073608, + "logps/rejected": -2.822693347930908, + "loss": 0.6764, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9272750616073608, + "rewards/margins": 0.8954181671142578, + "rewards/rejected": -2.822693347930908, + "sft_loss": 1.9559904336929321, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 6.431058173776684, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": -0.16342909634113312, + "logits/rejected": -0.13806693255901337, + "logps/chosen": -1.9453058242797852, + "logps/rejected": -3.038445234298706, + "loss": 0.6754, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9453058242797852, + "rewards/margins": 1.0931396484375, + "rewards/rejected": -3.038445234298706, + "sft_loss": 1.92788565158844, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 4.244287447305024, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": -0.1588595062494278, + "logits/rejected": -0.008251415565609932, + "logps/chosen": -2.0741567611694336, + "logps/rejected": -2.979722738265991, + "loss": 0.6788, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.0741567611694336, + "rewards/margins": 0.9055658578872681, + "rewards/rejected": -2.979722738265991, + "sft_loss": 2.0933797359466553, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 3.260292178088132, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": -0.23745575547218323, + "logits/rejected": -0.10504372417926788, + "logps/chosen": -2.184518337249756, + "logps/rejected": -2.9294536113739014, + "loss": 0.7078, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.184518337249756, + "rewards/margins": 0.7449353337287903, + "rewards/rejected": -2.9294536113739014, + "sft_loss": 2.254260540008545, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 4.8497247516539055, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": -0.13812163472175598, + "logits/rejected": -0.07096612453460693, + "logps/chosen": -2.1685566902160645, + "logps/rejected": -2.8709235191345215, + "loss": 0.6785, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.1685566902160645, + "rewards/margins": 0.7023668885231018, + "rewards/rejected": -2.8709235191345215, + "sft_loss": 2.1655735969543457, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 3.3485727001851484, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": -0.08121892064809799, + "logits/rejected": 0.061185263097286224, + "logps/chosen": -2.0909311771392822, + "logps/rejected": -3.0841729640960693, + "loss": 0.6681, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.0909311771392822, + "rewards/margins": 0.9932416677474976, + "rewards/rejected": -3.0841729640960693, + "sft_loss": 2.140881061553955, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 3.707981491158516, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": -0.23737430572509766, + "logits/rejected": -0.1496736705303192, + "logps/chosen": -2.0120530128479004, + "logps/rejected": -3.2726852893829346, + "loss": 0.6584, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0120530128479004, + "rewards/margins": 1.2606327533721924, + "rewards/rejected": -3.2726852893829346, + "sft_loss": 2.0266661643981934, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 6.024021365435233, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": -0.1443227231502533, + "logits/rejected": 0.07983455806970596, + "logps/chosen": -2.093372106552124, + "logps/rejected": -3.282695770263672, + "loss": 0.6714, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.093372106552124, + "rewards/margins": 1.1893236637115479, + "rewards/rejected": -3.282695770263672, + "sft_loss": 2.143284320831299, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 3.6485022760546504, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": -0.21213392913341522, + "logits/rejected": -0.04222031682729721, + "logps/chosen": -2.104381561279297, + "logps/rejected": -3.0034842491149902, + "loss": 0.6689, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.104381561279297, + "rewards/margins": 0.8991022109985352, + "rewards/rejected": -3.0034842491149902, + "sft_loss": 2.0348763465881348, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 2.5335317808819533, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": -0.2206738442182541, + "logits/rejected": -0.08761437982320786, + "logps/chosen": -2.207284927368164, + "logps/rejected": -3.4129977226257324, + "loss": 0.6775, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.207284927368164, + "rewards/margins": 1.2057130336761475, + "rewards/rejected": -3.4129977226257324, + "sft_loss": 2.2189645767211914, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 4.87628005158134, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": -0.15324094891548157, + "logits/rejected": -0.0661562904715538, + "logps/chosen": -1.9889189004898071, + "logps/rejected": -3.1456923484802246, + "loss": 0.6702, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9889189004898071, + "rewards/margins": 1.156773328781128, + "rewards/rejected": -3.1456923484802246, + "sft_loss": 2.0339088439941406, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 4.3138329473464285, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": -0.26100295782089233, + "logits/rejected": -0.08350460231304169, + "logps/chosen": -2.0644030570983887, + "logps/rejected": -3.0298380851745605, + "loss": 0.6784, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0644030570983887, + "rewards/margins": 0.9654353260993958, + "rewards/rejected": -3.0298380851745605, + "sft_loss": 2.0661580562591553, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 7.085430632636539, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": -0.2247125655412674, + "logits/rejected": -0.11863106489181519, + "logps/chosen": -2.1816582679748535, + "logps/rejected": -3.0513510704040527, + "loss": 0.6741, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1816582679748535, + "rewards/margins": 0.8696924448013306, + "rewards/rejected": -3.0513510704040527, + "sft_loss": 2.1615498065948486, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 6.182948105987573, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": -0.1700625717639923, + "logits/rejected": -0.017289992421865463, + "logps/chosen": -1.9123014211654663, + "logps/rejected": -3.273717164993286, + "loss": 0.6535, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9123014211654663, + "rewards/margins": 1.3614153861999512, + "rewards/rejected": -3.273717164993286, + "sft_loss": 2.0608344078063965, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 8.383302226866206, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": -0.15090781450271606, + "logits/rejected": 0.02349255420267582, + "logps/chosen": -2.2728543281555176, + "logps/rejected": -3.1122283935546875, + "loss": 0.6844, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.2728543281555176, + "rewards/margins": 0.8393740653991699, + "rewards/rejected": -3.1122283935546875, + "sft_loss": 2.1716437339782715, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 4.71009773090505, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": -0.14930608868598938, + "logits/rejected": 0.0531165674328804, + "logps/chosen": -2.2689640522003174, + "logps/rejected": -3.132922649383545, + "loss": 0.6823, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2689640522003174, + "rewards/margins": 0.8639589548110962, + "rewards/rejected": -3.132922649383545, + "sft_loss": 2.2543208599090576, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 4.544348221456484, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": -0.16687729954719543, + "logits/rejected": -0.08343629539012909, + "logps/chosen": -2.053715229034424, + "logps/rejected": -3.274094820022583, + "loss": 0.6721, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.053715229034424, + "rewards/margins": 1.2203797101974487, + "rewards/rejected": -3.274094820022583, + "sft_loss": 2.095189094543457, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 2.7263042944114275, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": -0.1594991385936737, + "logits/rejected": -0.05406603962182999, + "logps/chosen": -1.9413394927978516, + "logps/rejected": -3.1444528102874756, + "loss": 0.6586, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.9413394927978516, + "rewards/margins": 1.203113079071045, + "rewards/rejected": -3.1444528102874756, + "sft_loss": 1.9781215190887451, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 3.424507807498148, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": -0.21377746760845184, + "logits/rejected": -0.018960028886795044, + "logps/chosen": -2.046653985977173, + "logps/rejected": -3.401824474334717, + "loss": 0.6614, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.046653985977173, + "rewards/margins": 1.3551702499389648, + "rewards/rejected": -3.401824474334717, + "sft_loss": 2.1029536724090576, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 6.063552367358618, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": -0.13719500601291656, + "logits/rejected": -0.02102483995258808, + "logps/chosen": -2.0080060958862305, + "logps/rejected": -3.1949410438537598, + "loss": 0.6718, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0080060958862305, + "rewards/margins": 1.1869351863861084, + "rewards/rejected": -3.1949410438537598, + "sft_loss": 2.161597967147827, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 9.965307089691409, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": -0.10167713463306427, + "logits/rejected": -0.1033143624663353, + "logps/chosen": -2.1930813789367676, + "logps/rejected": -3.31758451461792, + "loss": 0.6724, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1930813789367676, + "rewards/margins": 1.1245031356811523, + "rewards/rejected": -3.31758451461792, + "sft_loss": 2.1610350608825684, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 2.4400753492629454, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": -0.296092689037323, + "logits/rejected": -0.20539577305316925, + "logps/chosen": -2.0341458320617676, + "logps/rejected": -3.2220683097839355, + "loss": 0.6643, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0341458320617676, + "rewards/margins": 1.187922716140747, + "rewards/rejected": -3.2220683097839355, + "sft_loss": 2.0283610820770264, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 4.024281354117429, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": -0.09105809032917023, + "logits/rejected": 0.06579498946666718, + "logps/chosen": -2.2511210441589355, + "logps/rejected": -3.344057559967041, + "loss": 0.6852, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2511210441589355, + "rewards/margins": 1.092936635017395, + "rewards/rejected": -3.344057559967041, + "sft_loss": 2.2170441150665283, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 4.525395256208599, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": -0.15142369270324707, + "logits/rejected": -0.036278653889894485, + "logps/chosen": -2.0554230213165283, + "logps/rejected": -3.2157158851623535, + "loss": 0.6603, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0554230213165283, + "rewards/margins": 1.1602928638458252, + "rewards/rejected": -3.2157158851623535, + "sft_loss": 2.0941882133483887, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 3.8622033531060147, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": -0.12516744434833527, + "logits/rejected": -0.09860964119434357, + "logps/chosen": -2.007272243499756, + "logps/rejected": -2.9931883811950684, + "loss": 0.6599, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.007272243499756, + "rewards/margins": 0.9859158396720886, + "rewards/rejected": -2.9931883811950684, + "sft_loss": 1.9754520654678345, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 3.5309672373675083, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": -0.15712139010429382, + "logits/rejected": -0.0017227933276444674, + "logps/chosen": -2.1674985885620117, + "logps/rejected": -3.099808931350708, + "loss": 0.6778, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1674985885620117, + "rewards/margins": 0.9323102831840515, + "rewards/rejected": -3.099808931350708, + "sft_loss": 2.1875224113464355, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 4.705264536623013, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": -0.11597056686878204, + "logits/rejected": 0.009610396809875965, + "logps/chosen": -2.099177122116089, + "logps/rejected": -3.3595035076141357, + "loss": 0.67, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.099177122116089, + "rewards/margins": 1.2603263854980469, + "rewards/rejected": -3.3595035076141357, + "sft_loss": 2.085693836212158, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 3.9246632050914343, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": -0.012894287705421448, + "logits/rejected": 0.04866151511669159, + "logps/chosen": -2.1491246223449707, + "logps/rejected": -3.175144672393799, + "loss": 0.6787, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1491246223449707, + "rewards/margins": 1.0260196924209595, + "rewards/rejected": -3.175144672393799, + "sft_loss": 2.2446813583374023, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 4.522581248647605, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": -0.12783598899841309, + "logits/rejected": -0.040727607905864716, + "logps/chosen": -2.1029319763183594, + "logps/rejected": -2.9529013633728027, + "loss": 0.6796, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1029319763183594, + "rewards/margins": 0.8499695062637329, + "rewards/rejected": -2.9529013633728027, + "sft_loss": 2.15362548828125, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 6.090100570578844, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": -0.16989929974079132, + "logits/rejected": 0.0463339164853096, + "logps/chosen": -2.0245769023895264, + "logps/rejected": -3.166522264480591, + "loss": 0.668, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0245769023895264, + "rewards/margins": 1.141945242881775, + "rewards/rejected": -3.166522264480591, + "sft_loss": 2.0397515296936035, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 5.252724677953759, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": -0.17356649041175842, + "logits/rejected": -0.1362331509590149, + "logps/chosen": -1.9837900400161743, + "logps/rejected": -3.133542776107788, + "loss": 0.66, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9837900400161743, + "rewards/margins": 1.1497529745101929, + "rewards/rejected": -3.133542776107788, + "sft_loss": 1.993292212486267, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 4.974097176371491, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": -0.08385307341814041, + "logits/rejected": -0.008767305873334408, + "logps/chosen": -2.0469765663146973, + "logps/rejected": -3.248431444168091, + "loss": 0.6696, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0469765663146973, + "rewards/margins": 1.2014553546905518, + "rewards/rejected": -3.248431444168091, + "sft_loss": 2.076141357421875, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 3.215829779280696, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": -0.2681628167629242, + "logits/rejected": -0.11363331228494644, + "logps/chosen": -2.019747257232666, + "logps/rejected": -2.9316470623016357, + "loss": 0.6757, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.019747257232666, + "rewards/margins": 0.9119003415107727, + "rewards/rejected": -2.9316470623016357, + "sft_loss": 2.0684947967529297, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 5.504838028510426, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": -0.017854904755949974, + "logits/rejected": -0.030091866850852966, + "logps/chosen": -2.1185128688812256, + "logps/rejected": -3.105051040649414, + "loss": 0.6683, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1185128688812256, + "rewards/margins": 0.9865385890007019, + "rewards/rejected": -3.105051040649414, + "sft_loss": 2.1398489475250244, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 4.497631500600767, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": -0.19890213012695312, + "logits/rejected": -0.10730002075433731, + "logps/chosen": -1.9607737064361572, + "logps/rejected": -2.923576831817627, + "loss": 0.6748, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9607737064361572, + "rewards/margins": 0.962803065776825, + "rewards/rejected": -2.923576831817627, + "sft_loss": 1.987640619277954, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 5.105379180206469, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": -0.17512214183807373, + "logits/rejected": -0.05550699308514595, + "logps/chosen": -2.1245641708374023, + "logps/rejected": -3.594120502471924, + "loss": 0.6786, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1245641708374023, + "rewards/margins": 1.4695560932159424, + "rewards/rejected": -3.594120502471924, + "sft_loss": 2.149122714996338, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 3.83029784211302, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": -0.16118036210536957, + "logits/rejected": -0.005187329836189747, + "logps/chosen": -2.12148118019104, + "logps/rejected": -3.23693585395813, + "loss": 0.666, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.12148118019104, + "rewards/margins": 1.1154545545578003, + "rewards/rejected": -3.23693585395813, + "sft_loss": 2.113344430923462, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 6.599358418640735, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": -0.18872864544391632, + "logits/rejected": 0.03380126133561134, + "logps/chosen": -1.9155935049057007, + "logps/rejected": -3.2058091163635254, + "loss": 0.6609, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.9155935049057007, + "rewards/margins": 1.2902159690856934, + "rewards/rejected": -3.2058091163635254, + "sft_loss": 1.9777272939682007, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 16.060417700477547, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": -0.20039887726306915, + "logits/rejected": 0.0246284157037735, + "logps/chosen": -1.9851608276367188, + "logps/rejected": -3.1726815700531006, + "loss": 0.6812, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9851608276367188, + "rewards/margins": 1.1875207424163818, + "rewards/rejected": -3.1726815700531006, + "sft_loss": 2.0441513061523438, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 11.398195564231516, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": -0.16940069198608398, + "logits/rejected": -0.08222613483667374, + "logps/chosen": -2.179595470428467, + "logps/rejected": -3.1289970874786377, + "loss": 0.6783, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.179595470428467, + "rewards/margins": 0.9494016766548157, + "rewards/rejected": -3.1289970874786377, + "sft_loss": 2.0225982666015625, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 2.8613846338117335, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": -0.13604898750782013, + "logits/rejected": -0.008158263750374317, + "logps/chosen": -2.219383716583252, + "logps/rejected": -3.0880210399627686, + "loss": 0.6858, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.219383716583252, + "rewards/margins": 0.8686376810073853, + "rewards/rejected": -3.0880210399627686, + "sft_loss": 2.2598366737365723, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 5.814034674569944, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": -0.1635938584804535, + "logits/rejected": -0.054556943476200104, + "logps/chosen": -1.9896701574325562, + "logps/rejected": -3.0219662189483643, + "loss": 0.6849, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.9896701574325562, + "rewards/margins": 1.0322957038879395, + "rewards/rejected": -3.0219662189483643, + "sft_loss": 2.025726318359375, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 4.641368804567021, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": -0.1995202600955963, + "logits/rejected": -0.05748230963945389, + "logps/chosen": -2.0775208473205566, + "logps/rejected": -3.147552967071533, + "loss": 0.672, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0775208473205566, + "rewards/margins": 1.0700318813323975, + "rewards/rejected": -3.147552967071533, + "sft_loss": 2.0811524391174316, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 2.18664544571687, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": -0.18338724970817566, + "logits/rejected": -0.01710915006697178, + "logps/chosen": -2.08484148979187, + "logps/rejected": -3.206913709640503, + "loss": 0.6653, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.08484148979187, + "rewards/margins": 1.1220722198486328, + "rewards/rejected": -3.206913709640503, + "sft_loss": 2.0524466037750244, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 4.943999541276936, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": -0.21756012737751007, + "logits/rejected": 0.05011097341775894, + "logps/chosen": -2.2237229347229004, + "logps/rejected": -3.426621198654175, + "loss": 0.6713, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2237229347229004, + "rewards/margins": 1.2028987407684326, + "rewards/rejected": -3.426621198654175, + "sft_loss": 2.1406548023223877, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 5.408164318031505, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": -0.1468680053949356, + "logits/rejected": -0.02393932268023491, + "logps/chosen": -2.07031512260437, + "logps/rejected": -3.1913414001464844, + "loss": 0.6687, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.07031512260437, + "rewards/margins": 1.1210265159606934, + "rewards/rejected": -3.1913414001464844, + "sft_loss": 2.0023207664489746, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 2.629647851286111, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": -0.26578372716903687, + "logits/rejected": -0.10726435482501984, + "logps/chosen": -2.0501198768615723, + "logps/rejected": -3.2494723796844482, + "loss": 0.6659, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.0501198768615723, + "rewards/margins": 1.199352502822876, + "rewards/rejected": -3.2494723796844482, + "sft_loss": 2.0937790870666504, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 4.192809371025622, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": -0.1099371537566185, + "logits/rejected": 0.11603529751300812, + "logps/chosen": -2.0957791805267334, + "logps/rejected": -3.2949626445770264, + "loss": 0.679, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0957791805267334, + "rewards/margins": 1.1991835832595825, + "rewards/rejected": -3.2949626445770264, + "sft_loss": 2.0239791870117188, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 3.903537454392507, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": -0.16982577741146088, + "logits/rejected": -0.06521300226449966, + "logps/chosen": -2.1850740909576416, + "logps/rejected": -3.1623456478118896, + "loss": 0.6874, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1850740909576416, + "rewards/margins": 0.977271556854248, + "rewards/rejected": -3.1623456478118896, + "sft_loss": 2.226618528366089, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 3.437569990163379, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": -0.19504033029079437, + "logits/rejected": -0.09787000715732574, + "logps/chosen": -1.9441182613372803, + "logps/rejected": -3.1635780334472656, + "loss": 0.6554, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9441182613372803, + "rewards/margins": 1.2194595336914062, + "rewards/rejected": -3.1635780334472656, + "sft_loss": 2.0067336559295654, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 5.147925937687013, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": -0.11027028411626816, + "logits/rejected": -0.09639479219913483, + "logps/chosen": -2.033560276031494, + "logps/rejected": -3.3017265796661377, + "loss": 0.6618, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.033560276031494, + "rewards/margins": 1.2681666612625122, + "rewards/rejected": -3.3017265796661377, + "sft_loss": 2.10772442817688, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 4.305073371928673, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": -0.22954753041267395, + "logits/rejected": -0.06323157250881195, + "logps/chosen": -2.1516566276550293, + "logps/rejected": -3.090576171875, + "loss": 0.6806, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1516566276550293, + "rewards/margins": 0.9389199018478394, + "rewards/rejected": -3.090576171875, + "sft_loss": 2.0620648860931396, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 3.278783746807796, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": -0.20301339030265808, + "logits/rejected": -0.11684314906597137, + "logps/chosen": -2.1644604206085205, + "logps/rejected": -3.050349712371826, + "loss": 0.687, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1644604206085205, + "rewards/margins": 0.8858893513679504, + "rewards/rejected": -3.050349712371826, + "sft_loss": 2.1849312782287598, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 4.657115472273005, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": -0.14204177260398865, + "logits/rejected": -0.010100598447024822, + "logps/chosen": -2.1115527153015137, + "logps/rejected": -3.1037027835845947, + "loss": 0.6725, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1115527153015137, + "rewards/margins": 0.992149829864502, + "rewards/rejected": -3.1037027835845947, + "sft_loss": 1.9344184398651123, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 5.544476019095504, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": -0.2005896270275116, + "logits/rejected": -0.07895728200674057, + "logps/chosen": -2.069153070449829, + "logps/rejected": -3.3531486988067627, + "loss": 0.6483, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.069153070449829, + "rewards/margins": 1.2839953899383545, + "rewards/rejected": -3.3531486988067627, + "sft_loss": 2.10805606842041, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 3.3931901015368893, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": -0.173683762550354, + "logits/rejected": 0.01994958147406578, + "logps/chosen": -2.0747077465057373, + "logps/rejected": -3.4305903911590576, + "loss": 0.6662, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0747077465057373, + "rewards/margins": 1.3558826446533203, + "rewards/rejected": -3.4305903911590576, + "sft_loss": 2.0860283374786377, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 4.20577772629158, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": -0.19935302436351776, + "logits/rejected": -0.022657129913568497, + "logps/chosen": -2.242023468017578, + "logps/rejected": -2.900801658630371, + "loss": 0.6863, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.242023468017578, + "rewards/margins": 0.6587780714035034, + "rewards/rejected": -2.900801658630371, + "sft_loss": 2.1713993549346924, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 5.825064489064668, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": -0.1561446338891983, + "logits/rejected": -0.0014340400230139494, + "logps/chosen": -2.1472461223602295, + "logps/rejected": -3.130239963531494, + "loss": 0.6756, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1472461223602295, + "rewards/margins": 0.9829939007759094, + "rewards/rejected": -3.130239963531494, + "sft_loss": 2.094031810760498, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 7.255559691191783, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": -0.1678655445575714, + "logits/rejected": -0.01243885699659586, + "logps/chosen": -2.26615571975708, + "logps/rejected": -3.3633294105529785, + "loss": 0.6772, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.26615571975708, + "rewards/margins": 1.0971730947494507, + "rewards/rejected": -3.3633294105529785, + "sft_loss": 2.252499580383301, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 3.1621430151557117, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": -0.1939752995967865, + "logits/rejected": -0.04376544803380966, + "logps/chosen": -2.2893729209899902, + "logps/rejected": -3.1768245697021484, + "loss": 0.6798, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2893729209899902, + "rewards/margins": 0.8874520063400269, + "rewards/rejected": -3.1768245697021484, + "sft_loss": 2.222339391708374, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 5.010713776256952, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": -0.1629539281129837, + "logits/rejected": -0.07679332047700882, + "logps/chosen": -2.2452406883239746, + "logps/rejected": -3.230379581451416, + "loss": 0.6962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2452406883239746, + "rewards/margins": 0.9851387739181519, + "rewards/rejected": -3.230379581451416, + "sft_loss": 2.1471447944641113, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 5.8548545448021745, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": -0.18137940764427185, + "logits/rejected": 0.02409942075610161, + "logps/chosen": -2.000059127807617, + "logps/rejected": -3.428389072418213, + "loss": 0.6619, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.000059127807617, + "rewards/margins": 1.4283303022384644, + "rewards/rejected": -3.428389072418213, + "sft_loss": 2.0621116161346436, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 4.338810629298849, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": -0.2061314880847931, + "logits/rejected": -0.1214604377746582, + "logps/chosen": -2.3152942657470703, + "logps/rejected": -3.1802566051483154, + "loss": 0.6856, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3152942657470703, + "rewards/margins": 0.8649622201919556, + "rewards/rejected": -3.1802566051483154, + "sft_loss": 2.219158172607422, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 2.656680842948589, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": -0.16902683675289154, + "logits/rejected": -0.0923638790845871, + "logps/chosen": -2.280181407928467, + "logps/rejected": -3.3735251426696777, + "loss": 0.6743, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.280181407928467, + "rewards/margins": 1.0933433771133423, + "rewards/rejected": -3.3735251426696777, + "sft_loss": 2.243540048599243, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 3.460901158364958, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": -0.26293832063674927, + "logits/rejected": -0.12125550210475922, + "logps/chosen": -2.193232774734497, + "logps/rejected": -3.2326667308807373, + "loss": 0.6733, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.193232774734497, + "rewards/margins": 1.0394338369369507, + "rewards/rejected": -3.2326667308807373, + "sft_loss": 2.1737167835235596, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 3.3471424841073096, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": -0.22366316616535187, + "logits/rejected": -0.0031981945503503084, + "logps/chosen": -2.270132064819336, + "logps/rejected": -3.141998052597046, + "loss": 0.6843, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.270132064819336, + "rewards/margins": 0.8718658685684204, + "rewards/rejected": -3.141998052597046, + "sft_loss": 2.204784870147705, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 4.734326537984999, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": -0.06445908546447754, + "logits/rejected": -0.037993717938661575, + "logps/chosen": -2.031144618988037, + "logps/rejected": -3.1980462074279785, + "loss": 0.6646, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.031144618988037, + "rewards/margins": 1.1669015884399414, + "rewards/rejected": -3.1980462074279785, + "sft_loss": 2.078840494155884, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 3.7313523331518557, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": -0.15151730179786682, + "logits/rejected": -0.022474870085716248, + "logps/chosen": -1.991729497909546, + "logps/rejected": -3.368088483810425, + "loss": 0.6591, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.991729497909546, + "rewards/margins": 1.3763587474822998, + "rewards/rejected": -3.368088483810425, + "sft_loss": 2.025282144546509, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 7.025836233543962, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": -0.25764456391334534, + "logits/rejected": -0.055336516350507736, + "logps/chosen": -2.141427993774414, + "logps/rejected": -3.1318140029907227, + "loss": 0.6718, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.141427993774414, + "rewards/margins": 0.9903861880302429, + "rewards/rejected": -3.1318140029907227, + "sft_loss": 2.2568488121032715, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 5.938234903067426, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": -0.21786122024059296, + "logits/rejected": -0.018373187631368637, + "logps/chosen": -2.062985420227051, + "logps/rejected": -3.075976848602295, + "loss": 0.6773, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.062985420227051, + "rewards/margins": 1.0129914283752441, + "rewards/rejected": -3.075976848602295, + "sft_loss": 2.0868048667907715, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 5.219983872580531, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": -0.24924568831920624, + "logits/rejected": -0.11489690840244293, + "logps/chosen": -2.0819058418273926, + "logps/rejected": -3.0514657497406006, + "loss": 0.6719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0819058418273926, + "rewards/margins": 0.9695600271224976, + "rewards/rejected": -3.0514657497406006, + "sft_loss": 2.061774253845215, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 3.94212703161178, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": -0.21287953853607178, + "logits/rejected": -0.06039626523852348, + "logps/chosen": -1.9492467641830444, + "logps/rejected": -3.088829517364502, + "loss": 0.6674, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9492467641830444, + "rewards/margins": 1.1395829916000366, + "rewards/rejected": -3.088829517364502, + "sft_loss": 1.9745715856552124, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 3.346019533502611, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": -0.16004632413387299, + "logits/rejected": -0.011574303731322289, + "logps/chosen": -2.0461299419403076, + "logps/rejected": -3.428122043609619, + "loss": 0.6685, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0461299419403076, + "rewards/margins": 1.3819921016693115, + "rewards/rejected": -3.428122043609619, + "sft_loss": 2.0484619140625, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 2.957888286685369, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": -0.22204062342643738, + "logits/rejected": -0.0745166540145874, + "logps/chosen": -2.1117758750915527, + "logps/rejected": -3.0526363849639893, + "loss": 0.6694, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1117758750915527, + "rewards/margins": 0.9408601522445679, + "rewards/rejected": -3.0526363849639893, + "sft_loss": 2.1839990615844727, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": 0.1483335644006729, + "eval_logits/rejected": 0.24738433957099915, + "eval_logps/chosen": -2.15895938873291, + "eval_logps/rejected": -3.1864676475524902, + "eval_loss": 0.6887222528457642, + "eval_rewards/accuracies": 0.6921365261077881, + "eval_rewards/chosen": -2.15895938873291, + "eval_rewards/margins": 1.02750825881958, + "eval_rewards/rejected": -3.1864676475524902, + "eval_runtime": 44.3003, + "eval_samples_per_second": 30.361, + "eval_sft_loss": 2.1145176887512207, + "eval_steps_per_second": 7.607, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 5.817831977279581, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": -0.25421690940856934, + "logits/rejected": -0.20666737854480743, + "logps/chosen": -2.0045931339263916, + "logps/rejected": -2.980300188064575, + "loss": 0.6699, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.0045931339263916, + "rewards/margins": 0.9757073521614075, + "rewards/rejected": -2.980300188064575, + "sft_loss": 2.0653605461120605, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 5.5107099677882445, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": -0.1670076549053192, + "logits/rejected": -0.07222741842269897, + "logps/chosen": -2.15533185005188, + "logps/rejected": -3.3656413555145264, + "loss": 0.6632, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.15533185005188, + "rewards/margins": 1.2103097438812256, + "rewards/rejected": -3.3656413555145264, + "sft_loss": 2.161409854888916, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 6.296428782704514, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": -0.17989398539066315, + "logits/rejected": 0.0022085390519350767, + "logps/chosen": -2.2136988639831543, + "logps/rejected": -3.173541307449341, + "loss": 0.6729, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2136988639831543, + "rewards/margins": 0.9598420858383179, + "rewards/rejected": -3.173541307449341, + "sft_loss": 2.1450047492980957, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 9.545366280350136, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": -0.18045374751091003, + "logits/rejected": -0.09958475083112717, + "logps/chosen": -2.098446846008301, + "logps/rejected": -2.973470449447632, + "loss": 0.6849, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.098446846008301, + "rewards/margins": 0.8750236630439758, + "rewards/rejected": -2.973470449447632, + "sft_loss": 2.033970355987549, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 5.860959261583987, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": -0.18542693555355072, + "logits/rejected": -0.09705062210559845, + "logps/chosen": -1.8457778692245483, + "logps/rejected": -2.8968265056610107, + "loss": 0.6507, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.8457778692245483, + "rewards/margins": 1.0510485172271729, + "rewards/rejected": -2.8968265056610107, + "sft_loss": 1.942373514175415, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 3.2555514045628327, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": -0.22500939667224884, + "logits/rejected": -0.05397395044565201, + "logps/chosen": -2.0629329681396484, + "logps/rejected": -3.292663097381592, + "loss": 0.6686, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0629329681396484, + "rewards/margins": 1.229730248451233, + "rewards/rejected": -3.292663097381592, + "sft_loss": 2.043607473373413, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 5.048163709079541, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": -0.2142733782529831, + "logits/rejected": -0.016649195924401283, + "logps/chosen": -2.1254544258117676, + "logps/rejected": -3.3514981269836426, + "loss": 0.6675, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1254544258117676, + "rewards/margins": 1.226043462753296, + "rewards/rejected": -3.3514981269836426, + "sft_loss": 2.1830906867980957, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 6.011265373821511, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": -0.28187423944473267, + "logits/rejected": -0.04165396839380264, + "logps/chosen": -2.0677714347839355, + "logps/rejected": -3.1529834270477295, + "loss": 0.6743, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0677714347839355, + "rewards/margins": 1.085211992263794, + "rewards/rejected": -3.1529834270477295, + "sft_loss": 2.1271486282348633, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 4.059665837066231, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": -0.23464807868003845, + "logits/rejected": -0.03467894718050957, + "logps/chosen": -2.2200443744659424, + "logps/rejected": -3.086019992828369, + "loss": 0.7057, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2200443744659424, + "rewards/margins": 0.8659753799438477, + "rewards/rejected": -3.086019992828369, + "sft_loss": 2.2082483768463135, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 4.484062105655913, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": -0.1839439868927002, + "logits/rejected": -0.01578974723815918, + "logps/chosen": -2.3946101665496826, + "logps/rejected": -3.2734806537628174, + "loss": 0.6811, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.3946101665496826, + "rewards/margins": 0.8788701891899109, + "rewards/rejected": -3.2734806537628174, + "sft_loss": 2.288024663925171, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 2.701679577284141, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": -0.2606489658355713, + "logits/rejected": -0.14730049669742584, + "logps/chosen": -2.2450132369995117, + "logps/rejected": -3.4032225608825684, + "loss": 0.6725, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2450132369995117, + "rewards/margins": 1.1582093238830566, + "rewards/rejected": -3.4032225608825684, + "sft_loss": 2.2701351642608643, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 3.5960375002298557, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": -0.21604426205158234, + "logits/rejected": 0.00216987868770957, + "logps/chosen": -2.0344831943511963, + "logps/rejected": -3.2819888591766357, + "loss": 0.6703, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.0344831943511963, + "rewards/margins": 1.2475056648254395, + "rewards/rejected": -3.2819888591766357, + "sft_loss": 2.083733558654785, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 2.383947641733567, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": -0.2589126229286194, + "logits/rejected": -0.046416062861680984, + "logps/chosen": -2.1839864253997803, + "logps/rejected": -3.235879421234131, + "loss": 0.6849, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1839864253997803, + "rewards/margins": 1.0518933534622192, + "rewards/rejected": -3.235879421234131, + "sft_loss": 2.14011287689209, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 4.902499961393214, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": -0.1358111947774887, + "logits/rejected": -0.049908898770809174, + "logps/chosen": -2.076606512069702, + "logps/rejected": -3.116420269012451, + "loss": 0.6687, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.076606512069702, + "rewards/margins": 1.0398141145706177, + "rewards/rejected": -3.116420269012451, + "sft_loss": 2.177868366241455, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 3.999033078306384, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": -0.22534795105457306, + "logits/rejected": -0.011211365461349487, + "logps/chosen": -2.2466542720794678, + "logps/rejected": -3.222562313079834, + "loss": 0.6767, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2466542720794678, + "rewards/margins": 0.975908100605011, + "rewards/rejected": -3.222562313079834, + "sft_loss": 2.2334485054016113, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 2.9046584204611103, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": -0.22426724433898926, + "logits/rejected": -0.12043038755655289, + "logps/chosen": -1.967124342918396, + "logps/rejected": -2.9338607788085938, + "loss": 0.6683, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.967124342918396, + "rewards/margins": 0.9667367935180664, + "rewards/rejected": -2.9338607788085938, + "sft_loss": 1.9687387943267822, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 16.514901421405394, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": -0.164666548371315, + "logits/rejected": -0.06656551361083984, + "logps/chosen": -2.102875232696533, + "logps/rejected": -3.20685076713562, + "loss": 0.6849, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.102875232696533, + "rewards/margins": 1.1039756536483765, + "rewards/rejected": -3.20685076713562, + "sft_loss": 2.0535407066345215, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 4.517911779314552, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": -0.21307143568992615, + "logits/rejected": -0.02924274280667305, + "logps/chosen": -2.1048378944396973, + "logps/rejected": -3.3505256175994873, + "loss": 0.6664, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1048378944396973, + "rewards/margins": 1.24568772315979, + "rewards/rejected": -3.3505256175994873, + "sft_loss": 2.151399850845337, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 7.700116019487075, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": -0.273918479681015, + "logits/rejected": -0.07913421839475632, + "logps/chosen": -2.106267213821411, + "logps/rejected": -3.02795672416687, + "loss": 0.6886, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.106267213821411, + "rewards/margins": 0.921689510345459, + "rewards/rejected": -3.02795672416687, + "sft_loss": 2.160156011581421, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 3.9073485908549697, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": -0.11443676799535751, + "logits/rejected": -0.10150516033172607, + "logps/chosen": -1.9929697513580322, + "logps/rejected": -3.0766334533691406, + "loss": 0.6686, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.9929697513580322, + "rewards/margins": 1.0836635828018188, + "rewards/rejected": -3.0766334533691406, + "sft_loss": 2.0109825134277344, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 4.730581186560652, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": -0.20358338952064514, + "logits/rejected": 0.0008606910705566406, + "logps/chosen": -2.1794867515563965, + "logps/rejected": -3.184910297393799, + "loss": 0.6796, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1794867515563965, + "rewards/margins": 1.0054237842559814, + "rewards/rejected": -3.184910297393799, + "sft_loss": 2.1558899879455566, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 4.984279237576151, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": -0.23577456176280975, + "logits/rejected": -0.08217061311006546, + "logps/chosen": -2.0777785778045654, + "logps/rejected": -3.0439350605010986, + "loss": 0.667, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.0777785778045654, + "rewards/margins": 0.9661566019058228, + "rewards/rejected": -3.0439350605010986, + "sft_loss": 2.0925450325012207, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 6.392868476110788, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": -0.19419506192207336, + "logits/rejected": -0.12320198863744736, + "logps/chosen": -1.963719367980957, + "logps/rejected": -3.0543346405029297, + "loss": 0.6625, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.963719367980957, + "rewards/margins": 1.090614676475525, + "rewards/rejected": -3.0543346405029297, + "sft_loss": 1.983407974243164, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 3.660525378634166, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": -0.16067619621753693, + "logits/rejected": -0.0643281489610672, + "logps/chosen": -2.078392505645752, + "logps/rejected": -3.2905898094177246, + "loss": 0.654, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.078392505645752, + "rewards/margins": 1.2121970653533936, + "rewards/rejected": -3.2905898094177246, + "sft_loss": 2.1481382846832275, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 4.053343742403941, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": -0.22848236560821533, + "logits/rejected": -0.02969544194638729, + "logps/chosen": -2.1285107135772705, + "logps/rejected": -3.1666715145111084, + "loss": 0.6682, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.1285107135772705, + "rewards/margins": 1.0381609201431274, + "rewards/rejected": -3.1666715145111084, + "sft_loss": 2.166686773300171, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 3.2692826243416078, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": -0.23465761542320251, + "logits/rejected": -0.16107268631458282, + "logps/chosen": -2.1159329414367676, + "logps/rejected": -2.995014190673828, + "loss": 0.6864, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1159329414367676, + "rewards/margins": 0.8790813684463501, + "rewards/rejected": -2.995014190673828, + "sft_loss": 2.008147954940796, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 4.143745884555678, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": -0.08680951595306396, + "logits/rejected": 0.007194558624178171, + "logps/chosen": -2.0280799865722656, + "logps/rejected": -3.1981682777404785, + "loss": 0.6716, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.0280799865722656, + "rewards/margins": 1.1700884103775024, + "rewards/rejected": -3.1981682777404785, + "sft_loss": 2.09446120262146, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 7.728816132361034, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": -0.16309547424316406, + "logits/rejected": -0.045490048825740814, + "logps/chosen": -2.087641716003418, + "logps/rejected": -3.1135027408599854, + "loss": 0.6735, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.087641716003418, + "rewards/margins": 1.0258610248565674, + "rewards/rejected": -3.1135027408599854, + "sft_loss": 2.124109983444214, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 5.368612799469, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": -0.13496272265911102, + "logits/rejected": -0.026529857888817787, + "logps/chosen": -2.0952439308166504, + "logps/rejected": -3.109184503555298, + "loss": 0.6837, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0952439308166504, + "rewards/margins": 1.013940453529358, + "rewards/rejected": -3.109184503555298, + "sft_loss": 2.1108901500701904, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 5.578544809423889, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": -0.240474134683609, + "logits/rejected": -0.11594425141811371, + "logps/chosen": -1.9697179794311523, + "logps/rejected": -3.124454975128174, + "loss": 0.6646, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9697179794311523, + "rewards/margins": 1.154736876487732, + "rewards/rejected": -3.124454975128174, + "sft_loss": 2.0101077556610107, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 2.894872323011034, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": -0.17339864373207092, + "logits/rejected": -0.07853229343891144, + "logps/chosen": -2.1969246864318848, + "logps/rejected": -3.136164426803589, + "loss": 0.6745, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1969246864318848, + "rewards/margins": 0.939239501953125, + "rewards/rejected": -3.136164426803589, + "sft_loss": 2.230463743209839, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 4.022297021715591, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": -0.2338290959596634, + "logits/rejected": -0.0293536689132452, + "logps/chosen": -2.089036226272583, + "logps/rejected": -3.1517233848571777, + "loss": 0.6603, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.089036226272583, + "rewards/margins": 1.0626872777938843, + "rewards/rejected": -3.1517233848571777, + "sft_loss": 2.0483181476593018, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 6.602413577147545, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": -0.18875204026699066, + "logits/rejected": -0.06928505003452301, + "logps/chosen": -2.056365489959717, + "logps/rejected": -3.2918601036071777, + "loss": 0.6662, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.056365489959717, + "rewards/margins": 1.2354947328567505, + "rewards/rejected": -3.2918601036071777, + "sft_loss": 1.9954465627670288, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 4.579347021717172, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": -0.15530741214752197, + "logits/rejected": 0.010155891068279743, + "logps/chosen": -2.1392271518707275, + "logps/rejected": -3.348783493041992, + "loss": 0.6705, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.1392271518707275, + "rewards/margins": 1.2095563411712646, + "rewards/rejected": -3.348783493041992, + "sft_loss": 2.191584348678589, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 3.485684798487018, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": -0.1419631689786911, + "logits/rejected": -0.03920459747314453, + "logps/chosen": -2.076598882675171, + "logps/rejected": -3.1227006912231445, + "loss": 0.6739, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.076598882675171, + "rewards/margins": 1.0461018085479736, + "rewards/rejected": -3.1227006912231445, + "sft_loss": 2.107247829437256, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 5.141057227980531, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": -0.1832994967699051, + "logits/rejected": -0.12847161293029785, + "logps/chosen": -2.0571811199188232, + "logps/rejected": -3.0328803062438965, + "loss": 0.6719, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0571811199188232, + "rewards/margins": 0.9756988286972046, + "rewards/rejected": -3.0328803062438965, + "sft_loss": 2.0433449745178223, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 4.358330681533576, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": -0.1776488721370697, + "logits/rejected": -0.0038052662275731564, + "logps/chosen": -2.1023917198181152, + "logps/rejected": -3.284991502761841, + "loss": 0.6646, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1023917198181152, + "rewards/margins": 1.1825997829437256, + "rewards/rejected": -3.284991502761841, + "sft_loss": 2.1911160945892334, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 3.771162771811942, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": -0.2590990960597992, + "logits/rejected": -0.11647045612335205, + "logps/chosen": -2.125926971435547, + "logps/rejected": -2.983856201171875, + "loss": 0.6686, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.125926971435547, + "rewards/margins": 0.8579292297363281, + "rewards/rejected": -2.983856201171875, + "sft_loss": 2.19528865814209, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 5.391570825028533, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": -0.23013600707054138, + "logits/rejected": -0.09508788585662842, + "logps/chosen": -1.9805049896240234, + "logps/rejected": -3.3649487495422363, + "loss": 0.6677, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9805049896240234, + "rewards/margins": 1.3844436407089233, + "rewards/rejected": -3.3649487495422363, + "sft_loss": 2.0095901489257812, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 3.9702939952336394, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": -0.21318873763084412, + "logits/rejected": 0.009989452548325062, + "logps/chosen": -2.0758414268493652, + "logps/rejected": -3.3712425231933594, + "loss": 0.6634, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0758414268493652, + "rewards/margins": 1.2954013347625732, + "rewards/rejected": -3.3712425231933594, + "sft_loss": 2.166355848312378, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 4.827633181150157, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": -0.15626654028892517, + "logits/rejected": 0.003751030657440424, + "logps/chosen": -2.1273741722106934, + "logps/rejected": -3.2390360832214355, + "loss": 0.6749, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1273741722106934, + "rewards/margins": 1.1116619110107422, + "rewards/rejected": -3.2390360832214355, + "sft_loss": 2.157222270965576, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 3.2688554722893253, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": -0.19700530171394348, + "logits/rejected": -0.054456066340208054, + "logps/chosen": -2.1807610988616943, + "logps/rejected": -3.256981611251831, + "loss": 0.68, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1807610988616943, + "rewards/margins": 1.076220989227295, + "rewards/rejected": -3.256981611251831, + "sft_loss": 2.1460068225860596, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 4.060086699739296, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": -0.1563759744167328, + "logits/rejected": -0.007863004691898823, + "logps/chosen": -2.1849801540374756, + "logps/rejected": -3.558262586593628, + "loss": 0.6654, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1849801540374756, + "rewards/margins": 1.373282551765442, + "rewards/rejected": -3.558262586593628, + "sft_loss": 2.270066976547241, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 6.246636699435798, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": -0.14790132641792297, + "logits/rejected": 0.057421762496232986, + "logps/chosen": -2.08307147026062, + "logps/rejected": -3.0761396884918213, + "loss": 0.6625, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.08307147026062, + "rewards/margins": 0.9930680990219116, + "rewards/rejected": -3.0761396884918213, + "sft_loss": 2.1781067848205566, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 2.882130224247622, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": -0.24506108462810516, + "logits/rejected": -0.191410630941391, + "logps/chosen": -2.0796127319335938, + "logps/rejected": -3.255584716796875, + "loss": 0.676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0796127319335938, + "rewards/margins": 1.1759722232818604, + "rewards/rejected": -3.255584716796875, + "sft_loss": 2.0985941886901855, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 8.769729197358377, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": -0.17417939007282257, + "logits/rejected": -0.08329921960830688, + "logps/chosen": -1.988851547241211, + "logps/rejected": -3.1151936054229736, + "loss": 0.677, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.988851547241211, + "rewards/margins": 1.1263421773910522, + "rewards/rejected": -3.1151936054229736, + "sft_loss": 1.986951470375061, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 15.640877111904508, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": -0.07740931212902069, + "logits/rejected": 0.06860514730215073, + "logps/chosen": -2.248150110244751, + "logps/rejected": -3.5168323516845703, + "loss": 0.6774, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.248150110244751, + "rewards/margins": 1.2686822414398193, + "rewards/rejected": -3.5168323516845703, + "sft_loss": 2.257601261138916, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 3.701452759156924, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": -0.12445513904094696, + "logits/rejected": -0.004047292284667492, + "logps/chosen": -2.089381217956543, + "logps/rejected": -3.2651374340057373, + "loss": 0.6758, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.089381217956543, + "rewards/margins": 1.1757564544677734, + "rewards/rejected": -3.2651374340057373, + "sft_loss": 2.09254789352417, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 7.154716282601555, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": -0.15927599370479584, + "logits/rejected": -0.005575224757194519, + "logps/chosen": -1.9210288524627686, + "logps/rejected": -3.160552740097046, + "loss": 0.6573, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9210288524627686, + "rewards/margins": 1.239524006843567, + "rewards/rejected": -3.160552740097046, + "sft_loss": 1.9605824947357178, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 4.952289202829157, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": -0.1863076090812683, + "logits/rejected": 0.06800957769155502, + "logps/chosen": -2.0291895866394043, + "logps/rejected": -3.1924831867218018, + "loss": 0.6632, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0291895866394043, + "rewards/margins": 1.163293480873108, + "rewards/rejected": -3.1924831867218018, + "sft_loss": 1.9746549129486084, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 4.648816313592943, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": -0.18942146003246307, + "logits/rejected": -0.0317835696041584, + "logps/chosen": -2.0402283668518066, + "logps/rejected": -2.8962950706481934, + "loss": 0.6878, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.0402283668518066, + "rewards/margins": 0.8560672998428345, + "rewards/rejected": -2.8962950706481934, + "sft_loss": 2.0892820358276367, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 4.5728506351169385, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": -0.2015286386013031, + "logits/rejected": -0.06532806903123856, + "logps/chosen": -2.163422107696533, + "logps/rejected": -3.1678988933563232, + "loss": 0.6709, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.163422107696533, + "rewards/margins": 1.0044764280319214, + "rewards/rejected": -3.1678988933563232, + "sft_loss": 2.1092007160186768, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 4.445299022884345, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": -0.18331186473369598, + "logits/rejected": -0.08265326172113419, + "logps/chosen": -2.3359968662261963, + "logps/rejected": -3.1260948181152344, + "loss": 0.6873, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3359968662261963, + "rewards/margins": 0.7900980710983276, + "rewards/rejected": -3.1260948181152344, + "sft_loss": 2.2022814750671387, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 4.396326992507092, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": -0.17815802991390228, + "logits/rejected": -0.045660458505153656, + "logps/chosen": -2.120434284210205, + "logps/rejected": -2.9672372341156006, + "loss": 0.6803, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.120434284210205, + "rewards/margins": 0.8468027114868164, + "rewards/rejected": -2.9672372341156006, + "sft_loss": 2.0805301666259766, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 3.536039764346822, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": -0.12677282094955444, + "logits/rejected": 0.017463652417063713, + "logps/chosen": -2.0878491401672363, + "logps/rejected": -3.0715315341949463, + "loss": 0.6695, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.0878491401672363, + "rewards/margins": 0.9836824536323547, + "rewards/rejected": -3.0715315341949463, + "sft_loss": 2.0803754329681396, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 3.485078084719585, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": -0.13141298294067383, + "logits/rejected": -0.019352253526449203, + "logps/chosen": -2.1474380493164062, + "logps/rejected": -3.40448260307312, + "loss": 0.6769, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1474380493164062, + "rewards/margins": 1.2570445537567139, + "rewards/rejected": -3.40448260307312, + "sft_loss": 2.121006965637207, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 4.508835312403918, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": -0.25155094265937805, + "logits/rejected": -0.05297283083200455, + "logps/chosen": -2.0105624198913574, + "logps/rejected": -3.0212528705596924, + "loss": 0.6742, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0105624198913574, + "rewards/margins": 1.0106905698776245, + "rewards/rejected": -3.0212528705596924, + "sft_loss": 2.00258731842041, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 5.373460727947873, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": -0.14757230877876282, + "logits/rejected": -0.06751175224781036, + "logps/chosen": -1.9183895587921143, + "logps/rejected": -3.0663979053497314, + "loss": 0.6671, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9183895587921143, + "rewards/margins": 1.1480082273483276, + "rewards/rejected": -3.0663979053497314, + "sft_loss": 1.9006916284561157, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 4.001601812949795, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": -0.14795711636543274, + "logits/rejected": 0.02044614776968956, + "logps/chosen": -2.209221363067627, + "logps/rejected": -3.196859836578369, + "loss": 0.6687, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.209221363067627, + "rewards/margins": 0.9876389503479004, + "rewards/rejected": -3.196859836578369, + "sft_loss": 2.2197983264923096, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 3.311482171020216, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": -0.22143828868865967, + "logits/rejected": -0.08104579150676727, + "logps/chosen": -2.0433874130249023, + "logps/rejected": -3.050140380859375, + "loss": 0.6741, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0433874130249023, + "rewards/margins": 1.006752848625183, + "rewards/rejected": -3.050140380859375, + "sft_loss": 2.0471534729003906, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 4.445965037114739, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": -0.11444427073001862, + "logits/rejected": -0.059414125978946686, + "logps/chosen": -2.22161602973938, + "logps/rejected": -3.1719493865966797, + "loss": 0.6792, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.22161602973938, + "rewards/margins": 0.9503332376480103, + "rewards/rejected": -3.1719493865966797, + "sft_loss": 2.1201417446136475, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 5.143327483278184, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": -0.20135203003883362, + "logits/rejected": -0.02879125438630581, + "logps/chosen": -2.1493937969207764, + "logps/rejected": -3.2124671936035156, + "loss": 0.6746, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1493937969207764, + "rewards/margins": 1.0630733966827393, + "rewards/rejected": -3.2124671936035156, + "sft_loss": 2.1464171409606934, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 3.7711345709560766, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": -0.13477244973182678, + "logits/rejected": -0.09561160206794739, + "logps/chosen": -2.101278781890869, + "logps/rejected": -3.401787519454956, + "loss": 0.6625, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.101278781890869, + "rewards/margins": 1.300508975982666, + "rewards/rejected": -3.401787519454956, + "sft_loss": 2.103086233139038, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 13.488161310983283, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": -0.13736936450004578, + "logits/rejected": -0.01585746742784977, + "logps/chosen": -1.9863735437393188, + "logps/rejected": -3.251328229904175, + "loss": 0.6707, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9863735437393188, + "rewards/margins": 1.2649548053741455, + "rewards/rejected": -3.251328229904175, + "sft_loss": 1.9716198444366455, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 4.4555967248871085, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": -0.18262706696987152, + "logits/rejected": -0.09311272203922272, + "logps/chosen": -2.151057243347168, + "logps/rejected": -3.1605875492095947, + "loss": 0.6733, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.151057243347168, + "rewards/margins": 1.0095303058624268, + "rewards/rejected": -3.1605875492095947, + "sft_loss": 2.2076568603515625, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 8.275028499451532, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": -0.15065720677375793, + "logits/rejected": 0.0015823025023564696, + "logps/chosen": -2.06017804145813, + "logps/rejected": -3.1844449043273926, + "loss": 0.6681, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.06017804145813, + "rewards/margins": 1.1242671012878418, + "rewards/rejected": -3.1844449043273926, + "sft_loss": 2.1298396587371826, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 4.523229173230882, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": -0.13953344523906708, + "logits/rejected": -0.16973023116588593, + "logps/chosen": -1.9981342554092407, + "logps/rejected": -3.045314073562622, + "loss": 0.6629, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9981342554092407, + "rewards/margins": 1.0471800565719604, + "rewards/rejected": -3.045314073562622, + "sft_loss": 2.0397486686706543, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 3.7345807108239786, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": -0.20351342856884003, + "logits/rejected": -0.029998648911714554, + "logps/chosen": -2.1162867546081543, + "logps/rejected": -3.1932778358459473, + "loss": 0.6769, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1162867546081543, + "rewards/margins": 1.076991319656372, + "rewards/rejected": -3.1932778358459473, + "sft_loss": 2.173251152038574, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 3.9663568034487207, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": -0.1681821644306183, + "logits/rejected": -0.10534799098968506, + "logps/chosen": -2.140448808670044, + "logps/rejected": -3.256242275238037, + "loss": 0.6662, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.140448808670044, + "rewards/margins": 1.1157935857772827, + "rewards/rejected": -3.256242275238037, + "sft_loss": 2.1975722312927246, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 3.588701506766935, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": -0.11192403733730316, + "logits/rejected": 0.10734760761260986, + "logps/chosen": -2.1018166542053223, + "logps/rejected": -3.561558961868286, + "loss": 0.6669, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.1018166542053223, + "rewards/margins": 1.4597421884536743, + "rewards/rejected": -3.561558961868286, + "sft_loss": 2.1336729526519775, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 3.5016222528436303, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": -0.2518225312232971, + "logits/rejected": -0.07085306942462921, + "logps/chosen": -2.039267063140869, + "logps/rejected": -3.482922315597534, + "loss": 0.6665, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.039267063140869, + "rewards/margins": 1.4436554908752441, + "rewards/rejected": -3.482922315597534, + "sft_loss": 2.106978178024292, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 6.226457792487666, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": -0.253101646900177, + "logits/rejected": -0.1483955681324005, + "logps/chosen": -2.048860788345337, + "logps/rejected": -3.3397223949432373, + "loss": 0.667, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.048860788345337, + "rewards/margins": 1.2908620834350586, + "rewards/rejected": -3.3397223949432373, + "sft_loss": 2.101193428039551, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 3.872483505026465, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": -0.23167535662651062, + "logits/rejected": -0.042937736958265305, + "logps/chosen": -2.325857162475586, + "logps/rejected": -3.220263719558716, + "loss": 0.6945, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.325857162475586, + "rewards/margins": 0.8944064378738403, + "rewards/rejected": -3.220263719558716, + "sft_loss": 2.371337413787842, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 3.8511845835646175, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": -0.19250306487083435, + "logits/rejected": -0.04675256460905075, + "logps/chosen": -2.2092864513397217, + "logps/rejected": -3.399085521697998, + "loss": 0.6698, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.2092864513397217, + "rewards/margins": 1.1897990703582764, + "rewards/rejected": -3.399085521697998, + "sft_loss": 2.1123721599578857, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 3.7888654661200563, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": -0.06298299133777618, + "logits/rejected": 0.04790624603629112, + "logps/chosen": -2.174638032913208, + "logps/rejected": -3.4041781425476074, + "loss": 0.6614, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.174638032913208, + "rewards/margins": 1.2295398712158203, + "rewards/rejected": -3.4041781425476074, + "sft_loss": 2.144359588623047, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 12.481381241745728, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": -0.14639827609062195, + "logits/rejected": -0.008682933636009693, + "logps/chosen": -2.0565505027770996, + "logps/rejected": -3.1733672618865967, + "loss": 0.6718, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0565505027770996, + "rewards/margins": 1.1168169975280762, + "rewards/rejected": -3.1733672618865967, + "sft_loss": 2.1009578704833984, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 4.628119591554495, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": -0.1513335406780243, + "logits/rejected": -0.08773148059844971, + "logps/chosen": -2.12284779548645, + "logps/rejected": -3.0121376514434814, + "loss": 0.6872, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.12284779548645, + "rewards/margins": 0.8892895579338074, + "rewards/rejected": -3.0121376514434814, + "sft_loss": 2.0487465858459473, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 3.281045074150346, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": -0.20077288150787354, + "logits/rejected": -0.06432090699672699, + "logps/chosen": -2.0878376960754395, + "logps/rejected": -3.4198544025421143, + "loss": 0.6687, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0878376960754395, + "rewards/margins": 1.3320167064666748, + "rewards/rejected": -3.4198544025421143, + "sft_loss": 2.0510482788085938, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 4.126740038960389, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": -0.1603049337863922, + "logits/rejected": -0.05132744461297989, + "logps/chosen": -2.1984238624572754, + "logps/rejected": -3.1398496627807617, + "loss": 0.6756, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1984238624572754, + "rewards/margins": 0.9414256811141968, + "rewards/rejected": -3.1398496627807617, + "sft_loss": 2.188628911972046, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 5.464558142197646, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": -0.1908133327960968, + "logits/rejected": -0.06753195822238922, + "logps/chosen": -2.2839598655700684, + "logps/rejected": -3.1272454261779785, + "loss": 0.6722, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.2839598655700684, + "rewards/margins": 0.843285083770752, + "rewards/rejected": -3.1272454261779785, + "sft_loss": 2.335068464279175, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": 0.12577809393405914, + "eval_logits/rejected": 0.22317178547382355, + "eval_logps/chosen": -2.161029815673828, + "eval_logps/rejected": -3.202573776245117, + "eval_loss": 0.6901692152023315, + "eval_rewards/accuracies": 0.6906528472900391, + "eval_rewards/chosen": -2.161029815673828, + "eval_rewards/margins": 1.04154372215271, + "eval_rewards/rejected": -3.202573776245117, + "eval_runtime": 43.7133, + "eval_samples_per_second": 30.769, + "eval_sft_loss": 2.12894344329834, + "eval_steps_per_second": 7.709, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 4.646692370921347, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": -0.24269111454486847, + "logits/rejected": -0.12880149483680725, + "logps/chosen": -2.109504222869873, + "logps/rejected": -3.1083571910858154, + "loss": 0.6718, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.109504222869873, + "rewards/margins": 0.9988533854484558, + "rewards/rejected": -3.1083571910858154, + "sft_loss": 2.1038758754730225, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 3.3794381952292922, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": -0.1317947655916214, + "logits/rejected": -0.016943860799074173, + "logps/chosen": -2.104414939880371, + "logps/rejected": -3.1304614543914795, + "loss": 0.6772, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.104414939880371, + "rewards/margins": 1.026046633720398, + "rewards/rejected": -3.1304614543914795, + "sft_loss": 2.0895590782165527, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 6.238982280162222, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": -0.12169364839792252, + "logits/rejected": 0.004199688322842121, + "logps/chosen": -2.032968044281006, + "logps/rejected": -3.2582428455352783, + "loss": 0.6545, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.032968044281006, + "rewards/margins": 1.225274682044983, + "rewards/rejected": -3.2582428455352783, + "sft_loss": 2.0511133670806885, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 3.549271865321555, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": -0.11396322399377823, + "logits/rejected": -0.01782546192407608, + "logps/chosen": -2.1708621978759766, + "logps/rejected": -2.92978572845459, + "loss": 0.6774, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.1708621978759766, + "rewards/margins": 0.7589234113693237, + "rewards/rejected": -2.92978572845459, + "sft_loss": 2.1328911781311035, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 7.253675071341021, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": -0.09767025709152222, + "logits/rejected": 0.04703503102064133, + "logps/chosen": -2.0461907386779785, + "logps/rejected": -3.0559167861938477, + "loss": 0.6645, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0461907386779785, + "rewards/margins": 1.0097261667251587, + "rewards/rejected": -3.0559167861938477, + "sft_loss": 1.985823631286621, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 3.555662353812927, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": -0.18815520405769348, + "logits/rejected": -0.017928075045347214, + "logps/chosen": -2.1906285285949707, + "logps/rejected": -3.220522403717041, + "loss": 0.6781, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1906285285949707, + "rewards/margins": 1.0298939943313599, + "rewards/rejected": -3.220522403717041, + "sft_loss": 2.140667676925659, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 8.073554424832016, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": -0.26414138078689575, + "logits/rejected": -0.042422693222761154, + "logps/chosen": -2.005246639251709, + "logps/rejected": -3.2597591876983643, + "loss": 0.6764, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.005246639251709, + "rewards/margins": 1.2545123100280762, + "rewards/rejected": -3.2597591876983643, + "sft_loss": 2.057586669921875, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 5.1803684543611865, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": -0.22406511008739471, + "logits/rejected": 0.005454069469124079, + "logps/chosen": -2.034522294998169, + "logps/rejected": -3.4171080589294434, + "loss": 0.6503, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.034522294998169, + "rewards/margins": 1.382585883140564, + "rewards/rejected": -3.4171080589294434, + "sft_loss": 1.9969648122787476, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 2.955799743598282, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": -0.12224028259515762, + "logits/rejected": -0.038930658251047134, + "logps/chosen": -2.140636920928955, + "logps/rejected": -3.0933098793029785, + "loss": 0.6755, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.140636920928955, + "rewards/margins": 0.9526728391647339, + "rewards/rejected": -3.0933098793029785, + "sft_loss": 2.181284189224243, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 12.830745078811939, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": -0.28581708669662476, + "logits/rejected": -0.10724548995494843, + "logps/chosen": -2.1298556327819824, + "logps/rejected": -3.379368305206299, + "loss": 0.676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.1298556327819824, + "rewards/margins": 1.2495124340057373, + "rewards/rejected": -3.379368305206299, + "sft_loss": 2.1332108974456787, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 10.47629999400417, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": -0.10475417226552963, + "logits/rejected": 0.034112442284822464, + "logps/chosen": -2.120316505432129, + "logps/rejected": -3.2288997173309326, + "loss": 0.684, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.120316505432129, + "rewards/margins": 1.1085835695266724, + "rewards/rejected": -3.2288997173309326, + "sft_loss": 2.154690980911255, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 4.804057544653204, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": -0.16452094912528992, + "logits/rejected": -0.09058734029531479, + "logps/chosen": -1.9013557434082031, + "logps/rejected": -3.1004385948181152, + "loss": 0.6641, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9013557434082031, + "rewards/margins": 1.199082612991333, + "rewards/rejected": -3.1004385948181152, + "sft_loss": 1.9540598392486572, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 3.8392080026324487, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": -0.26380443572998047, + "logits/rejected": -0.15460802614688873, + "logps/chosen": -2.058915615081787, + "logps/rejected": -3.2198386192321777, + "loss": 0.6611, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.058915615081787, + "rewards/margins": 1.1609232425689697, + "rewards/rejected": -3.2198386192321777, + "sft_loss": 2.057905912399292, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 3.7836159260299085, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": -0.140593022108078, + "logits/rejected": -0.01732134446501732, + "logps/chosen": -2.0515201091766357, + "logps/rejected": -3.2749295234680176, + "loss": 0.6645, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0515201091766357, + "rewards/margins": 1.2234095335006714, + "rewards/rejected": -3.2749295234680176, + "sft_loss": 2.0359392166137695, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 4.177419036329663, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": -0.07713289558887482, + "logits/rejected": 0.010256970301270485, + "logps/chosen": -2.0393309593200684, + "logps/rejected": -3.295485019683838, + "loss": 0.6706, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0393309593200684, + "rewards/margins": 1.2561534643173218, + "rewards/rejected": -3.295485019683838, + "sft_loss": 2.0710060596466064, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 7.009060968874998, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": -0.11068640649318695, + "logits/rejected": 0.017202546820044518, + "logps/chosen": -1.9383538961410522, + "logps/rejected": -3.306736469268799, + "loss": 0.6492, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.9383538961410522, + "rewards/margins": 1.368382453918457, + "rewards/rejected": -3.306736469268799, + "sft_loss": 2.065255641937256, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 4.592285677487404, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": -0.09724752604961395, + "logits/rejected": -0.02438260242342949, + "logps/chosen": -2.162257671356201, + "logps/rejected": -3.315903902053833, + "loss": 0.6655, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.162257671356201, + "rewards/margins": 1.15364670753479, + "rewards/rejected": -3.315903902053833, + "sft_loss": 2.1836469173431396, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 4.442056230230379, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": -0.29001709818840027, + "logits/rejected": -0.12049128860235214, + "logps/chosen": -2.086012363433838, + "logps/rejected": -3.252751588821411, + "loss": 0.6742, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.086012363433838, + "rewards/margins": 1.1667394638061523, + "rewards/rejected": -3.252751588821411, + "sft_loss": 2.132532835006714, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 3.4297832843382627, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": -0.1994694173336029, + "logits/rejected": -0.09064370393753052, + "logps/chosen": -2.1102771759033203, + "logps/rejected": -3.0843772888183594, + "loss": 0.666, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1102771759033203, + "rewards/margins": 0.9740999937057495, + "rewards/rejected": -3.0843772888183594, + "sft_loss": 2.186908006668091, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 2.297242655481256, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": -0.1548094004392624, + "logits/rejected": 0.023246418684720993, + "logps/chosen": -2.0302085876464844, + "logps/rejected": -3.1408843994140625, + "loss": 0.6657, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0302085876464844, + "rewards/margins": 1.1106754541397095, + "rewards/rejected": -3.1408843994140625, + "sft_loss": 2.12398362159729, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 6.874330951742991, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": -0.17273852229118347, + "logits/rejected": -0.015232471749186516, + "logps/chosen": -2.0785112380981445, + "logps/rejected": -3.3171119689941406, + "loss": 0.6711, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0785112380981445, + "rewards/margins": 1.238600492477417, + "rewards/rejected": -3.3171119689941406, + "sft_loss": 2.21134090423584, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 3.3100218930305703, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": -0.15971055626869202, + "logits/rejected": -0.1629440039396286, + "logps/chosen": -2.245551109313965, + "logps/rejected": -3.7228760719299316, + "loss": 0.6756, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.245551109313965, + "rewards/margins": 1.4773248434066772, + "rewards/rejected": -3.7228760719299316, + "sft_loss": 2.2304935455322266, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 4.223907754553806, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": -0.12787704169750214, + "logits/rejected": 0.016034701839089394, + "logps/chosen": -1.905869722366333, + "logps/rejected": -3.0717577934265137, + "loss": 0.663, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.905869722366333, + "rewards/margins": 1.1658880710601807, + "rewards/rejected": -3.0717577934265137, + "sft_loss": 1.9925057888031006, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 2.4128440736146444, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": -0.17662464082241058, + "logits/rejected": -0.08665060997009277, + "logps/chosen": -2.156773328781128, + "logps/rejected": -3.257002592086792, + "loss": 0.6753, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.156773328781128, + "rewards/margins": 1.100229024887085, + "rewards/rejected": -3.257002592086792, + "sft_loss": 2.113800048828125, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 4.026826762044653, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": -0.30629870295524597, + "logits/rejected": -0.033193111419677734, + "logps/chosen": -2.021440029144287, + "logps/rejected": -3.25608491897583, + "loss": 0.6605, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.021440029144287, + "rewards/margins": 1.234644889831543, + "rewards/rejected": -3.25608491897583, + "sft_loss": 2.0926735401153564, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 3.9647555176436122, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": -0.1415877640247345, + "logits/rejected": 0.04813080281019211, + "logps/chosen": -2.2339959144592285, + "logps/rejected": -3.5858314037323, + "loss": 0.6696, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2339959144592285, + "rewards/margins": 1.3518354892730713, + "rewards/rejected": -3.5858314037323, + "sft_loss": 2.1474475860595703, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 10.246014656143823, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": -0.21439452469348907, + "logits/rejected": -0.03963133692741394, + "logps/chosen": -2.071315050125122, + "logps/rejected": -3.20759654045105, + "loss": 0.6625, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.071315050125122, + "rewards/margins": 1.1362817287445068, + "rewards/rejected": -3.20759654045105, + "sft_loss": 1.9997138977050781, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 3.1955843244141318, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": -0.18687336146831512, + "logits/rejected": -0.11777372658252716, + "logps/chosen": -1.9985196590423584, + "logps/rejected": -2.8534021377563477, + "loss": 0.6598, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9985196590423584, + "rewards/margins": 0.8548822402954102, + "rewards/rejected": -2.8534021377563477, + "sft_loss": 1.9559974670410156, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 3.9897517262786515, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": -0.17536196112632751, + "logits/rejected": 0.039307206869125366, + "logps/chosen": -2.0607361793518066, + "logps/rejected": -3.2855000495910645, + "loss": 0.6765, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0607361793518066, + "rewards/margins": 1.2247636318206787, + "rewards/rejected": -3.2855000495910645, + "sft_loss": 2.0301108360290527, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 5.34316199107696, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": -0.16160908341407776, + "logits/rejected": -0.0008482426637783647, + "logps/chosen": -2.192845582962036, + "logps/rejected": -3.215425491333008, + "loss": 0.6606, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.192845582962036, + "rewards/margins": 1.0225796699523926, + "rewards/rejected": -3.215425491333008, + "sft_loss": 2.0834009647369385, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 3.9595536843580024, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": -0.14186866581439972, + "logits/rejected": -0.09743430465459824, + "logps/chosen": -2.1751301288604736, + "logps/rejected": -3.1686172485351562, + "loss": 0.6676, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1751301288604736, + "rewards/margins": 0.9934871792793274, + "rewards/rejected": -3.1686172485351562, + "sft_loss": 2.1217844486236572, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 3.10446590032169, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": -0.11204180866479874, + "logits/rejected": -0.017457684502005577, + "logps/chosen": -2.005864381790161, + "logps/rejected": -3.0677313804626465, + "loss": 0.6677, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.005864381790161, + "rewards/margins": 1.061867117881775, + "rewards/rejected": -3.0677313804626465, + "sft_loss": 2.0568666458129883, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 5.07956589232817, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": -0.22514183819293976, + "logits/rejected": -0.07763149589300156, + "logps/chosen": -1.9423367977142334, + "logps/rejected": -3.1683859825134277, + "loss": 0.6573, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.9423367977142334, + "rewards/margins": 1.2260488271713257, + "rewards/rejected": -3.1683859825134277, + "sft_loss": 2.053755760192871, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 4.32502942897498, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": -0.08983586728572845, + "logits/rejected": -0.0068774139508605, + "logps/chosen": -2.064767837524414, + "logps/rejected": -3.119546413421631, + "loss": 0.6704, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.064767837524414, + "rewards/margins": 1.0547785758972168, + "rewards/rejected": -3.119546413421631, + "sft_loss": 1.9203109741210938, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 4.413263326536183, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": -0.17833557724952698, + "logits/rejected": -0.13192901015281677, + "logps/chosen": -2.1899287700653076, + "logps/rejected": -3.389059066772461, + "loss": 0.6762, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1899287700653076, + "rewards/margins": 1.1991302967071533, + "rewards/rejected": -3.389059066772461, + "sft_loss": 2.1803905963897705, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 3.524914844836635, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": -0.05446472764015198, + "logits/rejected": -0.00417535612359643, + "logps/chosen": -2.3293063640594482, + "logps/rejected": -3.2744858264923096, + "loss": 0.6913, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.3293063640594482, + "rewards/margins": 0.9451791644096375, + "rewards/rejected": -3.2744858264923096, + "sft_loss": 2.2219605445861816, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 5.762332946004345, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": -0.10473129898309708, + "logits/rejected": 0.06362743675708771, + "logps/chosen": -2.1023190021514893, + "logps/rejected": -3.534928560256958, + "loss": 0.6678, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1023190021514893, + "rewards/margins": 1.4326095581054688, + "rewards/rejected": -3.534928560256958, + "sft_loss": 2.063044309616089, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 2.864403064356372, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": -0.23619285225868225, + "logits/rejected": -0.06422014534473419, + "logps/chosen": -1.9801371097564697, + "logps/rejected": -3.2924282550811768, + "loss": 0.6569, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9801371097564697, + "rewards/margins": 1.3122915029525757, + "rewards/rejected": -3.2924282550811768, + "sft_loss": 2.0358376502990723, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 6.884835339263708, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": -0.2065448760986328, + "logits/rejected": -0.18419429659843445, + "logps/chosen": -2.1128482818603516, + "logps/rejected": -2.9955358505249023, + "loss": 0.6805, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1128482818603516, + "rewards/margins": 0.8826876878738403, + "rewards/rejected": -2.9955358505249023, + "sft_loss": 2.17033314704895, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 6.121407517260863, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": -0.17922380566596985, + "logits/rejected": -0.11081000417470932, + "logps/chosen": -2.2198586463928223, + "logps/rejected": -3.308467149734497, + "loss": 0.6715, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.2198586463928223, + "rewards/margins": 1.0886080265045166, + "rewards/rejected": -3.308467149734497, + "sft_loss": 2.248330593109131, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 3.846489276009459, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": -0.10410068184137344, + "logits/rejected": -0.05124828964471817, + "logps/chosen": -2.101375102996826, + "logps/rejected": -3.006727695465088, + "loss": 0.6836, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.101375102996826, + "rewards/margins": 0.9053524136543274, + "rewards/rejected": -3.006727695465088, + "sft_loss": 2.0615711212158203, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 6.171461958311083, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": -0.10195468366146088, + "logits/rejected": -0.024979958310723305, + "logps/chosen": -2.2979681491851807, + "logps/rejected": -3.2801547050476074, + "loss": 0.6826, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.2979681491851807, + "rewards/margins": 0.9821867942810059, + "rewards/rejected": -3.2801547050476074, + "sft_loss": 2.202646493911743, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 5.20333662410297, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": -0.1650058478116989, + "logits/rejected": -0.07016507536172867, + "logps/chosen": -2.2075905799865723, + "logps/rejected": -3.2428231239318848, + "loss": 0.6793, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.2075905799865723, + "rewards/margins": 1.035232424736023, + "rewards/rejected": -3.2428231239318848, + "sft_loss": 2.0438759326934814, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 5.01306261947066, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": -0.22710001468658447, + "logits/rejected": -0.014016309753060341, + "logps/chosen": -2.091968536376953, + "logps/rejected": -3.1881086826324463, + "loss": 0.68, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.091968536376953, + "rewards/margins": 1.0961401462554932, + "rewards/rejected": -3.1881086826324463, + "sft_loss": 2.142371892929077, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 4.708937816887113, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": -0.18712188303470612, + "logits/rejected": -0.035339899361133575, + "logps/chosen": -2.1658287048339844, + "logps/rejected": -3.5366687774658203, + "loss": 0.6799, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1658287048339844, + "rewards/margins": 1.3708401918411255, + "rewards/rejected": -3.5366687774658203, + "sft_loss": 2.1351940631866455, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 3.8811218292727174, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": -0.1747264415025711, + "logits/rejected": -0.08946724981069565, + "logps/chosen": -2.160559892654419, + "logps/rejected": -3.4428908824920654, + "loss": 0.6665, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.160559892654419, + "rewards/margins": 1.2823314666748047, + "rewards/rejected": -3.4428908824920654, + "sft_loss": 2.0684890747070312, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 4.207281726952039, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": -0.18225672841072083, + "logits/rejected": -0.04414839297533035, + "logps/chosen": -2.2771077156066895, + "logps/rejected": -3.3128814697265625, + "loss": 0.6761, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2771077156066895, + "rewards/margins": 1.0357741117477417, + "rewards/rejected": -3.3128814697265625, + "sft_loss": 2.263978958129883, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 4.521748523943575, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": 0.00856291688978672, + "logits/rejected": -0.02341640368103981, + "logps/chosen": -2.138507843017578, + "logps/rejected": -3.337174892425537, + "loss": 0.6715, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.138507843017578, + "rewards/margins": 1.1986671686172485, + "rewards/rejected": -3.337174892425537, + "sft_loss": 2.0876195430755615, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 4.552636861595934, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": -0.09765835106372833, + "logits/rejected": -0.07042871415615082, + "logps/chosen": -2.264831781387329, + "logps/rejected": -3.0682289600372314, + "loss": 0.6674, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.264831781387329, + "rewards/margins": 0.8033971786499023, + "rewards/rejected": -3.0682289600372314, + "sft_loss": 2.1140804290771484, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 4.856819480189408, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": -0.1874663084745407, + "logits/rejected": -0.09940935671329498, + "logps/chosen": -1.9742987155914307, + "logps/rejected": -2.9669032096862793, + "loss": 0.6598, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9742987155914307, + "rewards/margins": 0.9926045536994934, + "rewards/rejected": -2.9669032096862793, + "sft_loss": 2.029134750366211, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 4.615891223624692, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": -0.2509460747241974, + "logits/rejected": -0.15617252886295319, + "logps/chosen": -1.9862926006317139, + "logps/rejected": -3.0963289737701416, + "loss": 0.6638, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9862926006317139, + "rewards/margins": 1.1100363731384277, + "rewards/rejected": -3.0963289737701416, + "sft_loss": 1.9945812225341797, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 5.761371226735249, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": -0.17955382168293, + "logits/rejected": -0.08448338508605957, + "logps/chosen": -2.048438310623169, + "logps/rejected": -3.0260777473449707, + "loss": 0.6678, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.048438310623169, + "rewards/margins": 0.9776394963264465, + "rewards/rejected": -3.0260777473449707, + "sft_loss": 2.1401984691619873, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 6.856236650179769, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": -0.18293629586696625, + "logits/rejected": 0.07218710333108902, + "logps/chosen": -2.066840648651123, + "logps/rejected": -3.114772319793701, + "loss": 0.667, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.066840648651123, + "rewards/margins": 1.0479316711425781, + "rewards/rejected": -3.114772319793701, + "sft_loss": 2.051462411880493, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 5.344510378844355, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": -0.08438555151224136, + "logits/rejected": 0.05671966075897217, + "logps/chosen": -1.9172179698944092, + "logps/rejected": -2.8855223655700684, + "loss": 0.6601, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9172179698944092, + "rewards/margins": 0.9683046340942383, + "rewards/rejected": -2.8855223655700684, + "sft_loss": 1.979569673538208, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 6.607044123365808, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": -0.10938652604818344, + "logits/rejected": -0.02503049001097679, + "logps/chosen": -2.0341567993164062, + "logps/rejected": -2.8601226806640625, + "loss": 0.6792, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0341567993164062, + "rewards/margins": 0.8259660005569458, + "rewards/rejected": -2.8601226806640625, + "sft_loss": 2.113157272338867, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 3.6293731347471696, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": -0.20525026321411133, + "logits/rejected": -0.07308445870876312, + "logps/chosen": -2.0696821212768555, + "logps/rejected": -3.2235043048858643, + "loss": 0.6753, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0696821212768555, + "rewards/margins": 1.1538223028182983, + "rewards/rejected": -3.2235043048858643, + "sft_loss": 2.0911519527435303, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 5.723829432538651, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": -0.237153097987175, + "logits/rejected": -0.13139821588993073, + "logps/chosen": -2.0324325561523438, + "logps/rejected": -3.28680157661438, + "loss": 0.6517, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0324325561523438, + "rewards/margins": 1.254368782043457, + "rewards/rejected": -3.28680157661438, + "sft_loss": 2.0716185569763184, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 5.024835726782162, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": -0.275637686252594, + "logits/rejected": -0.0931491032242775, + "logps/chosen": -2.0387635231018066, + "logps/rejected": -3.2675564289093018, + "loss": 0.6721, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0387635231018066, + "rewards/margins": 1.2287932634353638, + "rewards/rejected": -3.2675564289093018, + "sft_loss": 2.030611991882324, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 4.568697100727387, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": -0.3096490800380707, + "logits/rejected": -0.0954984650015831, + "logps/chosen": -2.0408742427825928, + "logps/rejected": -3.3121063709259033, + "loss": 0.6753, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0408742427825928, + "rewards/margins": 1.2712323665618896, + "rewards/rejected": -3.3121063709259033, + "sft_loss": 2.074384927749634, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 8.451808031506655, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": -0.10867484658956528, + "logits/rejected": 0.04849391430616379, + "logps/chosen": -2.089449405670166, + "logps/rejected": -3.361520767211914, + "loss": 0.6668, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.089449405670166, + "rewards/margins": 1.2720708847045898, + "rewards/rejected": -3.361520767211914, + "sft_loss": 1.99267578125, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 5.448757879157279, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": -0.1507568210363388, + "logits/rejected": 0.02013157121837139, + "logps/chosen": -2.142974853515625, + "logps/rejected": -3.375558376312256, + "loss": 0.6727, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.142974853515625, + "rewards/margins": 1.2325836420059204, + "rewards/rejected": -3.375558376312256, + "sft_loss": 2.1336617469787598, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 4.929875341258377, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": -0.18589094281196594, + "logits/rejected": -0.05891326814889908, + "logps/chosen": -2.130286455154419, + "logps/rejected": -3.1513850688934326, + "loss": 0.6705, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.130286455154419, + "rewards/margins": 1.0210990905761719, + "rewards/rejected": -3.1513850688934326, + "sft_loss": 2.0535569190979004, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 3.6774377763388264, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": -0.24028301239013672, + "logits/rejected": -0.041994038969278336, + "logps/chosen": -2.1880526542663574, + "logps/rejected": -3.347641706466675, + "loss": 0.6687, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1880526542663574, + "rewards/margins": 1.159589171409607, + "rewards/rejected": -3.347641706466675, + "sft_loss": 2.2079193592071533, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 4.211134532604004, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": -0.17090514302253723, + "logits/rejected": -0.08920306712388992, + "logps/chosen": -2.0383031368255615, + "logps/rejected": -3.322711229324341, + "loss": 0.6637, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.0383031368255615, + "rewards/margins": 1.2844078540802002, + "rewards/rejected": -3.322711229324341, + "sft_loss": 2.0598537921905518, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 11.983066215876969, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": -0.2979956567287445, + "logits/rejected": -0.04316394031047821, + "logps/chosen": -2.1104862689971924, + "logps/rejected": -3.100170850753784, + "loss": 0.662, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1104862689971924, + "rewards/margins": 0.9896847009658813, + "rewards/rejected": -3.100170850753784, + "sft_loss": 2.078278064727783, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 3.8671914495005995, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": -0.2927331328392029, + "logits/rejected": -0.07540827989578247, + "logps/chosen": -1.9981857538223267, + "logps/rejected": -3.385162353515625, + "loss": 0.6635, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9981857538223267, + "rewards/margins": 1.386976957321167, + "rewards/rejected": -3.385162353515625, + "sft_loss": 2.082742214202881, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 4.398321955741719, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": -0.08732561022043228, + "logits/rejected": -0.007392602507025003, + "logps/chosen": -2.091261625289917, + "logps/rejected": -3.1702945232391357, + "loss": 0.6771, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.091261625289917, + "rewards/margins": 1.0790328979492188, + "rewards/rejected": -3.1702945232391357, + "sft_loss": 2.105546712875366, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 4.285963114631182, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": -0.1719670295715332, + "logits/rejected": -0.026178916916251183, + "logps/chosen": -2.0739388465881348, + "logps/rejected": -3.075754165649414, + "loss": 0.6792, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0739388465881348, + "rewards/margins": 1.0018149614334106, + "rewards/rejected": -3.075754165649414, + "sft_loss": 2.021848678588867, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 5.593728784516396, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": -0.1162596195936203, + "logits/rejected": 0.04145939648151398, + "logps/chosen": -2.101473331451416, + "logps/rejected": -2.9324426651000977, + "loss": 0.6853, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.101473331451416, + "rewards/margins": 0.8309692144393921, + "rewards/rejected": -2.9324426651000977, + "sft_loss": 2.1028060913085938, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 2.714625920065556, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": -0.1818411946296692, + "logits/rejected": -0.051187075674533844, + "logps/chosen": -1.9758121967315674, + "logps/rejected": -3.1372430324554443, + "loss": 0.6689, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.9758121967315674, + "rewards/margins": 1.161431074142456, + "rewards/rejected": -3.1372430324554443, + "sft_loss": 1.9856704473495483, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 6.621918318043308, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": -0.18781431019306183, + "logits/rejected": -0.09680631011724472, + "logps/chosen": -2.171119213104248, + "logps/rejected": -3.0972466468811035, + "loss": 0.6712, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.171119213104248, + "rewards/margins": 0.9261270761489868, + "rewards/rejected": -3.0972466468811035, + "sft_loss": 2.2441534996032715, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 7.4101187711934005, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": -0.1264840066432953, + "logits/rejected": -0.06264735013246536, + "logps/chosen": -1.9795137643814087, + "logps/rejected": -3.176384210586548, + "loss": 0.6416, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9795137643814087, + "rewards/margins": 1.1968704462051392, + "rewards/rejected": -3.176384210586548, + "sft_loss": 2.0139267444610596, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 3.6986878836122234, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": -0.1670091450214386, + "logits/rejected": 0.02274545654654503, + "logps/chosen": -1.9214379787445068, + "logps/rejected": -3.264592409133911, + "loss": 0.6614, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.9214379787445068, + "rewards/margins": 1.3431540727615356, + "rewards/rejected": -3.264592409133911, + "sft_loss": 1.9861595630645752, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 5.829180454278915, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": -0.1309589147567749, + "logits/rejected": 0.01893536187708378, + "logps/chosen": -2.15108060836792, + "logps/rejected": -3.21575927734375, + "loss": 0.6659, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.15108060836792, + "rewards/margins": 1.0646789073944092, + "rewards/rejected": -3.21575927734375, + "sft_loss": 2.08768367767334, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 5.306349322069451, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": -0.17061103880405426, + "logits/rejected": -0.07805173099040985, + "logps/chosen": -2.105132818222046, + "logps/rejected": -3.3097457885742188, + "loss": 0.6663, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.105132818222046, + "rewards/margins": 1.2046130895614624, + "rewards/rejected": -3.3097457885742188, + "sft_loss": 2.0536954402923584, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 3.9876647499108, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": -0.09365290403366089, + "logits/rejected": -0.04905001074075699, + "logps/chosen": -2.103774070739746, + "logps/rejected": -3.1053762435913086, + "loss": 0.6863, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.103774070739746, + "rewards/margins": 1.001602292060852, + "rewards/rejected": -3.1053762435913086, + "sft_loss": 2.179610252380371, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 5.067724361905486, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": -0.12385039031505585, + "logits/rejected": 0.015540236607193947, + "logps/chosen": -2.0763049125671387, + "logps/rejected": -3.225480318069458, + "loss": 0.668, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.0763049125671387, + "rewards/margins": 1.149175763130188, + "rewards/rejected": -3.225480318069458, + "sft_loss": 2.1354317665100098, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 6.4925921373426885, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": -0.25209927558898926, + "logits/rejected": -0.06769673526287079, + "logps/chosen": -1.9998257160186768, + "logps/rejected": -3.136584520339966, + "loss": 0.6592, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9998257160186768, + "rewards/margins": 1.1367586851119995, + "rewards/rejected": -3.136584520339966, + "sft_loss": 2.1053662300109863, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 9.215554809878828, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": -0.1382247507572174, + "logits/rejected": 0.034549959003925323, + "logps/chosen": -2.05765962600708, + "logps/rejected": -3.0788419246673584, + "loss": 0.6662, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.05765962600708, + "rewards/margins": 1.0211824178695679, + "rewards/rejected": -3.0788419246673584, + "sft_loss": 2.08469820022583, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 4.504764705430222, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": -0.22649991512298584, + "logits/rejected": -0.06499792635440826, + "logps/chosen": -1.9972549676895142, + "logps/rejected": -3.181198835372925, + "loss": 0.6701, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9972549676895142, + "rewards/margins": 1.1839439868927002, + "rewards/rejected": -3.181198835372925, + "sft_loss": 2.0632407665252686, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": 0.14197993278503418, + "eval_logits/rejected": 0.24067111313343048, + "eval_logps/chosen": -2.169867753982544, + "eval_logps/rejected": -3.226255416870117, + "eval_loss": 0.6903512477874756, + "eval_rewards/accuracies": 0.6928783655166626, + "eval_rewards/chosen": -2.169867753982544, + "eval_rewards/margins": 1.0563876628875732, + "eval_rewards/rejected": -3.226255416870117, + "eval_runtime": 43.3586, + "eval_samples_per_second": 31.02, + "eval_sft_loss": 2.132870674133301, + "eval_steps_per_second": 7.772, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 5.01427272437642, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": -0.12897427380084991, + "logits/rejected": -0.10188676416873932, + "logps/chosen": -1.9897922277450562, + "logps/rejected": -3.2146172523498535, + "loss": 0.6593, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9897922277450562, + "rewards/margins": 1.224825143814087, + "rewards/rejected": -3.2146172523498535, + "sft_loss": 1.9856764078140259, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 4.55113254249339, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": -0.17350664734840393, + "logits/rejected": -0.0433095246553421, + "logps/chosen": -2.182356834411621, + "logps/rejected": -3.5557281970977783, + "loss": 0.6616, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.182356834411621, + "rewards/margins": 1.373370885848999, + "rewards/rejected": -3.5557281970977783, + "sft_loss": 2.0754942893981934, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 3.6117906709693277, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": -0.14435936510562897, + "logits/rejected": -0.05727745220065117, + "logps/chosen": -1.96018385887146, + "logps/rejected": -3.2339508533477783, + "loss": 0.6582, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.96018385887146, + "rewards/margins": 1.273767113685608, + "rewards/rejected": -3.2339508533477783, + "sft_loss": 2.011904001235962, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 5.559407975672463, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": -0.1844475120306015, + "logits/rejected": -0.08850296586751938, + "logps/chosen": -1.9547264575958252, + "logps/rejected": -3.431126832962036, + "loss": 0.6546, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.9547264575958252, + "rewards/margins": 1.476400375366211, + "rewards/rejected": -3.431126832962036, + "sft_loss": 2.0375938415527344, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 4.223608667170843, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": -0.11957116425037384, + "logits/rejected": 0.04231054708361626, + "logps/chosen": -2.1211135387420654, + "logps/rejected": -3.2857651710510254, + "loss": 0.6721, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1211135387420654, + "rewards/margins": 1.1646511554718018, + "rewards/rejected": -3.2857651710510254, + "sft_loss": 2.0883028507232666, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 3.438797614567151, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": -0.25931066274642944, + "logits/rejected": -0.09509231150150299, + "logps/chosen": -2.122140884399414, + "logps/rejected": -3.284405469894409, + "loss": 0.6714, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.122140884399414, + "rewards/margins": 1.1622647047042847, + "rewards/rejected": -3.284405469894409, + "sft_loss": 2.0123064517974854, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 2.921720735308949, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": -0.18616695702075958, + "logits/rejected": -0.047292523086071014, + "logps/chosen": -2.049342393875122, + "logps/rejected": -3.173543930053711, + "loss": 0.6723, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.049342393875122, + "rewards/margins": 1.124201774597168, + "rewards/rejected": -3.173543930053711, + "sft_loss": 2.0238633155822754, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 5.980546682249523, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": -0.19059044122695923, + "logits/rejected": 0.021953938528895378, + "logps/chosen": -2.1133408546447754, + "logps/rejected": -3.2749500274658203, + "loss": 0.6673, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1133408546447754, + "rewards/margins": 1.1616089344024658, + "rewards/rejected": -3.2749500274658203, + "sft_loss": 2.0945544242858887, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 4.140174426760411, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": -0.13396288454532623, + "logits/rejected": -0.058636974543333054, + "logps/chosen": -2.233964204788208, + "logps/rejected": -3.0428805351257324, + "loss": 0.6813, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.233964204788208, + "rewards/margins": 0.8089162111282349, + "rewards/rejected": -3.0428805351257324, + "sft_loss": 2.2309727668762207, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 3.654201317344783, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": -0.2601509094238281, + "logits/rejected": -0.11655745655298233, + "logps/chosen": -2.1270840167999268, + "logps/rejected": -3.2105839252471924, + "loss": 0.6753, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.1270840167999268, + "rewards/margins": 1.0835001468658447, + "rewards/rejected": -3.2105839252471924, + "sft_loss": 2.140364170074463, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 4.393285336057422, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": -0.19051814079284668, + "logits/rejected": 0.030599230900406837, + "logps/chosen": -2.1062216758728027, + "logps/rejected": -3.3283398151397705, + "loss": 0.675, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1062216758728027, + "rewards/margins": 1.2221177816390991, + "rewards/rejected": -3.3283398151397705, + "sft_loss": 2.1199615001678467, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 6.197558219138, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": -0.17727169394493103, + "logits/rejected": -0.10792503505945206, + "logps/chosen": -2.0642919540405273, + "logps/rejected": -3.2012767791748047, + "loss": 0.6715, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0642919540405273, + "rewards/margins": 1.1369847059249878, + "rewards/rejected": -3.2012767791748047, + "sft_loss": 2.1764206886291504, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 4.821366071225293, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": -0.2428952008485794, + "logits/rejected": -0.015482127666473389, + "logps/chosen": -2.207315683364868, + "logps/rejected": -3.2127952575683594, + "loss": 0.6831, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.207315683364868, + "rewards/margins": 1.0054795742034912, + "rewards/rejected": -3.2127952575683594, + "sft_loss": 2.2028324604034424, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 4.326387223529835, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": -0.15027108788490295, + "logits/rejected": 0.008444221690297127, + "logps/chosen": -2.0687239170074463, + "logps/rejected": -3.254166841506958, + "loss": 0.6625, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.0687239170074463, + "rewards/margins": 1.1854428052902222, + "rewards/rejected": -3.254166841506958, + "sft_loss": 2.0933444499969482, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 4.647620075385404, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": -0.12983162701129913, + "logits/rejected": 0.013550333678722382, + "logps/chosen": -2.0212950706481934, + "logps/rejected": -3.0738463401794434, + "loss": 0.6721, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.0212950706481934, + "rewards/margins": 1.0525516271591187, + "rewards/rejected": -3.0738463401794434, + "sft_loss": 2.0059714317321777, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 4.388856429894361, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": -0.1318751871585846, + "logits/rejected": 0.022982800379395485, + "logps/chosen": -2.2139945030212402, + "logps/rejected": -3.544062376022339, + "loss": 0.6743, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2139945030212402, + "rewards/margins": 1.3300679922103882, + "rewards/rejected": -3.544062376022339, + "sft_loss": 2.1635942459106445, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 3.275829245543984, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": -0.19172796607017517, + "logits/rejected": -0.0724857896566391, + "logps/chosen": -2.0069081783294678, + "logps/rejected": -3.423060655593872, + "loss": 0.6649, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.0069081783294678, + "rewards/margins": 1.4161527156829834, + "rewards/rejected": -3.423060655593872, + "sft_loss": 2.022608518600464, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 6.76724206800637, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": -0.13732025027275085, + "logits/rejected": -0.0019424870843067765, + "logps/chosen": -2.0925679206848145, + "logps/rejected": -3.043700695037842, + "loss": 0.6773, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.0925679206848145, + "rewards/margins": 0.951132595539093, + "rewards/rejected": -3.043700695037842, + "sft_loss": 2.095527172088623, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 3.6575800369627136, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": -0.2242976427078247, + "logits/rejected": -0.010512808337807655, + "logps/chosen": -2.2502903938293457, + "logps/rejected": -3.3423380851745605, + "loss": 0.6925, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.2502903938293457, + "rewards/margins": 1.0920478105545044, + "rewards/rejected": -3.3423380851745605, + "sft_loss": 2.207596778869629, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 3.4687013378185605, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": -0.1519078016281128, + "logits/rejected": -0.08015980571508408, + "logps/chosen": -2.031989097595215, + "logps/rejected": -3.466524124145508, + "loss": 0.6633, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.031989097595215, + "rewards/margins": 1.4345349073410034, + "rewards/rejected": -3.466524124145508, + "sft_loss": 1.9486396312713623, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 6.424838703496807, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": -0.1546718329191208, + "logits/rejected": -0.04978444427251816, + "logps/chosen": -2.0906074047088623, + "logps/rejected": -3.186412811279297, + "loss": 0.6762, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.0906074047088623, + "rewards/margins": 1.0958056449890137, + "rewards/rejected": -3.186412811279297, + "sft_loss": 2.111370086669922, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 4.525221256770394, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": -0.2863996624946594, + "logits/rejected": -0.04077654331922531, + "logps/chosen": -2.080845355987549, + "logps/rejected": -3.4454243183135986, + "loss": 0.6739, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.080845355987549, + "rewards/margins": 1.3645789623260498, + "rewards/rejected": -3.4454243183135986, + "sft_loss": 2.1444432735443115, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 7.202187239597557, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": -0.09252846240997314, + "logits/rejected": 0.09165690839290619, + "logps/chosen": -2.1996304988861084, + "logps/rejected": -3.0849857330322266, + "loss": 0.6769, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1996304988861084, + "rewards/margins": 0.8853553533554077, + "rewards/rejected": -3.0849857330322266, + "sft_loss": 2.258697509765625, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 6.146370444393983, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": -0.1276165395975113, + "logits/rejected": -0.04356785863637924, + "logps/chosen": -2.0275347232818604, + "logps/rejected": -3.036647319793701, + "loss": 0.6682, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0275347232818604, + "rewards/margins": 1.0091129541397095, + "rewards/rejected": -3.036647319793701, + "sft_loss": 1.960352897644043, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 5.173531913065765, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": -0.12462538480758667, + "logits/rejected": 0.04870520159602165, + "logps/chosen": -2.121428966522217, + "logps/rejected": -3.3238697052001953, + "loss": 0.6681, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.121428966522217, + "rewards/margins": 1.2024410963058472, + "rewards/rejected": -3.3238697052001953, + "sft_loss": 2.0240511894226074, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 6.456068348454724, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": -0.055983882397413254, + "logits/rejected": 0.03901743143796921, + "logps/chosen": -2.1341800689697266, + "logps/rejected": -3.2898013591766357, + "loss": 0.6686, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1341800689697266, + "rewards/margins": 1.1556214094161987, + "rewards/rejected": -3.2898013591766357, + "sft_loss": 2.1365959644317627, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 4.565655643066571, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": -0.08348926156759262, + "logits/rejected": -0.09771665930747986, + "logps/chosen": -2.1372008323669434, + "logps/rejected": -3.0167839527130127, + "loss": 0.672, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1372008323669434, + "rewards/margins": 0.8795830011367798, + "rewards/rejected": -3.0167839527130127, + "sft_loss": 2.1172802448272705, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 3.475641530879466, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": -0.12629814445972443, + "logits/rejected": -0.07990659773349762, + "logps/chosen": -1.9846584796905518, + "logps/rejected": -3.060459613800049, + "loss": 0.6755, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.9846584796905518, + "rewards/margins": 1.0758014917373657, + "rewards/rejected": -3.060459613800049, + "sft_loss": 1.912102460861206, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 4.152446641844467, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": -0.25416839122772217, + "logits/rejected": -0.05416093021631241, + "logps/chosen": -1.9099677801132202, + "logps/rejected": -3.2682998180389404, + "loss": 0.6565, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.9099677801132202, + "rewards/margins": 1.3583317995071411, + "rewards/rejected": -3.2682998180389404, + "sft_loss": 1.9788919687271118, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 4.1051953454734, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": -0.050106167793273926, + "logits/rejected": -0.03766874969005585, + "logps/chosen": -2.2540361881256104, + "logps/rejected": -3.149855375289917, + "loss": 0.6822, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2540361881256104, + "rewards/margins": 0.8958193063735962, + "rewards/rejected": -3.149855375289917, + "sft_loss": 2.156536102294922, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 4.7264036002658925, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": -0.11822198331356049, + "logits/rejected": -0.03527585044503212, + "logps/chosen": -2.1379916667938232, + "logps/rejected": -3.0635056495666504, + "loss": 0.6881, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1379916667938232, + "rewards/margins": 0.9255143404006958, + "rewards/rejected": -3.0635056495666504, + "sft_loss": 2.182905435562134, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 9.642319285917505, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": -0.15814802050590515, + "logits/rejected": 0.04452654719352722, + "logps/chosen": -2.0034031867980957, + "logps/rejected": -3.0512545108795166, + "loss": 0.6707, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.0034031867980957, + "rewards/margins": 1.047851324081421, + "rewards/rejected": -3.0512545108795166, + "sft_loss": 2.0460562705993652, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 4.819242088785797, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": -0.22122152149677277, + "logits/rejected": -0.08308999240398407, + "logps/chosen": -2.0886685848236084, + "logps/rejected": -3.479623317718506, + "loss": 0.6608, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.0886685848236084, + "rewards/margins": 1.3909549713134766, + "rewards/rejected": -3.479623317718506, + "sft_loss": 2.0830681324005127, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 4.6328032363780505, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": -0.06698968261480331, + "logits/rejected": -0.04075399041175842, + "logps/chosen": -2.1402573585510254, + "logps/rejected": -2.9639697074890137, + "loss": 0.6837, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1402573585510254, + "rewards/margins": 0.823712170124054, + "rewards/rejected": -2.9639697074890137, + "sft_loss": 2.119158983230591, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 4.904844953180419, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": -0.14949668943881989, + "logits/rejected": -0.03697948530316353, + "logps/chosen": -1.9812753200531006, + "logps/rejected": -3.7004337310791016, + "loss": 0.6552, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.9812753200531006, + "rewards/margins": 1.7191585302352905, + "rewards/rejected": -3.7004337310791016, + "sft_loss": 2.085773468017578, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 3.29372844625657, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": -0.1672995388507843, + "logits/rejected": 0.05775245279073715, + "logps/chosen": -2.269740581512451, + "logps/rejected": -3.2039215564727783, + "loss": 0.6773, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.269740581512451, + "rewards/margins": 0.9341810941696167, + "rewards/rejected": -3.2039215564727783, + "sft_loss": 2.272864818572998, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 4.399064487472108, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": -0.09647464752197266, + "logits/rejected": -0.009688240475952625, + "logps/chosen": -2.0818636417388916, + "logps/rejected": -3.0700161457061768, + "loss": 0.6674, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0818636417388916, + "rewards/margins": 0.9881525039672852, + "rewards/rejected": -3.0700161457061768, + "sft_loss": 2.078864097595215, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 6.882650979565821, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": -0.1049887165427208, + "logits/rejected": 0.04000038653612137, + "logps/chosen": -2.214542865753174, + "logps/rejected": -3.1506035327911377, + "loss": 0.6652, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.214542865753174, + "rewards/margins": 0.9360604286193848, + "rewards/rejected": -3.1506035327911377, + "sft_loss": 2.2201972007751465, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 3.8787737835404297, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": -0.32012152671813965, + "logits/rejected": -0.09241650998592377, + "logps/chosen": -1.9448812007904053, + "logps/rejected": -3.220184803009033, + "loss": 0.6505, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.9448812007904053, + "rewards/margins": 1.2753032445907593, + "rewards/rejected": -3.220184803009033, + "sft_loss": 2.0128612518310547, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 4.9661532860476445, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": -0.20304766297340393, + "logits/rejected": 0.044023532420396805, + "logps/chosen": -1.974930763244629, + "logps/rejected": -3.5096993446350098, + "loss": 0.6527, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.974930763244629, + "rewards/margins": 1.5347683429718018, + "rewards/rejected": -3.5096993446350098, + "sft_loss": 1.9161895513534546, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 2.4060071689912363, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": -0.08438242971897125, + "logits/rejected": -0.020516935735940933, + "logps/chosen": -2.2892978191375732, + "logps/rejected": -3.325204849243164, + "loss": 0.6724, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2892978191375732, + "rewards/margins": 1.0359070301055908, + "rewards/rejected": -3.325204849243164, + "sft_loss": 2.2003917694091797, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 4.938144531154734, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": -0.14719071984291077, + "logits/rejected": -0.014941292814910412, + "logps/chosen": -2.0267221927642822, + "logps/rejected": -3.362051486968994, + "loss": 0.6662, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0267221927642822, + "rewards/margins": 1.3353294134140015, + "rewards/rejected": -3.362051486968994, + "sft_loss": 2.0396056175231934, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 3.9310444258559, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": -0.15167245268821716, + "logits/rejected": 0.030614938586950302, + "logps/chosen": -2.124218463897705, + "logps/rejected": -3.1093029975891113, + "loss": 0.6828, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.124218463897705, + "rewards/margins": 0.9850847125053406, + "rewards/rejected": -3.1093029975891113, + "sft_loss": 2.323394775390625, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 5.9589444266685625, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": -0.10924432426691055, + "logits/rejected": -0.024333816021680832, + "logps/chosen": -2.1157054901123047, + "logps/rejected": -3.0793604850769043, + "loss": 0.6817, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1157054901123047, + "rewards/margins": 0.9636548757553101, + "rewards/rejected": -3.0793604850769043, + "sft_loss": 2.05177640914917, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 5.2961624525344515, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": -0.14075425267219543, + "logits/rejected": 0.005097964312881231, + "logps/chosen": -2.076080799102783, + "logps/rejected": -3.2447776794433594, + "loss": 0.6707, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.076080799102783, + "rewards/margins": 1.1686967611312866, + "rewards/rejected": -3.2447776794433594, + "sft_loss": 1.9993696212768555, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 7.863653320435026, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": -0.15427440404891968, + "logits/rejected": 0.011923110112547874, + "logps/chosen": -2.1327626705169678, + "logps/rejected": -3.1934449672698975, + "loss": 0.6703, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1327626705169678, + "rewards/margins": 1.0606820583343506, + "rewards/rejected": -3.1934449672698975, + "sft_loss": 2.166590690612793, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 6.810870585680136, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": -0.2445758581161499, + "logits/rejected": -0.15394611656665802, + "logps/chosen": -2.049255847930908, + "logps/rejected": -3.2269885540008545, + "loss": 0.6643, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.049255847930908, + "rewards/margins": 1.1777327060699463, + "rewards/rejected": -3.2269885540008545, + "sft_loss": 2.133906602859497, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 4.228931285194542, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": -0.2038041353225708, + "logits/rejected": -0.07454162836074829, + "logps/chosen": -2.1540584564208984, + "logps/rejected": -3.112555742263794, + "loss": 0.6717, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1540584564208984, + "rewards/margins": 0.9584974050521851, + "rewards/rejected": -3.112555742263794, + "sft_loss": 2.1838250160217285, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 3.925845892222441, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": 0.043465353548526764, + "logits/rejected": 0.08534816652536392, + "logps/chosen": -2.2249526977539062, + "logps/rejected": -3.5208001136779785, + "loss": 0.6791, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.2249526977539062, + "rewards/margins": 1.2958471775054932, + "rewards/rejected": -3.5208001136779785, + "sft_loss": 2.1956400871276855, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 5.611947431725603, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": -0.005017808172851801, + "logits/rejected": 0.09742090851068497, + "logps/chosen": -2.0905654430389404, + "logps/rejected": -3.3462531566619873, + "loss": 0.6686, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0905654430389404, + "rewards/margins": 1.2556875944137573, + "rewards/rejected": -3.3462531566619873, + "sft_loss": 2.049487829208374, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 4.692837943530718, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": -0.18358221650123596, + "logits/rejected": -0.006678286008536816, + "logps/chosen": -2.1434359550476074, + "logps/rejected": -3.0037713050842285, + "loss": 0.6896, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1434359550476074, + "rewards/margins": 0.8603354692459106, + "rewards/rejected": -3.0037713050842285, + "sft_loss": 2.1421120166778564, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 6.715649661958594, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": -0.12474910169839859, + "logits/rejected": 0.004342988133430481, + "logps/chosen": -1.9698402881622314, + "logps/rejected": -3.2126338481903076, + "loss": 0.6534, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9698402881622314, + "rewards/margins": 1.2427937984466553, + "rewards/rejected": -3.2126338481903076, + "sft_loss": 1.9961681365966797, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 5.025959003124493, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": -0.21705186367034912, + "logits/rejected": 0.008288288488984108, + "logps/chosen": -2.236253499984741, + "logps/rejected": -3.270078659057617, + "loss": 0.683, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.236253499984741, + "rewards/margins": 1.033825159072876, + "rewards/rejected": -3.270078659057617, + "sft_loss": 2.1473069190979004, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 4.333369522800217, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": -0.13449707627296448, + "logits/rejected": -0.05470636487007141, + "logps/chosen": -2.23810076713562, + "logps/rejected": -3.125683307647705, + "loss": 0.685, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.23810076713562, + "rewards/margins": 0.8875824809074402, + "rewards/rejected": -3.125683307647705, + "sft_loss": 2.2447752952575684, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 6.322699130876288, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": -0.11728037893772125, + "logits/rejected": -0.1461258977651596, + "logps/chosen": -2.0434460639953613, + "logps/rejected": -3.02321720123291, + "loss": 0.6741, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0434460639953613, + "rewards/margins": 0.9797712564468384, + "rewards/rejected": -3.02321720123291, + "sft_loss": 2.0746424198150635, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 5.5318695238993, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": -0.1640905737876892, + "logits/rejected": -0.03388802334666252, + "logps/chosen": -2.0576159954071045, + "logps/rejected": -3.136462926864624, + "loss": 0.6751, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.0576159954071045, + "rewards/margins": 1.078847050666809, + "rewards/rejected": -3.136462926864624, + "sft_loss": 2.081251382827759, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 3.403580517089844, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": -0.09311430156230927, + "logits/rejected": 0.1041082963347435, + "logps/chosen": -1.955439567565918, + "logps/rejected": -3.12667179107666, + "loss": 0.6591, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.955439567565918, + "rewards/margins": 1.1712322235107422, + "rewards/rejected": -3.12667179107666, + "sft_loss": 1.9776054620742798, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 6.916351334848274, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": -0.15566611289978027, + "logits/rejected": -0.10274732112884521, + "logps/chosen": -2.046116352081299, + "logps/rejected": -3.0021262168884277, + "loss": 0.6862, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.046116352081299, + "rewards/margins": 0.9560097455978394, + "rewards/rejected": -3.0021262168884277, + "sft_loss": 2.062603712081909, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 6.230606417477509, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": -0.2095448225736618, + "logits/rejected": 0.016530293971300125, + "logps/chosen": -2.0500950813293457, + "logps/rejected": -3.422104597091675, + "loss": 0.6496, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0500950813293457, + "rewards/margins": 1.3720093965530396, + "rewards/rejected": -3.422104597091675, + "sft_loss": 2.092311143875122, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 2.987181490327306, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": -0.10499457269906998, + "logits/rejected": 0.008853035047650337, + "logps/chosen": -2.199967861175537, + "logps/rejected": -3.3355917930603027, + "loss": 0.6735, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.199967861175537, + "rewards/margins": 1.1356239318847656, + "rewards/rejected": -3.3355917930603027, + "sft_loss": 2.1222336292266846, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 5.246174523496143, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": -0.021180534735322, + "logits/rejected": 0.01215188205242157, + "logps/chosen": -2.043039560317993, + "logps/rejected": -3.1634812355041504, + "loss": 0.6515, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.043039560317993, + "rewards/margins": 1.1204414367675781, + "rewards/rejected": -3.1634812355041504, + "sft_loss": 2.107445240020752, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 4.011491700835256, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": -0.1461961269378662, + "logits/rejected": 0.027207564562559128, + "logps/chosen": -2.150327205657959, + "logps/rejected": -3.621079683303833, + "loss": 0.6689, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.150327205657959, + "rewards/margins": 1.4707525968551636, + "rewards/rejected": -3.621079683303833, + "sft_loss": 2.1148831844329834, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 7.2017527904981575, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": -0.21892747282981873, + "logits/rejected": -0.0924294963479042, + "logps/chosen": -2.291236400604248, + "logps/rejected": -3.4094910621643066, + "loss": 0.6794, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.291236400604248, + "rewards/margins": 1.1182541847229004, + "rewards/rejected": -3.4094910621643066, + "sft_loss": 2.2477917671203613, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 6.904277229174928, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": -0.12460234016180038, + "logits/rejected": -0.04724789783358574, + "logps/chosen": -2.045562267303467, + "logps/rejected": -3.1144003868103027, + "loss": 0.6767, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.045562267303467, + "rewards/margins": 1.068838119506836, + "rewards/rejected": -3.1144003868103027, + "sft_loss": 2.1038308143615723, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 3.18398188788031, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": -0.14302311837673187, + "logits/rejected": 0.02467949315905571, + "logps/chosen": -2.2179653644561768, + "logps/rejected": -3.471297025680542, + "loss": 0.6761, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2179653644561768, + "rewards/margins": 1.2533316612243652, + "rewards/rejected": -3.471297025680542, + "sft_loss": 2.2776458263397217, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 13.381147902591191, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": -0.21192388236522675, + "logits/rejected": 0.026648789644241333, + "logps/chosen": -2.3372108936309814, + "logps/rejected": -3.121948719024658, + "loss": 0.6974, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3372108936309814, + "rewards/margins": 0.7847374677658081, + "rewards/rejected": -3.121948719024658, + "sft_loss": 2.2250218391418457, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 5.802925161561028, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": -0.16421891748905182, + "logits/rejected": -0.09533698856830597, + "logps/chosen": -2.1452269554138184, + "logps/rejected": -3.2812087535858154, + "loss": 0.6774, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1452269554138184, + "rewards/margins": 1.1359819173812866, + "rewards/rejected": -3.2812087535858154, + "sft_loss": 2.2439322471618652, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 4.153869289541625, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": -0.22801952064037323, + "logits/rejected": -0.04530780762434006, + "logps/chosen": -2.094473123550415, + "logps/rejected": -3.0530173778533936, + "loss": 0.6762, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.094473123550415, + "rewards/margins": 0.9585443735122681, + "rewards/rejected": -3.0530173778533936, + "sft_loss": 1.990878701210022, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 7.429578513315398, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": -0.1138523668050766, + "logits/rejected": -0.013996327295899391, + "logps/chosen": -2.0203332901000977, + "logps/rejected": -3.066257953643799, + "loss": 0.6756, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0203332901000977, + "rewards/margins": 1.0459246635437012, + "rewards/rejected": -3.066257953643799, + "sft_loss": 2.025296449661255, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 4.05501984298505, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": -0.02194799669086933, + "logits/rejected": 0.06368036568164825, + "logps/chosen": -2.1512372493743896, + "logps/rejected": -3.075148105621338, + "loss": 0.6706, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1512372493743896, + "rewards/margins": 0.9239107370376587, + "rewards/rejected": -3.075148105621338, + "sft_loss": 2.117356777191162, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 5.813124472257818, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": -0.1359180510044098, + "logits/rejected": 0.06921157240867615, + "logps/chosen": -2.1639323234558105, + "logps/rejected": -3.4337375164031982, + "loss": 0.6772, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1639323234558105, + "rewards/margins": 1.2698047161102295, + "rewards/rejected": -3.4337375164031982, + "sft_loss": 2.156768321990967, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 5.763459794171124, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": -0.09430771321058273, + "logits/rejected": -0.06343688070774078, + "logps/chosen": -2.088906764984131, + "logps/rejected": -2.9498770236968994, + "loss": 0.6793, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.088906764984131, + "rewards/margins": 0.860970139503479, + "rewards/rejected": -2.9498770236968994, + "sft_loss": 2.1586833000183105, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 6.0807886966734666, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": -0.1709664762020111, + "logits/rejected": -0.07853923738002777, + "logps/chosen": -2.068702459335327, + "logps/rejected": -3.2836601734161377, + "loss": 0.6718, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.068702459335327, + "rewards/margins": 1.2149574756622314, + "rewards/rejected": -3.2836601734161377, + "sft_loss": 2.1059365272521973, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 20.538841095877864, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": -0.13746069371700287, + "logits/rejected": -0.000490212463773787, + "logps/chosen": -2.0011448860168457, + "logps/rejected": -3.4362778663635254, + "loss": 0.654, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0011448860168457, + "rewards/margins": 1.4351327419281006, + "rewards/rejected": -3.4362778663635254, + "sft_loss": 2.031506061553955, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 8.268603418051823, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": -0.1851300746202469, + "logits/rejected": -0.01682780496776104, + "logps/chosen": -2.000291347503662, + "logps/rejected": -3.215944766998291, + "loss": 0.6696, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.000291347503662, + "rewards/margins": 1.2156531810760498, + "rewards/rejected": -3.215944766998291, + "sft_loss": 2.0130972862243652, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 5.855344973650639, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": -0.2728053331375122, + "logits/rejected": -0.005551565438508987, + "logps/chosen": -1.9887988567352295, + "logps/rejected": -3.1639037132263184, + "loss": 0.6661, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9887988567352295, + "rewards/margins": 1.1751052141189575, + "rewards/rejected": -3.1639037132263184, + "sft_loss": 1.9876254796981812, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 12.96938514619225, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": -0.23821833729743958, + "logits/rejected": -0.057877153158187866, + "logps/chosen": -2.0825464725494385, + "logps/rejected": -3.262328624725342, + "loss": 0.6913, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.0825464725494385, + "rewards/margins": 1.1797820329666138, + "rewards/rejected": -3.262328624725342, + "sft_loss": 2.068385601043701, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 2.678422351463008, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": -0.16784720122814178, + "logits/rejected": -0.09556673467159271, + "logps/chosen": -2.2567338943481445, + "logps/rejected": -3.335172653198242, + "loss": 0.6846, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2567338943481445, + "rewards/margins": 1.078438639640808, + "rewards/rejected": -3.335172653198242, + "sft_loss": 2.1680376529693604, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 5.169085447705354, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": -0.17234937846660614, + "logits/rejected": -0.006200958043336868, + "logps/chosen": -2.0979604721069336, + "logps/rejected": -3.125894069671631, + "loss": 0.6659, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0979604721069336, + "rewards/margins": 1.0279338359832764, + "rewards/rejected": -3.125894069671631, + "sft_loss": 2.0555245876312256, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 5.833725919683387, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": -0.1850179135799408, + "logits/rejected": -0.0650128573179245, + "logps/chosen": -1.9984537363052368, + "logps/rejected": -3.5584564208984375, + "loss": 0.659, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.9984537363052368, + "rewards/margins": 1.5600025653839111, + "rewards/rejected": -3.5584564208984375, + "sft_loss": 2.0812668800354004, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": 0.11851444095373154, + "eval_logits/rejected": 0.21514485776424408, + "eval_logps/chosen": -2.170725107192993, + "eval_logps/rejected": -3.2270584106445312, + "eval_loss": 0.6901265978813171, + "eval_rewards/accuracies": 0.6913946866989136, + "eval_rewards/chosen": -2.170725107192993, + "eval_rewards/margins": 1.0563328266143799, + "eval_rewards/rejected": -3.2270584106445312, + "eval_runtime": 43.405, + "eval_samples_per_second": 30.987, + "eval_sft_loss": 2.133068323135376, + "eval_steps_per_second": 7.764, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.6914210148065283, + "train_runtime": 33926.4655, + "train_samples_per_second": 5.287, + "train_steps_per_second": 0.165 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}