diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6383 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 3.8125, + "learning_rate": 1.3054830287206268e-08, + "logits/chosen": -2.377302885055542, + "logits/rejected": -2.2193117141723633, + "logps/chosen": -290.4185485839844, + "logps/rejected": -374.6501770019531, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2.40625, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -2.25045108795166, + "logits/rejected": -2.052776575088501, + "logps/chosen": -279.61688232421875, + "logps/rejected": -245.4197540283203, + "loss": 0.6931, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.000771976076066494, + "rewards/margins": 0.00010288292105542496, + "rewards/rejected": 0.0006690931040793657, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 2.484375, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -2.2451391220092773, + "logits/rejected": -1.944021224975586, + "logps/chosen": -305.45184326171875, + "logps/rejected": -237.7191619873047, + "loss": 0.6926, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.004307927563786507, + "rewards/margins": 0.0011060098186135292, + "rewards/rejected": 0.003201917978003621, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 2.3125, + "learning_rate": 3.9164490861618804e-07, + "logits/chosen": -2.2053542137145996, + "logits/rejected": -2.136805772781372, + "logps/chosen": -251.1873016357422, + "logps/rejected": -251.39126586914062, + "loss": 0.692, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.012356054969131947, + "rewards/margins": 0.0023143726866692305, + "rewards/rejected": 0.010041682049632072, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 1.9453125, + "learning_rate": 5.221932114882506e-07, + "logits/chosen": -2.062053918838501, + "logits/rejected": -2.0244908332824707, + "logps/chosen": -216.23828125, + "logps/rejected": -221.68917846679688, + "loss": 0.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.019059285521507263, + "rewards/margins": 0.0032902732491493225, + "rewards/rejected": 0.01576901227235794, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 2.078125, + "learning_rate": 6.527415143603135e-07, + "logits/chosen": -2.1121723651885986, + "logits/rejected": -2.1005072593688965, + "logps/chosen": -266.8817443847656, + "logps/rejected": -234.3415069580078, + "loss": 0.6905, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.030057832598686218, + "rewards/margins": 0.005467818584293127, + "rewards/rejected": 0.024590013548731804, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 2.125, + "learning_rate": 7.832898172323761e-07, + "logits/chosen": -2.0995335578918457, + "logits/rejected": -1.9425058364868164, + "logps/chosen": -252.32351684570312, + "logps/rejected": -226.69961547851562, + "loss": 0.69, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.03176448494195938, + "rewards/margins": 0.006372343748807907, + "rewards/rejected": 0.025392139330506325, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 2.03125, + "learning_rate": 9.138381201044387e-07, + "logits/chosen": -2.2442469596862793, + "logits/rejected": -2.036492347717285, + "logps/chosen": -272.0433044433594, + "logps/rejected": -246.6951446533203, + "loss": 0.6879, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04112860932946205, + "rewards/margins": 0.010742614977061749, + "rewards/rejected": 0.030385995283722878, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 2.390625, + "learning_rate": 1.0443864229765013e-06, + "logits/chosen": -2.153740882873535, + "logits/rejected": -1.977267861366272, + "logps/chosen": -257.5650329589844, + "logps/rejected": -246.85354614257812, + "loss": 0.6872, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.038635507225990295, + "rewards/margins": 0.012301743030548096, + "rewards/rejected": 0.0263337641954422, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 2.1875, + "learning_rate": 1.1749347258485642e-06, + "logits/chosen": -2.136314868927002, + "logits/rejected": -2.000256061553955, + "logps/chosen": -250.14096069335938, + "logps/rejected": -234.5118408203125, + "loss": 0.6848, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.04195228964090347, + "rewards/margins": 0.017196740955114365, + "rewards/rejected": 0.02475554868578911, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 2.125, + "learning_rate": 1.305483028720627e-06, + "logits/chosen": -2.179086208343506, + "logits/rejected": -2.068403482437134, + "logps/chosen": -246.95883178710938, + "logps/rejected": -230.7919921875, + "loss": 0.6819, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.04810682684183121, + "rewards/margins": 0.023308780044317245, + "rewards/rejected": 0.024798044934868813, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.095933198928833, + "eval_logits/rejected": -1.9564727544784546, + "eval_logps/chosen": -259.64715576171875, + "eval_logps/rejected": -241.9028778076172, + "eval_loss": 0.6821568012237549, + "eval_rewards/accuracies": 0.6545000076293945, + "eval_rewards/chosen": 0.05004846677184105, + "eval_rewards/margins": 0.02299799770116806, + "eval_rewards/rejected": 0.02705046720802784, + "eval_runtime": 381.806, + "eval_samples_per_second": 5.238, + "eval_steps_per_second": 0.655, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 2.3125, + "learning_rate": 1.4360313315926894e-06, + "logits/chosen": -2.1454405784606934, + "logits/rejected": -2.0017640590667725, + "logps/chosen": -284.425537109375, + "logps/rejected": -238.8695526123047, + "loss": 0.6795, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.049382902681827545, + "rewards/margins": 0.02859182097017765, + "rewards/rejected": 0.020791077986359596, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 2.140625, + "learning_rate": 1.5665796344647521e-06, + "logits/chosen": -2.1937575340270996, + "logits/rejected": -2.054399013519287, + "logps/chosen": -287.4629821777344, + "logps/rejected": -271.8957824707031, + "loss": 0.6729, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.05581967160105705, + "rewards/margins": 0.042316947132349014, + "rewards/rejected": 0.013502727262675762, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 2.765625, + "learning_rate": 1.6971279373368146e-06, + "logits/chosen": -2.208482265472412, + "logits/rejected": -2.118875503540039, + "logps/chosen": -250.0573272705078, + "logps/rejected": -252.57418823242188, + "loss": 0.6698, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.050946980714797974, + "rewards/margins": 0.049403756856918335, + "rewards/rejected": 0.0015432273503392935, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 2.484375, + "learning_rate": 1.8276762402088774e-06, + "logits/chosen": -2.2458949089050293, + "logits/rejected": -1.911431074142456, + "logps/chosen": -270.4693298339844, + "logps/rejected": -226.22677612304688, + "loss": 0.6685, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.04268602281808853, + "rewards/margins": 0.05290870741009712, + "rewards/rejected": -0.010222683660686016, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 2.640625, + "learning_rate": 1.9582245430809403e-06, + "logits/chosen": -2.2650039196014404, + "logits/rejected": -2.039114475250244, + "logps/chosen": -280.2913818359375, + "logps/rejected": -242.7501983642578, + "loss": 0.6678, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.03719528391957283, + "rewards/margins": 0.05549495667219162, + "rewards/rejected": -0.01829967275261879, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 2.671875, + "learning_rate": 2.0887728459530026e-06, + "logits/chosen": -2.1557822227478027, + "logits/rejected": -2.0535261631011963, + "logps/chosen": -256.06103515625, + "logps/rejected": -261.87261962890625, + "loss": 0.6687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.007628369145095348, + "rewards/margins": 0.05603449419140816, + "rewards/rejected": -0.04840613156557083, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 2.875, + "learning_rate": 2.2193211488250653e-06, + "logits/chosen": -2.125109910964966, + "logits/rejected": -1.9704573154449463, + "logps/chosen": -220.9778594970703, + "logps/rejected": -228.26919555664062, + "loss": 0.671, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0038716509006917477, + "rewards/margins": 0.05044783279299736, + "rewards/rejected": -0.05431948974728584, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 3.53125, + "learning_rate": 2.3498694516971284e-06, + "logits/chosen": -2.1243832111358643, + "logits/rejected": -1.9889084100723267, + "logps/chosen": -258.29095458984375, + "logps/rejected": -251.7142333984375, + "loss": 0.6638, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.025864282622933388, + "rewards/margins": 0.06769417971372604, + "rewards/rejected": -0.09355846792459488, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 3.015625, + "learning_rate": 2.4804177545691907e-06, + "logits/chosen": -2.2455646991729736, + "logits/rejected": -2.0299086570739746, + "logps/chosen": -272.17633056640625, + "logps/rejected": -253.8187255859375, + "loss": 0.6499, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.020921263843774796, + "rewards/margins": 0.09995204210281372, + "rewards/rejected": -0.12087330967187881, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 3.546875, + "learning_rate": 2.610966057441254e-06, + "logits/chosen": -2.1975388526916504, + "logits/rejected": -1.9570707082748413, + "logps/chosen": -264.46234130859375, + "logps/rejected": -235.4163818359375, + "loss": 0.6548, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.14753015339374542, + "rewards/margins": 0.09057016670703888, + "rewards/rejected": -0.23810029029846191, + "step": 200 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.0694758892059326, + "eval_logits/rejected": -1.9328563213348389, + "eval_logps/chosen": -279.5373229980469, + "eval_logps/rejected": -269.7627868652344, + "eval_loss": 0.6499924063682556, + "eval_rewards/accuracies": 0.6779999732971191, + "eval_rewards/chosen": -0.1488528698682785, + "eval_rewards/margins": 0.10269534587860107, + "eval_rewards/rejected": -0.2515482008457184, + "eval_runtime": 382.022, + "eval_samples_per_second": 5.235, + "eval_steps_per_second": 0.654, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 3.140625, + "learning_rate": 2.741514360313316e-06, + "logits/chosen": -2.198995590209961, + "logits/rejected": -1.9819616079330444, + "logps/chosen": -271.3312072753906, + "logps/rejected": -252.93746948242188, + "loss": 0.6365, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.09111092239618301, + "rewards/margins": 0.1327463835477829, + "rewards/rejected": -0.2238573133945465, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 3.1875, + "learning_rate": 2.872062663185379e-06, + "logits/chosen": -2.097423553466797, + "logits/rejected": -1.9822295904159546, + "logps/chosen": -259.9545593261719, + "logps/rejected": -246.3585662841797, + "loss": 0.6394, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.05938801169395447, + "rewards/margins": 0.12806808948516846, + "rewards/rejected": -0.18745610117912292, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 6.40625, + "learning_rate": 3.0026109660574416e-06, + "logits/chosen": -2.2377123832702637, + "logits/rejected": -2.050795078277588, + "logps/chosen": -315.82159423828125, + "logps/rejected": -288.96539306640625, + "loss": 0.6629, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.09652389585971832, + "rewards/margins": 0.08648413419723511, + "rewards/rejected": -0.18300803005695343, + "step": 230 + }, + { + "epoch": 0.06, + "grad_norm": 3.21875, + "learning_rate": 3.1331592689295043e-06, + "logits/chosen": -2.1486618518829346, + "logits/rejected": -1.961085319519043, + "logps/chosen": -312.89373779296875, + "logps/rejected": -312.0883483886719, + "loss": 0.6388, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.1659378707408905, + "rewards/margins": 0.1430220901966095, + "rewards/rejected": -0.3089599311351776, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 5.15625, + "learning_rate": 3.263707571801567e-06, + "logits/chosen": -2.112567186355591, + "logits/rejected": -2.012845039367676, + "logps/chosen": -277.0249938964844, + "logps/rejected": -268.689208984375, + "loss": 0.6263, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.1744508147239685, + "rewards/margins": 0.17131540179252625, + "rewards/rejected": -0.34576624631881714, + "step": 250 + }, + { + "epoch": 0.07, + "grad_norm": 4.53125, + "learning_rate": 3.3942558746736293e-06, + "logits/chosen": -2.1583478450775146, + "logits/rejected": -1.9551265239715576, + "logps/chosen": -310.0099792480469, + "logps/rejected": -299.52789306640625, + "loss": 0.6515, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.37605080008506775, + "rewards/margins": 0.11539731919765472, + "rewards/rejected": -0.49144816398620605, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 3.296875, + "learning_rate": 3.524804177545692e-06, + "logits/chosen": -2.0597262382507324, + "logits/rejected": -1.9347015619277954, + "logps/chosen": -287.3021545410156, + "logps/rejected": -277.96014404296875, + "loss": 0.6083, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.26627668738365173, + "rewards/margins": 0.22069358825683594, + "rewards/rejected": -0.48697033524513245, + "step": 270 + }, + { + "epoch": 0.07, + "grad_norm": 4.25, + "learning_rate": 3.6553524804177547e-06, + "logits/chosen": -2.125945568084717, + "logits/rejected": -1.954007863998413, + "logps/chosen": -298.900390625, + "logps/rejected": -293.0090637207031, + "loss": 0.6386, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.39951270818710327, + "rewards/margins": 0.1558128446340561, + "rewards/rejected": -0.5553255081176758, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 3.796875, + "learning_rate": 3.7859007832898174e-06, + "logits/chosen": -2.0477206707000732, + "logits/rejected": -1.9491031169891357, + "logps/chosen": -324.5054626464844, + "logps/rejected": -319.0287780761719, + "loss": 0.6271, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.38871732354164124, + "rewards/margins": 0.19628065824508667, + "rewards/rejected": -0.5849979519844055, + "step": 290 + }, + { + "epoch": 0.08, + "grad_norm": 4.96875, + "learning_rate": 3.9164490861618806e-06, + "logits/chosen": -2.0910630226135254, + "logits/rejected": -1.888196587562561, + "logps/chosen": -272.47198486328125, + "logps/rejected": -281.57830810546875, + "loss": 0.6084, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.28343814611434937, + "rewards/margins": 0.22831246256828308, + "rewards/rejected": -0.5117505788803101, + "step": 300 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.011384963989258, + "eval_logits/rejected": -1.8770692348480225, + "eval_logps/chosen": -294.2168884277344, + "eval_logps/rejected": -294.5921325683594, + "eval_loss": 0.6213397979736328, + "eval_rewards/accuracies": 0.6809999942779541, + "eval_rewards/chosen": -0.29564887285232544, + "eval_rewards/margins": 0.20419315993785858, + "eval_rewards/rejected": -0.4998420178890228, + "eval_runtime": 381.8433, + "eval_samples_per_second": 5.238, + "eval_steps_per_second": 0.655, + "step": 300 + }, + { + "epoch": 0.08, + "grad_norm": 4.03125, + "learning_rate": 4.046997389033943e-06, + "logits/chosen": -2.2418582439422607, + "logits/rejected": -2.04129695892334, + "logps/chosen": -316.5093994140625, + "logps/rejected": -291.79010009765625, + "loss": 0.5836, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.27603110671043396, + "rewards/margins": 0.2892019748687744, + "rewards/rejected": -0.565233051776886, + "step": 310 + }, + { + "epoch": 0.08, + "grad_norm": 4.40625, + "learning_rate": 4.177545691906005e-06, + "logits/chosen": -2.1178698539733887, + "logits/rejected": -1.9309499263763428, + "logps/chosen": -298.84527587890625, + "logps/rejected": -299.9272155761719, + "loss": 0.6369, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.40581315755844116, + "rewards/margins": 0.1810055673122406, + "rewards/rejected": -0.5868188142776489, + "step": 320 + }, + { + "epoch": 0.09, + "grad_norm": 4.4375, + "learning_rate": 4.308093994778068e-06, + "logits/chosen": -2.046699047088623, + "logits/rejected": -1.9039798974990845, + "logps/chosen": -296.7830505371094, + "logps/rejected": -293.9065246582031, + "loss": 0.6198, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3718874454498291, + "rewards/margins": 0.21303264796733856, + "rewards/rejected": -0.5849201083183289, + "step": 330 + }, + { + "epoch": 0.09, + "grad_norm": 5.375, + "learning_rate": 4.4386422976501306e-06, + "logits/chosen": -2.1172854900360107, + "logits/rejected": -2.0036845207214355, + "logps/chosen": -316.01226806640625, + "logps/rejected": -323.5932922363281, + "loss": 0.5946, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.26326456665992737, + "rewards/margins": 0.28980451822280884, + "rewards/rejected": -0.5530691146850586, + "step": 340 + }, + { + "epoch": 0.09, + "grad_norm": 3.8125, + "learning_rate": 4.569190600522193e-06, + "logits/chosen": -2.042684555053711, + "logits/rejected": -1.8946377038955688, + "logps/chosen": -352.21502685546875, + "logps/rejected": -358.153564453125, + "loss": 0.6413, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6295667886734009, + "rewards/margins": 0.17427758872509003, + "rewards/rejected": -0.8038444519042969, + "step": 350 + }, + { + "epoch": 0.09, + "grad_norm": 3.015625, + "learning_rate": 4.699738903394257e-06, + "logits/chosen": -2.011836528778076, + "logits/rejected": -1.9665615558624268, + "logps/chosen": -317.6282958984375, + "logps/rejected": -318.0123291015625, + "loss": 0.6161, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7664434909820557, + "rewards/margins": 0.21231558918952942, + "rewards/rejected": -0.9787591099739075, + "step": 360 + }, + { + "epoch": 0.1, + "grad_norm": 4.53125, + "learning_rate": 4.8302872062663196e-06, + "logits/chosen": -2.1028566360473633, + "logits/rejected": -1.9274108409881592, + "logps/chosen": -356.88507080078125, + "logps/rejected": -335.1341857910156, + "loss": 0.6264, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7998887300491333, + "rewards/margins": 0.22070667147636414, + "rewards/rejected": -1.0205953121185303, + "step": 370 + }, + { + "epoch": 0.1, + "grad_norm": 4.59375, + "learning_rate": 4.9608355091383814e-06, + "logits/chosen": -2.069827079772949, + "logits/rejected": -1.8606586456298828, + "logps/chosen": -364.96856689453125, + "logps/rejected": -353.82769775390625, + "loss": 0.6264, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6691190004348755, + "rewards/margins": 0.2223375141620636, + "rewards/rejected": -0.8914563059806824, + "step": 380 + }, + { + "epoch": 0.1, + "grad_norm": 4.78125, + "learning_rate": 4.9999488562447675e-06, + "logits/chosen": -2.088129997253418, + "logits/rejected": -1.971571683883667, + "logps/chosen": -316.87994384765625, + "logps/rejected": -327.4869079589844, + "loss": 0.5863, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3583374619483948, + "rewards/margins": 0.3061427175998688, + "rewards/rejected": -0.6644802093505859, + "step": 390 + }, + { + "epoch": 0.1, + "grad_norm": 5.125, + "learning_rate": 4.999698361256577e-06, + "logits/chosen": -2.119563341140747, + "logits/rejected": -1.8813574314117432, + "logps/chosen": -296.64593505859375, + "logps/rejected": -276.7133483886719, + "loss": 0.6237, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.31169393658638, + "rewards/margins": 0.207248717546463, + "rewards/rejected": -0.5189425945281982, + "step": 400 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -1.9655628204345703, + "eval_logits/rejected": -1.836700201034546, + "eval_logps/chosen": -310.03485107421875, + "eval_logps/rejected": -318.6169738769531, + "eval_loss": 0.6038790345191956, + "eval_rewards/accuracies": 0.6934999823570251, + "eval_rewards/chosen": -0.45382827520370483, + "eval_rewards/margins": 0.2862620949745178, + "eval_rewards/rejected": -0.7400903105735779, + "eval_runtime": 382.0228, + "eval_samples_per_second": 5.235, + "eval_steps_per_second": 0.654, + "step": 400 + }, + { + "epoch": 0.11, + "grad_norm": 4.0625, + "learning_rate": 4.999239142174581e-06, + "logits/chosen": -1.988959550857544, + "logits/rejected": -1.9292503595352173, + "logps/chosen": -292.4900817871094, + "logps/rejected": -307.29473876953125, + "loss": 0.6499, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.5016773343086243, + "rewards/margins": 0.16585329174995422, + "rewards/rejected": -0.6675306558609009, + "step": 410 + }, + { + "epoch": 0.11, + "grad_norm": 5.375, + "learning_rate": 4.99857123734344e-06, + "logits/chosen": -2.0150246620178223, + "logits/rejected": -1.8929126262664795, + "logps/chosen": -260.4281921386719, + "logps/rejected": -280.3924865722656, + "loss": 0.5908, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3163732588291168, + "rewards/margins": 0.29044631123542786, + "rewards/rejected": -0.6068195104598999, + "step": 420 + }, + { + "epoch": 0.11, + "grad_norm": 4.75, + "learning_rate": 4.997694702533016e-06, + "logits/chosen": -2.0086240768432617, + "logits/rejected": -1.9487006664276123, + "logps/chosen": -308.3887634277344, + "logps/rejected": -317.20904541015625, + "loss": 0.5817, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3084072172641754, + "rewards/margins": 0.3242705166339874, + "rewards/rejected": -0.6326777338981628, + "step": 430 + }, + { + "epoch": 0.12, + "grad_norm": 7.90625, + "learning_rate": 4.996609610933713e-06, + "logits/chosen": -2.112046718597412, + "logits/rejected": -2.027024984359741, + "logps/chosen": -303.4664306640625, + "logps/rejected": -303.01220703125, + "loss": 0.6025, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3131754994392395, + "rewards/margins": 0.2790473401546478, + "rewards/rejected": -0.5922229290008545, + "step": 440 + }, + { + "epoch": 0.12, + "grad_norm": 5.8125, + "learning_rate": 4.995316053150366e-06, + "logits/chosen": -1.9543377161026, + "logits/rejected": -1.8296692371368408, + "logps/chosen": -309.422119140625, + "logps/rejected": -325.46173095703125, + "loss": 0.5577, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.4158857762813568, + "rewards/margins": 0.3905051648616791, + "rewards/rejected": -0.8063910603523254, + "step": 450 + }, + { + "epoch": 0.12, + "grad_norm": 8.375, + "learning_rate": 4.9938141371946815e-06, + "logits/chosen": -1.9097979068756104, + "logits/rejected": -1.8239259719848633, + "logps/chosen": -370.8164978027344, + "logps/rejected": -396.86004638671875, + "loss": 0.5805, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0571677684783936, + "rewards/margins": 0.4056069254875183, + "rewards/rejected": -1.4627748727798462, + "step": 460 + }, + { + "epoch": 0.12, + "grad_norm": 5.40625, + "learning_rate": 4.992103988476206e-06, + "logits/chosen": -1.9127140045166016, + "logits/rejected": -1.7631990909576416, + "logps/chosen": -352.392822265625, + "logps/rejected": -381.87896728515625, + "loss": 0.5803, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0857564210891724, + "rewards/margins": 0.4201774597167969, + "rewards/rejected": -1.5059337615966797, + "step": 470 + }, + { + "epoch": 0.13, + "grad_norm": 5.0625, + "learning_rate": 4.990185749791866e-06, + "logits/chosen": -1.892653226852417, + "logits/rejected": -1.7571289539337158, + "logps/chosen": -333.1285095214844, + "logps/rejected": -386.10107421875, + "loss": 0.5413, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7624952793121338, + "rewards/margins": 0.5117734670639038, + "rewards/rejected": -1.2742688655853271, + "step": 480 + }, + { + "epoch": 0.13, + "grad_norm": 7.0, + "learning_rate": 4.9880595813140395e-06, + "logits/chosen": -1.8925682306289673, + "logits/rejected": -1.7469890117645264, + "logps/chosen": -369.3451232910156, + "logps/rejected": -387.9554443359375, + "loss": 0.5514, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8445364832878113, + "rewards/margins": 0.4895913600921631, + "rewards/rejected": -1.3341277837753296, + "step": 490 + }, + { + "epoch": 0.13, + "grad_norm": 5.4375, + "learning_rate": 4.985725660577184e-06, + "logits/chosen": -1.8205528259277344, + "logits/rejected": -1.6672782897949219, + "logps/chosen": -371.17864990234375, + "logps/rejected": -382.154296875, + "loss": 0.5534, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9741867780685425, + "rewards/margins": 0.522149920463562, + "rewards/rejected": -1.4963366985321045, + "step": 500 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -1.671244502067566, + "eval_logits/rejected": -1.5403351783752441, + "eval_logps/chosen": -356.194580078125, + "eval_logps/rejected": -383.8828430175781, + "eval_loss": 0.5691964626312256, + "eval_rewards/accuracies": 0.7049999833106995, + "eval_rewards/chosen": -0.9154260158538818, + "eval_rewards/margins": 0.4773229658603668, + "eval_rewards/rejected": -1.3927491903305054, + "eval_runtime": 382.3757, + "eval_samples_per_second": 5.23, + "eval_steps_per_second": 0.654, + "step": 500 + }, + { + "epoch": 0.13, + "grad_norm": 6.53125, + "learning_rate": 4.983184182463009e-06, + "logits/chosen": -1.7440261840820312, + "logits/rejected": -1.6317085027694702, + "logps/chosen": -373.0206604003906, + "logps/rejected": -391.50970458984375, + "loss": 0.5646, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.947595477104187, + "rewards/margins": 0.555194079875946, + "rewards/rejected": -1.5027896165847778, + "step": 510 + }, + { + "epoch": 0.14, + "grad_norm": 7.65625, + "learning_rate": 4.980435359184203e-06, + "logits/chosen": -1.7637799978256226, + "logits/rejected": -1.7051684856414795, + "logps/chosen": -361.0028991699219, + "logps/rejected": -383.77392578125, + "loss": 0.6028, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8794111013412476, + "rewards/margins": 0.3896932005882263, + "rewards/rejected": -1.2691043615341187, + "step": 520 + }, + { + "epoch": 0.14, + "grad_norm": 5.3125, + "learning_rate": 4.9774794202667236e-06, + "logits/chosen": -1.7085822820663452, + "logits/rejected": -1.6667120456695557, + "logps/chosen": -398.4223327636719, + "logps/rejected": -447.1837463378906, + "loss": 0.5797, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.373207688331604, + "rewards/margins": 0.4029502272605896, + "rewards/rejected": -1.776158094406128, + "step": 530 + }, + { + "epoch": 0.14, + "grad_norm": 8.0625, + "learning_rate": 4.974316612530615e-06, + "logits/chosen": -1.6480659246444702, + "logits/rejected": -1.4872467517852783, + "logps/chosen": -413.641845703125, + "logps/rejected": -420.10565185546875, + "loss": 0.5292, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3515903949737549, + "rewards/margins": 0.5323625206947327, + "rewards/rejected": -1.8839528560638428, + "step": 540 + }, + { + "epoch": 0.14, + "grad_norm": 9.375, + "learning_rate": 4.970947200069416e-06, + "logits/chosen": -1.6254298686981201, + "logits/rejected": -1.5536671876907349, + "logps/chosen": -402.1681213378906, + "logps/rejected": -431.54510498046875, + "loss": 0.5995, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2365509271621704, + "rewards/margins": 0.4915947914123535, + "rewards/rejected": -1.7281455993652344, + "step": 550 + }, + { + "epoch": 0.15, + "grad_norm": 5.90625, + "learning_rate": 4.967371464228096e-06, + "logits/chosen": -1.788649559020996, + "logits/rejected": -1.6893421411514282, + "logps/chosen": -362.63739013671875, + "logps/rejected": -421.24505615234375, + "loss": 0.5384, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0513819456100464, + "rewards/margins": 0.5819835066795349, + "rewards/rejected": -1.6333656311035156, + "step": 560 + }, + { + "epoch": 0.15, + "grad_norm": 7.28125, + "learning_rate": 4.963589703579569e-06, + "logits/chosen": -1.7911745309829712, + "logits/rejected": -1.6469875574111938, + "logps/chosen": -439.2314453125, + "logps/rejected": -465.60174560546875, + "loss": 0.5809, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.391915202140808, + "rewards/margins": 0.61050945520401, + "rewards/rejected": -2.002424716949463, + "step": 570 + }, + { + "epoch": 0.15, + "grad_norm": 8.375, + "learning_rate": 4.9596022338997615e-06, + "logits/chosen": -1.7446343898773193, + "logits/rejected": -1.5205295085906982, + "logps/chosen": -424.37664794921875, + "logps/rejected": -455.3761291503906, + "loss": 0.5342, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2658993005752563, + "rewards/margins": 0.7207783460617065, + "rewards/rejected": -1.9866775274276733, + "step": 580 + }, + { + "epoch": 0.15, + "grad_norm": 5.9375, + "learning_rate": 4.955409388141243e-06, + "logits/chosen": -1.5974572896957397, + "logits/rejected": -1.4778482913970947, + "logps/chosen": -365.91943359375, + "logps/rejected": -388.0648498535156, + "loss": 0.6027, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0861928462982178, + "rewards/margins": 0.45802631974220276, + "rewards/rejected": -1.5442192554473877, + "step": 590 + }, + { + "epoch": 0.16, + "grad_norm": 5.5625, + "learning_rate": 4.951011516405429e-06, + "logits/chosen": -1.682959794998169, + "logits/rejected": -1.6160876750946045, + "logps/chosen": -331.21978759765625, + "logps/rejected": -367.4974060058594, + "loss": 0.5613, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8046302795410156, + "rewards/margins": 0.5121658444404602, + "rewards/rejected": -1.316796064376831, + "step": 600 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -1.5049409866333008, + "eval_logits/rejected": -1.3701001405715942, + "eval_logps/chosen": -345.8829650878906, + "eval_logps/rejected": -376.7896423339844, + "eval_loss": 0.5658991932868958, + "eval_rewards/accuracies": 0.7024999856948853, + "eval_rewards/chosen": -0.8123093843460083, + "eval_rewards/margins": 0.5095077753067017, + "eval_rewards/rejected": -1.32181715965271, + "eval_runtime": 382.004, + "eval_samples_per_second": 5.236, + "eval_steps_per_second": 0.654, + "step": 600 + }, + { + "epoch": 0.16, + "grad_norm": 5.375, + "learning_rate": 4.946408985913344e-06, + "logits/chosen": -1.578046202659607, + "logits/rejected": -1.4836609363555908, + "logps/chosen": -328.2045593261719, + "logps/rejected": -375.481201171875, + "loss": 0.5276, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8234087228775024, + "rewards/margins": 0.6500986218452454, + "rewards/rejected": -1.473507285118103, + "step": 610 + }, + { + "epoch": 0.16, + "grad_norm": 11.875, + "learning_rate": 4.941602180974958e-06, + "logits/chosen": -1.5045579671859741, + "logits/rejected": -1.2604496479034424, + "logps/chosen": -402.4884338378906, + "logps/rejected": -422.79736328125, + "loss": 0.5241, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.146611213684082, + "rewards/margins": 0.7896040678024292, + "rewards/rejected": -1.9362151622772217, + "step": 620 + }, + { + "epoch": 0.16, + "grad_norm": 10.5, + "learning_rate": 4.936591502957101e-06, + "logits/chosen": -1.372164249420166, + "logits/rejected": -1.2230699062347412, + "logps/chosen": -414.8818359375, + "logps/rejected": -487.482421875, + "loss": 0.538, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.667520523071289, + "rewards/margins": 0.8288544416427612, + "rewards/rejected": -2.4963748455047607, + "step": 630 + }, + { + "epoch": 0.17, + "grad_norm": 9.6875, + "learning_rate": 4.931377370249946e-06, + "logits/chosen": -1.3338875770568848, + "logits/rejected": -1.1355304718017578, + "logps/chosen": -483.4081115722656, + "logps/rejected": -526.1396484375, + "loss": 0.5676, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.180846691131592, + "rewards/margins": 0.6847688555717468, + "rewards/rejected": -2.8656158447265625, + "step": 640 + }, + { + "epoch": 0.17, + "grad_norm": 15.6875, + "learning_rate": 4.925960218232073e-06, + "logits/chosen": -1.3147588968276978, + "logits/rejected": -1.1933101415634155, + "logps/chosen": -446.49346923828125, + "logps/rejected": -517.9827270507812, + "loss": 0.5392, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.9119131565093994, + "rewards/margins": 0.8099091649055481, + "rewards/rejected": -2.721822500228882, + "step": 650 + }, + { + "epoch": 0.17, + "grad_norm": 8.875, + "learning_rate": 4.920340499234116e-06, + "logits/chosen": -1.3101979494094849, + "logits/rejected": -1.1101386547088623, + "logps/chosen": -426.4873046875, + "logps/rejected": -446.02801513671875, + "loss": 0.5772, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6048357486724854, + "rewards/margins": 0.5474850535392761, + "rewards/rejected": -2.152320623397827, + "step": 660 + }, + { + "epoch": 0.18, + "grad_norm": 7.28125, + "learning_rate": 4.914518682500995e-06, + "logits/chosen": -1.4778305292129517, + "logits/rejected": -1.3038583993911743, + "logps/chosen": -432.8035583496094, + "logps/rejected": -459.92864990234375, + "loss": 0.5359, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4926092624664307, + "rewards/margins": 0.6784954071044922, + "rewards/rejected": -2.171104907989502, + "step": 670 + }, + { + "epoch": 0.18, + "grad_norm": 5.40625, + "learning_rate": 4.9084952541527315e-06, + "logits/chosen": -1.3521184921264648, + "logits/rejected": -1.1778732538223267, + "logps/chosen": -430.7608947753906, + "logps/rejected": -458.68072509765625, + "loss": 0.5029, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.5742666721343994, + "rewards/margins": 0.7503162622451782, + "rewards/rejected": -2.3245832920074463, + "step": 680 + }, + { + "epoch": 0.18, + "grad_norm": 7.6875, + "learning_rate": 4.902270717143858e-06, + "logits/chosen": -1.3213449716567993, + "logits/rejected": -1.228070855140686, + "logps/chosen": -417.1580505371094, + "logps/rejected": -537.0321044921875, + "loss": 0.4381, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8092586994171143, + "rewards/margins": 1.0819432735443115, + "rewards/rejected": -2.8912017345428467, + "step": 690 + }, + { + "epoch": 0.18, + "grad_norm": 6.5, + "learning_rate": 4.895845591221427e-06, + "logits/chosen": -1.2542212009429932, + "logits/rejected": -1.1810188293457031, + "logps/chosen": -466.4949645996094, + "logps/rejected": -549.9205932617188, + "loss": 0.5139, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.131845474243164, + "rewards/margins": 0.8790606260299683, + "rewards/rejected": -3.0109057426452637, + "step": 700 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -1.0174403190612793, + "eval_logits/rejected": -0.8923892974853516, + "eval_logps/chosen": -528.3277587890625, + "eval_logps/rejected": -591.3086547851562, + "eval_loss": 0.5571516156196594, + "eval_rewards/accuracies": 0.7145000100135803, + "eval_rewards/chosen": -2.6367568969726562, + "eval_rewards/margins": 0.8302499055862427, + "eval_rewards/rejected": -3.4670066833496094, + "eval_runtime": 382.0721, + "eval_samples_per_second": 5.235, + "eval_steps_per_second": 0.654, + "step": 700 + }, + { + "epoch": 0.19, + "grad_norm": 10.5625, + "learning_rate": 4.8892204128816e-06, + "logits/chosen": -1.1841003894805908, + "logits/rejected": -1.0792133808135986, + "logps/chosen": -517.9019775390625, + "logps/rejected": -578.2611083984375, + "loss": 0.5501, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5033812522888184, + "rewards/margins": 0.7395020127296448, + "rewards/rejected": -3.2428832054138184, + "step": 710 + }, + { + "epoch": 0.19, + "grad_norm": 8.875, + "learning_rate": 4.882395735324864e-06, + "logits/chosen": -1.1759226322174072, + "logits/rejected": -1.0294206142425537, + "logps/chosen": -477.3987731933594, + "logps/rejected": -544.5623779296875, + "loss": 0.4985, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.1131680011749268, + "rewards/margins": 0.8433287739753723, + "rewards/rejected": -2.9564967155456543, + "step": 720 + }, + { + "epoch": 0.19, + "grad_norm": 8.8125, + "learning_rate": 4.87537212840983e-06, + "logits/chosen": -1.1399719715118408, + "logits/rejected": -1.0124037265777588, + "logps/chosen": -500.2403259277344, + "logps/rejected": -533.0379028320312, + "loss": 0.5509, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.3398513793945312, + "rewards/margins": 0.6334503293037415, + "rewards/rejected": -2.973301887512207, + "step": 730 + }, + { + "epoch": 0.19, + "grad_norm": 12.375, + "learning_rate": 4.8681501786056545e-06, + "logits/chosen": -1.0892612934112549, + "logits/rejected": -0.941753089427948, + "logps/chosen": -450.81402587890625, + "logps/rejected": -503.46636962890625, + "loss": 0.5001, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.2259793281555176, + "rewards/margins": 0.8490058183670044, + "rewards/rejected": -3.0749852657318115, + "step": 740 + }, + { + "epoch": 0.2, + "grad_norm": 24.0, + "learning_rate": 4.860730488943068e-06, + "logits/chosen": -1.0790389776229858, + "logits/rejected": -1.0216121673583984, + "logps/chosen": -440.62109375, + "logps/rejected": -540.6531372070312, + "loss": 0.4802, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.060957431793213, + "rewards/margins": 1.019281029701233, + "rewards/rejected": -3.0802388191223145, + "step": 750 + }, + { + "epoch": 0.2, + "grad_norm": 7.0, + "learning_rate": 4.853113678964022e-06, + "logits/chosen": -1.1443126201629639, + "logits/rejected": -1.065063238143921, + "logps/chosen": -448.5615234375, + "logps/rejected": -542.3307495117188, + "loss": 0.505, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.76167893409729, + "rewards/margins": 1.016688346862793, + "rewards/rejected": -2.778367280960083, + "step": 760 + }, + { + "epoch": 0.2, + "grad_norm": 5.90625, + "learning_rate": 4.845300384669958e-06, + "logits/chosen": -1.23788583278656, + "logits/rejected": -1.1094398498535156, + "logps/chosen": -407.1124267578125, + "logps/rejected": -459.88226318359375, + "loss": 0.5488, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5390856266021729, + "rewards/margins": 0.7560388445854187, + "rewards/rejected": -2.2951245307922363, + "step": 770 + }, + { + "epoch": 0.2, + "grad_norm": 16.625, + "learning_rate": 4.837291258468701e-06, + "logits/chosen": -1.3532726764678955, + "logits/rejected": -1.2090624570846558, + "logps/chosen": -449.90447998046875, + "logps/rejected": -503.38067626953125, + "loss": 0.5803, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.645581603050232, + "rewards/margins": 0.783669114112854, + "rewards/rejected": -2.429250955581665, + "step": 780 + }, + { + "epoch": 0.21, + "grad_norm": 7.59375, + "learning_rate": 4.829086969119984e-06, + "logits/chosen": -1.2730779647827148, + "logits/rejected": -1.2738616466522217, + "logps/chosen": -398.4493103027344, + "logps/rejected": -460.91387939453125, + "loss": 0.5907, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.4772454500198364, + "rewards/margins": 0.6072799563407898, + "rewards/rejected": -2.0845253467559814, + "step": 790 + }, + { + "epoch": 0.21, + "grad_norm": 8.1875, + "learning_rate": 4.820688201679605e-06, + "logits/chosen": -1.559012770652771, + "logits/rejected": -1.2587218284606934, + "logps/chosen": -388.8677673339844, + "logps/rejected": -389.87957763671875, + "loss": 0.5184, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2623231410980225, + "rewards/margins": 0.6337946057319641, + "rewards/rejected": -1.8961181640625, + "step": 800 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -1.246036410331726, + "eval_logits/rejected": -1.1140612363815308, + "eval_logps/chosen": -413.7338562011719, + "eval_logps/rejected": -463.30914306640625, + "eval_loss": 0.5373813509941101, + "eval_rewards/accuracies": 0.7160000205039978, + "eval_rewards/chosen": -1.4908183813095093, + "eval_rewards/margins": 0.6961935758590698, + "eval_rewards/rejected": -2.187012195587158, + "eval_runtime": 382.1333, + "eval_samples_per_second": 5.234, + "eval_steps_per_second": 0.654, + "step": 800 + }, + { + "epoch": 0.21, + "grad_norm": 9.0625, + "learning_rate": 4.8120956574422315e-06, + "logits/chosen": -1.407278060913086, + "logits/rejected": -1.3845430612564087, + "logps/chosen": -428.33648681640625, + "logps/rejected": -478.8470764160156, + "loss": 0.6069, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.544116735458374, + "rewards/margins": 0.5756716132164001, + "rewards/rejected": -2.119788646697998, + "step": 810 + }, + { + "epoch": 0.21, + "grad_norm": 7.625, + "learning_rate": 4.803310053882831e-06, + "logits/chosen": -1.4305765628814697, + "logits/rejected": -1.4192079305648804, + "logps/chosen": -346.76165771484375, + "logps/rejected": -416.07073974609375, + "loss": 0.5573, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1743587255477905, + "rewards/margins": 0.5695887804031372, + "rewards/rejected": -1.7439473867416382, + "step": 820 + }, + { + "epoch": 0.22, + "grad_norm": 11.8125, + "learning_rate": 4.794332124596775e-06, + "logits/chosen": -1.4643322229385376, + "logits/rejected": -1.3541513681411743, + "logps/chosen": -378.71685791015625, + "logps/rejected": -430.7264709472656, + "loss": 0.5747, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1027584075927734, + "rewards/margins": 0.5569159984588623, + "rewards/rejected": -1.6596744060516357, + "step": 830 + }, + { + "epoch": 0.22, + "grad_norm": 7.28125, + "learning_rate": 4.785162619238575e-06, + "logits/chosen": -1.3610130548477173, + "logits/rejected": -1.2018978595733643, + "logps/chosen": -377.59130859375, + "logps/rejected": -424.17108154296875, + "loss": 0.516, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2374261617660522, + "rewards/margins": 0.7390316128730774, + "rewards/rejected": -1.9764575958251953, + "step": 840 + }, + { + "epoch": 0.22, + "grad_norm": 7.25, + "learning_rate": 4.775802303459288e-06, + "logits/chosen": -1.230850100517273, + "logits/rejected": -1.153451919555664, + "logps/chosen": -397.7276611328125, + "logps/rejected": -469.70037841796875, + "loss": 0.5533, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4996331930160522, + "rewards/margins": 0.7669634819030762, + "rewards/rejected": -2.266597032546997, + "step": 850 + }, + { + "epoch": 0.23, + "grad_norm": 10.8125, + "learning_rate": 4.766251958842589e-06, + "logits/chosen": -1.196821689605713, + "logits/rejected": -1.0929956436157227, + "logps/chosen": -442.42779541015625, + "logps/rejected": -496.02508544921875, + "loss": 0.5516, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.6992677450180054, + "rewards/margins": 0.640595018863678, + "rewards/rejected": -2.339862585067749, + "step": 860 + }, + { + "epoch": 0.23, + "grad_norm": 5.96875, + "learning_rate": 4.7565123828395066e-06, + "logits/chosen": -1.1287126541137695, + "logits/rejected": -1.0260584354400635, + "logps/chosen": -434.9798278808594, + "logps/rejected": -504.6143493652344, + "loss": 0.5191, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.790509819984436, + "rewards/margins": 0.7024968266487122, + "rewards/rejected": -2.493006467819214, + "step": 870 + }, + { + "epoch": 0.23, + "grad_norm": 9.4375, + "learning_rate": 4.746584388701831e-06, + "logits/chosen": -1.1179661750793457, + "logits/rejected": -1.0696125030517578, + "logps/chosen": -474.17364501953125, + "logps/rejected": -547.4193115234375, + "loss": 0.4941, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.1093432903289795, + "rewards/margins": 0.8745004534721375, + "rewards/rejected": -2.9838438034057617, + "step": 880 + }, + { + "epoch": 0.23, + "grad_norm": 11.0, + "learning_rate": 4.736468805414218e-06, + "logits/chosen": -1.0214884281158447, + "logits/rejected": -0.9855283498764038, + "logps/chosen": -477.1600646972656, + "logps/rejected": -576.8958740234375, + "loss": 0.5755, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.297248125076294, + "rewards/margins": 0.8587217330932617, + "rewards/rejected": -3.1559698581695557, + "step": 890 + }, + { + "epoch": 0.24, + "grad_norm": 14.9375, + "learning_rate": 4.7261664776249595e-06, + "logits/chosen": -0.8845041394233704, + "logits/rejected": -0.7875598073005676, + "logps/chosen": -482.1604919433594, + "logps/rejected": -565.8832397460938, + "loss": 0.5211, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.532474994659424, + "rewards/margins": 0.9273085594177246, + "rewards/rejected": -3.4597840309143066, + "step": 900 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -0.9341001510620117, + "eval_logits/rejected": -0.8115790486335754, + "eval_logps/chosen": -518.949462890625, + "eval_logps/rejected": -584.0806274414062, + "eval_loss": 0.5331768989562988, + "eval_rewards/accuracies": 0.7179999947547913, + "eval_rewards/chosen": -2.5429742336273193, + "eval_rewards/margins": 0.8517529368400574, + "eval_rewards/rejected": -3.3947272300720215, + "eval_runtime": 382.1611, + "eval_samples_per_second": 5.233, + "eval_steps_per_second": 0.654, + "step": 900 + }, + { + "epoch": 0.24, + "grad_norm": 12.1875, + "learning_rate": 4.715678265575463e-06, + "logits/chosen": -1.1323182582855225, + "logits/rejected": -0.9318205118179321, + "logps/chosen": -521.3104248046875, + "logps/rejected": -533.2903442382812, + "loss": 0.5686, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.3703832626342773, + "rewards/margins": 0.6751216650009155, + "rewards/rejected": -3.0455050468444824, + "step": 910 + }, + { + "epoch": 0.24, + "grad_norm": 8.625, + "learning_rate": 4.705005045028415e-06, + "logits/chosen": -1.0868864059448242, + "logits/rejected": -0.9571698904037476, + "logps/chosen": -469.189208984375, + "logps/rejected": -530.5699462890625, + "loss": 0.5319, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0605311393737793, + "rewards/margins": 0.7877290844917297, + "rewards/rejected": -2.8482604026794434, + "step": 920 + }, + { + "epoch": 0.24, + "grad_norm": 8.8125, + "learning_rate": 4.694147707194659e-06, + "logits/chosen": -1.1987128257751465, + "logits/rejected": -1.1085574626922607, + "logps/chosen": -479.1398010253906, + "logps/rejected": -532.23828125, + "loss": 0.5295, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.0726170539855957, + "rewards/margins": 0.7290612459182739, + "rewards/rejected": -2.80167818069458, + "step": 930 + }, + { + "epoch": 0.25, + "grad_norm": 7.3125, + "learning_rate": 4.683107158658782e-06, + "logits/chosen": -1.1448571681976318, + "logits/rejected": -1.0365805625915527, + "logps/chosen": -478.0250549316406, + "logps/rejected": -530.4112548828125, + "loss": 0.5083, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.8778432607650757, + "rewards/margins": 0.811355471611023, + "rewards/rejected": -2.6891987323760986, + "step": 940 + }, + { + "epoch": 0.25, + "grad_norm": 9.0625, + "learning_rate": 4.671884321303407e-06, + "logits/chosen": -1.2020542621612549, + "logits/rejected": -1.0928010940551758, + "logps/chosen": -440.04864501953125, + "logps/rejected": -496.198486328125, + "loss": 0.5249, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.9372259378433228, + "rewards/margins": 0.7049869298934937, + "rewards/rejected": -2.6422126293182373, + "step": 950 + }, + { + "epoch": 0.25, + "grad_norm": 6.875, + "learning_rate": 4.660480132232224e-06, + "logits/chosen": -1.2815606594085693, + "logits/rejected": -1.1846911907196045, + "logps/chosen": -445.06915283203125, + "logps/rejected": -479.39093017578125, + "loss": 0.5773, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.7325608730316162, + "rewards/margins": 0.5843728184700012, + "rewards/rejected": -2.3169338703155518, + "step": 960 + }, + { + "epoch": 0.25, + "grad_norm": 9.6875, + "learning_rate": 4.6488955436917414e-06, + "logits/chosen": -1.3540565967559814, + "logits/rejected": -1.1343624591827393, + "logps/chosen": -444.31640625, + "logps/rejected": -482.2098083496094, + "loss": 0.5099, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.6433531045913696, + "rewards/margins": 0.8446812629699707, + "rewards/rejected": -2.48803448677063, + "step": 970 + }, + { + "epoch": 0.26, + "grad_norm": 5.75, + "learning_rate": 4.6371315229917644e-06, + "logits/chosen": -1.3197797536849976, + "logits/rejected": -1.1996195316314697, + "logps/chosen": -457.05712890625, + "logps/rejected": -514.72802734375, + "loss": 0.5217, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.7364044189453125, + "rewards/margins": 0.780579149723053, + "rewards/rejected": -2.5169835090637207, + "step": 980 + }, + { + "epoch": 0.26, + "grad_norm": 13.6875, + "learning_rate": 4.625189052424638e-06, + "logits/chosen": -1.2102200984954834, + "logits/rejected": -1.0647470951080322, + "logps/chosen": -436.97991943359375, + "logps/rejected": -520.3751220703125, + "loss": 0.4535, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.9787667989730835, + "rewards/margins": 1.061232328414917, + "rewards/rejected": -3.039999008178711, + "step": 990 + }, + { + "epoch": 0.26, + "grad_norm": 8.25, + "learning_rate": 4.613069129183218e-06, + "logits/chosen": -1.240464687347412, + "logits/rejected": -1.0879384279251099, + "logps/chosen": -531.1487426757812, + "logps/rejected": -574.3619384765625, + "loss": 0.5553, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.2774546146392822, + "rewards/margins": 0.7940423488616943, + "rewards/rejected": -3.0714969635009766, + "step": 1000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -0.981342613697052, + "eval_logits/rejected": -0.8557386994361877, + "eval_logps/chosen": -482.09930419921875, + "eval_logps/rejected": -548.8490600585938, + "eval_loss": 0.5178083777427673, + "eval_rewards/accuracies": 0.7315000295639038, + "eval_rewards/chosen": -2.1744725704193115, + "eval_rewards/margins": 0.8679391145706177, + "eval_rewards/rejected": -3.0424115657806396, + "eval_runtime": 382.1372, + "eval_samples_per_second": 5.234, + "eval_steps_per_second": 0.654, + "step": 1000 + }, + { + "epoch": 0.26, + "grad_norm": 8.0, + "learning_rate": 4.600772765277607e-06, + "logits/chosen": -1.0305756330490112, + "logits/rejected": -0.9370132684707642, + "logps/chosen": -448.99493408203125, + "logps/rejected": -530.3275146484375, + "loss": 0.4913, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.154376983642578, + "rewards/margins": 0.8647212982177734, + "rewards/rejected": -3.0190985202789307, + "step": 1010 + }, + { + "epoch": 0.27, + "grad_norm": 16.75, + "learning_rate": 4.588300987450652e-06, + "logits/chosen": -1.0989015102386475, + "logits/rejected": -0.9851810336112976, + "logps/chosen": -443.59423828125, + "logps/rejected": -486.5970764160156, + "loss": 0.5542, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8985025882720947, + "rewards/margins": 0.7655047178268433, + "rewards/rejected": -2.6640071868896484, + "step": 1020 + }, + { + "epoch": 0.27, + "grad_norm": 5.6875, + "learning_rate": 4.5756548370922136e-06, + "logits/chosen": -1.0507217645645142, + "logits/rejected": -0.9594799280166626, + "logps/chosen": -405.2181091308594, + "logps/rejected": -487.1499938964844, + "loss": 0.4835, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.6451423168182373, + "rewards/margins": 0.9089698791503906, + "rewards/rejected": -2.554112434387207, + "step": 1030 + }, + { + "epoch": 0.27, + "grad_norm": 13.5625, + "learning_rate": 4.562835370152206e-06, + "logits/chosen": -1.0573441982269287, + "logits/rejected": -0.8775628209114075, + "logps/chosen": -527.5038452148438, + "logps/rejected": -620.2794189453125, + "loss": 0.4742, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2627432346343994, + "rewards/margins": 1.2387964725494385, + "rewards/rejected": -3.501539707183838, + "step": 1040 + }, + { + "epoch": 0.27, + "grad_norm": 8.8125, + "learning_rate": 4.54984365705243e-06, + "logits/chosen": -0.9812475442886353, + "logits/rejected": -0.8811472654342651, + "logps/chosen": -502.1786193847656, + "logps/rejected": -618.7202758789062, + "loss": 0.4971, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.337085485458374, + "rewards/margins": 1.2312263250350952, + "rewards/rejected": -3.5683116912841797, + "step": 1050 + }, + { + "epoch": 0.28, + "grad_norm": 9.0, + "learning_rate": 4.536680782597191e-06, + "logits/chosen": -0.9585447311401367, + "logits/rejected": -0.8763798475265503, + "logps/chosen": -443.18878173828125, + "logps/rejected": -523.16015625, + "loss": 0.6028, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0716359615325928, + "rewards/margins": 0.855958104133606, + "rewards/rejected": -2.9275941848754883, + "step": 1060 + }, + { + "epoch": 0.28, + "grad_norm": 15.4375, + "learning_rate": 4.523347845882718e-06, + "logits/chosen": -1.122159481048584, + "logits/rejected": -0.9293369054794312, + "logps/chosen": -494.13037109375, + "logps/rejected": -562.1329345703125, + "loss": 0.4613, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.0596017837524414, + "rewards/margins": 1.1728570461273193, + "rewards/rejected": -3.2324588298797607, + "step": 1070 + }, + { + "epoch": 0.28, + "grad_norm": 8.125, + "learning_rate": 4.50984596020539e-06, + "logits/chosen": -0.8647342920303345, + "logits/rejected": -0.826617419719696, + "logps/chosen": -561.8629760742188, + "logps/rejected": -615.0023193359375, + "loss": 0.5557, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8171119689941406, + "rewards/margins": 0.8539352416992188, + "rewards/rejected": -3.6710472106933594, + "step": 1080 + }, + { + "epoch": 0.29, + "grad_norm": 9.0, + "learning_rate": 4.4961762529687745e-06, + "logits/chosen": -1.0336081981658936, + "logits/rejected": -0.9252422451972961, + "logps/chosen": -563.8508911132812, + "logps/rejected": -638.390869140625, + "loss": 0.4855, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.0344927310943604, + "rewards/margins": 0.9103133082389832, + "rewards/rejected": -3.944805860519409, + "step": 1090 + }, + { + "epoch": 0.29, + "grad_norm": 6.9375, + "learning_rate": 4.482339865589492e-06, + "logits/chosen": -1.0671048164367676, + "logits/rejected": -0.9094209671020508, + "logps/chosen": -568.4443359375, + "logps/rejected": -596.6480712890625, + "loss": 0.5994, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.021576404571533, + "rewards/margins": 0.7217450141906738, + "rewards/rejected": -3.743321180343628, + "step": 1100 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -0.8895747661590576, + "eval_logits/rejected": -0.7614892721176147, + "eval_logps/chosen": -514.6676635742188, + "eval_logps/rejected": -577.3698120117188, + "eval_loss": 0.520658552646637, + "eval_rewards/accuracies": 0.7300000190734863, + "eval_rewards/chosen": -2.5001566410064697, + "eval_rewards/margins": 0.8274616599082947, + "eval_rewards/rejected": -3.3276185989379883, + "eval_runtime": 382.1502, + "eval_samples_per_second": 5.234, + "eval_steps_per_second": 0.654, + "step": 1100 + }, + { + "epoch": 0.29, + "grad_norm": 6.625, + "learning_rate": 4.468337953401909e-06, + "logits/chosen": -1.1065692901611328, + "logits/rejected": -1.0572447776794434, + "logps/chosen": -495.5409240722656, + "logps/rejected": -552.65966796875, + "loss": 0.5707, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2518980503082275, + "rewards/margins": 0.61982262134552, + "rewards/rejected": -2.871720790863037, + "step": 1110 + }, + { + "epoch": 0.29, + "grad_norm": 8.875, + "learning_rate": 4.45417168556166e-06, + "logits/chosen": -1.0463123321533203, + "logits/rejected": -0.9469770193099976, + "logps/chosen": -435.6727600097656, + "logps/rejected": -518.3145751953125, + "loss": 0.5007, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9452159404754639, + "rewards/margins": 0.8327676057815552, + "rewards/rejected": -2.7779834270477295, + "step": 1120 + }, + { + "epoch": 0.3, + "grad_norm": 9.6875, + "learning_rate": 4.439842244948036e-06, + "logits/chosen": -1.0293817520141602, + "logits/rejected": -0.8690570592880249, + "logps/chosen": -486.1783142089844, + "logps/rejected": -559.431396484375, + "loss": 0.5565, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2511630058288574, + "rewards/margins": 0.7881690263748169, + "rewards/rejected": -3.0393319129943848, + "step": 1130 + }, + { + "epoch": 0.3, + "grad_norm": 14.5, + "learning_rate": 4.425350828065204e-06, + "logits/chosen": -1.0534614324569702, + "logits/rejected": -0.8575074076652527, + "logps/chosen": -497.90167236328125, + "logps/rejected": -537.9634399414062, + "loss": 0.4913, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.1381561756134033, + "rewards/margins": 0.8793197870254517, + "rewards/rejected": -3.0174758434295654, + "step": 1140 + }, + { + "epoch": 0.3, + "grad_norm": 9.5625, + "learning_rate": 4.410698644942303e-06, + "logits/chosen": -1.0756770372390747, + "logits/rejected": -0.9290148615837097, + "logps/chosen": -489.197265625, + "logps/rejected": -558.8743286132812, + "loss": 0.4893, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.1688458919525146, + "rewards/margins": 0.9360774755477905, + "rewards/rejected": -3.1049234867095947, + "step": 1150 + }, + { + "epoch": 0.3, + "grad_norm": 12.25, + "learning_rate": 4.395886919032406e-06, + "logits/chosen": -0.9989307522773743, + "logits/rejected": -0.8515041470527649, + "logps/chosen": -480.94183349609375, + "logps/rejected": -542.0136108398438, + "loss": 0.5419, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.1710543632507324, + "rewards/margins": 0.8757139444351196, + "rewards/rejected": -3.0467686653137207, + "step": 1160 + }, + { + "epoch": 0.31, + "grad_norm": 8.625, + "learning_rate": 4.380916887110366e-06, + "logits/chosen": -1.1318533420562744, + "logits/rejected": -0.9459112286567688, + "logps/chosen": -481.12335205078125, + "logps/rejected": -544.0623779296875, + "loss": 0.5083, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2195496559143066, + "rewards/margins": 1.032907247543335, + "rewards/rejected": -3.2524571418762207, + "step": 1170 + }, + { + "epoch": 0.31, + "grad_norm": 9.3125, + "learning_rate": 4.365789799169539e-06, + "logits/chosen": -0.9683933258056641, + "logits/rejected": -1.0098755359649658, + "logps/chosen": -474.65283203125, + "logps/rejected": -566.4153442382812, + "loss": 0.5468, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.280418872833252, + "rewards/margins": 0.8640033006668091, + "rewards/rejected": -3.1444220542907715, + "step": 1180 + }, + { + "epoch": 0.31, + "grad_norm": 11.9375, + "learning_rate": 4.350506918317416e-06, + "logits/chosen": -1.1871801614761353, + "logits/rejected": -1.0333930253982544, + "logps/chosen": -443.02716064453125, + "logps/rejected": -521.8514404296875, + "loss": 0.5037, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.9543129205703735, + "rewards/margins": 0.8601529002189636, + "rewards/rejected": -2.8144659996032715, + "step": 1190 + }, + { + "epoch": 0.31, + "grad_norm": 9.0, + "learning_rate": 4.335069520670149e-06, + "logits/chosen": -0.9967072606086731, + "logits/rejected": -0.9244716763496399, + "logps/chosen": -455.01959228515625, + "logps/rejected": -528.6710205078125, + "loss": 0.5976, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2530674934387207, + "rewards/margins": 0.6545962691307068, + "rewards/rejected": -2.907663583755493, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -0.9595763087272644, + "eval_logits/rejected": -0.8350398540496826, + "eval_logps/chosen": -482.9834289550781, + "eval_logps/rejected": -543.660400390625, + "eval_loss": 0.5098230838775635, + "eval_rewards/accuracies": 0.7365000247955322, + "eval_rewards/chosen": -2.183314323425293, + "eval_rewards/margins": 0.8072100281715393, + "eval_rewards/rejected": -2.9905245304107666, + "eval_runtime": 382.4857, + "eval_samples_per_second": 5.229, + "eval_steps_per_second": 0.654, + "step": 1200 + }, + { + "epoch": 0.32, + "grad_norm": 6.40625, + "learning_rate": 4.319478895246e-06, + "logits/chosen": -1.070488691329956, + "logits/rejected": -0.886951744556427, + "logps/chosen": -466.0955505371094, + "logps/rejected": -520.3566284179688, + "loss": 0.4951, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.189800977706909, + "rewards/margins": 0.7895106077194214, + "rewards/rejected": -2.979311466217041, + "step": 1210 + }, + { + "epoch": 0.32, + "grad_norm": 11.0, + "learning_rate": 4.303736343857704e-06, + "logits/chosen": -1.0415198802947998, + "logits/rejected": -0.9387828707695007, + "logps/chosen": -499.1920471191406, + "logps/rejected": -617.3883666992188, + "loss": 0.4881, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.420851230621338, + "rewards/margins": 1.062877893447876, + "rewards/rejected": -3.483729124069214, + "step": 1220 + }, + { + "epoch": 0.32, + "grad_norm": 10.0, + "learning_rate": 4.287843181003772e-06, + "logits/chosen": -1.0625154972076416, + "logits/rejected": -0.9172189831733704, + "logps/chosen": -579.9913330078125, + "logps/rejected": -610.0975341796875, + "loss": 0.5905, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8613951206207275, + "rewards/margins": 0.7642954587936401, + "rewards/rejected": -3.6256909370422363, + "step": 1230 + }, + { + "epoch": 0.32, + "grad_norm": 7.59375, + "learning_rate": 4.27180073375873e-06, + "logits/chosen": -1.1162028312683105, + "logits/rejected": -0.9976137280464172, + "logps/chosen": -525.2400512695312, + "logps/rejected": -569.8626708984375, + "loss": 0.5269, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.3787271976470947, + "rewards/margins": 0.8617948293685913, + "rewards/rejected": -3.2405219078063965, + "step": 1240 + }, + { + "epoch": 0.33, + "grad_norm": 5.625, + "learning_rate": 4.255610341662304e-06, + "logits/chosen": -1.144928216934204, + "logits/rejected": -0.9519325494766235, + "logps/chosen": -472.40087890625, + "logps/rejected": -529.2858276367188, + "loss": 0.5525, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.171128511428833, + "rewards/margins": 0.767959475517273, + "rewards/rejected": -2.9390883445739746, + "step": 1250 + }, + { + "epoch": 0.33, + "grad_norm": 8.625, + "learning_rate": 4.2392733566075764e-06, + "logits/chosen": -1.11684250831604, + "logits/rejected": -0.9831358194351196, + "logps/chosen": -500.71484375, + "logps/rejected": -542.6422119140625, + "loss": 0.5654, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.447084903717041, + "rewards/margins": 0.5746163129806519, + "rewards/rejected": -3.0217010974884033, + "step": 1260 + }, + { + "epoch": 0.33, + "grad_norm": 7.65625, + "learning_rate": 4.2227911427280975e-06, + "logits/chosen": -1.0659453868865967, + "logits/rejected": -0.899361252784729, + "logps/chosen": -475.46148681640625, + "logps/rejected": -525.0037841796875, + "loss": 0.5081, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.251559257507324, + "rewards/margins": 0.823780357837677, + "rewards/rejected": -3.0753397941589355, + "step": 1270 + }, + { + "epoch": 0.33, + "grad_norm": 11.4375, + "learning_rate": 4.206165076283983e-06, + "logits/chosen": -1.096620798110962, + "logits/rejected": -0.9550498127937317, + "logps/chosen": -487.46136474609375, + "logps/rejected": -576.1992797851562, + "loss": 0.461, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.4152817726135254, + "rewards/margins": 1.0981849431991577, + "rewards/rejected": -3.5134663581848145, + "step": 1280 + }, + { + "epoch": 0.34, + "grad_norm": 10.6875, + "learning_rate": 4.189396545546995e-06, + "logits/chosen": -1.0538244247436523, + "logits/rejected": -0.9361982345581055, + "logps/chosen": -522.2523193359375, + "logps/rejected": -610.1349487304688, + "loss": 0.5054, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.730778217315674, + "rewards/margins": 1.0780103206634521, + "rewards/rejected": -3.808788776397705, + "step": 1290 + }, + { + "epoch": 0.34, + "grad_norm": 13.125, + "learning_rate": 4.172486950684627e-06, + "logits/chosen": -1.0185925960540771, + "logits/rejected": -0.9584161639213562, + "logps/chosen": -538.3131103515625, + "logps/rejected": -635.578369140625, + "loss": 0.5237, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.846707820892334, + "rewards/margins": 1.0040740966796875, + "rewards/rejected": -3.8507816791534424, + "step": 1300 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -0.825871467590332, + "eval_logits/rejected": -0.7071986198425293, + "eval_logps/chosen": -574.3861694335938, + "eval_logps/rejected": -660.885009765625, + "eval_loss": 0.5165807008743286, + "eval_rewards/accuracies": 0.7350000143051147, + "eval_rewards/chosen": -3.097341775894165, + "eval_rewards/margins": 1.0654287338256836, + "eval_rewards/rejected": -4.162771224975586, + "eval_runtime": 382.0912, + "eval_samples_per_second": 5.234, + "eval_steps_per_second": 0.654, + "step": 1300 + }, + { + "epoch": 0.34, + "grad_norm": 11.625, + "learning_rate": 4.155437703643182e-06, + "logits/chosen": -1.0443698167800903, + "logits/rejected": -0.8676601648330688, + "logps/chosen": -536.4607543945312, + "logps/rejected": -606.3543701171875, + "loss": 0.5075, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8971712589263916, + "rewards/margins": 0.9897411465644836, + "rewards/rejected": -3.8869121074676514, + "step": 1310 + }, + { + "epoch": 0.35, + "grad_norm": 11.375, + "learning_rate": 4.138250228029882e-06, + "logits/chosen": -1.000579595565796, + "logits/rejected": -0.9191876649856567, + "logps/chosen": -538.9154052734375, + "logps/rejected": -649.7552490234375, + "loss": 0.4767, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8198482990264893, + "rewards/margins": 1.0736055374145508, + "rewards/rejected": -3.893454074859619, + "step": 1320 + }, + { + "epoch": 0.35, + "grad_norm": 7.6875, + "learning_rate": 4.120925958993994e-06, + "logits/chosen": -0.9208280444145203, + "logits/rejected": -0.8555585741996765, + "logps/chosen": -512.56787109375, + "logps/rejected": -604.376220703125, + "loss": 0.5584, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.786665439605713, + "rewards/margins": 0.9612969160079956, + "rewards/rejected": -3.747962474822998, + "step": 1330 + }, + { + "epoch": 0.35, + "grad_norm": 14.0, + "learning_rate": 4.103466343106999e-06, + "logits/chosen": -1.1172326803207397, + "logits/rejected": -0.9976350665092468, + "logps/chosen": -514.8595581054688, + "logps/rejected": -575.3850708007812, + "loss": 0.5422, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4547386169433594, + "rewards/margins": 0.8639480471611023, + "rewards/rejected": -3.3186867237091064, + "step": 1340 + }, + { + "epoch": 0.35, + "grad_norm": 10.125, + "learning_rate": 4.085872838241797e-06, + "logits/chosen": -1.0706989765167236, + "logits/rejected": -0.9391083717346191, + "logps/chosen": -489.779296875, + "logps/rejected": -538.4210815429688, + "loss": 0.5948, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.274151563644409, + "rewards/margins": 0.6873086094856262, + "rewards/rejected": -2.9614596366882324, + "step": 1350 + }, + { + "epoch": 0.36, + "grad_norm": 11.125, + "learning_rate": 4.06814691345098e-06, + "logits/chosen": -1.0508559942245483, + "logits/rejected": -0.9001902341842651, + "logps/chosen": -451.5694274902344, + "logps/rejected": -517.9208984375, + "loss": 0.4809, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.9602162837982178, + "rewards/margins": 0.884141743183136, + "rewards/rejected": -2.844357967376709, + "step": 1360 + }, + { + "epoch": 0.36, + "grad_norm": 14.125, + "learning_rate": 4.050290048844171e-06, + "logits/chosen": -1.129167914390564, + "logits/rejected": -1.0560190677642822, + "logps/chosen": -474.2417907714844, + "logps/rejected": -552.0899047851562, + "loss": 0.5423, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.056283473968506, + "rewards/margins": 0.8298514485359192, + "rewards/rejected": -2.886134624481201, + "step": 1370 + }, + { + "epoch": 0.36, + "grad_norm": 9.5, + "learning_rate": 4.032303735464422e-06, + "logits/chosen": -1.1856621503829956, + "logits/rejected": -0.9643325805664062, + "logps/chosen": -502.15814208984375, + "logps/rejected": -594.064208984375, + "loss": 0.452, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.301772356033325, + "rewards/margins": 1.1499149799346924, + "rewards/rejected": -3.4516875743865967, + "step": 1380 + }, + { + "epoch": 0.36, + "grad_norm": 11.6875, + "learning_rate": 4.014189475163727e-06, + "logits/chosen": -0.96733558177948, + "logits/rejected": -0.853344738483429, + "logps/chosen": -489.39990234375, + "logps/rejected": -597.2086181640625, + "loss": 0.4757, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.3474299907684326, + "rewards/margins": 1.1593117713928223, + "rewards/rejected": -3.506741762161255, + "step": 1390 + }, + { + "epoch": 0.37, + "grad_norm": 12.75, + "learning_rate": 3.995948780477605e-06, + "logits/chosen": -1.1000730991363525, + "logits/rejected": -0.9693312644958496, + "logps/chosen": -477.19549560546875, + "logps/rejected": -542.30615234375, + "loss": 0.516, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.0844216346740723, + "rewards/margins": 0.8978837132453918, + "rewards/rejected": -2.9823052883148193, + "step": 1400 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -0.9127845168113708, + "eval_logits/rejected": -0.7864713668823242, + "eval_logps/chosen": -474.74249267578125, + "eval_logps/rejected": -551.2366943359375, + "eval_loss": 0.5107593536376953, + "eval_rewards/accuracies": 0.7350000143051147, + "eval_rewards/chosen": -2.100904941558838, + "eval_rewards/margins": 0.9653825163841248, + "eval_rewards/rejected": -3.0662872791290283, + "eval_runtime": 381.6083, + "eval_samples_per_second": 5.241, + "eval_steps_per_second": 0.655, + "step": 1400 + }, + { + "epoch": 0.37, + "grad_norm": 10.25, + "learning_rate": 3.977583174498816e-06, + "logits/chosen": -1.017508864402771, + "logits/rejected": -0.8959487676620483, + "logps/chosen": -488.11810302734375, + "logps/rejected": -602.2122802734375, + "loss": 0.3715, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.244345188140869, + "rewards/margins": 1.360781192779541, + "rewards/rejected": -3.6051268577575684, + "step": 1410 + }, + { + "epoch": 0.37, + "grad_norm": 12.125, + "learning_rate": 3.959094190750172e-06, + "logits/chosen": -1.0074245929718018, + "logits/rejected": -0.868901252746582, + "logps/chosen": -552.512939453125, + "logps/rejected": -637.4674072265625, + "loss": 0.4966, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6735260486602783, + "rewards/margins": 1.1185749769210815, + "rewards/rejected": -3.7921009063720703, + "step": 1420 + }, + { + "epoch": 0.37, + "grad_norm": 11.6875, + "learning_rate": 3.9404833730564975e-06, + "logits/chosen": -0.8478316068649292, + "logits/rejected": -0.7511281967163086, + "logps/chosen": -535.4224853515625, + "logps/rejected": -637.5137329101562, + "loss": 0.494, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.823219060897827, + "rewards/margins": 1.1367390155792236, + "rewards/rejected": -3.9599578380584717, + "step": 1430 + }, + { + "epoch": 0.38, + "grad_norm": 17.125, + "learning_rate": 3.921752275415712e-06, + "logits/chosen": -0.9650063514709473, + "logits/rejected": -0.8631266355514526, + "logps/chosen": -534.4532470703125, + "logps/rejected": -645.3438720703125, + "loss": 0.4351, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8391730785369873, + "rewards/margins": 1.3146858215332031, + "rewards/rejected": -4.1538591384887695, + "step": 1440 + }, + { + "epoch": 0.38, + "grad_norm": 6.53125, + "learning_rate": 3.902902461869079e-06, + "logits/chosen": -0.9252153635025024, + "logits/rejected": -0.7948675751686096, + "logps/chosen": -540.6839599609375, + "logps/rejected": -642.1290283203125, + "loss": 0.5532, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0117030143737793, + "rewards/margins": 1.17899751663208, + "rewards/rejected": -4.190700531005859, + "step": 1450 + }, + { + "epoch": 0.38, + "grad_norm": 13.875, + "learning_rate": 3.883935506370605e-06, + "logits/chosen": -0.9731215238571167, + "logits/rejected": -0.8713979721069336, + "logps/chosen": -526.899658203125, + "logps/rejected": -591.6453857421875, + "loss": 0.5396, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.736586570739746, + "rewards/margins": 0.9257469177246094, + "rewards/rejected": -3.6623332500457764, + "step": 1460 + }, + { + "epoch": 0.38, + "grad_norm": 5.0625, + "learning_rate": 3.864852992655617e-06, + "logits/chosen": -1.115800380706787, + "logits/rejected": -1.0172771215438843, + "logps/chosen": -478.37420654296875, + "logps/rejected": -573.0581665039062, + "loss": 0.4365, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.2973954677581787, + "rewards/margins": 1.069636344909668, + "rewards/rejected": -3.3670318126678467, + "step": 1470 + }, + { + "epoch": 0.39, + "grad_norm": 7.0625, + "learning_rate": 3.845656514108516e-06, + "logits/chosen": -1.0454566478729248, + "logits/rejected": -0.8997499346733093, + "logps/chosen": -511.357177734375, + "logps/rejected": -557.3446655273438, + "loss": 0.4913, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.522265672683716, + "rewards/margins": 1.0096194744110107, + "rewards/rejected": -3.5318856239318848, + "step": 1480 + }, + { + "epoch": 0.39, + "grad_norm": 8.125, + "learning_rate": 3.826347673629738e-06, + "logits/chosen": -1.0593020915985107, + "logits/rejected": -0.8929145932197571, + "logps/chosen": -473.79302978515625, + "logps/rejected": -565.4286499023438, + "loss": 0.4657, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2048957347869873, + "rewards/margins": 1.1790317296981812, + "rewards/rejected": -3.3839271068573, + "step": 1490 + }, + { + "epoch": 0.39, + "grad_norm": 12.0625, + "learning_rate": 3.8069280835019062e-06, + "logits/chosen": -1.116262674331665, + "logits/rejected": -0.9613265991210938, + "logps/chosen": -477.24810791015625, + "logps/rejected": -587.962646484375, + "loss": 0.4593, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1293580532073975, + "rewards/margins": 1.2989779710769653, + "rewards/rejected": -3.4283363819122314, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -1.0210601091384888, + "eval_logits/rejected": -0.8902665972709656, + "eval_logps/chosen": -496.3184509277344, + "eval_logps/rejected": -587.1505737304688, + "eval_loss": 0.5173963308334351, + "eval_rewards/accuracies": 0.7304999828338623, + "eval_rewards/chosen": -2.316664218902588, + "eval_rewards/margins": 1.1087615489959717, + "eval_rewards/rejected": -3.4254260063171387, + "eval_runtime": 382.2649, + "eval_samples_per_second": 5.232, + "eval_steps_per_second": 0.654, + "step": 1500 + }, + { + "epoch": 0.4, + "grad_norm": 13.5, + "learning_rate": 3.7873993652552077e-06, + "logits/chosen": -1.0803442001342773, + "logits/rejected": -0.9917434453964233, + "logps/chosen": -461.2118225097656, + "logps/rejected": -549.1537475585938, + "loss": 0.593, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.274977922439575, + "rewards/margins": 0.9378048777580261, + "rewards/rejected": -3.212782621383667, + "step": 1510 + }, + { + "epoch": 0.4, + "grad_norm": 8.3125, + "learning_rate": 3.7677631495319953e-06, + "logits/chosen": -1.2474887371063232, + "logits/rejected": -1.145392656326294, + "logps/chosen": -428.1084899902344, + "logps/rejected": -485.67694091796875, + "loss": 0.5245, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.6442362070083618, + "rewards/margins": 0.7559275031089783, + "rewards/rejected": -2.4001636505126953, + "step": 1520 + }, + { + "epoch": 0.4, + "grad_norm": 6.75, + "learning_rate": 3.748021075950633e-06, + "logits/chosen": -1.3161629438400269, + "logits/rejected": -1.232714295387268, + "logps/chosen": -440.6031188964844, + "logps/rejected": -481.67926025390625, + "loss": 0.5983, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6595981121063232, + "rewards/margins": 0.5171489119529724, + "rewards/rejected": -2.1767468452453613, + "step": 1530 + }, + { + "epoch": 0.4, + "grad_norm": 10.625, + "learning_rate": 3.7281747929685824e-06, + "logits/chosen": -1.132124662399292, + "logits/rejected": -1.0095793008804321, + "logps/chosen": -423.98553466796875, + "logps/rejected": -478.41015625, + "loss": 0.5368, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8633050918579102, + "rewards/margins": 0.7011392712593079, + "rewards/rejected": -2.5644445419311523, + "step": 1540 + }, + { + "epoch": 0.41, + "grad_norm": 8.625, + "learning_rate": 3.7082259577447604e-06, + "logits/chosen": -1.2295887470245361, + "logits/rejected": -1.1187238693237305, + "logps/chosen": -489.0294494628906, + "logps/rejected": -551.4732666015625, + "loss": 0.4858, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.1783862113952637, + "rewards/margins": 0.8242964744567871, + "rewards/rejected": -3.002682685852051, + "step": 1550 + }, + { + "epoch": 0.41, + "grad_norm": 10.0, + "learning_rate": 3.6881762360011688e-06, + "logits/chosen": -1.241201639175415, + "logits/rejected": -1.0382106304168701, + "logps/chosen": -548.8870849609375, + "logps/rejected": -611.2633666992188, + "loss": 0.4939, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6739068031311035, + "rewards/margins": 0.9938074350357056, + "rewards/rejected": -3.6677143573760986, + "step": 1560 + }, + { + "epoch": 0.41, + "grad_norm": 11.8125, + "learning_rate": 3.668027301883802e-06, + "logits/chosen": -1.154157280921936, + "logits/rejected": -1.0291301012039185, + "logps/chosen": -542.0028076171875, + "logps/rejected": -634.2523803710938, + "loss": 0.5002, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9064033031463623, + "rewards/margins": 1.071606993675232, + "rewards/rejected": -3.9780097007751465, + "step": 1570 + }, + { + "epoch": 0.41, + "grad_norm": 5.46875, + "learning_rate": 3.64778083782286e-06, + "logits/chosen": -1.0966026782989502, + "logits/rejected": -1.084398627281189, + "logps/chosen": -548.9720458984375, + "logps/rejected": -668.5007934570312, + "loss": 0.5301, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.963653087615967, + "rewards/margins": 0.9051497578620911, + "rewards/rejected": -3.868802547454834, + "step": 1580 + }, + { + "epoch": 0.42, + "grad_norm": 9.6875, + "learning_rate": 3.627438534392268e-06, + "logits/chosen": -1.2072285413742065, + "logits/rejected": -1.1841914653778076, + "logps/chosen": -524.2724609375, + "logps/rejected": -635.7026977539062, + "loss": 0.483, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.862274169921875, + "rewards/margins": 1.047090768814087, + "rewards/rejected": -3.909365177154541, + "step": 1590 + }, + { + "epoch": 0.42, + "grad_norm": 7.21875, + "learning_rate": 3.607002090168506e-06, + "logits/chosen": -1.0932730436325073, + "logits/rejected": -1.0192008018493652, + "logps/chosen": -579.1436157226562, + "logps/rejected": -652.6798095703125, + "loss": 0.5545, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.1483500003814697, + "rewards/margins": 0.9495010375976562, + "rewards/rejected": -4.097850799560547, + "step": 1600 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -1.0082374811172485, + "eval_logits/rejected": -0.8800999522209167, + "eval_logps/chosen": -564.0355224609375, + "eval_logps/rejected": -652.812255859375, + "eval_loss": 0.5032184720039368, + "eval_rewards/accuracies": 0.7369999885559082, + "eval_rewards/chosen": -2.99383544921875, + "eval_rewards/margins": 1.088207483291626, + "eval_rewards/rejected": -4.082043170928955, + "eval_runtime": 381.8998, + "eval_samples_per_second": 5.237, + "eval_steps_per_second": 0.655, + "step": 1600 + }, + { + "epoch": 0.42, + "grad_norm": 6.71875, + "learning_rate": 3.586473211588787e-06, + "logits/chosen": -1.1385810375213623, + "logits/rejected": -1.0679770708084106, + "logps/chosen": -523.4324340820312, + "logps/rejected": -647.1407470703125, + "loss": 0.4495, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.787372350692749, + "rewards/margins": 1.170562744140625, + "rewards/rejected": -3.957934856414795, + "step": 1610 + }, + { + "epoch": 0.42, + "grad_norm": 13.0, + "learning_rate": 3.5658536128085623e-06, + "logits/chosen": -1.1914455890655518, + "logits/rejected": -1.0186755657196045, + "logps/chosen": -572.4912719726562, + "logps/rejected": -637.8251953125, + "loss": 0.5878, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.0957980155944824, + "rewards/margins": 0.9488485455513, + "rewards/rejected": -4.044646263122559, + "step": 1620 + }, + { + "epoch": 0.43, + "grad_norm": 10.4375, + "learning_rate": 3.545145015558399e-06, + "logits/chosen": -0.9681538343429565, + "logits/rejected": -0.9621971249580383, + "logps/chosen": -520.1128540039062, + "logps/rejected": -614.5860595703125, + "loss": 0.5109, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8847546577453613, + "rewards/margins": 1.0869688987731934, + "rewards/rejected": -3.971724271774292, + "step": 1630 + }, + { + "epoch": 0.43, + "grad_norm": 5.46875, + "learning_rate": 3.5243491490002056e-06, + "logits/chosen": -1.09974205493927, + "logits/rejected": -1.019108533859253, + "logps/chosen": -545.1671142578125, + "logps/rejected": -630.2543334960938, + "loss": 0.5719, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9147398471832275, + "rewards/margins": 0.9028825759887695, + "rewards/rejected": -3.817622423171997, + "step": 1640 + }, + { + "epoch": 0.43, + "grad_norm": 8.3125, + "learning_rate": 3.503467749582857e-06, + "logits/chosen": -1.1649540662765503, + "logits/rejected": -0.9812711477279663, + "logps/chosen": -496.32757568359375, + "logps/rejected": -530.1451416015625, + "loss": 0.5901, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.4510443210601807, + "rewards/margins": 0.6782389879226685, + "rewards/rejected": -3.1292831897735596, + "step": 1650 + }, + { + "epoch": 0.43, + "grad_norm": 11.0, + "learning_rate": 3.4825025608971947e-06, + "logits/chosen": -1.0830554962158203, + "logits/rejected": -1.0159814357757568, + "logps/chosen": -442.962646484375, + "logps/rejected": -521.5462646484375, + "loss": 0.5191, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.2101898193359375, + "rewards/margins": 0.7478699684143066, + "rewards/rejected": -2.958059549331665, + "step": 1660 + }, + { + "epoch": 0.44, + "grad_norm": 7.40625, + "learning_rate": 3.4614553335304407e-06, + "logits/chosen": -1.1321473121643066, + "logits/rejected": -0.9186077117919922, + "logps/chosen": -502.3970642089844, + "logps/rejected": -575.6217041015625, + "loss": 0.4608, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.3348631858825684, + "rewards/margins": 1.0501439571380615, + "rewards/rejected": -3.385007381439209, + "step": 1670 + }, + { + "epoch": 0.44, + "grad_norm": 9.625, + "learning_rate": 3.4403278249200222e-06, + "logits/chosen": -1.1406095027923584, + "logits/rejected": -0.9287969470024109, + "logps/chosen": -519.1994018554688, + "logps/rejected": -603.8717041015625, + "loss": 0.4608, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.365922689437866, + "rewards/margins": 1.2659895420074463, + "rewards/rejected": -3.6319122314453125, + "step": 1680 + }, + { + "epoch": 0.44, + "grad_norm": 16.5, + "learning_rate": 3.4191217992068293e-06, + "logits/chosen": -1.1879878044128418, + "logits/rejected": -0.9813734292984009, + "logps/chosen": -539.6956176757812, + "logps/rejected": -599.0775146484375, + "loss": 0.5446, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6155307292938232, + "rewards/margins": 1.0494682788848877, + "rewards/rejected": -3.664999008178711, + "step": 1690 + }, + { + "epoch": 0.44, + "grad_norm": 12.1875, + "learning_rate": 3.3978390270879056e-06, + "logits/chosen": -1.0190632343292236, + "logits/rejected": -0.9378607869148254, + "logps/chosen": -550.7818603515625, + "logps/rejected": -662.2818603515625, + "loss": 0.5425, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.4076619148254395, + "rewards/margins": 1.0471140146255493, + "rewards/rejected": -4.454775810241699, + "step": 1700 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -0.9685720205307007, + "eval_logits/rejected": -0.8382174968719482, + "eval_logps/chosen": -599.6095581054688, + "eval_logps/rejected": -685.2186889648438, + "eval_loss": 0.49963250756263733, + "eval_rewards/accuracies": 0.7404999732971191, + "eval_rewards/chosen": -3.349576234817505, + "eval_rewards/margins": 1.0565321445465088, + "eval_rewards/rejected": -4.406107425689697, + "eval_runtime": 382.4342, + "eval_samples_per_second": 5.23, + "eval_steps_per_second": 0.654, + "step": 1700 + }, + { + "epoch": 0.45, + "grad_norm": 11.75, + "learning_rate": 3.3764812856685995e-06, + "logits/chosen": -1.0968348979949951, + "logits/rejected": -1.0862301588058472, + "logps/chosen": -530.6864013671875, + "logps/rejected": -640.4039916992188, + "loss": 0.518, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0613017082214355, + "rewards/margins": 0.9621230959892273, + "rewards/rejected": -4.0234246253967285, + "step": 1710 + }, + { + "epoch": 0.45, + "grad_norm": 8.0, + "learning_rate": 3.3550503583141726e-06, + "logits/chosen": -1.2413816452026367, + "logits/rejected": -1.089429259300232, + "logps/chosen": -535.4332275390625, + "logps/rejected": -622.2586059570312, + "loss": 0.4864, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.66583251953125, + "rewards/margins": 1.01954185962677, + "rewards/rejected": -3.6853744983673096, + "step": 1720 + }, + { + "epoch": 0.45, + "grad_norm": 8.4375, + "learning_rate": 3.3335480345008907e-06, + "logits/chosen": -1.112958312034607, + "logits/rejected": -1.0259140729904175, + "logps/chosen": -486.234375, + "logps/rejected": -564.1868896484375, + "loss": 0.4673, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.260854721069336, + "rewards/margins": 1.0263946056365967, + "rewards/rejected": -3.2872490882873535, + "step": 1730 + }, + { + "epoch": 0.46, + "grad_norm": 10.4375, + "learning_rate": 3.3119761096666055e-06, + "logits/chosen": -1.1713676452636719, + "logits/rejected": -1.0070645809173584, + "logps/chosen": -514.056396484375, + "logps/rejected": -565.324951171875, + "loss": 0.5375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.390371084213257, + "rewards/margins": 0.8160451054573059, + "rewards/rejected": -3.206415891647339, + "step": 1740 + }, + { + "epoch": 0.46, + "grad_norm": 7.3125, + "learning_rate": 3.290336385060832e-06, + "logits/chosen": -1.3080298900604248, + "logits/rejected": -1.114485502243042, + "logps/chosen": -513.6076049804688, + "logps/rejected": -580.9697265625, + "loss": 0.5403, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6475276947021484, + "rewards/margins": 0.8753725290298462, + "rewards/rejected": -3.522900104522705, + "step": 1750 + }, + { + "epoch": 0.46, + "grad_norm": 10.75, + "learning_rate": 3.268630667594348e-06, + "logits/chosen": -1.1190599203109741, + "logits/rejected": -1.0877625942230225, + "logps/chosen": -520.4367065429688, + "logps/rejected": -593.3540649414062, + "loss": 0.51, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.6478748321533203, + "rewards/margins": 0.9716035723686218, + "rewards/rejected": -3.619478225708008, + "step": 1760 + }, + { + "epoch": 0.46, + "grad_norm": 10.1875, + "learning_rate": 3.2468607696883147e-06, + "logits/chosen": -1.1805906295776367, + "logits/rejected": -1.1239099502563477, + "logps/chosen": -522.7432861328125, + "logps/rejected": -629.3782958984375, + "loss": 0.4844, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.695678949356079, + "rewards/margins": 1.022963285446167, + "rewards/rejected": -3.718641996383667, + "step": 1770 + }, + { + "epoch": 0.47, + "grad_norm": 7.0625, + "learning_rate": 3.225028509122944e-06, + "logits/chosen": -1.2425084114074707, + "logits/rejected": -1.1278479099273682, + "logps/chosen": -481.4998474121094, + "logps/rejected": -560.8279418945312, + "loss": 0.5179, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.449826717376709, + "rewards/margins": 0.9064075350761414, + "rewards/rejected": -3.356234073638916, + "step": 1780 + }, + { + "epoch": 0.47, + "grad_norm": 13.9375, + "learning_rate": 3.2031357088857083e-06, + "logits/chosen": -1.2350413799285889, + "logits/rejected": -1.1462427377700806, + "logps/chosen": -549.2757568359375, + "logps/rejected": -646.181640625, + "loss": 0.5022, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7407171726226807, + "rewards/margins": 1.003739595413208, + "rewards/rejected": -3.7444565296173096, + "step": 1790 + }, + { + "epoch": 0.47, + "grad_norm": 14.625, + "learning_rate": 3.181184197019127e-06, + "logits/chosen": -0.9863433837890625, + "logits/rejected": -0.8817607164382935, + "logps/chosen": -533.1535034179688, + "logps/rejected": -698.6467895507812, + "loss": 0.4825, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.029388189315796, + "rewards/margins": 1.3928486108779907, + "rewards/rejected": -4.422236442565918, + "step": 1800 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -1.0005792379379272, + "eval_logits/rejected": -0.8737620115280151, + "eval_logps/chosen": -569.109130859375, + "eval_logps/rejected": -657.4884033203125, + "eval_loss": 0.503667414188385, + "eval_rewards/accuracies": 0.7379999756813049, + "eval_rewards/chosen": -3.0445713996887207, + "eval_rewards/margins": 1.0842331647872925, + "eval_rewards/rejected": -4.1288042068481445, + "eval_runtime": 382.2565, + "eval_samples_per_second": 5.232, + "eval_steps_per_second": 0.654, + "step": 1800 + }, + { + "epoch": 0.47, + "grad_norm": 14.3125, + "learning_rate": 3.159175806468126e-06, + "logits/chosen": -1.0082833766937256, + "logits/rejected": -0.8253539800643921, + "logps/chosen": -556.5079956054688, + "logps/rejected": -636.0127563476562, + "loss": 0.5015, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.07832407951355, + "rewards/margins": 1.0969042778015137, + "rewards/rejected": -4.175228595733643, + "step": 1810 + }, + { + "epoch": 0.48, + "grad_norm": 11.1875, + "learning_rate": 3.1371123749269804e-06, + "logits/chosen": -1.1307703256607056, + "logits/rejected": -1.0529394149780273, + "logps/chosen": -595.5393676757812, + "logps/rejected": -662.37158203125, + "loss": 0.5659, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.1214325428009033, + "rewards/margins": 0.8287679553031921, + "rewards/rejected": -3.950200319290161, + "step": 1820 + }, + { + "epoch": 0.48, + "grad_norm": 8.5625, + "learning_rate": 3.114995744685877e-06, + "logits/chosen": -1.07692551612854, + "logits/rejected": -1.0323340892791748, + "logps/chosen": -533.2166748046875, + "logps/rejected": -612.94140625, + "loss": 0.5153, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8589041233062744, + "rewards/margins": 0.9276583790779114, + "rewards/rejected": -3.786562442779541, + "step": 1830 + }, + { + "epoch": 0.48, + "grad_norm": 6.40625, + "learning_rate": 3.0928277624770743e-06, + "logits/chosen": -1.2703588008880615, + "logits/rejected": -1.0852762460708618, + "logps/chosen": -551.0806274414062, + "logps/rejected": -643.0982666015625, + "loss": 0.4817, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.663365125656128, + "rewards/margins": 1.2043039798736572, + "rewards/rejected": -3.8676695823669434, + "step": 1840 + }, + { + "epoch": 0.48, + "grad_norm": 6.8125, + "learning_rate": 3.070610279320708e-06, + "logits/chosen": -1.248780608177185, + "logits/rejected": -1.084285020828247, + "logps/chosen": -551.0938110351562, + "logps/rejected": -643.5797729492188, + "loss": 0.4411, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.640122652053833, + "rewards/margins": 1.165264368057251, + "rewards/rejected": -3.805387020111084, + "step": 1850 + }, + { + "epoch": 0.49, + "grad_norm": 6.09375, + "learning_rate": 3.0483451503702264e-06, + "logits/chosen": -1.1745688915252686, + "logits/rejected": -1.0959160327911377, + "logps/chosen": -581.6795654296875, + "logps/rejected": -661.7645263671875, + "loss": 0.5518, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.9558444023132324, + "rewards/margins": 1.0012142658233643, + "rewards/rejected": -3.9570584297180176, + "step": 1860 + }, + { + "epoch": 0.49, + "grad_norm": 11.875, + "learning_rate": 3.0260342347574916e-06, + "logits/chosen": -1.1434388160705566, + "logits/rejected": -0.9975016713142395, + "logps/chosen": -543.2282104492188, + "logps/rejected": -666.7279052734375, + "loss": 0.4206, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.7089195251464844, + "rewards/margins": 1.425309419631958, + "rewards/rejected": -4.134228706359863, + "step": 1870 + }, + { + "epoch": 0.49, + "grad_norm": 11.0, + "learning_rate": 3.0036793954375358e-06, + "logits/chosen": -1.0967297554016113, + "logits/rejected": -0.9473203420639038, + "logps/chosen": -603.4558715820312, + "logps/rejected": -692.9251708984375, + "loss": 0.4466, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.3335928916931152, + "rewards/margins": 1.3170349597930908, + "rewards/rejected": -4.650628089904785, + "step": 1880 + }, + { + "epoch": 0.49, + "grad_norm": 13.0, + "learning_rate": 2.981282499033009e-06, + "logits/chosen": -1.0985617637634277, + "logits/rejected": -0.9863265156745911, + "logps/chosen": -607.0682373046875, + "logps/rejected": -701.697509765625, + "loss": 0.5071, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.307284116744995, + "rewards/margins": 1.200660228729248, + "rewards/rejected": -4.507944583892822, + "step": 1890 + }, + { + "epoch": 0.5, + "grad_norm": 10.0625, + "learning_rate": 2.9588454156783163e-06, + "logits/chosen": -1.1454726457595825, + "logits/rejected": -0.9831218719482422, + "logps/chosen": -579.2799682617188, + "logps/rejected": -706.1749877929688, + "loss": 0.4455, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.979241371154785, + "rewards/margins": 1.4865919351577759, + "rewards/rejected": -4.4658331871032715, + "step": 1900 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -1.0213502645492554, + "eval_logits/rejected": -0.891007125377655, + "eval_logps/chosen": -566.8839721679688, + "eval_logps/rejected": -659.4305419921875, + "eval_loss": 0.49620321393013, + "eval_rewards/accuracies": 0.7419999837875366, + "eval_rewards/chosen": -3.0223195552825928, + "eval_rewards/margins": 1.1259068250656128, + "eval_rewards/rejected": -4.148226737976074, + "eval_runtime": 382.1041, + "eval_samples_per_second": 5.234, + "eval_steps_per_second": 0.654, + "step": 1900 + }, + { + "epoch": 0.5, + "grad_norm": 10.8125, + "learning_rate": 2.9363700188634597e-06, + "logits/chosen": -1.1352207660675049, + "logits/rejected": -1.0086506605148315, + "logps/chosen": -588.1229858398438, + "logps/rejected": -648.9054565429688, + "loss": 0.5063, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.230529308319092, + "rewards/margins": 0.9782280921936035, + "rewards/rejected": -4.208757400512695, + "step": 1910 + }, + { + "epoch": 0.5, + "grad_norm": 13.375, + "learning_rate": 2.9138581852776053e-06, + "logits/chosen": -1.1499899625778198, + "logits/rejected": -1.0288715362548828, + "logps/chosen": -581.2144775390625, + "logps/rejected": -680.3140869140625, + "loss": 0.496, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2150332927703857, + "rewards/margins": 1.1205800771713257, + "rewards/rejected": -4.335613250732422, + "step": 1920 + }, + { + "epoch": 0.51, + "grad_norm": 7.3125, + "learning_rate": 2.8913117946523805e-06, + "logits/chosen": -1.1651884317398071, + "logits/rejected": -0.9733787775039673, + "logps/chosen": -579.3433227539062, + "logps/rejected": -649.0181884765625, + "loss": 0.4634, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.129295825958252, + "rewards/margins": 1.077726125717163, + "rewards/rejected": -4.207022190093994, + "step": 1930 + }, + { + "epoch": 0.51, + "grad_norm": 11.375, + "learning_rate": 2.8687327296049126e-06, + "logits/chosen": -1.163464069366455, + "logits/rejected": -1.0617696046829224, + "logps/chosen": -556.2322998046875, + "logps/rejected": -651.5863037109375, + "loss": 0.5142, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.971095561981201, + "rewards/margins": 1.0506844520568848, + "rewards/rejected": -4.021780014038086, + "step": 1940 + }, + { + "epoch": 0.51, + "grad_norm": 12.8125, + "learning_rate": 2.8461228754806376e-06, + "logits/chosen": -1.185319185256958, + "logits/rejected": -1.0036907196044922, + "logps/chosen": -566.9384155273438, + "logps/rejected": -628.1956787109375, + "loss": 0.5404, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.911479949951172, + "rewards/margins": 0.8705935478210449, + "rewards/rejected": -3.782073497772217, + "step": 1950 + }, + { + "epoch": 0.51, + "grad_norm": 7.09375, + "learning_rate": 2.823484120195865e-06, + "logits/chosen": -1.3058470487594604, + "logits/rejected": -1.113465666770935, + "logps/chosen": -529.6067504882812, + "logps/rejected": -606.2987060546875, + "loss": 0.4364, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.5179548263549805, + "rewards/margins": 1.1106722354888916, + "rewards/rejected": -3.628627061843872, + "step": 1960 + }, + { + "epoch": 0.52, + "grad_norm": 8.75, + "learning_rate": 2.8008183540801486e-06, + "logits/chosen": -1.12172269821167, + "logits/rejected": -0.968579888343811, + "logps/chosen": -553.111083984375, + "logps/rejected": -600.1488037109375, + "loss": 0.5074, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7947652339935303, + "rewards/margins": 0.9243541955947876, + "rewards/rejected": -3.7191195487976074, + "step": 1970 + }, + { + "epoch": 0.52, + "grad_norm": 10.75, + "learning_rate": 2.7781274697184353e-06, + "logits/chosen": -0.9661678075790405, + "logits/rejected": -0.9819488525390625, + "logps/chosen": -551.6143798828125, + "logps/rejected": -679.9763793945312, + "loss": 0.5141, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2225677967071533, + "rewards/margins": 1.0803557634353638, + "rewards/rejected": -4.30292272567749, + "step": 1980 + }, + { + "epoch": 0.52, + "grad_norm": 7.625, + "learning_rate": 2.7554133617930397e-06, + "logits/chosen": -1.0553234815597534, + "logits/rejected": -0.9197478294372559, + "logps/chosen": -592.0967407226562, + "logps/rejected": -687.3663940429688, + "loss": 0.4817, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.442605495452881, + "rewards/margins": 1.1034131050109863, + "rewards/rejected": -4.546019077301025, + "step": 1990 + }, + { + "epoch": 0.52, + "grad_norm": 11.375, + "learning_rate": 2.7326779269254363e-06, + "logits/chosen": -1.1949965953826904, + "logits/rejected": -1.0267183780670166, + "logps/chosen": -653.2984619140625, + "logps/rejected": -709.1905517578125, + "loss": 0.4817, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.6396350860595703, + "rewards/margins": 1.1184080839157104, + "rewards/rejected": -4.75804328918457, + "step": 2000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -0.9427788257598877, + "eval_logits/rejected": -0.8139032125473022, + "eval_logps/chosen": -624.5250244140625, + "eval_logps/rejected": -711.0853271484375, + "eval_loss": 0.49741417169570923, + "eval_rewards/accuracies": 0.746999979019165, + "eval_rewards/chosen": -3.5987296104431152, + "eval_rewards/margins": 1.0660440921783447, + "eval_rewards/rejected": -4.664773941040039, + "eval_runtime": 382.3502, + "eval_samples_per_second": 5.231, + "eval_steps_per_second": 0.654, + "step": 2000 + }, + { + "epoch": 0.53, + "grad_norm": 10.5625, + "learning_rate": 2.7099230635178954e-06, + "logits/chosen": -1.0279147624969482, + "logits/rejected": -0.9855324625968933, + "logps/chosen": -615.8596801757812, + "logps/rejected": -704.7830200195312, + "loss": 0.5276, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.510098934173584, + "rewards/margins": 0.954069972038269, + "rewards/rejected": -4.464169025421143, + "step": 2010 + }, + { + "epoch": 0.53, + "grad_norm": 9.625, + "learning_rate": 2.6871506715949608e-06, + "logits/chosen": -1.177202582359314, + "logits/rejected": -1.0146461725234985, + "logps/chosen": -568.2487182617188, + "logps/rejected": -659.0941162109375, + "loss": 0.4583, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.152796745300293, + "rewards/margins": 1.0889527797698975, + "rewards/rejected": -4.2417497634887695, + "step": 2020 + }, + { + "epoch": 0.53, + "grad_norm": 13.6875, + "learning_rate": 2.6643626526448063e-06, + "logits/chosen": -1.2432745695114136, + "logits/rejected": -1.0716017484664917, + "logps/chosen": -619.502685546875, + "logps/rejected": -699.7628173828125, + "loss": 0.4576, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.2460086345672607, + "rewards/margins": 1.2264302968978882, + "rewards/rejected": -4.472439289093018, + "step": 2030 + }, + { + "epoch": 0.53, + "grad_norm": 9.875, + "learning_rate": 2.6415609094604562e-06, + "logits/chosen": -1.0596590042114258, + "logits/rejected": -1.0028278827667236, + "logps/chosen": -631.6947021484375, + "logps/rejected": -728.5841674804688, + "loss": 0.4471, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.6112823486328125, + "rewards/margins": 1.1590890884399414, + "rewards/rejected": -4.770371437072754, + "step": 2040 + }, + { + "epoch": 0.54, + "grad_norm": 11.375, + "learning_rate": 2.618747345980904e-06, + "logits/chosen": -1.067651629447937, + "logits/rejected": -0.8701795339584351, + "logps/chosen": -667.7681274414062, + "logps/rejected": -718.9295654296875, + "loss": 0.5561, + "rewards/accuracies": 0.71875, + "rewards/chosen": -4.1937079429626465, + "rewards/margins": 1.016485333442688, + "rewards/rejected": -5.210193634033203, + "step": 2050 + }, + { + "epoch": 0.54, + "grad_norm": 6.125, + "learning_rate": 2.595923867132136e-06, + "logits/chosen": -1.1067336797714233, + "logits/rejected": -0.9798781275749207, + "logps/chosen": -685.84228515625, + "logps/rejected": -784.4832763671875, + "loss": 0.4938, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.049218654632568, + "rewards/margins": 1.2331972122192383, + "rewards/rejected": -5.282416343688965, + "step": 2060 + }, + { + "epoch": 0.54, + "grad_norm": 7.9375, + "learning_rate": 2.5730923786680672e-06, + "logits/chosen": -1.017889380455017, + "logits/rejected": -1.0066477060317993, + "logps/chosen": -639.3632202148438, + "logps/rejected": -738.4698486328125, + "loss": 0.5372, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.8783206939697266, + "rewards/margins": 0.9146150350570679, + "rewards/rejected": -4.792935848236084, + "step": 2070 + }, + { + "epoch": 0.54, + "grad_norm": 7.3125, + "learning_rate": 2.5502547870114137e-06, + "logits/chosen": -1.1123883724212646, + "logits/rejected": -0.9572793245315552, + "logps/chosen": -607.7706909179688, + "logps/rejected": -670.916015625, + "loss": 0.5255, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.5239059925079346, + "rewards/margins": 0.9338981509208679, + "rewards/rejected": -4.457803726196289, + "step": 2080 + }, + { + "epoch": 0.55, + "grad_norm": 13.375, + "learning_rate": 2.527412999094507e-06, + "logits/chosen": -1.118983507156372, + "logits/rejected": -0.9597452282905579, + "logps/chosen": -620.9295043945312, + "logps/rejected": -721.0320434570312, + "loss": 0.4802, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.2717292308807373, + "rewards/margins": 1.1265954971313477, + "rewards/rejected": -4.398324489593506, + "step": 2090 + }, + { + "epoch": 0.55, + "grad_norm": 11.75, + "learning_rate": 2.504568922200064e-06, + "logits/chosen": -1.075067400932312, + "logits/rejected": -0.937818706035614, + "logps/chosen": -547.7574462890625, + "logps/rejected": -641.327392578125, + "loss": 0.5079, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.0641894340515137, + "rewards/margins": 1.0973466634750366, + "rewards/rejected": -4.161535739898682, + "step": 2100 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -1.0030875205993652, + "eval_logits/rejected": -0.8739129900932312, + "eval_logps/chosen": -582.1657104492188, + "eval_logps/rejected": -667.5426025390625, + "eval_loss": 0.4922982156276703, + "eval_rewards/accuracies": 0.7519999742507935, + "eval_rewards/chosen": -3.1751370429992676, + "eval_rewards/margins": 1.0542099475860596, + "eval_rewards/rejected": -4.229346752166748, + "eval_runtime": 382.3169, + "eval_samples_per_second": 5.231, + "eval_steps_per_second": 0.654, + "step": 2100 + }, + { + "epoch": 0.55, + "grad_norm": 8.1875, + "learning_rate": 2.4817244638019333e-06, + "logits/chosen": -1.137091875076294, + "logits/rejected": -0.9877273440361023, + "logps/chosen": -593.8831787109375, + "logps/rejected": -648.8990478515625, + "loss": 0.5394, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.1807122230529785, + "rewards/margins": 0.9622472524642944, + "rewards/rejected": -4.1429595947265625, + "step": 2110 + }, + { + "epoch": 0.55, + "grad_norm": 14.1875, + "learning_rate": 2.4588815314058155e-06, + "logits/chosen": -1.117033839225769, + "logits/rejected": -1.0428097248077393, + "logps/chosen": -536.7808227539062, + "logps/rejected": -599.55908203125, + "loss": 0.4755, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.912168025970459, + "rewards/margins": 0.9705360531806946, + "rewards/rejected": -3.882704257965088, + "step": 2120 + }, + { + "epoch": 0.56, + "grad_norm": 9.0625, + "learning_rate": 2.4360420323899922e-06, + "logits/chosen": -1.1962370872497559, + "logits/rejected": -1.0757726430892944, + "logps/chosen": -545.7897338867188, + "logps/rejected": -594.7244873046875, + "loss": 0.5644, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6997714042663574, + "rewards/margins": 0.8151930570602417, + "rewards/rejected": -3.5149643421173096, + "step": 2130 + }, + { + "epoch": 0.56, + "grad_norm": 6.75, + "learning_rate": 2.4132078738460585e-06, + "logits/chosen": -1.2405675649642944, + "logits/rejected": -1.0946118831634521, + "logps/chosen": -528.01611328125, + "logps/rejected": -594.1393432617188, + "loss": 0.4643, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6354479789733887, + "rewards/margins": 1.046671748161316, + "rewards/rejected": -3.682119846343994, + "step": 2140 + }, + { + "epoch": 0.56, + "grad_norm": 12.5625, + "learning_rate": 2.3903809624196826e-06, + "logits/chosen": -1.1746861934661865, + "logits/rejected": -1.0529396533966064, + "logps/chosen": -520.6478271484375, + "logps/rejected": -572.0309448242188, + "loss": 0.5516, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8694403171539307, + "rewards/margins": 0.8386090397834778, + "rewards/rejected": -3.7080490589141846, + "step": 2150 + }, + { + "epoch": 0.57, + "grad_norm": 11.25, + "learning_rate": 2.3675632041513978e-06, + "logits/chosen": -1.2890937328338623, + "logits/rejected": -1.0460366010665894, + "logps/chosen": -595.07275390625, + "logps/rejected": -639.810791015625, + "loss": 0.4788, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.0806915760040283, + "rewards/margins": 1.094292402267456, + "rewards/rejected": -4.174983978271484, + "step": 2160 + }, + { + "epoch": 0.57, + "grad_norm": 11.5, + "learning_rate": 2.3447565043174533e-06, + "logits/chosen": -1.1292383670806885, + "logits/rejected": -0.9545844793319702, + "logps/chosen": -596.5003662109375, + "logps/rejected": -650.0792236328125, + "loss": 0.5136, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.3818931579589844, + "rewards/margins": 0.922932505607605, + "rewards/rejected": -4.304825782775879, + "step": 2170 + }, + { + "epoch": 0.57, + "grad_norm": 12.0, + "learning_rate": 2.321962767270724e-06, + "logits/chosen": -1.158575415611267, + "logits/rejected": -1.0298246145248413, + "logps/chosen": -583.9124755859375, + "logps/rejected": -629.5396118164062, + "loss": 0.5615, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.3395965099334717, + "rewards/margins": 0.8280007243156433, + "rewards/rejected": -4.16759729385376, + "step": 2180 + }, + { + "epoch": 0.57, + "grad_norm": 8.75, + "learning_rate": 2.299183896281692e-06, + "logits/chosen": -1.088763952255249, + "logits/rejected": -0.9791523218154907, + "logps/chosen": -556.0525512695312, + "logps/rejected": -641.457763671875, + "loss": 0.5181, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0112056732177734, + "rewards/margins": 0.8770611882209778, + "rewards/rejected": -3.8882670402526855, + "step": 2190 + }, + { + "epoch": 0.58, + "grad_norm": 7.25, + "learning_rate": 2.2764217933795297e-06, + "logits/chosen": -1.2351996898651123, + "logits/rejected": -1.1065688133239746, + "logps/chosen": -519.6819458007812, + "logps/rejected": -608.1278686523438, + "loss": 0.477, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5824990272521973, + "rewards/margins": 1.0897197723388672, + "rewards/rejected": -3.6722190380096436, + "step": 2200 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -1.0880188941955566, + "eval_logits/rejected": -0.9566530585289001, + "eval_logps/chosen": -525.9181518554688, + "eval_logps/rejected": -601.7401733398438, + "eval_loss": 0.48973530530929565, + "eval_rewards/accuracies": 0.7409999966621399, + "eval_rewards/chosen": -2.612661123275757, + "eval_rewards/margins": 0.9586613774299622, + "eval_rewards/rejected": -3.571322441101074, + "eval_runtime": 382.0537, + "eval_samples_per_second": 5.235, + "eval_steps_per_second": 0.654, + "step": 2200 + }, + { + "epoch": 0.58, + "grad_norm": 5.71875, + "learning_rate": 2.2536783591932786e-06, + "logits/chosen": -1.2977464199066162, + "logits/rejected": -1.1296590566635132, + "logps/chosen": -553.06103515625, + "logps/rejected": -621.307861328125, + "loss": 0.5291, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7755794525146484, + "rewards/margins": 0.8637927174568176, + "rewards/rejected": -3.6393723487854004, + "step": 2210 + }, + { + "epoch": 0.58, + "grad_norm": 7.84375, + "learning_rate": 2.230955492793149e-06, + "logits/chosen": -1.0942963361740112, + "logits/rejected": -1.042419195175171, + "logps/chosen": -573.537841796875, + "logps/rejected": -642.611572265625, + "loss": 0.5884, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.997450351715088, + "rewards/margins": 0.8198318481445312, + "rewards/rejected": -3.8172824382781982, + "step": 2220 + }, + { + "epoch": 0.58, + "grad_norm": 5.71875, + "learning_rate": 2.208255091531947e-06, + "logits/chosen": -1.1044989824295044, + "logits/rejected": -1.0208889245986938, + "logps/chosen": -553.853515625, + "logps/rejected": -632.1079711914062, + "loss": 0.4853, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7685611248016357, + "rewards/margins": 1.132253646850586, + "rewards/rejected": -3.9008147716522217, + "step": 2230 + }, + { + "epoch": 0.59, + "grad_norm": 11.75, + "learning_rate": 2.1855790508866435e-06, + "logits/chosen": -1.1996960639953613, + "logits/rejected": -1.0961394309997559, + "logps/chosen": -557.0603637695312, + "logps/rejected": -641.5968017578125, + "loss": 0.5037, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.600390672683716, + "rewards/margins": 1.021994948387146, + "rewards/rejected": -3.6223855018615723, + "step": 2240 + }, + { + "epoch": 0.59, + "grad_norm": 6.28125, + "learning_rate": 2.162929264300107e-06, + "logits/chosen": -1.2133983373641968, + "logits/rejected": -1.109574556350708, + "logps/chosen": -511.7315979003906, + "logps/rejected": -615.6173095703125, + "loss": 0.416, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4276764392852783, + "rewards/margins": 1.2624719142913818, + "rewards/rejected": -3.690148115158081, + "step": 2250 + }, + { + "epoch": 0.59, + "grad_norm": 12.1875, + "learning_rate": 2.1403076230230006e-06, + "logits/chosen": -1.1181437969207764, + "logits/rejected": -0.9982963800430298, + "logps/chosen": -565.5302124023438, + "logps/rejected": -622.5106811523438, + "loss": 0.5759, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9434773921966553, + "rewards/margins": 0.8478938341140747, + "rewards/rejected": -3.7913711071014404, + "step": 2260 + }, + { + "epoch": 0.59, + "grad_norm": 7.59375, + "learning_rate": 2.11771601595586e-06, + "logits/chosen": -1.2033512592315674, + "logits/rejected": -1.0716886520385742, + "logps/chosen": -557.2864379882812, + "logps/rejected": -603.1704711914062, + "loss": 0.5099, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.752382755279541, + "rewards/margins": 0.9814404249191284, + "rewards/rejected": -3.73382306098938, + "step": 2270 + }, + { + "epoch": 0.6, + "grad_norm": 12.8125, + "learning_rate": 2.0951563294913737e-06, + "logits/chosen": -1.177409052848816, + "logits/rejected": -0.9869596362113953, + "logps/chosen": -525.6967163085938, + "logps/rejected": -594.2974853515625, + "loss": 0.4644, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.641634941101074, + "rewards/margins": 0.9909149408340454, + "rewards/rejected": -3.63254976272583, + "step": 2280 + }, + { + "epoch": 0.6, + "grad_norm": 8.0625, + "learning_rate": 2.0726304473568693e-06, + "logits/chosen": -1.1395372152328491, + "logits/rejected": -1.0176304578781128, + "logps/chosen": -522.652099609375, + "logps/rejected": -593.3766479492188, + "loss": 0.4738, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6371326446533203, + "rewards/margins": 1.0305713415145874, + "rewards/rejected": -3.667703628540039, + "step": 2290 + }, + { + "epoch": 0.6, + "grad_norm": 10.4375, + "learning_rate": 2.050140250457023e-06, + "logits/chosen": -1.2590233087539673, + "logits/rejected": -1.052428960800171, + "logps/chosen": -578.8065185546875, + "logps/rejected": -654.0260009765625, + "loss": 0.4829, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0124454498291016, + "rewards/margins": 1.0927618741989136, + "rewards/rejected": -4.1052069664001465, + "step": 2300 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -1.0313422679901123, + "eval_logits/rejected": -0.9032019972801208, + "eval_logps/chosen": -559.955810546875, + "eval_logps/rejected": -654.1510620117188, + "eval_loss": 0.4887396991252899, + "eval_rewards/accuracies": 0.7484999895095825, + "eval_rewards/chosen": -2.953037738800049, + "eval_rewards/margins": 1.1423934698104858, + "eval_rewards/rejected": -4.095431804656982, + "eval_runtime": 381.9442, + "eval_samples_per_second": 5.236, + "eval_steps_per_second": 0.655, + "step": 2300 + }, + { + "epoch": 0.6, + "grad_norm": 14.625, + "learning_rate": 2.0276876167168042e-06, + "logits/chosen": -1.0072084665298462, + "logits/rejected": -0.9061794281005859, + "logps/chosen": -509.2284240722656, + "logps/rejected": -580.1068725585938, + "loss": 0.5548, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8472893238067627, + "rewards/margins": 1.0346016883850098, + "rewards/rejected": -3.8818912506103516, + "step": 2310 + }, + { + "epoch": 0.61, + "grad_norm": 8.25, + "learning_rate": 2.0052744209246682e-06, + "logits/chosen": -1.1624600887298584, + "logits/rejected": -1.04361891746521, + "logps/chosen": -552.9761962890625, + "logps/rejected": -621.9478759765625, + "loss": 0.5046, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9763803482055664, + "rewards/margins": 1.018448829650879, + "rewards/rejected": -3.9948291778564453, + "step": 2320 + }, + { + "epoch": 0.61, + "grad_norm": 10.75, + "learning_rate": 1.9829025345760127e-06, + "logits/chosen": -1.1844617128372192, + "logits/rejected": -1.1262612342834473, + "logps/chosen": -559.8540649414062, + "logps/rejected": -640.3355712890625, + "loss": 0.549, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.8085100650787354, + "rewards/margins": 0.865519642829895, + "rewards/rejected": -3.67402982711792, + "step": 2330 + }, + { + "epoch": 0.61, + "grad_norm": 10.0625, + "learning_rate": 1.9605738257169115e-06, + "logits/chosen": -1.1309086084365845, + "logits/rejected": -0.9911936521530151, + "logps/chosen": -502.54608154296875, + "logps/rejected": -611.60693359375, + "loss": 0.4877, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.6956772804260254, + "rewards/margins": 1.1704528331756592, + "rewards/rejected": -3.8661301136016846, + "step": 2340 + }, + { + "epoch": 0.62, + "grad_norm": 9.1875, + "learning_rate": 1.9382901587881275e-06, + "logits/chosen": -1.196989893913269, + "logits/rejected": -1.0731130838394165, + "logps/chosen": -527.642578125, + "logps/rejected": -616.3968505859375, + "loss": 0.4233, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.6888630390167236, + "rewards/margins": 1.2105457782745361, + "rewards/rejected": -3.8994088172912598, + "step": 2350 + }, + { + "epoch": 0.62, + "grad_norm": 11.9375, + "learning_rate": 1.916053394469437e-06, + "logits/chosen": -1.2187442779541016, + "logits/rejected": -1.0278013944625854, + "logps/chosen": -555.1328125, + "logps/rejected": -650.1771240234375, + "loss": 0.5309, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9184062480926514, + "rewards/margins": 1.0958768129348755, + "rewards/rejected": -4.014283180236816, + "step": 2360 + }, + { + "epoch": 0.62, + "grad_norm": 9.5625, + "learning_rate": 1.8938653895242604e-06, + "logits/chosen": -1.173482894897461, + "logits/rejected": -0.9950237274169922, + "logps/chosen": -563.7232666015625, + "logps/rejected": -654.51611328125, + "loss": 0.4349, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.9941768646240234, + "rewards/margins": 1.1962960958480835, + "rewards/rejected": -4.1904730796813965, + "step": 2370 + }, + { + "epoch": 0.62, + "grad_norm": 10.9375, + "learning_rate": 1.8717279966446267e-06, + "logits/chosen": -1.0182400941848755, + "logits/rejected": -0.9381190538406372, + "logps/chosen": -567.86376953125, + "logps/rejected": -672.0901489257812, + "loss": 0.4496, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1928865909576416, + "rewards/margins": 1.1378134489059448, + "rewards/rejected": -4.330699920654297, + "step": 2380 + }, + { + "epoch": 0.63, + "grad_norm": 6.90625, + "learning_rate": 1.8496430642964698e-06, + "logits/chosen": -1.0953130722045898, + "logits/rejected": -0.9763644337654114, + "logps/chosen": -591.7195434570312, + "logps/rejected": -673.8305053710938, + "loss": 0.4954, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1930747032165527, + "rewards/margins": 1.0575921535491943, + "rewards/rejected": -4.250667095184326, + "step": 2390 + }, + { + "epoch": 0.63, + "grad_norm": 7.96875, + "learning_rate": 1.827612436565286e-06, + "logits/chosen": -1.093685507774353, + "logits/rejected": -0.9428181648254395, + "logps/chosen": -569.9864501953125, + "logps/rejected": -664.4702758789062, + "loss": 0.4752, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.0702195167541504, + "rewards/margins": 1.1502970457077026, + "rewards/rejected": -4.220516681671143, + "step": 2400 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -0.9764781594276428, + "eval_logits/rejected": -0.849520742893219, + "eval_logps/chosen": -579.4506225585938, + "eval_logps/rejected": -672.75830078125, + "eval_loss": 0.49094268679618835, + "eval_rewards/accuracies": 0.7444999814033508, + "eval_rewards/chosen": -3.147986888885498, + "eval_rewards/margins": 1.1335173845291138, + "eval_rewards/rejected": -4.281503677368164, + "eval_runtime": 382.2569, + "eval_samples_per_second": 5.232, + "eval_steps_per_second": 0.654, + "step": 2400 + }, + { + "epoch": 0.63, + "grad_norm": 18.625, + "learning_rate": 1.8056379530021492e-06, + "logits/chosen": -1.1393061876296997, + "logits/rejected": -1.0437672138214111, + "logps/chosen": -565.1177978515625, + "logps/rejected": -631.9932861328125, + "loss": 0.5436, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2091946601867676, + "rewards/margins": 0.9168522953987122, + "rewards/rejected": -4.126046180725098, + "step": 2410 + }, + { + "epoch": 0.63, + "grad_norm": 10.375, + "learning_rate": 1.7837214484701154e-06, + "logits/chosen": -1.182935118675232, + "logits/rejected": -1.0437054634094238, + "logps/chosen": -523.6812133789062, + "logps/rejected": -616.8724975585938, + "loss": 0.4678, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.713310480117798, + "rewards/margins": 1.1654255390167236, + "rewards/rejected": -3.8787360191345215, + "step": 2420 + }, + { + "epoch": 0.64, + "grad_norm": 14.125, + "learning_rate": 1.7618647529910043e-06, + "logits/chosen": -1.1824162006378174, + "logits/rejected": -1.051477313041687, + "logps/chosen": -526.3547973632812, + "logps/rejected": -624.6488647460938, + "loss": 0.4987, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6958136558532715, + "rewards/margins": 1.1019628047943115, + "rewards/rejected": -3.797776460647583, + "step": 2430 + }, + { + "epoch": 0.64, + "grad_norm": 9.375, + "learning_rate": 1.7400696915925996e-06, + "logits/chosen": -1.1761425733566284, + "logits/rejected": -0.9889799952507019, + "logps/chosen": -560.6347045898438, + "logps/rejected": -604.340576171875, + "loss": 0.5198, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.900243043899536, + "rewards/margins": 1.019816279411316, + "rewards/rejected": -3.9200592041015625, + "step": 2440 + }, + { + "epoch": 0.64, + "grad_norm": 11.8125, + "learning_rate": 1.718338084156254e-06, + "logits/chosen": -1.1455858945846558, + "logits/rejected": -0.9903894662857056, + "logps/chosen": -568.4344482421875, + "logps/rejected": -638.8942260742188, + "loss": 0.4578, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8164334297180176, + "rewards/margins": 1.0884320735931396, + "rewards/rejected": -3.9048657417297363, + "step": 2450 + }, + { + "epoch": 0.64, + "grad_norm": 9.625, + "learning_rate": 1.6966717452649372e-06, + "logits/chosen": -1.2747197151184082, + "logits/rejected": -1.101963758468628, + "logps/chosen": -554.3800659179688, + "logps/rejected": -616.3612060546875, + "loss": 0.4412, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.774376392364502, + "rewards/margins": 1.1384481191635132, + "rewards/rejected": -3.9128241539001465, + "step": 2460 + }, + { + "epoch": 0.65, + "grad_norm": 9.5625, + "learning_rate": 1.6750724840517103e-06, + "logits/chosen": -1.2133910655975342, + "logits/rejected": -1.1471474170684814, + "logps/chosen": -530.1273193359375, + "logps/rejected": -630.1476440429688, + "loss": 0.5062, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7732110023498535, + "rewards/margins": 0.9591614007949829, + "rewards/rejected": -3.7323715686798096, + "step": 2470 + }, + { + "epoch": 0.65, + "grad_norm": 11.875, + "learning_rate": 1.6535421040486686e-06, + "logits/chosen": -1.0105046033859253, + "logits/rejected": -0.9159660339355469, + "logps/chosen": -560.3009643554688, + "logps/rejected": -653.0996704101562, + "loss": 0.4182, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.069490909576416, + "rewards/margins": 1.270485520362854, + "rewards/rejected": -4.3399763107299805, + "step": 2480 + }, + { + "epoch": 0.65, + "grad_norm": 13.4375, + "learning_rate": 1.6320824030363458e-06, + "logits/chosen": -1.0919368267059326, + "logits/rejected": -1.0423280000686646, + "logps/chosen": -547.108154296875, + "logps/rejected": -651.2943725585938, + "loss": 0.4663, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1227710247039795, + "rewards/margins": 1.1962798833847046, + "rewards/rejected": -4.3190507888793945, + "step": 2490 + }, + { + "epoch": 0.65, + "grad_norm": 14.625, + "learning_rate": 1.6106951728936028e-06, + "logits/chosen": -1.1967922449111938, + "logits/rejected": -1.0710703134536743, + "logps/chosen": -573.5470581054688, + "logps/rejected": -666.3677978515625, + "loss": 0.5249, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -3.163433313369751, + "rewards/margins": 1.0070708990097046, + "rewards/rejected": -4.170504093170166, + "step": 2500 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -1.0434505939483643, + "eval_logits/rejected": -0.9135813117027283, + "eval_logps/chosen": -574.00927734375, + "eval_logps/rejected": -664.8961791992188, + "eval_loss": 0.4891022741794586, + "eval_rewards/accuracies": 0.7444999814033508, + "eval_rewards/chosen": -3.0935721397399902, + "eval_rewards/margins": 1.1093100309371948, + "eval_rewards/rejected": -4.202882289886475, + "eval_runtime": 382.3246, + "eval_samples_per_second": 5.231, + "eval_steps_per_second": 0.654, + "step": 2500 + }, + { + "epoch": 0.66, + "grad_norm": 9.4375, + "learning_rate": 1.5893821994479996e-06, + "logits/chosen": -1.1978858709335327, + "logits/rejected": -1.0786705017089844, + "logps/chosen": -573.3375244140625, + "logps/rejected": -648.0001831054688, + "loss": 0.4737, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.959294080734253, + "rewards/margins": 1.132147192955017, + "rewards/rejected": -4.0914411544799805, + "step": 2510 + }, + { + "epoch": 0.66, + "grad_norm": 7.875, + "learning_rate": 1.5681452623266868e-06, + "logits/chosen": -1.1913158893585205, + "logits/rejected": -0.9305517077445984, + "logps/chosen": -603.19873046875, + "logps/rejected": -671.5530395507812, + "loss": 0.4638, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1415326595306396, + "rewards/margins": 1.2662583589553833, + "rewards/rejected": -4.4077911376953125, + "step": 2520 + }, + { + "epoch": 0.66, + "grad_norm": 6.15625, + "learning_rate": 1.5469861348078014e-06, + "logits/chosen": -1.1753239631652832, + "logits/rejected": -1.0243064165115356, + "logps/chosen": -557.4925537109375, + "logps/rejected": -671.5239868164062, + "loss": 0.4264, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1072232723236084, + "rewards/margins": 1.246586561203003, + "rewards/rejected": -4.353809833526611, + "step": 2530 + }, + { + "epoch": 0.66, + "grad_norm": 8.5625, + "learning_rate": 1.5259065836724035e-06, + "logits/chosen": -1.0654633045196533, + "logits/rejected": -0.9947797656059265, + "logps/chosen": -555.5715942382812, + "logps/rejected": -674.6041259765625, + "loss": 0.428, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1076245307922363, + "rewards/margins": 1.2515560388565063, + "rewards/rejected": -4.359180927276611, + "step": 2540 + }, + { + "epoch": 0.67, + "grad_norm": 17.5, + "learning_rate": 1.5049083690569456e-06, + "logits/chosen": -1.117201328277588, + "logits/rejected": -1.024710774421692, + "logps/chosen": -542.8455200195312, + "logps/rejected": -661.6935424804688, + "loss": 0.4846, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.0537521839141846, + "rewards/margins": 1.2326675653457642, + "rewards/rejected": -4.286419868469238, + "step": 2550 + }, + { + "epoch": 0.67, + "grad_norm": 13.75, + "learning_rate": 1.4839932443063057e-06, + "logits/chosen": -1.1161174774169922, + "logits/rejected": -0.9579364061355591, + "logps/chosen": -589.6568603515625, + "logps/rejected": -655.3709716796875, + "loss": 0.4618, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.976780414581299, + "rewards/margins": 1.220205307006836, + "rewards/rejected": -4.196985721588135, + "step": 2560 + }, + { + "epoch": 0.67, + "grad_norm": 18.625, + "learning_rate": 1.4631629558273803e-06, + "logits/chosen": -1.1335794925689697, + "logits/rejected": -1.004740595817566, + "logps/chosen": -549.504150390625, + "logps/rejected": -625.6862182617188, + "loss": 0.631, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -3.0777907371520996, + "rewards/margins": 0.8784114122390747, + "rewards/rejected": -3.9562020301818848, + "step": 2570 + }, + { + "epoch": 0.68, + "grad_norm": 6.03125, + "learning_rate": 1.4424192429432657e-06, + "logits/chosen": -1.2103271484375, + "logits/rejected": -1.1048699617385864, + "logps/chosen": -521.5680541992188, + "logps/rejected": -641.9281616210938, + "loss": 0.4666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.6569151878356934, + "rewards/margins": 1.1702333688735962, + "rewards/rejected": -3.8271484375, + "step": 2580 + }, + { + "epoch": 0.68, + "grad_norm": 13.0, + "learning_rate": 1.421763837748016e-06, + "logits/chosen": -1.1741114854812622, + "logits/rejected": -1.0814844369888306, + "logps/chosen": -523.6945190429688, + "logps/rejected": -640.1383056640625, + "loss": 0.4441, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.7306861877441406, + "rewards/margins": 1.2494643926620483, + "rewards/rejected": -3.9801506996154785, + "step": 2590 + }, + { + "epoch": 0.68, + "grad_norm": 10.75, + "learning_rate": 1.401198464962021e-06, + "logits/chosen": -1.2068405151367188, + "logits/rejected": -1.0479636192321777, + "logps/chosen": -556.838623046875, + "logps/rejected": -625.3237915039062, + "loss": 0.4596, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.882551908493042, + "rewards/margins": 1.0710302591323853, + "rewards/rejected": -3.953582286834717, + "step": 2600 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -1.0548917055130005, + "eval_logits/rejected": -0.9263830184936523, + "eval_logps/chosen": -559.5697631835938, + "eval_logps/rejected": -654.4569702148438, + "eval_loss": 0.493943989276886, + "eval_rewards/accuracies": 0.7400000095367432, + "eval_rewards/chosen": -2.9491782188415527, + "eval_rewards/margins": 1.149312973022461, + "eval_rewards/rejected": -4.098491191864014, + "eval_runtime": 381.8434, + "eval_samples_per_second": 5.238, + "eval_steps_per_second": 0.655, + "step": 2600 + }, + { + "epoch": 0.68, + "grad_norm": 8.5, + "learning_rate": 1.3807248417879896e-06, + "logits/chosen": -1.2618989944458008, + "logits/rejected": -1.1420848369598389, + "logps/chosen": -562.00146484375, + "logps/rejected": -670.0994873046875, + "loss": 0.4435, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.889355182647705, + "rewards/margins": 1.2824347019195557, + "rewards/rejected": -4.17179012298584, + "step": 2610 + }, + { + "epoch": 0.69, + "grad_norm": 29.125, + "learning_rate": 1.3603446777675665e-06, + "logits/chosen": -1.0890090465545654, + "logits/rejected": -0.966164767742157, + "logps/chosen": -583.3985595703125, + "logps/rejected": -678.4222412109375, + "loss": 0.5331, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.2020103931427, + "rewards/margins": 1.1710442304611206, + "rewards/rejected": -4.373054504394531, + "step": 2620 + }, + { + "epoch": 0.69, + "grad_norm": 6.84375, + "learning_rate": 1.3400596746385817e-06, + "logits/chosen": -1.2348748445510864, + "logits/rejected": -1.083888053894043, + "logps/chosen": -578.0357666015625, + "logps/rejected": -659.4061279296875, + "loss": 0.522, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.088522434234619, + "rewards/margins": 1.0845201015472412, + "rewards/rejected": -4.173042297363281, + "step": 2630 + }, + { + "epoch": 0.69, + "grad_norm": 8.6875, + "learning_rate": 1.3198715261929587e-06, + "logits/chosen": -1.1974236965179443, + "logits/rejected": -1.0507824420928955, + "logps/chosen": -558.0233764648438, + "logps/rejected": -667.1177978515625, + "loss": 0.4239, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.2156710624694824, + "rewards/margins": 1.2272260189056396, + "rewards/rejected": -4.442896842956543, + "step": 2640 + }, + { + "epoch": 0.69, + "grad_norm": 7.34375, + "learning_rate": 1.2997819181352823e-06, + "logits/chosen": -1.2283174991607666, + "logits/rejected": -1.0654624700546265, + "logps/chosen": -604.8272705078125, + "logps/rejected": -724.4739379882812, + "loss": 0.4118, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.079939365386963, + "rewards/margins": 1.4414037466049194, + "rewards/rejected": -4.521343231201172, + "step": 2650 + }, + { + "epoch": 0.7, + "grad_norm": 23.625, + "learning_rate": 1.2797925279420454e-06, + "logits/chosen": -1.1807067394256592, + "logits/rejected": -1.0574986934661865, + "logps/chosen": -610.4517822265625, + "logps/rejected": -721.9064331054688, + "loss": 0.489, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.385681629180908, + "rewards/margins": 1.2347917556762695, + "rewards/rejected": -4.620473384857178, + "step": 2660 + }, + { + "epoch": 0.7, + "grad_norm": 12.5, + "learning_rate": 1.2599050247215764e-06, + "logits/chosen": -1.129962682723999, + "logits/rejected": -1.0201483964920044, + "logps/chosen": -585.4744262695312, + "logps/rejected": -686.8712158203125, + "loss": 0.4794, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2754268646240234, + "rewards/margins": 1.2443504333496094, + "rewards/rejected": -4.519776821136475, + "step": 2670 + }, + { + "epoch": 0.7, + "grad_norm": 12.25, + "learning_rate": 1.2401210690746705e-06, + "logits/chosen": -1.155137300491333, + "logits/rejected": -1.012924313545227, + "logps/chosen": -587.5916748046875, + "logps/rejected": -667.8207397460938, + "loss": 0.5131, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.2043495178222656, + "rewards/margins": 1.134204626083374, + "rewards/rejected": -4.338554859161377, + "step": 2680 + }, + { + "epoch": 0.7, + "grad_norm": 12.4375, + "learning_rate": 1.2204423129559306e-06, + "logits/chosen": -1.1951662302017212, + "logits/rejected": -1.140353798866272, + "logps/chosen": -567.091552734375, + "logps/rejected": -681.1925048828125, + "loss": 0.4925, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0157859325408936, + "rewards/margins": 1.1822477579116821, + "rewards/rejected": -4.198033332824707, + "step": 2690 + }, + { + "epoch": 0.71, + "grad_norm": 15.0625, + "learning_rate": 1.20087039953583e-06, + "logits/chosen": -1.2230998277664185, + "logits/rejected": -1.1086806058883667, + "logps/chosen": -558.0277099609375, + "logps/rejected": -655.5286865234375, + "loss": 0.5152, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.9380927085876465, + "rewards/margins": 1.2388523817062378, + "rewards/rejected": -4.176945209503174, + "step": 2700 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -1.052660346031189, + "eval_logits/rejected": -0.9249356985092163, + "eval_logps/chosen": -566.6193237304688, + "eval_logps/rejected": -660.3236083984375, + "eval_loss": 0.49224671721458435, + "eval_rewards/accuracies": 0.7440000176429749, + "eval_rewards/chosen": -3.0196733474731445, + "eval_rewards/margins": 1.1374843120574951, + "eval_rewards/rejected": -4.1571574211120605, + "eval_runtime": 382.3055, + "eval_samples_per_second": 5.231, + "eval_steps_per_second": 0.654, + "step": 2700 + }, + { + "epoch": 0.71, + "grad_norm": 10.625, + "learning_rate": 1.181406963063507e-06, + "logits/chosen": -1.1344083547592163, + "logits/rejected": -1.0651142597198486, + "logps/chosen": -557.28125, + "logps/rejected": -663.6448974609375, + "loss": 0.5133, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9143428802490234, + "rewards/margins": 1.0695984363555908, + "rewards/rejected": -3.9839415550231934, + "step": 2710 + }, + { + "epoch": 0.71, + "grad_norm": 6.84375, + "learning_rate": 1.1620536287303052e-06, + "logits/chosen": -1.2466278076171875, + "logits/rejected": -1.1265995502471924, + "logps/chosen": -571.1409301757812, + "logps/rejected": -636.3128662109375, + "loss": 0.5366, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.799872875213623, + "rewards/margins": 0.9532085657119751, + "rewards/rejected": -3.7530815601348877, + "step": 2720 + }, + { + "epoch": 0.71, + "grad_norm": 9.8125, + "learning_rate": 1.1428120125340717e-06, + "logits/chosen": -1.1743571758270264, + "logits/rejected": -1.024549126625061, + "logps/chosen": -524.5095825195312, + "logps/rejected": -638.3724365234375, + "loss": 0.3937, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.7037060260772705, + "rewards/margins": 1.5533134937286377, + "rewards/rejected": -4.257019519805908, + "step": 2730 + }, + { + "epoch": 0.72, + "grad_norm": 9.5625, + "learning_rate": 1.123683721144223e-06, + "logits/chosen": -1.186992883682251, + "logits/rejected": -1.0803272724151611, + "logps/chosen": -567.0985107421875, + "logps/rejected": -677.031005859375, + "loss": 0.4245, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.834251880645752, + "rewards/margins": 1.422716498374939, + "rewards/rejected": -4.256968021392822, + "step": 2740 + }, + { + "epoch": 0.72, + "grad_norm": 6.96875, + "learning_rate": 1.1046703517675848e-06, + "logits/chosen": -1.1976065635681152, + "logits/rejected": -1.1182498931884766, + "logps/chosen": -537.647216796875, + "logps/rejected": -647.6975708007812, + "loss": 0.5195, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.8253092765808105, + "rewards/margins": 1.0392690896987915, + "rewards/rejected": -3.8645782470703125, + "step": 2750 + }, + { + "epoch": 0.72, + "grad_norm": 10.9375, + "learning_rate": 1.085773492015028e-06, + "logits/chosen": -1.1978458166122437, + "logits/rejected": -1.0323292016983032, + "logps/chosen": -516.9109497070312, + "logps/rejected": -612.7794189453125, + "loss": 0.4273, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.7053141593933105, + "rewards/margins": 1.3017933368682861, + "rewards/rejected": -4.007107257843018, + "step": 2760 + }, + { + "epoch": 0.72, + "grad_norm": 18.5, + "learning_rate": 1.0669947197689034e-06, + "logits/chosen": -1.15623140335083, + "logits/rejected": -1.0121409893035889, + "logps/chosen": -561.9942626953125, + "logps/rejected": -639.5707397460938, + "loss": 0.5067, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.893584728240967, + "rewards/margins": 1.0627275705337524, + "rewards/rejected": -3.956312656402588, + "step": 2770 + }, + { + "epoch": 0.73, + "grad_norm": 9.4375, + "learning_rate": 1.048335603051291e-06, + "logits/chosen": -1.1546833515167236, + "logits/rejected": -1.0220603942871094, + "logps/chosen": -599.4776611328125, + "logps/rejected": -710.4974975585938, + "loss": 0.4331, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1167383193969727, + "rewards/margins": 1.39237380027771, + "rewards/rejected": -4.5091118812561035, + "step": 2780 + }, + { + "epoch": 0.73, + "grad_norm": 9.75, + "learning_rate": 1.0297976998930665e-06, + "logits/chosen": -1.1516591310501099, + "logits/rejected": -1.0285645723342896, + "logps/chosen": -560.0816650390625, + "logps/rejected": -675.4591064453125, + "loss": 0.4367, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9946718215942383, + "rewards/margins": 1.4317247867584229, + "rewards/rejected": -4.42639684677124, + "step": 2790 + }, + { + "epoch": 0.73, + "grad_norm": 10.9375, + "learning_rate": 1.0113825582038078e-06, + "logits/chosen": -1.1821314096450806, + "logits/rejected": -1.0650185346603394, + "logps/chosen": -576.8660278320312, + "logps/rejected": -679.5147705078125, + "loss": 0.4518, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.115962266921997, + "rewards/margins": 1.1937239170074463, + "rewards/rejected": -4.309685707092285, + "step": 2800 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -1.053481936454773, + "eval_logits/rejected": -0.9260234236717224, + "eval_logps/chosen": -571.3138427734375, + "eval_logps/rejected": -668.0293579101562, + "eval_loss": 0.49084553122520447, + "eval_rewards/accuracies": 0.7415000200271606, + "eval_rewards/chosen": -3.066617965698242, + "eval_rewards/margins": 1.1675963401794434, + "eval_rewards/rejected": -4.2342143058776855, + "eval_runtime": 382.1708, + "eval_samples_per_second": 5.233, + "eval_steps_per_second": 0.654, + "step": 2800 + }, + { + "epoch": 0.74, + "grad_norm": 10.625, + "learning_rate": 9.930917156425477e-07, + "logits/chosen": -1.1559561491012573, + "logits/rejected": -1.0568530559539795, + "logps/chosen": -582.1268310546875, + "logps/rejected": -691.96875, + "loss": 0.5368, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.210031509399414, + "rewards/margins": 1.1272989511489868, + "rewards/rejected": -4.337330341339111, + "step": 2810 + }, + { + "epoch": 0.74, + "grad_norm": 15.25, + "learning_rate": 9.749266994893756e-07, + "logits/chosen": -1.0973955392837524, + "logits/rejected": -0.9485132098197937, + "logps/chosen": -550.6517333984375, + "logps/rejected": -629.6903686523438, + "loss": 0.5621, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.0995354652404785, + "rewards/margins": 0.9246597290039062, + "rewards/rejected": -4.024195671081543, + "step": 2820 + }, + { + "epoch": 0.74, + "grad_norm": 15.3125, + "learning_rate": 9.56889026517913e-07, + "logits/chosen": -1.1514110565185547, + "logits/rejected": -1.0361002683639526, + "logps/chosen": -582.6224365234375, + "logps/rejected": -664.3800659179688, + "loss": 0.5019, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.2128403186798096, + "rewards/margins": 1.0774794816970825, + "rewards/rejected": -4.290319442749023, + "step": 2830 + }, + { + "epoch": 0.74, + "grad_norm": 7.03125, + "learning_rate": 9.389802028686617e-07, + "logits/chosen": -1.2338387966156006, + "logits/rejected": -1.1366431713104248, + "logps/chosen": -566.8738403320312, + "logps/rejected": -616.0252685546875, + "loss": 0.5826, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.0610568523406982, + "rewards/margins": 0.8211328387260437, + "rewards/rejected": -3.882189989089966, + "step": 2840 + }, + { + "epoch": 0.75, + "grad_norm": 9.75, + "learning_rate": 9.212017239232427e-07, + "logits/chosen": -1.1542332172393799, + "logits/rejected": -1.017268180847168, + "logps/chosen": -568.286376953125, + "logps/rejected": -668.4588623046875, + "loss": 0.4741, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9533185958862305, + "rewards/margins": 1.2311924695968628, + "rewards/rejected": -4.184511184692383, + "step": 2850 + }, + { + "epoch": 0.75, + "grad_norm": 10.0625, + "learning_rate": 9.03555074179533e-07, + "logits/chosen": -1.1374441385269165, + "logits/rejected": -1.1105449199676514, + "logps/chosen": -544.0662231445312, + "logps/rejected": -676.7945556640625, + "loss": 0.446, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.872863292694092, + "rewards/margins": 1.2689627408981323, + "rewards/rejected": -4.1418256759643555, + "step": 2860 + }, + { + "epoch": 0.75, + "grad_norm": 14.5625, + "learning_rate": 8.860417271277067e-07, + "logits/chosen": -1.263672947883606, + "logits/rejected": -1.2044599056243896, + "logps/chosen": -563.6286010742188, + "logps/rejected": -651.6553955078125, + "loss": 0.4788, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9439358711242676, + "rewards/margins": 0.9601505398750305, + "rewards/rejected": -3.9040865898132324, + "step": 2870 + }, + { + "epoch": 0.75, + "grad_norm": 8.75, + "learning_rate": 8.686631451272029e-07, + "logits/chosen": -1.2087829113006592, + "logits/rejected": -1.0665159225463867, + "logps/chosen": -564.14892578125, + "logps/rejected": -660.1915893554688, + "loss": 0.4861, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1072099208831787, + "rewards/margins": 1.2149550914764404, + "rewards/rejected": -4.322165489196777, + "step": 2880 + }, + { + "epoch": 0.76, + "grad_norm": 8.625, + "learning_rate": 8.514207792846168e-07, + "logits/chosen": -1.2422146797180176, + "logits/rejected": -1.1245746612548828, + "logps/chosen": -556.6324462890625, + "logps/rejected": -642.3776245117188, + "loss": 0.4902, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.0577263832092285, + "rewards/margins": 1.1418030261993408, + "rewards/rejected": -4.19952917098999, + "step": 2890 + }, + { + "epoch": 0.76, + "grad_norm": 8.0625, + "learning_rate": 8.343160693325356e-07, + "logits/chosen": -1.1230237483978271, + "logits/rejected": -1.0151801109313965, + "logps/chosen": -566.5771484375, + "logps/rejected": -679.12646484375, + "loss": 0.5018, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.1102497577667236, + "rewards/margins": 1.1686756610870361, + "rewards/rejected": -4.278925895690918, + "step": 2900 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -1.059489130973816, + "eval_logits/rejected": -0.9320334792137146, + "eval_logps/chosen": -574.426025390625, + "eval_logps/rejected": -668.4285278320312, + "eval_loss": 0.4876534342765808, + "eval_rewards/accuracies": 0.7465000152587891, + "eval_rewards/chosen": -3.0977394580841064, + "eval_rewards/margins": 1.1404662132263184, + "eval_rewards/rejected": -4.238205432891846, + "eval_runtime": 382.316, + "eval_samples_per_second": 5.231, + "eval_steps_per_second": 0.654, + "step": 2900 + }, + { + "epoch": 0.76, + "grad_norm": 8.125, + "learning_rate": 8.173504435093174e-07, + "logits/chosen": -1.1287494897842407, + "logits/rejected": -0.955623984336853, + "logps/chosen": -547.8873291015625, + "logps/rejected": -640.971923828125, + "loss": 0.477, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0631890296936035, + "rewards/margins": 1.2520211935043335, + "rewards/rejected": -4.315210342407227, + "step": 2910 + }, + { + "epoch": 0.76, + "grad_norm": 6.5625, + "learning_rate": 8.00525318439836e-07, + "logits/chosen": -1.158349871635437, + "logits/rejected": -1.0400350093841553, + "logps/chosen": -583.4833374023438, + "logps/rejected": -674.5729370117188, + "loss": 0.5408, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.0665407180786133, + "rewards/margins": 0.9871135950088501, + "rewards/rejected": -4.053654193878174, + "step": 2920 + }, + { + "epoch": 0.77, + "grad_norm": 7.6875, + "learning_rate": 7.838420990171927e-07, + "logits/chosen": -1.2469195127487183, + "logits/rejected": -1.0984286069869995, + "logps/chosen": -567.165283203125, + "logps/rejected": -650.6731567382812, + "loss": 0.5017, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.977949380874634, + "rewards/margins": 1.090990424156189, + "rewards/rejected": -4.068940162658691, + "step": 2930 + }, + { + "epoch": 0.77, + "grad_norm": 7.5625, + "learning_rate": 7.673021782854084e-07, + "logits/chosen": -1.1217727661132812, + "logits/rejected": -0.9839452505111694, + "logps/chosen": -561.6543579101562, + "logps/rejected": -643.6695556640625, + "loss": 0.4898, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.009657382965088, + "rewards/margins": 1.2389792203903198, + "rewards/rejected": -4.248636722564697, + "step": 2940 + }, + { + "epoch": 0.77, + "grad_norm": 9.9375, + "learning_rate": 7.509069373231039e-07, + "logits/chosen": -1.129913568496704, + "logits/rejected": -1.0110609531402588, + "logps/chosen": -554.6318969726562, + "logps/rejected": -622.6085205078125, + "loss": 0.5441, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0194828510284424, + "rewards/margins": 0.9275726079940796, + "rewards/rejected": -3.9470553398132324, + "step": 2950 + }, + { + "epoch": 0.77, + "grad_norm": 7.71875, + "learning_rate": 7.346577451281822e-07, + "logits/chosen": -1.1370588541030884, + "logits/rejected": -1.0633890628814697, + "logps/chosen": -551.51123046875, + "logps/rejected": -660.9559936523438, + "loss": 0.4596, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.892915725708008, + "rewards/margins": 1.3355481624603271, + "rewards/rejected": -4.228463649749756, + "step": 2960 + }, + { + "epoch": 0.78, + "grad_norm": 18.625, + "learning_rate": 7.185559585035138e-07, + "logits/chosen": -1.1904377937316895, + "logits/rejected": -1.0318008661270142, + "logps/chosen": -591.028564453125, + "logps/rejected": -693.4492797851562, + "loss": 0.4733, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.0825228691101074, + "rewards/margins": 1.1828874349594116, + "rewards/rejected": -4.26540994644165, + "step": 2970 + }, + { + "epoch": 0.78, + "grad_norm": 7.78125, + "learning_rate": 7.026029219436504e-07, + "logits/chosen": -1.2153565883636475, + "logits/rejected": -1.0524095296859741, + "logps/chosen": -546.4449462890625, + "logps/rejected": -655.5341186523438, + "loss": 0.4637, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9415435791015625, + "rewards/margins": 1.2188594341278076, + "rewards/rejected": -4.160403251647949, + "step": 2980 + }, + { + "epoch": 0.78, + "grad_norm": 9.5, + "learning_rate": 6.867999675225523e-07, + "logits/chosen": -1.2460225820541382, + "logits/rejected": -1.1109936237335205, + "logps/chosen": -518.8594970703125, + "logps/rejected": -621.4867553710938, + "loss": 0.4754, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8794169425964355, + "rewards/margins": 1.1684167385101318, + "rewards/rejected": -4.047833442687988, + "step": 2990 + }, + { + "epoch": 0.79, + "grad_norm": 10.375, + "learning_rate": 6.711484147823663e-07, + "logits/chosen": -1.1477627754211426, + "logits/rejected": -1.0689526796340942, + "logps/chosen": -520.4979858398438, + "logps/rejected": -650.7647094726562, + "loss": 0.4592, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.862730026245117, + "rewards/margins": 1.2541263103485107, + "rewards/rejected": -4.116856575012207, + "step": 3000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -1.0787907838821411, + "eval_logits/rejected": -0.9509702324867249, + "eval_logps/chosen": -563.9876708984375, + "eval_logps/rejected": -655.9471435546875, + "eval_loss": 0.48733198642730713, + "eval_rewards/accuracies": 0.7459999918937683, + "eval_rewards/chosen": -2.993356466293335, + "eval_rewards/margins": 1.1200352907180786, + "eval_rewards/rejected": -4.113391399383545, + "eval_runtime": 382.8007, + "eval_samples_per_second": 5.225, + "eval_steps_per_second": 0.653, + "step": 3000 + }, + { + "epoch": 0.79, + "grad_norm": 11.25, + "learning_rate": 6.556495706232413e-07, + "logits/chosen": -1.1598658561706543, + "logits/rejected": -1.0877033472061157, + "logps/chosen": -578.8084106445312, + "logps/rejected": -665.4705200195312, + "loss": 0.5453, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.1011300086975098, + "rewards/margins": 1.050903081893921, + "rewards/rejected": -4.152032852172852, + "step": 3010 + }, + { + "epoch": 0.79, + "grad_norm": 9.8125, + "learning_rate": 6.403047291942057e-07, + "logits/chosen": -1.0840625762939453, + "logits/rejected": -0.9331427812576294, + "logps/chosen": -521.8424682617188, + "logps/rejected": -612.9337768554688, + "loss": 0.495, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.95574688911438, + "rewards/margins": 1.1547616720199585, + "rewards/rejected": -4.110508441925049, + "step": 3020 + }, + { + "epoch": 0.79, + "grad_norm": 12.375, + "learning_rate": 6.251151717851023e-07, + "logits/chosen": -1.1582403182983398, + "logits/rejected": -1.0655838251113892, + "logps/chosen": -526.1175537109375, + "logps/rejected": -627.6626586914062, + "loss": 0.4861, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9271697998046875, + "rewards/margins": 1.1482912302017212, + "rewards/rejected": -4.075460910797119, + "step": 3030 + }, + { + "epoch": 0.8, + "grad_norm": 6.25, + "learning_rate": 6.100821667196041e-07, + "logits/chosen": -1.323209524154663, + "logits/rejected": -1.0637619495391846, + "logps/chosen": -561.310791015625, + "logps/rejected": -609.7546997070312, + "loss": 0.4726, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.856724500656128, + "rewards/margins": 1.1393463611602783, + "rewards/rejected": -3.9960708618164062, + "step": 3040 + }, + { + "epoch": 0.8, + "grad_norm": 55.5, + "learning_rate": 5.952069692493062e-07, + "logits/chosen": -1.1378008127212524, + "logits/rejected": -1.033092737197876, + "logps/chosen": -511.969482421875, + "logps/rejected": -648.4796752929688, + "loss": 0.4149, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.8365421295166016, + "rewards/margins": 1.3306509256362915, + "rewards/rejected": -4.1671929359436035, + "step": 3050 + }, + { + "epoch": 0.8, + "grad_norm": 10.5625, + "learning_rate": 5.80490821448918e-07, + "logits/chosen": -1.1030110120773315, + "logits/rejected": -1.0928280353546143, + "logps/chosen": -549.79052734375, + "logps/rejected": -727.48876953125, + "loss": 0.4284, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8726837635040283, + "rewards/margins": 1.3525440692901611, + "rewards/rejected": -4.225228309631348, + "step": 3060 + }, + { + "epoch": 0.8, + "grad_norm": 9.0625, + "learning_rate": 5.659349521125459e-07, + "logits/chosen": -1.2849022150039673, + "logits/rejected": -1.2295571565628052, + "logps/chosen": -560.9410400390625, + "logps/rejected": -645.2173461914062, + "loss": 0.4973, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7952258586883545, + "rewards/margins": 1.048758864402771, + "rewards/rejected": -3.843984603881836, + "step": 3070 + }, + { + "epoch": 0.81, + "grad_norm": 6.90625, + "learning_rate": 5.5154057665109e-07, + "logits/chosen": -1.2467188835144043, + "logits/rejected": -1.0997190475463867, + "logps/chosen": -557.9779052734375, + "logps/rejected": -661.7819213867188, + "loss": 0.4889, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9708826541900635, + "rewards/margins": 1.3023018836975098, + "rewards/rejected": -4.273184776306152, + "step": 3080 + }, + { + "epoch": 0.81, + "grad_norm": 8.25, + "learning_rate": 5.373088969907586e-07, + "logits/chosen": -1.2789522409439087, + "logits/rejected": -1.0984174013137817, + "logps/chosen": -573.76123046875, + "logps/rejected": -637.1810302734375, + "loss": 0.4581, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.969475269317627, + "rewards/margins": 1.136474370956421, + "rewards/rejected": -4.105949878692627, + "step": 3090 + }, + { + "epoch": 0.81, + "grad_norm": 8.625, + "learning_rate": 5.23241101472709e-07, + "logits/chosen": -1.1879446506500244, + "logits/rejected": -1.0638211965560913, + "logps/chosen": -563.8876342773438, + "logps/rejected": -645.8259887695312, + "loss": 0.4905, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.8758597373962402, + "rewards/margins": 1.0394397974014282, + "rewards/rejected": -3.9152991771698, + "step": 3100 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -1.0740700960159302, + "eval_logits/rejected": -0.9464629888534546, + "eval_logps/chosen": -562.904296875, + "eval_logps/rejected": -656.5853271484375, + "eval_loss": 0.48781928420066833, + "eval_rewards/accuracies": 0.7429999709129333, + "eval_rewards/chosen": -2.982522964477539, + "eval_rewards/margins": 1.1372504234313965, + "eval_rewards/rejected": -4.1197733879089355, + "eval_runtime": 382.0441, + "eval_samples_per_second": 5.235, + "eval_steps_per_second": 0.654, + "step": 3100 + }, + { + "epoch": 0.81, + "grad_norm": 8.5625, + "learning_rate": 5.09338364753818e-07, + "logits/chosen": -1.2681617736816406, + "logits/rejected": -1.0949214696884155, + "logps/chosen": -578.9161376953125, + "logps/rejected": -673.3041381835938, + "loss": 0.5304, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9349396228790283, + "rewards/margins": 1.1018182039260864, + "rewards/rejected": -4.036757469177246, + "step": 3110 + }, + { + "epoch": 0.82, + "grad_norm": 10.125, + "learning_rate": 4.956018477086005e-07, + "logits/chosen": -1.2264713048934937, + "logits/rejected": -1.0714534521102905, + "logps/chosen": -574.7757568359375, + "logps/rejected": -661.6316528320312, + "loss": 0.5111, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.056429624557495, + "rewards/margins": 1.1420024633407593, + "rewards/rejected": -4.198431968688965, + "step": 3120 + }, + { + "epoch": 0.82, + "grad_norm": 11.625, + "learning_rate": 4.820326973322764e-07, + "logits/chosen": -1.1282936334609985, + "logits/rejected": -1.0485918521881104, + "logps/chosen": -566.1331787109375, + "logps/rejected": -665.1694946289062, + "loss": 0.5658, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1516964435577393, + "rewards/margins": 1.0504977703094482, + "rewards/rejected": -4.202193737030029, + "step": 3130 + }, + { + "epoch": 0.82, + "grad_norm": 10.25, + "learning_rate": 4.686320466449981e-07, + "logits/chosen": -1.1074498891830444, + "logits/rejected": -0.9338695406913757, + "logps/chosen": -530.6743774414062, + "logps/rejected": -670.0709838867188, + "loss": 0.4495, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.914252281188965, + "rewards/margins": 1.4155068397521973, + "rewards/rejected": -4.329759120941162, + "step": 3140 + }, + { + "epoch": 0.82, + "grad_norm": 8.1875, + "learning_rate": 4.554010145972418e-07, + "logits/chosen": -1.2932242155075073, + "logits/rejected": -1.10805344581604, + "logps/chosen": -569.38818359375, + "logps/rejected": -671.8726806640625, + "loss": 0.551, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.0557217597961426, + "rewards/margins": 1.1381008625030518, + "rewards/rejected": -4.193822860717773, + "step": 3150 + }, + { + "epoch": 0.83, + "grad_norm": 8.75, + "learning_rate": 4.4234070597637455e-07, + "logits/chosen": -1.1201348304748535, + "logits/rejected": -1.0320645570755005, + "logps/chosen": -575.7613525390625, + "logps/rejected": -669.4164428710938, + "loss": 0.5272, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0022788047790527, + "rewards/margins": 1.0558512210845947, + "rewards/rejected": -4.058130264282227, + "step": 3160 + }, + { + "epoch": 0.83, + "grad_norm": 6.5625, + "learning_rate": 4.2945221131440783e-07, + "logits/chosen": -1.114639401435852, + "logits/rejected": -0.9161049127578735, + "logps/chosen": -552.2017211914062, + "logps/rejected": -653.031005859375, + "loss": 0.4203, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.864920139312744, + "rewards/margins": 1.3247652053833008, + "rewards/rejected": -4.189684867858887, + "step": 3170 + }, + { + "epoch": 0.83, + "grad_norm": 9.25, + "learning_rate": 4.167366067969381e-07, + "logits/chosen": -1.216722846031189, + "logits/rejected": -1.144590139389038, + "logps/chosen": -516.3905639648438, + "logps/rejected": -639.3621826171875, + "loss": 0.4982, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9348020553588867, + "rewards/margins": 0.9914267659187317, + "rewards/rejected": -3.9262290000915527, + "step": 3180 + }, + { + "epoch": 0.83, + "grad_norm": 6.125, + "learning_rate": 4.041949541732826e-07, + "logits/chosen": -1.1988582611083984, + "logits/rejected": -1.1241180896759033, + "logps/chosen": -567.2083740234375, + "logps/rejected": -659.522216796875, + "loss": 0.5194, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.033240795135498, + "rewards/margins": 1.0658702850341797, + "rewards/rejected": -4.0991106033325195, + "step": 3190 + }, + { + "epoch": 0.84, + "grad_norm": 10.1875, + "learning_rate": 3.9182830066782614e-07, + "logits/chosen": -1.1077312231063843, + "logits/rejected": -1.0953607559204102, + "logps/chosen": -557.6238403320312, + "logps/rejected": -689.794921875, + "loss": 0.485, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.9987998008728027, + "rewards/margins": 1.1976327896118164, + "rewards/rejected": -4.196433067321777, + "step": 3200 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -1.0807329416275024, + "eval_logits/rejected": -0.9531368613243103, + "eval_logps/chosen": -559.239990234375, + "eval_logps/rejected": -652.1516723632812, + "eval_loss": 0.4873969852924347, + "eval_rewards/accuracies": 0.7455000281333923, + "eval_rewards/chosen": -2.9458799362182617, + "eval_rewards/margins": 1.1295573711395264, + "eval_rewards/rejected": -4.075437068939209, + "eval_runtime": 381.6886, + "eval_samples_per_second": 5.24, + "eval_steps_per_second": 0.655, + "step": 3200 + }, + { + "epoch": 0.84, + "grad_norm": 8.3125, + "learning_rate": 3.796376788925771e-07, + "logits/chosen": -1.1163936853408813, + "logits/rejected": -1.0554332733154297, + "logps/chosen": -541.477294921875, + "logps/rejected": -619.0269165039062, + "loss": 0.4946, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.800494909286499, + "rewards/margins": 1.0160177946090698, + "rewards/rejected": -3.8165130615234375, + "step": 3210 + }, + { + "epoch": 0.84, + "grad_norm": 7.625, + "learning_rate": 3.676241067609465e-07, + "logits/chosen": -1.2064073085784912, + "logits/rejected": -1.0841269493103027, + "logps/chosen": -582.91259765625, + "logps/rejected": -648.9725952148438, + "loss": 0.5138, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.892125129699707, + "rewards/margins": 1.091489315032959, + "rewards/rejected": -3.983614444732666, + "step": 3220 + }, + { + "epoch": 0.85, + "grad_norm": 15.1875, + "learning_rate": 3.5578858740274976e-07, + "logits/chosen": -1.123425841331482, + "logits/rejected": -1.0302746295928955, + "logps/chosen": -566.611328125, + "logps/rejected": -648.7924194335938, + "loss": 0.5326, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.048774003982544, + "rewards/margins": 0.9477185010910034, + "rewards/rejected": -3.996492385864258, + "step": 3230 + }, + { + "epoch": 0.85, + "grad_norm": 11.5625, + "learning_rate": 3.44132109080447e-07, + "logits/chosen": -1.3182079792022705, + "logits/rejected": -1.1424782276153564, + "logps/chosen": -549.4573364257812, + "logps/rejected": -634.7244873046875, + "loss": 0.4425, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8423376083374023, + "rewards/margins": 1.2275350093841553, + "rewards/rejected": -4.069872856140137, + "step": 3240 + }, + { + "epoch": 0.85, + "grad_norm": 12.125, + "learning_rate": 3.3265564510662344e-07, + "logits/chosen": -1.2581889629364014, + "logits/rejected": -1.1089788675308228, + "logps/chosen": -572.9723510742188, + "logps/rejected": -676.4666137695312, + "loss": 0.4207, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.796302318572998, + "rewards/margins": 1.2862600088119507, + "rewards/rejected": -4.082562446594238, + "step": 3250 + }, + { + "epoch": 0.85, + "grad_norm": 14.3125, + "learning_rate": 3.213601537627195e-07, + "logits/chosen": -1.1619012355804443, + "logits/rejected": -1.0473229885101318, + "logps/chosen": -574.4371948242188, + "logps/rejected": -662.361083984375, + "loss": 0.5456, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.1731839179992676, + "rewards/margins": 1.0502443313598633, + "rewards/rejected": -4.223428249359131, + "step": 3260 + }, + { + "epoch": 0.86, + "grad_norm": 12.0, + "learning_rate": 3.1024657821901063e-07, + "logits/chosen": -1.2181814908981323, + "logits/rejected": -1.1487758159637451, + "logps/chosen": -531.4067993164062, + "logps/rejected": -627.6771240234375, + "loss": 0.5005, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8366494178771973, + "rewards/margins": 1.1211111545562744, + "rewards/rejected": -3.9577605724334717, + "step": 3270 + }, + { + "epoch": 0.86, + "grad_norm": 14.25, + "learning_rate": 2.9931584645585654e-07, + "logits/chosen": -1.147289514541626, + "logits/rejected": -1.1335127353668213, + "logps/chosen": -557.3380737304688, + "logps/rejected": -666.0869140625, + "loss": 0.5042, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.828235626220703, + "rewards/margins": 1.0581908226013184, + "rewards/rejected": -3.8864264488220215, + "step": 3280 + }, + { + "epoch": 0.86, + "grad_norm": 6.96875, + "learning_rate": 2.885688711862136e-07, + "logits/chosen": -1.1895829439163208, + "logits/rejected": -1.1866552829742432, + "logps/chosen": -561.8271484375, + "logps/rejected": -686.0377197265625, + "loss": 0.51, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.007755756378174, + "rewards/margins": 1.261817216873169, + "rewards/rejected": -4.269573211669922, + "step": 3290 + }, + { + "epoch": 0.86, + "grad_norm": 7.6875, + "learning_rate": 2.7800654977942486e-07, + "logits/chosen": -1.1794744729995728, + "logits/rejected": -1.0672075748443604, + "logps/chosen": -547.8685302734375, + "logps/rejected": -650.7493286132812, + "loss": 0.5157, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9051201343536377, + "rewards/margins": 1.0670777559280396, + "rewards/rejected": -3.972198009490967, + "step": 3300 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -1.0755009651184082, + "eval_logits/rejected": -0.9480787515640259, + "eval_logps/chosen": -560.1488647460938, + "eval_logps/rejected": -652.9912109375, + "eval_loss": 0.4874354600906372, + "eval_rewards/accuracies": 0.7444999814033508, + "eval_rewards/chosen": -2.9549689292907715, + "eval_rewards/margins": 1.128864049911499, + "eval_rewards/rejected": -4.083832740783691, + "eval_runtime": 383.0008, + "eval_samples_per_second": 5.222, + "eval_steps_per_second": 0.653, + "step": 3300 + }, + { + "epoch": 0.87, + "grad_norm": 12.75, + "learning_rate": 2.6762976418628797e-07, + "logits/chosen": -1.1829874515533447, + "logits/rejected": -1.0443121194839478, + "logps/chosen": -508.328857421875, + "logps/rejected": -573.6398315429688, + "loss": 0.5093, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.785529851913452, + "rewards/margins": 1.0893114805221558, + "rewards/rejected": -3.8748409748077393, + "step": 3310 + }, + { + "epoch": 0.87, + "grad_norm": 9.375, + "learning_rate": 2.5743938086541354e-07, + "logits/chosen": -1.1776726245880127, + "logits/rejected": -1.0596325397491455, + "logps/chosen": -558.5306396484375, + "logps/rejected": -649.6300048828125, + "loss": 0.4969, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.942483425140381, + "rewards/margins": 1.159317135810852, + "rewards/rejected": -4.101800441741943, + "step": 3320 + }, + { + "epoch": 0.87, + "grad_norm": 13.0625, + "learning_rate": 2.4743625071087574e-07, + "logits/chosen": -1.3345047235488892, + "logits/rejected": -1.1562585830688477, + "logps/chosen": -557.7296142578125, + "logps/rejected": -661.87109375, + "loss": 0.4702, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8015923500061035, + "rewards/margins": 1.3364170789718628, + "rewards/rejected": -4.138009548187256, + "step": 3330 + }, + { + "epoch": 0.87, + "grad_norm": 11.875, + "learning_rate": 2.3762120898116498e-07, + "logits/chosen": -1.1994738578796387, + "logits/rejected": -1.097899079322815, + "logps/chosen": -579.8328857421875, + "logps/rejected": -674.6861572265625, + "loss": 0.4926, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.1103997230529785, + "rewards/margins": 1.0284258127212524, + "rewards/rejected": -4.138825416564941, + "step": 3340 + }, + { + "epoch": 0.88, + "grad_norm": 8.0, + "learning_rate": 2.2799507522944048e-07, + "logits/chosen": -1.1523630619049072, + "logits/rejected": -1.0521692037582397, + "logps/chosen": -551.5980224609375, + "logps/rejected": -671.2841186523438, + "loss": 0.4455, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8580057621002197, + "rewards/margins": 1.2792617082595825, + "rewards/rejected": -4.137267112731934, + "step": 3350 + }, + { + "epoch": 0.88, + "grad_norm": 8.5625, + "learning_rate": 2.1855865323510056e-07, + "logits/chosen": -1.2028191089630127, + "logits/rejected": -1.0033330917358398, + "logps/chosen": -563.6111450195312, + "logps/rejected": -704.59228515625, + "loss": 0.4213, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9093270301818848, + "rewards/margins": 1.4688284397125244, + "rewards/rejected": -4.378155708312988, + "step": 3360 + }, + { + "epoch": 0.88, + "grad_norm": 7.3125, + "learning_rate": 2.0931273093666575e-07, + "logits/chosen": -1.1482703685760498, + "logits/rejected": -1.0027369260787964, + "logps/chosen": -540.7926635742188, + "logps/rejected": -644.9227294921875, + "loss": 0.439, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.006060838699341, + "rewards/margins": 1.2438604831695557, + "rewards/rejected": -4.2499213218688965, + "step": 3370 + }, + { + "epoch": 0.88, + "grad_norm": 13.625, + "learning_rate": 2.002580803659873e-07, + "logits/chosen": -1.1630356311798096, + "logits/rejected": -1.0312206745147705, + "logps/chosen": -559.203125, + "logps/rejected": -652.8720092773438, + "loss": 0.4651, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.046480417251587, + "rewards/margins": 1.1182465553283691, + "rewards/rejected": -4.164727210998535, + "step": 3380 + }, + { + "epoch": 0.89, + "grad_norm": 6.71875, + "learning_rate": 1.913954575837826e-07, + "logits/chosen": -1.2169429063796997, + "logits/rejected": -0.9856022596359253, + "logps/chosen": -575.2197875976562, + "logps/rejected": -634.4151000976562, + "loss": 0.4808, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0310537815093994, + "rewards/margins": 1.0889911651611328, + "rewards/rejected": -4.120044708251953, + "step": 3390 + }, + { + "epoch": 0.89, + "grad_norm": 10.3125, + "learning_rate": 1.827256026165028e-07, + "logits/chosen": -1.2307440042495728, + "logits/rejected": -1.0502979755401611, + "logps/chosen": -592.2626342773438, + "logps/rejected": -664.5699462890625, + "loss": 0.4474, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.771921396255493, + "rewards/margins": 1.2936856746673584, + "rewards/rejected": -4.065607070922852, + "step": 3400 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -1.077279806137085, + "eval_logits/rejected": -0.9499141573905945, + "eval_logps/chosen": -561.6380615234375, + "eval_logps/rejected": -654.8016967773438, + "eval_loss": 0.4871050715446472, + "eval_rewards/accuracies": 0.7434999942779541, + "eval_rewards/chosen": -2.969860553741455, + "eval_rewards/margins": 1.1320772171020508, + "eval_rewards/rejected": -4.101937770843506, + "eval_runtime": 382.1089, + "eval_samples_per_second": 5.234, + "eval_steps_per_second": 0.654, + "step": 3400 + }, + { + "epoch": 0.89, + "grad_norm": 11.5625, + "learning_rate": 1.7424923939454274e-07, + "logits/chosen": -1.174843430519104, + "logits/rejected": -1.0021690130233765, + "logps/chosen": -579.2442626953125, + "logps/rejected": -661.9432373046875, + "loss": 0.4255, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.9789488315582275, + "rewards/margins": 1.2606755495071411, + "rewards/rejected": -4.239624500274658, + "step": 3410 + }, + { + "epoch": 0.9, + "grad_norm": 16.25, + "learning_rate": 1.6596707569179304e-07, + "logits/chosen": -1.2912896871566772, + "logits/rejected": -1.1392004489898682, + "logps/chosen": -576.8416748046875, + "logps/rejected": -653.64501953125, + "loss": 0.4901, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.956543207168579, + "rewards/margins": 1.119319200515747, + "rewards/rejected": -4.075861930847168, + "step": 3420 + }, + { + "epoch": 0.9, + "grad_norm": 9.625, + "learning_rate": 1.578798030665385e-07, + "logits/chosen": -1.2196199893951416, + "logits/rejected": -1.0388673543930054, + "logps/chosen": -565.8033447265625, + "logps/rejected": -686.4707641601562, + "loss": 0.4313, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9110517501831055, + "rewards/margins": 1.3773367404937744, + "rewards/rejected": -4.288388729095459, + "step": 3430 + }, + { + "epoch": 0.9, + "grad_norm": 8.25, + "learning_rate": 1.499880968037165e-07, + "logits/chosen": -1.1975353956222534, + "logits/rejected": -1.0588737726211548, + "logps/chosen": -544.4766845703125, + "logps/rejected": -618.7376098632812, + "loss": 0.513, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.888631820678711, + "rewards/margins": 1.1052820682525635, + "rewards/rejected": -3.9939143657684326, + "step": 3440 + }, + { + "epoch": 0.9, + "grad_norm": 14.5625, + "learning_rate": 1.4229261585852805e-07, + "logits/chosen": -1.230802297592163, + "logits/rejected": -1.1439770460128784, + "logps/chosen": -553.6980590820312, + "logps/rejected": -644.7520751953125, + "loss": 0.4489, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8526644706726074, + "rewards/margins": 1.1686475276947021, + "rewards/rejected": -4.0213117599487305, + "step": 3450 + }, + { + "epoch": 0.91, + "grad_norm": 10.6875, + "learning_rate": 1.3479400280141886e-07, + "logits/chosen": -1.1431211233139038, + "logits/rejected": -1.1035680770874023, + "logps/chosen": -544.6209106445312, + "logps/rejected": -662.7022705078125, + "loss": 0.4784, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9836173057556152, + "rewards/margins": 1.2009574174880981, + "rewards/rejected": -4.184575080871582, + "step": 3460 + }, + { + "epoch": 0.91, + "grad_norm": 9.625, + "learning_rate": 1.2749288376442044e-07, + "logits/chosen": -1.2415331602096558, + "logits/rejected": -1.0389716625213623, + "logps/chosen": -587.2742309570312, + "logps/rejected": -644.300048828125, + "loss": 0.4742, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9073996543884277, + "rewards/margins": 1.1453628540039062, + "rewards/rejected": -4.052762508392334, + "step": 3470 + }, + { + "epoch": 0.91, + "grad_norm": 8.875, + "learning_rate": 1.203898683888713e-07, + "logits/chosen": -1.2313424348831177, + "logits/rejected": -1.1037070751190186, + "logps/chosen": -548.36962890625, + "logps/rejected": -643.1497192382812, + "loss": 0.5615, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.043722152709961, + "rewards/margins": 0.9787699580192566, + "rewards/rejected": -4.022491931915283, + "step": 3480 + }, + { + "epoch": 0.91, + "grad_norm": 8.5625, + "learning_rate": 1.1348554977451132e-07, + "logits/chosen": -1.2611653804779053, + "logits/rejected": -1.1225281953811646, + "logps/chosen": -574.4703369140625, + "logps/rejected": -650.3907470703125, + "loss": 0.495, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.924811840057373, + "rewards/margins": 1.0881900787353516, + "rewards/rejected": -4.013001918792725, + "step": 3490 + }, + { + "epoch": 0.92, + "grad_norm": 6.875, + "learning_rate": 1.0678050442995802e-07, + "logits/chosen": -1.2225737571716309, + "logits/rejected": -1.0173273086547852, + "logps/chosen": -580.7540283203125, + "logps/rejected": -643.2467651367188, + "loss": 0.5379, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9996070861816406, + "rewards/margins": 1.0754629373550415, + "rewards/rejected": -4.075070381164551, + "step": 3500 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -1.074249267578125, + "eval_logits/rejected": -0.9468256831169128, + "eval_logps/chosen": -561.2808227539062, + "eval_logps/rejected": -654.5006103515625, + "eval_loss": 0.48737701773643494, + "eval_rewards/accuracies": 0.7429999709129333, + "eval_rewards/chosen": -2.9662883281707764, + "eval_rewards/margins": 1.1326382160186768, + "eval_rewards/rejected": -4.098926544189453, + "eval_runtime": 382.1229, + "eval_samples_per_second": 5.234, + "eval_steps_per_second": 0.654, + "step": 3500 + }, + { + "epoch": 0.92, + "grad_norm": 9.1875, + "learning_rate": 1.0027529222456755e-07, + "logits/chosen": -1.1973202228546143, + "logits/rejected": -1.0237270593643188, + "logps/chosen": -544.4231567382812, + "logps/rejected": -646.541015625, + "loss": 0.4368, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.902569532394409, + "rewards/margins": 1.1967476606369019, + "rewards/rejected": -4.0993170738220215, + "step": 3510 + }, + { + "epoch": 0.92, + "grad_norm": 8.4375, + "learning_rate": 9.397045634168766e-08, + "logits/chosen": -1.227426290512085, + "logits/rejected": -1.1496341228485107, + "logps/chosen": -555.9089965820312, + "logps/rejected": -687.0352783203125, + "loss": 0.4491, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.873260974884033, + "rewards/margins": 1.3088066577911377, + "rewards/rejected": -4.182066917419434, + "step": 3520 + }, + { + "epoch": 0.92, + "grad_norm": 10.9375, + "learning_rate": 8.78665232332998e-08, + "logits/chosen": -1.1654760837554932, + "logits/rejected": -1.0858592987060547, + "logps/chosen": -537.4627685546875, + "logps/rejected": -640.0810546875, + "loss": 0.489, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -3.0675268173217773, + "rewards/margins": 1.0338232517242432, + "rewards/rejected": -4.101349830627441, + "step": 3530 + }, + { + "epoch": 0.93, + "grad_norm": 8.1875, + "learning_rate": 8.196400257606208e-08, + "logits/chosen": -1.2670751810073853, + "logits/rejected": -1.104811191558838, + "logps/chosen": -576.2312622070312, + "logps/rejected": -708.0988159179688, + "loss": 0.4292, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9314615726470947, + "rewards/margins": 1.371382236480713, + "rewards/rejected": -4.3028435707092285, + "step": 3540 + }, + { + "epoch": 0.93, + "grad_norm": 9.5, + "learning_rate": 7.626338722875076e-08, + "logits/chosen": -1.1996467113494873, + "logits/rejected": -1.1349631547927856, + "logps/chosen": -546.021240234375, + "logps/rejected": -657.2860107421875, + "loss": 0.503, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9192748069763184, + "rewards/margins": 1.102920651435852, + "rewards/rejected": -4.022195816040039, + "step": 3550 + }, + { + "epoch": 0.93, + "grad_norm": 5.84375, + "learning_rate": 7.076515319110688e-08, + "logits/chosen": -1.2043834924697876, + "logits/rejected": -1.0872790813446045, + "logps/chosen": -546.2125854492188, + "logps/rejected": -628.4691162109375, + "loss": 0.5091, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.899247646331787, + "rewards/margins": 1.2382572889328003, + "rewards/rejected": -4.1375041007995605, + "step": 3560 + }, + { + "epoch": 0.93, + "grad_norm": 8.1875, + "learning_rate": 6.54697595640899e-08, + "logits/chosen": -1.2246639728546143, + "logits/rejected": -1.1050646305084229, + "logps/chosen": -588.5670166015625, + "logps/rejected": -679.2962646484375, + "loss": 0.4803, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.9634485244750977, + "rewards/margins": 1.1589770317077637, + "rewards/rejected": -4.1224260330200195, + "step": 3570 + }, + { + "epoch": 0.94, + "grad_norm": 9.125, + "learning_rate": 6.037764851154426e-08, + "logits/chosen": -1.2126811742782593, + "logits/rejected": -1.1511167287826538, + "logps/chosen": -555.2306518554688, + "logps/rejected": -671.8084716796875, + "loss": 0.5096, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.911715030670166, + "rewards/margins": 1.1182584762573242, + "rewards/rejected": -4.029973030090332, + "step": 3580 + }, + { + "epoch": 0.94, + "grad_norm": 7.1875, + "learning_rate": 5.548924522327748e-08, + "logits/chosen": -1.1890180110931396, + "logits/rejected": -1.0672190189361572, + "logps/chosen": -549.8150634765625, + "logps/rejected": -647.8394775390625, + "loss": 0.4832, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.880577564239502, + "rewards/margins": 1.1271222829818726, + "rewards/rejected": -4.007699489593506, + "step": 3590 + }, + { + "epoch": 0.94, + "grad_norm": 11.0625, + "learning_rate": 5.0804957879556915e-08, + "logits/chosen": -1.109243392944336, + "logits/rejected": -1.0201053619384766, + "logps/chosen": -514.1246337890625, + "logps/rejected": -630.8916625976562, + "loss": 0.464, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8855831623077393, + "rewards/margins": 1.122081995010376, + "rewards/rejected": -4.007665157318115, + "step": 3600 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -1.0748145580291748, + "eval_logits/rejected": -0.9474833607673645, + "eval_logps/chosen": -561.028564453125, + "eval_logps/rejected": -654.279052734375, + "eval_loss": 0.48736903071403503, + "eval_rewards/accuracies": 0.7425000071525574, + "eval_rewards/chosen": -2.9637651443481445, + "eval_rewards/margins": 1.1329458951950073, + "eval_rewards/rejected": -4.096711158752441, + "eval_runtime": 382.7111, + "eval_samples_per_second": 5.226, + "eval_steps_per_second": 0.653, + "step": 3600 + }, + { + "epoch": 0.94, + "grad_norm": 9.125, + "learning_rate": 4.632517761702815e-08, + "logits/chosen": -1.1433720588684082, + "logits/rejected": -1.0008645057678223, + "logps/chosen": -530.3574829101562, + "logps/rejected": -652.87255859375, + "loss": 0.4428, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9491429328918457, + "rewards/margins": 1.3483526706695557, + "rewards/rejected": -4.2974958419799805, + "step": 3610 + }, + { + "epoch": 0.95, + "grad_norm": 11.125, + "learning_rate": 4.205027849605359e-08, + "logits/chosen": -1.1681492328643799, + "logits/rejected": -1.0669422149658203, + "logps/chosen": -553.4034423828125, + "logps/rejected": -626.2314453125, + "loss": 0.5421, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0548007488250732, + "rewards/margins": 1.0290553569793701, + "rewards/rejected": -4.083855628967285, + "step": 3620 + }, + { + "epoch": 0.95, + "grad_norm": 9.9375, + "learning_rate": 3.798061746947995e-08, + "logits/chosen": -1.2855480909347534, + "logits/rejected": -1.1476643085479736, + "logps/chosen": -555.6473999023438, + "logps/rejected": -633.9293823242188, + "loss": 0.4785, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.9149746894836426, + "rewards/margins": 1.1746852397918701, + "rewards/rejected": -4.089660167694092, + "step": 3630 + }, + { + "epoch": 0.95, + "grad_norm": 10.25, + "learning_rate": 3.411653435283158e-08, + "logits/chosen": -1.1988470554351807, + "logits/rejected": -0.9911161661148071, + "logps/chosen": -560.5934448242188, + "logps/rejected": -617.925048828125, + "loss": 0.4611, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.873368740081787, + "rewards/margins": 1.1307556629180908, + "rewards/rejected": -4.004124641418457, + "step": 3640 + }, + { + "epoch": 0.96, + "grad_norm": 9.5, + "learning_rate": 3.04583517959367e-08, + "logits/chosen": -1.2440365552902222, + "logits/rejected": -1.0937076807022095, + "logps/chosen": -528.578125, + "logps/rejected": -617.3880004882812, + "loss": 0.448, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7646141052246094, + "rewards/margins": 1.2126356363296509, + "rewards/rejected": -3.9772496223449707, + "step": 3650 + }, + { + "epoch": 0.96, + "grad_norm": 10.0, + "learning_rate": 2.7006375255985984e-08, + "logits/chosen": -1.1879501342773438, + "logits/rejected": -1.1580005884170532, + "logps/chosen": -571.6791381835938, + "logps/rejected": -661.9193725585938, + "loss": 0.5788, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.06877064704895, + "rewards/margins": 0.8969556093215942, + "rewards/rejected": -3.965726375579834, + "step": 3660 + }, + { + "epoch": 0.96, + "grad_norm": 11.625, + "learning_rate": 2.3760892972027328e-08, + "logits/chosen": -1.303144931793213, + "logits/rejected": -1.1418662071228027, + "logps/chosen": -583.8892822265625, + "logps/rejected": -663.1383666992188, + "loss": 0.5326, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.113860845565796, + "rewards/margins": 1.1326040029525757, + "rewards/rejected": -4.246464729309082, + "step": 3670 + }, + { + "epoch": 0.96, + "grad_norm": 13.4375, + "learning_rate": 2.072217594089765e-08, + "logits/chosen": -1.156292200088501, + "logits/rejected": -1.146905541419983, + "logps/chosen": -559.3345336914062, + "logps/rejected": -672.4237060546875, + "loss": 0.4237, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.9779343605041504, + "rewards/margins": 1.253351092338562, + "rewards/rejected": -4.231285572052002, + "step": 3680 + }, + { + "epoch": 0.97, + "grad_norm": 8.9375, + "learning_rate": 1.789047789459375e-08, + "logits/chosen": -1.266901969909668, + "logits/rejected": -1.072322964668274, + "logps/chosen": -611.783203125, + "logps/rejected": -680.0989379882812, + "loss": 0.5071, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9480648040771484, + "rewards/margins": 1.1735531091690063, + "rewards/rejected": -4.121617794036865, + "step": 3690 + }, + { + "epoch": 0.97, + "grad_norm": 8.9375, + "learning_rate": 1.5266035279088708e-08, + "logits/chosen": -1.1054164171218872, + "logits/rejected": -0.985053539276123, + "logps/chosen": -610.8778076171875, + "logps/rejected": -699.9169921875, + "loss": 0.4729, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.1426022052764893, + "rewards/margins": 1.1523752212524414, + "rewards/rejected": -4.29497766494751, + "step": 3700 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -1.0769954919815063, + "eval_logits/rejected": -0.9495205879211426, + "eval_logps/chosen": -561.3129272460938, + "eval_logps/rejected": -654.6014404296875, + "eval_loss": 0.48729926347732544, + "eval_rewards/accuracies": 0.7444999814033508, + "eval_rewards/chosen": -2.966609239578247, + "eval_rewards/margins": 1.1333256959915161, + "eval_rewards/rejected": -4.099935054779053, + "eval_runtime": 382.1, + "eval_samples_per_second": 5.234, + "eval_steps_per_second": 0.654, + "step": 3700 + }, + { + "epoch": 0.97, + "grad_norm": 11.75, + "learning_rate": 1.2849067234584623e-08, + "logits/chosen": -1.0827583074569702, + "logits/rejected": -1.0175631046295166, + "logps/chosen": -534.8372192382812, + "logps/rejected": -647.8695678710938, + "loss": 0.4762, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9847655296325684, + "rewards/margins": 1.2061764001846313, + "rewards/rejected": -4.190942287445068, + "step": 3710 + }, + { + "epoch": 0.97, + "grad_norm": 11.1875, + "learning_rate": 1.0639775577218625e-08, + "logits/chosen": -1.0798698663711548, + "logits/rejected": -0.9149328470230103, + "logps/chosen": -549.2965087890625, + "logps/rejected": -631.1814575195312, + "loss": 0.5133, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.001096248626709, + "rewards/margins": 1.178213119506836, + "rewards/rejected": -4.179308891296387, + "step": 3720 + }, + { + "epoch": 0.98, + "grad_norm": 9.5, + "learning_rate": 8.638344782207486e-09, + "logits/chosen": -1.1081641912460327, + "logits/rejected": -1.0127241611480713, + "logps/chosen": -530.3636474609375, + "logps/rejected": -619.5350341796875, + "loss": 0.4791, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8507590293884277, + "rewards/margins": 1.1195757389068604, + "rewards/rejected": -3.97033429145813, + "step": 3730 + }, + { + "epoch": 0.98, + "grad_norm": 10.0, + "learning_rate": 6.84494196844715e-09, + "logits/chosen": -1.16922128200531, + "logits/rejected": -1.0506504774093628, + "logps/chosen": -563.3178100585938, + "logps/rejected": -685.6429443359375, + "loss": 0.4573, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.912113666534424, + "rewards/margins": 1.3391534090042114, + "rewards/rejected": -4.251267433166504, + "step": 3740 + }, + { + "epoch": 0.98, + "grad_norm": 10.1875, + "learning_rate": 5.259716884556121e-09, + "logits/chosen": -1.2230274677276611, + "logits/rejected": -1.0869773626327515, + "logps/chosen": -557.9898681640625, + "logps/rejected": -660.3572998046875, + "loss": 0.4564, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.9352307319641113, + "rewards/margins": 1.1718149185180664, + "rewards/rejected": -4.107045650482178, + "step": 3750 + }, + { + "epoch": 0.98, + "grad_norm": 9.75, + "learning_rate": 3.882801896372967e-09, + "logits/chosen": -1.2255470752716064, + "logits/rejected": -1.1375856399536133, + "logps/chosen": -556.98193359375, + "logps/rejected": -639.6429443359375, + "loss": 0.4908, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.91620135307312, + "rewards/margins": 1.1449532508850098, + "rewards/rejected": -4.061154842376709, + "step": 3760 + }, + { + "epoch": 0.99, + "grad_norm": 12.25, + "learning_rate": 2.7143119759026614e-09, + "logits/chosen": -1.242653727531433, + "logits/rejected": -1.0747482776641846, + "logps/chosen": -574.4716796875, + "logps/rejected": -665.8096313476562, + "loss": 0.4263, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.901991605758667, + "rewards/margins": 1.1683391332626343, + "rewards/rejected": -4.070330619812012, + "step": 3770 + }, + { + "epoch": 0.99, + "grad_norm": 9.4375, + "learning_rate": 1.754344691717591e-09, + "logits/chosen": -1.1282501220703125, + "logits/rejected": -1.0916457176208496, + "logps/chosen": -552.8446655273438, + "logps/rejected": -669.7666015625, + "loss": 0.5197, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.030531406402588, + "rewards/margins": 0.9483699798583984, + "rewards/rejected": -3.9789013862609863, + "step": 3780 + }, + { + "epoch": 0.99, + "grad_norm": 13.125, + "learning_rate": 1.0029802008096335e-09, + "logits/chosen": -1.1534841060638428, + "logits/rejected": -0.994836151599884, + "logps/chosen": -570.4867553710938, + "logps/rejected": -668.6637573242188, + "loss": 0.4803, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.9619479179382324, + "rewards/margins": 1.2098205089569092, + "rewards/rejected": -4.171768665313721, + "step": 3790 + }, + { + "epoch": 0.99, + "grad_norm": 8.5, + "learning_rate": 4.602812418974534e-10, + "logits/chosen": -1.2624783515930176, + "logits/rejected": -1.1238892078399658, + "logps/chosen": -582.1685180664062, + "logps/rejected": -673.0120239257812, + "loss": 0.5017, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.003277540206909, + "rewards/margins": 1.1538227796554565, + "rewards/rejected": -4.157099723815918, + "step": 3800 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -1.0723599195480347, + "eval_logits/rejected": -0.9449748396873474, + "eval_logps/chosen": -561.3216552734375, + "eval_logps/rejected": -654.607177734375, + "eval_loss": 0.48731154203414917, + "eval_rewards/accuracies": 0.7444999814033508, + "eval_rewards/chosen": -2.966696262359619, + "eval_rewards/margins": 1.133296012878418, + "eval_rewards/rejected": -4.099992275238037, + "eval_runtime": 382.0182, + "eval_samples_per_second": 5.235, + "eval_steps_per_second": 0.654, + "step": 3800 + }, + { + "epoch": 1.0, + "grad_norm": 9.875, + "learning_rate": 1.2629313018819312e-10, + "logits/chosen": -1.171769142150879, + "logits/rejected": -1.0495896339416504, + "logps/chosen": -542.8326416015625, + "logps/rejected": -627.7073974609375, + "loss": 0.5191, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.9036014080047607, + "rewards/margins": 1.005274772644043, + "rewards/rejected": -3.9088759422302246, + "step": 3810 + }, + { + "epoch": 1.0, + "grad_norm": 18.25, + "learning_rate": 1.0437535929996855e-12, + "logits/chosen": -1.1617281436920166, + "logits/rejected": -0.9952475428581238, + "logps/chosen": -585.9136962890625, + "logps/rejected": -680.009521484375, + "loss": 0.4659, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.991471767425537, + "rewards/margins": 1.3875491619110107, + "rewards/rejected": -4.379020690917969, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.5021860111574015, + "train_runtime": 41123.41, + "train_samples_per_second": 1.487, + "train_steps_per_second": 0.093 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}