{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 5000, "global_step": 20074, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004981568197668626, "grad_norm": 81.31034088134766, "learning_rate": 1.9999863931243543e-05, "logits/chosen": -19.35576057434082, "logits/rejected": -19.391923904418945, "logps/chosen": -488.51171875, "logps/rejected": -382.52825927734375, "loss": 0.6551, "rewards/accuracies": 0.59375, "rewards/chosen": 0.4239501953125, "rewards/margins": 1.4092838764190674, "rewards/rejected": -0.9853336215019226, "step": 100 }, { "epoch": 0.009963136395337252, "grad_norm": 27.6790828704834, "learning_rate": 1.9999455728677112e-05, "logits/chosen": -18.520322799682617, "logits/rejected": -18.58489227294922, "logps/chosen": -502.153564453125, "logps/rejected": -427.2685241699219, "loss": 1.088, "rewards/accuracies": 0.4699999988079071, "rewards/chosen": 1.2840694189071655, "rewards/margins": -0.12594786286354065, "rewards/rejected": 1.4100172519683838, "step": 200 }, { "epoch": 0.014944704593005878, "grad_norm": 14.743182182312012, "learning_rate": 1.999877540340943e-05, "logits/chosen": -18.121265411376953, "logits/rejected": -17.966760635375977, "logps/chosen": -480.9696960449219, "logps/rejected": -391.3818359375, "loss": 0.846, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 2.156686305999756, "rewards/margins": 0.4148028492927551, "rewards/rejected": 1.7418835163116455, "step": 300 }, { "epoch": 0.019926272790674503, "grad_norm": 0.3787066340446472, "learning_rate": 1.99978229739547e-05, "logits/chosen": -18.205398559570312, "logits/rejected": -18.042299270629883, "logps/chosen": -502.7016296386719, "logps/rejected": -388.99835205078125, "loss": 0.7988, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 2.600520372390747, "rewards/margins": 0.4970521926879883, "rewards/rejected": 2.1034679412841797, "step": 400 }, { "epoch": 0.02490784098834313, "grad_norm": 0.6821377873420715, "learning_rate": 1.9996598466232097e-05, "logits/chosen": -18.351791381835938, "logits/rejected": -18.350332260131836, "logps/chosen": -495.239501953125, "logps/rejected": -396.9656677246094, "loss": 0.9516, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 2.7651679515838623, "rewards/margins": 0.4416518807411194, "rewards/rejected": 2.3235161304473877, "step": 500 }, { "epoch": 0.029889409186011757, "grad_norm": 0.07361862808465958, "learning_rate": 1.9995101913565075e-05, "logits/chosen": -18.08759117126465, "logits/rejected": -18.078266143798828, "logps/chosen": -500.9162902832031, "logps/rejected": -413.58673095703125, "loss": 0.8741, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 3.083859443664551, "rewards/margins": 0.49825409054756165, "rewards/rejected": 2.5856053829193115, "step": 600 }, { "epoch": 0.034870977383680384, "grad_norm": 50.79634475708008, "learning_rate": 1.9993333356680442e-05, "logits/chosen": -17.93349838256836, "logits/rejected": -17.859838485717773, "logps/chosen": -576.14501953125, "logps/rejected": -481.7210693359375, "loss": 0.9994, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 2.9873363971710205, "rewards/margins": 0.3300691843032837, "rewards/rejected": 2.6572670936584473, "step": 700 }, { "epoch": 0.03985254558134901, "grad_norm": 11.607032775878906, "learning_rate": 1.999129284370727e-05, "logits/chosen": -18.006515502929688, "logits/rejected": -17.87860107421875, "logps/chosen": -511.5252990722656, "logps/rejected": -448.138916015625, "loss": 1.0173, "rewards/accuracies": 0.5299999713897705, "rewards/chosen": 3.2953906059265137, "rewards/margins": 0.5231221318244934, "rewards/rejected": 2.772268772125244, "step": 800 }, { "epoch": 0.04483411377901764, "grad_norm": 9.66622257232666, "learning_rate": 1.9988980430175565e-05, "logits/chosen": -17.94629669189453, "logits/rejected": -17.793624877929688, "logps/chosen": -471.1706237792969, "logps/rejected": -380.40625, "loss": 0.7616, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 3.2259023189544678, "rewards/margins": 0.9019778966903687, "rewards/rejected": 2.3239243030548096, "step": 900 }, { "epoch": 0.04981568197668626, "grad_norm": 33.115692138671875, "learning_rate": 1.998639617901478e-05, "logits/chosen": -18.29867935180664, "logits/rejected": -18.21681785583496, "logps/chosen": -492.39471435546875, "logps/rejected": -397.0972900390625, "loss": 0.8836, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 4.284320831298828, "rewards/margins": 1.2680495977401733, "rewards/rejected": 3.0162715911865234, "step": 1000 }, { "epoch": 0.05479725017435489, "grad_norm": 14.794266700744629, "learning_rate": 1.998354016055208e-05, "logits/chosen": -17.866899490356445, "logits/rejected": -17.793582916259766, "logps/chosen": -512.371337890625, "logps/rejected": -424.1750793457031, "loss": 1.2634, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 3.433595895767212, "rewards/margins": 0.3185270428657532, "rewards/rejected": 3.1150686740875244, "step": 1100 }, { "epoch": 0.059778818372023514, "grad_norm": 177.02552795410156, "learning_rate": 1.998041245251044e-05, "logits/chosen": -18.298795700073242, "logits/rejected": -18.078033447265625, "logps/chosen": -464.45086669921875, "logps/rejected": -387.45428466796875, "loss": 1.0062, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 3.7040328979492188, "rewards/margins": 0.5431541800498962, "rewards/rejected": 3.1608786582946777, "step": 1200 }, { "epoch": 0.06476038656969214, "grad_norm": 19.352441787719727, "learning_rate": 1.997701314000653e-05, "logits/chosen": -18.20465660095215, "logits/rejected": -18.182998657226562, "logps/chosen": -489.5882873535156, "logps/rejected": -431.6067199707031, "loss": 0.8781, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 4.215704917907715, "rewards/margins": 0.7704020738601685, "rewards/rejected": 3.445302963256836, "step": 1300 }, { "epoch": 0.06974195476736077, "grad_norm": 22.920259475708008, "learning_rate": 1.9973342315548398e-05, "logits/chosen": -18.116256713867188, "logits/rejected": -18.149843215942383, "logps/chosen": -447.0269775390625, "logps/rejected": -374.61383056640625, "loss": 1.0611, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 3.63736629486084, "rewards/margins": 0.6730349063873291, "rewards/rejected": 2.96433162689209, "step": 1400 }, { "epoch": 0.07472352296502939, "grad_norm": 20.65612030029297, "learning_rate": 1.9969400079032947e-05, "logits/chosen": -18.347074508666992, "logits/rejected": -18.040178298950195, "logps/chosen": -453.944091796875, "logps/rejected": -380.4056396484375, "loss": 1.0204, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 4.042062282562256, "rewards/margins": 0.9009463787078857, "rewards/rejected": 3.1411163806915283, "step": 1500 }, { "epoch": 0.07970509116269801, "grad_norm": 50.09711456298828, "learning_rate": 1.9965186537743215e-05, "logits/chosen": -18.355621337890625, "logits/rejected": -18.051054000854492, "logps/chosen": -502.7710876464844, "logps/rejected": -419.5497741699219, "loss": 1.1749, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 3.215543508529663, "rewards/margins": 0.48448604345321655, "rewards/rejected": 2.731057643890381, "step": 1600 }, { "epoch": 0.08468665936036664, "grad_norm": 0.4332411289215088, "learning_rate": 1.9960701806345472e-05, "logits/chosen": -18.210161209106445, "logits/rejected": -18.06623077392578, "logps/chosen": -449.0904235839844, "logps/rejected": -365.96173095703125, "loss": 0.7165, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 4.172327995300293, "rewards/margins": 1.4230843782424927, "rewards/rejected": 2.749243974685669, "step": 1700 }, { "epoch": 0.08966822755803527, "grad_norm": 5.466333389282227, "learning_rate": 1.9955946006886082e-05, "logits/chosen": -18.5748348236084, "logits/rejected": -18.16517448425293, "logps/chosen": -438.857421875, "logps/rejected": -416.3319396972656, "loss": 0.8766, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 4.177321910858154, "rewards/margins": 0.9888943433761597, "rewards/rejected": 3.188427686691284, "step": 1800 }, { "epoch": 0.0946497957557039, "grad_norm": 82.25857543945312, "learning_rate": 1.995091926878819e-05, "logits/chosen": -18.491161346435547, "logits/rejected": -18.283767700195312, "logps/chosen": -461.55023193359375, "logps/rejected": -394.8086242675781, "loss": 1.0777, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 3.3734805583953857, "rewards/margins": 0.7351935505867004, "rewards/rejected": 2.638287305831909, "step": 1900 }, { "epoch": 0.09963136395337252, "grad_norm": 9.021427154541016, "learning_rate": 1.9945621728848194e-05, "logits/chosen": -18.71115493774414, "logits/rejected": -18.314943313598633, "logps/chosen": -466.3949890136719, "logps/rejected": -393.8585205078125, "loss": 0.6798, "rewards/accuracies": 0.7300000190734863, "rewards/chosen": 3.843343496322632, "rewards/margins": 1.1383906602859497, "rewards/rejected": 2.7049529552459717, "step": 2000 }, { "epoch": 0.10461293215104114, "grad_norm": 146.21475219726562, "learning_rate": 1.9940053531232028e-05, "logits/chosen": -18.590173721313477, "logits/rejected": -18.441553115844727, "logps/chosen": -454.3846435546875, "logps/rejected": -387.5030212402344, "loss": 1.1674, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 4.06519079208374, "rewards/margins": 0.5631230473518372, "rewards/rejected": 3.5020673274993896, "step": 2100 }, { "epoch": 0.10959450034870978, "grad_norm": 4.808138847351074, "learning_rate": 1.9934214827471244e-05, "logits/chosen": -18.621475219726562, "logits/rejected": -18.35665512084961, "logps/chosen": -450.15582275390625, "logps/rejected": -386.0933532714844, "loss": 1.0704, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 3.885467290878296, "rewards/margins": 0.752030611038208, "rewards/rejected": 3.1334362030029297, "step": 2200 }, { "epoch": 0.1145760685463784, "grad_norm": 5.842870235443115, "learning_rate": 1.9928105776458864e-05, "logits/chosen": -18.336530685424805, "logits/rejected": -18.11532211303711, "logps/chosen": -466.25738525390625, "logps/rejected": -393.3591003417969, "loss": 1.1451, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 3.678475856781006, "rewards/margins": 0.4539036452770233, "rewards/rejected": 3.22457218170166, "step": 2300 }, { "epoch": 0.11955763674404703, "grad_norm": 37.61786651611328, "learning_rate": 1.9921726544445084e-05, "logits/chosen": -18.296964645385742, "logits/rejected": -18.364625930786133, "logps/chosen": -467.84027099609375, "logps/rejected": -397.1224670410156, "loss": 0.8784, "rewards/accuracies": 0.6899999976158142, "rewards/chosen": 3.5043909549713135, "rewards/margins": 1.0125629901885986, "rewards/rejected": 2.4918274879455566, "step": 2400 }, { "epoch": 0.12453920494171565, "grad_norm": 49.04602813720703, "learning_rate": 1.9915077305032748e-05, "logits/chosen": -18.40894317626953, "logits/rejected": -18.2955322265625, "logps/chosen": -503.1138610839844, "logps/rejected": -375.34442138671875, "loss": 0.9985, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 4.6265459060668945, "rewards/margins": 1.7104928493499756, "rewards/rejected": 2.91605281829834, "step": 2500 }, { "epoch": 0.1295207731393843, "grad_norm": 16.836668014526367, "learning_rate": 1.9908158239172596e-05, "logits/chosen": -18.674049377441406, "logits/rejected": -18.514965057373047, "logps/chosen": -455.519775390625, "logps/rejected": -374.22210693359375, "loss": 0.9035, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 4.500489711761475, "rewards/margins": 1.446923017501831, "rewards/rejected": 3.0535662174224854, "step": 2600 }, { "epoch": 0.1345023413370529, "grad_norm": 0.04983401298522949, "learning_rate": 1.990096953515836e-05, "logits/chosen": -18.647964477539062, "logits/rejected": -18.637880325317383, "logps/chosen": -465.9200439453125, "logps/rejected": -411.40838623046875, "loss": 1.2207, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 3.8630013465881348, "rewards/margins": 0.7057845592498779, "rewards/rejected": 3.157216787338257, "step": 2700 }, { "epoch": 0.13948390953472153, "grad_norm": 0.5644310116767883, "learning_rate": 1.9893511388621652e-05, "logits/chosen": -18.66870880126953, "logits/rejected": -18.76462745666504, "logps/chosen": -513.6793823242188, "logps/rejected": -469.8594055175781, "loss": 1.5471, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 3.391056537628174, "rewards/margins": 0.3123472332954407, "rewards/rejected": 3.078709363937378, "step": 2800 }, { "epoch": 0.14446547773239016, "grad_norm": 1.6863278150558472, "learning_rate": 1.9885784002526616e-05, "logits/chosen": -18.729633331298828, "logits/rejected": -19.068260192871094, "logps/chosen": -447.2772521972656, "logps/rejected": -352.67791748046875, "loss": 1.0868, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 3.924001693725586, "rewards/margins": 0.8989758491516113, "rewards/rejected": 3.0250258445739746, "step": 2900 }, { "epoch": 0.14944704593005878, "grad_norm": 0.10964024066925049, "learning_rate": 1.987778758716441e-05, "logits/chosen": -19.014976501464844, "logits/rejected": -19.501911163330078, "logps/chosen": -475.1939392089844, "logps/rejected": -385.38238525390625, "loss": 1.0756, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 4.359086036682129, "rewards/margins": 1.1548995971679688, "rewards/rejected": 3.2041866779327393, "step": 3000 }, { "epoch": 0.1544286141277274, "grad_norm": 7.479519367218018, "learning_rate": 1.98695223601475e-05, "logits/chosen": -18.8636531829834, "logits/rejected": -19.16086196899414, "logps/chosen": -484.1092529296875, "logps/rejected": -399.1905212402344, "loss": 0.8348, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": 4.458531379699707, "rewards/margins": 1.7625274658203125, "rewards/rejected": 2.6960039138793945, "step": 3100 }, { "epoch": 0.15941018232539603, "grad_norm": 15.998089790344238, "learning_rate": 1.986098854640371e-05, "logits/chosen": -18.937522888183594, "logits/rejected": -19.118017196655273, "logps/chosen": -463.34149169921875, "logps/rejected": -415.29827880859375, "loss": 1.298, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 4.227731227874756, "rewards/margins": 0.5569795370101929, "rewards/rejected": 3.6707510948181152, "step": 3200 }, { "epoch": 0.16439175052306465, "grad_norm": 0.7655884027481079, "learning_rate": 1.9852186378170136e-05, "logits/chosen": -18.893104553222656, "logits/rejected": -19.257871627807617, "logps/chosen": -531.3560791015625, "logps/rejected": -465.7754821777344, "loss": 1.1944, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 4.285092353820801, "rewards/margins": 1.1729285717010498, "rewards/rejected": 3.1121633052825928, "step": 3300 }, { "epoch": 0.16937331872073327, "grad_norm": 10.586421012878418, "learning_rate": 1.9843116094986783e-05, "logits/chosen": -18.89116859436035, "logits/rejected": -19.32411003112793, "logps/chosen": -466.8319091796875, "logps/rejected": -388.9800109863281, "loss": 0.8162, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 4.402754306793213, "rewards/margins": 1.1992639303207397, "rewards/rejected": 3.203490972518921, "step": 3400 }, { "epoch": 0.17435488691840192, "grad_norm": 0.0021288192365318537, "learning_rate": 1.983377794369009e-05, "logits/chosen": -18.90306854248047, "logits/rejected": -19.6688289642334, "logps/chosen": -506.9422912597656, "logps/rejected": -422.39703369140625, "loss": 0.9919, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 4.193739414215088, "rewards/margins": 1.1028869152069092, "rewards/rejected": 3.0908522605895996, "step": 3500 }, { "epoch": 0.17933645511607055, "grad_norm": 72.208251953125, "learning_rate": 1.982417217840618e-05, "logits/chosen": -19.198213577270508, "logits/rejected": -20.100387573242188, "logps/chosen": -498.4687805175781, "logps/rejected": -383.0224914550781, "loss": 0.9324, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 4.158178806304932, "rewards/margins": 1.5135530233383179, "rewards/rejected": 2.6446259021759033, "step": 3600 }, { "epoch": 0.18431802331373917, "grad_norm": 54.43558120727539, "learning_rate": 1.9814299060543965e-05, "logits/chosen": -19.100000381469727, "logits/rejected": -20.164613723754883, "logps/chosen": -523.7534790039062, "logps/rejected": -420.5929260253906, "loss": 1.0645, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 4.555855751037598, "rewards/margins": 1.3967076539993286, "rewards/rejected": 3.1591484546661377, "step": 3700 }, { "epoch": 0.1892995915114078, "grad_norm": 44.80778503417969, "learning_rate": 1.980415885878801e-05, "logits/chosen": -19.23442840576172, "logits/rejected": -20.248851776123047, "logps/chosen": -470.9892578125, "logps/rejected": -387.8487548828125, "loss": 1.1345, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 4.9033355712890625, "rewards/margins": 1.5088270902633667, "rewards/rejected": 3.3945086002349854, "step": 3800 }, { "epoch": 0.19428115970907642, "grad_norm": 0.029925603419542313, "learning_rate": 1.979375184909125e-05, "logits/chosen": -19.161788940429688, "logits/rejected": -20.242706298828125, "logps/chosen": -451.88165283203125, "logps/rejected": -377.80078125, "loss": 1.0498, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 3.468371868133545, "rewards/margins": 1.5844755172729492, "rewards/rejected": 1.8838963508605957, "step": 3900 }, { "epoch": 0.19926272790674504, "grad_norm": 2.616316795349121, "learning_rate": 1.9783078314667465e-05, "logits/chosen": -19.053321838378906, "logits/rejected": -20.00080108642578, "logps/chosen": -502.9007568359375, "logps/rejected": -397.90020751953125, "loss": 0.9393, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 4.124849796295166, "rewards/margins": 1.3231381177902222, "rewards/rejected": 2.801711320877075, "step": 4000 }, { "epoch": 0.20424429610441366, "grad_norm": 44.81772232055664, "learning_rate": 1.9772138545983554e-05, "logits/chosen": -18.997940063476562, "logits/rejected": -19.999465942382812, "logps/chosen": -494.4920349121094, "logps/rejected": -401.1126708984375, "loss": 0.9059, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 3.8888561725616455, "rewards/margins": 1.703158974647522, "rewards/rejected": 2.185697317123413, "step": 4100 }, { "epoch": 0.2092258643020823, "grad_norm": 19.425662994384766, "learning_rate": 1.9760932840751663e-05, "logits/chosen": -18.9016056060791, "logits/rejected": -19.416828155517578, "logps/chosen": -483.6650390625, "logps/rejected": -388.0088195800781, "loss": 0.8963, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 5.2860283851623535, "rewards/margins": 1.5827534198760986, "rewards/rejected": 3.703275442123413, "step": 4200 }, { "epoch": 0.2142074324997509, "grad_norm": 124.74931335449219, "learning_rate": 1.9749461503921074e-05, "logits/chosen": -18.898042678833008, "logits/rejected": -19.673877716064453, "logps/chosen": -497.69476318359375, "logps/rejected": -364.8540954589844, "loss": 1.0963, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 4.136105537414551, "rewards/margins": 1.3833444118499756, "rewards/rejected": 2.752761125564575, "step": 4300 }, { "epoch": 0.21918900069741956, "grad_norm": 5.577426433563232, "learning_rate": 1.973772484766989e-05, "logits/chosen": -18.805566787719727, "logits/rejected": -19.62226104736328, "logps/chosen": -463.9582824707031, "logps/rejected": -351.70867919921875, "loss": 1.0113, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 4.1188740730285645, "rewards/margins": 1.5241186618804932, "rewards/rejected": 2.5947556495666504, "step": 4400 }, { "epoch": 0.22417056889508818, "grad_norm": 0.4124658405780792, "learning_rate": 1.9725723191396557e-05, "logits/chosen": -18.83307647705078, "logits/rejected": -19.278696060180664, "logps/chosen": -466.99859619140625, "logps/rejected": -398.4786682128906, "loss": 1.622, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 4.280797958374023, "rewards/margins": 0.31094062328338623, "rewards/rejected": 3.9698569774627686, "step": 4500 }, { "epoch": 0.2291521370927568, "grad_norm": 46.63705825805664, "learning_rate": 1.971345686171116e-05, "logits/chosen": -18.672901153564453, "logits/rejected": -19.950056076049805, "logps/chosen": -500.7174072265625, "logps/rejected": -417.15191650390625, "loss": 0.9252, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 4.5458478927612305, "rewards/margins": 1.576623558998108, "rewards/rejected": 2.969224452972412, "step": 4600 }, { "epoch": 0.23413370529042543, "grad_norm": 18.882240295410156, "learning_rate": 1.9700926192426554e-05, "logits/chosen": -19.082120895385742, "logits/rejected": -20.37308120727539, "logps/chosen": -429.0272521972656, "logps/rejected": -354.2383728027344, "loss": 1.1165, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 3.655416965484619, "rewards/margins": 1.2390317916870117, "rewards/rejected": 2.4163851737976074, "step": 4700 }, { "epoch": 0.23911527348809405, "grad_norm": 336.6611633300781, "learning_rate": 1.9688131524549242e-05, "logits/chosen": -19.020198822021484, "logits/rejected": -19.45013999938965, "logps/chosen": -459.5777587890625, "logps/rejected": -409.13922119140625, "loss": 1.2777, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 2.606769323348999, "rewards/margins": 0.6296383142471313, "rewards/rejected": 1.9771310091018677, "step": 4800 }, { "epoch": 0.24409684168576268, "grad_norm": 72.95674133300781, "learning_rate": 1.9675073206270148e-05, "logits/chosen": -18.523130416870117, "logits/rejected": -19.362272262573242, "logps/chosen": -499.9466552734375, "logps/rejected": -384.0169982910156, "loss": 0.9484, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 4.083195209503174, "rewards/margins": 1.7932839393615723, "rewards/rejected": 2.2899110317230225, "step": 4900 }, { "epoch": 0.2490784098834313, "grad_norm": 7.091724395751953, "learning_rate": 1.9661751592955086e-05, "logits/chosen": -18.576244354248047, "logits/rejected": -19.872777938842773, "logps/chosen": -543.4857177734375, "logps/rejected": -440.7587890625, "loss": 0.9586, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 4.305512428283691, "rewards/margins": 1.4709885120391846, "rewards/rejected": 2.8345236778259277, "step": 5000 }, { "epoch": 0.2490784098834313, "eval_logits/chosen": -21.543413162231445, "eval_logits/rejected": -22.481142044067383, "eval_logps/chosen": -475.4376525878906, "eval_logps/rejected": -402.1141052246094, "eval_loss": 1.169881820678711, "eval_rewards/accuracies": 0.6557591557502747, "eval_rewards/chosen": 4.376668930053711, "eval_rewards/margins": 1.5471277236938477, "eval_rewards/rejected": 2.829540729522705, "eval_runtime": 473.1936, "eval_samples_per_second": 3.216, "eval_steps_per_second": 0.404, "step": 5000 }, { "epoch": 0.2540599780810999, "grad_norm": 82.43436431884766, "learning_rate": 1.9648167047135133e-05, "logits/chosen": -19.058635711669922, "logits/rejected": -20.134428024291992, "logps/chosen": -495.65887451171875, "logps/rejected": -419.3161315917969, "loss": 1.1698, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 3.87038516998291, "rewards/margins": 1.3453155755996704, "rewards/rejected": 2.52506947517395, "step": 5100 }, { "epoch": 0.2590415462787686, "grad_norm": 3.7819454669952393, "learning_rate": 1.9634319938496742e-05, "logits/chosen": -19.017601013183594, "logits/rejected": -20.623193740844727, "logps/chosen": -478.9990539550781, "logps/rejected": -395.05450439453125, "loss": 1.1279, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": 3.3154103755950928, "rewards/margins": 1.5955133438110352, "rewards/rejected": 1.7198967933654785, "step": 5200 }, { "epoch": 0.26402311447643717, "grad_norm": 1.351638674736023, "learning_rate": 1.962021064387168e-05, "logits/chosen": -18.885652542114258, "logits/rejected": -19.914079666137695, "logps/chosen": -510.6768493652344, "logps/rejected": -441.775634765625, "loss": 1.1382, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 4.05587911605835, "rewards/margins": 1.2654640674591064, "rewards/rejected": 2.790414810180664, "step": 5300 }, { "epoch": 0.2690046826741058, "grad_norm": 32.41090393066406, "learning_rate": 1.9605839547226785e-05, "logits/chosen": -19.33073616027832, "logits/rejected": -20.477949142456055, "logps/chosen": -492.5516052246094, "logps/rejected": -420.31036376953125, "loss": 1.0482, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 3.8395843505859375, "rewards/margins": 1.6333999633789062, "rewards/rejected": 2.206184148788452, "step": 5400 }, { "epoch": 0.2739862508717744, "grad_norm": 32.58820724487305, "learning_rate": 1.9591207039653507e-05, "logits/chosen": -19.26167106628418, "logits/rejected": -20.89728546142578, "logps/chosen": -438.7798156738281, "logps/rejected": -363.2562255859375, "loss": 1.243, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 3.1530792713165283, "rewards/margins": 1.1595901250839233, "rewards/rejected": 1.9934889078140259, "step": 5500 }, { "epoch": 0.27896781906944307, "grad_norm": 93.86406707763672, "learning_rate": 1.9576313519357265e-05, "logits/chosen": -19.064878463745117, "logits/rejected": -20.905057907104492, "logps/chosen": -519.572509765625, "logps/rejected": -442.0079040527344, "loss": 1.2293, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 3.4798271656036377, "rewards/margins": 1.09993577003479, "rewards/rejected": 2.3798913955688477, "step": 5600 }, { "epoch": 0.28394938726711166, "grad_norm": 26.577350616455078, "learning_rate": 1.9561159391646618e-05, "logits/chosen": -19.34862518310547, "logits/rejected": -22.218530654907227, "logps/chosen": -500.4524230957031, "logps/rejected": -402.18389892578125, "loss": 0.9679, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 3.561279535293579, "rewards/margins": 1.6947131156921387, "rewards/rejected": 1.8665661811828613, "step": 5700 }, { "epoch": 0.2889309554647803, "grad_norm": 20.811817169189453, "learning_rate": 1.9545745068922225e-05, "logits/chosen": -19.238685607910156, "logits/rejected": -20.936817169189453, "logps/chosen": -499.0253601074219, "logps/rejected": -417.6446533203125, "loss": 1.1599, "rewards/accuracies": 0.5299999713897705, "rewards/chosen": 4.803900241851807, "rewards/margins": 1.0667366981506348, "rewards/rejected": 3.7371630668640137, "step": 5800 }, { "epoch": 0.29391252366244897, "grad_norm": 23.467056274414062, "learning_rate": 1.9530070970665638e-05, "logits/chosen": -19.428844451904297, "logits/rejected": -21.77304458618164, "logps/chosen": -498.2739562988281, "logps/rejected": -398.3455810546875, "loss": 0.9347, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 4.099443435668945, "rewards/margins": 1.4808638095855713, "rewards/rejected": 2.6185789108276367, "step": 5900 }, { "epoch": 0.29889409186011756, "grad_norm": 44.77325439453125, "learning_rate": 1.951413752342786e-05, "logits/chosen": -19.230180740356445, "logits/rejected": -21.40241050720215, "logps/chosen": -522.796630859375, "logps/rejected": -409.90936279296875, "loss": 1.0653, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 3.7546486854553223, "rewards/margins": 2.06999135017395, "rewards/rejected": 1.6846575736999512, "step": 6000 }, { "epoch": 0.3038756600577862, "grad_norm": 0.1034678965806961, "learning_rate": 1.949794516081777e-05, "logits/chosen": -19.376697540283203, "logits/rejected": -21.691804885864258, "logps/chosen": -482.21807861328125, "logps/rejected": -379.0809326171875, "loss": 0.954, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 2.9128754138946533, "rewards/margins": 1.5702927112579346, "rewards/rejected": 1.3425830602645874, "step": 6100 }, { "epoch": 0.3088572282554548, "grad_norm": 8.153312683105469, "learning_rate": 1.9481494323490292e-05, "logits/chosen": -20.608989715576172, "logits/rejected": -25.289657592773438, "logps/chosen": -456.2762756347656, "logps/rejected": -363.2749938964844, "loss": 1.0848, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.7271530628204346, "rewards/margins": 1.8869810104370117, "rewards/rejected": 0.8401720523834229, "step": 6200 }, { "epoch": 0.31383879645312346, "grad_norm": 173.4281463623047, "learning_rate": 1.9464785459134422e-05, "logits/chosen": -20.402162551879883, "logits/rejected": -26.399858474731445, "logps/chosen": -496.73199462890625, "logps/rejected": -371.1173400878906, "loss": 1.0173, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 3.2006542682647705, "rewards/margins": 2.35373592376709, "rewards/rejected": 0.8469181060791016, "step": 6300 }, { "epoch": 0.31882036465079205, "grad_norm": 71.35045623779297, "learning_rate": 1.9447819022461036e-05, "logits/chosen": -20.591211318969727, "logits/rejected": -21.949787139892578, "logps/chosen": -505.4292297363281, "logps/rejected": -457.5049133300781, "loss": 1.334, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 3.6680984497070312, "rewards/margins": 1.0007195472717285, "rewards/rejected": 2.6673781871795654, "step": 6400 }, { "epoch": 0.3238019328484607, "grad_norm": 71.10392761230469, "learning_rate": 1.9430595475190528e-05, "logits/chosen": -20.5976505279541, "logits/rejected": -22.98895263671875, "logps/chosen": -485.20758056640625, "logps/rejected": -426.62078857421875, "loss": 1.1786, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 3.1840951442718506, "rewards/margins": 1.0180020332336426, "rewards/rejected": 2.166092872619629, "step": 6500 }, { "epoch": 0.3287835010461293, "grad_norm": 20.788406372070312, "learning_rate": 1.9413115286040228e-05, "logits/chosen": -20.659372329711914, "logits/rejected": -24.63149642944336, "logps/chosen": -496.0128173828125, "logps/rejected": -426.76092529296875, "loss": 1.2417, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 2.6715264320373535, "rewards/margins": 1.3509292602539062, "rewards/rejected": 1.3205969333648682, "step": 6600 }, { "epoch": 0.33376506924379795, "grad_norm": 47.43177032470703, "learning_rate": 1.9395378930711654e-05, "logits/chosen": -21.26150894165039, "logits/rejected": -27.082225799560547, "logps/chosen": -484.2604675292969, "logps/rejected": -407.31512451171875, "loss": 1.0294, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": 2.974027633666992, "rewards/margins": 2.4509451389312744, "rewards/rejected": 0.5230829119682312, "step": 6700 }, { "epoch": 0.33874663744146655, "grad_norm": 29.3293514251709, "learning_rate": 1.9377386891877572e-05, "logits/chosen": -20.902864456176758, "logits/rejected": -24.1262264251709, "logps/chosen": -505.2213134765625, "logps/rejected": -438.3580017089844, "loss": 1.5894, "rewards/accuracies": 0.5, "rewards/chosen": 1.993202567100525, "rewards/margins": 0.9017642736434937, "rewards/rejected": 1.0914379358291626, "step": 6800 }, { "epoch": 0.3437282056391352, "grad_norm": 85.70783996582031, "learning_rate": 1.9359139659168845e-05, "logits/chosen": -19.933032989501953, "logits/rejected": -23.047008514404297, "logps/chosen": -496.6371154785156, "logps/rejected": -435.8622131347656, "loss": 1.0725, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 1.8428994417190552, "rewards/margins": 1.234021782875061, "rewards/rejected": 0.6088778376579285, "step": 6900 }, { "epoch": 0.34870977383680385, "grad_norm": 44.99619674682617, "learning_rate": 1.9340637729161137e-05, "logits/chosen": -20.23802375793457, "logits/rejected": -22.14116668701172, "logps/chosen": -505.2397155761719, "logps/rejected": -458.6205139160156, "loss": 1.4139, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 1.7950440645217896, "rewards/margins": 1.0467220544815063, "rewards/rejected": 0.7483220100402832, "step": 7000 }, { "epoch": 0.35369134203447244, "grad_norm": 2.535125886993228e-08, "learning_rate": 1.9321881605361363e-05, "logits/chosen": -19.776222229003906, "logits/rejected": -22.802228927612305, "logps/chosen": -552.8232421875, "logps/rejected": -477.6277770996094, "loss": 1.0137, "rewards/accuracies": 0.6899999976158142, "rewards/chosen": 3.883512020111084, "rewards/margins": 2.6781344413757324, "rewards/rejected": 1.2053773403167725, "step": 7100 }, { "epoch": 0.3586729102321411, "grad_norm": 104.68194580078125, "learning_rate": 1.9302871798194005e-05, "logits/chosen": -21.631492614746094, "logits/rejected": -25.338726043701172, "logps/chosen": -458.9781494140625, "logps/rejected": -421.4054260253906, "loss": 1.5316, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5002456903457642, "rewards/margins": 1.058099389076233, "rewards/rejected": 0.4421464204788208, "step": 7200 }, { "epoch": 0.3636544784298097, "grad_norm": 15.395011901855469, "learning_rate": 1.9283608824987236e-05, "logits/chosen": -21.326448440551758, "logits/rejected": -27.821928024291992, "logps/chosen": -519.0068969726562, "logps/rejected": -431.6269226074219, "loss": 1.4097, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 2.3849658966064453, "rewards/margins": 2.2866411209106445, "rewards/rejected": 0.09832416474819183, "step": 7300 }, { "epoch": 0.36863604662747834, "grad_norm": 5.503931999206543, "learning_rate": 1.9264093209958822e-05, "logits/chosen": -22.2663631439209, "logits/rejected": -30.09918212890625, "logps/chosen": -519.89013671875, "logps/rejected": -434.9560241699219, "loss": 0.6146, "rewards/accuracies": 0.7400000095367432, "rewards/chosen": 1.8443881273269653, "rewards/margins": 3.3645260334014893, "rewards/rejected": -1.5201376676559448, "step": 7400 }, { "epoch": 0.37361761482514694, "grad_norm": 0.397699773311615, "learning_rate": 1.9244325484201844e-05, "logits/chosen": -21.55438995361328, "logits/rejected": -26.417490005493164, "logps/chosen": -578.6663208007812, "logps/rejected": -494.0219421386719, "loss": 1.32, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.8062126636505127, "rewards/margins": 1.9167245626449585, "rewards/rejected": 0.8894882798194885, "step": 7500 }, { "epoch": 0.3785991830228156, "grad_norm": 1.4006325006484985, "learning_rate": 1.9224306185670284e-05, "logits/chosen": -21.26766586303711, "logits/rejected": -23.203746795654297, "logps/chosen": -524.5022583007812, "logps/rejected": -450.9858093261719, "loss": 1.2222, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 2.8659799098968506, "rewards/margins": 1.5051510334014893, "rewards/rejected": 1.3608287572860718, "step": 7600 }, { "epoch": 0.3835807512204842, "grad_norm": 77.18798828125, "learning_rate": 1.9204035859164346e-05, "logits/chosen": -20.718629837036133, "logits/rejected": -24.271589279174805, "logps/chosen": -482.0295715332031, "logps/rejected": -402.9772644042969, "loss": 1.4243, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 2.1575722694396973, "rewards/margins": 1.1561360359191895, "rewards/rejected": 1.0014359951019287, "step": 7700 }, { "epoch": 0.38856231941815284, "grad_norm": 7.084451675415039, "learning_rate": 1.9183515056315664e-05, "logits/chosen": -20.11510467529297, "logits/rejected": -22.786483764648438, "logps/chosen": -521.9429931640625, "logps/rejected": -447.221923828125, "loss": 1.2535, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 3.086232900619507, "rewards/margins": 1.8227348327636719, "rewards/rejected": 1.2634981870651245, "step": 7800 }, { "epoch": 0.3935438876158215, "grad_norm": 26.435029983520508, "learning_rate": 1.9162744335572254e-05, "logits/chosen": -20.078449249267578, "logits/rejected": -21.64859390258789, "logps/chosen": -495.9106750488281, "logps/rejected": -439.6401672363281, "loss": 1.3286, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 2.769630193710327, "rewards/margins": 0.8031193017959595, "rewards/rejected": 1.9665107727050781, "step": 7900 }, { "epoch": 0.3985254558134901, "grad_norm": 55.87978744506836, "learning_rate": 1.9141724262183347e-05, "logits/chosen": -19.4700927734375, "logits/rejected": -24.67208480834961, "logps/chosen": -476.494873046875, "logps/rejected": -370.9423828125, "loss": 1.1024, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 3.3464269638061523, "rewards/margins": 1.9073596000671387, "rewards/rejected": 1.4390674829483032, "step": 8000 }, { "epoch": 0.40350702401115873, "grad_norm": 72.6644515991211, "learning_rate": 1.9120455408183996e-05, "logits/chosen": -19.84633445739746, "logits/rejected": -23.482101440429688, "logps/chosen": -469.52984619140625, "logps/rejected": -371.6656494140625, "loss": 1.0347, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 3.2709081172943115, "rewards/margins": 1.9965094327926636, "rewards/rejected": 1.2743984460830688, "step": 8100 }, { "epoch": 0.40848859220882733, "grad_norm": 7.036466121673584, "learning_rate": 1.9098938352379497e-05, "logits/chosen": -19.558134078979492, "logits/rejected": -22.82000732421875, "logps/chosen": -516.8818359375, "logps/rejected": -426.9209289550781, "loss": 1.3704, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.8734841346740723, "rewards/margins": 1.5538628101348877, "rewards/rejected": 1.3196213245391846, "step": 8200 }, { "epoch": 0.413470160406496, "grad_norm": 19.03754997253418, "learning_rate": 1.9077173680329667e-05, "logits/chosen": -19.861852645874023, "logits/rejected": -20.495954513549805, "logps/chosen": -434.7484436035156, "logps/rejected": -418.93182373046875, "loss": 1.4406, "rewards/accuracies": 0.5400000214576721, "rewards/chosen": 3.662661552429199, "rewards/margins": 1.0773922204971313, "rewards/rejected": 2.5852692127227783, "step": 8300 }, { "epoch": 0.4184517286041646, "grad_norm": 0.007115426007658243, "learning_rate": 1.9055161984332865e-05, "logits/chosen": -19.320505142211914, "logits/rejected": -20.781309127807617, "logps/chosen": -505.4100036621094, "logps/rejected": -399.3970031738281, "loss": 1.2344, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 2.867283582687378, "rewards/margins": 1.5176202058792114, "rewards/rejected": 1.3496633768081665, "step": 8400 }, { "epoch": 0.4234332968018332, "grad_norm": 27.222930908203125, "learning_rate": 1.9032903863409916e-05, "logits/chosen": -19.227426528930664, "logits/rejected": -20.481821060180664, "logps/chosen": -504.85076904296875, "logps/rejected": -399.4018249511719, "loss": 1.1368, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 2.5695528984069824, "rewards/margins": 1.6272262334823608, "rewards/rejected": 0.9423269033432007, "step": 8500 }, { "epoch": 0.4284148649995018, "grad_norm": 118.92176055908203, "learning_rate": 1.901039992328779e-05, "logits/chosen": -19.390897750854492, "logits/rejected": -20.656518936157227, "logps/chosen": -502.2049865722656, "logps/rejected": -406.15740966796875, "loss": 1.3008, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 2.6097073554992676, "rewards/margins": 1.4985759258270264, "rewards/rejected": 1.1111317873001099, "step": 8600 }, { "epoch": 0.43339643319717047, "grad_norm": 135.84136962890625, "learning_rate": 1.8987650776383116e-05, "logits/chosen": -19.83563232421875, "logits/rejected": -20.294017791748047, "logps/chosen": -496.2801513671875, "logps/rejected": -452.8775634765625, "loss": 1.689, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 2.978971481323242, "rewards/margins": 0.9833757281303406, "rewards/rejected": 1.9955962896347046, "step": 8700 }, { "epoch": 0.4383780013948391, "grad_norm": 0.03969337046146393, "learning_rate": 1.896465704178551e-05, "logits/chosen": -19.218610763549805, "logits/rejected": -20.0975284576416, "logps/chosen": -479.8434753417969, "logps/rejected": -410.504150390625, "loss": 1.2651, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.871466875076294, "rewards/margins": 1.330002784729004, "rewards/rejected": 1.5414642095565796, "step": 8800 }, { "epoch": 0.4433595695925077, "grad_norm": 2.9111685752868652, "learning_rate": 1.8941419345240763e-05, "logits/chosen": -19.58942222595215, "logits/rejected": -20.811443328857422, "logps/chosen": -435.19122314453125, "logps/rejected": -364.0976867675781, "loss": 1.164, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 3.163036823272705, "rewards/margins": 1.286778450012207, "rewards/rejected": 1.876258373260498, "step": 8900 }, { "epoch": 0.44834113779017637, "grad_norm": 92.33987426757812, "learning_rate": 1.891793831913376e-05, "logits/chosen": -19.12569808959961, "logits/rejected": -20.312471389770508, "logps/chosen": -539.9608764648438, "logps/rejected": -456.8437805175781, "loss": 1.0481, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": 2.520881175994873, "rewards/margins": 1.5386346578598022, "rewards/rejected": 0.982246458530426, "step": 9000 }, { "epoch": 0.45332270598784496, "grad_norm": 24.77559471130371, "learning_rate": 1.8894214602471307e-05, "logits/chosen": -19.473718643188477, "logits/rejected": -21.318897247314453, "logps/chosen": -499.727783203125, "logps/rejected": -432.5865478515625, "loss": 1.3744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.8033368587493896, "rewards/margins": 1.5860238075256348, "rewards/rejected": 1.2173125743865967, "step": 9100 }, { "epoch": 0.4583042741855136, "grad_norm": 2.553715467453003, "learning_rate": 1.887024884086473e-05, "logits/chosen": -19.989469528198242, "logits/rejected": -21.36966323852539, "logps/chosen": -485.37139892578125, "logps/rejected": -414.8341369628906, "loss": 1.2352, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 2.2446627616882324, "rewards/margins": 2.1422224044799805, "rewards/rejected": 0.10244012624025345, "step": 9200 }, { "epoch": 0.4632858423831822, "grad_norm": 9.945940017700195, "learning_rate": 1.88460416865123e-05, "logits/chosen": -19.838525772094727, "logits/rejected": -21.396879196166992, "logps/chosen": -510.5537109375, "logps/rejected": -417.2762451171875, "loss": 1.1751, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 1.9794914722442627, "rewards/margins": 1.851491093635559, "rewards/rejected": 0.12800025939941406, "step": 9300 }, { "epoch": 0.46826741058085086, "grad_norm": 15.31814193725586, "learning_rate": 1.88215937981815e-05, "logits/chosen": -19.403379440307617, "logits/rejected": -20.53765869140625, "logps/chosen": -476.90802001953125, "logps/rejected": -390.69744873046875, "loss": 1.7465, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": 3.1520519256591797, "rewards/margins": 0.6633343696594238, "rewards/rejected": 2.488717555999756, "step": 9400 }, { "epoch": 0.47324897877851946, "grad_norm": 2.4341812133789062, "learning_rate": 1.879690584119108e-05, "logits/chosen": -18.863977432250977, "logits/rejected": -19.929393768310547, "logps/chosen": -451.2972717285156, "logps/rejected": -356.81878662109375, "loss": 1.0762, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 3.5310535430908203, "rewards/margins": 0.9716143012046814, "rewards/rejected": 2.5594394207000732, "step": 9500 }, { "epoch": 0.4782305469761881, "grad_norm": 0.5188534259796143, "learning_rate": 1.8771978487392965e-05, "logits/chosen": -19.067102432250977, "logits/rejected": -19.693904876708984, "logps/chosen": -436.10125732421875, "logps/rejected": -371.744140625, "loss": 1.2393, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.686614990234375, "rewards/margins": 1.6963415145874023, "rewards/rejected": 1.9902732372283936, "step": 9600 }, { "epoch": 0.4832121151738567, "grad_norm": 0.9292926788330078, "learning_rate": 1.874681241515396e-05, "logits/chosen": -18.863676071166992, "logits/rejected": -18.958852767944336, "logps/chosen": -462.2861328125, "logps/rejected": -413.9620361328125, "loss": 1.2257, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 3.4554378986358643, "rewards/margins": 0.9302346110343933, "rewards/rejected": 2.525202989578247, "step": 9700 }, { "epoch": 0.48819368337152536, "grad_norm": 35.07600402832031, "learning_rate": 1.8721408309337295e-05, "logits/chosen": -18.920787811279297, "logits/rejected": -19.5614070892334, "logps/chosen": -475.8876647949219, "logps/rejected": -409.3224182128906, "loss": 1.2497, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 3.723018169403076, "rewards/margins": 1.3657230138778687, "rewards/rejected": 2.357295036315918, "step": 9800 }, { "epoch": 0.493175251569194, "grad_norm": 0.003431697143241763, "learning_rate": 1.8695766861283987e-05, "logits/chosen": -18.851102828979492, "logits/rejected": -19.668804168701172, "logps/chosen": -505.24945068359375, "logps/rejected": -430.45428466796875, "loss": 1.1504, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 3.6089422702789307, "rewards/margins": 1.7381848096847534, "rewards/rejected": 1.8707573413848877, "step": 9900 }, { "epoch": 0.4981568197668626, "grad_norm": 65.25814819335938, "learning_rate": 1.8669888768794024e-05, "logits/chosen": -18.943655014038086, "logits/rejected": -19.943601608276367, "logps/chosen": -456.38531494140625, "logps/rejected": -394.1759033203125, "loss": 1.1225, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 3.0651655197143555, "rewards/margins": 1.2026199102401733, "rewards/rejected": 1.8625457286834717, "step": 10000 }, { "epoch": 0.4981568197668626, "eval_logits/chosen": -20.5496826171875, "eval_logits/rejected": -21.546123504638672, "eval_logps/chosen": -477.2846984863281, "eval_logps/rejected": -404.45428466796875, "eval_loss": 1.1951801776885986, "eval_rewards/accuracies": 0.6335078477859497, "eval_rewards/chosen": 4.191972732543945, "eval_rewards/margins": 1.5964468717575073, "eval_rewards/rejected": 2.5955255031585693, "eval_runtime": 472.8478, "eval_samples_per_second": 3.219, "eval_steps_per_second": 0.404, "step": 10000 }, { "epoch": 0.5031383879645313, "grad_norm": 94.86180114746094, "learning_rate": 1.8643774736107384e-05, "logits/chosen": -18.719505310058594, "logits/rejected": -19.4000244140625, "logps/chosen": -508.5538330078125, "logps/rejected": -445.2476806640625, "loss": 1.3446, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 3.225598096847534, "rewards/margins": 1.1447488069534302, "rewards/rejected": 2.0808494091033936, "step": 10100 }, { "epoch": 0.5081199561621998, "grad_norm": 48.567291259765625, "learning_rate": 1.8617425473884855e-05, "logits/chosen": -18.674545288085938, "logits/rejected": -19.417861938476562, "logps/chosen": -519.6810302734375, "logps/rejected": -433.57940673828125, "loss": 1.3277, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.415245771408081, "rewards/margins": 1.2748632431030273, "rewards/rejected": 1.1403824090957642, "step": 10200 }, { "epoch": 0.5131015243598684, "grad_norm": 6.8724141120910645, "learning_rate": 1.859084169918871e-05, "logits/chosen": -18.91655731201172, "logits/rejected": -19.582307815551758, "logps/chosen": -490.5546569824219, "logps/rejected": -427.4134521484375, "loss": 1.5115, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 2.807913303375244, "rewards/margins": 1.6051901578903198, "rewards/rejected": 1.2027232646942139, "step": 10300 }, { "epoch": 0.5180830925575372, "grad_norm": 11.200637817382812, "learning_rate": 1.8564024135463173e-05, "logits/chosen": -18.65709114074707, "logits/rejected": -18.75111198425293, "logps/chosen": -456.7597961425781, "logps/rejected": -418.4756774902344, "loss": 1.4256, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 1.6474815607070923, "rewards/margins": 0.9819788336753845, "rewards/rejected": 0.6655027866363525, "step": 10400 }, { "epoch": 0.5230646607552057, "grad_norm": 14.02713394165039, "learning_rate": 1.8536973512514762e-05, "logits/chosen": -18.270898818969727, "logits/rejected": -18.737123489379883, "logps/chosen": -495.671875, "logps/rejected": -404.7850646972656, "loss": 1.1738, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": 3.6311376094818115, "rewards/margins": 1.3220287561416626, "rewards/rejected": 2.3091087341308594, "step": 10500 }, { "epoch": 0.5280462289528743, "grad_norm": 88.36447143554688, "learning_rate": 1.85096905664924e-05, "logits/chosen": -18.358213424682617, "logits/rejected": -18.569581985473633, "logps/chosen": -460.0679931640625, "logps/rejected": -420.5664978027344, "loss": 1.4962, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.870842456817627, "rewards/margins": 1.0018635988235474, "rewards/rejected": 0.8689790368080139, "step": 10600 }, { "epoch": 0.5330277971505429, "grad_norm": 2.6545143127441406, "learning_rate": 1.848217603986739e-05, "logits/chosen": -18.19515037536621, "logits/rejected": -18.503700256347656, "logps/chosen": -482.6021423339844, "logps/rejected": -439.8750915527344, "loss": 1.3315, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 2.491628408432007, "rewards/margins": 0.7911645770072937, "rewards/rejected": 1.700463891029358, "step": 10700 }, { "epoch": 0.5380093653482116, "grad_norm": 54.594303131103516, "learning_rate": 1.845443068141322e-05, "logits/chosen": -18.29205894470215, "logits/rejected": -19.060501098632812, "logps/chosen": -499.4178466796875, "logps/rejected": -410.4550476074219, "loss": 1.3476, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 3.079913377761841, "rewards/margins": 1.4285519123077393, "rewards/rejected": 1.6513612270355225, "step": 10800 }, { "epoch": 0.5429909335458802, "grad_norm": 7.620671272277832, "learning_rate": 1.8426455246185177e-05, "logits/chosen": -18.482242584228516, "logits/rejected": -19.06051254272461, "logps/chosen": -488.9962463378906, "logps/rejected": -414.5852355957031, "loss": 1.118, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": 3.6191320419311523, "rewards/margins": 1.9326629638671875, "rewards/rejected": 1.6864690780639648, "step": 10900 }, { "epoch": 0.5479725017435488, "grad_norm": 44.31614685058594, "learning_rate": 1.8398250495499796e-05, "logits/chosen": -18.507394790649414, "logits/rejected": -19.16678810119629, "logps/chosen": -496.0931701660156, "logps/rejected": -424.0758056640625, "loss": 1.3294, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.764357805252075, "rewards/margins": 1.4526183605194092, "rewards/rejected": 2.311739444732666, "step": 11000 }, { "epoch": 0.5529540699412175, "grad_norm": 113.41361999511719, "learning_rate": 1.8369817196914145e-05, "logits/chosen": -18.77853775024414, "logits/rejected": -19.550212860107422, "logps/chosen": -458.4302978515625, "logps/rejected": -376.8597717285156, "loss": 1.2723, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 2.409397602081299, "rewards/margins": 1.0338749885559082, "rewards/rejected": 1.375522255897522, "step": 11100 }, { "epoch": 0.5579356381388861, "grad_norm": 39.32530212402344, "learning_rate": 1.8341156124204943e-05, "logits/chosen": -18.80110740661621, "logits/rejected": -19.126850128173828, "logps/chosen": -444.9359436035156, "logps/rejected": -403.28546142578125, "loss": 1.2621, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 3.3543567657470703, "rewards/margins": 1.4829553365707397, "rewards/rejected": 1.8714015483856201, "step": 11200 }, { "epoch": 0.5629172063365547, "grad_norm": 0.01231900043785572, "learning_rate": 1.8312268057347488e-05, "logits/chosen": -19.005640029907227, "logits/rejected": -19.596464157104492, "logps/chosen": -466.5137634277344, "logps/rejected": -408.3565673828125, "loss": 1.2726, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.5804996490478516, "rewards/margins": 1.2464163303375244, "rewards/rejected": 1.3340831995010376, "step": 11300 }, { "epoch": 0.5678987745342233, "grad_norm": 78.91645050048828, "learning_rate": 1.8283153782494457e-05, "logits/chosen": -19.66242218017578, "logits/rejected": -20.03885269165039, "logps/chosen": -495.5252380371094, "logps/rejected": -428.739501953125, "loss": 1.1003, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 0.8916977047920227, "rewards/margins": 2.0576887130737305, "rewards/rejected": -1.165990948677063, "step": 11400 }, { "epoch": 0.572880342731892, "grad_norm": 2.7297961711883545, "learning_rate": 1.8253814091954476e-05, "logits/chosen": -19.751190185546875, "logits/rejected": -20.76055335998535, "logps/chosen": -470.715576171875, "logps/rejected": -401.6990661621094, "loss": 1.1496, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": -0.058783989399671555, "rewards/margins": 1.678650975227356, "rewards/rejected": -1.7374348640441895, "step": 11500 }, { "epoch": 0.5778619109295606, "grad_norm": 11.752656936645508, "learning_rate": 1.8224249784170595e-05, "logits/chosen": -19.580923080444336, "logits/rejected": -20.732593536376953, "logps/chosen": -517.3013916015625, "logps/rejected": -441.9253845214844, "loss": 1.2111, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": -0.6827618479728699, "rewards/margins": 1.910689115524292, "rewards/rejected": -2.5934510231018066, "step": 11600 }, { "epoch": 0.5828434791272292, "grad_norm": 96.53925323486328, "learning_rate": 1.8194461663698524e-05, "logits/chosen": -19.67738914489746, "logits/rejected": -21.431556701660156, "logps/chosen": -518.2506103515625, "logps/rejected": -389.5613708496094, "loss": 1.3043, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12440891563892365, "rewards/margins": 2.5978763103485107, "rewards/rejected": -2.4734673500061035, "step": 11700 }, { "epoch": 0.5878250473248979, "grad_norm": 62.21799850463867, "learning_rate": 1.8164450541184768e-05, "logits/chosen": -19.013898849487305, "logits/rejected": -19.318574905395508, "logps/chosen": -564.2166137695312, "logps/rejected": -528.39111328125, "loss": 1.3621, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 0.12132181972265244, "rewards/margins": 1.5959105491638184, "rewards/rejected": -1.4745885133743286, "step": 11800 }, { "epoch": 0.5928066155225665, "grad_norm": 0.12062743306159973, "learning_rate": 1.8134217233344556e-05, "logits/chosen": -19.182098388671875, "logits/rejected": -19.83804702758789, "logps/chosen": -525.335693359375, "logps/rejected": -455.0185852050781, "loss": 1.3079, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": -0.4299677908420563, "rewards/margins": 1.420630931854248, "rewards/rejected": -1.8505988121032715, "step": 11900 }, { "epoch": 0.5977881837202351, "grad_norm": 4.478858470916748, "learning_rate": 1.81037625629396e-05, "logits/chosen": -18.84477996826172, "logits/rejected": -19.84359359741211, "logps/chosen": -519.6325073242188, "logps/rejected": -437.3699951171875, "loss": 1.2444, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 1.3549919128417969, "rewards/margins": 2.0743324756622314, "rewards/rejected": -0.7193406820297241, "step": 12000 }, { "epoch": 0.6027697519179037, "grad_norm": 43.38969039916992, "learning_rate": 1.8073087358755735e-05, "logits/chosen": -18.777620315551758, "logits/rejected": -19.37495231628418, "logps/chosen": -485.4015197753906, "logps/rejected": -413.2806396484375, "loss": 1.2657, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 2.3500404357910156, "rewards/margins": 2.2481086254119873, "rewards/rejected": 0.10193166881799698, "step": 12100 }, { "epoch": 0.6077513201155724, "grad_norm": 29.47319984436035, "learning_rate": 1.804219245558033e-05, "logits/chosen": -18.747379302978516, "logits/rejected": -19.17096519470215, "logps/chosen": -476.1571044921875, "logps/rejected": -414.704345703125, "loss": 1.3264, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 1.9210524559020996, "rewards/margins": 0.9324368238449097, "rewards/rejected": 0.9886155724525452, "step": 12200 }, { "epoch": 0.612732888313241, "grad_norm": 57.94328689575195, "learning_rate": 1.8011078694179602e-05, "logits/chosen": -18.417835235595703, "logits/rejected": -18.728105545043945, "logps/chosen": -466.6083068847656, "logps/rejected": -417.28936767578125, "loss": 1.3787, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 2.9616811275482178, "rewards/margins": 0.9320456981658936, "rewards/rejected": 2.0296356678009033, "step": 12300 }, { "epoch": 0.6177144565109096, "grad_norm": 0.6535269021987915, "learning_rate": 1.7979746921275713e-05, "logits/chosen": -18.470064163208008, "logits/rejected": -19.071678161621094, "logps/chosen": -499.7461242675781, "logps/rejected": -414.9991455078125, "loss": 1.0886, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.9048681259155273, "rewards/margins": 1.7158997058868408, "rewards/rejected": 1.188968300819397, "step": 12400 }, { "epoch": 0.6226960247085782, "grad_norm": 47.690162658691406, "learning_rate": 1.794819798952374e-05, "logits/chosen": -18.49652862548828, "logits/rejected": -18.98128318786621, "logps/chosen": -571.6408081054688, "logps/rejected": -473.7767333984375, "loss": 1.2108, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": 2.7018349170684814, "rewards/margins": 2.354581832885742, "rewards/rejected": 0.34725311398506165, "step": 12500 }, { "epoch": 0.6276775929062469, "grad_norm": 69.55400085449219, "learning_rate": 1.7916432757488467e-05, "logits/chosen": -19.46697235107422, "logits/rejected": -20.09600830078125, "logps/chosen": -524.7301635742188, "logps/rejected": -433.07293701171875, "loss": 1.0183, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": 1.3079893589019775, "rewards/margins": 3.2492458820343018, "rewards/rejected": -1.9412565231323242, "step": 12600 }, { "epoch": 0.6326591611039155, "grad_norm": 14.792251586914062, "learning_rate": 1.7884452089621012e-05, "logits/chosen": -19.28809928894043, "logits/rejected": -20.2492733001709, "logps/chosen": -578.1820068359375, "logps/rejected": -456.3279724121094, "loss": 1.1159, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": 1.191977858543396, "rewards/margins": 2.5655128955841064, "rewards/rejected": -1.37353515625, "step": 12700 }, { "epoch": 0.6376407293015841, "grad_norm": 0.21572743356227875, "learning_rate": 1.7852256856235318e-05, "logits/chosen": -19.648353576660156, "logits/rejected": -20.134416580200195, "logps/chosen": -495.5775146484375, "logps/rejected": -438.1684265136719, "loss": 1.387, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -0.19859656691551208, "rewards/margins": 2.191715955734253, "rewards/rejected": -2.390312433242798, "step": 12800 }, { "epoch": 0.6426222974992528, "grad_norm": 102.35507202148438, "learning_rate": 1.7819847933484467e-05, "logits/chosen": -19.353174209594727, "logits/rejected": -20.048927307128906, "logps/chosen": -524.4760131835938, "logps/rejected": -446.4917907714844, "loss": 1.1967, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8889893293380737, "rewards/margins": 1.9711395502090454, "rewards/rejected": -1.0821502208709717, "step": 12900 }, { "epoch": 0.6476038656969214, "grad_norm": 0.06541766971349716, "learning_rate": 1.778722620333681e-05, "logits/chosen": -19.828271865844727, "logits/rejected": -20.194868087768555, "logps/chosen": -602.9769287109375, "logps/rejected": -530.201904296875, "loss": 1.6761, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2178092002868652, "rewards/margins": 1.5633704662322998, "rewards/rejected": -3.781179904937744, "step": 13000 }, { "epoch": 0.65258543389459, "grad_norm": 58.162445068359375, "learning_rate": 1.775439255355201e-05, "logits/chosen": -19.331708908081055, "logits/rejected": -19.971097946166992, "logps/chosen": -570.8577880859375, "logps/rejected": -473.57196044921875, "loss": 1.1495, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": 0.24211058020591736, "rewards/margins": 2.4019970893859863, "rewards/rejected": -2.159886598587036, "step": 13100 }, { "epoch": 0.6575670020922586, "grad_norm": 1.0752054452896118, "learning_rate": 1.772134787765684e-05, "logits/chosen": -19.27989959716797, "logits/rejected": -19.537317276000977, "logps/chosen": -541.393798828125, "logps/rejected": -501.2279968261719, "loss": 1.6051, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": -1.2508366107940674, "rewards/margins": 0.7745574712753296, "rewards/rejected": -2.0253942012786865, "step": 13200 }, { "epoch": 0.6625485702899273, "grad_norm": 8.162809371948242, "learning_rate": 1.768809307492089e-05, "logits/chosen": -18.722593307495117, "logits/rejected": -19.011571884155273, "logps/chosen": -513.6095581054688, "logps/rejected": -469.2226257324219, "loss": 1.3372, "rewards/accuracies": 0.5400000214576721, "rewards/chosen": -0.9962272047996521, "rewards/margins": 0.9051995873451233, "rewards/rejected": -1.9014270305633545, "step": 13300 }, { "epoch": 0.6675301384875959, "grad_norm": 97.5845947265625, "learning_rate": 1.765462905033209e-05, "logits/chosen": -19.051023483276367, "logits/rejected": -19.420806884765625, "logps/chosen": -478.5913391113281, "logps/rejected": -437.26995849609375, "loss": 1.3999, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": -1.0214941501617432, "rewards/margins": 0.9453433752059937, "rewards/rejected": -1.9668372869491577, "step": 13400 }, { "epoch": 0.6725117066852645, "grad_norm": 29.43077850341797, "learning_rate": 1.762095671457209e-05, "logits/chosen": -19.13440704345703, "logits/rejected": -19.925010681152344, "logps/chosen": -496.3144836425781, "logps/rejected": -413.5525817871094, "loss": 1.3997, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": -0.08645965903997421, "rewards/margins": 1.8843421936035156, "rewards/rejected": -1.970801830291748, "step": 13500 }, { "epoch": 0.6774932748829331, "grad_norm": 0.7588065266609192, "learning_rate": 1.7587076983991457e-05, "logits/chosen": -19.021947860717773, "logits/rejected": -19.49304962158203, "logps/chosen": -520.0108032226562, "logps/rejected": -473.15020751953125, "loss": 1.9126, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.3220689296722412, "rewards/margins": 0.6901782155036926, "rewards/rejected": 0.631890594959259, "step": 13600 }, { "epoch": 0.6824748430806018, "grad_norm": 110.71784973144531, "learning_rate": 1.755299078058475e-05, "logits/chosen": -19.794466018676758, "logits/rejected": -20.945425033569336, "logps/chosen": -485.5846862792969, "logps/rejected": -422.5166931152344, "loss": 1.0623, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -0.4341191351413727, "rewards/margins": 1.9454231262207031, "rewards/rejected": -2.379542589187622, "step": 13700 }, { "epoch": 0.6874564112782704, "grad_norm": 92.97657012939453, "learning_rate": 1.751869903196543e-05, "logits/chosen": -19.072101593017578, "logits/rejected": -20.458724975585938, "logps/chosen": -550.3908081054688, "logps/rejected": -454.7740173339844, "loss": 1.1903, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 0.9165298342704773, "rewards/margins": 1.906398892402649, "rewards/rejected": -0.9898689389228821, "step": 13800 }, { "epoch": 0.692437979475939, "grad_norm": 4.199160575866699, "learning_rate": 1.748420267134062e-05, "logits/chosen": -18.836036682128906, "logits/rejected": -19.922813415527344, "logps/chosen": -539.5211181640625, "logps/rejected": -479.9672546386719, "loss": 1.2681, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 1.260705828666687, "rewards/margins": 1.744788408279419, "rewards/rejected": -0.48408252000808716, "step": 13900 }, { "epoch": 0.6974195476736077, "grad_norm": 107.6252212524414, "learning_rate": 1.74495026374857e-05, "logits/chosen": -19.50172233581543, "logits/rejected": -20.288314819335938, "logps/chosen": -514.2687377929688, "logps/rejected": -449.1997375488281, "loss": 1.4695, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 0.515940248966217, "rewards/margins": 1.3783401250839233, "rewards/rejected": -0.8623998761177063, "step": 14000 }, { "epoch": 0.7024011158712763, "grad_norm": 5.941022872924805, "learning_rate": 1.7414599874718753e-05, "logits/chosen": -18.767423629760742, "logits/rejected": -19.68829917907715, "logps/chosen": -545.5341796875, "logps/rejected": -463.9657287597656, "loss": 1.1235, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 1.2197209596633911, "rewards/margins": 1.6720809936523438, "rewards/rejected": -0.4523601531982422, "step": 14100 }, { "epoch": 0.7073826840689449, "grad_norm": 26.461627960205078, "learning_rate": 1.737949533287489e-05, "logits/chosen": -18.46575355529785, "logits/rejected": -19.159351348876953, "logps/chosen": -517.9618530273438, "logps/rejected": -417.8922119140625, "loss": 1.14, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 1.9558275938034058, "rewards/margins": 2.069073438644409, "rewards/rejected": -0.11324585229158401, "step": 14200 }, { "epoch": 0.7123642522666135, "grad_norm": 1.1918169260025024, "learning_rate": 1.7344189967280383e-05, "logits/chosen": -19.000808715820312, "logits/rejected": -20.075515747070312, "logps/chosen": -474.513916015625, "logps/rejected": -400.20196533203125, "loss": 0.9665, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": 1.894654393196106, "rewards/margins": 2.998011589050293, "rewards/rejected": -1.103356957435608, "step": 14300 }, { "epoch": 0.7173458204642822, "grad_norm": 60.40048599243164, "learning_rate": 1.7308684738726668e-05, "logits/chosen": -18.980615615844727, "logits/rejected": -20.142223358154297, "logps/chosen": -510.573974609375, "logps/rejected": -441.4659729003906, "loss": 1.266, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 0.2977685332298279, "rewards/margins": 1.5726754665374756, "rewards/rejected": -1.274907112121582, "step": 14400 }, { "epoch": 0.7223273886619508, "grad_norm": 0.5988157391548157, "learning_rate": 1.7272980613444206e-05, "logits/chosen": -18.941259384155273, "logits/rejected": -20.322023391723633, "logps/chosen": -531.8062744140625, "logps/rejected": -474.36785888671875, "loss": 1.2675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3074489235877991, "rewards/margins": 1.548563838005066, "rewards/rejected": -1.8560125827789307, "step": 14500 }, { "epoch": 0.7273089568596194, "grad_norm": 6.901514530181885, "learning_rate": 1.7237078563076178e-05, "logits/chosen": -19.498384475708008, "logits/rejected": -21.36153793334961, "logps/chosen": -511.9005126953125, "logps/rejected": -439.97003173828125, "loss": 1.2244, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": -0.4899732172489166, "rewards/margins": 1.9954489469528198, "rewards/rejected": -2.485422372817993, "step": 14600 }, { "epoch": 0.7322905250572881, "grad_norm": 0.03188573196530342, "learning_rate": 1.7200979564652064e-05, "logits/chosen": -18.785024642944336, "logits/rejected": -21.15717124938965, "logps/chosen": -520.9817504882812, "logps/rejected": -426.6506652832031, "loss": 1.3959, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 1.1841310262680054, "rewards/margins": 1.9892054796218872, "rewards/rejected": -0.8050744533538818, "step": 14700 }, { "epoch": 0.7372720932549567, "grad_norm": 30.52501678466797, "learning_rate": 1.7164684600561018e-05, "logits/chosen": -18.466907501220703, "logits/rejected": -20.27123260498047, "logps/chosen": -531.89501953125, "logps/rejected": -429.2342834472656, "loss": 1.3491, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 2.848252773284912, "rewards/margins": 2.337670087814331, "rewards/rejected": 0.5105829834938049, "step": 14800 }, { "epoch": 0.7422536614526253, "grad_norm": 4.541143894195557, "learning_rate": 1.712819465852517e-05, "logits/chosen": -18.570043563842773, "logits/rejected": -20.75904083251953, "logps/chosen": -493.5054931640625, "logps/rejected": -380.9721984863281, "loss": 1.2685, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 1.8648478984832764, "rewards/margins": 2.2077205181121826, "rewards/rejected": -0.3428727388381958, "step": 14900 }, { "epoch": 0.7472352296502939, "grad_norm": 0.44224098324775696, "learning_rate": 1.7091510731572725e-05, "logits/chosen": -18.91974449157715, "logits/rejected": -20.655202865600586, "logps/chosen": -514.9723510742188, "logps/rejected": -452.38983154296875, "loss": 1.7715, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 2.1622519493103027, "rewards/margins": 1.3429455757141113, "rewards/rejected": 0.8193062543869019, "step": 15000 }, { "epoch": 0.7472352296502939, "eval_logits/chosen": -22.020517349243164, "eval_logits/rejected": -23.86900520324707, "eval_logps/chosen": -484.0854187011719, "eval_logps/rejected": -414.28009033203125, "eval_loss": 1.2986581325531006, "eval_rewards/accuracies": 0.6367800831794739, "eval_rewards/chosen": 3.5119001865386963, "eval_rewards/margins": 1.8989582061767578, "eval_rewards/rejected": 1.6129425764083862, "eval_runtime": 472.8477, "eval_samples_per_second": 3.219, "eval_steps_per_second": 0.404, "step": 15000 }, { "epoch": 0.7522167978479626, "grad_norm": 10.687481880187988, "learning_rate": 1.7054633818010954e-05, "logits/chosen": -18.568933486938477, "logits/rejected": -20.1440372467041, "logps/chosen": -456.9844055175781, "logps/rejected": -408.1793212890625, "loss": 1.0277, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 3.049287796020508, "rewards/margins": 1.8809983730316162, "rewards/rejected": 1.168289065361023, "step": 15100 }, { "epoch": 0.7571983660456312, "grad_norm": 67.85148620605469, "learning_rate": 1.7017564921399e-05, "logits/chosen": -18.714679718017578, "logits/rejected": -21.288236618041992, "logps/chosen": -508.66015625, "logps/rejected": -424.9830322265625, "loss": 1.362, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 2.725537061691284, "rewards/margins": 1.661694049835205, "rewards/rejected": 1.0638428926467896, "step": 15200 }, { "epoch": 0.7621799342432998, "grad_norm": 44.31140899658203, "learning_rate": 1.698030505052061e-05, "logits/chosen": -18.928104400634766, "logits/rejected": -20.428186416625977, "logps/chosen": -470.02459716796875, "logps/rejected": -381.4432678222656, "loss": 1.504, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 2.468132734298706, "rewards/margins": 0.8910315036773682, "rewards/rejected": 1.577101469039917, "step": 15300 }, { "epoch": 0.7671615024409684, "grad_norm": 10.858190536499023, "learning_rate": 1.6942855219356634e-05, "logits/chosen": -18.520444869995117, "logits/rejected": -20.245197296142578, "logps/chosen": -491.31854248046875, "logps/rejected": -416.8392333984375, "loss": 1.0534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 3.1176271438598633, "rewards/margins": 1.8960695266723633, "rewards/rejected": 1.2215576171875, "step": 15400 }, { "epoch": 0.7721430706386371, "grad_norm": 0.026119831949472427, "learning_rate": 1.6905216447057467e-05, "logits/chosen": -19.040843963623047, "logits/rejected": -20.839155197143555, "logps/chosen": -495.08428955078125, "logps/rejected": -410.4775390625, "loss": 1.5746, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 2.2889349460601807, "rewards/margins": 1.6078674793243408, "rewards/rejected": 0.6810672283172607, "step": 15500 }, { "epoch": 0.7771246388363057, "grad_norm": 70.28289031982422, "learning_rate": 1.686738975791529e-05, "logits/chosen": -18.932214736938477, "logits/rejected": -20.42650032043457, "logps/chosen": -493.1498718261719, "logps/rejected": -429.498779296875, "loss": 1.4583, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 2.8957221508026123, "rewards/margins": 1.429540991783142, "rewards/rejected": 1.4661809206008911, "step": 15600 }, { "epoch": 0.7821062070339743, "grad_norm": 0.8262832164764404, "learning_rate": 1.6829376181336225e-05, "logits/chosen": -19.566686630249023, "logits/rejected": -21.254444122314453, "logps/chosen": -483.1775817871094, "logps/rejected": -451.6460266113281, "loss": 1.238, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 1.937461495399475, "rewards/margins": 1.0156903266906738, "rewards/rejected": 0.9217712879180908, "step": 15700 }, { "epoch": 0.787087775231643, "grad_norm": 10.062756538391113, "learning_rate": 1.6791176751812282e-05, "logits/chosen": -19.597667694091797, "logits/rejected": -21.418546676635742, "logps/chosen": -485.7425537109375, "logps/rejected": -417.196533203125, "loss": 1.1054, "rewards/accuracies": 0.6899999976158142, "rewards/chosen": 1.6439099311828613, "rewards/margins": 2.0514931678771973, "rewards/rejected": -0.40758341550827026, "step": 15800 }, { "epoch": 0.7920693434293116, "grad_norm": 11.171751022338867, "learning_rate": 1.675279250889324e-05, "logits/chosen": -19.189016342163086, "logits/rejected": -21.2204532623291, "logps/chosen": -523.8226318359375, "logps/rejected": -432.4778137207031, "loss": 1.2853, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 2.265528917312622, "rewards/margins": 2.169790744781494, "rewards/rejected": 0.0957380086183548, "step": 15900 }, { "epoch": 0.7970509116269802, "grad_norm": 5.786453723907471, "learning_rate": 1.6714224497158334e-05, "logits/chosen": -19.52318572998047, "logits/rejected": -22.174915313720703, "logps/chosen": -500.9818115234375, "logps/rejected": -427.1512451171875, "loss": 1.2191, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03365034982562065, "rewards/margins": 2.265183210372925, "rewards/rejected": -2.2315328121185303, "step": 16000 }, { "epoch": 0.8020324798246488, "grad_norm": 0.03890511766076088, "learning_rate": 1.667547376618785e-05, "logits/chosen": -19.749061584472656, "logits/rejected": -21.982847213745117, "logps/chosen": -521.017822265625, "logps/rejected": -434.4532165527344, "loss": 1.6337, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 1.2707146406173706, "rewards/margins": 2.021421432495117, "rewards/rejected": -0.7507067918777466, "step": 16100 }, { "epoch": 0.8070140480223175, "grad_norm": 7.803345680236816, "learning_rate": 1.6636541370534537e-05, "logits/chosen": -19.421598434448242, "logits/rejected": -20.04302406311035, "logps/chosen": -481.74334716796875, "logps/rejected": -437.2704162597656, "loss": 1.5953, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 1.658532738685608, "rewards/margins": 1.250680685043335, "rewards/rejected": 0.40785208344459534, "step": 16200 }, { "epoch": 0.8119956162199861, "grad_norm": 0.0178745836019516, "learning_rate": 1.6597428369694934e-05, "logits/chosen": -18.97728157043457, "logits/rejected": -20.238012313842773, "logps/chosen": -544.66015625, "logps/rejected": -475.9863586425781, "loss": 1.2509, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 1.6321719884872437, "rewards/margins": 1.595544695854187, "rewards/rejected": 0.03662717714905739, "step": 16300 }, { "epoch": 0.8169771844176547, "grad_norm": 115.91627502441406, "learning_rate": 1.655813582808051e-05, "logits/chosen": -19.31316566467285, "logits/rejected": -20.653505325317383, "logps/chosen": -523.737060546875, "logps/rejected": -466.0438232421875, "loss": 1.4616, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 1.028465986251831, "rewards/margins": 0.8755133152008057, "rewards/rejected": 0.15295258164405823, "step": 16400 }, { "epoch": 0.8219587526153233, "grad_norm": 19.2689266204834, "learning_rate": 1.651866481498873e-05, "logits/chosen": -19.784204483032227, "logits/rejected": -21.526857376098633, "logps/chosen": -484.1365966796875, "logps/rejected": -423.5486755371094, "loss": 1.0354, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 1.5439956188201904, "rewards/margins": 2.2780568599700928, "rewards/rejected": -0.7340614199638367, "step": 16500 }, { "epoch": 0.826940320812992, "grad_norm": 22.2554988861084, "learning_rate": 1.6479016404573916e-05, "logits/chosen": -19.650360107421875, "logits/rejected": -21.203350067138672, "logps/chosen": -505.99871826171875, "logps/rejected": -474.4420166015625, "loss": 1.6364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.20957961678504944, "rewards/margins": 1.9317030906677246, "rewards/rejected": -2.141282558441162, "step": 16600 }, { "epoch": 0.8319218890106606, "grad_norm": 44.10676574707031, "learning_rate": 1.6439191675818056e-05, "logits/chosen": -19.524065017700195, "logits/rejected": -22.584871292114258, "logps/chosen": -469.15264892578125, "logps/rejected": -362.7622985839844, "loss": 0.9529, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -0.1507752537727356, "rewards/margins": 2.6078405380249023, "rewards/rejected": -2.7586159706115723, "step": 16700 }, { "epoch": 0.8369034572083291, "grad_norm": 143.54794311523438, "learning_rate": 1.6399191712501417e-05, "logits/chosen": -19.261682510375977, "logits/rejected": -21.39293670654297, "logps/chosen": -543.3509521484375, "logps/rejected": -472.142822265625, "loss": 1.4329, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 1.02738356590271, "rewards/margins": 1.6261159181594849, "rewards/rejected": -0.5987322926521301, "step": 16800 }, { "epoch": 0.8418850254059979, "grad_norm": 0.004016869701445103, "learning_rate": 1.6359017603173043e-05, "logits/chosen": -19.182754516601562, "logits/rejected": -21.46161651611328, "logps/chosen": -525.286376953125, "logps/rejected": -444.2633972167969, "loss": 1.361, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 0.7399688959121704, "rewards/margins": 2.1688730716705322, "rewards/rejected": -1.4289040565490723, "step": 16900 }, { "epoch": 0.8468665936036665, "grad_norm": 85.75580596923828, "learning_rate": 1.6318670441121157e-05, "logits/chosen": -19.6074161529541, "logits/rejected": -20.843006134033203, "logps/chosen": -514.9994506835938, "logps/rejected": -458.54803466796875, "loss": 1.9907, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": -0.5896009802818298, "rewards/margins": 1.107519507408142, "rewards/rejected": -1.6971205472946167, "step": 17000 }, { "epoch": 0.851848161801335, "grad_norm": 0.005596471484750509, "learning_rate": 1.6278151324343395e-05, "logits/chosen": -18.718494415283203, "logits/rejected": -19.837169647216797, "logps/chosen": -527.9190673828125, "logps/rejected": -452.2406005859375, "loss": 0.9701, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": -0.13445429503917694, "rewards/margins": 2.05269193649292, "rewards/rejected": -2.1871461868286133, "step": 17100 }, { "epoch": 0.8568297299990036, "grad_norm": 76.1462631225586, "learning_rate": 1.6237461355516918e-05, "logits/chosen": -18.651123046875, "logits/rejected": -19.52649688720703, "logps/chosen": -539.2634887695312, "logps/rejected": -478.9902648925781, "loss": 1.575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8822598457336426, "rewards/margins": 1.2877403497695923, "rewards/rejected": -0.4054804742336273, "step": 17200 }, { "epoch": 0.8618112981966723, "grad_norm": 90.55237579345703, "learning_rate": 1.6196601641968425e-05, "logits/chosen": -18.68253517150879, "logits/rejected": -20.140342712402344, "logps/chosen": -523.9414672851562, "logps/rejected": -457.2998046875, "loss": 1.1014, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.5566680431365967, "rewards/margins": 1.6561553478240967, "rewards/rejected": 0.9005125164985657, "step": 17300 }, { "epoch": 0.8667928663943409, "grad_norm": 3.8211495876312256, "learning_rate": 1.6155573295643993e-05, "logits/chosen": -19.115205764770508, "logits/rejected": -19.974811553955078, "logps/chosen": -527.80419921875, "logps/rejected": -491.970458984375, "loss": 1.5425, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 2.147320032119751, "rewards/margins": 0.9348466396331787, "rewards/rejected": 1.2124736309051514, "step": 17400 }, { "epoch": 0.8717744345920095, "grad_norm": 12.620223999023438, "learning_rate": 1.611437743307884e-05, "logits/chosen": -19.070911407470703, "logits/rejected": -20.162216186523438, "logps/chosen": -515.4886474609375, "logps/rejected": -440.5010070800781, "loss": 1.2029, "rewards/accuracies": 0.7200000286102295, "rewards/chosen": 2.0181946754455566, "rewards/margins": 1.9606748819351196, "rewards/rejected": 0.057520028203725815, "step": 17500 }, { "epoch": 0.8767560027896782, "grad_norm": 59.42154312133789, "learning_rate": 1.6073015175366914e-05, "logits/chosen": -18.614526748657227, "logits/rejected": -19.882549285888672, "logps/chosen": -515.3157958984375, "logps/rejected": -447.471435546875, "loss": 1.3499, "rewards/accuracies": 0.5600000023841858, "rewards/chosen": 2.700169563293457, "rewards/margins": 0.998585045337677, "rewards/rejected": 1.701583981513977, "step": 17600 }, { "epoch": 0.8817375709873468, "grad_norm": 66.42816925048828, "learning_rate": 1.603148764813042e-05, "logits/chosen": -18.400327682495117, "logits/rejected": -19.4934024810791, "logps/chosen": -477.818603515625, "logps/rejected": -401.1089782714844, "loss": 1.1853, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 3.3441994190216064, "rewards/margins": 1.493272066116333, "rewards/rejected": 1.8509272336959839, "step": 17700 }, { "epoch": 0.8867191391850154, "grad_norm": 0.004200187046080828, "learning_rate": 1.5989795981489155e-05, "logits/chosen": -18.308795928955078, "logits/rejected": -19.249317169189453, "logps/chosen": -512.5999755859375, "logps/rejected": -438.9172668457031, "loss": 1.1657, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 3.686070442199707, "rewards/margins": 2.335679292678833, "rewards/rejected": 1.3503910303115845, "step": 17800 }, { "epoch": 0.891700707382684, "grad_norm": 4.083571910858154, "learning_rate": 1.5947941310029755e-05, "logits/chosen": -18.304054260253906, "logits/rejected": -19.744394302368164, "logps/chosen": -475.91485595703125, "logps/rejected": -379.3888244628906, "loss": 1.1195, "rewards/accuracies": 0.6899999976158142, "rewards/chosen": 3.1028084754943848, "rewards/margins": 1.9994087219238281, "rewards/rejected": 1.1033999919891357, "step": 17900 }, { "epoch": 0.8966822755803527, "grad_norm": 51.022762298583984, "learning_rate": 1.5905924772774855e-05, "logits/chosen": -18.618383407592773, "logits/rejected": -19.63192367553711, "logps/chosen": -479.7633972167969, "logps/rejected": -392.1904602050781, "loss": 1.1562, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 2.1094658374786377, "rewards/margins": 1.7458257675170898, "rewards/rejected": 0.3636399209499359, "step": 18000 }, { "epoch": 0.9016638437780213, "grad_norm": 0.00022725010057911277, "learning_rate": 1.586374751315204e-05, "logits/chosen": -19.1419620513916, "logits/rejected": -20.252761840820312, "logps/chosen": -530.3628540039062, "logps/rejected": -461.61639404296875, "loss": 1.3509, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 3.2607533931732178, "rewards/margins": 1.972005009651184, "rewards/rejected": 1.2887482643127441, "step": 18100 }, { "epoch": 0.9066454119756899, "grad_norm": 120.93099212646484, "learning_rate": 1.5821410678962764e-05, "logits/chosen": -19.30841636657715, "logits/rejected": -20.281227111816406, "logps/chosen": -450.5699157714844, "logps/rejected": -398.7843017578125, "loss": 1.4407, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 2.621293067932129, "rewards/margins": 1.4806004762649536, "rewards/rejected": 1.1406925916671753, "step": 18200 }, { "epoch": 0.9116269801733585, "grad_norm": 0.014844976365566254, "learning_rate": 1.5778915422351102e-05, "logits/chosen": -18.85603141784668, "logits/rejected": -19.324058532714844, "logps/chosen": -493.2317199707031, "logps/rejected": -453.1470947265625, "loss": 0.913, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": 4.14647102355957, "rewards/margins": 1.9552674293518066, "rewards/rejected": 2.1912038326263428, "step": 18300 }, { "epoch": 0.9166085483710272, "grad_norm": 90.2131576538086, "learning_rate": 1.5736262899772407e-05, "logits/chosen": -19.094078063964844, "logits/rejected": -20.168027877807617, "logps/chosen": -512.7498779296875, "logps/rejected": -448.6697692871094, "loss": 1.1494, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 2.183354139328003, "rewards/margins": 1.9273743629455566, "rewards/rejected": 0.2559796869754791, "step": 18400 }, { "epoch": 0.9215901165686958, "grad_norm": 0.30067598819732666, "learning_rate": 1.569345427196181e-05, "logits/chosen": -19.207042694091797, "logits/rejected": -20.777149200439453, "logps/chosen": -521.3990478515625, "logps/rejected": -436.8664855957031, "loss": 1.2902, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 2.0493509769439697, "rewards/margins": 1.9620305299758911, "rewards/rejected": 0.08732038736343384, "step": 18500 }, { "epoch": 0.9265716847663644, "grad_norm": 128.35350036621094, "learning_rate": 1.5650490703902666e-05, "logits/chosen": -19.485790252685547, "logits/rejected": -20.323007583618164, "logps/chosen": -496.7759094238281, "logps/rejected": -440.96759033203125, "loss": 1.4797, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": 0.3512002229690552, "rewards/margins": 1.2790648937225342, "rewards/rejected": -0.927864670753479, "step": 18600 }, { "epoch": 0.9315532529640331, "grad_norm": 23.355682373046875, "learning_rate": 1.5607373364794836e-05, "logits/chosen": -19.615062713623047, "logits/rejected": -20.484060287475586, "logps/chosen": -479.2476806640625, "logps/rejected": -415.4735412597656, "loss": 1.2546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3234963417053223, "rewards/margins": 2.4520487785339355, "rewards/rejected": -1.1285524368286133, "step": 18700 }, { "epoch": 0.9365348211617017, "grad_norm": 41.07186508178711, "learning_rate": 1.5564103428022855e-05, "logits/chosen": -19.126056671142578, "logits/rejected": -19.900794982910156, "logps/chosen": -530.3897094726562, "logps/rejected": -466.9271545410156, "loss": 1.1304, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 1.134138584136963, "rewards/margins": 2.5866689682006836, "rewards/rejected": -1.4525303840637207, "step": 18800 }, { "epoch": 0.9415163893593703, "grad_norm": 3.795400381088257, "learning_rate": 1.552068207112402e-05, "logits/chosen": -18.972984313964844, "logits/rejected": -19.319841384887695, "logps/chosen": -517.9902954101562, "logps/rejected": -463.4305725097656, "loss": 1.6333, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 0.04141408950090408, "rewards/margins": 1.2581309080123901, "rewards/rejected": -1.2167168855667114, "step": 18900 }, { "epoch": 0.9464979575570389, "grad_norm": 0.2743411362171173, "learning_rate": 1.547711047575635e-05, "logits/chosen": -18.696569442749023, "logits/rejected": -19.06745719909668, "logps/chosen": -540.3345947265625, "logps/rejected": -484.9985656738281, "loss": 1.6534, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 2.271423816680908, "rewards/margins": 1.7851731777191162, "rewards/rejected": 0.48625069856643677, "step": 19000 }, { "epoch": 0.9514795257547076, "grad_norm": 27.537288665771484, "learning_rate": 1.543338982766639e-05, "logits/chosen": -18.664011001586914, "logits/rejected": -19.458066940307617, "logps/chosen": -518.22021484375, "logps/rejected": -418.8706359863281, "loss": 1.0862, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 2.7771294116973877, "rewards/margins": 2.859891891479492, "rewards/rejected": -0.08276252448558807, "step": 19100 }, { "epoch": 0.9564610939523762, "grad_norm": 0.9008951783180237, "learning_rate": 1.5389521316656992e-05, "logits/chosen": -18.849185943603516, "logits/rejected": -19.619192123413086, "logps/chosen": -494.7686462402344, "logps/rejected": -408.1366882324219, "loss": 1.1158, "rewards/accuracies": 0.7300000190734863, "rewards/chosen": 2.1583621501922607, "rewards/margins": 2.396536350250244, "rewards/rejected": -0.2381744235754013, "step": 19200 }, { "epoch": 0.9614426621500448, "grad_norm": 0.1889144629240036, "learning_rate": 1.5345506136554898e-05, "logits/chosen": -18.629066467285156, "logits/rejected": -19.594579696655273, "logps/chosen": -524.626708984375, "logps/rejected": -428.97344970703125, "loss": 1.1882, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 2.8290598392486572, "rewards/margins": 2.019918203353882, "rewards/rejected": 0.8091418743133545, "step": 19300 }, { "epoch": 0.9664242303477134, "grad_norm": 1.7383246421813965, "learning_rate": 1.5301345485178282e-05, "logits/chosen": -18.85825538635254, "logits/rejected": -19.791507720947266, "logps/chosen": -480.490234375, "logps/rejected": -380.1109619140625, "loss": 1.2643, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": 2.243396520614624, "rewards/margins": 1.7966461181640625, "rewards/rejected": 0.44675034284591675, "step": 19400 }, { "epoch": 0.9714057985453821, "grad_norm": 73.9409408569336, "learning_rate": 1.525704056430412e-05, "logits/chosen": -18.52286148071289, "logits/rejected": -18.89972496032715, "logps/chosen": -528.177001953125, "logps/rejected": -484.8839111328125, "loss": 1.5849, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 2.4241716861724854, "rewards/margins": 1.1765490770339966, "rewards/rejected": 1.2476229667663574, "step": 19500 }, { "epoch": 0.9763873667430507, "grad_norm": 34.82683563232422, "learning_rate": 1.5212592579635512e-05, "logits/chosen": -18.213794708251953, "logits/rejected": -19.081279754638672, "logps/chosen": -520.0574340820312, "logps/rejected": -446.2875061035156, "loss": 1.171, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.9521498680114746, "rewards/margins": 1.636811375617981, "rewards/rejected": 0.315338671207428, "step": 19600 }, { "epoch": 0.9813689349407193, "grad_norm": 8.361115455627441, "learning_rate": 1.5168002740768857e-05, "logits/chosen": -18.713205337524414, "logits/rejected": -19.391826629638672, "logps/chosen": -503.5995178222656, "logps/rejected": -472.6417236328125, "loss": 1.1879, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 1.9274494647979736, "rewards/margins": 1.9319396018981934, "rewards/rejected": -0.00449012778699398, "step": 19700 }, { "epoch": 0.986350503138388, "grad_norm": 0.049951426684856415, "learning_rate": 1.512327226116094e-05, "logits/chosen": -19.126710891723633, "logits/rejected": -20.143205642700195, "logps/chosen": -521.0743408203125, "logps/rejected": -409.7085876464844, "loss": 1.2551, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 1.6364871263504028, "rewards/margins": 2.0774831771850586, "rewards/rejected": -0.44099605083465576, "step": 19800 }, { "epoch": 0.9913320713360566, "grad_norm": 0.5147112011909485, "learning_rate": 1.507840235809591e-05, "logits/chosen": -18.839317321777344, "logits/rejected": -19.660390853881836, "logps/chosen": -511.8052062988281, "logps/rejected": -445.6004638671875, "loss": 1.422, "rewards/accuracies": 0.6299999952316284, "rewards/chosen": 1.8398367166519165, "rewards/margins": 1.5229132175445557, "rewards/rejected": 0.3169235587120056, "step": 19900 }, { "epoch": 0.9963136395337252, "grad_norm": 19.235801696777344, "learning_rate": 1.503339425265215e-05, "logits/chosen": -18.8520450592041, "logits/rejected": -19.381229400634766, "logps/chosen": -479.9813537597656, "logps/rejected": -451.1719970703125, "loss": 1.5368, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 1.7022852897644043, "rewards/margins": 1.5142347812652588, "rewards/rejected": 0.1880505383014679, "step": 20000 }, { "epoch": 0.9963136395337252, "eval_logits/chosen": -19.80689811706543, "eval_logits/rejected": -20.71694564819336, "eval_logps/chosen": -488.4328308105469, "eval_logps/rejected": -420.06622314453125, "eval_loss": 1.361470103263855, "eval_rewards/accuracies": 0.6511780023574829, "eval_rewards/chosen": 3.077153444290161, "eval_rewards/margins": 2.0428242683410645, "eval_rewards/rejected": 1.0343292951583862, "eval_runtime": 472.868, "eval_samples_per_second": 3.219, "eval_steps_per_second": 0.404, "step": 20000 } ], "logging_steps": 100, "max_steps": 60222, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }