diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3085 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 5000, + "global_step": 20074, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004981568197668626, + "grad_norm": 81.31034088134766, + "learning_rate": 1.9999863931243543e-05, + "logits/chosen": -19.35576057434082, + "logits/rejected": -19.391923904418945, + "logps/chosen": -488.51171875, + "logps/rejected": -382.52825927734375, + "loss": 0.6551, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.4239501953125, + "rewards/margins": 1.4092838764190674, + "rewards/rejected": -0.9853336215019226, + "step": 100 + }, + { + "epoch": 0.009963136395337252, + "grad_norm": 27.6790828704834, + "learning_rate": 1.9999455728677112e-05, + "logits/chosen": -18.520322799682617, + "logits/rejected": -18.58489227294922, + "logps/chosen": -502.153564453125, + "logps/rejected": -427.2685241699219, + "loss": 1.088, + "rewards/accuracies": 0.4699999988079071, + "rewards/chosen": 1.2840694189071655, + "rewards/margins": -0.12594786286354065, + "rewards/rejected": 1.4100172519683838, + "step": 200 + }, + { + "epoch": 0.014944704593005878, + "grad_norm": 14.743182182312012, + "learning_rate": 1.999877540340943e-05, + "logits/chosen": -18.121265411376953, + "logits/rejected": -17.966760635375977, + "logps/chosen": -480.9696960449219, + "logps/rejected": -391.3818359375, + "loss": 0.846, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 2.156686305999756, + "rewards/margins": 0.4148028492927551, + "rewards/rejected": 1.7418835163116455, + "step": 300 + }, + { + "epoch": 0.019926272790674503, + "grad_norm": 0.3787066340446472, + "learning_rate": 1.99978229739547e-05, + "logits/chosen": -18.205398559570312, + "logits/rejected": -18.042299270629883, + "logps/chosen": -502.7016296386719, + "logps/rejected": -388.99835205078125, + "loss": 0.7988, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 2.600520372390747, + "rewards/margins": 0.4970521926879883, + "rewards/rejected": 2.1034679412841797, + "step": 400 + }, + { + "epoch": 0.02490784098834313, + "grad_norm": 0.6821377873420715, + "learning_rate": 1.9996598466232097e-05, + "logits/chosen": -18.351791381835938, + "logits/rejected": -18.350332260131836, + "logps/chosen": -495.239501953125, + "logps/rejected": -396.9656677246094, + "loss": 0.9516, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 2.7651679515838623, + "rewards/margins": 0.4416518807411194, + "rewards/rejected": 2.3235161304473877, + "step": 500 + }, + { + "epoch": 0.029889409186011757, + "grad_norm": 0.07361862808465958, + "learning_rate": 1.9995101913565075e-05, + "logits/chosen": -18.08759117126465, + "logits/rejected": -18.078266143798828, + "logps/chosen": -500.9162902832031, + "logps/rejected": -413.58673095703125, + "loss": 0.8741, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 3.083859443664551, + "rewards/margins": 0.49825409054756165, + "rewards/rejected": 2.5856053829193115, + "step": 600 + }, + { + "epoch": 0.034870977383680384, + "grad_norm": 50.79634475708008, + "learning_rate": 1.9993333356680442e-05, + "logits/chosen": -17.93349838256836, + "logits/rejected": -17.859838485717773, + "logps/chosen": -576.14501953125, + "logps/rejected": -481.7210693359375, + "loss": 0.9994, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 2.9873363971710205, + "rewards/margins": 0.3300691843032837, + "rewards/rejected": 2.6572670936584473, + "step": 700 + }, + { + "epoch": 0.03985254558134901, + "grad_norm": 11.607032775878906, + "learning_rate": 1.999129284370727e-05, + "logits/chosen": -18.006515502929688, + "logits/rejected": -17.87860107421875, + "logps/chosen": -511.5252990722656, + "logps/rejected": -448.138916015625, + "loss": 1.0173, + "rewards/accuracies": 0.5299999713897705, + "rewards/chosen": 3.2953906059265137, + "rewards/margins": 0.5231221318244934, + "rewards/rejected": 2.772268772125244, + "step": 800 + }, + { + "epoch": 0.04483411377901764, + "grad_norm": 9.66622257232666, + "learning_rate": 1.9988980430175565e-05, + "logits/chosen": -17.94629669189453, + "logits/rejected": -17.793624877929688, + "logps/chosen": -471.1706237792969, + "logps/rejected": -380.40625, + "loss": 0.7616, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 3.2259023189544678, + "rewards/margins": 0.9019778966903687, + "rewards/rejected": 2.3239243030548096, + "step": 900 + }, + { + "epoch": 0.04981568197668626, + "grad_norm": 33.115692138671875, + "learning_rate": 1.998639617901478e-05, + "logits/chosen": -18.29867935180664, + "logits/rejected": -18.21681785583496, + "logps/chosen": -492.39471435546875, + "logps/rejected": -397.0972900390625, + "loss": 0.8836, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 4.284320831298828, + "rewards/margins": 1.2680495977401733, + "rewards/rejected": 3.0162715911865234, + "step": 1000 + }, + { + "epoch": 0.05479725017435489, + "grad_norm": 14.794266700744629, + "learning_rate": 1.998354016055208e-05, + "logits/chosen": -17.866899490356445, + "logits/rejected": -17.793582916259766, + "logps/chosen": -512.371337890625, + "logps/rejected": -424.1750793457031, + "loss": 1.2634, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 3.433595895767212, + "rewards/margins": 0.3185270428657532, + "rewards/rejected": 3.1150686740875244, + "step": 1100 + }, + { + "epoch": 0.059778818372023514, + "grad_norm": 177.02552795410156, + "learning_rate": 1.998041245251044e-05, + "logits/chosen": -18.298795700073242, + "logits/rejected": -18.078033447265625, + "logps/chosen": -464.45086669921875, + "logps/rejected": -387.45428466796875, + "loss": 1.0062, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 3.7040328979492188, + "rewards/margins": 0.5431541800498962, + "rewards/rejected": 3.1608786582946777, + "step": 1200 + }, + { + "epoch": 0.06476038656969214, + "grad_norm": 19.352441787719727, + "learning_rate": 1.997701314000653e-05, + "logits/chosen": -18.20465660095215, + "logits/rejected": -18.182998657226562, + "logps/chosen": -489.5882873535156, + "logps/rejected": -431.6067199707031, + "loss": 0.8781, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 4.215704917907715, + "rewards/margins": 0.7704020738601685, + "rewards/rejected": 3.445302963256836, + "step": 1300 + }, + { + "epoch": 0.06974195476736077, + "grad_norm": 22.920259475708008, + "learning_rate": 1.9973342315548398e-05, + "logits/chosen": -18.116256713867188, + "logits/rejected": -18.149843215942383, + "logps/chosen": -447.0269775390625, + "logps/rejected": -374.61383056640625, + "loss": 1.0611, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 3.63736629486084, + "rewards/margins": 0.6730349063873291, + "rewards/rejected": 2.96433162689209, + "step": 1400 + }, + { + "epoch": 0.07472352296502939, + "grad_norm": 20.65612030029297, + "learning_rate": 1.9969400079032947e-05, + "logits/chosen": -18.347074508666992, + "logits/rejected": -18.040178298950195, + "logps/chosen": -453.944091796875, + "logps/rejected": -380.4056396484375, + "loss": 1.0204, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 4.042062282562256, + "rewards/margins": 0.9009463787078857, + "rewards/rejected": 3.1411163806915283, + "step": 1500 + }, + { + "epoch": 0.07970509116269801, + "grad_norm": 50.09711456298828, + "learning_rate": 1.9965186537743215e-05, + "logits/chosen": -18.355621337890625, + "logits/rejected": -18.051054000854492, + "logps/chosen": -502.7710876464844, + "logps/rejected": -419.5497741699219, + "loss": 1.1749, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 3.215543508529663, + "rewards/margins": 0.48448604345321655, + "rewards/rejected": 2.731057643890381, + "step": 1600 + }, + { + "epoch": 0.08468665936036664, + "grad_norm": 0.4332411289215088, + "learning_rate": 1.9960701806345472e-05, + "logits/chosen": -18.210161209106445, + "logits/rejected": -18.06623077392578, + "logps/chosen": -449.0904235839844, + "logps/rejected": -365.96173095703125, + "loss": 0.7165, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 4.172327995300293, + "rewards/margins": 1.4230843782424927, + "rewards/rejected": 2.749243974685669, + "step": 1700 + }, + { + "epoch": 0.08966822755803527, + "grad_norm": 5.466333389282227, + "learning_rate": 1.9955946006886082e-05, + "logits/chosen": -18.5748348236084, + "logits/rejected": -18.16517448425293, + "logps/chosen": -438.857421875, + "logps/rejected": -416.3319396972656, + "loss": 0.8766, + "rewards/accuracies": 0.5699999928474426, + "rewards/chosen": 4.177321910858154, + "rewards/margins": 0.9888943433761597, + "rewards/rejected": 3.188427686691284, + "step": 1800 + }, + { + "epoch": 0.0946497957557039, + "grad_norm": 82.25857543945312, + "learning_rate": 1.995091926878819e-05, + "logits/chosen": -18.491161346435547, + "logits/rejected": -18.283767700195312, + "logps/chosen": -461.55023193359375, + "logps/rejected": -394.8086242675781, + "loss": 1.0777, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 3.3734805583953857, + "rewards/margins": 0.7351935505867004, + "rewards/rejected": 2.638287305831909, + "step": 1900 + }, + { + "epoch": 0.09963136395337252, + "grad_norm": 9.021427154541016, + "learning_rate": 1.9945621728848194e-05, + "logits/chosen": -18.71115493774414, + "logits/rejected": -18.314943313598633, + "logps/chosen": -466.3949890136719, + "logps/rejected": -393.8585205078125, + "loss": 0.6798, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": 3.843343496322632, + "rewards/margins": 1.1383906602859497, + "rewards/rejected": 2.7049529552459717, + "step": 2000 + }, + { + "epoch": 0.10461293215104114, + "grad_norm": 146.21475219726562, + "learning_rate": 1.9940053531232028e-05, + "logits/chosen": -18.590173721313477, + "logits/rejected": -18.441553115844727, + "logps/chosen": -454.3846435546875, + "logps/rejected": -387.5030212402344, + "loss": 1.1674, + "rewards/accuracies": 0.5699999928474426, + "rewards/chosen": 4.06519079208374, + "rewards/margins": 0.5631230473518372, + "rewards/rejected": 3.5020673274993896, + "step": 2100 + }, + { + "epoch": 0.10959450034870978, + "grad_norm": 4.808138847351074, + "learning_rate": 1.9934214827471244e-05, + "logits/chosen": -18.621475219726562, + "logits/rejected": -18.35665512084961, + "logps/chosen": -450.15582275390625, + "logps/rejected": -386.0933532714844, + "loss": 1.0704, + "rewards/accuracies": 0.5699999928474426, + "rewards/chosen": 3.885467290878296, + "rewards/margins": 0.752030611038208, + "rewards/rejected": 3.1334362030029297, + "step": 2200 + }, + { + "epoch": 0.1145760685463784, + "grad_norm": 5.842870235443115, + "learning_rate": 1.9928105776458864e-05, + "logits/chosen": -18.336530685424805, + "logits/rejected": -18.11532211303711, + "logps/chosen": -466.25738525390625, + "logps/rejected": -393.3591003417969, + "loss": 1.1451, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 3.678475856781006, + "rewards/margins": 0.4539036452770233, + "rewards/rejected": 3.22457218170166, + "step": 2300 + }, + { + "epoch": 0.11955763674404703, + "grad_norm": 37.61786651611328, + "learning_rate": 1.9921726544445084e-05, + "logits/chosen": -18.296964645385742, + "logits/rejected": -18.364625930786133, + "logps/chosen": -467.84027099609375, + "logps/rejected": -397.1224670410156, + "loss": 0.8784, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": 3.5043909549713135, + "rewards/margins": 1.0125629901885986, + "rewards/rejected": 2.4918274879455566, + "step": 2400 + }, + { + "epoch": 0.12453920494171565, + "grad_norm": 49.04602813720703, + "learning_rate": 1.9915077305032748e-05, + "logits/chosen": -18.40894317626953, + "logits/rejected": -18.2955322265625, + "logps/chosen": -503.1138610839844, + "logps/rejected": -375.34442138671875, + "loss": 0.9985, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 4.6265459060668945, + "rewards/margins": 1.7104928493499756, + "rewards/rejected": 2.91605281829834, + "step": 2500 + }, + { + "epoch": 0.1295207731393843, + "grad_norm": 16.836668014526367, + "learning_rate": 1.9908158239172596e-05, + "logits/chosen": -18.674049377441406, + "logits/rejected": -18.514965057373047, + "logps/chosen": -455.519775390625, + "logps/rejected": -374.22210693359375, + "loss": 0.9035, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 4.500489711761475, + "rewards/margins": 1.446923017501831, + "rewards/rejected": 3.0535662174224854, + "step": 2600 + }, + { + "epoch": 0.1345023413370529, + "grad_norm": 0.04983401298522949, + "learning_rate": 1.990096953515836e-05, + "logits/chosen": -18.647964477539062, + "logits/rejected": -18.637880325317383, + "logps/chosen": -465.9200439453125, + "logps/rejected": -411.40838623046875, + "loss": 1.2207, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 3.8630013465881348, + "rewards/margins": 0.7057845592498779, + "rewards/rejected": 3.157216787338257, + "step": 2700 + }, + { + "epoch": 0.13948390953472153, + "grad_norm": 0.5644310116767883, + "learning_rate": 1.9893511388621652e-05, + "logits/chosen": -18.66870880126953, + "logits/rejected": -18.76462745666504, + "logps/chosen": -513.6793823242188, + "logps/rejected": -469.8594055175781, + "loss": 1.5471, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 3.391056537628174, + "rewards/margins": 0.3123472332954407, + "rewards/rejected": 3.078709363937378, + "step": 2800 + }, + { + "epoch": 0.14446547773239016, + "grad_norm": 1.6863278150558472, + "learning_rate": 1.9885784002526616e-05, + "logits/chosen": -18.729633331298828, + "logits/rejected": -19.068260192871094, + "logps/chosen": -447.2772521972656, + "logps/rejected": -352.67791748046875, + "loss": 1.0868, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 3.924001693725586, + "rewards/margins": 0.8989758491516113, + "rewards/rejected": 3.0250258445739746, + "step": 2900 + }, + { + "epoch": 0.14944704593005878, + "grad_norm": 0.10964024066925049, + "learning_rate": 1.987778758716441e-05, + "logits/chosen": -19.014976501464844, + "logits/rejected": -19.501911163330078, + "logps/chosen": -475.1939392089844, + "logps/rejected": -385.38238525390625, + "loss": 1.0756, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 4.359086036682129, + "rewards/margins": 1.1548995971679688, + "rewards/rejected": 3.2041866779327393, + "step": 3000 + }, + { + "epoch": 0.1544286141277274, + "grad_norm": 7.479519367218018, + "learning_rate": 1.98695223601475e-05, + "logits/chosen": -18.8636531829834, + "logits/rejected": -19.16086196899414, + "logps/chosen": -484.1092529296875, + "logps/rejected": -399.1905212402344, + "loss": 0.8348, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": 4.458531379699707, + "rewards/margins": 1.7625274658203125, + "rewards/rejected": 2.6960039138793945, + "step": 3100 + }, + { + "epoch": 0.15941018232539603, + "grad_norm": 15.998089790344238, + "learning_rate": 1.986098854640371e-05, + "logits/chosen": -18.937522888183594, + "logits/rejected": -19.118017196655273, + "logps/chosen": -463.34149169921875, + "logps/rejected": -415.29827880859375, + "loss": 1.298, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 4.227731227874756, + "rewards/margins": 0.5569795370101929, + "rewards/rejected": 3.6707510948181152, + "step": 3200 + }, + { + "epoch": 0.16439175052306465, + "grad_norm": 0.7655884027481079, + "learning_rate": 1.9852186378170136e-05, + "logits/chosen": -18.893104553222656, + "logits/rejected": -19.257871627807617, + "logps/chosen": -531.3560791015625, + "logps/rejected": -465.7754821777344, + "loss": 1.1944, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 4.285092353820801, + "rewards/margins": 1.1729285717010498, + "rewards/rejected": 3.1121633052825928, + "step": 3300 + }, + { + "epoch": 0.16937331872073327, + "grad_norm": 10.586421012878418, + "learning_rate": 1.9843116094986783e-05, + "logits/chosen": -18.89116859436035, + "logits/rejected": -19.32411003112793, + "logps/chosen": -466.8319091796875, + "logps/rejected": -388.9800109863281, + "loss": 0.8162, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 4.402754306793213, + "rewards/margins": 1.1992639303207397, + "rewards/rejected": 3.203490972518921, + "step": 3400 + }, + { + "epoch": 0.17435488691840192, + "grad_norm": 0.0021288192365318537, + "learning_rate": 1.983377794369009e-05, + "logits/chosen": -18.90306854248047, + "logits/rejected": -19.6688289642334, + "logps/chosen": -506.9422912597656, + "logps/rejected": -422.39703369140625, + "loss": 0.9919, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 4.193739414215088, + "rewards/margins": 1.1028869152069092, + "rewards/rejected": 3.0908522605895996, + "step": 3500 + }, + { + "epoch": 0.17933645511607055, + "grad_norm": 72.208251953125, + "learning_rate": 1.982417217840618e-05, + "logits/chosen": -19.198213577270508, + "logits/rejected": -20.100387573242188, + "logps/chosen": -498.4687805175781, + "logps/rejected": -383.0224914550781, + "loss": 0.9324, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 4.158178806304932, + "rewards/margins": 1.5135530233383179, + "rewards/rejected": 2.6446259021759033, + "step": 3600 + }, + { + "epoch": 0.18431802331373917, + "grad_norm": 54.43558120727539, + "learning_rate": 1.9814299060543965e-05, + "logits/chosen": -19.100000381469727, + "logits/rejected": -20.164613723754883, + "logps/chosen": -523.7534790039062, + "logps/rejected": -420.5929260253906, + "loss": 1.0645, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 4.555855751037598, + "rewards/margins": 1.3967076539993286, + "rewards/rejected": 3.1591484546661377, + "step": 3700 + }, + { + "epoch": 0.1892995915114078, + "grad_norm": 44.80778503417969, + "learning_rate": 1.980415885878801e-05, + "logits/chosen": -19.23442840576172, + "logits/rejected": -20.248851776123047, + "logps/chosen": -470.9892578125, + "logps/rejected": -387.8487548828125, + "loss": 1.1345, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 4.9033355712890625, + "rewards/margins": 1.5088270902633667, + "rewards/rejected": 3.3945086002349854, + "step": 3800 + }, + { + "epoch": 0.19428115970907642, + "grad_norm": 0.029925603419542313, + "learning_rate": 1.979375184909125e-05, + "logits/chosen": -19.161788940429688, + "logits/rejected": -20.242706298828125, + "logps/chosen": -451.88165283203125, + "logps/rejected": -377.80078125, + "loss": 1.0498, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 3.468371868133545, + "rewards/margins": 1.5844755172729492, + "rewards/rejected": 1.8838963508605957, + "step": 3900 + }, + { + "epoch": 0.19926272790674504, + "grad_norm": 2.616316795349121, + "learning_rate": 1.9783078314667465e-05, + "logits/chosen": -19.053321838378906, + "logits/rejected": -20.00080108642578, + "logps/chosen": -502.9007568359375, + "logps/rejected": -397.90020751953125, + "loss": 0.9393, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 4.124849796295166, + "rewards/margins": 1.3231381177902222, + "rewards/rejected": 2.801711320877075, + "step": 4000 + }, + { + "epoch": 0.20424429610441366, + "grad_norm": 44.81772232055664, + "learning_rate": 1.9772138545983554e-05, + "logits/chosen": -18.997940063476562, + "logits/rejected": -19.999465942382812, + "logps/chosen": -494.4920349121094, + "logps/rejected": -401.1126708984375, + "loss": 0.9059, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 3.8888561725616455, + "rewards/margins": 1.703158974647522, + "rewards/rejected": 2.185697317123413, + "step": 4100 + }, + { + "epoch": 0.2092258643020823, + "grad_norm": 19.425662994384766, + "learning_rate": 1.9760932840751663e-05, + "logits/chosen": -18.9016056060791, + "logits/rejected": -19.416828155517578, + "logps/chosen": -483.6650390625, + "logps/rejected": -388.0088195800781, + "loss": 0.8963, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 5.2860283851623535, + "rewards/margins": 1.5827534198760986, + "rewards/rejected": 3.703275442123413, + "step": 4200 + }, + { + "epoch": 0.2142074324997509, + "grad_norm": 124.74931335449219, + "learning_rate": 1.9749461503921074e-05, + "logits/chosen": -18.898042678833008, + "logits/rejected": -19.673877716064453, + "logps/chosen": -497.69476318359375, + "logps/rejected": -364.8540954589844, + "loss": 1.0963, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 4.136105537414551, + "rewards/margins": 1.3833444118499756, + "rewards/rejected": 2.752761125564575, + "step": 4300 + }, + { + "epoch": 0.21918900069741956, + "grad_norm": 5.577426433563232, + "learning_rate": 1.973772484766989e-05, + "logits/chosen": -18.805566787719727, + "logits/rejected": -19.62226104736328, + "logps/chosen": -463.9582824707031, + "logps/rejected": -351.70867919921875, + "loss": 1.0113, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 4.1188740730285645, + "rewards/margins": 1.5241186618804932, + "rewards/rejected": 2.5947556495666504, + "step": 4400 + }, + { + "epoch": 0.22417056889508818, + "grad_norm": 0.4124658405780792, + "learning_rate": 1.9725723191396557e-05, + "logits/chosen": -18.83307647705078, + "logits/rejected": -19.278696060180664, + "logps/chosen": -466.99859619140625, + "logps/rejected": -398.4786682128906, + "loss": 1.622, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 4.280797958374023, + "rewards/margins": 0.31094062328338623, + "rewards/rejected": 3.9698569774627686, + "step": 4500 + }, + { + "epoch": 0.2291521370927568, + "grad_norm": 46.63705825805664, + "learning_rate": 1.971345686171116e-05, + "logits/chosen": -18.672901153564453, + "logits/rejected": -19.950056076049805, + "logps/chosen": -500.7174072265625, + "logps/rejected": -417.15191650390625, + "loss": 0.9252, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 4.5458478927612305, + "rewards/margins": 1.576623558998108, + "rewards/rejected": 2.969224452972412, + "step": 4600 + }, + { + "epoch": 0.23413370529042543, + "grad_norm": 18.882240295410156, + "learning_rate": 1.9700926192426554e-05, + "logits/chosen": -19.082120895385742, + "logits/rejected": -20.37308120727539, + "logps/chosen": -429.0272521972656, + "logps/rejected": -354.2383728027344, + "loss": 1.1165, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 3.655416965484619, + "rewards/margins": 1.2390317916870117, + "rewards/rejected": 2.4163851737976074, + "step": 4700 + }, + { + "epoch": 0.23911527348809405, + "grad_norm": 336.6611633300781, + "learning_rate": 1.9688131524549242e-05, + "logits/chosen": -19.020198822021484, + "logits/rejected": -19.45013999938965, + "logps/chosen": -459.5777587890625, + "logps/rejected": -409.13922119140625, + "loss": 1.2777, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 2.606769323348999, + "rewards/margins": 0.6296383142471313, + "rewards/rejected": 1.9771310091018677, + "step": 4800 + }, + { + "epoch": 0.24409684168576268, + "grad_norm": 72.95674133300781, + "learning_rate": 1.9675073206270148e-05, + "logits/chosen": -18.523130416870117, + "logits/rejected": -19.362272262573242, + "logps/chosen": -499.9466552734375, + "logps/rejected": -384.0169982910156, + "loss": 0.9484, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 4.083195209503174, + "rewards/margins": 1.7932839393615723, + "rewards/rejected": 2.2899110317230225, + "step": 4900 + }, + { + "epoch": 0.2490784098834313, + "grad_norm": 7.091724395751953, + "learning_rate": 1.9661751592955086e-05, + "logits/chosen": -18.576244354248047, + "logits/rejected": -19.872777938842773, + "logps/chosen": -543.4857177734375, + "logps/rejected": -440.7587890625, + "loss": 0.9586, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 4.305512428283691, + "rewards/margins": 1.4709885120391846, + "rewards/rejected": 2.8345236778259277, + "step": 5000 + }, + { + "epoch": 0.2490784098834313, + "eval_logits/chosen": -21.543413162231445, + "eval_logits/rejected": -22.481142044067383, + "eval_logps/chosen": -475.4376525878906, + "eval_logps/rejected": -402.1141052246094, + "eval_loss": 1.169881820678711, + "eval_rewards/accuracies": 0.6557591557502747, + "eval_rewards/chosen": 4.376668930053711, + "eval_rewards/margins": 1.5471277236938477, + "eval_rewards/rejected": 2.829540729522705, + "eval_runtime": 473.1936, + "eval_samples_per_second": 3.216, + "eval_steps_per_second": 0.404, + "step": 5000 + }, + { + "epoch": 0.2540599780810999, + "grad_norm": 82.43436431884766, + "learning_rate": 1.9648167047135133e-05, + "logits/chosen": -19.058635711669922, + "logits/rejected": -20.134428024291992, + "logps/chosen": -495.65887451171875, + "logps/rejected": -419.3161315917969, + "loss": 1.1698, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 3.87038516998291, + "rewards/margins": 1.3453155755996704, + "rewards/rejected": 2.52506947517395, + "step": 5100 + }, + { + "epoch": 0.2590415462787686, + "grad_norm": 3.7819454669952393, + "learning_rate": 1.9634319938496742e-05, + "logits/chosen": -19.017601013183594, + "logits/rejected": -20.623193740844727, + "logps/chosen": -478.9990539550781, + "logps/rejected": -395.05450439453125, + "loss": 1.1279, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": 3.3154103755950928, + "rewards/margins": 1.5955133438110352, + "rewards/rejected": 1.7198967933654785, + "step": 5200 + }, + { + "epoch": 0.26402311447643717, + "grad_norm": 1.351638674736023, + "learning_rate": 1.962021064387168e-05, + "logits/chosen": -18.885652542114258, + "logits/rejected": -19.914079666137695, + "logps/chosen": -510.6768493652344, + "logps/rejected": -441.775634765625, + "loss": 1.1382, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 4.05587911605835, + "rewards/margins": 1.2654640674591064, + "rewards/rejected": 2.790414810180664, + "step": 5300 + }, + { + "epoch": 0.2690046826741058, + "grad_norm": 32.41090393066406, + "learning_rate": 1.9605839547226785e-05, + "logits/chosen": -19.33073616027832, + "logits/rejected": -20.477949142456055, + "logps/chosen": -492.5516052246094, + "logps/rejected": -420.31036376953125, + "loss": 1.0482, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 3.8395843505859375, + "rewards/margins": 1.6333999633789062, + "rewards/rejected": 2.206184148788452, + "step": 5400 + }, + { + "epoch": 0.2739862508717744, + "grad_norm": 32.58820724487305, + "learning_rate": 1.9591207039653507e-05, + "logits/chosen": -19.26167106628418, + "logits/rejected": -20.89728546142578, + "logps/chosen": -438.7798156738281, + "logps/rejected": -363.2562255859375, + "loss": 1.243, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 3.1530792713165283, + "rewards/margins": 1.1595901250839233, + "rewards/rejected": 1.9934889078140259, + "step": 5500 + }, + { + "epoch": 0.27896781906944307, + "grad_norm": 93.86406707763672, + "learning_rate": 1.9576313519357265e-05, + "logits/chosen": -19.064878463745117, + "logits/rejected": -20.905057907104492, + "logps/chosen": -519.572509765625, + "logps/rejected": -442.0079040527344, + "loss": 1.2293, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 3.4798271656036377, + "rewards/margins": 1.09993577003479, + "rewards/rejected": 2.3798913955688477, + "step": 5600 + }, + { + "epoch": 0.28394938726711166, + "grad_norm": 26.577350616455078, + "learning_rate": 1.9561159391646618e-05, + "logits/chosen": -19.34862518310547, + "logits/rejected": -22.218530654907227, + "logps/chosen": -500.4524230957031, + "logps/rejected": -402.18389892578125, + "loss": 0.9679, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 3.561279535293579, + "rewards/margins": 1.6947131156921387, + "rewards/rejected": 1.8665661811828613, + "step": 5700 + }, + { + "epoch": 0.2889309554647803, + "grad_norm": 20.811817169189453, + "learning_rate": 1.9545745068922225e-05, + "logits/chosen": -19.238685607910156, + "logits/rejected": -20.936817169189453, + "logps/chosen": -499.0253601074219, + "logps/rejected": -417.6446533203125, + "loss": 1.1599, + "rewards/accuracies": 0.5299999713897705, + "rewards/chosen": 4.803900241851807, + "rewards/margins": 1.0667366981506348, + "rewards/rejected": 3.7371630668640137, + "step": 5800 + }, + { + "epoch": 0.29391252366244897, + "grad_norm": 23.467056274414062, + "learning_rate": 1.9530070970665638e-05, + "logits/chosen": -19.428844451904297, + "logits/rejected": -21.77304458618164, + "logps/chosen": -498.2739562988281, + "logps/rejected": -398.3455810546875, + "loss": 0.9347, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 4.099443435668945, + "rewards/margins": 1.4808638095855713, + "rewards/rejected": 2.6185789108276367, + "step": 5900 + }, + { + "epoch": 0.29889409186011756, + "grad_norm": 44.77325439453125, + "learning_rate": 1.951413752342786e-05, + "logits/chosen": -19.230180740356445, + "logits/rejected": -21.40241050720215, + "logps/chosen": -522.796630859375, + "logps/rejected": -409.90936279296875, + "loss": 1.0653, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 3.7546486854553223, + "rewards/margins": 2.06999135017395, + "rewards/rejected": 1.6846575736999512, + "step": 6000 + }, + { + "epoch": 0.3038756600577862, + "grad_norm": 0.1034678965806961, + "learning_rate": 1.949794516081777e-05, + "logits/chosen": -19.376697540283203, + "logits/rejected": -21.691804885864258, + "logps/chosen": -482.21807861328125, + "logps/rejected": -379.0809326171875, + "loss": 0.954, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 2.9128754138946533, + "rewards/margins": 1.5702927112579346, + "rewards/rejected": 1.3425830602645874, + "step": 6100 + }, + { + "epoch": 0.3088572282554548, + "grad_norm": 8.153312683105469, + "learning_rate": 1.9481494323490292e-05, + "logits/chosen": -20.608989715576172, + "logits/rejected": -25.289657592773438, + "logps/chosen": -456.2762756347656, + "logps/rejected": -363.2749938964844, + "loss": 1.0848, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 2.7271530628204346, + "rewards/margins": 1.8869810104370117, + "rewards/rejected": 0.8401720523834229, + "step": 6200 + }, + { + "epoch": 0.31383879645312346, + "grad_norm": 173.4281463623047, + "learning_rate": 1.9464785459134422e-05, + "logits/chosen": -20.402162551879883, + "logits/rejected": -26.399858474731445, + "logps/chosen": -496.73199462890625, + "logps/rejected": -371.1173400878906, + "loss": 1.0173, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 3.2006542682647705, + "rewards/margins": 2.35373592376709, + "rewards/rejected": 0.8469181060791016, + "step": 6300 + }, + { + "epoch": 0.31882036465079205, + "grad_norm": 71.35045623779297, + "learning_rate": 1.9447819022461036e-05, + "logits/chosen": -20.591211318969727, + "logits/rejected": -21.949787139892578, + "logps/chosen": -505.4292297363281, + "logps/rejected": -457.5049133300781, + "loss": 1.334, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 3.6680984497070312, + "rewards/margins": 1.0007195472717285, + "rewards/rejected": 2.6673781871795654, + "step": 6400 + }, + { + "epoch": 0.3238019328484607, + "grad_norm": 71.10392761230469, + "learning_rate": 1.9430595475190528e-05, + "logits/chosen": -20.5976505279541, + "logits/rejected": -22.98895263671875, + "logps/chosen": -485.20758056640625, + "logps/rejected": -426.62078857421875, + "loss": 1.1786, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 3.1840951442718506, + "rewards/margins": 1.0180020332336426, + "rewards/rejected": 2.166092872619629, + "step": 6500 + }, + { + "epoch": 0.3287835010461293, + "grad_norm": 20.788406372070312, + "learning_rate": 1.9413115286040228e-05, + "logits/chosen": -20.659372329711914, + "logits/rejected": -24.63149642944336, + "logps/chosen": -496.0128173828125, + "logps/rejected": -426.76092529296875, + "loss": 1.2417, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 2.6715264320373535, + "rewards/margins": 1.3509292602539062, + "rewards/rejected": 1.3205969333648682, + "step": 6600 + }, + { + "epoch": 0.33376506924379795, + "grad_norm": 47.43177032470703, + "learning_rate": 1.9395378930711654e-05, + "logits/chosen": -21.26150894165039, + "logits/rejected": -27.082225799560547, + "logps/chosen": -484.2604675292969, + "logps/rejected": -407.31512451171875, + "loss": 1.0294, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": 2.974027633666992, + "rewards/margins": 2.4509451389312744, + "rewards/rejected": 0.5230829119682312, + "step": 6700 + }, + { + "epoch": 0.33874663744146655, + "grad_norm": 29.3293514251709, + "learning_rate": 1.9377386891877572e-05, + "logits/chosen": -20.902864456176758, + "logits/rejected": -24.1262264251709, + "logps/chosen": -505.2213134765625, + "logps/rejected": -438.3580017089844, + "loss": 1.5894, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.993202567100525, + "rewards/margins": 0.9017642736434937, + "rewards/rejected": 1.0914379358291626, + "step": 6800 + }, + { + "epoch": 0.3437282056391352, + "grad_norm": 85.70783996582031, + "learning_rate": 1.9359139659168845e-05, + "logits/chosen": -19.933032989501953, + "logits/rejected": -23.047008514404297, + "logps/chosen": -496.6371154785156, + "logps/rejected": -435.8622131347656, + "loss": 1.0725, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 1.8428994417190552, + "rewards/margins": 1.234021782875061, + "rewards/rejected": 0.6088778376579285, + "step": 6900 + }, + { + "epoch": 0.34870977383680385, + "grad_norm": 44.99619674682617, + "learning_rate": 1.9340637729161137e-05, + "logits/chosen": -20.23802375793457, + "logits/rejected": -22.14116668701172, + "logps/chosen": -505.2397155761719, + "logps/rejected": -458.6205139160156, + "loss": 1.4139, + "rewards/accuracies": 0.5699999928474426, + "rewards/chosen": 1.7950440645217896, + "rewards/margins": 1.0467220544815063, + "rewards/rejected": 0.7483220100402832, + "step": 7000 + }, + { + "epoch": 0.35369134203447244, + "grad_norm": 2.535125886993228e-08, + "learning_rate": 1.9321881605361363e-05, + "logits/chosen": -19.776222229003906, + "logits/rejected": -22.802228927612305, + "logps/chosen": -552.8232421875, + "logps/rejected": -477.6277770996094, + "loss": 1.0137, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": 3.883512020111084, + "rewards/margins": 2.6781344413757324, + "rewards/rejected": 1.2053773403167725, + "step": 7100 + }, + { + "epoch": 0.3586729102321411, + "grad_norm": 104.68194580078125, + "learning_rate": 1.9302871798194005e-05, + "logits/chosen": -21.631492614746094, + "logits/rejected": -25.338726043701172, + "logps/chosen": -458.9781494140625, + "logps/rejected": -421.4054260253906, + "loss": 1.5316, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.5002456903457642, + "rewards/margins": 1.058099389076233, + "rewards/rejected": 0.4421464204788208, + "step": 7200 + }, + { + "epoch": 0.3636544784298097, + "grad_norm": 15.395011901855469, + "learning_rate": 1.9283608824987236e-05, + "logits/chosen": -21.326448440551758, + "logits/rejected": -27.821928024291992, + "logps/chosen": -519.0068969726562, + "logps/rejected": -431.6269226074219, + "loss": 1.4097, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 2.3849658966064453, + "rewards/margins": 2.2866411209106445, + "rewards/rejected": 0.09832416474819183, + "step": 7300 + }, + { + "epoch": 0.36863604662747834, + "grad_norm": 5.503931999206543, + "learning_rate": 1.9264093209958822e-05, + "logits/chosen": -22.2663631439209, + "logits/rejected": -30.09918212890625, + "logps/chosen": -519.89013671875, + "logps/rejected": -434.9560241699219, + "loss": 0.6146, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": 1.8443881273269653, + "rewards/margins": 3.3645260334014893, + "rewards/rejected": -1.5201376676559448, + "step": 7400 + }, + { + "epoch": 0.37361761482514694, + "grad_norm": 0.397699773311615, + "learning_rate": 1.9244325484201844e-05, + "logits/chosen": -21.55438995361328, + "logits/rejected": -26.417490005493164, + "logps/chosen": -578.6663208007812, + "logps/rejected": -494.0219421386719, + "loss": 1.32, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 2.8062126636505127, + "rewards/margins": 1.9167245626449585, + "rewards/rejected": 0.8894882798194885, + "step": 7500 + }, + { + "epoch": 0.3785991830228156, + "grad_norm": 1.4006325006484985, + "learning_rate": 1.9224306185670284e-05, + "logits/chosen": -21.26766586303711, + "logits/rejected": -23.203746795654297, + "logps/chosen": -524.5022583007812, + "logps/rejected": -450.9858093261719, + "loss": 1.2222, + "rewards/accuracies": 0.5699999928474426, + "rewards/chosen": 2.8659799098968506, + "rewards/margins": 1.5051510334014893, + "rewards/rejected": 1.3608287572860718, + "step": 7600 + }, + { + "epoch": 0.3835807512204842, + "grad_norm": 77.18798828125, + "learning_rate": 1.9204035859164346e-05, + "logits/chosen": -20.718629837036133, + "logits/rejected": -24.271589279174805, + "logps/chosen": -482.0295715332031, + "logps/rejected": -402.9772644042969, + "loss": 1.4243, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 2.1575722694396973, + "rewards/margins": 1.1561360359191895, + "rewards/rejected": 1.0014359951019287, + "step": 7700 + }, + { + "epoch": 0.38856231941815284, + "grad_norm": 7.084451675415039, + "learning_rate": 1.9183515056315664e-05, + "logits/chosen": -20.11510467529297, + "logits/rejected": -22.786483764648438, + "logps/chosen": -521.9429931640625, + "logps/rejected": -447.221923828125, + "loss": 1.2535, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 3.086232900619507, + "rewards/margins": 1.8227348327636719, + "rewards/rejected": 1.2634981870651245, + "step": 7800 + }, + { + "epoch": 0.3935438876158215, + "grad_norm": 26.435029983520508, + "learning_rate": 1.9162744335572254e-05, + "logits/chosen": -20.078449249267578, + "logits/rejected": -21.64859390258789, + "logps/chosen": -495.9106750488281, + "logps/rejected": -439.6401672363281, + "loss": 1.3286, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 2.769630193710327, + "rewards/margins": 0.8031193017959595, + "rewards/rejected": 1.9665107727050781, + "step": 7900 + }, + { + "epoch": 0.3985254558134901, + "grad_norm": 55.87978744506836, + "learning_rate": 1.9141724262183347e-05, + "logits/chosen": -19.4700927734375, + "logits/rejected": -24.67208480834961, + "logps/chosen": -476.494873046875, + "logps/rejected": -370.9423828125, + "loss": 1.1024, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 3.3464269638061523, + "rewards/margins": 1.9073596000671387, + "rewards/rejected": 1.4390674829483032, + "step": 8000 + }, + { + "epoch": 0.40350702401115873, + "grad_norm": 72.6644515991211, + "learning_rate": 1.9120455408183996e-05, + "logits/chosen": -19.84633445739746, + "logits/rejected": -23.482101440429688, + "logps/chosen": -469.52984619140625, + "logps/rejected": -371.6656494140625, + "loss": 1.0347, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 3.2709081172943115, + "rewards/margins": 1.9965094327926636, + "rewards/rejected": 1.2743984460830688, + "step": 8100 + }, + { + "epoch": 0.40848859220882733, + "grad_norm": 7.036466121673584, + "learning_rate": 1.9098938352379497e-05, + "logits/chosen": -19.558134078979492, + "logits/rejected": -22.82000732421875, + "logps/chosen": -516.8818359375, + "logps/rejected": -426.9209289550781, + "loss": 1.3704, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 2.8734841346740723, + "rewards/margins": 1.5538628101348877, + "rewards/rejected": 1.3196213245391846, + "step": 8200 + }, + { + "epoch": 0.413470160406496, + "grad_norm": 19.03754997253418, + "learning_rate": 1.9077173680329667e-05, + "logits/chosen": -19.861852645874023, + "logits/rejected": -20.495954513549805, + "logps/chosen": -434.7484436035156, + "logps/rejected": -418.93182373046875, + "loss": 1.4406, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 3.662661552429199, + "rewards/margins": 1.0773922204971313, + "rewards/rejected": 2.5852692127227783, + "step": 8300 + }, + { + "epoch": 0.4184517286041646, + "grad_norm": 0.007115426007658243, + "learning_rate": 1.9055161984332865e-05, + "logits/chosen": -19.320505142211914, + "logits/rejected": -20.781309127807617, + "logps/chosen": -505.4100036621094, + "logps/rejected": -399.3970031738281, + "loss": 1.2344, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 2.867283582687378, + "rewards/margins": 1.5176202058792114, + "rewards/rejected": 1.3496633768081665, + "step": 8400 + }, + { + "epoch": 0.4234332968018332, + "grad_norm": 27.222930908203125, + "learning_rate": 1.9032903863409916e-05, + "logits/chosen": -19.227426528930664, + "logits/rejected": -20.481821060180664, + "logps/chosen": -504.85076904296875, + "logps/rejected": -399.4018249511719, + "loss": 1.1368, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 2.5695528984069824, + "rewards/margins": 1.6272262334823608, + "rewards/rejected": 0.9423269033432007, + "step": 8500 + }, + { + "epoch": 0.4284148649995018, + "grad_norm": 118.92176055908203, + "learning_rate": 1.901039992328779e-05, + "logits/chosen": -19.390897750854492, + "logits/rejected": -20.656518936157227, + "logps/chosen": -502.2049865722656, + "logps/rejected": -406.15740966796875, + "loss": 1.3008, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 2.6097073554992676, + "rewards/margins": 1.4985759258270264, + "rewards/rejected": 1.1111317873001099, + "step": 8600 + }, + { + "epoch": 0.43339643319717047, + "grad_norm": 135.84136962890625, + "learning_rate": 1.8987650776383116e-05, + "logits/chosen": -19.83563232421875, + "logits/rejected": -20.294017791748047, + "logps/chosen": -496.2801513671875, + "logps/rejected": -452.8775634765625, + "loss": 1.689, + "rewards/accuracies": 0.5699999928474426, + "rewards/chosen": 2.978971481323242, + "rewards/margins": 0.9833757281303406, + "rewards/rejected": 1.9955962896347046, + "step": 8700 + }, + { + "epoch": 0.4383780013948391, + "grad_norm": 0.03969337046146393, + "learning_rate": 1.896465704178551e-05, + "logits/chosen": -19.218610763549805, + "logits/rejected": -20.0975284576416, + "logps/chosen": -479.8434753417969, + "logps/rejected": -410.504150390625, + "loss": 1.2651, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 2.871466875076294, + "rewards/margins": 1.330002784729004, + "rewards/rejected": 1.5414642095565796, + "step": 8800 + }, + { + "epoch": 0.4433595695925077, + "grad_norm": 2.9111685752868652, + "learning_rate": 1.8941419345240763e-05, + "logits/chosen": -19.58942222595215, + "logits/rejected": -20.811443328857422, + "logps/chosen": -435.19122314453125, + "logps/rejected": -364.0976867675781, + "loss": 1.164, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 3.163036823272705, + "rewards/margins": 1.286778450012207, + "rewards/rejected": 1.876258373260498, + "step": 8900 + }, + { + "epoch": 0.44834113779017637, + "grad_norm": 92.33987426757812, + "learning_rate": 1.891793831913376e-05, + "logits/chosen": -19.12569808959961, + "logits/rejected": -20.312471389770508, + "logps/chosen": -539.9608764648438, + "logps/rejected": -456.8437805175781, + "loss": 1.0481, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": 2.520881175994873, + "rewards/margins": 1.5386346578598022, + "rewards/rejected": 0.982246458530426, + "step": 9000 + }, + { + "epoch": 0.45332270598784496, + "grad_norm": 24.77559471130371, + "learning_rate": 1.8894214602471307e-05, + "logits/chosen": -19.473718643188477, + "logits/rejected": -21.318897247314453, + "logps/chosen": -499.727783203125, + "logps/rejected": -432.5865478515625, + "loss": 1.3744, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 2.8033368587493896, + "rewards/margins": 1.5860238075256348, + "rewards/rejected": 1.2173125743865967, + "step": 9100 + }, + { + "epoch": 0.4583042741855136, + "grad_norm": 2.553715467453003, + "learning_rate": 1.887024884086473e-05, + "logits/chosen": -19.989469528198242, + "logits/rejected": -21.36966323852539, + "logps/chosen": -485.37139892578125, + "logps/rejected": -414.8341369628906, + "loss": 1.2352, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 2.2446627616882324, + "rewards/margins": 2.1422224044799805, + "rewards/rejected": 0.10244012624025345, + "step": 9200 + }, + { + "epoch": 0.4632858423831822, + "grad_norm": 9.945940017700195, + "learning_rate": 1.88460416865123e-05, + "logits/chosen": -19.838525772094727, + "logits/rejected": -21.396879196166992, + "logps/chosen": -510.5537109375, + "logps/rejected": -417.2762451171875, + "loss": 1.1751, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 1.9794914722442627, + "rewards/margins": 1.851491093635559, + "rewards/rejected": 0.12800025939941406, + "step": 9300 + }, + { + "epoch": 0.46826741058085086, + "grad_norm": 15.31814193725586, + "learning_rate": 1.88215937981815e-05, + "logits/chosen": -19.403379440307617, + "logits/rejected": -20.53765869140625, + "logps/chosen": -476.90802001953125, + "logps/rejected": -390.69744873046875, + "loss": 1.7465, + "rewards/accuracies": 0.5099999904632568, + "rewards/chosen": 3.1520519256591797, + "rewards/margins": 0.6633343696594238, + "rewards/rejected": 2.488717555999756, + "step": 9400 + }, + { + "epoch": 0.47324897877851946, + "grad_norm": 2.4341812133789062, + "learning_rate": 1.879690584119108e-05, + "logits/chosen": -18.863977432250977, + "logits/rejected": -19.929393768310547, + "logps/chosen": -451.2972717285156, + "logps/rejected": -356.81878662109375, + "loss": 1.0762, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 3.5310535430908203, + "rewards/margins": 0.9716143012046814, + "rewards/rejected": 2.5594394207000732, + "step": 9500 + }, + { + "epoch": 0.4782305469761881, + "grad_norm": 0.5188534259796143, + "learning_rate": 1.8771978487392965e-05, + "logits/chosen": -19.067102432250977, + "logits/rejected": -19.693904876708984, + "logps/chosen": -436.10125732421875, + "logps/rejected": -371.744140625, + "loss": 1.2393, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 3.686614990234375, + "rewards/margins": 1.6963415145874023, + "rewards/rejected": 1.9902732372283936, + "step": 9600 + }, + { + "epoch": 0.4832121151738567, + "grad_norm": 0.9292926788330078, + "learning_rate": 1.874681241515396e-05, + "logits/chosen": -18.863676071166992, + "logits/rejected": -18.958852767944336, + "logps/chosen": -462.2861328125, + "logps/rejected": -413.9620361328125, + "loss": 1.2257, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 3.4554378986358643, + "rewards/margins": 0.9302346110343933, + "rewards/rejected": 2.525202989578247, + "step": 9700 + }, + { + "epoch": 0.48819368337152536, + "grad_norm": 35.07600402832031, + "learning_rate": 1.8721408309337295e-05, + "logits/chosen": -18.920787811279297, + "logits/rejected": -19.5614070892334, + "logps/chosen": -475.8876647949219, + "logps/rejected": -409.3224182128906, + "loss": 1.2497, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 3.723018169403076, + "rewards/margins": 1.3657230138778687, + "rewards/rejected": 2.357295036315918, + "step": 9800 + }, + { + "epoch": 0.493175251569194, + "grad_norm": 0.003431697143241763, + "learning_rate": 1.8695766861283987e-05, + "logits/chosen": -18.851102828979492, + "logits/rejected": -19.668804168701172, + "logps/chosen": -505.24945068359375, + "logps/rejected": -430.45428466796875, + "loss": 1.1504, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 3.6089422702789307, + "rewards/margins": 1.7381848096847534, + "rewards/rejected": 1.8707573413848877, + "step": 9900 + }, + { + "epoch": 0.4981568197668626, + "grad_norm": 65.25814819335938, + "learning_rate": 1.8669888768794024e-05, + "logits/chosen": -18.943655014038086, + "logits/rejected": -19.943601608276367, + "logps/chosen": -456.38531494140625, + "logps/rejected": -394.1759033203125, + "loss": 1.1225, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 3.0651655197143555, + "rewards/margins": 1.2026199102401733, + "rewards/rejected": 1.8625457286834717, + "step": 10000 + }, + { + "epoch": 0.4981568197668626, + "eval_logits/chosen": -20.5496826171875, + "eval_logits/rejected": -21.546123504638672, + "eval_logps/chosen": -477.2846984863281, + "eval_logps/rejected": -404.45428466796875, + "eval_loss": 1.1951801776885986, + "eval_rewards/accuracies": 0.6335078477859497, + "eval_rewards/chosen": 4.191972732543945, + "eval_rewards/margins": 1.5964468717575073, + "eval_rewards/rejected": 2.5955255031585693, + "eval_runtime": 472.8478, + "eval_samples_per_second": 3.219, + "eval_steps_per_second": 0.404, + "step": 10000 + }, + { + "epoch": 0.5031383879645313, + "grad_norm": 94.86180114746094, + "learning_rate": 1.8643774736107384e-05, + "logits/chosen": -18.719505310058594, + "logits/rejected": -19.4000244140625, + "logps/chosen": -508.5538330078125, + "logps/rejected": -445.2476806640625, + "loss": 1.3446, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 3.225598096847534, + "rewards/margins": 1.1447488069534302, + "rewards/rejected": 2.0808494091033936, + "step": 10100 + }, + { + "epoch": 0.5081199561621998, + "grad_norm": 48.567291259765625, + "learning_rate": 1.8617425473884855e-05, + "logits/chosen": -18.674545288085938, + "logits/rejected": -19.417861938476562, + "logps/chosen": -519.6810302734375, + "logps/rejected": -433.57940673828125, + "loss": 1.3277, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 2.415245771408081, + "rewards/margins": 1.2748632431030273, + "rewards/rejected": 1.1403824090957642, + "step": 10200 + }, + { + "epoch": 0.5131015243598684, + "grad_norm": 6.8724141120910645, + "learning_rate": 1.859084169918871e-05, + "logits/chosen": -18.91655731201172, + "logits/rejected": -19.582307815551758, + "logps/chosen": -490.5546569824219, + "logps/rejected": -427.4134521484375, + "loss": 1.5115, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 2.807913303375244, + "rewards/margins": 1.6051901578903198, + "rewards/rejected": 1.2027232646942139, + "step": 10300 + }, + { + "epoch": 0.5180830925575372, + "grad_norm": 11.200637817382812, + "learning_rate": 1.8564024135463173e-05, + "logits/chosen": -18.65709114074707, + "logits/rejected": -18.75111198425293, + "logps/chosen": -456.7597961425781, + "logps/rejected": -418.4756774902344, + "loss": 1.4256, + "rewards/accuracies": 0.5699999928474426, + "rewards/chosen": 1.6474815607070923, + "rewards/margins": 0.9819788336753845, + "rewards/rejected": 0.6655027866363525, + "step": 10400 + }, + { + "epoch": 0.5230646607552057, + "grad_norm": 14.02713394165039, + "learning_rate": 1.8536973512514762e-05, + "logits/chosen": -18.270898818969727, + "logits/rejected": -18.737123489379883, + "logps/chosen": -495.671875, + "logps/rejected": -404.7850646972656, + "loss": 1.1738, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": 3.6311376094818115, + "rewards/margins": 1.3220287561416626, + "rewards/rejected": 2.3091087341308594, + "step": 10500 + }, + { + "epoch": 0.5280462289528743, + "grad_norm": 88.36447143554688, + "learning_rate": 1.85096905664924e-05, + "logits/chosen": -18.358213424682617, + "logits/rejected": -18.569581985473633, + "logps/chosen": -460.0679931640625, + "logps/rejected": -420.5664978027344, + "loss": 1.4962, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.870842456817627, + "rewards/margins": 1.0018635988235474, + "rewards/rejected": 0.8689790368080139, + "step": 10600 + }, + { + "epoch": 0.5330277971505429, + "grad_norm": 2.6545143127441406, + "learning_rate": 1.848217603986739e-05, + "logits/chosen": -18.19515037536621, + "logits/rejected": -18.503700256347656, + "logps/chosen": -482.6021423339844, + "logps/rejected": -439.8750915527344, + "loss": 1.3315, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 2.491628408432007, + "rewards/margins": 0.7911645770072937, + "rewards/rejected": 1.700463891029358, + "step": 10700 + }, + { + "epoch": 0.5380093653482116, + "grad_norm": 54.594303131103516, + "learning_rate": 1.845443068141322e-05, + "logits/chosen": -18.29205894470215, + "logits/rejected": -19.060501098632812, + "logps/chosen": -499.4178466796875, + "logps/rejected": -410.4550476074219, + "loss": 1.3476, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 3.079913377761841, + "rewards/margins": 1.4285519123077393, + "rewards/rejected": 1.6513612270355225, + "step": 10800 + }, + { + "epoch": 0.5429909335458802, + "grad_norm": 7.620671272277832, + "learning_rate": 1.8426455246185177e-05, + "logits/chosen": -18.482242584228516, + "logits/rejected": -19.06051254272461, + "logps/chosen": -488.9962463378906, + "logps/rejected": -414.5852355957031, + "loss": 1.118, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": 3.6191320419311523, + "rewards/margins": 1.9326629638671875, + "rewards/rejected": 1.6864690780639648, + "step": 10900 + }, + { + "epoch": 0.5479725017435488, + "grad_norm": 44.31614685058594, + "learning_rate": 1.8398250495499796e-05, + "logits/chosen": -18.507394790649414, + "logits/rejected": -19.16678810119629, + "logps/chosen": -496.0931701660156, + "logps/rejected": -424.0758056640625, + "loss": 1.3294, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 3.764357805252075, + "rewards/margins": 1.4526183605194092, + "rewards/rejected": 2.311739444732666, + "step": 11000 + }, + { + "epoch": 0.5529540699412175, + "grad_norm": 113.41361999511719, + "learning_rate": 1.8369817196914145e-05, + "logits/chosen": -18.77853775024414, + "logits/rejected": -19.550212860107422, + "logps/chosen": -458.4302978515625, + "logps/rejected": -376.8597717285156, + "loss": 1.2723, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 2.409397602081299, + "rewards/margins": 1.0338749885559082, + "rewards/rejected": 1.375522255897522, + "step": 11100 + }, + { + "epoch": 0.5579356381388861, + "grad_norm": 39.32530212402344, + "learning_rate": 1.8341156124204943e-05, + "logits/chosen": -18.80110740661621, + "logits/rejected": -19.126850128173828, + "logps/chosen": -444.9359436035156, + "logps/rejected": -403.28546142578125, + "loss": 1.2621, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 3.3543567657470703, + "rewards/margins": 1.4829553365707397, + "rewards/rejected": 1.8714015483856201, + "step": 11200 + }, + { + "epoch": 0.5629172063365547, + "grad_norm": 0.01231900043785572, + "learning_rate": 1.8312268057347488e-05, + "logits/chosen": -19.005640029907227, + "logits/rejected": -19.596464157104492, + "logps/chosen": -466.5137634277344, + "logps/rejected": -408.3565673828125, + "loss": 1.2726, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 2.5804996490478516, + "rewards/margins": 1.2464163303375244, + "rewards/rejected": 1.3340831995010376, + "step": 11300 + }, + { + "epoch": 0.5678987745342233, + "grad_norm": 78.91645050048828, + "learning_rate": 1.8283153782494457e-05, + "logits/chosen": -19.66242218017578, + "logits/rejected": -20.03885269165039, + "logps/chosen": -495.5252380371094, + "logps/rejected": -428.739501953125, + "loss": 1.1003, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 0.8916977047920227, + "rewards/margins": 2.0576887130737305, + "rewards/rejected": -1.165990948677063, + "step": 11400 + }, + { + "epoch": 0.572880342731892, + "grad_norm": 2.7297961711883545, + "learning_rate": 1.8253814091954476e-05, + "logits/chosen": -19.751190185546875, + "logits/rejected": -20.76055335998535, + "logps/chosen": -470.715576171875, + "logps/rejected": -401.6990661621094, + "loss": 1.1496, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": -0.058783989399671555, + "rewards/margins": 1.678650975227356, + "rewards/rejected": -1.7374348640441895, + "step": 11500 + }, + { + "epoch": 0.5778619109295606, + "grad_norm": 11.752656936645508, + "learning_rate": 1.8224249784170595e-05, + "logits/chosen": -19.580923080444336, + "logits/rejected": -20.732593536376953, + "logps/chosen": -517.3013916015625, + "logps/rejected": -441.9253845214844, + "loss": 1.2111, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -0.6827618479728699, + "rewards/margins": 1.910689115524292, + "rewards/rejected": -2.5934510231018066, + "step": 11600 + }, + { + "epoch": 0.5828434791272292, + "grad_norm": 96.53925323486328, + "learning_rate": 1.8194461663698524e-05, + "logits/chosen": -19.67738914489746, + "logits/rejected": -21.431556701660156, + "logps/chosen": -518.2506103515625, + "logps/rejected": -389.5613708496094, + "loss": 1.3043, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.12440891563892365, + "rewards/margins": 2.5978763103485107, + "rewards/rejected": -2.4734673500061035, + "step": 11700 + }, + { + "epoch": 0.5878250473248979, + "grad_norm": 62.21799850463867, + "learning_rate": 1.8164450541184768e-05, + "logits/chosen": -19.013898849487305, + "logits/rejected": -19.318574905395508, + "logps/chosen": -564.2166137695312, + "logps/rejected": -528.39111328125, + "loss": 1.3621, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 0.12132181972265244, + "rewards/margins": 1.5959105491638184, + "rewards/rejected": -1.4745885133743286, + "step": 11800 + }, + { + "epoch": 0.5928066155225665, + "grad_norm": 0.12062743306159973, + "learning_rate": 1.8134217233344556e-05, + "logits/chosen": -19.182098388671875, + "logits/rejected": -19.83804702758789, + "logps/chosen": -525.335693359375, + "logps/rejected": -455.0185852050781, + "loss": 1.3079, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -0.4299677908420563, + "rewards/margins": 1.420630931854248, + "rewards/rejected": -1.8505988121032715, + "step": 11900 + }, + { + "epoch": 0.5977881837202351, + "grad_norm": 4.478858470916748, + "learning_rate": 1.81037625629396e-05, + "logits/chosen": -18.84477996826172, + "logits/rejected": -19.84359359741211, + "logps/chosen": -519.6325073242188, + "logps/rejected": -437.3699951171875, + "loss": 1.2444, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 1.3549919128417969, + "rewards/margins": 2.0743324756622314, + "rewards/rejected": -0.7193406820297241, + "step": 12000 + }, + { + "epoch": 0.6027697519179037, + "grad_norm": 43.38969039916992, + "learning_rate": 1.8073087358755735e-05, + "logits/chosen": -18.777620315551758, + "logits/rejected": -19.37495231628418, + "logps/chosen": -485.4015197753906, + "logps/rejected": -413.2806396484375, + "loss": 1.2657, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 2.3500404357910156, + "rewards/margins": 2.2481086254119873, + "rewards/rejected": 0.10193166881799698, + "step": 12100 + }, + { + "epoch": 0.6077513201155724, + "grad_norm": 29.47319984436035, + "learning_rate": 1.804219245558033e-05, + "logits/chosen": -18.747379302978516, + "logits/rejected": -19.17096519470215, + "logps/chosen": -476.1571044921875, + "logps/rejected": -414.704345703125, + "loss": 1.3264, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 1.9210524559020996, + "rewards/margins": 0.9324368238449097, + "rewards/rejected": 0.9886155724525452, + "step": 12200 + }, + { + "epoch": 0.612732888313241, + "grad_norm": 57.94328689575195, + "learning_rate": 1.8011078694179602e-05, + "logits/chosen": -18.417835235595703, + "logits/rejected": -18.728105545043945, + "logps/chosen": -466.6083068847656, + "logps/rejected": -417.28936767578125, + "loss": 1.3787, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 2.9616811275482178, + "rewards/margins": 0.9320456981658936, + "rewards/rejected": 2.0296356678009033, + "step": 12300 + }, + { + "epoch": 0.6177144565109096, + "grad_norm": 0.6535269021987915, + "learning_rate": 1.7979746921275713e-05, + "logits/chosen": -18.470064163208008, + "logits/rejected": -19.071678161621094, + "logps/chosen": -499.7461242675781, + "logps/rejected": -414.9991455078125, + "loss": 1.0886, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 2.9048681259155273, + "rewards/margins": 1.7158997058868408, + "rewards/rejected": 1.188968300819397, + "step": 12400 + }, + { + "epoch": 0.6226960247085782, + "grad_norm": 47.690162658691406, + "learning_rate": 1.794819798952374e-05, + "logits/chosen": -18.49652862548828, + "logits/rejected": -18.98128318786621, + "logps/chosen": -571.6408081054688, + "logps/rejected": -473.7767333984375, + "loss": 1.2108, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": 2.7018349170684814, + "rewards/margins": 2.354581832885742, + "rewards/rejected": 0.34725311398506165, + "step": 12500 + }, + { + "epoch": 0.6276775929062469, + "grad_norm": 69.55400085449219, + "learning_rate": 1.7916432757488467e-05, + "logits/chosen": -19.46697235107422, + "logits/rejected": -20.09600830078125, + "logps/chosen": -524.7301635742188, + "logps/rejected": -433.07293701171875, + "loss": 1.0183, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": 1.3079893589019775, + "rewards/margins": 3.2492458820343018, + "rewards/rejected": -1.9412565231323242, + "step": 12600 + }, + { + "epoch": 0.6326591611039155, + "grad_norm": 14.792251586914062, + "learning_rate": 1.7884452089621012e-05, + "logits/chosen": -19.28809928894043, + "logits/rejected": -20.2492733001709, + "logps/chosen": -578.1820068359375, + "logps/rejected": -456.3279724121094, + "loss": 1.1159, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": 1.191977858543396, + "rewards/margins": 2.5655128955841064, + "rewards/rejected": -1.37353515625, + "step": 12700 + }, + { + "epoch": 0.6376407293015841, + "grad_norm": 0.21572743356227875, + "learning_rate": 1.7852256856235318e-05, + "logits/chosen": -19.648353576660156, + "logits/rejected": -20.134416580200195, + "logps/chosen": -495.5775146484375, + "logps/rejected": -438.1684265136719, + "loss": 1.387, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -0.19859656691551208, + "rewards/margins": 2.191715955734253, + "rewards/rejected": -2.390312433242798, + "step": 12800 + }, + { + "epoch": 0.6426222974992528, + "grad_norm": 102.35507202148438, + "learning_rate": 1.7819847933484467e-05, + "logits/chosen": -19.353174209594727, + "logits/rejected": -20.048927307128906, + "logps/chosen": -524.4760131835938, + "logps/rejected": -446.4917907714844, + "loss": 1.1967, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8889893293380737, + "rewards/margins": 1.9711395502090454, + "rewards/rejected": -1.0821502208709717, + "step": 12900 + }, + { + "epoch": 0.6476038656969214, + "grad_norm": 0.06541766971349716, + "learning_rate": 1.778722620333681e-05, + "logits/chosen": -19.828271865844727, + "logits/rejected": -20.194868087768555, + "logps/chosen": -602.9769287109375, + "logps/rejected": -530.201904296875, + "loss": 1.6761, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2178092002868652, + "rewards/margins": 1.5633704662322998, + "rewards/rejected": -3.781179904937744, + "step": 13000 + }, + { + "epoch": 0.65258543389459, + "grad_norm": 58.162445068359375, + "learning_rate": 1.775439255355201e-05, + "logits/chosen": -19.331708908081055, + "logits/rejected": -19.971097946166992, + "logps/chosen": -570.8577880859375, + "logps/rejected": -473.57196044921875, + "loss": 1.1495, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": 0.24211058020591736, + "rewards/margins": 2.4019970893859863, + "rewards/rejected": -2.159886598587036, + "step": 13100 + }, + { + "epoch": 0.6575670020922586, + "grad_norm": 1.0752054452896118, + "learning_rate": 1.772134787765684e-05, + "logits/chosen": -19.27989959716797, + "logits/rejected": -19.537317276000977, + "logps/chosen": -541.393798828125, + "logps/rejected": -501.2279968261719, + "loss": 1.6051, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": -1.2508366107940674, + "rewards/margins": 0.7745574712753296, + "rewards/rejected": -2.0253942012786865, + "step": 13200 + }, + { + "epoch": 0.6625485702899273, + "grad_norm": 8.162809371948242, + "learning_rate": 1.768809307492089e-05, + "logits/chosen": -18.722593307495117, + "logits/rejected": -19.011571884155273, + "logps/chosen": -513.6095581054688, + "logps/rejected": -469.2226257324219, + "loss": 1.3372, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": -0.9962272047996521, + "rewards/margins": 0.9051995873451233, + "rewards/rejected": -1.9014270305633545, + "step": 13300 + }, + { + "epoch": 0.6675301384875959, + "grad_norm": 97.5845947265625, + "learning_rate": 1.765462905033209e-05, + "logits/chosen": -19.051023483276367, + "logits/rejected": -19.420806884765625, + "logps/chosen": -478.5913391113281, + "logps/rejected": -437.26995849609375, + "loss": 1.3999, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -1.0214941501617432, + "rewards/margins": 0.9453433752059937, + "rewards/rejected": -1.9668372869491577, + "step": 13400 + }, + { + "epoch": 0.6725117066852645, + "grad_norm": 29.43077850341797, + "learning_rate": 1.762095671457209e-05, + "logits/chosen": -19.13440704345703, + "logits/rejected": -19.925010681152344, + "logps/chosen": -496.3144836425781, + "logps/rejected": -413.5525817871094, + "loss": 1.3997, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": -0.08645965903997421, + "rewards/margins": 1.8843421936035156, + "rewards/rejected": -1.970801830291748, + "step": 13500 + }, + { + "epoch": 0.6774932748829331, + "grad_norm": 0.7588065266609192, + "learning_rate": 1.7587076983991457e-05, + "logits/chosen": -19.021947860717773, + "logits/rejected": -19.49304962158203, + "logps/chosen": -520.0108032226562, + "logps/rejected": -473.15020751953125, + "loss": 1.9126, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 1.3220689296722412, + "rewards/margins": 0.6901782155036926, + "rewards/rejected": 0.631890594959259, + "step": 13600 + }, + { + "epoch": 0.6824748430806018, + "grad_norm": 110.71784973144531, + "learning_rate": 1.755299078058475e-05, + "logits/chosen": -19.794466018676758, + "logits/rejected": -20.945425033569336, + "logps/chosen": -485.5846862792969, + "logps/rejected": -422.5166931152344, + "loss": 1.0623, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -0.4341191351413727, + "rewards/margins": 1.9454231262207031, + "rewards/rejected": -2.379542589187622, + "step": 13700 + }, + { + "epoch": 0.6874564112782704, + "grad_norm": 92.97657012939453, + "learning_rate": 1.751869903196543e-05, + "logits/chosen": -19.072101593017578, + "logits/rejected": -20.458724975585938, + "logps/chosen": -550.3908081054688, + "logps/rejected": -454.7740173339844, + "loss": 1.1903, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 0.9165298342704773, + "rewards/margins": 1.906398892402649, + "rewards/rejected": -0.9898689389228821, + "step": 13800 + }, + { + "epoch": 0.692437979475939, + "grad_norm": 4.199160575866699, + "learning_rate": 1.748420267134062e-05, + "logits/chosen": -18.836036682128906, + "logits/rejected": -19.922813415527344, + "logps/chosen": -539.5211181640625, + "logps/rejected": -479.9672546386719, + "loss": 1.2681, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 1.260705828666687, + "rewards/margins": 1.744788408279419, + "rewards/rejected": -0.48408252000808716, + "step": 13900 + }, + { + "epoch": 0.6974195476736077, + "grad_norm": 107.6252212524414, + "learning_rate": 1.74495026374857e-05, + "logits/chosen": -19.50172233581543, + "logits/rejected": -20.288314819335938, + "logps/chosen": -514.2687377929688, + "logps/rejected": -449.1997375488281, + "loss": 1.4695, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 0.515940248966217, + "rewards/margins": 1.3783401250839233, + "rewards/rejected": -0.8623998761177063, + "step": 14000 + }, + { + "epoch": 0.7024011158712763, + "grad_norm": 5.941022872924805, + "learning_rate": 1.7414599874718753e-05, + "logits/chosen": -18.767423629760742, + "logits/rejected": -19.68829917907715, + "logps/chosen": -545.5341796875, + "logps/rejected": -463.9657287597656, + "loss": 1.1235, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 1.2197209596633911, + "rewards/margins": 1.6720809936523438, + "rewards/rejected": -0.4523601531982422, + "step": 14100 + }, + { + "epoch": 0.7073826840689449, + "grad_norm": 26.461627960205078, + "learning_rate": 1.737949533287489e-05, + "logits/chosen": -18.46575355529785, + "logits/rejected": -19.159351348876953, + "logps/chosen": -517.9618530273438, + "logps/rejected": -417.8922119140625, + "loss": 1.14, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 1.9558275938034058, + "rewards/margins": 2.069073438644409, + "rewards/rejected": -0.11324585229158401, + "step": 14200 + }, + { + "epoch": 0.7123642522666135, + "grad_norm": 1.1918169260025024, + "learning_rate": 1.7344189967280383e-05, + "logits/chosen": -19.000808715820312, + "logits/rejected": -20.075515747070312, + "logps/chosen": -474.513916015625, + "logps/rejected": -400.20196533203125, + "loss": 0.9665, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": 1.894654393196106, + "rewards/margins": 2.998011589050293, + "rewards/rejected": -1.103356957435608, + "step": 14300 + }, + { + "epoch": 0.7173458204642822, + "grad_norm": 60.40048599243164, + "learning_rate": 1.7308684738726668e-05, + "logits/chosen": -18.980615615844727, + "logits/rejected": -20.142223358154297, + "logps/chosen": -510.573974609375, + "logps/rejected": -441.4659729003906, + "loss": 1.266, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 0.2977685332298279, + "rewards/margins": 1.5726754665374756, + "rewards/rejected": -1.274907112121582, + "step": 14400 + }, + { + "epoch": 0.7223273886619508, + "grad_norm": 0.5988157391548157, + "learning_rate": 1.7272980613444206e-05, + "logits/chosen": -18.941259384155273, + "logits/rejected": -20.322023391723633, + "logps/chosen": -531.8062744140625, + "logps/rejected": -474.36785888671875, + "loss": 1.2675, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3074489235877991, + "rewards/margins": 1.548563838005066, + "rewards/rejected": -1.8560125827789307, + "step": 14500 + }, + { + "epoch": 0.7273089568596194, + "grad_norm": 6.901514530181885, + "learning_rate": 1.7237078563076178e-05, + "logits/chosen": -19.498384475708008, + "logits/rejected": -21.36153793334961, + "logps/chosen": -511.9005126953125, + "logps/rejected": -439.97003173828125, + "loss": 1.2244, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -0.4899732172489166, + "rewards/margins": 1.9954489469528198, + "rewards/rejected": -2.485422372817993, + "step": 14600 + }, + { + "epoch": 0.7322905250572881, + "grad_norm": 0.03188573196530342, + "learning_rate": 1.7200979564652064e-05, + "logits/chosen": -18.785024642944336, + "logits/rejected": -21.15717124938965, + "logps/chosen": -520.9817504882812, + "logps/rejected": -426.6506652832031, + "loss": 1.3959, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 1.1841310262680054, + "rewards/margins": 1.9892054796218872, + "rewards/rejected": -0.8050744533538818, + "step": 14700 + }, + { + "epoch": 0.7372720932549567, + "grad_norm": 30.52501678466797, + "learning_rate": 1.7164684600561018e-05, + "logits/chosen": -18.466907501220703, + "logits/rejected": -20.27123260498047, + "logps/chosen": -531.89501953125, + "logps/rejected": -429.2342834472656, + "loss": 1.3491, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 2.848252773284912, + "rewards/margins": 2.337670087814331, + "rewards/rejected": 0.5105829834938049, + "step": 14800 + }, + { + "epoch": 0.7422536614526253, + "grad_norm": 4.541143894195557, + "learning_rate": 1.712819465852517e-05, + "logits/chosen": -18.570043563842773, + "logits/rejected": -20.75904083251953, + "logps/chosen": -493.5054931640625, + "logps/rejected": -380.9721984863281, + "loss": 1.2685, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 1.8648478984832764, + "rewards/margins": 2.2077205181121826, + "rewards/rejected": -0.3428727388381958, + "step": 14900 + }, + { + "epoch": 0.7472352296502939, + "grad_norm": 0.44224098324775696, + "learning_rate": 1.7091510731572725e-05, + "logits/chosen": -18.91974449157715, + "logits/rejected": -20.655202865600586, + "logps/chosen": -514.9723510742188, + "logps/rejected": -452.38983154296875, + "loss": 1.7715, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 2.1622519493103027, + "rewards/margins": 1.3429455757141113, + "rewards/rejected": 0.8193062543869019, + "step": 15000 + }, + { + "epoch": 0.7472352296502939, + "eval_logits/chosen": -22.020517349243164, + "eval_logits/rejected": -23.86900520324707, + "eval_logps/chosen": -484.0854187011719, + "eval_logps/rejected": -414.28009033203125, + "eval_loss": 1.2986581325531006, + "eval_rewards/accuracies": 0.6367800831794739, + "eval_rewards/chosen": 3.5119001865386963, + "eval_rewards/margins": 1.8989582061767578, + "eval_rewards/rejected": 1.6129425764083862, + "eval_runtime": 472.8477, + "eval_samples_per_second": 3.219, + "eval_steps_per_second": 0.404, + "step": 15000 + }, + { + "epoch": 0.7522167978479626, + "grad_norm": 10.687481880187988, + "learning_rate": 1.7054633818010954e-05, + "logits/chosen": -18.568933486938477, + "logits/rejected": -20.1440372467041, + "logps/chosen": -456.9844055175781, + "logps/rejected": -408.1793212890625, + "loss": 1.0277, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 3.049287796020508, + "rewards/margins": 1.8809983730316162, + "rewards/rejected": 1.168289065361023, + "step": 15100 + }, + { + "epoch": 0.7571983660456312, + "grad_norm": 67.85148620605469, + "learning_rate": 1.7017564921399e-05, + "logits/chosen": -18.714679718017578, + "logits/rejected": -21.288236618041992, + "logps/chosen": -508.66015625, + "logps/rejected": -424.9830322265625, + "loss": 1.362, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 2.725537061691284, + "rewards/margins": 1.661694049835205, + "rewards/rejected": 1.0638428926467896, + "step": 15200 + }, + { + "epoch": 0.7621799342432998, + "grad_norm": 44.31140899658203, + "learning_rate": 1.698030505052061e-05, + "logits/chosen": -18.928104400634766, + "logits/rejected": -20.428186416625977, + "logps/chosen": -470.02459716796875, + "logps/rejected": -381.4432678222656, + "loss": 1.504, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 2.468132734298706, + "rewards/margins": 0.8910315036773682, + "rewards/rejected": 1.577101469039917, + "step": 15300 + }, + { + "epoch": 0.7671615024409684, + "grad_norm": 10.858190536499023, + "learning_rate": 1.6942855219356634e-05, + "logits/chosen": -18.520444869995117, + "logits/rejected": -20.245197296142578, + "logps/chosen": -491.31854248046875, + "logps/rejected": -416.8392333984375, + "loss": 1.0534, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 3.1176271438598633, + "rewards/margins": 1.8960695266723633, + "rewards/rejected": 1.2215576171875, + "step": 15400 + }, + { + "epoch": 0.7721430706386371, + "grad_norm": 0.026119831949472427, + "learning_rate": 1.6905216447057467e-05, + "logits/chosen": -19.040843963623047, + "logits/rejected": -20.839155197143555, + "logps/chosen": -495.08428955078125, + "logps/rejected": -410.4775390625, + "loss": 1.5746, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 2.2889349460601807, + "rewards/margins": 1.6078674793243408, + "rewards/rejected": 0.6810672283172607, + "step": 15500 + }, + { + "epoch": 0.7771246388363057, + "grad_norm": 70.28289031982422, + "learning_rate": 1.686738975791529e-05, + "logits/chosen": -18.932214736938477, + "logits/rejected": -20.42650032043457, + "logps/chosen": -493.1498718261719, + "logps/rejected": -429.498779296875, + "loss": 1.4583, + "rewards/accuracies": 0.5699999928474426, + "rewards/chosen": 2.8957221508026123, + "rewards/margins": 1.429540991783142, + "rewards/rejected": 1.4661809206008911, + "step": 15600 + }, + { + "epoch": 0.7821062070339743, + "grad_norm": 0.8262832164764404, + "learning_rate": 1.6829376181336225e-05, + "logits/chosen": -19.566686630249023, + "logits/rejected": -21.254444122314453, + "logps/chosen": -483.1775817871094, + "logps/rejected": -451.6460266113281, + "loss": 1.238, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 1.937461495399475, + "rewards/margins": 1.0156903266906738, + "rewards/rejected": 0.9217712879180908, + "step": 15700 + }, + { + "epoch": 0.787087775231643, + "grad_norm": 10.062756538391113, + "learning_rate": 1.6791176751812282e-05, + "logits/chosen": -19.597667694091797, + "logits/rejected": -21.418546676635742, + "logps/chosen": -485.7425537109375, + "logps/rejected": -417.196533203125, + "loss": 1.1054, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": 1.6439099311828613, + "rewards/margins": 2.0514931678771973, + "rewards/rejected": -0.40758341550827026, + "step": 15800 + }, + { + "epoch": 0.7920693434293116, + "grad_norm": 11.171751022338867, + "learning_rate": 1.675279250889324e-05, + "logits/chosen": -19.189016342163086, + "logits/rejected": -21.2204532623291, + "logps/chosen": -523.8226318359375, + "logps/rejected": -432.4778137207031, + "loss": 1.2853, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 2.265528917312622, + "rewards/margins": 2.169790744781494, + "rewards/rejected": 0.0957380086183548, + "step": 15900 + }, + { + "epoch": 0.7970509116269802, + "grad_norm": 5.786453723907471, + "learning_rate": 1.6714224497158334e-05, + "logits/chosen": -19.52318572998047, + "logits/rejected": -22.174915313720703, + "logps/chosen": -500.9818115234375, + "logps/rejected": -427.1512451171875, + "loss": 1.2191, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03365034982562065, + "rewards/margins": 2.265183210372925, + "rewards/rejected": -2.2315328121185303, + "step": 16000 + }, + { + "epoch": 0.8020324798246488, + "grad_norm": 0.03890511766076088, + "learning_rate": 1.667547376618785e-05, + "logits/chosen": -19.749061584472656, + "logits/rejected": -21.982847213745117, + "logps/chosen": -521.017822265625, + "logps/rejected": -434.4532165527344, + "loss": 1.6337, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 1.2707146406173706, + "rewards/margins": 2.021421432495117, + "rewards/rejected": -0.7507067918777466, + "step": 16100 + }, + { + "epoch": 0.8070140480223175, + "grad_norm": 7.803345680236816, + "learning_rate": 1.6636541370534537e-05, + "logits/chosen": -19.421598434448242, + "logits/rejected": -20.04302406311035, + "logps/chosen": -481.74334716796875, + "logps/rejected": -437.2704162597656, + "loss": 1.5953, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 1.658532738685608, + "rewards/margins": 1.250680685043335, + "rewards/rejected": 0.40785208344459534, + "step": 16200 + }, + { + "epoch": 0.8119956162199861, + "grad_norm": 0.0178745836019516, + "learning_rate": 1.6597428369694934e-05, + "logits/chosen": -18.97728157043457, + "logits/rejected": -20.238012313842773, + "logps/chosen": -544.66015625, + "logps/rejected": -475.9863586425781, + "loss": 1.2509, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 1.6321719884872437, + "rewards/margins": 1.595544695854187, + "rewards/rejected": 0.03662717714905739, + "step": 16300 + }, + { + "epoch": 0.8169771844176547, + "grad_norm": 115.91627502441406, + "learning_rate": 1.655813582808051e-05, + "logits/chosen": -19.31316566467285, + "logits/rejected": -20.653505325317383, + "logps/chosen": -523.737060546875, + "logps/rejected": -466.0438232421875, + "loss": 1.4616, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 1.028465986251831, + "rewards/margins": 0.8755133152008057, + "rewards/rejected": 0.15295258164405823, + "step": 16400 + }, + { + "epoch": 0.8219587526153233, + "grad_norm": 19.2689266204834, + "learning_rate": 1.651866481498873e-05, + "logits/chosen": -19.784204483032227, + "logits/rejected": -21.526857376098633, + "logps/chosen": -484.1365966796875, + "logps/rejected": -423.5486755371094, + "loss": 1.0354, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 1.5439956188201904, + "rewards/margins": 2.2780568599700928, + "rewards/rejected": -0.7340614199638367, + "step": 16500 + }, + { + "epoch": 0.826940320812992, + "grad_norm": 22.2554988861084, + "learning_rate": 1.6479016404573916e-05, + "logits/chosen": -19.650360107421875, + "logits/rejected": -21.203350067138672, + "logps/chosen": -505.99871826171875, + "logps/rejected": -474.4420166015625, + "loss": 1.6364, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20957961678504944, + "rewards/margins": 1.9317030906677246, + "rewards/rejected": -2.141282558441162, + "step": 16600 + }, + { + "epoch": 0.8319218890106606, + "grad_norm": 44.10676574707031, + "learning_rate": 1.6439191675818056e-05, + "logits/chosen": -19.524065017700195, + "logits/rejected": -22.584871292114258, + "logps/chosen": -469.15264892578125, + "logps/rejected": -362.7622985839844, + "loss": 0.9529, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -0.1507752537727356, + "rewards/margins": 2.6078405380249023, + "rewards/rejected": -2.7586159706115723, + "step": 16700 + }, + { + "epoch": 0.8369034572083291, + "grad_norm": 143.54794311523438, + "learning_rate": 1.6399191712501417e-05, + "logits/chosen": -19.261682510375977, + "logits/rejected": -21.39293670654297, + "logps/chosen": -543.3509521484375, + "logps/rejected": -472.142822265625, + "loss": 1.4329, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 1.02738356590271, + "rewards/margins": 1.6261159181594849, + "rewards/rejected": -0.5987322926521301, + "step": 16800 + }, + { + "epoch": 0.8418850254059979, + "grad_norm": 0.004016869701445103, + "learning_rate": 1.6359017603173043e-05, + "logits/chosen": -19.182754516601562, + "logits/rejected": -21.46161651611328, + "logps/chosen": -525.286376953125, + "logps/rejected": -444.2633972167969, + "loss": 1.361, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 0.7399688959121704, + "rewards/margins": 2.1688730716705322, + "rewards/rejected": -1.4289040565490723, + "step": 16900 + }, + { + "epoch": 0.8468665936036665, + "grad_norm": 85.75580596923828, + "learning_rate": 1.6318670441121157e-05, + "logits/chosen": -19.6074161529541, + "logits/rejected": -20.843006134033203, + "logps/chosen": -514.9994506835938, + "logps/rejected": -458.54803466796875, + "loss": 1.9907, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": -0.5896009802818298, + "rewards/margins": 1.107519507408142, + "rewards/rejected": -1.6971205472946167, + "step": 17000 + }, + { + "epoch": 0.851848161801335, + "grad_norm": 0.005596471484750509, + "learning_rate": 1.6278151324343395e-05, + "logits/chosen": -18.718494415283203, + "logits/rejected": -19.837169647216797, + "logps/chosen": -527.9190673828125, + "logps/rejected": -452.2406005859375, + "loss": 0.9701, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -0.13445429503917694, + "rewards/margins": 2.05269193649292, + "rewards/rejected": -2.1871461868286133, + "step": 17100 + }, + { + "epoch": 0.8568297299990036, + "grad_norm": 76.1462631225586, + "learning_rate": 1.6237461355516918e-05, + "logits/chosen": -18.651123046875, + "logits/rejected": -19.52649688720703, + "logps/chosen": -539.2634887695312, + "logps/rejected": -478.9902648925781, + "loss": 1.575, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.8822598457336426, + "rewards/margins": 1.2877403497695923, + "rewards/rejected": -0.4054804742336273, + "step": 17200 + }, + { + "epoch": 0.8618112981966723, + "grad_norm": 90.55237579345703, + "learning_rate": 1.6196601641968425e-05, + "logits/chosen": -18.68253517150879, + "logits/rejected": -20.140342712402344, + "logps/chosen": -523.9414672851562, + "logps/rejected": -457.2998046875, + "loss": 1.1014, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 2.5566680431365967, + "rewards/margins": 1.6561553478240967, + "rewards/rejected": 0.9005125164985657, + "step": 17300 + }, + { + "epoch": 0.8667928663943409, + "grad_norm": 3.8211495876312256, + "learning_rate": 1.6155573295643993e-05, + "logits/chosen": -19.115205764770508, + "logits/rejected": -19.974811553955078, + "logps/chosen": -527.80419921875, + "logps/rejected": -491.970458984375, + "loss": 1.5425, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": 2.147320032119751, + "rewards/margins": 0.9348466396331787, + "rewards/rejected": 1.2124736309051514, + "step": 17400 + }, + { + "epoch": 0.8717744345920095, + "grad_norm": 12.620223999023438, + "learning_rate": 1.611437743307884e-05, + "logits/chosen": -19.070911407470703, + "logits/rejected": -20.162216186523438, + "logps/chosen": -515.4886474609375, + "logps/rejected": -440.5010070800781, + "loss": 1.2029, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": 2.0181946754455566, + "rewards/margins": 1.9606748819351196, + "rewards/rejected": 0.057520028203725815, + "step": 17500 + }, + { + "epoch": 0.8767560027896782, + "grad_norm": 59.42154312133789, + "learning_rate": 1.6073015175366914e-05, + "logits/chosen": -18.614526748657227, + "logits/rejected": -19.882549285888672, + "logps/chosen": -515.3157958984375, + "logps/rejected": -447.471435546875, + "loss": 1.3499, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": 2.700169563293457, + "rewards/margins": 0.998585045337677, + "rewards/rejected": 1.701583981513977, + "step": 17600 + }, + { + "epoch": 0.8817375709873468, + "grad_norm": 66.42816925048828, + "learning_rate": 1.603148764813042e-05, + "logits/chosen": -18.400327682495117, + "logits/rejected": -19.4934024810791, + "logps/chosen": -477.818603515625, + "logps/rejected": -401.1089782714844, + "loss": 1.1853, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 3.3441994190216064, + "rewards/margins": 1.493272066116333, + "rewards/rejected": 1.8509272336959839, + "step": 17700 + }, + { + "epoch": 0.8867191391850154, + "grad_norm": 0.004200187046080828, + "learning_rate": 1.5989795981489155e-05, + "logits/chosen": -18.308795928955078, + "logits/rejected": -19.249317169189453, + "logps/chosen": -512.5999755859375, + "logps/rejected": -438.9172668457031, + "loss": 1.1657, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 3.686070442199707, + "rewards/margins": 2.335679292678833, + "rewards/rejected": 1.3503910303115845, + "step": 17800 + }, + { + "epoch": 0.891700707382684, + "grad_norm": 4.083571910858154, + "learning_rate": 1.5947941310029755e-05, + "logits/chosen": -18.304054260253906, + "logits/rejected": -19.744394302368164, + "logps/chosen": -475.91485595703125, + "logps/rejected": -379.3888244628906, + "loss": 1.1195, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": 3.1028084754943848, + "rewards/margins": 1.9994087219238281, + "rewards/rejected": 1.1033999919891357, + "step": 17900 + }, + { + "epoch": 0.8966822755803527, + "grad_norm": 51.022762298583984, + "learning_rate": 1.5905924772774855e-05, + "logits/chosen": -18.618383407592773, + "logits/rejected": -19.63192367553711, + "logps/chosen": -479.7633972167969, + "logps/rejected": -392.1904602050781, + "loss": 1.1562, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 2.1094658374786377, + "rewards/margins": 1.7458257675170898, + "rewards/rejected": 0.3636399209499359, + "step": 18000 + }, + { + "epoch": 0.9016638437780213, + "grad_norm": 0.00022725010057911277, + "learning_rate": 1.586374751315204e-05, + "logits/chosen": -19.1419620513916, + "logits/rejected": -20.252761840820312, + "logps/chosen": -530.3628540039062, + "logps/rejected": -461.61639404296875, + "loss": 1.3509, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 3.2607533931732178, + "rewards/margins": 1.972005009651184, + "rewards/rejected": 1.2887482643127441, + "step": 18100 + }, + { + "epoch": 0.9066454119756899, + "grad_norm": 120.93099212646484, + "learning_rate": 1.5821410678962764e-05, + "logits/chosen": -19.30841636657715, + "logits/rejected": -20.281227111816406, + "logps/chosen": -450.5699157714844, + "logps/rejected": -398.7843017578125, + "loss": 1.4407, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 2.621293067932129, + "rewards/margins": 1.4806004762649536, + "rewards/rejected": 1.1406925916671753, + "step": 18200 + }, + { + "epoch": 0.9116269801733585, + "grad_norm": 0.014844976365566254, + "learning_rate": 1.5778915422351102e-05, + "logits/chosen": -18.85603141784668, + "logits/rejected": -19.324058532714844, + "logps/chosen": -493.2317199707031, + "logps/rejected": -453.1470947265625, + "loss": 0.913, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": 4.14647102355957, + "rewards/margins": 1.9552674293518066, + "rewards/rejected": 2.1912038326263428, + "step": 18300 + }, + { + "epoch": 0.9166085483710272, + "grad_norm": 90.2131576538086, + "learning_rate": 1.5736262899772407e-05, + "logits/chosen": -19.094078063964844, + "logits/rejected": -20.168027877807617, + "logps/chosen": -512.7498779296875, + "logps/rejected": -448.6697692871094, + "loss": 1.1494, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 2.183354139328003, + "rewards/margins": 1.9273743629455566, + "rewards/rejected": 0.2559796869754791, + "step": 18400 + }, + { + "epoch": 0.9215901165686958, + "grad_norm": 0.30067598819732666, + "learning_rate": 1.569345427196181e-05, + "logits/chosen": -19.207042694091797, + "logits/rejected": -20.777149200439453, + "logps/chosen": -521.3990478515625, + "logps/rejected": -436.8664855957031, + "loss": 1.2902, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 2.0493509769439697, + "rewards/margins": 1.9620305299758911, + "rewards/rejected": 0.08732038736343384, + "step": 18500 + }, + { + "epoch": 0.9265716847663644, + "grad_norm": 128.35350036621094, + "learning_rate": 1.5650490703902666e-05, + "logits/chosen": -19.485790252685547, + "logits/rejected": -20.323007583618164, + "logps/chosen": -496.7759094238281, + "logps/rejected": -440.96759033203125, + "loss": 1.4797, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": 0.3512002229690552, + "rewards/margins": 1.2790648937225342, + "rewards/rejected": -0.927864670753479, + "step": 18600 + }, + { + "epoch": 0.9315532529640331, + "grad_norm": 23.355682373046875, + "learning_rate": 1.5607373364794836e-05, + "logits/chosen": -19.615062713623047, + "logits/rejected": -20.484060287475586, + "logps/chosen": -479.2476806640625, + "logps/rejected": -415.4735412597656, + "loss": 1.2546, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 1.3234963417053223, + "rewards/margins": 2.4520487785339355, + "rewards/rejected": -1.1285524368286133, + "step": 18700 + }, + { + "epoch": 0.9365348211617017, + "grad_norm": 41.07186508178711, + "learning_rate": 1.5564103428022855e-05, + "logits/chosen": -19.126056671142578, + "logits/rejected": -19.900794982910156, + "logps/chosen": -530.3897094726562, + "logps/rejected": -466.9271545410156, + "loss": 1.1304, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": 1.134138584136963, + "rewards/margins": 2.5866689682006836, + "rewards/rejected": -1.4525303840637207, + "step": 18800 + }, + { + "epoch": 0.9415163893593703, + "grad_norm": 3.795400381088257, + "learning_rate": 1.552068207112402e-05, + "logits/chosen": -18.972984313964844, + "logits/rejected": -19.319841384887695, + "logps/chosen": -517.9902954101562, + "logps/rejected": -463.4305725097656, + "loss": 1.6333, + "rewards/accuracies": 0.5799999833106995, + "rewards/chosen": 0.04141408950090408, + "rewards/margins": 1.2581309080123901, + "rewards/rejected": -1.2167168855667114, + "step": 18900 + }, + { + "epoch": 0.9464979575570389, + "grad_norm": 0.2743411362171173, + "learning_rate": 1.547711047575635e-05, + "logits/chosen": -18.696569442749023, + "logits/rejected": -19.06745719909668, + "logps/chosen": -540.3345947265625, + "logps/rejected": -484.9985656738281, + "loss": 1.6534, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 2.271423816680908, + "rewards/margins": 1.7851731777191162, + "rewards/rejected": 0.48625069856643677, + "step": 19000 + }, + { + "epoch": 0.9514795257547076, + "grad_norm": 27.537288665771484, + "learning_rate": 1.543338982766639e-05, + "logits/chosen": -18.664011001586914, + "logits/rejected": -19.458066940307617, + "logps/chosen": -518.22021484375, + "logps/rejected": -418.8706359863281, + "loss": 1.0862, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": 2.7771294116973877, + "rewards/margins": 2.859891891479492, + "rewards/rejected": -0.08276252448558807, + "step": 19100 + }, + { + "epoch": 0.9564610939523762, + "grad_norm": 0.9008951783180237, + "learning_rate": 1.5389521316656992e-05, + "logits/chosen": -18.849185943603516, + "logits/rejected": -19.619192123413086, + "logps/chosen": -494.7686462402344, + "logps/rejected": -408.1366882324219, + "loss": 1.1158, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": 2.1583621501922607, + "rewards/margins": 2.396536350250244, + "rewards/rejected": -0.2381744235754013, + "step": 19200 + }, + { + "epoch": 0.9614426621500448, + "grad_norm": 0.1889144629240036, + "learning_rate": 1.5345506136554898e-05, + "logits/chosen": -18.629066467285156, + "logits/rejected": -19.594579696655273, + "logps/chosen": -524.626708984375, + "logps/rejected": -428.97344970703125, + "loss": 1.1882, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 2.8290598392486572, + "rewards/margins": 2.019918203353882, + "rewards/rejected": 0.8091418743133545, + "step": 19300 + }, + { + "epoch": 0.9664242303477134, + "grad_norm": 1.7383246421813965, + "learning_rate": 1.5301345485178282e-05, + "logits/chosen": -18.85825538635254, + "logits/rejected": -19.791507720947266, + "logps/chosen": -480.490234375, + "logps/rejected": -380.1109619140625, + "loss": 1.2643, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": 2.243396520614624, + "rewards/margins": 1.7966461181640625, + "rewards/rejected": 0.44675034284591675, + "step": 19400 + }, + { + "epoch": 0.9714057985453821, + "grad_norm": 73.9409408569336, + "learning_rate": 1.525704056430412e-05, + "logits/chosen": -18.52286148071289, + "logits/rejected": -18.89972496032715, + "logps/chosen": -528.177001953125, + "logps/rejected": -484.8839111328125, + "loss": 1.5849, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 2.4241716861724854, + "rewards/margins": 1.1765490770339966, + "rewards/rejected": 1.2476229667663574, + "step": 19500 + }, + { + "epoch": 0.9763873667430507, + "grad_norm": 34.82683563232422, + "learning_rate": 1.5212592579635512e-05, + "logits/chosen": -18.213794708251953, + "logits/rejected": -19.081279754638672, + "logps/chosen": -520.0574340820312, + "logps/rejected": -446.2875061035156, + "loss": 1.171, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.9521498680114746, + "rewards/margins": 1.636811375617981, + "rewards/rejected": 0.315338671207428, + "step": 19600 + }, + { + "epoch": 0.9813689349407193, + "grad_norm": 8.361115455627441, + "learning_rate": 1.5168002740768857e-05, + "logits/chosen": -18.713205337524414, + "logits/rejected": -19.391826629638672, + "logps/chosen": -503.5995178222656, + "logps/rejected": -472.6417236328125, + "loss": 1.1879, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": 1.9274494647979736, + "rewards/margins": 1.9319396018981934, + "rewards/rejected": -0.00449012778699398, + "step": 19700 + }, + { + "epoch": 0.986350503138388, + "grad_norm": 0.049951426684856415, + "learning_rate": 1.512327226116094e-05, + "logits/chosen": -19.126710891723633, + "logits/rejected": -20.143205642700195, + "logps/chosen": -521.0743408203125, + "logps/rejected": -409.7085876464844, + "loss": 1.2551, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 1.6364871263504028, + "rewards/margins": 2.0774831771850586, + "rewards/rejected": -0.44099605083465576, + "step": 19800 + }, + { + "epoch": 0.9913320713360566, + "grad_norm": 0.5147112011909485, + "learning_rate": 1.507840235809591e-05, + "logits/chosen": -18.839317321777344, + "logits/rejected": -19.660390853881836, + "logps/chosen": -511.8052062988281, + "logps/rejected": -445.6004638671875, + "loss": 1.422, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": 1.8398367166519165, + "rewards/margins": 1.5229132175445557, + "rewards/rejected": 0.3169235587120056, + "step": 19900 + }, + { + "epoch": 0.9963136395337252, + "grad_norm": 19.235801696777344, + "learning_rate": 1.503339425265215e-05, + "logits/chosen": -18.8520450592041, + "logits/rejected": -19.381229400634766, + "logps/chosen": -479.9813537597656, + "logps/rejected": -451.1719970703125, + "loss": 1.5368, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": 1.7022852897644043, + "rewards/margins": 1.5142347812652588, + "rewards/rejected": 0.1880505383014679, + "step": 20000 + }, + { + "epoch": 0.9963136395337252, + "eval_logits/chosen": -19.80689811706543, + "eval_logits/rejected": -20.71694564819336, + "eval_logps/chosen": -488.4328308105469, + "eval_logps/rejected": -420.06622314453125, + "eval_loss": 1.361470103263855, + "eval_rewards/accuracies": 0.6511780023574829, + "eval_rewards/chosen": 3.077153444290161, + "eval_rewards/margins": 2.0428242683410645, + "eval_rewards/rejected": 1.0343292951583862, + "eval_runtime": 472.868, + "eval_samples_per_second": 3.219, + "eval_steps_per_second": 0.404, + "step": 20000 + } + ], + "logging_steps": 100, + "max_steps": 60222, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}