{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 11.376984012348446, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.0700208768248558, "logits/rejected": 0.13611328601837158, "logps/chosen": -1.7159068584442139, "logps/rejected": -1.8896043300628662, "loss": 2.0451, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7159068584442139, "rewards/margins": 0.17369739711284637, "rewards/rejected": -1.8896043300628662, "semantic_entropy": 0.6584857702255249, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 24.19347604964631, "learning_rate": 1.7825311942959e-08, "logits/chosen": 0.011100532487034798, "logits/rejected": 0.1315733939409256, "logps/chosen": -1.8017240762710571, "logps/rejected": -1.844752311706543, "loss": 2.1215, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.8017240762710571, "rewards/margins": 0.04302836209535599, "rewards/rejected": -1.844752311706543, "semantic_entropy": 0.6395233869552612, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 22.20269240723229, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.024352211505174637, "logits/rejected": 0.07368211448192596, "logps/chosen": -1.634371042251587, "logps/rejected": -1.7646675109863281, "loss": 1.9811, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.634371042251587, "rewards/margins": 0.13029658794403076, "rewards/rejected": -1.7646675109863281, "semantic_entropy": 0.6935275197029114, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 12.818737936221252, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.032861120998859406, "logits/rejected": 0.05129063129425049, "logps/chosen": -1.7252594232559204, "logps/rejected": -1.805481195449829, "loss": 2.0595, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.7252594232559204, "rewards/margins": 0.08022178709506989, "rewards/rejected": -1.805481195449829, "semantic_entropy": 0.6685115694999695, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 30.102049748464427, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.04978736490011215, "logits/rejected": 0.03416309878230095, "logps/chosen": -1.8682762384414673, "logps/rejected": -1.7789058685302734, "loss": 2.19, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -1.8682762384414673, "rewards/margins": -0.08937013149261475, "rewards/rejected": -1.7789058685302734, "semantic_entropy": 0.6433957815170288, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 26.31250337779293, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.09875942021608353, "logits/rejected": -0.007670873310416937, "logps/chosen": -1.9081414937973022, "logps/rejected": -1.83219313621521, "loss": 2.2171, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.9081414937973022, "rewards/margins": -0.07594846934080124, "rewards/rejected": -1.83219313621521, "semantic_entropy": 0.6178222298622131, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 19.815945867867445, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.04346621781587601, "logits/rejected": 0.11645905673503876, "logps/chosen": -1.8456659317016602, "logps/rejected": -1.9967705011367798, "loss": 2.1632, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.8456659317016602, "rewards/margins": 0.15110479295253754, "rewards/rejected": -1.9967705011367798, "semantic_entropy": 0.6351101994514465, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 25.850744864599687, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.04363862797617912, "logits/rejected": 0.21761982142925262, "logps/chosen": -1.8787521123886108, "logps/rejected": -1.7407077550888062, "loss": 2.2006, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.8787521123886108, "rewards/margins": -0.13804419338703156, "rewards/rejected": -1.7407077550888062, "semantic_entropy": 0.6437305212020874, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 23.672241609904134, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.020205600187182426, "logits/rejected": 0.216068834066391, "logps/chosen": -1.833592176437378, "logps/rejected": -1.8679527044296265, "loss": 2.1586, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.833592176437378, "rewards/margins": 0.03436052054166794, "rewards/rejected": -1.8679527044296265, "semantic_entropy": 0.6499934792518616, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 26.96947137050376, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.033810101449489594, "logits/rejected": 0.1169959083199501, "logps/chosen": -1.8925907611846924, "logps/rejected": -1.7731659412384033, "loss": 2.2102, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.8925907611846924, "rewards/margins": -0.11942480504512787, "rewards/rejected": -1.7731659412384033, "semantic_entropy": 0.6351879835128784, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 21.767819800264927, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.09762094914913177, "logits/rejected": 0.1255563497543335, "logps/chosen": -1.8238786458969116, "logps/rejected": -1.8578907251358032, "loss": 2.1468, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8238786458969116, "rewards/margins": 0.034012071788311005, "rewards/rejected": -1.8578907251358032, "semantic_entropy": 0.6459096670150757, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 25.184325213213036, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.07045845687389374, "logits/rejected": 0.12166911363601685, "logps/chosen": -1.7790727615356445, "logps/rejected": -1.8826583623886108, "loss": 2.0983, "rewards/accuracies": 0.53125, "rewards/chosen": -1.7790727615356445, "rewards/margins": 0.10358550399541855, "rewards/rejected": -1.8826583623886108, "semantic_entropy": 0.6385111212730408, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 24.256083977882692, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.022278426215052605, "logits/rejected": 0.1225414127111435, "logps/chosen": -1.6275218725204468, "logps/rejected": -1.7554759979248047, "loss": 1.9772, "rewards/accuracies": 0.53125, "rewards/chosen": -1.6275218725204468, "rewards/margins": 0.12795425951480865, "rewards/rejected": -1.7554759979248047, "semantic_entropy": 0.6993609666824341, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 25.047673369041256, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.07010319083929062, "logits/rejected": 0.0813789814710617, "logps/chosen": -1.7527236938476562, "logps/rejected": -1.7957979440689087, "loss": 2.0818, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7527236938476562, "rewards/margins": 0.04307413846254349, "rewards/rejected": -1.7957979440689087, "semantic_entropy": 0.6582383513450623, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 20.004001245904735, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.0565624013543129, "logits/rejected": 0.12649355828762054, "logps/chosen": -1.7445976734161377, "logps/rejected": -1.999079704284668, "loss": 2.0662, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7445976734161377, "rewards/margins": 0.25448185205459595, "rewards/rejected": -1.999079704284668, "semantic_entropy": 0.6432304382324219, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 18.273424708398476, "learning_rate": 1.42602495543672e-07, "logits/chosen": -0.007264330983161926, "logits/rejected": 0.09273283183574677, "logps/chosen": -1.6760584115982056, "logps/rejected": -1.7100263833999634, "loss": 2.0165, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6760584115982056, "rewards/margins": 0.033968135714530945, "rewards/rejected": -1.7100263833999634, "semantic_entropy": 0.6808029413223267, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 14.055974747293446, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.16058330237865448, "logits/rejected": 0.08205445855855942, "logps/chosen": -1.7438071966171265, "logps/rejected": -1.9097425937652588, "loss": 2.0739, "rewards/accuracies": 0.5, "rewards/chosen": -1.7438071966171265, "rewards/margins": 0.16593532264232635, "rewards/rejected": -1.9097425937652588, "semantic_entropy": 0.6601464152336121, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 24.329488537638213, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.07435176521539688, "logits/rejected": 0.03373418375849724, "logps/chosen": -1.6907930374145508, "logps/rejected": -1.7318710088729858, "loss": 2.0319, "rewards/accuracies": 0.46875, "rewards/chosen": -1.6907930374145508, "rewards/margins": 0.04107801243662834, "rewards/rejected": -1.7318710088729858, "semantic_entropy": 0.6823070645332336, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 29.89452831027371, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.09076512604951859, "logits/rejected": 0.05535387247800827, "logps/chosen": -1.7278614044189453, "logps/rejected": -1.8538306951522827, "loss": 2.0569, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7278614044189453, "rewards/margins": 0.12596920132637024, "rewards/rejected": -1.8538306951522827, "semantic_entropy": 0.6580398678779602, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 13.39127829114991, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.05378703027963638, "logits/rejected": 0.008480613119900227, "logps/chosen": -1.6090600490570068, "logps/rejected": -1.7139631509780884, "loss": 1.9583, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.6090600490570068, "rewards/margins": 0.10490302741527557, "rewards/rejected": -1.7139631509780884, "semantic_entropy": 0.6984864473342896, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 19.740764634075045, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.029609236866235733, "logits/rejected": 0.05327732115983963, "logps/chosen": -1.5253149271011353, "logps/rejected": -1.695037841796875, "loss": 1.8851, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5253149271011353, "rewards/margins": 0.16972306370735168, "rewards/rejected": -1.695037841796875, "semantic_entropy": 0.7196601629257202, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 18.897785049015074, "learning_rate": 1.96078431372549e-07, "logits/chosen": -0.027175869792699814, "logits/rejected": 0.06376960128545761, "logps/chosen": -1.5159661769866943, "logps/rejected": -1.5756003856658936, "loss": 1.8828, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5159661769866943, "rewards/margins": 0.059634238481521606, "rewards/rejected": -1.5756003856658936, "semantic_entropy": 0.733715832233429, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 20.10606690646166, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.0032399953342974186, "logits/rejected": 0.20449189841747284, "logps/chosen": -1.5170791149139404, "logps/rejected": -1.75203537940979, "loss": 1.8744, "rewards/accuracies": 0.625, "rewards/chosen": -1.5170791149139404, "rewards/margins": 0.23495633900165558, "rewards/rejected": -1.75203537940979, "semantic_entropy": 0.7145556807518005, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 21.9266532790033, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.09828811883926392, "logits/rejected": 0.068391352891922, "logps/chosen": -1.5508136749267578, "logps/rejected": -1.6538305282592773, "loss": 1.9081, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.5508136749267578, "rewards/margins": 0.10301699489355087, "rewards/rejected": -1.6538305282592773, "semantic_entropy": 0.7145067453384399, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 11.236655918198759, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.079493448138237, "logits/rejected": 0.05032643675804138, "logps/chosen": -1.5037782192230225, "logps/rejected": -1.4786624908447266, "loss": 1.8788, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.5037782192230225, "rewards/margins": -0.025115689262747765, "rewards/rejected": -1.4786624908447266, "semantic_entropy": 0.7499553561210632, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 26.349007425802636, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.012896949425339699, "logits/rejected": 0.1461629420518875, "logps/chosen": -1.5245033502578735, "logps/rejected": -1.6363542079925537, "loss": 1.8839, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.5245033502578735, "rewards/margins": 0.11185093969106674, "rewards/rejected": -1.6363542079925537, "semantic_entropy": 0.7187734842300415, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 19.40461034547531, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.08400774002075195, "logits/rejected": 0.027191072702407837, "logps/chosen": -1.5319654941558838, "logps/rejected": -1.5793735980987549, "loss": 1.897, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5319654941558838, "rewards/margins": 0.04740828275680542, "rewards/rejected": -1.5793735980987549, "semantic_entropy": 0.7300479412078857, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 9.053338193412312, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.09487392008304596, "logits/rejected": 0.06036794185638428, "logps/chosen": -1.4625428915023804, "logps/rejected": -1.525730848312378, "loss": 1.8409, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.4625428915023804, "rewards/margins": 0.06318791210651398, "rewards/rejected": -1.525730848312378, "semantic_entropy": 0.7567033767700195, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 12.678045017534211, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.057889461517333984, "logits/rejected": 0.08762963861227036, "logps/chosen": -1.3567085266113281, "logps/rejected": -1.4510586261749268, "loss": 1.746, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3567085266113281, "rewards/margins": 0.09435023367404938, "rewards/rejected": -1.4510586261749268, "semantic_entropy": 0.7786242961883545, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 8.164611212162841, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.11613424867391586, "logits/rejected": 0.028972595930099487, "logps/chosen": -1.2914960384368896, "logps/rejected": -1.3020532131195068, "loss": 1.7073, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2914960384368896, "rewards/margins": 0.010557165369391441, "rewards/rejected": -1.3020532131195068, "semantic_entropy": 0.8315240144729614, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 9.540084214904168, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.13283373415470123, "logits/rejected": -0.08733997493982315, "logps/chosen": -1.3017879724502563, "logps/rejected": -1.4189629554748535, "loss": 1.7037, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3017879724502563, "rewards/margins": 0.11717505753040314, "rewards/rejected": -1.4189629554748535, "semantic_entropy": 0.8038010597229004, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 7.597233239021867, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.22021861374378204, "logits/rejected": -0.09513047337532043, "logps/chosen": -1.3893672227859497, "logps/rejected": -1.3699947595596313, "loss": 1.784, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3893672227859497, "rewards/margins": -0.019372448325157166, "rewards/rejected": -1.3699947595596313, "semantic_entropy": 0.789264440536499, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 7.282322726887535, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.10352806746959686, "logits/rejected": 0.05425085499882698, "logps/chosen": -1.3076562881469727, "logps/rejected": -1.3788952827453613, "loss": 1.7084, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3076562881469727, "rewards/margins": 0.07123909145593643, "rewards/rejected": -1.3788952827453613, "semantic_entropy": 0.801568329334259, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 9.727211863097557, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.13868187367916107, "logits/rejected": -0.0900764912366867, "logps/chosen": -1.4210662841796875, "logps/rejected": -1.4909498691558838, "loss": 1.8035, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.4210662841796875, "rewards/margins": 0.06988342851400375, "rewards/rejected": -1.4909498691558838, "semantic_entropy": 0.7648892402648926, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 8.536590618748646, "learning_rate": 3.1194295900178254e-07, "logits/chosen": -0.030160877853631973, "logits/rejected": -0.03454091399908066, "logps/chosen": -1.3080017566680908, "logps/rejected": -1.3943734169006348, "loss": 1.7092, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3080017566680908, "rewards/margins": 0.08637170493602753, "rewards/rejected": -1.3943734169006348, "semantic_entropy": 0.8024147152900696, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 6.624644282877486, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.035559941083192825, "logits/rejected": -0.037415988743305206, "logps/chosen": -1.3264715671539307, "logps/rejected": -1.5298666954040527, "loss": 1.7193, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3264715671539307, "rewards/margins": 0.20339521765708923, "rewards/rejected": -1.5298666954040527, "semantic_entropy": 0.7856879234313965, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 6.905660591365061, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.1871793121099472, "logits/rejected": -0.10748082399368286, "logps/chosen": -1.322962999343872, "logps/rejected": -1.368895411491394, "loss": 1.7204, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.322962999343872, "rewards/margins": 0.04593244194984436, "rewards/rejected": -1.368895411491394, "semantic_entropy": 0.7949329614639282, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 7.261187443878559, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.07674706727266312, "logits/rejected": 0.03475344553589821, "logps/chosen": -1.2406651973724365, "logps/rejected": -1.3728145360946655, "loss": 1.6424, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2406651973724365, "rewards/margins": 0.1321493685245514, "rewards/rejected": -1.3728145360946655, "semantic_entropy": 0.8034818768501282, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 8.487595559285204, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.018404539674520493, "logits/rejected": 0.1258333921432495, "logps/chosen": -1.23270583152771, "logps/rejected": -1.3943848609924316, "loss": 1.6478, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.23270583152771, "rewards/margins": 0.1616789847612381, "rewards/rejected": -1.3943848609924316, "semantic_entropy": 0.8301059007644653, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 17.82512733274846, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.09469480067491531, "logits/rejected": 0.03381982073187828, "logps/chosen": -1.3475282192230225, "logps/rejected": -1.3933377265930176, "loss": 1.7438, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3475282192230225, "rewards/margins": 0.04580947011709213, "rewards/rejected": -1.3933377265930176, "semantic_entropy": 0.7926307916641235, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 11.603405637164395, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.07309295237064362, "logits/rejected": 0.06095800921320915, "logps/chosen": -1.25830078125, "logps/rejected": -1.3350467681884766, "loss": 1.6611, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.25830078125, "rewards/margins": 0.07674606144428253, "rewards/rejected": -1.3350467681884766, "semantic_entropy": 0.8055895566940308, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 10.130849234425577, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.16702201962471008, "logits/rejected": 0.00448529701679945, "logps/chosen": -1.3401620388031006, "logps/rejected": -1.4438027143478394, "loss": 1.7377, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3401620388031006, "rewards/margins": 0.1036406010389328, "rewards/rejected": -1.4438027143478394, "semantic_entropy": 0.7951610088348389, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 6.808469536040368, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.17679491639137268, "logits/rejected": 0.059899140149354935, "logps/chosen": -1.3666898012161255, "logps/rejected": -1.4197365045547485, "loss": 1.7668, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3666898012161255, "rewards/margins": 0.05304650589823723, "rewards/rejected": -1.4197365045547485, "semantic_entropy": 0.8002446889877319, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 14.231471833327573, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.05585740879178047, "logits/rejected": 0.14913101494312286, "logps/chosen": -1.284807801246643, "logps/rejected": -1.4230337142944336, "loss": 1.6901, "rewards/accuracies": 0.5625, "rewards/chosen": -1.284807801246643, "rewards/margins": 0.13822603225708008, "rewards/rejected": -1.4230337142944336, "semantic_entropy": 0.8106751441955566, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 6.789723413832884, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.10103759914636612, "logits/rejected": 0.06058083847165108, "logps/chosen": -1.2982558012008667, "logps/rejected": -1.4222100973129272, "loss": 1.6991, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2982558012008667, "rewards/margins": 0.12395427376031876, "rewards/rejected": -1.4222100973129272, "semantic_entropy": 0.8016201257705688, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 6.561102377083612, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.025685584172606468, "logits/rejected": 0.04433635249733925, "logps/chosen": -1.2934366464614868, "logps/rejected": -1.445988655090332, "loss": 1.6912, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2934366464614868, "rewards/margins": 0.15255194902420044, "rewards/rejected": -1.445988655090332, "semantic_entropy": 0.7955934405326843, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 7.547155521097882, "learning_rate": 4.188948306595365e-07, "logits/chosen": 0.011114180088043213, "logits/rejected": 0.13509520888328552, "logps/chosen": -1.2693490982055664, "logps/rejected": -1.429602026939392, "loss": 1.6714, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2693490982055664, "rewards/margins": 0.1602528840303421, "rewards/rejected": -1.429602026939392, "semantic_entropy": 0.8040856122970581, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 6.301680124856554, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.029990240931510925, "logits/rejected": 0.08853377401828766, "logps/chosen": -1.285729169845581, "logps/rejected": -1.4612315893173218, "loss": 1.6765, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.285729169845581, "rewards/margins": 0.17550256848335266, "rewards/rejected": -1.4612315893173218, "semantic_entropy": 0.7815099358558655, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 6.780517728147376, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.02078728750348091, "logits/rejected": 0.12903298437595367, "logps/chosen": -1.4015637636184692, "logps/rejected": -1.4240480661392212, "loss": 1.7889, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4015637636184692, "rewards/margins": 0.02248447574675083, "rewards/rejected": -1.4240480661392212, "semantic_entropy": 0.7747452259063721, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 8.434082555460368, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.07730443775653839, "logits/rejected": 0.0713261291384697, "logps/chosen": -1.2792367935180664, "logps/rejected": -1.3356412649154663, "loss": 1.6831, "rewards/accuracies": 0.5, "rewards/chosen": -1.2792367935180664, "rewards/margins": 0.05640435963869095, "rewards/rejected": -1.3356412649154663, "semantic_entropy": 0.8077448606491089, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 6.335754288999235, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.024661067873239517, "logits/rejected": 0.1087626963853836, "logps/chosen": -1.251853346824646, "logps/rejected": -1.3522131443023682, "loss": 1.6632, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.251853346824646, "rewards/margins": 0.10035989433526993, "rewards/rejected": -1.3522131443023682, "semantic_entropy": 0.8226425051689148, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 7.25948094724864, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.24771995842456818, "logits/rejected": -0.1541024148464203, "logps/chosen": -1.336179256439209, "logps/rejected": -1.4960668087005615, "loss": 1.7273, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.336179256439209, "rewards/margins": 0.15988758206367493, "rewards/rejected": -1.4960668087005615, "semantic_entropy": 0.7821539044380188, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 7.974935502381261, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.09956977516412735, "logits/rejected": -0.02170068398118019, "logps/chosen": -1.3100250959396362, "logps/rejected": -1.4861724376678467, "loss": 1.6954, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3100250959396362, "rewards/margins": 0.1761474907398224, "rewards/rejected": -1.4861724376678467, "semantic_entropy": 0.7707664370536804, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 7.07656019807777, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.08991505205631256, "logits/rejected": 0.030794387683272362, "logps/chosen": -1.3038660287857056, "logps/rejected": -1.3950450420379639, "loss": 1.7021, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3038660287857056, "rewards/margins": 0.09117896854877472, "rewards/rejected": -1.3950450420379639, "semantic_entropy": 0.7964199185371399, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 8.154518125964355, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.04051875323057175, "logits/rejected": 0.051808975636959076, "logps/chosen": -1.2503533363342285, "logps/rejected": -1.4003863334655762, "loss": 1.6591, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2503533363342285, "rewards/margins": 0.1500329077243805, "rewards/rejected": -1.4003863334655762, "semantic_entropy": 0.817530632019043, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 9.944208916087495, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.10453246533870697, "logits/rejected": 0.037034012377262115, "logps/chosen": -1.2926570177078247, "logps/rejected": -1.3936669826507568, "loss": 1.7005, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2926570177078247, "rewards/margins": 0.10101006180047989, "rewards/rejected": -1.3936669826507568, "semantic_entropy": 0.8156482577323914, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 6.560689707922062, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.08232948929071426, "logits/rejected": 0.045796047896146774, "logps/chosen": -1.3346055746078491, "logps/rejected": -1.4090654850006104, "loss": 1.732, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3346055746078491, "rewards/margins": 0.07445989549160004, "rewards/rejected": -1.4090654850006104, "semantic_entropy": 0.7947149276733398, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 5.928658265663115, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.12613533437252045, "logits/rejected": 0.14914485812187195, "logps/chosen": -1.3526628017425537, "logps/rejected": -1.4613511562347412, "loss": 1.7475, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3526628017425537, "rewards/margins": 0.10868847370147705, "rewards/rejected": -1.4613511562347412, "semantic_entropy": 0.7897647619247437, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 9.415773736322583, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.09889186173677444, "logits/rejected": -0.04858065024018288, "logps/chosen": -1.2496532201766968, "logps/rejected": -1.384414553642273, "loss": 1.6524, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2496532201766968, "rewards/margins": 0.1347615122795105, "rewards/rejected": -1.384414553642273, "semantic_entropy": 0.8055901527404785, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 7.056183470689298, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.09319017082452774, "logits/rejected": 0.05931227281689644, "logps/chosen": -1.2849849462509155, "logps/rejected": -1.3575935363769531, "loss": 1.6895, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2849849462509155, "rewards/margins": 0.07260854542255402, "rewards/rejected": -1.3575935363769531, "semantic_entropy": 0.8090287446975708, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 4.648156795023801, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.0584363229572773, "logits/rejected": 0.006430687848478556, "logps/chosen": -1.391749382019043, "logps/rejected": -1.3972457647323608, "loss": 1.7879, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.391749382019043, "rewards/margins": 0.005496317055076361, "rewards/rejected": -1.3972457647323608, "semantic_entropy": 0.7922763228416443, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 6.400426454638685, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.2234455794095993, "logits/rejected": -0.14297693967819214, "logps/chosen": -1.3538219928741455, "logps/rejected": -1.4309254884719849, "loss": 1.751, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3538219928741455, "rewards/margins": 0.07710349559783936, "rewards/rejected": -1.4309254884719849, "semantic_entropy": 0.7944218516349792, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 8.250007207893784, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.04704821854829788, "logits/rejected": 0.1016455888748169, "logps/chosen": -1.3473138809204102, "logps/rejected": -1.4864037036895752, "loss": 1.7347, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3473138809204102, "rewards/margins": 0.13908977806568146, "rewards/rejected": -1.4864037036895752, "semantic_entropy": 0.7747327089309692, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 4.672968226869114, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.0753047913312912, "logits/rejected": 0.04744366556406021, "logps/chosen": -1.3072644472122192, "logps/rejected": -1.3549467325210571, "loss": 1.7117, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3072644472122192, "rewards/margins": 0.04768242686986923, "rewards/rejected": -1.3549467325210571, "semantic_entropy": 0.808958888053894, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 7.17669868166072, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.15202639997005463, "logits/rejected": -0.05050492286682129, "logps/chosen": -1.297430396080017, "logps/rejected": -1.5311055183410645, "loss": 1.69, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.297430396080017, "rewards/margins": 0.2336750030517578, "rewards/rejected": -1.5311055183410645, "semantic_entropy": 0.7851811051368713, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 5.216579150390592, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.04230841249227524, "logits/rejected": 0.09224589914083481, "logps/chosen": -1.3198926448822021, "logps/rejected": -1.4801380634307861, "loss": 1.7147, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3198926448822021, "rewards/margins": 0.16024520993232727, "rewards/rejected": -1.4801380634307861, "semantic_entropy": 0.7896062135696411, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 11.132180167314118, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.024430666118860245, "logits/rejected": 0.1171652302145958, "logps/chosen": -1.3264687061309814, "logps/rejected": -1.3565508127212524, "loss": 1.7304, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3264687061309814, "rewards/margins": 0.03008202277123928, "rewards/rejected": -1.3565508127212524, "semantic_entropy": 0.8078718185424805, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 8.52384554315305, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.05536968633532524, "logits/rejected": 0.07354720681905746, "logps/chosen": -1.3824102878570557, "logps/rejected": -1.461477518081665, "loss": 1.7657, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3824102878570557, "rewards/margins": 0.07906736433506012, "rewards/rejected": -1.461477518081665, "semantic_entropy": 0.766586184501648, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 8.152067123014046, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.04473737254738808, "logits/rejected": 0.06798554956912994, "logps/chosen": -1.2844898700714111, "logps/rejected": -1.421230673789978, "loss": 1.6859, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2844898700714111, "rewards/margins": 0.13674090802669525, "rewards/rejected": -1.421230673789978, "semantic_entropy": 0.8028990030288696, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 8.17035910219124, "learning_rate": 6.238859180035651e-07, "logits/chosen": 0.0052978964522480965, "logits/rejected": 0.0881340503692627, "logps/chosen": -1.2658754587173462, "logps/rejected": -1.394208550453186, "loss": 1.6696, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2658754587173462, "rewards/margins": 0.12833306193351746, "rewards/rejected": -1.394208550453186, "semantic_entropy": 0.8075162768363953, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 7.585827565686257, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.08834262937307358, "logits/rejected": 0.1186922937631607, "logps/chosen": -1.373750925064087, "logps/rejected": -1.4008820056915283, "loss": 1.7681, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.373750925064087, "rewards/margins": 0.02713100239634514, "rewards/rejected": -1.4008820056915283, "semantic_entropy": 0.7887953519821167, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 8.351236928029566, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.06808433681726456, "logits/rejected": 0.004757398273795843, "logps/chosen": -1.3002817630767822, "logps/rejected": -1.4211763143539429, "loss": 1.7017, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3002817630767822, "rewards/margins": 0.12089458853006363, "rewards/rejected": -1.4211763143539429, "semantic_entropy": 0.8028769493103027, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 9.249363655263565, "learning_rate": 6.506238859180035e-07, "logits/chosen": 0.01690538600087166, "logits/rejected": 0.09055240452289581, "logps/chosen": -1.2801201343536377, "logps/rejected": -1.3836716413497925, "loss": 1.6888, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2801201343536377, "rewards/margins": 0.10355156660079956, "rewards/rejected": -1.3836716413497925, "semantic_entropy": 0.8173823356628418, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 4.696653680507905, "learning_rate": 6.59536541889483e-07, "logits/chosen": 0.004238727502524853, "logits/rejected": 0.09182579815387726, "logps/chosen": -1.2765114307403564, "logps/rejected": -1.3157193660736084, "loss": 1.6955, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2765114307403564, "rewards/margins": 0.039207689464092255, "rewards/rejected": -1.3157193660736084, "semantic_entropy": 0.837990939617157, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 5.877196359422753, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.05769134685397148, "logits/rejected": 0.09260667860507965, "logps/chosen": -1.2571821212768555, "logps/rejected": -1.3915026187896729, "loss": 1.6619, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2571821212768555, "rewards/margins": 0.13432055711746216, "rewards/rejected": -1.3915026187896729, "semantic_entropy": 0.8095145225524902, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 6.500327283162041, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.03391874581575394, "logits/rejected": 0.039167601615190506, "logps/chosen": -1.273229956626892, "logps/rejected": -1.4286125898361206, "loss": 1.6765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.273229956626892, "rewards/margins": 0.15538260340690613, "rewards/rejected": -1.4286125898361206, "semantic_entropy": 0.8064988255500793, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 5.448294503063138, "learning_rate": 6.862745098039216e-07, "logits/chosen": 0.0034145142417401075, "logits/rejected": 0.07555432617664337, "logps/chosen": -1.3799712657928467, "logps/rejected": -1.3624684810638428, "loss": 1.7765, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3799712657928467, "rewards/margins": -0.0175027959048748, "rewards/rejected": -1.3624684810638428, "semantic_entropy": 0.7931293249130249, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 8.359296229911426, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.0651121661067009, "logits/rejected": 0.22345077991485596, "logps/chosen": -1.360288381576538, "logps/rejected": -1.41848886013031, "loss": 1.7455, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.360288381576538, "rewards/margins": 0.05820056051015854, "rewards/rejected": -1.41848886013031, "semantic_entropy": 0.7703979015350342, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 6.248428134715459, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.03631047531962395, "logits/rejected": 0.11398360878229141, "logps/chosen": -1.311118245124817, "logps/rejected": -1.3174058198928833, "loss": 1.721, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.311118245124817, "rewards/margins": 0.006287521217018366, "rewards/rejected": -1.3174058198928833, "semantic_entropy": 0.8197193145751953, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 4.7955833273558275, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.061477649956941605, "logits/rejected": 0.1442108452320099, "logps/chosen": -1.294195532798767, "logps/rejected": -1.3802944421768188, "loss": 1.6978, "rewards/accuracies": 0.5625, "rewards/chosen": -1.294195532798767, "rewards/margins": 0.08609884977340698, "rewards/rejected": -1.3802944421768188, "semantic_entropy": 0.8072103261947632, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.24398677051067352, "eval_logits/rejected": 0.3247275948524475, "eval_logps/chosen": -1.3251334428787231, "eval_logps/rejected": -1.4461066722869873, "eval_loss": 1.7216598987579346, "eval_rewards/accuracies": 0.5541542768478394, "eval_rewards/chosen": -1.3251334428787231, "eval_rewards/margins": 0.12097327411174774, "eval_rewards/rejected": -1.4461066722869873, "eval_runtime": 35.1665, "eval_samples_per_second": 38.247, "eval_semantic_entropy": 0.792643129825592, "eval_steps_per_second": 9.583, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 6.785540303766075, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.024030113592743874, "logits/rejected": 0.06308787316083908, "logps/chosen": -1.2909350395202637, "logps/rejected": -1.3556811809539795, "loss": 1.6946, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2909350395202637, "rewards/margins": 0.06474622339010239, "rewards/rejected": -1.3556811809539795, "semantic_entropy": 0.8073827624320984, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 6.618465737253283, "learning_rate": 7.30837789661319e-07, "logits/chosen": 0.04678395017981529, "logits/rejected": 0.17371992766857147, "logps/chosen": -1.2655959129333496, "logps/rejected": -1.3502428531646729, "loss": 1.6716, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2655959129333496, "rewards/margins": 0.0846467837691307, "rewards/rejected": -1.3502428531646729, "semantic_entropy": 0.8119094967842102, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 4.5455957879942295, "learning_rate": 7.397504456327985e-07, "logits/chosen": 0.00915374793112278, "logits/rejected": 0.04307233542203903, "logps/chosen": -1.2642295360565186, "logps/rejected": -1.4209240674972534, "loss": 1.669, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2642295360565186, "rewards/margins": 0.1566944569349289, "rewards/rejected": -1.4209240674972534, "semantic_entropy": 0.8094478845596313, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 5.256345885781984, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.016872655600309372, "logits/rejected": 0.15947112441062927, "logps/chosen": -1.2531123161315918, "logps/rejected": -1.3379991054534912, "loss": 1.6683, "rewards/accuracies": 0.5, "rewards/chosen": -1.2531123161315918, "rewards/margins": 0.08488687872886658, "rewards/rejected": -1.3379991054534912, "semantic_entropy": 0.8304449915885925, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 4.835360574988573, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.05998952314257622, "logits/rejected": 0.1286803036928177, "logps/chosen": -1.2900707721710205, "logps/rejected": -1.4398845434188843, "loss": 1.6936, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2900707721710205, "rewards/margins": 0.14981389045715332, "rewards/rejected": -1.4398845434188843, "semantic_entropy": 0.8071367144584656, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 5.959871169751677, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.0660131573677063, "logits/rejected": 0.12218798696994781, "logps/chosen": -1.3276304006576538, "logps/rejected": -1.4402986764907837, "loss": 1.7219, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3276304006576538, "rewards/margins": 0.11266813427209854, "rewards/rejected": -1.4402986764907837, "semantic_entropy": 0.7884837985038757, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 10.389614081002057, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.01691102422773838, "logits/rejected": 0.0614364854991436, "logps/chosen": -1.192183017730713, "logps/rejected": -1.3228236436843872, "loss": 1.6094, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.192183017730713, "rewards/margins": 0.13064037263393402, "rewards/rejected": -1.3228236436843872, "semantic_entropy": 0.83436518907547, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 5.561840651458556, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.017634226009249687, "logits/rejected": 0.06345956027507782, "logps/chosen": -1.2711617946624756, "logps/rejected": -1.354431390762329, "loss": 1.6818, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2711617946624756, "rewards/margins": 0.0832696408033371, "rewards/rejected": -1.354431390762329, "semantic_entropy": 0.8211802244186401, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 6.9186909578776, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.04539051279425621, "logits/rejected": 0.05634162575006485, "logps/chosen": -1.298185110092163, "logps/rejected": -1.419070839881897, "loss": 1.699, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.298185110092163, "rewards/margins": 0.1208857074379921, "rewards/rejected": -1.419070839881897, "semantic_entropy": 0.8016935586929321, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 7.253084207002215, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.012308264151215553, "logits/rejected": 0.12940391898155212, "logps/chosen": -1.2800040245056152, "logps/rejected": -1.4156650304794312, "loss": 1.6873, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2800040245056152, "rewards/margins": 0.13566121459007263, "rewards/rejected": -1.4156650304794312, "semantic_entropy": 0.8146024942398071, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 6.697042540445374, "learning_rate": 8.110516934046346e-07, "logits/chosen": -0.005744749214500189, "logits/rejected": 0.07181303203105927, "logps/chosen": -1.2279679775238037, "logps/rejected": -1.3971221446990967, "loss": 1.6349, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2279679775238037, "rewards/margins": 0.16915421187877655, "rewards/rejected": -1.3971221446990967, "semantic_entropy": 0.8139179944992065, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 6.519099850135918, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.1108812540769577, "logits/rejected": 0.004727782215923071, "logps/chosen": -1.3544937372207642, "logps/rejected": -1.399938702583313, "loss": 1.7517, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3544937372207642, "rewards/margins": 0.045444995164871216, "rewards/rejected": -1.399938702583313, "semantic_entropy": 0.7944735288619995, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 6.558380384174074, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.12613479793071747, "logits/rejected": 0.13478240370750427, "logps/chosen": -1.2532012462615967, "logps/rejected": -1.4091618061065674, "loss": 1.655, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2532012462615967, "rewards/margins": 0.1559605747461319, "rewards/rejected": -1.4091618061065674, "semantic_entropy": 0.8035104870796204, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 7.037624043045453, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.14286640286445618, "logits/rejected": 0.09529170393943787, "logps/chosen": -1.2096443176269531, "logps/rejected": -1.3893190622329712, "loss": 1.6226, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2096443176269531, "rewards/margins": 0.1796746402978897, "rewards/rejected": -1.3893190622329712, "semantic_entropy": 0.8259340524673462, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 5.154632074643995, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.031199157238006592, "logits/rejected": 0.10272257030010223, "logps/chosen": -1.2859687805175781, "logps/rejected": -1.4617053270339966, "loss": 1.6873, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2859687805175781, "rewards/margins": 0.17573665082454681, "rewards/rejected": -1.4617053270339966, "semantic_entropy": 0.802662193775177, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 7.903214553322106, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.030785713344812393, "logits/rejected": 0.16307750344276428, "logps/chosen": -1.2503442764282227, "logps/rejected": -1.308578610420227, "loss": 1.6591, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2503442764282227, "rewards/margins": 0.05823441594839096, "rewards/rejected": -1.308578610420227, "semantic_entropy": 0.817587673664093, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 8.16417284982753, "learning_rate": 8.645276292335115e-07, "logits/chosen": 0.01921394281089306, "logits/rejected": 0.05297957733273506, "logps/chosen": -1.3405605554580688, "logps/rejected": -1.4139060974121094, "loss": 1.7315, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3405605554580688, "rewards/margins": 0.07334571331739426, "rewards/rejected": -1.4139060974121094, "semantic_entropy": 0.7819523215293884, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 6.845278525633353, "learning_rate": 8.734402852049911e-07, "logits/chosen": -0.005242282059043646, "logits/rejected": 0.06069497391581535, "logps/chosen": -1.2866027355194092, "logps/rejected": -1.3450382947921753, "loss": 1.6948, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2866027355194092, "rewards/margins": 0.05843549966812134, "rewards/rejected": -1.3450382947921753, "semantic_entropy": 0.8164412379264832, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 6.6718225706358805, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.04170520231127739, "logits/rejected": -0.026141276583075523, "logps/chosen": -1.2930090427398682, "logps/rejected": -1.3975330591201782, "loss": 1.6995, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2930090427398682, "rewards/margins": 0.10452393442392349, "rewards/rejected": -1.3975330591201782, "semantic_entropy": 0.8130000829696655, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 6.643467210288479, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.04618716612458229, "logits/rejected": 0.047276176512241364, "logps/chosen": -1.2023783922195435, "logps/rejected": -1.3515993356704712, "loss": 1.6166, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2023783922195435, "rewards/margins": 0.14922089874744415, "rewards/rejected": -1.3515993356704712, "semantic_entropy": 0.8283529281616211, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 6.215133405549783, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.07341426610946655, "logits/rejected": 0.054486047476530075, "logps/chosen": -1.3212913274765015, "logps/rejected": -1.3590233325958252, "loss": 1.7234, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3212913274765015, "rewards/margins": 0.03773219510912895, "rewards/rejected": -1.3590233325958252, "semantic_entropy": 0.8041555285453796, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 6.671308479169473, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.08501291275024414, "logits/rejected": 0.14178766310214996, "logps/chosen": -1.280250906944275, "logps/rejected": -1.4360954761505127, "loss": 1.6809, "rewards/accuracies": 0.59375, "rewards/chosen": -1.280250906944275, "rewards/margins": 0.1558445394039154, "rewards/rejected": -1.4360954761505127, "semantic_entropy": 0.8013203740119934, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 7.044236046477622, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.04025426506996155, "logits/rejected": 0.1256372481584549, "logps/chosen": -1.2208656072616577, "logps/rejected": -1.366850733757019, "loss": 1.6271, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2208656072616577, "rewards/margins": 0.1459851861000061, "rewards/rejected": -1.366850733757019, "semantic_entropy": 0.8124540448188782, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 5.086498695053294, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.07386355102062225, "logits/rejected": 0.054715901613235474, "logps/chosen": -1.2658967971801758, "logps/rejected": -1.3534698486328125, "loss": 1.6665, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2658967971801758, "rewards/margins": 0.08757311850786209, "rewards/rejected": -1.3534698486328125, "semantic_entropy": 0.8011196255683899, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 10.709938926240365, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.0951719656586647, "logits/rejected": 0.1541096270084381, "logps/chosen": -1.2358475923538208, "logps/rejected": -1.393944501876831, "loss": 1.6403, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2358475923538208, "rewards/margins": 0.1580970585346222, "rewards/rejected": -1.393944501876831, "semantic_entropy": 0.8089720010757446, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 5.008040987956659, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.05394287779927254, "logits/rejected": 0.13304811716079712, "logps/chosen": -1.21664297580719, "logps/rejected": -1.3086540699005127, "loss": 1.6376, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.21664297580719, "rewards/margins": 0.09201101213693619, "rewards/rejected": -1.3086540699005127, "semantic_entropy": 0.8418585062026978, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 5.514339920944808, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.09195814281702042, "logits/rejected": 0.1514480710029602, "logps/chosen": -1.2381861209869385, "logps/rejected": -1.2903103828430176, "loss": 1.6517, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2381861209869385, "rewards/margins": 0.052124302834272385, "rewards/rejected": -1.2903103828430176, "semantic_entropy": 0.8271163702011108, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 8.339889701960038, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.031591713428497314, "logits/rejected": 0.09598143398761749, "logps/chosen": -1.3635337352752686, "logps/rejected": -1.4101234674453735, "loss": 1.7553, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3635337352752686, "rewards/margins": 0.04658966511487961, "rewards/rejected": -1.4101234674453735, "semantic_entropy": 0.7835661172866821, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 5.24718299152537, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.09395425021648407, "logits/rejected": 0.09385137259960175, "logps/chosen": -1.259730339050293, "logps/rejected": -1.3610131740570068, "loss": 1.6688, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.259730339050293, "rewards/margins": 0.10128290951251984, "rewards/rejected": -1.3610131740570068, "semantic_entropy": 0.8182238340377808, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 7.074082100119162, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.04542509838938713, "logits/rejected": 0.10549955070018768, "logps/chosen": -1.2633665800094604, "logps/rejected": -1.3900935649871826, "loss": 1.6678, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2633665800094604, "rewards/margins": 0.126726895570755, "rewards/rejected": -1.3900935649871826, "semantic_entropy": 0.8088127374649048, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 8.522358994363875, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.06427808851003647, "logits/rejected": 0.05658269673585892, "logps/chosen": -1.3347742557525635, "logps/rejected": -1.392332673072815, "loss": 1.7242, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3347742557525635, "rewards/margins": 0.057558417320251465, "rewards/rejected": -1.392332673072815, "semantic_entropy": 0.7788748741149902, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 5.828843735345829, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.050944674760103226, "logits/rejected": 0.05961176007986069, "logps/chosen": -1.1887964010238647, "logps/rejected": -1.3146507740020752, "loss": 1.5995, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1887964010238647, "rewards/margins": 0.1258545219898224, "rewards/rejected": -1.3146507740020752, "semantic_entropy": 0.8213583827018738, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 5.270827933074834, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.01299972366541624, "logits/rejected": 0.051746636629104614, "logps/chosen": -1.3081862926483154, "logps/rejected": -1.4173951148986816, "loss": 1.7065, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3081862926483154, "rewards/margins": 0.10920894145965576, "rewards/rejected": -1.4173951148986816, "semantic_entropy": 0.7966623902320862, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 7.112472208692866, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.07408381998538971, "logits/rejected": 0.12202043831348419, "logps/chosen": -1.2839566469192505, "logps/rejected": -1.3526939153671265, "loss": 1.6777, "rewards/accuracies": 0.5, "rewards/chosen": -1.2839566469192505, "rewards/margins": 0.06873737275600433, "rewards/rejected": -1.3526939153671265, "semantic_entropy": 0.7875763773918152, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 6.028030782718354, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.04523703455924988, "logits/rejected": 0.007324503269046545, "logps/chosen": -1.2189090251922607, "logps/rejected": -1.3163892030715942, "loss": 1.626, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2189090251922607, "rewards/margins": 0.09748013317584991, "rewards/rejected": -1.3163892030715942, "semantic_entropy": 0.8142744898796082, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 5.824121872858462, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.0395292229950428, "logits/rejected": 0.09673064947128296, "logps/chosen": -1.2138586044311523, "logps/rejected": -1.3709722757339478, "loss": 1.6333, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2138586044311523, "rewards/margins": 0.15711364150047302, "rewards/rejected": -1.3709722757339478, "semantic_entropy": 0.838982105255127, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 6.070829782215976, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.07076728343963623, "logits/rejected": 0.017448240891098976, "logps/chosen": -1.3492457866668701, "logps/rejected": -1.3825386762619019, "loss": 1.7406, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3492457866668701, "rewards/margins": 0.03329288214445114, "rewards/rejected": -1.3825386762619019, "semantic_entropy": 0.7826189398765564, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 7.879992521531221, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.04973824322223663, "logits/rejected": 0.06556639820337296, "logps/chosen": -1.3307232856750488, "logps/rejected": -1.350843071937561, "loss": 1.7293, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3307232856750488, "rewards/margins": 0.02011987194418907, "rewards/rejected": -1.350843071937561, "semantic_entropy": 0.7972001433372498, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 6.269382685017996, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.011141306720674038, "logits/rejected": 0.12483570724725723, "logps/chosen": -1.260568618774414, "logps/rejected": -1.3039849996566772, "loss": 1.6712, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.260568618774414, "rewards/margins": 0.04341646283864975, "rewards/rejected": -1.3039849996566772, "semantic_entropy": 0.8212615251541138, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 5.292619989378927, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.058684349060058594, "logits/rejected": 0.07137693464756012, "logps/chosen": -1.2534990310668945, "logps/rejected": -1.261439323425293, "loss": 1.673, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2534990310668945, "rewards/margins": 0.007940268144011497, "rewards/rejected": -1.261439323425293, "semantic_entropy": 0.8390199542045593, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 6.670116039325565, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.08982774615287781, "logits/rejected": -0.010232815518975258, "logps/chosen": -1.307307481765747, "logps/rejected": -1.443682074546814, "loss": 1.7085, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.307307481765747, "rewards/margins": 0.1363748162984848, "rewards/rejected": -1.443682074546814, "semantic_entropy": 0.8024643659591675, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 10.965257727367042, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.012210799381136894, "logits/rejected": 0.14065444469451904, "logps/chosen": -1.2826316356658936, "logps/rejected": -1.3330223560333252, "loss": 1.6794, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2826316356658936, "rewards/margins": 0.05039060860872269, "rewards/rejected": -1.3330223560333252, "semantic_entropy": 0.7935566902160645, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 4.658454910548433, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.07386257499456406, "logits/rejected": 0.02019677497446537, "logps/chosen": -1.2897520065307617, "logps/rejected": -1.409834623336792, "loss": 1.6938, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2897520065307617, "rewards/margins": 0.12008273601531982, "rewards/rejected": -1.409834623336792, "semantic_entropy": 0.8081550598144531, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 5.822019861759727, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.010493127629160881, "logits/rejected": 0.06393267214298248, "logps/chosen": -1.3465783596038818, "logps/rejected": -1.4457759857177734, "loss": 1.7427, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3465783596038818, "rewards/margins": 0.09919761121273041, "rewards/rejected": -1.4457759857177734, "semantic_entropy": 0.792236328125, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 6.644138656483114, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.038259416818618774, "logits/rejected": 0.13830366730690002, "logps/chosen": -1.2653391361236572, "logps/rejected": -1.3718128204345703, "loss": 1.667, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2653391361236572, "rewards/margins": 0.10647352784872055, "rewards/rejected": -1.3718128204345703, "semantic_entropy": 0.8033905029296875, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 5.844506879107339, "learning_rate": 9.995381583144996e-07, "logits/chosen": -0.03819793090224266, "logits/rejected": 0.05285152792930603, "logps/chosen": -1.2982299327850342, "logps/rejected": -1.424512267112732, "loss": 1.6998, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2982299327850342, "rewards/margins": 0.1262824833393097, "rewards/rejected": -1.424512267112732, "semantic_entropy": 0.8031641244888306, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 4.836676095295266, "learning_rate": 9.994688118905471e-07, "logits/chosen": -0.0023306801449507475, "logits/rejected": 0.2206401526927948, "logps/chosen": -1.365534782409668, "logps/rejected": -1.385879635810852, "loss": 1.758, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.365534782409668, "rewards/margins": 0.020344818010926247, "rewards/rejected": -1.385879635810852, "semantic_entropy": 0.7848553657531738, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 6.801804511099342, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.08890081942081451, "logits/rejected": 0.09990646690130234, "logps/chosen": -1.299070954322815, "logps/rejected": -1.3819762468338013, "loss": 1.6821, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.299070954322815, "rewards/margins": 0.08290513604879379, "rewards/rejected": -1.3819762468338013, "semantic_entropy": 0.7660870552062988, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 7.338925850343158, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.0808345228433609, "logits/rejected": -0.0039797513745725155, "logps/chosen": -1.2071410417556763, "logps/rejected": -1.36195969581604, "loss": 1.6201, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2071410417556763, "rewards/margins": 0.15481875836849213, "rewards/rejected": -1.36195969581604, "semantic_entropy": 0.8258221745491028, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 7.992463947444347, "learning_rate": 9.992317004533313e-07, "logits/chosen": -0.04419615864753723, "logits/rejected": 0.0839843899011612, "logps/chosen": -1.359161138534546, "logps/rejected": -1.4897550344467163, "loss": 1.7512, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.359161138534546, "rewards/margins": 0.13059385120868683, "rewards/rejected": -1.4897550344467163, "semantic_entropy": 0.7841397523880005, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 10.05252682846952, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.031102057546377182, "logits/rejected": 0.03404795378446579, "logps/chosen": -1.2870821952819824, "logps/rejected": -1.47232985496521, "loss": 1.6884, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2870821952819824, "rewards/margins": 0.18524782359600067, "rewards/rejected": -1.47232985496521, "semantic_entropy": 0.8025497198104858, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 5.2067438634763885, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.0008175998809747398, "logits/rejected": 0.1155385822057724, "logps/chosen": -1.2435680627822876, "logps/rejected": -1.265260100364685, "loss": 1.6484, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2435680627822876, "rewards/margins": 0.021692020818591118, "rewards/rejected": -1.265260100364685, "semantic_entropy": 0.8096579313278198, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 6.149939496436752, "learning_rate": 9.989509973647416e-07, "logits/chosen": -0.023006999865174294, "logits/rejected": 0.10326042026281357, "logps/chosen": -1.2181217670440674, "logps/rejected": -1.3380296230316162, "loss": 1.6322, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2181217670440674, "rewards/margins": 0.11990783363580704, "rewards/rejected": -1.3380296230316162, "semantic_entropy": 0.8282052874565125, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 4.993913484725182, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.03270528092980385, "logits/rejected": 0.15705007314682007, "logps/chosen": -1.2492694854736328, "logps/rejected": -1.2878323793411255, "loss": 1.6655, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2492694854736328, "rewards/margins": 0.03856272250413895, "rewards/rejected": -1.2878323793411255, "semantic_entropy": 0.8325369954109192, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 6.683032474346436, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.06723952293395996, "logits/rejected": 0.0031424493063241243, "logps/chosen": -1.2471332550048828, "logps/rejected": -1.4549734592437744, "loss": 1.653, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2471332550048828, "rewards/margins": 0.2078404426574707, "rewards/rejected": -1.4549734592437744, "semantic_entropy": 0.8118308782577515, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 4.737190401260763, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.02632244862616062, "logits/rejected": 0.17290523648262024, "logps/chosen": -1.2929366827011108, "logps/rejected": -1.3351116180419922, "loss": 1.6976, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2929366827011108, "rewards/margins": 0.04217498376965523, "rewards/rejected": -1.3351116180419922, "semantic_entropy": 0.8093430399894714, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 7.324810401366871, "learning_rate": 9.985089602559123e-07, "logits/chosen": -0.013068673200905323, "logits/rejected": 0.1272987574338913, "logps/chosen": -1.272894263267517, "logps/rejected": -1.3167647123336792, "loss": 1.6876, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.272894263267517, "rewards/margins": 0.04387057572603226, "rewards/rejected": -1.3167647123336792, "semantic_entropy": 0.8293739557266235, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 6.090860461048212, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.01832166127860546, "logits/rejected": 0.04478534311056137, "logps/chosen": -1.2772231101989746, "logps/rejected": -1.3811113834381104, "loss": 1.6793, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2772231101989746, "rewards/margins": 0.103888139128685, "rewards/rejected": -1.3811113834381104, "semantic_entropy": 0.8040772676467896, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 6.379641531280263, "learning_rate": 9.982589180787532e-07, "logits/chosen": -0.015962710604071617, "logits/rejected": 0.06847533583641052, "logps/chosen": -1.1553322076797485, "logps/rejected": -1.320736289024353, "loss": 1.5723, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1553322076797485, "rewards/margins": 0.16540402173995972, "rewards/rejected": -1.320736289024353, "semantic_entropy": 0.8339877128601074, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 4.785827866361221, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.11345241963863373, "logits/rejected": 0.012134233489632607, "logps/chosen": -1.3373454809188843, "logps/rejected": -1.3976542949676514, "loss": 1.7329, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3373454809188843, "rewards/margins": 0.06030888482928276, "rewards/rejected": -1.3976542949676514, "semantic_entropy": 0.7911182641983032, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 4.855210978191871, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.09092748165130615, "logits/rejected": 0.06946993619203568, "logps/chosen": -1.2846109867095947, "logps/rejected": -1.4262562990188599, "loss": 1.6857, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2846109867095947, "rewards/margins": 0.1416453868150711, "rewards/rejected": -1.4262562990188599, "semantic_entropy": 0.8022518157958984, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 6.558468916982928, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.029046881943941116, "logits/rejected": 0.11394127458333969, "logps/chosen": -1.2279114723205566, "logps/rejected": -1.3471637964248657, "loss": 1.6462, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2279114723205566, "rewards/margins": 0.11925222724676132, "rewards/rejected": -1.3471637964248657, "semantic_entropy": 0.8365498781204224, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 7.690626480638701, "learning_rate": 9.97700834996658e-07, "logits/chosen": -0.044077157974243164, "logits/rejected": 0.10181716829538345, "logps/chosen": -1.296002984046936, "logps/rejected": -1.4112696647644043, "loss": 1.694, "rewards/accuracies": 0.53125, "rewards/chosen": -1.296002984046936, "rewards/margins": 0.11526668071746826, "rewards/rejected": -1.4112696647644043, "semantic_entropy": 0.7959758043289185, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 6.145218155411696, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.02074914239346981, "logits/rejected": 0.1918230950832367, "logps/chosen": -1.3434631824493408, "logps/rejected": -1.3907240629196167, "loss": 1.7368, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3434631824493408, "rewards/margins": 0.04726102575659752, "rewards/rejected": -1.3907240629196167, "semantic_entropy": 0.786609947681427, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 7.340841512760343, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.10986989736557007, "logits/rejected": 0.009096875786781311, "logps/chosen": -1.1812317371368408, "logps/rejected": -1.4321582317352295, "loss": 1.5923, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1812317371368408, "rewards/margins": 0.25092652440071106, "rewards/rejected": -1.4321582317352295, "semantic_entropy": 0.8221518397331238, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 6.16531823228256, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.1507844626903534, "logits/rejected": -0.0018353245686739683, "logps/chosen": -1.295781135559082, "logps/rejected": -1.364011526107788, "loss": 1.7007, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.295781135559082, "rewards/margins": 0.06823040544986725, "rewards/rejected": -1.364011526107788, "semantic_entropy": 0.8098312616348267, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 4.748232324397783, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.08753814548254013, "logits/rejected": 0.00508784968405962, "logps/chosen": -1.3096133470535278, "logps/rejected": -1.3565256595611572, "loss": 1.7046, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3096133470535278, "rewards/margins": 0.046912338584661484, "rewards/rejected": -1.3565256595611572, "semantic_entropy": 0.7900453805923462, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 7.607653609873328, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.08457894623279572, "logits/rejected": 0.06773854792118073, "logps/chosen": -1.3573706150054932, "logps/rejected": -1.384264588356018, "loss": 1.7464, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3573706150054932, "rewards/margins": 0.026894202455878258, "rewards/rejected": -1.384264588356018, "semantic_entropy": 0.7781510949134827, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 5.603614717819182, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.03603760525584221, "logits/rejected": 0.10309578478336334, "logps/chosen": -1.3373528718948364, "logps/rejected": -1.5172970294952393, "loss": 1.7335, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3373528718948364, "rewards/margins": 0.17994414269924164, "rewards/rejected": -1.5172970294952393, "semantic_entropy": 0.7922581434249878, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 5.65310374965835, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.05358581990003586, "logits/rejected": 0.09219758212566376, "logps/chosen": -1.2726496458053589, "logps/rejected": -1.422507405281067, "loss": 1.6802, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2726496458053589, "rewards/margins": 0.149857759475708, "rewards/rejected": -1.422507405281067, "semantic_entropy": 0.8150676488876343, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 4.562817065500144, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.007364857010543346, "logits/rejected": 0.11612896621227264, "logps/chosen": -1.3071093559265137, "logps/rejected": -1.3755913972854614, "loss": 1.7132, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3071093559265137, "rewards/margins": 0.06848201155662537, "rewards/rejected": -1.3755913972854614, "semantic_entropy": 0.8121932744979858, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 4.9439086851561305, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.08942331373691559, "logits/rejected": -0.03078708052635193, "logps/chosen": -1.3266046047210693, "logps/rejected": -1.448326826095581, "loss": 1.7226, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3266046047210693, "rewards/margins": 0.12172216176986694, "rewards/rejected": -1.448326826095581, "semantic_entropy": 0.7919363379478455, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 10.540231809645602, "learning_rate": 9.959678417085998e-07, "logits/chosen": -0.06775631755590439, "logits/rejected": 0.012430024333298206, "logps/chosen": -1.2820237874984741, "logps/rejected": -1.3685405254364014, "loss": 1.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2820237874984741, "rewards/margins": 0.08651675283908844, "rewards/rejected": -1.3685405254364014, "semantic_entropy": 0.8221932649612427, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 6.303314651900729, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.03633784502744675, "logits/rejected": 0.1401088982820511, "logps/chosen": -1.2339290380477905, "logps/rejected": -1.4252681732177734, "loss": 1.6414, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2339290380477905, "rewards/margins": 0.19133910536766052, "rewards/rejected": -1.4252681732177734, "semantic_entropy": 0.8149137496948242, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 6.963815725177742, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.1007881611585617, "logits/rejected": 0.04456842690706253, "logps/chosen": -1.3177720308303833, "logps/rejected": -1.4700305461883545, "loss": 1.7056, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3177720308303833, "rewards/margins": 0.15225863456726074, "rewards/rejected": -1.4700305461883545, "semantic_entropy": 0.7757259607315063, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 4.120724695239209, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.16211049258708954, "logits/rejected": -0.031117672100663185, "logps/chosen": -1.238765001296997, "logps/rejected": -1.3849303722381592, "loss": 1.6445, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.238765001296997, "rewards/margins": 0.14616529643535614, "rewards/rejected": -1.3849303722381592, "semantic_entropy": 0.811566948890686, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 5.83389934294886, "learning_rate": 9.951398126243133e-07, "logits/chosen": 0.0010517224436625838, "logits/rejected": 0.11070144176483154, "logps/chosen": -1.2123851776123047, "logps/rejected": -1.407547950744629, "loss": 1.6213, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2123851776123047, "rewards/margins": 0.195162832736969, "rewards/rejected": -1.407547950744629, "semantic_entropy": 0.8177574276924133, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 4.778807805052967, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.0766272321343422, "logits/rejected": 0.031587257981300354, "logps/chosen": -1.2893496751785278, "logps/rejected": -1.3295207023620605, "loss": 1.6978, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2893496751785278, "rewards/margins": 0.040170926600694656, "rewards/rejected": -1.3295207023620605, "semantic_entropy": 0.8168370127677917, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 4.709267826356956, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.12412413209676743, "logits/rejected": 0.04277144372463226, "logps/chosen": -1.2618663311004639, "logps/rejected": -1.3690122365951538, "loss": 1.6633, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2618663311004639, "rewards/margins": 0.10714612156152725, "rewards/rejected": -1.3690122365951538, "semantic_entropy": 0.8027721643447876, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 6.461071798919975, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.06665455549955368, "logits/rejected": -0.0012310475576668978, "logps/chosen": -1.2604715824127197, "logps/rejected": -1.2975454330444336, "loss": 1.6742, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2604715824127197, "rewards/margins": 0.037073828279972076, "rewards/rejected": -1.2975454330444336, "semantic_entropy": 0.8273698687553406, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.22974801063537598, "eval_logits/rejected": 0.30486640334129333, "eval_logps/chosen": -1.294273018836975, "eval_logps/rejected": -1.4240339994430542, "eval_loss": 1.6948989629745483, "eval_rewards/accuracies": 0.5511869192123413, "eval_rewards/chosen": -1.294273018836975, "eval_rewards/margins": 0.1297609955072403, "eval_rewards/rejected": -1.4240339994430542, "eval_runtime": 34.4066, "eval_samples_per_second": 39.091, "eval_semantic_entropy": 0.8009408116340637, "eval_steps_per_second": 9.795, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 5.729780379972113, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.0887727215886116, "logits/rejected": 0.042473893612623215, "logps/chosen": -1.311654806137085, "logps/rejected": -1.4440586566925049, "loss": 1.7089, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.311654806137085, "rewards/margins": 0.1324038952589035, "rewards/rejected": -1.4440586566925049, "semantic_entropy": 0.7943916320800781, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 6.3056351035989815, "learning_rate": 9.939967071845424e-07, "logits/chosen": -0.0005749579286202788, "logits/rejected": 0.06514312326908112, "logps/chosen": -1.2101976871490479, "logps/rejected": -1.343214750289917, "loss": 1.6274, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2101976871490479, "rewards/margins": 0.13301712274551392, "rewards/rejected": -1.343214750289917, "semantic_entropy": 0.83441561460495, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 5.955712322813652, "learning_rate": 9.937536987168413e-07, "logits/chosen": -0.026268433779478073, "logits/rejected": 0.08210448175668716, "logps/chosen": -1.2018334865570068, "logps/rejected": -1.400437831878662, "loss": 1.6077, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2018334865570068, "rewards/margins": 0.19860419631004333, "rewards/rejected": -1.400437831878662, "semantic_entropy": 0.8116356730461121, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 6.24196017814686, "learning_rate": 9.935058998485896e-07, "logits/chosen": -0.007223173044621944, "logits/rejected": 0.026631023734807968, "logps/chosen": -1.2475472688674927, "logps/rejected": -1.4108431339263916, "loss": 1.6528, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2475472688674927, "rewards/margins": 0.16329602897167206, "rewards/rejected": -1.4108431339263916, "semantic_entropy": 0.8105626106262207, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 5.993201198697254, "learning_rate": 9.932533129839333e-07, "logits/chosen": -0.07429758459329605, "logits/rejected": 0.03444807231426239, "logps/chosen": -1.1925114393234253, "logps/rejected": -1.2790920734405518, "loss": 1.607, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1925114393234253, "rewards/margins": 0.08658073842525482, "rewards/rejected": -1.2790920734405518, "semantic_entropy": 0.8289631605148315, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 5.9186034378854915, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.0060964832082390785, "logits/rejected": 0.14987081289291382, "logps/chosen": -1.322318196296692, "logps/rejected": -1.3656644821166992, "loss": 1.7204, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.322318196296692, "rewards/margins": 0.043346308171749115, "rewards/rejected": -1.3656644821166992, "semantic_entropy": 0.7962054014205933, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 7.565471727922867, "learning_rate": 9.927337851142314e-07, "logits/chosen": -0.039504896849393845, "logits/rejected": 0.07449696213006973, "logps/chosen": -1.224980115890503, "logps/rejected": -1.3414291143417358, "loss": 1.6388, "rewards/accuracies": 0.5625, "rewards/chosen": -1.224980115890503, "rewards/margins": 0.11644892394542694, "rewards/rejected": -1.3414291143417358, "semantic_entropy": 0.8276527523994446, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 4.4405337588940075, "learning_rate": 9.924668491496474e-07, "logits/chosen": -0.061466772109270096, "logits/rejected": 0.07065530121326447, "logps/chosen": -1.272127389907837, "logps/rejected": -1.4281586408615112, "loss": 1.6797, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.272127389907837, "rewards/margins": 0.15603134036064148, "rewards/rejected": -1.4281586408615112, "semantic_entropy": 0.8151651620864868, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 4.383002624766522, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.0011878699297085404, "logits/rejected": 0.05920947715640068, "logps/chosen": -1.2676090002059937, "logps/rejected": -1.3204584121704102, "loss": 1.6708, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2676090002059937, "rewards/margins": 0.05284947156906128, "rewards/rejected": -1.3204584121704102, "semantic_entropy": 0.8064379692077637, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 7.809191956654642, "learning_rate": 9.919186461100574e-07, "logits/chosen": -0.0510123185813427, "logits/rejected": -0.005624666810035706, "logps/chosen": -1.2289793491363525, "logps/rejected": -1.3539388179779053, "loss": 1.6438, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2289793491363525, "rewards/margins": 0.12495940923690796, "rewards/rejected": -1.3539388179779053, "semantic_entropy": 0.82958984375, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 5.013457052931991, "learning_rate": 9.9163738435372e-07, "logits/chosen": -0.07818715274333954, "logits/rejected": 0.03409842401742935, "logps/chosen": -1.294175386428833, "logps/rejected": -1.455856204032898, "loss": 1.6935, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.294175386428833, "rewards/margins": 0.16168084740638733, "rewards/rejected": -1.455856204032898, "semantic_entropy": 0.7987454533576965, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 4.1699784710206105, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.10032539069652557, "logits/rejected": 0.032848574221134186, "logps/chosen": -1.3269323110580444, "logps/rejected": -1.4952493906021118, "loss": 1.7205, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3269323110580444, "rewards/margins": 0.16831697523593903, "rewards/rejected": -1.4952493906021118, "semantic_entropy": 0.7871569991111755, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 9.862936243648212, "learning_rate": 9.910605540119474e-07, "logits/chosen": -0.03875257819890976, "logits/rejected": 0.04339424893260002, "logps/chosen": -1.2214460372924805, "logps/rejected": -1.3906432390213013, "loss": 1.6311, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2214460372924805, "rewards/margins": 0.1691972017288208, "rewards/rejected": -1.3906432390213013, "semantic_entropy": 0.8192817568778992, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 5.742173047918602, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.134866401553154, "logits/rejected": 0.07490620017051697, "logps/chosen": -1.2751744985580444, "logps/rejected": -1.354170560836792, "loss": 1.6822, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2751744985580444, "rewards/margins": 0.07899585366249084, "rewards/rejected": -1.354170560836792, "semantic_entropy": 0.814003586769104, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 5.662280745204919, "learning_rate": 9.90464666629803e-07, "logits/chosen": -0.029971729964017868, "logits/rejected": 0.026624251157045364, "logps/chosen": -1.3017122745513916, "logps/rejected": -1.417639970779419, "loss": 1.7056, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3017122745513916, "rewards/margins": 0.11592777818441391, "rewards/rejected": -1.417639970779419, "semantic_entropy": 0.8076773881912231, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 4.840580918025604, "learning_rate": 9.901595837463363e-07, "logits/chosen": -0.012104947119951248, "logits/rejected": 0.11367936432361603, "logps/chosen": -1.359113335609436, "logps/rejected": -1.45298171043396, "loss": 1.75, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.359113335609436, "rewards/margins": 0.09386838972568512, "rewards/rejected": -1.45298171043396, "semantic_entropy": 0.7817215323448181, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 5.724016329356782, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.09079904854297638, "logits/rejected": -0.03051898442208767, "logps/chosen": -1.2383393049240112, "logps/rejected": -1.4239834547042847, "loss": 1.6408, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2383393049240112, "rewards/margins": 0.18564417958259583, "rewards/rejected": -1.4239834547042847, "semantic_entropy": 0.805009663105011, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 7.105357212476832, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.1709037721157074, "logits/rejected": -0.07077976316213608, "logps/chosen": -1.2961513996124268, "logps/rejected": -1.4149913787841797, "loss": 1.6894, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2961513996124268, "rewards/margins": 0.11883995682001114, "rewards/rejected": -1.4149913787841797, "semantic_entropy": 0.786443829536438, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 4.0085476527312895, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.021383142098784447, "logits/rejected": 0.11859546601772308, "logps/chosen": -1.1932836771011353, "logps/rejected": -1.2869081497192383, "loss": 1.6141, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1932836771011353, "rewards/margins": 0.09362450987100601, "rewards/rejected": -1.2869081497192383, "semantic_entropy": 0.8415895700454712, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 5.657127894373696, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.11935652792453766, "logits/rejected": -0.07516419887542725, "logps/chosen": -1.1745407581329346, "logps/rejected": -1.4227032661437988, "loss": 1.5917, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1745407581329346, "rewards/margins": 0.2481626719236374, "rewards/rejected": -1.4227032661437988, "semantic_entropy": 0.8343180418014526, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 7.460285770748455, "learning_rate": 9.885628971850641e-07, "logits/chosen": -0.009941866621375084, "logits/rejected": 0.14143416285514832, "logps/chosen": -1.2601745128631592, "logps/rejected": -1.4471454620361328, "loss": 1.6595, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2601745128631592, "rewards/margins": 0.1869707852602005, "rewards/rejected": -1.4471454620361328, "semantic_entropy": 0.7985892295837402, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 4.5241766074516105, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.06738373637199402, "logits/rejected": 0.027979236096143723, "logps/chosen": -1.3194259405136108, "logps/rejected": -1.3836421966552734, "loss": 1.7083, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3194259405136108, "rewards/margins": 0.06421609967947006, "rewards/rejected": -1.3836421966552734, "semantic_entropy": 0.777675986289978, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 5.803742289493355, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.07463528960943222, "logits/rejected": 0.06728587299585342, "logps/chosen": -1.2414133548736572, "logps/rejected": -1.366472601890564, "loss": 1.6563, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2414133548736572, "rewards/margins": 0.1250593066215515, "rewards/rejected": -1.366472601890564, "semantic_entropy": 0.8296843767166138, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 7.018010190252095, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.04854060336947441, "logits/rejected": 0.148795947432518, "logps/chosen": -1.1994060277938843, "logps/rejected": -1.3423380851745605, "loss": 1.6145, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1994060277938843, "rewards/margins": 0.14293184876441956, "rewards/rejected": -1.3423380851745605, "semantic_entropy": 0.8302057981491089, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 4.665904624885369, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.08912471681833267, "logits/rejected": 0.04548186436295509, "logps/chosen": -1.2907472848892212, "logps/rejected": -1.328770637512207, "loss": 1.6981, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2907472848892212, "rewards/margins": 0.03802347928285599, "rewards/rejected": -1.328770637512207, "semantic_entropy": 0.8147459030151367, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 7.886776790132184, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.09120460599660873, "logits/rejected": 0.00849437527358532, "logps/chosen": -1.2729679346084595, "logps/rejected": -1.4548171758651733, "loss": 1.6736, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2729679346084595, "rewards/margins": 0.18184921145439148, "rewards/rejected": -1.4548171758651733, "semantic_entropy": 0.8013606071472168, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 6.145708283044674, "learning_rate": 9.864904911516383e-07, "logits/chosen": -0.030297860503196716, "logits/rejected": -0.009022199548780918, "logps/chosen": -1.1875230073928833, "logps/rejected": -1.3673925399780273, "loss": 1.5928, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1875230073928833, "rewards/margins": 0.17986960709095, "rewards/rejected": -1.3673925399780273, "semantic_entropy": 0.8106422424316406, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 6.941811830199769, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.06077421456575394, "logits/rejected": 0.032173313200473785, "logps/chosen": -1.2578964233398438, "logps/rejected": -1.2990083694458008, "loss": 1.6655, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2578964233398438, "rewards/margins": 0.041112013161182404, "rewards/rejected": -1.2990083694458008, "semantic_entropy": 0.815250039100647, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 6.4475167684703285, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.09597314149141312, "logits/rejected": 0.0008383929962292314, "logps/chosen": -1.2381160259246826, "logps/rejected": -1.3863779306411743, "loss": 1.6481, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2381160259246826, "rewards/margins": 0.14826196432113647, "rewards/rejected": -1.3863779306411743, "semantic_entropy": 0.8199016451835632, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 4.4781975263578815, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.06874515116214752, "logits/rejected": 0.06295228004455566, "logps/chosen": -1.3361942768096924, "logps/rejected": -1.3537805080413818, "loss": 1.7319, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3361942768096924, "rewards/margins": 0.017586246132850647, "rewards/rejected": -1.3537805080413818, "semantic_entropy": 0.7914425134658813, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 4.373992571855831, "learning_rate": 9.850144440181095e-07, "logits/chosen": -0.046835221350193024, "logits/rejected": 0.13084354996681213, "logps/chosen": -1.3202214241027832, "logps/rejected": -1.3676820993423462, "loss": 1.7221, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3202214241027832, "rewards/margins": 0.04746072366833687, "rewards/rejected": -1.3676820993423462, "semantic_entropy": 0.8037006258964539, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 5.68650803524955, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.07980336248874664, "logits/rejected": 0.042290784418582916, "logps/chosen": -1.2750084400177002, "logps/rejected": -1.3530489206314087, "loss": 1.6818, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2750084400177002, "rewards/margins": 0.07804040610790253, "rewards/rejected": -1.3530489206314087, "semantic_entropy": 0.8134954571723938, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 5.905379034581222, "learning_rate": 9.842481723427704e-07, "logits/chosen": -0.016775688156485558, "logits/rejected": -0.035073526203632355, "logps/chosen": -1.317571997642517, "logps/rejected": -1.4701688289642334, "loss": 1.7122, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.317571997642517, "rewards/margins": 0.15259678661823273, "rewards/rejected": -1.4701688289642334, "semantic_entropy": 0.7892157435417175, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 5.322853003212504, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.0018246069084852934, "logits/rejected": -0.005102044437080622, "logps/chosen": -1.2018992900848389, "logps/rejected": -1.281451940536499, "loss": 1.6133, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2018992900848389, "rewards/margins": 0.07955250889062881, "rewards/rejected": -1.281451940536499, "semantic_entropy": 0.8227464556694031, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 4.570398758903866, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.1413174271583557, "logits/rejected": -0.002082021441310644, "logps/chosen": -1.2748171091079712, "logps/rejected": -1.3551783561706543, "loss": 1.6757, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2748171091079712, "rewards/margins": 0.08036132156848907, "rewards/rejected": -1.3551783561706543, "semantic_entropy": 0.8017622828483582, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 6.487684394573732, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.1309872418642044, "logits/rejected": 0.03308798000216484, "logps/chosen": -1.2985713481903076, "logps/rejected": -1.3936169147491455, "loss": 1.7005, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2985713481903076, "rewards/margins": 0.09504582732915878, "rewards/rejected": -1.3936169147491455, "semantic_entropy": 0.8039037585258484, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 5.273452654861285, "learning_rate": 9.826592814608517e-07, "logits/chosen": -0.030388465151190758, "logits/rejected": 0.11683666706085205, "logps/chosen": -1.2726335525512695, "logps/rejected": -1.3951536417007446, "loss": 1.6721, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2726335525512695, "rewards/margins": 0.12252019345760345, "rewards/rejected": -1.3951536417007446, "semantic_entropy": 0.798933207988739, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 5.852347294691708, "learning_rate": 9.822503420858067e-07, "logits/chosen": -0.024298015981912613, "logits/rejected": 0.011081969365477562, "logps/chosen": -1.154376745223999, "logps/rejected": -1.3615131378173828, "loss": 1.5788, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.154376745223999, "rewards/margins": 0.2071363925933838, "rewards/rejected": -1.3615131378173828, "semantic_entropy": 0.8487502932548523, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 5.074357732344139, "learning_rate": 9.818367239158277e-07, "logits/chosen": -0.004331943579018116, "logits/rejected": 0.04032442718744278, "logps/chosen": -1.2761890888214111, "logps/rejected": -1.284867286682129, "loss": 1.6881, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.2761890888214111, "rewards/margins": 0.008678276091814041, "rewards/rejected": -1.284867286682129, "semantic_entropy": 0.8238998651504517, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 5.941013733242882, "learning_rate": 9.8141843096384e-07, "logits/chosen": -0.02687777206301689, "logits/rejected": 0.045518893748521805, "logps/chosen": -1.3052326440811157, "logps/rejected": -1.404588222503662, "loss": 1.7069, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3052326440811157, "rewards/margins": 0.09935589134693146, "rewards/rejected": -1.404588222503662, "semantic_entropy": 0.8034124374389648, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 6.451149964497969, "learning_rate": 9.809954672881237e-07, "logits/chosen": -0.06602369248867035, "logits/rejected": 0.06709353625774384, "logps/chosen": -1.286184549331665, "logps/rejected": -1.3610591888427734, "loss": 1.6782, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.286184549331665, "rewards/margins": 0.0748746320605278, "rewards/rejected": -1.3610591888427734, "semantic_entropy": 0.7839921712875366, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 4.856458650190477, "learning_rate": 9.80567836992274e-07, "logits/chosen": -0.05687612295150757, "logits/rejected": 0.08947905153036118, "logps/chosen": -1.1534565687179565, "logps/rejected": -1.3500792980194092, "loss": 1.572, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1534565687179565, "rewards/margins": 0.19662261009216309, "rewards/rejected": -1.3500792980194092, "semantic_entropy": 0.8370656967163086, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 9.09693076695947, "learning_rate": 9.801355442251625e-07, "logits/chosen": -0.05984802171587944, "logits/rejected": 0.07851599156856537, "logps/chosen": -1.225954294204712, "logps/rejected": -1.3791025876998901, "loss": 1.6331, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.225954294204712, "rewards/margins": 0.1531483381986618, "rewards/rejected": -1.3791025876998901, "semantic_entropy": 0.8143377304077148, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 6.428218745071331, "learning_rate": 9.796985931808949e-07, "logits/chosen": -0.04947753995656967, "logits/rejected": 0.057066332548856735, "logps/chosen": -1.2769850492477417, "logps/rejected": -1.4287335872650146, "loss": 1.6784, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2769850492477417, "rewards/margins": 0.15174850821495056, "rewards/rejected": -1.4287335872650146, "semantic_entropy": 0.8028755187988281, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 6.9524742999751785, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.0834953635931015, "logits/rejected": 0.0040039075538516045, "logps/chosen": -1.1778647899627686, "logps/rejected": -1.4044684171676636, "loss": 1.5912, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1778647899627686, "rewards/margins": 0.22660358250141144, "rewards/rejected": -1.4044684171676636, "semantic_entropy": 0.8267693519592285, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 5.412696350599322, "learning_rate": 9.788107332632493e-07, "logits/chosen": -0.06177670508623123, "logits/rejected": 0.01058930717408657, "logps/chosen": -1.252097725868225, "logps/rejected": -1.3170757293701172, "loss": 1.6596, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.252097725868225, "rewards/margins": 0.06497790664434433, "rewards/rejected": -1.3170757293701172, "semantic_entropy": 0.8149473071098328, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 6.152628496292267, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.079267218708992, "logits/rejected": 0.012737743556499481, "logps/chosen": -1.3500885963439941, "logps/rejected": -1.3691620826721191, "loss": 1.7389, "rewards/accuracies": 0.5, "rewards/chosen": -1.3500885963439941, "rewards/margins": 0.019073298200964928, "rewards/rejected": -1.3691620826721191, "semantic_entropy": 0.77772057056427, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 6.054409475341403, "learning_rate": 9.779042916953376e-07, "logits/chosen": -0.07672189921140671, "logits/rejected": 0.029551442712545395, "logps/chosen": -1.2665832042694092, "logps/rejected": -1.374276876449585, "loss": 1.671, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2665832042694092, "rewards/margins": 0.10769368708133698, "rewards/rejected": -1.374276876449585, "semantic_entropy": 0.8089017868041992, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 6.594558674885469, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.13329772651195526, "logits/rejected": -0.03153461590409279, "logps/chosen": -1.236494779586792, "logps/rejected": -1.3972508907318115, "loss": 1.6356, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.236494779586792, "rewards/margins": 0.16075614094734192, "rewards/rejected": -1.3972508907318115, "semantic_entropy": 0.7981476783752441, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 6.2378735099748175, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.1355057805776596, "logits/rejected": -0.061175353825092316, "logps/chosen": -1.2859466075897217, "logps/rejected": -1.3893991708755493, "loss": 1.6866, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2859466075897217, "rewards/margins": 0.1034526601433754, "rewards/rejected": -1.3893991708755493, "semantic_entropy": 0.8012644052505493, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 6.702405001670137, "learning_rate": 9.765098658960035e-07, "logits/chosen": -0.06573408842086792, "logits/rejected": -0.00926362443715334, "logps/chosen": -1.2723217010498047, "logps/rejected": -1.4050137996673584, "loss": 1.6645, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2723217010498047, "rewards/margins": 0.1326920986175537, "rewards/rejected": -1.4050137996673584, "semantic_entropy": 0.7843021154403687, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 4.652893544506757, "learning_rate": 9.76035805036924e-07, "logits/chosen": -0.03727786988019943, "logits/rejected": 0.09711579233407974, "logps/chosen": -1.3429769277572632, "logps/rejected": -1.4389255046844482, "loss": 1.7311, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3429769277572632, "rewards/margins": 0.09594844281673431, "rewards/rejected": -1.4389255046844482, "semantic_entropy": 0.7761676907539368, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 4.575329117799633, "learning_rate": 9.755571256763764e-07, "logits/chosen": -0.013535404577851295, "logits/rejected": 0.08599988371133804, "logps/chosen": -1.1973073482513428, "logps/rejected": -1.3925042152404785, "loss": 1.5952, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1973073482513428, "rewards/margins": 0.19519677758216858, "rewards/rejected": -1.3925042152404785, "semantic_entropy": 0.7956871390342712, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 6.039637849063027, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.16207699477672577, "logits/rejected": 0.03539344295859337, "logps/chosen": -1.2645537853240967, "logps/rejected": -1.4365944862365723, "loss": 1.6722, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2645537853240967, "rewards/margins": 0.17204074561595917, "rewards/rejected": -1.4365944862365723, "semantic_entropy": 0.8153823018074036, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 4.92231425006322, "learning_rate": 9.74585930072237e-07, "logits/chosen": -0.054649896919727325, "logits/rejected": 0.04811540246009827, "logps/chosen": -1.2129769325256348, "logps/rejected": -1.4050449132919312, "loss": 1.626, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2129769325256348, "rewards/margins": 0.19206802546977997, "rewards/rejected": -1.4050449132919312, "semantic_entropy": 0.8261134028434753, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 5.824395228506507, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.1496378481388092, "logits/rejected": -0.06796258687973022, "logps/chosen": -1.3280683755874634, "logps/rejected": -1.3489418029785156, "loss": 1.7236, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3280683755874634, "rewards/margins": 0.02087341621518135, "rewards/rejected": -1.3489418029785156, "semantic_entropy": 0.7911087274551392, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 5.289978259920299, "learning_rate": 9.735963167736698e-07, "logits/chosen": -0.048243455588817596, "logits/rejected": 0.08265761286020279, "logps/chosen": -1.2845392227172852, "logps/rejected": -1.3139842748641968, "loss": 1.6914, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2845392227172852, "rewards/margins": 0.02944515272974968, "rewards/rejected": -1.3139842748641968, "semantic_entropy": 0.8136458396911621, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 3.7787138295297917, "learning_rate": 9.730946154626078e-07, "logits/chosen": -0.036325398832559586, "logits/rejected": 0.043635670095682144, "logps/chosen": -1.2710992097854614, "logps/rejected": -1.2817459106445312, "loss": 1.6818, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2710992097854614, "rewards/margins": 0.010646870359778404, "rewards/rejected": -1.2817459106445312, "semantic_entropy": 0.8214797973632812, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 7.603211465020814, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.17086748778820038, "logits/rejected": -0.05951831862330437, "logps/chosen": -1.2142055034637451, "logps/rejected": -1.3367023468017578, "loss": 1.6234, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2142055034637451, "rewards/margins": 0.12249686568975449, "rewards/rejected": -1.3367023468017578, "semantic_entropy": 0.8184215426445007, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 7.22968941577663, "learning_rate": 9.720774478544218e-07, "logits/chosen": -0.05939372628927231, "logits/rejected": 0.022553209215402603, "logps/chosen": -1.119882583618164, "logps/rejected": -1.417140245437622, "loss": 1.5402, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.119882583618164, "rewards/margins": 0.2972577214241028, "rewards/rejected": -1.417140245437622, "semantic_entropy": 0.840645432472229, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 4.893417943904533, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.08329765498638153, "logits/rejected": -0.0301411934196949, "logps/chosen": -1.255217432975769, "logps/rejected": -1.3497917652130127, "loss": 1.6593, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.255217432975769, "rewards/margins": 0.09457440674304962, "rewards/rejected": -1.3497917652130127, "semantic_entropy": 0.80824214220047, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 4.790826730341822, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.08490295708179474, "logits/rejected": 0.010399902239441872, "logps/chosen": -1.2295831441879272, "logps/rejected": -1.2567975521087646, "loss": 1.6448, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2295831441879272, "rewards/margins": 0.027214478701353073, "rewards/rejected": -1.2567975521087646, "semantic_entropy": 0.8304053544998169, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 5.102388018211781, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.01671953871846199, "logits/rejected": 0.10801713168621063, "logps/chosen": -1.1554386615753174, "logps/rejected": -1.3290585279464722, "loss": 1.5679, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1554386615753174, "rewards/margins": 0.1736198365688324, "rewards/rejected": -1.3290585279464722, "semantic_entropy": 0.8249346017837524, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 7.559835044093027, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.18613296747207642, "logits/rejected": -0.10510899871587753, "logps/chosen": -1.2279211282730103, "logps/rejected": -1.3681596517562866, "loss": 1.6395, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2279211282730103, "rewards/margins": 0.14023858308792114, "rewards/rejected": -1.3681596517562866, "semantic_entropy": 0.8231204152107239, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 7.4615401584952, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.18098454177379608, "logits/rejected": -0.03044978901743889, "logps/chosen": -1.2271541357040405, "logps/rejected": -1.3366339206695557, "loss": 1.6407, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2271541357040405, "rewards/margins": 0.1094796285033226, "rewards/rejected": -1.3366339206695557, "semantic_entropy": 0.8271785974502563, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 5.165941882049845, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.0018145025242120028, "logits/rejected": 0.04768190160393715, "logps/chosen": -1.2725623846054077, "logps/rejected": -1.4055455923080444, "loss": 1.685, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2725623846054077, "rewards/margins": 0.13298305869102478, "rewards/rejected": -1.4055455923080444, "semantic_entropy": 0.8249580264091492, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 8.318788204718762, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.0912027508020401, "logits/rejected": 0.03808989003300667, "logps/chosen": -1.28428053855896, "logps/rejected": -1.4574658870697021, "loss": 1.6887, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.28428053855896, "rewards/margins": 0.1731850653886795, "rewards/rejected": -1.4574658870697021, "semantic_entropy": 0.8087494969367981, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 5.999270599121113, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.06737272441387177, "logits/rejected": -0.048637717962265015, "logps/chosen": -1.2452597618103027, "logps/rejected": -1.265748381614685, "loss": 1.6534, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2452597618103027, "rewards/margins": 0.020488616079092026, "rewards/rejected": -1.265748381614685, "semantic_entropy": 0.8163356781005859, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 5.508831121782663, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.17554977536201477, "logits/rejected": -0.08991789072751999, "logps/chosen": -1.2864652872085571, "logps/rejected": -1.3848521709442139, "loss": 1.6894, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2864652872085571, "rewards/margins": 0.09838694334030151, "rewards/rejected": -1.3848521709442139, "semantic_entropy": 0.8059255480766296, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 6.112396464576824, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.20758655667304993, "logits/rejected": -0.09571430832147598, "logps/chosen": -1.1736363172531128, "logps/rejected": -1.3086981773376465, "loss": 1.6004, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1736363172531128, "rewards/margins": 0.1350618302822113, "rewards/rejected": -1.3086981773376465, "semantic_entropy": 0.8535706400871277, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 5.8695350456902755, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.07988302409648895, "logits/rejected": -0.01854768767952919, "logps/chosen": -1.2746518850326538, "logps/rejected": -1.356823444366455, "loss": 1.6758, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2746518850326538, "rewards/margins": 0.08217150717973709, "rewards/rejected": -1.356823444366455, "semantic_entropy": 0.8022555112838745, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 7.945161579813259, "learning_rate": 9.655911462268327e-07, "logits/chosen": -0.016491182148456573, "logits/rejected": 0.0392584502696991, "logps/chosen": -1.225669503211975, "logps/rejected": -1.323616862297058, "loss": 1.6373, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.225669503211975, "rewards/margins": 0.09794744849205017, "rewards/rejected": -1.323616862297058, "semantic_entropy": 0.8233088254928589, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 4.727779133526624, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.08063717931509018, "logits/rejected": -0.020816197618842125, "logps/chosen": -1.168159008026123, "logps/rejected": -1.3313496112823486, "loss": 1.5847, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.168159008026123, "rewards/margins": 0.16319069266319275, "rewards/rejected": -1.3313496112823486, "semantic_entropy": 0.8331020474433899, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 5.3504809525254995, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.13816484808921814, "logits/rejected": -0.009775301441550255, "logps/chosen": -1.3746297359466553, "logps/rejected": -1.3813049793243408, "loss": 1.7636, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3746297359466553, "rewards/margins": 0.006675147917121649, "rewards/rejected": -1.3813049793243408, "semantic_entropy": 0.7778648138046265, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 4.625652143662692, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.08815641701221466, "logits/rejected": 0.05138349533081055, "logps/chosen": -1.216698169708252, "logps/rejected": -1.2960002422332764, "loss": 1.6314, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.216698169708252, "rewards/margins": 0.0793021097779274, "rewards/rejected": -1.2960002422332764, "semantic_entropy": 0.8294572830200195, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 5.366815180343774, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.09156795591115952, "logits/rejected": 0.0058028725907206535, "logps/chosen": -1.3214962482452393, "logps/rejected": -1.466416597366333, "loss": 1.7077, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3214962482452393, "rewards/margins": 0.14492037892341614, "rewards/rejected": -1.466416597366333, "semantic_entropy": 0.7724655270576477, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 6.620145659020206, "learning_rate": 9.626960114955483e-07, "logits/chosen": -0.042393796145915985, "logits/rejected": 0.060936808586120605, "logps/chosen": -1.3519549369812012, "logps/rejected": -1.3962143659591675, "loss": 1.7402, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.3519549369812012, "rewards/margins": 0.044259607791900635, "rewards/rejected": -1.3962143659591675, "semantic_entropy": 0.7764344215393066, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 6.6999968036851785, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.10990222543478012, "logits/rejected": -0.006593751255422831, "logps/chosen": -1.3035919666290283, "logps/rejected": -1.443670630455017, "loss": 1.6995, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3035919666290283, "rewards/margins": 0.1400786191225052, "rewards/rejected": -1.443670630455017, "semantic_entropy": 0.7917920351028442, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 6.104512036297406, "learning_rate": 9.615064944219021e-07, "logits/chosen": -0.04242805391550064, "logits/rejected": 0.05067376047372818, "logps/chosen": -1.1737377643585205, "logps/rejected": -1.3679195642471313, "loss": 1.5909, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1737377643585205, "rewards/margins": 0.19418184459209442, "rewards/rejected": -1.3679195642471313, "semantic_entropy": 0.8343908190727234, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 4.744132306353752, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.11240573227405548, "logits/rejected": -0.06260828673839569, "logps/chosen": -1.2196130752563477, "logps/rejected": -1.3561015129089355, "loss": 1.6257, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2196130752563477, "rewards/margins": 0.13648828864097595, "rewards/rejected": -1.3561015129089355, "semantic_entropy": 0.812096118927002, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.242101788520813, "eval_logits/rejected": 0.31842076778411865, "eval_logps/chosen": -1.2855284214019775, "eval_logps/rejected": -1.4132660627365112, "eval_loss": 1.687199354171753, "eval_rewards/accuracies": 0.5497032403945923, "eval_rewards/chosen": -1.2855284214019775, "eval_rewards/margins": 0.12773776054382324, "eval_rewards/rejected": -1.4132660627365112, "eval_runtime": 34.0869, "eval_samples_per_second": 39.458, "eval_semantic_entropy": 0.8030252456665039, "eval_steps_per_second": 9.887, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 6.923107699901272, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.18005791306495667, "logits/rejected": -0.04249081760644913, "logps/chosen": -1.235028862953186, "logps/rejected": -1.3514426946640015, "loss": 1.6462, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.235028862953186, "rewards/margins": 0.11641402542591095, "rewards/rejected": -1.3514426946640015, "semantic_entropy": 0.8222451210021973, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 8.373933668787751, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.08936551958322525, "logits/rejected": 0.0659591555595398, "logps/chosen": -1.271532416343689, "logps/rejected": -1.3605878353118896, "loss": 1.6731, "rewards/accuracies": 0.53125, "rewards/chosen": -1.271532416343689, "rewards/margins": 0.08905527740716934, "rewards/rejected": -1.3605878353118896, "semantic_entropy": 0.8031315803527832, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 5.203442198488215, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.14214135706424713, "logits/rejected": -0.041958488523960114, "logps/chosen": -1.2639185190200806, "logps/rejected": -1.2839939594268799, "loss": 1.6789, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2639185190200806, "rewards/margins": 0.02007540687918663, "rewards/rejected": -1.2839939594268799, "semantic_entropy": 0.8300067782402039, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 6.713609701908544, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.02781575545668602, "logits/rejected": 0.11226655542850494, "logps/chosen": -1.1800429821014404, "logps/rejected": -1.3071404695510864, "loss": 1.5916, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1800429821014404, "rewards/margins": 0.12709753215312958, "rewards/rejected": -1.3071404695510864, "semantic_entropy": 0.8231449127197266, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 4.6524330644827385, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.19468732178211212, "logits/rejected": -0.022149646654725075, "logps/chosen": -1.283217191696167, "logps/rejected": -1.369322419166565, "loss": 1.6809, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.283217191696167, "rewards/margins": 0.08610522747039795, "rewards/rejected": -1.369322419166565, "semantic_entropy": 0.795279860496521, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 4.274508614052168, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.08797045797109604, "logits/rejected": 0.012316776439547539, "logps/chosen": -1.208664894104004, "logps/rejected": -1.366068959236145, "loss": 1.6208, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.208664894104004, "rewards/margins": 0.15740405023097992, "rewards/rejected": -1.366068959236145, "semantic_entropy": 0.82428377866745, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 4.886244240636574, "learning_rate": 9.565697970821593e-07, "logits/chosen": -0.04591931775212288, "logits/rejected": 0.06214721128344536, "logps/chosen": -1.270061731338501, "logps/rejected": -1.3458906412124634, "loss": 1.6748, "rewards/accuracies": 0.5, "rewards/chosen": -1.270061731338501, "rewards/margins": 0.07582899928092957, "rewards/rejected": -1.3458906412124634, "semantic_entropy": 0.8094123601913452, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 7.065370561254463, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.1800922453403473, "logits/rejected": -0.0745965838432312, "logps/chosen": -1.2233721017837524, "logps/rejected": -1.3452575206756592, "loss": 1.629, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2233721017837524, "rewards/margins": 0.12188547849655151, "rewards/rejected": -1.3452575206756592, "semantic_entropy": 0.8111746907234192, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 5.706150177682406, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.08651044964790344, "logits/rejected": 0.05582255870103836, "logps/chosen": -1.3290008306503296, "logps/rejected": -1.3583253622055054, "loss": 1.7287, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.3290008306503296, "rewards/margins": 0.02932468056678772, "rewards/rejected": -1.3583253622055054, "semantic_entropy": 0.799343466758728, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 5.629731219475135, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.08527232706546783, "logits/rejected": 0.06784255057573318, "logps/chosen": -1.2995926141738892, "logps/rejected": -1.3354548215866089, "loss": 1.697, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.2995926141738892, "rewards/margins": 0.03586230427026749, "rewards/rejected": -1.3354548215866089, "semantic_entropy": 0.7948542833328247, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 9.436545748472868, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.10347219556570053, "logits/rejected": -0.006440461613237858, "logps/chosen": -1.2572054862976074, "logps/rejected": -1.348205327987671, "loss": 1.6619, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2572054862976074, "rewards/margins": 0.09099972248077393, "rewards/rejected": -1.348205327987671, "semantic_entropy": 0.8094714879989624, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 5.946813346094193, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.13315753638744354, "logits/rejected": -0.028364678844809532, "logps/chosen": -1.201436161994934, "logps/rejected": -1.3646718263626099, "loss": 1.6145, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.201436161994934, "rewards/margins": 0.16323569416999817, "rewards/rejected": -1.3646718263626099, "semantic_entropy": 0.8262044787406921, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 4.49579221238323, "learning_rate": 9.526810643189754e-07, "logits/chosen": -0.06065717339515686, "logits/rejected": 0.037495069205760956, "logps/chosen": -1.2485520839691162, "logps/rejected": -1.3358757495880127, "loss": 1.6611, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2485520839691162, "rewards/margins": 0.08732354640960693, "rewards/rejected": -1.3358757495880127, "semantic_entropy": 0.8250206112861633, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 6.967594671903231, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.0887458547949791, "logits/rejected": 0.005970892496407032, "logps/chosen": -1.2038989067077637, "logps/rejected": -1.3513823747634888, "loss": 1.6167, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2038989067077637, "rewards/margins": 0.14748327434062958, "rewards/rejected": -1.3513823747634888, "semantic_entropy": 0.8255760073661804, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 3.86811979262425, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.0811188593506813, "logits/rejected": 0.006213420070707798, "logps/chosen": -1.2642120122909546, "logps/rejected": -1.3279263973236084, "loss": 1.6723, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2642120122909546, "rewards/margins": 0.06371432542800903, "rewards/rejected": -1.3279263973236084, "semantic_entropy": 0.8161550760269165, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 6.272491267177896, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.09896093606948853, "logits/rejected": 0.018528467044234276, "logps/chosen": -1.3335891962051392, "logps/rejected": -1.3908222913742065, "loss": 1.7273, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3335891962051392, "rewards/margins": 0.057233065366744995, "rewards/rejected": -1.3908222913742065, "semantic_entropy": 0.7875096201896667, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 5.370837927286815, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.0932573452591896, "logits/rejected": 0.016738364472985268, "logps/chosen": -1.2149693965911865, "logps/rejected": -1.310187816619873, "loss": 1.6236, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2149693965911865, "rewards/margins": 0.09521829336881638, "rewards/rejected": -1.310187816619873, "semantic_entropy": 0.8173314332962036, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 6.492356980182854, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.19326862692832947, "logits/rejected": -0.06877744942903519, "logps/chosen": -1.2750942707061768, "logps/rejected": -1.3119337558746338, "loss": 1.6725, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2750942707061768, "rewards/margins": 0.03683941066265106, "rewards/rejected": -1.3119337558746338, "semantic_entropy": 0.7947137355804443, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 4.862119719790757, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.13678404688835144, "logits/rejected": -0.12899407744407654, "logps/chosen": -1.2473198175430298, "logps/rejected": -1.3947761058807373, "loss": 1.6479, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2473198175430298, "rewards/margins": 0.14745637774467468, "rewards/rejected": -1.3947761058807373, "semantic_entropy": 0.8010649681091309, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 5.158933647353221, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.12214778363704681, "logits/rejected": 0.07019232213497162, "logps/chosen": -1.2472740411758423, "logps/rejected": -1.37154221534729, "loss": 1.6558, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2472740411758423, "rewards/margins": 0.12426825612783432, "rewards/rejected": -1.37154221534729, "semantic_entropy": 0.8170510530471802, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 4.530209328264566, "learning_rate": 9.472503898067645e-07, "logits/chosen": -0.03624961897730827, "logits/rejected": 0.012759166769683361, "logps/chosen": -1.2507244348526, "logps/rejected": -1.3828155994415283, "loss": 1.6562, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2507244348526, "rewards/margins": 0.13209107518196106, "rewards/rejected": -1.3828155994415283, "semantic_entropy": 0.8109642267227173, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 6.136279993150266, "learning_rate": 9.465519589364099e-07, "logits/chosen": -0.040978915989398956, "logits/rejected": 0.017650585621595383, "logps/chosen": -1.2742854356765747, "logps/rejected": -1.3824121952056885, "loss": 1.6737, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2742854356765747, "rewards/margins": 0.10812674462795258, "rewards/rejected": -1.3824121952056885, "semantic_entropy": 0.7988699674606323, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 5.161748258853723, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.10625822842121124, "logits/rejected": 0.04520512744784355, "logps/chosen": -1.2079899311065674, "logps/rejected": -1.4105498790740967, "loss": 1.6119, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2079899311065674, "rewards/margins": 0.2025599181652069, "rewards/rejected": -1.4105498790740967, "semantic_entropy": 0.8078333139419556, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 4.538704832836553, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.19896438717842102, "logits/rejected": -0.028302934020757675, "logps/chosen": -1.1961162090301514, "logps/rejected": -1.3436466455459595, "loss": 1.6118, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1961162090301514, "rewards/margins": 0.1475304216146469, "rewards/rejected": -1.3436466455459595, "semantic_entropy": 0.8314204216003418, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 5.23989836113406, "learning_rate": 9.444306989457805e-07, "logits/chosen": -0.07057606428861618, "logits/rejected": 0.012409623712301254, "logps/chosen": -1.2712451219558716, "logps/rejected": -1.342938780784607, "loss": 1.6731, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2712451219558716, "rewards/margins": 0.0716938003897667, "rewards/rejected": -1.342938780784607, "semantic_entropy": 0.8037014007568359, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 5.060728972233231, "learning_rate": 9.437149793562489e-07, "logits/chosen": -0.09365855157375336, "logits/rejected": 0.006497299764305353, "logps/chosen": -1.2644718885421753, "logps/rejected": -1.3352326154708862, "loss": 1.679, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2644718885421753, "rewards/margins": 0.07076076418161392, "rewards/rejected": -1.3352326154708862, "semantic_entropy": 0.8291547894477844, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 7.245529812499478, "learning_rate": 9.429949548420417e-07, "logits/chosen": -0.06301262974739075, "logits/rejected": -0.0016861639451235533, "logps/chosen": -1.3326492309570312, "logps/rejected": -1.4537262916564941, "loss": 1.7303, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3326492309570312, "rewards/margins": 0.12107710540294647, "rewards/rejected": -1.4537262916564941, "semantic_entropy": 0.7953641414642334, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 6.678566479186103, "learning_rate": 9.422706323888396e-07, "logits/chosen": -0.053825557231903076, "logits/rejected": -0.04011346027255058, "logps/chosen": -1.3036760091781616, "logps/rejected": -1.404149055480957, "loss": 1.6958, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3036760091781616, "rewards/margins": 0.1004730835556984, "rewards/rejected": -1.404149055480957, "semantic_entropy": 0.7841836214065552, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 4.106028135519392, "learning_rate": 9.415420190240225e-07, "logits/chosen": -0.03366105258464813, "logits/rejected": 0.10668013244867325, "logps/chosen": -1.2901681661605835, "logps/rejected": -1.3541765213012695, "loss": 1.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2901681661605835, "rewards/margins": 0.06400825083255768, "rewards/rejected": -1.3541765213012695, "semantic_entropy": 0.8056913614273071, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 6.631121155196355, "learning_rate": 9.408091218166002e-07, "logits/chosen": -0.013771966099739075, "logits/rejected": 0.025859486311674118, "logps/chosen": -1.268865704536438, "logps/rejected": -1.2891730070114136, "loss": 1.6762, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.268865704536438, "rewards/margins": 0.02030733972787857, "rewards/rejected": -1.2891730070114136, "semantic_entropy": 0.8145980834960938, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 5.569122058125533, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.07208576053380966, "logits/rejected": 0.17064586281776428, "logps/chosen": -1.319538950920105, "logps/rejected": -1.3693417310714722, "loss": 1.7107, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.319538950920105, "rewards/margins": 0.04980277270078659, "rewards/rejected": -1.3693417310714722, "semantic_entropy": 0.7823154926300049, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 6.926190485925234, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.16461318731307983, "logits/rejected": -0.04330313950777054, "logps/chosen": -1.3348883390426636, "logps/rejected": -1.4493154287338257, "loss": 1.7274, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.3348883390426636, "rewards/margins": 0.11442685127258301, "rewards/rejected": -1.4493154287338257, "semantic_entropy": 0.7851219177246094, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 4.387845431058533, "learning_rate": 9.38584798451817e-07, "logits/chosen": -0.05470917746424675, "logits/rejected": 0.05655473470687866, "logps/chosen": -1.257434368133545, "logps/rejected": -1.3779549598693848, "loss": 1.6592, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.257434368133545, "rewards/margins": 0.12052062898874283, "rewards/rejected": -1.3779549598693848, "semantic_entropy": 0.8035548329353333, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 6.640227985591931, "learning_rate": 9.37834837394275e-07, "logits/chosen": -0.07121571898460388, "logits/rejected": 0.022976867854595184, "logps/chosen": -1.3208197355270386, "logps/rejected": -1.4821239709854126, "loss": 1.7116, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3208197355270386, "rewards/margins": 0.16130423545837402, "rewards/rejected": -1.4821239709854126, "semantic_entropy": 0.7814778089523315, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 6.858933173739217, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.08027736097574234, "logits/rejected": 0.0420321524143219, "logps/chosen": -1.2354836463928223, "logps/rejected": -1.4372429847717285, "loss": 1.639, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2354836463928223, "rewards/margins": 0.20175938308238983, "rewards/rejected": -1.4372429847717285, "semantic_entropy": 0.8070446252822876, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 5.6852122461614, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.13024868071079254, "logits/rejected": -0.038981303572654724, "logps/chosen": -1.2636048793792725, "logps/rejected": -1.3045796155929565, "loss": 1.6682, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2636048793792725, "rewards/margins": 0.04097478464245796, "rewards/rejected": -1.3045796155929565, "semantic_entropy": 0.8092597723007202, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 6.155471849704311, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.11948658525943756, "logits/rejected": -0.030333450064063072, "logps/chosen": -1.2237807512283325, "logps/rejected": -1.3036266565322876, "loss": 1.6406, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2237807512283325, "rewards/margins": 0.07984593510627747, "rewards/rejected": -1.3036266565322876, "semantic_entropy": 0.8335784673690796, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 7.486402505103461, "learning_rate": 9.34792587788002e-07, "logits/chosen": -0.06079493835568428, "logits/rejected": 0.018440932035446167, "logps/chosen": -1.268488883972168, "logps/rejected": -1.3754494190216064, "loss": 1.6727, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.268488883972168, "rewards/margins": 0.10696063935756683, "rewards/rejected": -1.3754494190216064, "semantic_entropy": 0.808351993560791, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 5.022102181016947, "learning_rate": 9.34021460937342e-07, "logits/chosen": -0.02157517895102501, "logits/rejected": 0.055281806737184525, "logps/chosen": -1.221922755241394, "logps/rejected": -1.2992260456085205, "loss": 1.6323, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.221922755241394, "rewards/margins": 0.0773034617304802, "rewards/rejected": -1.2992260456085205, "semantic_entropy": 0.8208505511283875, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 4.051694677282333, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.19488653540611267, "logits/rejected": -0.09321919083595276, "logps/chosen": -1.3008278608322144, "logps/rejected": -1.3413996696472168, "loss": 1.7009, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3008278608322144, "rewards/margins": 0.04057171940803528, "rewards/rejected": -1.3413996696472168, "semantic_entropy": 0.8001400828361511, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 4.264731213338136, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.10242609679698944, "logits/rejected": 0.045723624527454376, "logps/chosen": -1.1527506113052368, "logps/rejected": -1.3518441915512085, "loss": 1.572, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1527506113052368, "rewards/margins": 0.1990937888622284, "rewards/rejected": -1.3518441915512085, "semantic_entropy": 0.8384343385696411, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 5.420063099821713, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.14833596348762512, "logits/rejected": -0.0082835853099823, "logps/chosen": -1.2772537469863892, "logps/rejected": -1.4049264192581177, "loss": 1.6796, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2772537469863892, "rewards/margins": 0.12767258286476135, "rewards/rejected": -1.4049264192581177, "semantic_entropy": 0.8047257661819458, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 5.4007354315603715, "learning_rate": 9.30894920180659e-07, "logits/chosen": -0.0524408221244812, "logits/rejected": 0.06384892016649246, "logps/chosen": -1.306804895401001, "logps/rejected": -1.2644994258880615, "loss": 1.7015, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.306804895401001, "rewards/margins": -0.042305391281843185, "rewards/rejected": -1.2644994258880615, "semantic_entropy": 0.7894402146339417, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 5.348554945703939, "learning_rate": 9.301028145701543e-07, "logits/chosen": -0.07659892737865448, "logits/rejected": 0.009451461024582386, "logps/chosen": -1.2014533281326294, "logps/rejected": -1.4177606105804443, "loss": 1.6122, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2014533281326294, "rewards/margins": 0.21630725264549255, "rewards/rejected": -1.4177606105804443, "semantic_entropy": 0.8214476704597473, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 7.549937018208959, "learning_rate": 9.293065361002563e-07, "logits/chosen": -0.03188856691122055, "logits/rejected": 0.012728346511721611, "logps/chosen": -1.216644525527954, "logps/rejected": -1.4594084024429321, "loss": 1.623, "rewards/accuracies": 0.59375, "rewards/chosen": -1.216644525527954, "rewards/margins": 0.24276407063007355, "rewards/rejected": -1.4594084024429321, "semantic_entropy": 0.812727153301239, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 5.5726452929114245, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.12857107818126678, "logits/rejected": -0.029686283320188522, "logps/chosen": -1.2757725715637207, "logps/rejected": -1.3304836750030518, "loss": 1.677, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2757725715637207, "rewards/margins": 0.05471107363700867, "rewards/rejected": -1.3304836750030518, "semantic_entropy": 0.80244380235672, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 5.190169281566926, "learning_rate": 9.277014915246792e-07, "logits/chosen": -0.028204062953591347, "logits/rejected": 0.00216894899494946, "logps/chosen": -1.2279856204986572, "logps/rejected": -1.3897521495819092, "loss": 1.6374, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2279856204986572, "rewards/margins": 0.16176660358905792, "rewards/rejected": -1.3897521495819092, "semantic_entropy": 0.8188594579696655, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 6.015067737256819, "learning_rate": 9.268927409911498e-07, "logits/chosen": -0.09832440316677094, "logits/rejected": -0.026854485273361206, "logps/chosen": -1.283251166343689, "logps/rejected": -1.324908971786499, "loss": 1.6846, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.283251166343689, "rewards/margins": 0.041657861322164536, "rewards/rejected": -1.324908971786499, "semantic_entropy": 0.802607536315918, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 5.662145675304518, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.15407513082027435, "logits/rejected": 0.01992628537118435, "logps/chosen": -1.3193175792694092, "logps/rejected": -1.4092413187026978, "loss": 1.7142, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3193175792694092, "rewards/margins": 0.08992379158735275, "rewards/rejected": -1.4092413187026978, "semantic_entropy": 0.7897613048553467, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 6.23370213890255, "learning_rate": 9.252628226650389e-07, "logits/chosen": -0.041138242930173874, "logits/rejected": 0.03306160122156143, "logps/chosen": -1.1997438669204712, "logps/rejected": -1.3120715618133545, "loss": 1.609, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1997438669204712, "rewards/margins": 0.11232763528823853, "rewards/rejected": -1.3120715618133545, "semantic_entropy": 0.8184932470321655, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 6.702534973769941, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.09069786965847015, "logits/rejected": 0.02401887997984886, "logps/chosen": -1.231222152709961, "logps/rejected": -1.4159982204437256, "loss": 1.6306, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.231222152709961, "rewards/margins": 0.1847762167453766, "rewards/rejected": -1.4159982204437256, "semantic_entropy": 0.7987192273139954, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 4.030206278130517, "learning_rate": 9.23616400771875e-07, "logits/chosen": -0.07227951288223267, "logits/rejected": 0.03814525157213211, "logps/chosen": -1.1940546035766602, "logps/rejected": -1.3633575439453125, "loss": 1.611, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1940546035766602, "rewards/margins": 0.16930294036865234, "rewards/rejected": -1.3633575439453125, "semantic_entropy": 0.8339089155197144, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 5.2586052924565125, "learning_rate": 9.227870209296395e-07, "logits/chosen": -0.05615914985537529, "logits/rejected": 0.017281047999858856, "logps/chosen": -1.3146439790725708, "logps/rejected": -1.4204986095428467, "loss": 1.7036, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3146439790725708, "rewards/margins": 0.10585472732782364, "rewards/rejected": -1.4204986095428467, "semantic_entropy": 0.7779796719551086, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 5.476726941586784, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.15622743964195251, "logits/rejected": -0.1328008472919464, "logps/chosen": -1.2732717990875244, "logps/rejected": -1.3330633640289307, "loss": 1.6695, "rewards/accuracies": 0.5, "rewards/chosen": -1.2732717990875244, "rewards/margins": 0.05979137867689133, "rewards/rejected": -1.3330633640289307, "semantic_entropy": 0.7925142049789429, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 3.956957583021994, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.12523582577705383, "logits/rejected": 0.008432741276919842, "logps/chosen": -1.2441338300704956, "logps/rejected": -1.3975975513458252, "loss": 1.6504, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2441338300704956, "rewards/margins": 0.15346381068229675, "rewards/rejected": -1.3975975513458252, "semantic_entropy": 0.812585175037384, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 6.2910711501661005, "learning_rate": 9.202743024992367e-07, "logits/chosen": -0.060137201100587845, "logits/rejected": 0.023209083825349808, "logps/chosen": -1.2254173755645752, "logps/rejected": -1.406592845916748, "loss": 1.6208, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2254173755645752, "rewards/margins": 0.18117542564868927, "rewards/rejected": -1.406592845916748, "semantic_entropy": 0.7907504439353943, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 6.547313369985432, "learning_rate": 9.194285638083293e-07, "logits/chosen": -0.05219448730349541, "logits/rejected": 0.06861809641122818, "logps/chosen": -1.296473741531372, "logps/rejected": -1.4099611043930054, "loss": 1.6944, "rewards/accuracies": 0.5, "rewards/chosen": -1.296473741531372, "rewards/margins": 0.11348732560873032, "rewards/rejected": -1.4099611043930054, "semantic_entropy": 0.7959024906158447, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 6.331445478391226, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.1084272712469101, "logits/rejected": -0.037467967718839645, "logps/chosen": -1.2708816528320312, "logps/rejected": -1.2850621938705444, "loss": 1.682, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.2708816528320312, "rewards/margins": 0.014180442318320274, "rewards/rejected": -1.2850621938705444, "semantic_entropy": 0.8223308324813843, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 6.367948039146346, "learning_rate": 9.177248867780583e-07, "logits/chosen": -0.11837979406118393, "logits/rejected": -0.025302868336439133, "logps/chosen": -1.3539178371429443, "logps/rejected": -1.3627796173095703, "loss": 1.7409, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3539178371429443, "rewards/margins": 0.008861854672431946, "rewards/rejected": -1.3627796173095703, "semantic_entropy": 0.7739442586898804, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 8.409486096359968, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.12011842429637909, "logits/rejected": -0.034160882234573364, "logps/chosen": -1.2351648807525635, "logps/rejected": -1.375335693359375, "loss": 1.6362, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2351648807525635, "rewards/margins": 0.1401708871126175, "rewards/rejected": -1.375335693359375, "semantic_entropy": 0.8021503686904907, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 6.762925113171552, "learning_rate": 9.16004998712373e-07, "logits/chosen": -0.04947897046804428, "logits/rejected": -0.01087774895131588, "logps/chosen": -1.189206838607788, "logps/rejected": -1.3980597257614136, "loss": 1.6087, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.189206838607788, "rewards/margins": 0.20885300636291504, "rewards/rejected": -1.3980597257614136, "semantic_entropy": 0.8389900922775269, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 5.0678116051486555, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.14438539743423462, "logits/rejected": 0.06495974957942963, "logps/chosen": -1.2952748537063599, "logps/rejected": -1.3950061798095703, "loss": 1.6885, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2952748537063599, "rewards/margins": 0.09973135590553284, "rewards/rejected": -1.3950061798095703, "semantic_entropy": 0.7863964438438416, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 5.823023614155188, "learning_rate": 9.142689663565577e-07, "logits/chosen": -0.07175926119089127, "logits/rejected": -0.0254055168479681, "logps/chosen": -1.2393056154251099, "logps/rejected": -1.3561909198760986, "loss": 1.6484, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2393056154251099, "rewards/margins": 0.11688525974750519, "rewards/rejected": -1.3561909198760986, "semantic_entropy": 0.8181071281433105, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 6.185998222752432, "learning_rate": 9.133949170991397e-07, "logits/chosen": -0.06148476153612137, "logits/rejected": -0.0034057931043207645, "logps/chosen": -1.2757203578948975, "logps/rejected": -1.3847815990447998, "loss": 1.6734, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2757203578948975, "rewards/margins": 0.10906125605106354, "rewards/rejected": -1.3847815990447998, "semantic_entropy": 0.7953287959098816, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 5.842897388338918, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.09118136763572693, "logits/rejected": 0.04449200630187988, "logps/chosen": -1.2590014934539795, "logps/rejected": -1.3106647729873657, "loss": 1.6638, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2590014934539795, "rewards/margins": 0.05166320130228996, "rewards/rejected": -1.3106647729873657, "semantic_entropy": 0.8095622062683105, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 6.343824060695247, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.10047060251235962, "logits/rejected": -0.02201777510344982, "logps/chosen": -1.2973721027374268, "logps/rejected": -1.3814334869384766, "loss": 1.706, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2973721027374268, "rewards/margins": 0.08406132459640503, "rewards/rejected": -1.3814334869384766, "semantic_entropy": 0.8172227740287781, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 7.814271954910578, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.1282876431941986, "logits/rejected": 0.0010832727421075106, "logps/chosen": -1.203259825706482, "logps/rejected": -1.3714569807052612, "loss": 1.6077, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.203259825706482, "rewards/margins": 0.1681971698999405, "rewards/rejected": -1.3714569807052612, "semantic_entropy": 0.8088679313659668, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 7.477106265696943, "learning_rate": 9.098586978599673e-07, "logits/chosen": -0.07497015595436096, "logits/rejected": 0.047747742384672165, "logps/chosen": -1.2527451515197754, "logps/rejected": -1.4383820295333862, "loss": 1.6507, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2527451515197754, "rewards/margins": 0.18563704192638397, "rewards/rejected": -1.4383820295333862, "semantic_entropy": 0.7959467768669128, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 5.711722800938201, "learning_rate": 9.089646803833588e-07, "logits/chosen": -0.055538348853588104, "logits/rejected": 0.08085554838180542, "logps/chosen": -1.2551352977752686, "logps/rejected": -1.3234446048736572, "loss": 1.6647, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2551352977752686, "rewards/margins": 0.06830926984548569, "rewards/rejected": -1.3234446048736572, "semantic_entropy": 0.8191736340522766, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 6.524922495360491, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.20068490505218506, "logits/rejected": 0.010654734447598457, "logps/chosen": -1.3146311044692993, "logps/rejected": -1.3960249423980713, "loss": 1.7143, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3146311044692993, "rewards/margins": 0.08139385282993317, "rewards/rejected": -1.3960249423980713, "semantic_entropy": 0.7994362711906433, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 3.7220922263387997, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.14626644551753998, "logits/rejected": 0.047650035470724106, "logps/chosen": -1.3367946147918701, "logps/rejected": -1.4443151950836182, "loss": 1.7301, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3367946147918701, "rewards/margins": 0.10752041637897491, "rewards/rejected": -1.4443151950836182, "semantic_entropy": 0.786649227142334, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 6.081730270542523, "learning_rate": 9.062588561782354e-07, "logits/chosen": -0.04031764343380928, "logits/rejected": 0.012033308856189251, "logps/chosen": -1.289323329925537, "logps/rejected": -1.419641137123108, "loss": 1.6869, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.289323329925537, "rewards/margins": 0.1303177773952484, "rewards/rejected": -1.419641137123108, "semantic_entropy": 0.7951552271842957, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 5.115485461218401, "learning_rate": 9.053490200202358e-07, "logits/chosen": -0.056047361344099045, "logits/rejected": 0.022970888763666153, "logps/chosen": -1.2908254861831665, "logps/rejected": -1.3853756189346313, "loss": 1.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2908254861831665, "rewards/margins": 0.09455028921365738, "rewards/rejected": -1.3853756189346313, "semantic_entropy": 0.8040348291397095, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 6.593641814003039, "learning_rate": 9.044352511642661e-07, "logits/chosen": -0.032947491854429245, "logits/rejected": -0.02290326915681362, "logps/chosen": -1.1785624027252197, "logps/rejected": -1.3062907457351685, "loss": 1.5977, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1785624027252197, "rewards/margins": 0.12772832810878754, "rewards/rejected": -1.3062907457351685, "semantic_entropy": 0.8382250666618347, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 5.738931483572361, "learning_rate": 9.03517558475716e-07, "logits/chosen": -0.06174477934837341, "logits/rejected": 0.01776334084570408, "logps/chosen": -1.2718206644058228, "logps/rejected": -1.3104609251022339, "loss": 1.6831, "rewards/accuracies": 0.5, "rewards/chosen": -1.2718206644058228, "rewards/margins": 0.03864036127924919, "rewards/rejected": -1.3104609251022339, "semantic_entropy": 0.8225141763687134, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 5.59104362373392, "learning_rate": 9.025959508580436e-07, "logits/chosen": -0.014590287581086159, "logits/rejected": 0.17901010811328888, "logps/chosen": -1.2556663751602173, "logps/rejected": -1.3845726251602173, "loss": 1.6664, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2556663751602173, "rewards/margins": 0.1289062201976776, "rewards/rejected": -1.3845726251602173, "semantic_entropy": 0.8215236663818359, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 5.408830982732125, "learning_rate": 9.016704372526905e-07, "logits/chosen": -0.05887206271290779, "logits/rejected": 0.06500892341136932, "logps/chosen": -1.19480562210083, "logps/rejected": -1.4278016090393066, "loss": 1.6026, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.19480562210083, "rewards/margins": 0.23299583792686462, "rewards/rejected": -1.4278016090393066, "semantic_entropy": 0.815626323223114, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 5.923895617065594, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.12767180800437927, "logits/rejected": -0.05649426579475403, "logps/chosen": -1.2113678455352783, "logps/rejected": -1.3004612922668457, "loss": 1.6207, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2113678455352783, "rewards/margins": 0.08909336477518082, "rewards/rejected": -1.3004612922668457, "semantic_entropy": 0.8187181353569031, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 7.52877498267525, "learning_rate": 8.998077280340981e-07, "logits/chosen": -0.03592721000313759, "logits/rejected": 0.02420203946530819, "logps/chosen": -1.3308932781219482, "logps/rejected": -1.3026918172836304, "loss": 1.7328, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3308932781219482, "rewards/margins": -0.02820158377289772, "rewards/rejected": -1.3026918172836304, "semantic_entropy": 0.8038387298583984, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 5.863624631471971, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.12852129340171814, "logits/rejected": 0.021799670532345772, "logps/chosen": -1.3022496700286865, "logps/rejected": -1.4470789432525635, "loss": 1.6968, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3022496700286865, "rewards/margins": 0.14482924342155457, "rewards/rejected": -1.4470789432525635, "semantic_entropy": 0.7890971899032593, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.20284461975097656, "eval_logits/rejected": 0.27608925104141235, "eval_logps/chosen": -1.2795928716659546, "eval_logps/rejected": -1.4092683792114258, "eval_loss": 1.6819959878921509, "eval_rewards/accuracies": 0.5497032403945923, "eval_rewards/chosen": -1.2795928716659546, "eval_rewards/margins": 0.1296754777431488, "eval_rewards/rejected": -1.4092683792114258, "eval_runtime": 34.1505, "eval_samples_per_second": 39.384, "eval_semantic_entropy": 0.8045028448104858, "eval_steps_per_second": 9.868, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 4.384289998485035, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.13876444101333618, "logits/rejected": 0.04191622883081436, "logps/chosen": -1.2357057332992554, "logps/rejected": -1.442514181137085, "loss": 1.6474, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2357057332992554, "rewards/margins": 0.2068084478378296, "rewards/rejected": -1.442514181137085, "semantic_entropy": 0.8234742879867554, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 4.472959968612943, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.14780445396900177, "logits/rejected": 0.0005028113955631852, "logps/chosen": -1.2203980684280396, "logps/rejected": -1.3630597591400146, "loss": 1.6259, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2203980684280396, "rewards/margins": 0.14266183972358704, "rewards/rejected": -1.3630597591400146, "semantic_entropy": 0.8109768629074097, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 4.881966408376479, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.09959457069635391, "logits/rejected": -0.021598082035779953, "logps/chosen": -1.2859299182891846, "logps/rejected": -1.440955638885498, "loss": 1.6845, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2859299182891846, "rewards/margins": 0.1550256907939911, "rewards/rejected": -1.440955638885498, "semantic_entropy": 0.7970997095108032, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 5.6845943162016885, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.04391685128211975, "logits/rejected": 0.0690603107213974, "logps/chosen": -1.151642918586731, "logps/rejected": -1.4087110757827759, "loss": 1.5672, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.151642918586731, "rewards/margins": 0.25706806778907776, "rewards/rejected": -1.4087110757827759, "semantic_entropy": 0.8311318159103394, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 7.123721480734956, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.004077051766216755, "logits/rejected": 0.052182864397764206, "logps/chosen": -1.3248927593231201, "logps/rejected": -1.4417506456375122, "loss": 1.7197, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3248927593231201, "rewards/margins": 0.11685770750045776, "rewards/rejected": -1.4417506456375122, "semantic_entropy": 0.7896238565444946, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 5.689040132299322, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.04262044280767441, "logits/rejected": 0.07575557380914688, "logps/chosen": -1.290032148361206, "logps/rejected": -1.3361377716064453, "loss": 1.695, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.290032148361206, "rewards/margins": 0.04610564559698105, "rewards/rejected": -1.3361377716064453, "semantic_entropy": 0.8099185824394226, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 6.4158379990991365, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.12830926477909088, "logits/rejected": -0.03494611382484436, "logps/chosen": -1.1956336498260498, "logps/rejected": -1.3366754055023193, "loss": 1.608, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1956336498260498, "rewards/margins": 0.1410418450832367, "rewards/rejected": -1.3366754055023193, "semantic_entropy": 0.8247678875923157, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 5.39383494286562, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.11883975565433502, "logits/rejected": -0.039402447640895844, "logps/chosen": -1.197518229484558, "logps/rejected": -1.3319509029388428, "loss": 1.6098, "rewards/accuracies": 0.53125, "rewards/chosen": -1.197518229484558, "rewards/margins": 0.13443264365196228, "rewards/rejected": -1.3319509029388428, "semantic_entropy": 0.8246237635612488, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 6.3248401909525125, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.09083203226327896, "logits/rejected": -0.05413592979311943, "logps/chosen": -1.1915227174758911, "logps/rejected": -1.3364546298980713, "loss": 1.6099, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1915227174758911, "rewards/margins": 0.14493197202682495, "rewards/rejected": -1.3364546298980713, "semantic_entropy": 0.8367231488227844, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 7.545426919187759, "learning_rate": 8.892874524469537e-07, "logits/chosen": -0.003770361887291074, "logits/rejected": 0.04842919111251831, "logps/chosen": -1.2200462818145752, "logps/rejected": -1.3947572708129883, "loss": 1.6205, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2200462818145752, "rewards/margins": 0.1747109740972519, "rewards/rejected": -1.3947572708129883, "semantic_entropy": 0.8009975552558899, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 6.157898013566581, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.10937478393316269, "logits/rejected": -0.028118645772337914, "logps/chosen": -1.2830754518508911, "logps/rejected": -1.3396224975585938, "loss": 1.6882, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2830754518508911, "rewards/margins": 0.056547004729509354, "rewards/rejected": -1.3396224975585938, "semantic_entropy": 0.8102725744247437, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 4.918286409229276, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.02799239382147789, "logits/rejected": -0.03634902089834213, "logps/chosen": -1.2154041528701782, "logps/rejected": -1.3670237064361572, "loss": 1.625, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2154041528701782, "rewards/margins": 0.15161976218223572, "rewards/rejected": -1.3670237064361572, "semantic_entropy": 0.8191461563110352, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 4.5807876748203284, "learning_rate": 8.863384473200411e-07, "logits/chosen": -0.06188160181045532, "logits/rejected": -0.007136875297874212, "logps/chosen": -1.2827012538909912, "logps/rejected": -1.4021260738372803, "loss": 1.6844, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2827012538909912, "rewards/margins": 0.11942493915557861, "rewards/rejected": -1.4021260738372803, "semantic_entropy": 0.8033052682876587, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 4.646362055866736, "learning_rate": 8.853479363438342e-07, "logits/chosen": -0.023722397163510323, "logits/rejected": 0.10943897813558578, "logps/chosen": -1.2921464443206787, "logps/rejected": -1.3392903804779053, "loss": 1.6951, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2921464443206787, "rewards/margins": 0.047143835574388504, "rewards/rejected": -1.3392903804779053, "semantic_entropy": 0.8059177398681641, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 3.8898477039656463, "learning_rate": 8.843536867202588e-07, "logits/chosen": -0.053395211696624756, "logits/rejected": 0.11513286828994751, "logps/chosen": -1.2908045053482056, "logps/rejected": -1.481777548789978, "loss": 1.6794, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2908045053482056, "rewards/margins": 0.19097325205802917, "rewards/rejected": -1.481777548789978, "semantic_entropy": 0.7772699594497681, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 5.588916714146491, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.1316641867160797, "logits/rejected": -0.03959476202726364, "logps/chosen": -1.3211480379104614, "logps/rejected": -1.407669186592102, "loss": 1.7234, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3211480379104614, "rewards/margins": 0.08652111142873764, "rewards/rejected": -1.407669186592102, "semantic_entropy": 0.8044565916061401, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 6.272144181334391, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.09557892382144928, "logits/rejected": 0.0968320220708847, "logps/chosen": -1.2634527683258057, "logps/rejected": -1.359107255935669, "loss": 1.6742, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2634527683258057, "rewards/margins": 0.09565435349941254, "rewards/rejected": -1.359107255935669, "semantic_entropy": 0.8214870691299438, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 5.305696703650051, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.11224023997783661, "logits/rejected": 0.038044869899749756, "logps/chosen": -1.1937047243118286, "logps/rejected": -1.3374521732330322, "loss": 1.6015, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1937047243118286, "rewards/margins": 0.14374738931655884, "rewards/rejected": -1.3374521732330322, "semantic_entropy": 0.8155407905578613, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 8.323864559825305, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.1992841213941574, "logits/rejected": -0.08126775920391083, "logps/chosen": -1.2971670627593994, "logps/rejected": -1.3862276077270508, "loss": 1.6861, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2971670627593994, "rewards/margins": 0.08906050771474838, "rewards/rejected": -1.3862276077270508, "semantic_entropy": 0.7778486609458923, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 7.306770644005474, "learning_rate": 8.793266977736342e-07, "logits/chosen": -0.05684704706072807, "logits/rejected": -0.09301404654979706, "logps/chosen": -1.3076379299163818, "logps/rejected": -1.3280279636383057, "loss": 1.7053, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.3076379299163818, "rewards/margins": 0.020390162244439125, "rewards/rejected": -1.3280279636383057, "semantic_entropy": 0.7954174280166626, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 5.450090018952432, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.03902271389961243, "logits/rejected": 0.08206261694431305, "logps/chosen": -1.276665449142456, "logps/rejected": -1.3414543867111206, "loss": 1.6838, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.276665449142456, "rewards/margins": 0.06478889286518097, "rewards/rejected": -1.3414543867111206, "semantic_entropy": 0.8141754269599915, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 6.381756118605349, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.07775630056858063, "logits/rejected": -0.006949305534362793, "logps/chosen": -1.2149673700332642, "logps/rejected": -1.336089849472046, "loss": 1.6281, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2149673700332642, "rewards/margins": 0.121122345328331, "rewards/rejected": -1.336089849472046, "semantic_entropy": 0.8262467384338379, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 4.90686169252877, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.05986091494560242, "logits/rejected": 0.12209532409906387, "logps/chosen": -1.3066062927246094, "logps/rejected": -1.3526155948638916, "loss": 1.7157, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.3066062927246094, "rewards/margins": 0.04600915685296059, "rewards/rejected": -1.3526155948638916, "semantic_entropy": 0.8180992007255554, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 4.141945213242054, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.07696859538555145, "logits/rejected": -0.030216004699468613, "logps/chosen": -1.1413288116455078, "logps/rejected": -1.3630990982055664, "loss": 1.5489, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1413288116455078, "rewards/margins": 0.22177033126354218, "rewards/rejected": -1.3630990982055664, "semantic_entropy": 0.8151764869689941, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 5.111699789917334, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.07430355250835419, "logits/rejected": 0.011962112970650196, "logps/chosen": -1.2972158193588257, "logps/rejected": -1.4589394330978394, "loss": 1.696, "rewards/accuracies": 0.5, "rewards/chosen": -1.2972158193588257, "rewards/margins": 0.1617235392332077, "rewards/rejected": -1.4589394330978394, "semantic_entropy": 0.797549843788147, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 7.469027977840694, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.010954265482723713, "logits/rejected": 0.044957805424928665, "logps/chosen": -1.2623331546783447, "logps/rejected": -1.3253600597381592, "loss": 1.6638, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2623331546783447, "rewards/margins": 0.06302676349878311, "rewards/rejected": -1.3253600597381592, "semantic_entropy": 0.8028810620307922, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 5.3536613892090745, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.1453741490840912, "logits/rejected": -0.018662970513105392, "logps/chosen": -1.197269320487976, "logps/rejected": -1.496684193611145, "loss": 1.6082, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.197269320487976, "rewards/margins": 0.2994149327278137, "rewards/rejected": -1.496684193611145, "semantic_entropy": 0.8218716382980347, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 5.956009860130467, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.08086270093917847, "logits/rejected": 0.020478151738643646, "logps/chosen": -1.2466636896133423, "logps/rejected": -1.3346598148345947, "loss": 1.6513, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2466636896133423, "rewards/margins": 0.08799618482589722, "rewards/rejected": -1.3346598148345947, "semantic_entropy": 0.8093579411506653, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 4.963897423199058, "learning_rate": 8.700471013287424e-07, "logits/chosen": -0.031677573919296265, "logits/rejected": -0.01724698767066002, "logps/chosen": -1.2455298900604248, "logps/rejected": -1.3457484245300293, "loss": 1.6526, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2455298900604248, "rewards/margins": 0.10021861642599106, "rewards/rejected": -1.3457484245300293, "semantic_entropy": 0.814106822013855, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 9.348788132420529, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.08055023849010468, "logits/rejected": -0.023808099329471588, "logps/chosen": -1.2197928428649902, "logps/rejected": -1.4276221990585327, "loss": 1.6278, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2197928428649902, "rewards/margins": 0.20782938599586487, "rewards/rejected": -1.4276221990585327, "semantic_entropy": 0.8159586191177368, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 4.177875859795629, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.0706687867641449, "logits/rejected": -0.003461043583229184, "logps/chosen": -1.2233806848526, "logps/rejected": -1.3133671283721924, "loss": 1.6363, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2233806848526, "rewards/margins": 0.08998642861843109, "rewards/rejected": -1.3133671283721924, "semantic_entropy": 0.8258953094482422, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 4.643160538785565, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.061286427080631256, "logits/rejected": 0.023486142978072166, "logps/chosen": -1.2042796611785889, "logps/rejected": -1.3470985889434814, "loss": 1.6104, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2042796611785889, "rewards/margins": 0.1428188979625702, "rewards/rejected": -1.3470985889434814, "semantic_entropy": 0.8123264312744141, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 5.06339476651371, "learning_rate": 8.658290552963827e-07, "logits/chosen": -0.046496741473674774, "logits/rejected": -0.03083505854010582, "logps/chosen": -1.2464790344238281, "logps/rejected": -1.3767074346542358, "loss": 1.6619, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2464790344238281, "rewards/margins": 0.13022840023040771, "rewards/rejected": -1.3767074346542358, "semantic_entropy": 0.8307523727416992, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 5.71205758758314, "learning_rate": 8.647656449367966e-07, "logits/chosen": -0.037111908197402954, "logits/rejected": 0.0983063206076622, "logps/chosen": -1.2675782442092896, "logps/rejected": -1.312664270401001, "loss": 1.667, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2675782442092896, "rewards/margins": 0.04508592560887337, "rewards/rejected": -1.312664270401001, "semantic_entropy": 0.7989345788955688, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 6.235335563830592, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.1255255490541458, "logits/rejected": -0.07064412534236908, "logps/chosen": -1.21595299243927, "logps/rejected": -1.3236777782440186, "loss": 1.6337, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.21595299243927, "rewards/margins": 0.10772490501403809, "rewards/rejected": -1.3236777782440186, "semantic_entropy": 0.8355158567428589, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 6.017407187646775, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.09471640735864639, "logits/rejected": 0.0032616101671010256, "logps/chosen": -1.2412772178649902, "logps/rejected": -1.3881863355636597, "loss": 1.6512, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2412772178649902, "rewards/margins": 0.1469092071056366, "rewards/rejected": -1.3881863355636597, "semantic_entropy": 0.8199090957641602, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 5.891945179569932, "learning_rate": 8.615542215511389e-07, "logits/chosen": -0.017667558044195175, "logits/rejected": 0.03805273398756981, "logps/chosen": -1.17448890209198, "logps/rejected": -1.2676188945770264, "loss": 1.5892, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.17448890209198, "rewards/margins": 0.09313005208969116, "rewards/rejected": -1.2676188945770264, "semantic_entropy": 0.8293744921684265, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 5.18130604337313, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.0026161670684814453, "logits/rejected": 0.028079137206077576, "logps/chosen": -1.2620675563812256, "logps/rejected": -1.3588144779205322, "loss": 1.6698, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2620675563812256, "rewards/margins": 0.09674691408872604, "rewards/rejected": -1.3588144779205322, "semantic_entropy": 0.8153875470161438, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 5.512070948234341, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.13053461909294128, "logits/rejected": -0.0246591754257679, "logps/chosen": -1.2394025325775146, "logps/rejected": -1.3598873615264893, "loss": 1.65, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2394025325775146, "rewards/margins": 0.12048468738794327, "rewards/rejected": -1.3598873615264893, "semantic_entropy": 0.821113109588623, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 4.271867342206073, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.034976787865161896, "logits/rejected": 0.015078052878379822, "logps/chosen": -1.2165769338607788, "logps/rejected": -1.3060983419418335, "loss": 1.6203, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2165769338607788, "rewards/margins": 0.0895213857293129, "rewards/rejected": -1.3060983419418335, "semantic_entropy": 0.8073854446411133, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 5.5036134733206605, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.14540931582450867, "logits/rejected": -0.041130583733320236, "logps/chosen": -1.1819874048233032, "logps/rejected": -1.3687695264816284, "loss": 1.5972, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1819874048233032, "rewards/margins": 0.18678195774555206, "rewards/rejected": -1.3687695264816284, "semantic_entropy": 0.8303861618041992, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 5.5849069289329805, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.03188782185316086, "logits/rejected": 0.08816667646169662, "logps/chosen": -1.2140344381332397, "logps/rejected": -1.3072352409362793, "loss": 1.6277, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2140344381332397, "rewards/margins": 0.09320063889026642, "rewards/rejected": -1.3072352409362793, "semantic_entropy": 0.8273522257804871, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 5.201705922580334, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.06574490666389465, "logits/rejected": 0.03563268110156059, "logps/chosen": -1.2429221868515015, "logps/rejected": -1.3118363618850708, "loss": 1.6555, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2429221868515015, "rewards/margins": 0.06891416013240814, "rewards/rejected": -1.3118363618850708, "semantic_entropy": 0.8250991702079773, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 6.170089758754659, "learning_rate": 8.539386179130977e-07, "logits/chosen": -0.05231968313455582, "logits/rejected": 0.0024616725277155638, "logps/chosen": -1.2225955724716187, "logps/rejected": -1.3166120052337646, "loss": 1.6379, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2225955724716187, "rewards/margins": 0.09401656687259674, "rewards/rejected": -1.3166120052337646, "semantic_entropy": 0.8306851387023926, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 6.150684559907577, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.1345938742160797, "logits/rejected": -0.022956429049372673, "logps/chosen": -1.2154592275619507, "logps/rejected": -1.332251787185669, "loss": 1.633, "rewards/accuracies": 0.5, "rewards/chosen": -1.2154592275619507, "rewards/margins": 0.11679251492023468, "rewards/rejected": -1.332251787185669, "semantic_entropy": 0.8350217938423157, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 4.560836416149868, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.038713328540325165, "logits/rejected": 0.05721114203333855, "logps/chosen": -1.2955657243728638, "logps/rejected": -1.3158836364746094, "loss": 1.698, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2955657243728638, "rewards/margins": 0.02031794935464859, "rewards/rejected": -1.3158836364746094, "semantic_entropy": 0.8049464225769043, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 6.581695796240072, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.06470213085412979, "logits/rejected": -0.019947880879044533, "logps/chosen": -1.2453309297561646, "logps/rejected": -1.2584359645843506, "loss": 1.6533, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2453309297561646, "rewards/margins": 0.013105114921927452, "rewards/rejected": -1.2584359645843506, "semantic_entropy": 0.8159577250480652, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 5.086438298599752, "learning_rate": 8.495110657042488e-07, "logits/chosen": -0.03997425362467766, "logits/rejected": 0.04699445515871048, "logps/chosen": -1.2495410442352295, "logps/rejected": -1.4128048419952393, "loss": 1.661, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2495410442352295, "rewards/margins": 0.16326376795768738, "rewards/rejected": -1.4128048419952393, "semantic_entropy": 0.8229337930679321, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 7.8941799107768915, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.08547230064868927, "logits/rejected": -0.012736174277961254, "logps/chosen": -1.2722465991973877, "logps/rejected": -1.310508370399475, "loss": 1.6763, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2722465991973877, "rewards/margins": 0.038261778652668, "rewards/rejected": -1.310508370399475, "semantic_entropy": 0.8081877827644348, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 5.41255300685634, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.20678424835205078, "logits/rejected": -0.10173904895782471, "logps/chosen": -1.2819212675094604, "logps/rejected": -1.2640694379806519, "loss": 1.6992, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2819212675094604, "rewards/margins": -0.017851637676358223, "rewards/rejected": -1.2640694379806519, "semantic_entropy": 0.8345072865486145, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 5.873896376335427, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.07598160952329636, "logits/rejected": -0.0017624751199036837, "logps/chosen": -1.209578514099121, "logps/rejected": -1.3187006711959839, "loss": 1.6135, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.209578514099121, "rewards/margins": 0.10912225395441055, "rewards/rejected": -1.3187006711959839, "semantic_entropy": 0.8078088760375977, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 5.938488533822175, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.04042092710733414, "logits/rejected": 0.020467430353164673, "logps/chosen": -1.2804787158966064, "logps/rejected": -1.3579797744750977, "loss": 1.686, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2804787158966064, "rewards/margins": 0.07750102132558823, "rewards/rejected": -1.3579797744750977, "semantic_entropy": 0.8109739422798157, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 5.560358976815608, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.08492116630077362, "logits/rejected": -0.026765596121549606, "logps/chosen": -1.1385555267333984, "logps/rejected": -1.3582890033721924, "loss": 1.5542, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1385555267333984, "rewards/margins": 0.21973347663879395, "rewards/rejected": -1.3582890033721924, "semantic_entropy": 0.8313388824462891, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 5.407227347445459, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.12608292698860168, "logits/rejected": -0.0014965459704399109, "logps/chosen": -1.3027405738830566, "logps/rejected": -1.3205264806747437, "loss": 1.7095, "rewards/accuracies": 0.46875, "rewards/chosen": -1.3027405738830566, "rewards/margins": 0.017785947769880295, "rewards/rejected": -1.3205264806747437, "semantic_entropy": 0.8135707974433899, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 4.854356141130862, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.16710661351680756, "logits/rejected": -0.058289725333452225, "logps/chosen": -1.273058533668518, "logps/rejected": -1.3336715698242188, "loss": 1.6862, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.273058533668518, "rewards/margins": 0.060613006353378296, "rewards/rejected": -1.3336715698242188, "semantic_entropy": 0.8262566328048706, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 5.258151230973433, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.056605733931064606, "logits/rejected": 0.0835668072104454, "logps/chosen": -1.2166283130645752, "logps/rejected": -1.4617342948913574, "loss": 1.6224, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2166283130645752, "rewards/margins": 0.24510610103607178, "rewards/rejected": -1.4617342948913574, "semantic_entropy": 0.8114503622055054, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 6.11411474750491, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.09445194154977798, "logits/rejected": -0.031572047621011734, "logps/chosen": -1.173677682876587, "logps/rejected": -1.3270585536956787, "loss": 1.5895, "rewards/accuracies": 0.5625, "rewards/chosen": -1.173677682876587, "rewards/margins": 0.15338096022605896, "rewards/rejected": -1.3270585536956787, "semantic_entropy": 0.8316260576248169, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 4.58749164329407, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.09511774778366089, "logits/rejected": 0.07577136904001236, "logps/chosen": -1.2444308996200562, "logps/rejected": -1.3644696474075317, "loss": 1.6485, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2444308996200562, "rewards/margins": 0.12003858387470245, "rewards/rejected": -1.3644696474075317, "semantic_entropy": 0.8081377148628235, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 5.48126487344059, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.055772941559553146, "logits/rejected": -0.010040899738669395, "logps/chosen": -1.2120511531829834, "logps/rejected": -1.3385181427001953, "loss": 1.6225, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2120511531829834, "rewards/margins": 0.12646697461605072, "rewards/rejected": -1.3385181427001953, "semantic_entropy": 0.820903480052948, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 4.691863357552319, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.1622186303138733, "logits/rejected": 0.04522024467587471, "logps/chosen": -1.2555381059646606, "logps/rejected": -1.3378403186798096, "loss": 1.6523, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2555381059646606, "rewards/margins": 0.08230231702327728, "rewards/rejected": -1.3378403186798096, "semantic_entropy": 0.7935494184494019, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 5.554844099710117, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.09096790105104446, "logits/rejected": 0.027966951951384544, "logps/chosen": -1.2693850994110107, "logps/rejected": -1.3648723363876343, "loss": 1.671, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2693850994110107, "rewards/margins": 0.09548751264810562, "rewards/rejected": -1.3648723363876343, "semantic_entropy": 0.8032118678092957, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 6.013951197935747, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.16043396294116974, "logits/rejected": -0.057415615767240524, "logps/chosen": -1.2434272766113281, "logps/rejected": -1.382820725440979, "loss": 1.6446, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2434272766113281, "rewards/margins": 0.13939353823661804, "rewards/rejected": -1.382820725440979, "semantic_entropy": 0.802310585975647, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 4.360970116488663, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.11019855737686157, "logits/rejected": -0.05252770334482193, "logps/chosen": -1.2062979936599731, "logps/rejected": -1.285973310470581, "loss": 1.6207, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2062979936599731, "rewards/margins": 0.07967547327280045, "rewards/rejected": -1.285973310470581, "semantic_entropy": 0.8288606405258179, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 4.8511294530155356, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.122353695333004, "logits/rejected": -0.059392500668764114, "logps/chosen": -1.1329548358917236, "logps/rejected": -1.3003451824188232, "loss": 1.5532, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1329548358917236, "rewards/margins": 0.16739040613174438, "rewards/rejected": -1.3003451824188232, "semantic_entropy": 0.8405397534370422, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 5.610388045486456, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.2473979890346527, "logits/rejected": -0.07191173732280731, "logps/chosen": -1.312682867050171, "logps/rejected": -1.3695838451385498, "loss": 1.7164, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.312682867050171, "rewards/margins": 0.05690097063779831, "rewards/rejected": -1.3695838451385498, "semantic_entropy": 0.8075262308120728, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 5.44967227300987, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.16695664823055267, "logits/rejected": -0.052744895219802856, "logps/chosen": -1.1696319580078125, "logps/rejected": -1.338442087173462, "loss": 1.5793, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1696319580078125, "rewards/margins": 0.16881009936332703, "rewards/rejected": -1.338442087173462, "semantic_entropy": 0.819388747215271, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 4.73045752977982, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.19597987830638885, "logits/rejected": -0.04908065125346184, "logps/chosen": -1.2401734590530396, "logps/rejected": -1.3636665344238281, "loss": 1.6504, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2401734590530396, "rewards/margins": 0.12349303066730499, "rewards/rejected": -1.3636665344238281, "semantic_entropy": 0.820360541343689, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 6.3412471503952785, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.09433646500110626, "logits/rejected": -0.010939409956336021, "logps/chosen": -1.2406576871871948, "logps/rejected": -1.3782161474227905, "loss": 1.6421, "rewards/accuracies": 0.5, "rewards/chosen": -1.2406576871871948, "rewards/margins": 0.1375584602355957, "rewards/rejected": -1.3782161474227905, "semantic_entropy": 0.8028027415275574, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 6.567286056755344, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.17711855471134186, "logits/rejected": -0.03515800088644028, "logps/chosen": -1.2306888103485107, "logps/rejected": -1.328763723373413, "loss": 1.6433, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2306888103485107, "rewards/margins": 0.09807483106851578, "rewards/rejected": -1.328763723373413, "semantic_entropy": 0.8251951932907104, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 5.148684242874604, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.11584653705358505, "logits/rejected": 0.05853165313601494, "logps/chosen": -1.2751115560531616, "logps/rejected": -1.3643481731414795, "loss": 1.6745, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2751115560531616, "rewards/margins": 0.08923659473657608, "rewards/rejected": -1.3643481731414795, "semantic_entropy": 0.7988117337226868, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 4.975063636361525, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.08722196519374847, "logits/rejected": -0.03299152851104736, "logps/chosen": -1.1688838005065918, "logps/rejected": -1.3613990545272827, "loss": 1.5865, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1688838005065918, "rewards/margins": 0.19251517951488495, "rewards/rejected": -1.3613990545272827, "semantic_entropy": 0.8352289199829102, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 5.999892890389526, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.17663012444972992, "logits/rejected": -0.15736006200313568, "logps/chosen": -1.2256001234054565, "logps/rejected": -1.329024314880371, "loss": 1.627, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2256001234054565, "rewards/margins": 0.10342420637607574, "rewards/rejected": -1.329024314880371, "semantic_entropy": 0.8028705716133118, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 6.698901615267117, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.06786809116601944, "logits/rejected": -0.01443662028759718, "logps/chosen": -1.1722146272659302, "logps/rejected": -1.4164336919784546, "loss": 1.5854, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1722146272659302, "rewards/margins": 0.244219109416008, "rewards/rejected": -1.4164336919784546, "semantic_entropy": 0.8263869285583496, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 6.229810639925225, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.11604873836040497, "logits/rejected": -0.003046951489523053, "logps/chosen": -1.2357218265533447, "logps/rejected": -1.386966347694397, "loss": 1.6307, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2357218265533447, "rewards/margins": 0.1512444019317627, "rewards/rejected": -1.386966347694397, "semantic_entropy": 0.7900136113166809, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 6.145679677567474, "learning_rate": 8.182413576596385e-07, "logits/chosen": -0.057274747639894485, "logits/rejected": 0.015822093933820724, "logps/chosen": -1.1706548929214478, "logps/rejected": -1.334033489227295, "loss": 1.5827, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1706548929214478, "rewards/margins": 0.1633785367012024, "rewards/rejected": -1.334033489227295, "semantic_entropy": 0.8241391181945801, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 5.87762531156081, "learning_rate": 8.170386052085389e-07, "logits/chosen": -0.009640065021812916, "logits/rejected": 0.09695431590080261, "logps/chosen": -1.239251971244812, "logps/rejected": -1.3678679466247559, "loss": 1.6433, "rewards/accuracies": 0.5, "rewards/chosen": -1.239251971244812, "rewards/margins": 0.12861596047878265, "rewards/rejected": -1.3678679466247559, "semantic_entropy": 0.8080918192863464, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 6.769308474700579, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.08172724395990372, "logits/rejected": 0.05255339294672012, "logps/chosen": -1.2671904563903809, "logps/rejected": -1.320107102394104, "loss": 1.6708, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2671904563903809, "rewards/margins": 0.05291660875082016, "rewards/rejected": -1.320107102394104, "semantic_entropy": 0.8073102831840515, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 6.197003910367964, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.1044461578130722, "logits/rejected": -0.025031039491295815, "logps/chosen": -1.267733097076416, "logps/rejected": -1.3939491510391235, "loss": 1.6754, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.267733097076416, "rewards/margins": 0.1262160688638687, "rewards/rejected": -1.3939491510391235, "semantic_entropy": 0.8153647184371948, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 6.632881977215973, "learning_rate": 8.134119392211476e-07, "logits/chosen": -0.006975066848099232, "logits/rejected": 0.11662169545888901, "logps/chosen": -1.186049222946167, "logps/rejected": -1.4129503965377808, "loss": 1.5971, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.186049222946167, "rewards/margins": 0.22690114378929138, "rewards/rejected": -1.4129503965377808, "semantic_entropy": 0.8221061825752258, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 5.8780646925649105, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.11857473850250244, "logits/rejected": 0.012356376275420189, "logps/chosen": -1.270699143409729, "logps/rejected": -1.2836253643035889, "loss": 1.6822, "rewards/accuracies": 0.46875, "rewards/chosen": -1.270699143409729, "rewards/margins": 0.012926402501761913, "rewards/rejected": -1.2836253643035889, "semantic_entropy": 0.8229135274887085, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.15091478824615479, "eval_logits/rejected": 0.22157339751720428, "eval_logps/chosen": -1.2760646343231201, "eval_logps/rejected": -1.408192753791809, "eval_loss": 1.678680419921875, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2760646343231201, "eval_rewards/margins": 0.13212820887565613, "eval_rewards/rejected": -1.408192753791809, "eval_runtime": 34.2721, "eval_samples_per_second": 39.245, "eval_semantic_entropy": 0.8049272298812866, "eval_steps_per_second": 9.833, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 7.12442580804879, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.08601008355617523, "logits/rejected": -0.024334026500582695, "logps/chosen": -1.2772554159164429, "logps/rejected": -1.3116166591644287, "loss": 1.6866, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2772554159164429, "rewards/margins": 0.03436121717095375, "rewards/rejected": -1.3116166591644287, "semantic_entropy": 0.818762481212616, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 6.91230109964103, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.03125903010368347, "logits/rejected": 0.0528080090880394, "logps/chosen": -1.2067348957061768, "logps/rejected": -1.3177826404571533, "loss": 1.6186, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2067348957061768, "rewards/margins": 0.11104774475097656, "rewards/rejected": -1.3177826404571533, "semantic_entropy": 0.8237894177436829, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 6.6597771440944715, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.0915103554725647, "logits/rejected": 0.042414985597133636, "logps/chosen": -1.175306797027588, "logps/rejected": -1.314026951789856, "loss": 1.5955, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.175306797027588, "rewards/margins": 0.13872012495994568, "rewards/rejected": -1.314026951789856, "semantic_entropy": 0.8403580784797668, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 4.596477098746115, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.126971036195755, "logits/rejected": 0.0011516213417053223, "logps/chosen": -1.2869563102722168, "logps/rejected": -1.4157096147537231, "loss": 1.6897, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2869563102722168, "rewards/margins": 0.12875331938266754, "rewards/rejected": -1.4157096147537231, "semantic_entropy": 0.8054868578910828, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 5.475068062106701, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.05900750681757927, "logits/rejected": 0.03933548182249069, "logps/chosen": -1.2418310642242432, "logps/rejected": -1.375864028930664, "loss": 1.6448, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2418310642242432, "rewards/margins": 0.13403303921222687, "rewards/rejected": -1.375864028930664, "semantic_entropy": 0.8059372901916504, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 3.852417609025192, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.1400444060564041, "logits/rejected": -0.12435641139745712, "logps/chosen": -1.1791813373565674, "logps/rejected": -1.3292489051818848, "loss": 1.5849, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1791813373565674, "rewards/margins": 0.15006767213344574, "rewards/rejected": -1.3292489051818848, "semantic_entropy": 0.8114988207817078, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 5.890726774201513, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.14998051524162292, "logits/rejected": -0.01278824545443058, "logps/chosen": -1.1815717220306396, "logps/rejected": -1.3059090375900269, "loss": 1.5955, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1815717220306396, "rewards/margins": 0.12433715164661407, "rewards/rejected": -1.3059090375900269, "semantic_entropy": 0.8277875781059265, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 7.627714481061457, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.1823725700378418, "logits/rejected": -0.1373450607061386, "logps/chosen": -1.2081711292266846, "logps/rejected": -1.2956795692443848, "loss": 1.6158, "rewards/accuracies": 0.5, "rewards/chosen": -1.2081711292266846, "rewards/margins": 0.08750840276479721, "rewards/rejected": -1.2956795692443848, "semantic_entropy": 0.8151972889900208, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 6.7546257721574205, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.060518860816955566, "logits/rejected": -0.024515826255083084, "logps/chosen": -1.2172151803970337, "logps/rejected": -1.346688985824585, "loss": 1.629, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2172151803970337, "rewards/margins": 0.12947380542755127, "rewards/rejected": -1.346688985824585, "semantic_entropy": 0.8236492872238159, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 4.347474255951684, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.1731574833393097, "logits/rejected": -0.09120061248540878, "logps/chosen": -1.2536484003067017, "logps/rejected": -1.3015358448028564, "loss": 1.6562, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2536484003067017, "rewards/margins": 0.04788777604699135, "rewards/rejected": -1.3015358448028564, "semantic_entropy": 0.8051711916923523, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 5.802444663379252, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.08063340187072754, "logits/rejected": 0.01585637405514717, "logps/chosen": -1.2236363887786865, "logps/rejected": -1.307787299156189, "loss": 1.627, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2236363887786865, "rewards/margins": 0.08415073156356812, "rewards/rejected": -1.307787299156189, "semantic_entropy": 0.8066498637199402, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 5.281015991832143, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.06917105615139008, "logits/rejected": -0.03510887175798416, "logps/chosen": -1.2487597465515137, "logps/rejected": -1.3957663774490356, "loss": 1.6504, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2487597465515137, "rewards/margins": 0.1470065414905548, "rewards/rejected": -1.3957663774490356, "semantic_entropy": 0.8032780885696411, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 4.942510451681948, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.09045930206775665, "logits/rejected": 0.006872578524053097, "logps/chosen": -1.2319642305374146, "logps/rejected": -1.3069250583648682, "loss": 1.6399, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2319642305374146, "rewards/margins": 0.07496093213558197, "rewards/rejected": -1.3069250583648682, "semantic_entropy": 0.8157804608345032, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 8.848017579817293, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.1362834870815277, "logits/rejected": -0.0147573072463274, "logps/chosen": -1.1804348230361938, "logps/rejected": -1.399850606918335, "loss": 1.5928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1804348230361938, "rewards/margins": 0.21941566467285156, "rewards/rejected": -1.399850606918335, "semantic_entropy": 0.8247357606887817, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 5.878551679616, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.14504846930503845, "logits/rejected": -0.06255142390727997, "logps/chosen": -1.21340811252594, "logps/rejected": -1.3962210416793823, "loss": 1.6249, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.21340811252594, "rewards/margins": 0.18281318247318268, "rewards/rejected": -1.3962210416793823, "semantic_entropy": 0.8230404853820801, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 5.599952342951236, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.1722463071346283, "logits/rejected": -0.03986423462629318, "logps/chosen": -1.1484935283660889, "logps/rejected": -1.39655339717865, "loss": 1.566, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1484935283660889, "rewards/margins": 0.24805989861488342, "rewards/rejected": -1.39655339717865, "semantic_entropy": 0.8349694013595581, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 6.696513810589443, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.13606911897659302, "logits/rejected": -0.04054518789052963, "logps/chosen": -1.2057595252990723, "logps/rejected": -1.3205280303955078, "loss": 1.6114, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2057595252990723, "rewards/margins": 0.11476851999759674, "rewards/rejected": -1.3205280303955078, "semantic_entropy": 0.8113565444946289, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 6.63338589183981, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.1825679987668991, "logits/rejected": -0.18062610924243927, "logps/chosen": -1.1790488958358765, "logps/rejected": -1.3007428646087646, "loss": 1.5887, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1790488958358765, "rewards/margins": 0.12169384956359863, "rewards/rejected": -1.3007428646087646, "semantic_entropy": 0.8192791938781738, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 3.695656933857439, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.12944301962852478, "logits/rejected": -0.021017763763666153, "logps/chosen": -1.3422536849975586, "logps/rejected": -1.4266749620437622, "loss": 1.7329, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3422536849975586, "rewards/margins": 0.08442128449678421, "rewards/rejected": -1.4266749620437622, "semantic_entropy": 0.7812927961349487, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 6.558995484640078, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.055547647178173065, "logits/rejected": -0.03080040216445923, "logps/chosen": -1.3481782674789429, "logps/rejected": -1.3808497190475464, "loss": 1.7358, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3481782674789429, "rewards/margins": 0.03267144411802292, "rewards/rejected": -1.3808497190475464, "semantic_entropy": 0.7753017544746399, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 4.49592132578851, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.08066151291131973, "logits/rejected": 0.006268954835832119, "logps/chosen": -1.2701483964920044, "logps/rejected": -1.3477660417556763, "loss": 1.6773, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2701483964920044, "rewards/margins": 0.07761768996715546, "rewards/rejected": -1.3477660417556763, "semantic_entropy": 0.8142101168632507, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 6.481057816900277, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.15883247554302216, "logits/rejected": -0.02258806861937046, "logps/chosen": -1.2088545560836792, "logps/rejected": -1.5045421123504639, "loss": 1.6142, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2088545560836792, "rewards/margins": 0.29568761587142944, "rewards/rejected": -1.5045421123504639, "semantic_entropy": 0.8106829524040222, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 6.051907453925619, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.0689288005232811, "logits/rejected": -0.026320893317461014, "logps/chosen": -1.2077903747558594, "logps/rejected": -1.315643310546875, "loss": 1.6132, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2077903747558594, "rewards/margins": 0.1078527420759201, "rewards/rejected": -1.315643310546875, "semantic_entropy": 0.810825526714325, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 5.797932814362074, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.12890690565109253, "logits/rejected": -0.1112586036324501, "logps/chosen": -1.259321928024292, "logps/rejected": -1.3588775396347046, "loss": 1.6607, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.259321928024292, "rewards/margins": 0.09955565631389618, "rewards/rejected": -1.3588775396347046, "semantic_entropy": 0.8028402328491211, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 4.978055970222138, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.09839232265949249, "logits/rejected": -0.05145517736673355, "logps/chosen": -1.1765741109848022, "logps/rejected": -1.3666942119598389, "loss": 1.5914, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1765741109848022, "rewards/margins": 0.19012007117271423, "rewards/rejected": -1.3666942119598389, "semantic_entropy": 0.8296197056770325, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 5.782895372860012, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.07921566069126129, "logits/rejected": 0.10856817662715912, "logps/chosen": -1.2558425664901733, "logps/rejected": -1.3866268396377563, "loss": 1.6563, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2558425664901733, "rewards/margins": 0.13078443706035614, "rewards/rejected": -1.3866268396377563, "semantic_entropy": 0.8008186221122742, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 4.544358347979543, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.07371363788843155, "logits/rejected": 0.05031762272119522, "logps/chosen": -1.263201117515564, "logps/rejected": -1.3672269582748413, "loss": 1.6638, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.263201117515564, "rewards/margins": 0.10402567684650421, "rewards/rejected": -1.3672269582748413, "semantic_entropy": 0.8011261224746704, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 6.113314574359988, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.12351904064416885, "logits/rejected": -0.014905254356563091, "logps/chosen": -1.2339433431625366, "logps/rejected": -1.3314110040664673, "loss": 1.6481, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2339433431625366, "rewards/margins": 0.09746749699115753, "rewards/rejected": -1.3314110040664673, "semantic_entropy": 0.8284038305282593, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 5.520358629521573, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.10774321854114532, "logits/rejected": -0.0506608672440052, "logps/chosen": -1.1220496892929077, "logps/rejected": -1.3310532569885254, "loss": 1.5395, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1220496892929077, "rewards/margins": 0.2090034782886505, "rewards/rejected": -1.3310532569885254, "semantic_entropy": 0.8348110914230347, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 4.175086560902407, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.12377146631479263, "logits/rejected": -0.08511961251497269, "logps/chosen": -1.2904143333435059, "logps/rejected": -1.3529952764511108, "loss": 1.6882, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2904143333435059, "rewards/margins": 0.06258104741573334, "rewards/rejected": -1.3529952764511108, "semantic_entropy": 0.7956458330154419, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 5.507957117734676, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.06910867989063263, "logits/rejected": -0.017303049564361572, "logps/chosen": -1.2697410583496094, "logps/rejected": -1.393251895904541, "loss": 1.6705, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2697410583496094, "rewards/margins": 0.12351079285144806, "rewards/rejected": -1.393251895904541, "semantic_entropy": 0.8015950918197632, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 6.568410109727489, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.16185645759105682, "logits/rejected": -0.07282097637653351, "logps/chosen": -1.1826726198196411, "logps/rejected": -1.3627774715423584, "loss": 1.5808, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1826726198196411, "rewards/margins": 0.1801048219203949, "rewards/rejected": -1.3627774715423584, "semantic_entropy": 0.796201765537262, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 5.698854330243038, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.15882770717144012, "logits/rejected": -0.014342134818434715, "logps/chosen": -1.2423174381256104, "logps/rejected": -1.4141271114349365, "loss": 1.6497, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2423174381256104, "rewards/margins": 0.17180964350700378, "rewards/rejected": -1.4141271114349365, "semantic_entropy": 0.8147643804550171, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 4.955824670510153, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.12275056540966034, "logits/rejected": 0.00627850741147995, "logps/chosen": -1.2063573598861694, "logps/rejected": -1.388870358467102, "loss": 1.6169, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2063573598861694, "rewards/margins": 0.182513028383255, "rewards/rejected": -1.388870358467102, "semantic_entropy": 0.8210151791572571, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 6.107857513309157, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.13856028020381927, "logits/rejected": -0.004801489412784576, "logps/chosen": -1.2714061737060547, "logps/rejected": -1.3162667751312256, "loss": 1.6822, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2714061737060547, "rewards/margins": 0.044860538095235825, "rewards/rejected": -1.3162667751312256, "semantic_entropy": 0.8215457797050476, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 7.755495190158921, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.13084974884986877, "logits/rejected": -0.08004920929670334, "logps/chosen": -1.208836317062378, "logps/rejected": -1.3525402545928955, "loss": 1.6235, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.208836317062378, "rewards/margins": 0.14370402693748474, "rewards/rejected": -1.3525402545928955, "semantic_entropy": 0.829369843006134, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 6.678665354798164, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.041762907058000565, "logits/rejected": 0.06948588043451309, "logps/chosen": -1.2378121614456177, "logps/rejected": -1.320497751235962, "loss": 1.6471, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2378121614456177, "rewards/margins": 0.08268565684556961, "rewards/rejected": -1.320497751235962, "semantic_entropy": 0.8186162710189819, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 6.311668433509126, "learning_rate": 7.638933899585354e-07, "logits/chosen": -0.0028499483596533537, "logits/rejected": 0.03154969587922096, "logps/chosen": -1.2197879552841187, "logps/rejected": -1.3148596286773682, "loss": 1.6316, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2197879552841187, "rewards/margins": 0.09507165849208832, "rewards/rejected": -1.3148596286773682, "semantic_entropy": 0.8235753178596497, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 6.998674551266936, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.06386388838291168, "logits/rejected": 0.011675643734633923, "logps/chosen": -1.200219750404358, "logps/rejected": -1.432844877243042, "loss": 1.6112, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.200219750404358, "rewards/margins": 0.23262527585029602, "rewards/rejected": -1.432844877243042, "semantic_entropy": 0.8219127655029297, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 5.560226923737064, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.15221264958381653, "logits/rejected": -0.1647745817899704, "logps/chosen": -1.22306489944458, "logps/rejected": -1.3500452041625977, "loss": 1.6251, "rewards/accuracies": 0.59375, "rewards/chosen": -1.22306489944458, "rewards/margins": 0.12698033452033997, "rewards/rejected": -1.3500452041625977, "semantic_entropy": 0.8039810061454773, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 4.340448775548745, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.14433810114860535, "logits/rejected": 0.004729229956865311, "logps/chosen": -1.1667072772979736, "logps/rejected": -1.3115758895874023, "loss": 1.5869, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1667072772979736, "rewards/margins": 0.14486858248710632, "rewards/rejected": -1.3115758895874023, "semantic_entropy": 0.8402953147888184, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 4.994918232662711, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.17033329606056213, "logits/rejected": -0.0755605548620224, "logps/chosen": -1.1162188053131104, "logps/rejected": -1.3223466873168945, "loss": 1.5412, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1162188053131104, "rewards/margins": 0.20612795650959015, "rewards/rejected": -1.3223466873168945, "semantic_entropy": 0.8500041961669922, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 5.422076240093669, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.14876317977905273, "logits/rejected": -0.08922292292118073, "logps/chosen": -1.3371882438659668, "logps/rejected": -1.4671502113342285, "loss": 1.7277, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3371882438659668, "rewards/margins": 0.12996208667755127, "rewards/rejected": -1.4671502113342285, "semantic_entropy": 0.7809482216835022, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 4.762839490853145, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.1990375965833664, "logits/rejected": -0.09777601808309555, "logps/chosen": -1.1730186939239502, "logps/rejected": -1.2726038694381714, "loss": 1.5945, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1730186939239502, "rewards/margins": 0.09958525747060776, "rewards/rejected": -1.2726038694381714, "semantic_entropy": 0.8429107666015625, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 5.703454242644063, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.16057462990283966, "logits/rejected": 0.0023884635884314775, "logps/chosen": -1.1899974346160889, "logps/rejected": -1.3832486867904663, "loss": 1.6011, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1899974346160889, "rewards/margins": 0.19325141608715057, "rewards/rejected": -1.3832486867904663, "semantic_entropy": 0.8221065402030945, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 7.511591090739507, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.19030122458934784, "logits/rejected": -0.08395327627658844, "logps/chosen": -1.2479732036590576, "logps/rejected": -1.4739317893981934, "loss": 1.6504, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2479732036590576, "rewards/margins": 0.22595839202404022, "rewards/rejected": -1.4739317893981934, "semantic_entropy": 0.8047725558280945, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 7.133565895701561, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.13109268248081207, "logits/rejected": 0.02032286301255226, "logps/chosen": -1.1968202590942383, "logps/rejected": -1.405747890472412, "loss": 1.6087, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1968202590942383, "rewards/margins": 0.2089277058839798, "rewards/rejected": -1.405747890472412, "semantic_entropy": 0.823853611946106, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 7.024677528613155, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.1542159616947174, "logits/rejected": -0.03739406168460846, "logps/chosen": -1.2088768482208252, "logps/rejected": -1.3838523626327515, "loss": 1.6177, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2088768482208252, "rewards/margins": 0.1749754697084427, "rewards/rejected": -1.3838523626327515, "semantic_entropy": 0.817724883556366, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 6.219570499652679, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.11178360134363174, "logits/rejected": -0.07373546063899994, "logps/chosen": -1.1953959465026855, "logps/rejected": -1.3635036945343018, "loss": 1.6071, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1953959465026855, "rewards/margins": 0.16810792684555054, "rewards/rejected": -1.3635036945343018, "semantic_entropy": 0.8234769701957703, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 5.456445667731688, "learning_rate": 7.478389038622441e-07, "logits/chosen": -0.06342735886573792, "logits/rejected": -0.05262907221913338, "logps/chosen": -1.1494568586349487, "logps/rejected": -1.3829313516616821, "loss": 1.5712, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1494568586349487, "rewards/margins": 0.23347464203834534, "rewards/rejected": -1.3829313516616821, "semantic_entropy": 0.8435637354850769, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 4.523496968480591, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.13733550906181335, "logits/rejected": -0.055334098637104034, "logps/chosen": -1.2892239093780518, "logps/rejected": -1.3000915050506592, "loss": 1.6906, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2892239093780518, "rewards/margins": 0.010867486707866192, "rewards/rejected": -1.3000915050506592, "semantic_entropy": 0.8026750683784485, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 5.614572808056941, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.08839946985244751, "logits/rejected": -0.10939983278512955, "logps/chosen": -1.2368186712265015, "logps/rejected": -1.3294631242752075, "loss": 1.6467, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2368186712265015, "rewards/margins": 0.09264441579580307, "rewards/rejected": -1.3294631242752075, "semantic_entropy": 0.8196660280227661, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 7.347558544314041, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.1450868546962738, "logits/rejected": -0.06468832492828369, "logps/chosen": -1.2681916952133179, "logps/rejected": -1.3621282577514648, "loss": 1.6784, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2681916952133179, "rewards/margins": 0.09393645823001862, "rewards/rejected": -1.3621282577514648, "semantic_entropy": 0.8203548192977905, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 5.909449862055901, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.24202656745910645, "logits/rejected": -0.11305799335241318, "logps/chosen": -1.20766282081604, "logps/rejected": -1.3693866729736328, "loss": 1.6166, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.20766282081604, "rewards/margins": 0.1617238074541092, "rewards/rejected": -1.3693866729736328, "semantic_entropy": 0.8178375363349915, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 5.878738941571922, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.10737913846969604, "logits/rejected": -0.02067171037197113, "logps/chosen": -1.1881327629089355, "logps/rejected": -1.3999359607696533, "loss": 1.5831, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1881327629089355, "rewards/margins": 0.21180307865142822, "rewards/rejected": -1.3999359607696533, "semantic_entropy": 0.7900146842002869, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 7.863887152374021, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.17957209050655365, "logits/rejected": -0.054455287754535675, "logps/chosen": -1.1284328699111938, "logps/rejected": -1.2784782648086548, "loss": 1.5384, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1284328699111938, "rewards/margins": 0.15004530549049377, "rewards/rejected": -1.2784782648086548, "semantic_entropy": 0.8200238943099976, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 5.175525847700918, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.12626244127750397, "logits/rejected": 0.03427205607295036, "logps/chosen": -1.2621924877166748, "logps/rejected": -1.3602550029754639, "loss": 1.6547, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2621924877166748, "rewards/margins": 0.09806253761053085, "rewards/rejected": -1.3602550029754639, "semantic_entropy": 0.7849631905555725, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 4.5777103688631655, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.2007174789905548, "logits/rejected": -0.07519026845693588, "logps/chosen": -1.1244957447052002, "logps/rejected": -1.2559866905212402, "loss": 1.5495, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1244957447052002, "rewards/margins": 0.13149099051952362, "rewards/rejected": -1.2559866905212402, "semantic_entropy": 0.8499841690063477, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 6.496576815068786, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.1820032298564911, "logits/rejected": -0.11129873991012573, "logps/chosen": -1.2022125720977783, "logps/rejected": -1.3064225912094116, "loss": 1.6159, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2022125720977783, "rewards/margins": 0.10421004146337509, "rewards/rejected": -1.3064225912094116, "semantic_entropy": 0.8274194002151489, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 5.901257450983097, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.12611427903175354, "logits/rejected": -0.02826990745961666, "logps/chosen": -1.2148966789245605, "logps/rejected": -1.289393663406372, "loss": 1.6234, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2148966789245605, "rewards/margins": 0.07449693977832794, "rewards/rejected": -1.289393663406372, "semantic_entropy": 0.8170779943466187, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 5.488545109697815, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.1568264663219452, "logits/rejected": -0.03753449022769928, "logps/chosen": -1.3359990119934082, "logps/rejected": -1.3999249935150146, "loss": 1.7259, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3359990119934082, "rewards/margins": 0.06392593681812286, "rewards/rejected": -1.3999249935150146, "semantic_entropy": 0.7797099351882935, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 5.5213742877820975, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.12995395064353943, "logits/rejected": 0.08562605082988739, "logps/chosen": -1.2672265768051147, "logps/rejected": -1.4004318714141846, "loss": 1.6733, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2672265768051147, "rewards/margins": 0.133205384016037, "rewards/rejected": -1.4004318714141846, "semantic_entropy": 0.8120694160461426, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 6.284776461348122, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.113334059715271, "logits/rejected": 0.020840290933847427, "logps/chosen": -1.2331507205963135, "logps/rejected": -1.353446364402771, "loss": 1.6443, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2331507205963135, "rewards/margins": 0.12029560655355453, "rewards/rejected": -1.353446364402771, "semantic_entropy": 0.8222762942314148, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 6.043738584021165, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.039151567965745926, "logits/rejected": -0.0331735797226429, "logps/chosen": -1.2521510124206543, "logps/rejected": -1.4187867641448975, "loss": 1.6595, "rewards/accuracies": 0.5, "rewards/chosen": -1.2521510124206543, "rewards/margins": 0.16663573682308197, "rewards/rejected": -1.4187867641448975, "semantic_entropy": 0.8146284222602844, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 7.292115291313658, "learning_rate": 7.272866090342493e-07, "logits/chosen": -0.004319125320762396, "logits/rejected": 0.04982283338904381, "logps/chosen": -1.2701714038848877, "logps/rejected": -1.4697325229644775, "loss": 1.6765, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2701714038848877, "rewards/margins": 0.19956117868423462, "rewards/rejected": -1.4697325229644775, "semantic_entropy": 0.812658965587616, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 5.498960686809322, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.19341018795967102, "logits/rejected": -0.11042682826519012, "logps/chosen": -1.2186018228530884, "logps/rejected": -1.371458649635315, "loss": 1.6223, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2186018228530884, "rewards/margins": 0.15285678207874298, "rewards/rejected": -1.371458649635315, "semantic_entropy": 0.8073134422302246, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 4.669431184226155, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.056274425238370895, "logits/rejected": -0.018299013376235962, "logps/chosen": -1.20284104347229, "logps/rejected": -1.4121390581130981, "loss": 1.6085, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.20284104347229, "rewards/margins": 0.20929816365242004, "rewards/rejected": -1.4121390581130981, "semantic_entropy": 0.811257004737854, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 4.4990688474735965, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.22236430644989014, "logits/rejected": -0.0794583186507225, "logps/chosen": -1.236060380935669, "logps/rejected": -1.352013349533081, "loss": 1.6491, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.236060380935669, "rewards/margins": 0.11595281213521957, "rewards/rejected": -1.352013349533081, "semantic_entropy": 0.8260337710380554, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 6.654374874739354, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.15274329483509064, "logits/rejected": -0.027907446026802063, "logps/chosen": -1.22507643699646, "logps/rejected": -1.3488413095474243, "loss": 1.6256, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.22507643699646, "rewards/margins": 0.1237650141119957, "rewards/rejected": -1.3488413095474243, "semantic_entropy": 0.8010417819023132, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 5.709255153895783, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.17273621261119843, "logits/rejected": -0.07505740970373154, "logps/chosen": -1.274780035018921, "logps/rejected": -1.3442848920822144, "loss": 1.6836, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.274780035018921, "rewards/margins": 0.0695047602057457, "rewards/rejected": -1.3442848920822144, "semantic_entropy": 0.8175514936447144, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 6.13503638727982, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.083819180727005, "logits/rejected": 0.0232203658670187, "logps/chosen": -1.1507251262664795, "logps/rejected": -1.3347519636154175, "loss": 1.5624, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1507251262664795, "rewards/margins": 0.18402671813964844, "rewards/rejected": -1.3347519636154175, "semantic_entropy": 0.8233819007873535, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 5.926993601456036, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.09735113382339478, "logits/rejected": -0.010250061750411987, "logps/chosen": -1.1950181722640991, "logps/rejected": -1.4907152652740479, "loss": 1.599, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1950181722640991, "rewards/margins": 0.2956971526145935, "rewards/rejected": -1.4907152652740479, "semantic_entropy": 0.8079550862312317, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 3.9309051120834164, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.09799648821353912, "logits/rejected": -0.013210950419306755, "logps/chosen": -1.2256428003311157, "logps/rejected": -1.3695268630981445, "loss": 1.6336, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2256428003311157, "rewards/margins": 0.14388404786586761, "rewards/rejected": -1.3695268630981445, "semantic_entropy": 0.8159721493721008, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 5.680278266087749, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.09906782954931259, "logits/rejected": -0.02010853961110115, "logps/chosen": -1.1759237051010132, "logps/rejected": -1.3414690494537354, "loss": 1.5863, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1759237051010132, "rewards/margins": 0.16554540395736694, "rewards/rejected": -1.3414690494537354, "semantic_entropy": 0.820824921131134, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 6.933232379576688, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.20840568840503693, "logits/rejected": -0.08091268688440323, "logps/chosen": -1.1429585218429565, "logps/rejected": -1.206095576286316, "loss": 1.5714, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1429585218429565, "rewards/margins": 0.06313714385032654, "rewards/rejected": -1.206095576286316, "semantic_entropy": 0.8569475412368774, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 5.275737639264357, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.21102456748485565, "logits/rejected": -0.035500261932611465, "logps/chosen": -1.2574834823608398, "logps/rejected": -1.3747073411941528, "loss": 1.6645, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2574834823608398, "rewards/margins": 0.11722388118505478, "rewards/rejected": -1.3747073411941528, "semantic_entropy": 0.814084529876709, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 5.017742139138589, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.09605003893375397, "logits/rejected": -0.011888441629707813, "logps/chosen": -1.2278046607971191, "logps/rejected": -1.335472822189331, "loss": 1.6331, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2278046607971191, "rewards/margins": 0.10766802728176117, "rewards/rejected": -1.335472822189331, "semantic_entropy": 0.810516357421875, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 5.687377025452369, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.11238744109869003, "logits/rejected": -0.068781778216362, "logps/chosen": -1.2058119773864746, "logps/rejected": -1.366644263267517, "loss": 1.6182, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2058119773864746, "rewards/margins": 0.16083236038684845, "rewards/rejected": -1.366644263267517, "semantic_entropy": 0.8248723745346069, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 4.727515093355691, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.2691853940486908, "logits/rejected": -0.18083608150482178, "logps/chosen": -1.135913610458374, "logps/rejected": -1.3195006847381592, "loss": 1.5517, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.135913610458374, "rewards/margins": 0.18358710408210754, "rewards/rejected": -1.3195006847381592, "semantic_entropy": 0.8315603137016296, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 7.906425988800124, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.1354060173034668, "logits/rejected": -0.05403820797801018, "logps/chosen": -1.1928424835205078, "logps/rejected": -1.3466193675994873, "loss": 1.6094, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1928424835205078, "rewards/margins": 0.15377672016620636, "rewards/rejected": -1.3466193675994873, "semantic_entropy": 0.8330352902412415, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.07229965925216675, "eval_logits/rejected": 0.13771548867225647, "eval_logps/chosen": -1.2720904350280762, "eval_logps/rejected": -1.4071999788284302, "eval_loss": 1.6751289367675781, "eval_rewards/accuracies": 0.5511869192123413, "eval_rewards/chosen": -1.2720904350280762, "eval_rewards/margins": 0.13510943949222565, "eval_rewards/rejected": -1.4071999788284302, "eval_runtime": 34.2181, "eval_samples_per_second": 39.307, "eval_semantic_entropy": 0.8057713508605957, "eval_steps_per_second": 9.849, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 5.378140050218315, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.17549605667591095, "logits/rejected": -0.0575527660548687, "logps/chosen": -1.2238993644714355, "logps/rejected": -1.3859851360321045, "loss": 1.6281, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2238993644714355, "rewards/margins": 0.16208595037460327, "rewards/rejected": -1.3859851360321045, "semantic_entropy": 0.8083804845809937, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 4.599326746595441, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.2760392129421234, "logits/rejected": -0.11813102662563324, "logps/chosen": -1.2013671398162842, "logps/rejected": -1.265901803970337, "loss": 1.6138, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2013671398162842, "rewards/margins": 0.06453461199998856, "rewards/rejected": -1.265901803970337, "semantic_entropy": 0.8248791694641113, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 5.088359766233692, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.08856335282325745, "logits/rejected": -0.041765011847019196, "logps/chosen": -1.1347663402557373, "logps/rejected": -1.3736943006515503, "loss": 1.5561, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1347663402557373, "rewards/margins": 0.23892803490161896, "rewards/rejected": -1.3736943006515503, "semantic_entropy": 0.8426318168640137, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 6.492559297915971, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.22064849734306335, "logits/rejected": -0.11930135637521744, "logps/chosen": -1.2490975856781006, "logps/rejected": -1.4646494388580322, "loss": 1.6464, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2490975856781006, "rewards/margins": 0.2155519425868988, "rewards/rejected": -1.4646494388580322, "semantic_entropy": 0.7946311235427856, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 5.158255452642812, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.11545030772686005, "logits/rejected": -0.0548534169793129, "logps/chosen": -1.1823089122772217, "logps/rejected": -1.3512859344482422, "loss": 1.5905, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1823089122772217, "rewards/margins": 0.1689770370721817, "rewards/rejected": -1.3512859344482422, "semantic_entropy": 0.8163334727287292, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 6.157349134419108, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.10665549337863922, "logits/rejected": -0.0878884568810463, "logps/chosen": -1.3222874402999878, "logps/rejected": -1.4526039361953735, "loss": 1.715, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3222874402999878, "rewards/margins": 0.13031631708145142, "rewards/rejected": -1.4526039361953735, "semantic_entropy": 0.7853298187255859, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 3.629357388627039, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.09356193244457245, "logits/rejected": -0.00042872800258919597, "logps/chosen": -1.1945819854736328, "logps/rejected": -1.32026207447052, "loss": 1.606, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1945819854736328, "rewards/margins": 0.1256801038980484, "rewards/rejected": -1.32026207447052, "semantic_entropy": 0.8228558301925659, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 6.775769062814004, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.22728125751018524, "logits/rejected": -0.1402924507856369, "logps/chosen": -1.1760672330856323, "logps/rejected": -1.3227156400680542, "loss": 1.5956, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1760672330856323, "rewards/margins": 0.1466483771800995, "rewards/rejected": -1.3227156400680542, "semantic_entropy": 0.8389729261398315, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 10.05685929693179, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.16023103892803192, "logits/rejected": -0.04378854110836983, "logps/chosen": -1.2156217098236084, "logps/rejected": -1.3252859115600586, "loss": 1.6293, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2156217098236084, "rewards/margins": 0.10966415703296661, "rewards/rejected": -1.3252859115600586, "semantic_entropy": 0.8272579312324524, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 4.766847476767063, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.15695008635520935, "logits/rejected": -0.033334456384181976, "logps/chosen": -1.1796422004699707, "logps/rejected": -1.399123191833496, "loss": 1.5981, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1796422004699707, "rewards/margins": 0.219480961561203, "rewards/rejected": -1.399123191833496, "semantic_entropy": 0.8369055986404419, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 3.8345247008195984, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.14908668398857117, "logits/rejected": 0.02593136951327324, "logps/chosen": -1.270948052406311, "logps/rejected": -1.325573205947876, "loss": 1.6779, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.270948052406311, "rewards/margins": 0.054625190794467926, "rewards/rejected": -1.325573205947876, "semantic_entropy": 0.813858151435852, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 8.30965319202814, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.0933978408575058, "logits/rejected": -0.013651154935359955, "logps/chosen": -1.2539231777191162, "logps/rejected": -1.3113439083099365, "loss": 1.6631, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2539231777191162, "rewards/margins": 0.05742098018527031, "rewards/rejected": -1.3113439083099365, "semantic_entropy": 0.818356990814209, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 5.0231495074516195, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.11422399431467056, "logits/rejected": -0.06813754886388779, "logps/chosen": -1.224760890007019, "logps/rejected": -1.3764041662216187, "loss": 1.641, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.224760890007019, "rewards/margins": 0.15164323151111603, "rewards/rejected": -1.3764041662216187, "semantic_entropy": 0.8324982523918152, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 7.769260605312175, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.23308250308036804, "logits/rejected": -0.1324826180934906, "logps/chosen": -1.2042156457901, "logps/rejected": -1.2514582872390747, "loss": 1.6229, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2042156457901, "rewards/margins": 0.047242797911167145, "rewards/rejected": -1.2514582872390747, "semantic_entropy": 0.8373289108276367, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 5.1037629876319475, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.14529304206371307, "logits/rejected": -0.09840992838144302, "logps/chosen": -1.2229526042938232, "logps/rejected": -1.4011389017105103, "loss": 1.632, "rewards/accuracies": 0.625, "rewards/chosen": -1.2229526042938232, "rewards/margins": 0.17818620800971985, "rewards/rejected": -1.4011389017105103, "semantic_entropy": 0.8181083798408508, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 5.045201641002432, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.2630137801170349, "logits/rejected": -0.13462164998054504, "logps/chosen": -1.2370854616165161, "logps/rejected": -1.4101415872573853, "loss": 1.6332, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2370854616165161, "rewards/margins": 0.17305605113506317, "rewards/rejected": -1.4101415872573853, "semantic_entropy": 0.7922090291976929, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 6.749671493262752, "learning_rate": 6.818417974097246e-07, "logits/chosen": -0.09886298328638077, "logits/rejected": 0.05703229829668999, "logps/chosen": -1.2613810300827026, "logps/rejected": -1.3969571590423584, "loss": 1.6667, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2613810300827026, "rewards/margins": 0.13557596504688263, "rewards/rejected": -1.3969571590423584, "semantic_entropy": 0.8105382919311523, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 6.1217856390812715, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.20109161734580994, "logits/rejected": -0.1641930341720581, "logps/chosen": -1.2442984580993652, "logps/rejected": -1.4554297924041748, "loss": 1.6476, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2442984580993652, "rewards/margins": 0.211131289601326, "rewards/rejected": -1.4554297924041748, "semantic_entropy": 0.8066480755805969, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 5.425406515959951, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.09479651600122452, "logits/rejected": -0.10191066563129425, "logps/chosen": -1.227798342704773, "logps/rejected": -1.365805983543396, "loss": 1.6346, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.227798342704773, "rewards/margins": 0.13800756633281708, "rewards/rejected": -1.365805983543396, "semantic_entropy": 0.8135111927986145, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 6.398625807218401, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.09087280184030533, "logits/rejected": 0.03535875678062439, "logps/chosen": -1.1724708080291748, "logps/rejected": -1.316805362701416, "loss": 1.5934, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1724708080291748, "rewards/margins": 0.14433453977108002, "rewards/rejected": -1.316805362701416, "semantic_entropy": 0.8418784141540527, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 6.27838053684418, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.12069705873727798, "logits/rejected": 0.015942582860589027, "logps/chosen": -1.1771373748779297, "logps/rejected": -1.3059653043746948, "loss": 1.5994, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1771373748779297, "rewards/margins": 0.1288280040025711, "rewards/rejected": -1.3059653043746948, "semantic_entropy": 0.8445422053337097, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 6.224918322925079, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.19664594531059265, "logits/rejected": -0.10287900269031525, "logps/chosen": -1.1806488037109375, "logps/rejected": -1.3338449001312256, "loss": 1.597, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1806488037109375, "rewards/margins": 0.15319623053073883, "rewards/rejected": -1.3338449001312256, "semantic_entropy": 0.8327757716178894, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 5.993386335669986, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.10171717405319214, "logits/rejected": -0.02145753614604473, "logps/chosen": -1.257420539855957, "logps/rejected": -1.3123220205307007, "loss": 1.6634, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.257420539855957, "rewards/margins": 0.054901618510484695, "rewards/rejected": -1.3123220205307007, "semantic_entropy": 0.8119403719902039, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 6.712107357466461, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.1944049894809723, "logits/rejected": -0.1376335322856903, "logps/chosen": -1.2233657836914062, "logps/rejected": -1.3471635580062866, "loss": 1.6305, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2233657836914062, "rewards/margins": 0.12379767745733261, "rewards/rejected": -1.3471635580062866, "semantic_entropy": 0.8143411874771118, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 7.149887971362489, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.16603481769561768, "logits/rejected": -0.11718587577342987, "logps/chosen": -1.2550067901611328, "logps/rejected": -1.3475435972213745, "loss": 1.6527, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2550067901611328, "rewards/margins": 0.0925367921590805, "rewards/rejected": -1.3475435972213745, "semantic_entropy": 0.7954351305961609, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 5.295363779189558, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.19062277674674988, "logits/rejected": -0.16226159036159515, "logps/chosen": -1.2014892101287842, "logps/rejected": -1.340798020362854, "loss": 1.613, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2014892101287842, "rewards/margins": 0.13930867612361908, "rewards/rejected": -1.340798020362854, "semantic_entropy": 0.823023796081543, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 4.463889466769918, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.20911714434623718, "logits/rejected": -0.1077265590429306, "logps/chosen": -1.2200536727905273, "logps/rejected": -1.3797017335891724, "loss": 1.6258, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2200536727905273, "rewards/margins": 0.15964800119400024, "rewards/rejected": -1.3797017335891724, "semantic_entropy": 0.8115269541740417, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 5.668374446324004, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.11806541681289673, "logits/rejected": 0.007719153072685003, "logps/chosen": -1.3082528114318848, "logps/rejected": -1.371173620223999, "loss": 1.6973, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3082528114318848, "rewards/margins": 0.06292072683572769, "rewards/rejected": -1.371173620223999, "semantic_entropy": 0.7781856060028076, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 4.883514668712541, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.11601561307907104, "logits/rejected": 0.001872205757535994, "logps/chosen": -1.2205229997634888, "logps/rejected": -1.3799033164978027, "loss": 1.6272, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2205229997634888, "rewards/margins": 0.15938030183315277, "rewards/rejected": -1.3799033164978027, "semantic_entropy": 0.8132728338241577, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 5.156440314653821, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.1449512541294098, "logits/rejected": -0.02476201020181179, "logps/chosen": -1.2050577402114868, "logps/rejected": -1.2572379112243652, "loss": 1.6225, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2050577402114868, "rewards/margins": 0.05218012258410454, "rewards/rejected": -1.2572379112243652, "semantic_entropy": 0.8349574208259583, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 5.465884254690918, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.09708616882562637, "logits/rejected": -0.04060886427760124, "logps/chosen": -1.1497955322265625, "logps/rejected": -1.3986761569976807, "loss": 1.5657, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1497955322265625, "rewards/margins": 0.24888058006763458, "rewards/rejected": -1.3986761569976807, "semantic_entropy": 0.8317192196846008, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 6.094135844057021, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.08624229580163956, "logits/rejected": 0.008372211828827858, "logps/chosen": -1.195389747619629, "logps/rejected": -1.3508546352386475, "loss": 1.6065, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.195389747619629, "rewards/margins": 0.15546497702598572, "rewards/rejected": -1.3508546352386475, "semantic_entropy": 0.8222860097885132, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 6.255239458992339, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.10109671205282211, "logits/rejected": -0.006981375627219677, "logps/chosen": -1.181278944015503, "logps/rejected": -1.3388015031814575, "loss": 1.5966, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.181278944015503, "rewards/margins": 0.15752258896827698, "rewards/rejected": -1.3388015031814575, "semantic_entropy": 0.8306370973587036, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 4.590629213283337, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.0568915493786335, "logits/rejected": -0.010566475801169872, "logps/chosen": -1.1780459880828857, "logps/rejected": -1.371070146560669, "loss": 1.5952, "rewards/accuracies": 0.625, "rewards/chosen": -1.1780459880828857, "rewards/margins": 0.19302408397197723, "rewards/rejected": -1.371070146560669, "semantic_entropy": 0.8343206644058228, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 6.59035222587758, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.19788631796836853, "logits/rejected": -0.08153335750102997, "logps/chosen": -1.1772761344909668, "logps/rejected": -1.3713045120239258, "loss": 1.5933, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1772761344909668, "rewards/margins": 0.19402840733528137, "rewards/rejected": -1.3713045120239258, "semantic_entropy": 0.831951916217804, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 5.518103628766024, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.13015994429588318, "logits/rejected": -0.07607077062129974, "logps/chosen": -1.285044550895691, "logps/rejected": -1.3895542621612549, "loss": 1.6736, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.285044550895691, "rewards/margins": 0.10450981557369232, "rewards/rejected": -1.3895542621612549, "semantic_entropy": 0.777174174785614, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 5.853222989046294, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.0450812429189682, "logits/rejected": 0.08632221072912216, "logps/chosen": -1.1112303733825684, "logps/rejected": -1.3421175479888916, "loss": 1.5216, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1112303733825684, "rewards/margins": 0.23088721930980682, "rewards/rejected": -1.3421175479888916, "semantic_entropy": 0.8207334280014038, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 7.275224610682173, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.12566128373146057, "logits/rejected": -0.008653797209262848, "logps/chosen": -1.1678478717803955, "logps/rejected": -1.3530265092849731, "loss": 1.5881, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1678478717803955, "rewards/margins": 0.18517863750457764, "rewards/rejected": -1.3530265092849731, "semantic_entropy": 0.8404285311698914, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 3.9084848226040743, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.0836976170539856, "logits/rejected": -0.059690773487091064, "logps/chosen": -1.1858875751495361, "logps/rejected": -1.3508541584014893, "loss": 1.6044, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1858875751495361, "rewards/margins": 0.16496649384498596, "rewards/rejected": -1.3508541584014893, "semantic_entropy": 0.837052047252655, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 7.267436434059178, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.01738749071955681, "logits/rejected": 0.08665905892848969, "logps/chosen": -1.2142324447631836, "logps/rejected": -1.3214452266693115, "loss": 1.6267, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2142324447631836, "rewards/margins": 0.10721276700496674, "rewards/rejected": -1.3214452266693115, "semantic_entropy": 0.8249403238296509, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 6.096225790707586, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.006175222806632519, "logits/rejected": 0.01734001375734806, "logps/chosen": -1.2086519002914429, "logps/rejected": -1.3251694440841675, "loss": 1.6167, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2086519002914429, "rewards/margins": 0.11651742458343506, "rewards/rejected": -1.3251694440841675, "semantic_entropy": 0.8161107897758484, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 5.493919523437892, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.015668602660298347, "logits/rejected": 0.0459660142660141, "logps/chosen": -1.2521498203277588, "logps/rejected": -1.328094720840454, "loss": 1.6536, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2521498203277588, "rewards/margins": 0.0759449377655983, "rewards/rejected": -1.328094720840454, "semantic_entropy": 0.8028348088264465, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 7.743008654228581, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.09785537421703339, "logits/rejected": 0.022359225898981094, "logps/chosen": -1.1671931743621826, "logps/rejected": -1.3939273357391357, "loss": 1.576, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1671931743621826, "rewards/margins": 0.2267342358827591, "rewards/rejected": -1.3939273357391357, "semantic_entropy": 0.817538857460022, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 5.898648449894003, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.11539480835199356, "logits/rejected": -0.13054174184799194, "logps/chosen": -1.2993159294128418, "logps/rejected": -1.333200216293335, "loss": 1.6941, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.2993159294128418, "rewards/margins": 0.033884190022945404, "rewards/rejected": -1.333200216293335, "semantic_entropy": 0.7894706726074219, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 6.345870527593569, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.05681665614247322, "logits/rejected": 0.06590834259986877, "logps/chosen": -1.2215360403060913, "logps/rejected": -1.3570382595062256, "loss": 1.6419, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2215360403060913, "rewards/margins": 0.13550205528736115, "rewards/rejected": -1.3570382595062256, "semantic_entropy": 0.84070885181427, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 6.058786043266676, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.21972425282001495, "logits/rejected": -0.04886932298541069, "logps/chosen": -1.2841347455978394, "logps/rejected": -1.3082153797149658, "loss": 1.6894, "rewards/accuracies": 0.46875, "rewards/chosen": -1.2841347455978394, "rewards/margins": 0.02408064529299736, "rewards/rejected": -1.3082153797149658, "semantic_entropy": 0.8104531168937683, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 7.258349365117021, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.13270898163318634, "logits/rejected": -0.07187141478061676, "logps/chosen": -1.2032992839813232, "logps/rejected": -1.3315246105194092, "loss": 1.614, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2032992839813232, "rewards/margins": 0.12822547554969788, "rewards/rejected": -1.3315246105194092, "semantic_entropy": 0.8214617967605591, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 4.4015062696443605, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.06931308656930923, "logits/rejected": -0.023100432008504868, "logps/chosen": -1.2615067958831787, "logps/rejected": -1.3956094980239868, "loss": 1.667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2615067958831787, "rewards/margins": 0.13410256803035736, "rewards/rejected": -1.3956094980239868, "semantic_entropy": 0.8109481930732727, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 5.16657933480286, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.10669048130512238, "logits/rejected": -0.04523846134543419, "logps/chosen": -1.2117180824279785, "logps/rejected": -1.419703483581543, "loss": 1.6225, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2117180824279785, "rewards/margins": 0.20798540115356445, "rewards/rejected": -1.419703483581543, "semantic_entropy": 0.8215371966362, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 4.325195156469862, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.1781974583864212, "logits/rejected": -0.0565606951713562, "logps/chosen": -1.205597162246704, "logps/rejected": -1.4912530183792114, "loss": 1.6032, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.205597162246704, "rewards/margins": 0.2856559753417969, "rewards/rejected": -1.4912530183792114, "semantic_entropy": 0.7951996922492981, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 6.339530930587291, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.1049392968416214, "logits/rejected": 0.021061481907963753, "logps/chosen": -1.103271722793579, "logps/rejected": -1.2556746006011963, "loss": 1.525, "rewards/accuracies": 0.5625, "rewards/chosen": -1.103271722793579, "rewards/margins": 0.1524028480052948, "rewards/rejected": -1.2556746006011963, "semantic_entropy": 0.8435392379760742, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 5.745109216696766, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.028934497386217117, "logits/rejected": -0.005610386375337839, "logps/chosen": -1.081808090209961, "logps/rejected": -1.2815072536468506, "loss": 1.5139, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.081808090209961, "rewards/margins": 0.19969908893108368, "rewards/rejected": -1.2815072536468506, "semantic_entropy": 0.8642523884773254, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 5.1772912768879795, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.1522144377231598, "logits/rejected": -0.06073923036456108, "logps/chosen": -1.2383556365966797, "logps/rejected": -1.3871220350265503, "loss": 1.6443, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2383556365966797, "rewards/margins": 0.148766428232193, "rewards/rejected": -1.3871220350265503, "semantic_entropy": 0.8118423223495483, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 6.0907919376709705, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.1050073653459549, "logits/rejected": -0.04993366077542305, "logps/chosen": -1.3402453660964966, "logps/rejected": -1.413372278213501, "loss": 1.7358, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3402453660964966, "rewards/margins": 0.07312674820423126, "rewards/rejected": -1.413372278213501, "semantic_entropy": 0.7910217046737671, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 6.4564673142832545, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.11926700174808502, "logits/rejected": 0.004981190897524357, "logps/chosen": -1.251169204711914, "logps/rejected": -1.3392000198364258, "loss": 1.6623, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.251169204711914, "rewards/margins": 0.08803063631057739, "rewards/rejected": -1.3392000198364258, "semantic_entropy": 0.8221951723098755, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 6.4348736664787465, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.155964195728302, "logits/rejected": -0.022763323038816452, "logps/chosen": -1.2326780557632446, "logps/rejected": -1.3299533128738403, "loss": 1.6344, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2326780557632446, "rewards/margins": 0.09727514535188675, "rewards/rejected": -1.3299533128738403, "semantic_entropy": 0.8034775853157043, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 6.330078111363216, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.13968291878700256, "logits/rejected": 0.030344059690833092, "logps/chosen": -1.2413203716278076, "logps/rejected": -1.31222403049469, "loss": 1.6543, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2413203716278076, "rewards/margins": 0.07090376317501068, "rewards/rejected": -1.31222403049469, "semantic_entropy": 0.8259509205818176, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 7.911091860572046, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.24794287979602814, "logits/rejected": -0.0543971061706543, "logps/chosen": -1.219376802444458, "logps/rejected": -1.4394868612289429, "loss": 1.621, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.219376802444458, "rewards/margins": 0.22011017799377441, "rewards/rejected": -1.4394868612289429, "semantic_entropy": 0.8032897710800171, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 4.741674023731132, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.10703140497207642, "logits/rejected": -0.08902278542518616, "logps/chosen": -1.210728406906128, "logps/rejected": -1.379828929901123, "loss": 1.6315, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.210728406906128, "rewards/margins": 0.16910049319267273, "rewards/rejected": -1.379828929901123, "semantic_entropy": 0.8415881991386414, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 5.6484644004361035, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.11590003967285156, "logits/rejected": 0.04657958820462227, "logps/chosen": -1.2512283325195312, "logps/rejected": -1.3973382711410522, "loss": 1.6548, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2512283325195312, "rewards/margins": 0.14610998332500458, "rewards/rejected": -1.3973382711410522, "semantic_entropy": 0.8071807026863098, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 4.796728338464775, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.15281710028648376, "logits/rejected": 0.023782672360539436, "logps/chosen": -1.140514612197876, "logps/rejected": -1.3642228841781616, "loss": 1.5661, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.140514612197876, "rewards/margins": 0.2237083464860916, "rewards/rejected": -1.3642228841781616, "semantic_entropy": 0.8511430621147156, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 6.2598030586384, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.07830878347158432, "logits/rejected": -0.057846419513225555, "logps/chosen": -1.22551429271698, "logps/rejected": -1.4066619873046875, "loss": 1.6376, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.22551429271698, "rewards/margins": 0.18114769458770752, "rewards/rejected": -1.4066619873046875, "semantic_entropy": 0.8242387771606445, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 6.085148505154918, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.18015453219413757, "logits/rejected": -0.1009650006890297, "logps/chosen": -1.1565239429473877, "logps/rejected": -1.3100000619888306, "loss": 1.5627, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1565239429473877, "rewards/margins": 0.15347608923912048, "rewards/rejected": -1.3100000619888306, "semantic_entropy": 0.8124001622200012, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 6.745708674481021, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.14728213846683502, "logits/rejected": -0.05001986026763916, "logps/chosen": -1.1888298988342285, "logps/rejected": -1.2700729370117188, "loss": 1.6131, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1888298988342285, "rewards/margins": 0.08124302327632904, "rewards/rejected": -1.2700729370117188, "semantic_entropy": 0.8484548330307007, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 7.20810175658846, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.11317580938339233, "logits/rejected": -0.018452277407050133, "logps/chosen": -1.1965445280075073, "logps/rejected": -1.3726069927215576, "loss": 1.6023, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1965445280075073, "rewards/margins": 0.17606250941753387, "rewards/rejected": -1.3726069927215576, "semantic_entropy": 0.8114873766899109, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 6.487347129048763, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.16594000160694122, "logits/rejected": -0.03701313957571983, "logps/chosen": -1.2097290754318237, "logps/rejected": -1.3855531215667725, "loss": 1.6155, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2097290754318237, "rewards/margins": 0.17582403123378754, "rewards/rejected": -1.3855531215667725, "semantic_entropy": 0.8115992546081543, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 5.486046125851214, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.12393466383218765, "logits/rejected": 0.042426858097314835, "logps/chosen": -1.1190146207809448, "logps/rejected": -1.4066413640975952, "loss": 1.5306, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1190146207809448, "rewards/margins": 0.28762686252593994, "rewards/rejected": -1.4066413640975952, "semantic_entropy": 0.8230944871902466, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 7.060101270113696, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.15223434567451477, "logits/rejected": -0.05552518367767334, "logps/chosen": -1.141435980796814, "logps/rejected": -1.3540828227996826, "loss": 1.5559, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.141435980796814, "rewards/margins": 0.2126467227935791, "rewards/rejected": -1.3540828227996826, "semantic_entropy": 0.8289759755134583, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 6.464163442616756, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.1238318681716919, "logits/rejected": -0.03663468360900879, "logps/chosen": -1.1750373840332031, "logps/rejected": -1.26582932472229, "loss": 1.5882, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1750373840332031, "rewards/margins": 0.09079186618328094, "rewards/rejected": -1.26582932472229, "semantic_entropy": 0.8262761235237122, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 6.2078776917316345, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.13994525372982025, "logits/rejected": -0.06310413032770157, "logps/chosen": -1.2490947246551514, "logps/rejected": -1.4495458602905273, "loss": 1.6437, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2490947246551514, "rewards/margins": 0.20045098662376404, "rewards/rejected": -1.4495458602905273, "semantic_entropy": 0.7891373038291931, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 6.1401332094444765, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.07750773429870605, "logits/rejected": 0.04555783420801163, "logps/chosen": -1.1894856691360474, "logps/rejected": -1.295517086982727, "loss": 1.611, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1894856691360474, "rewards/margins": 0.10603152215480804, "rewards/rejected": -1.295517086982727, "semantic_entropy": 0.8430088758468628, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 6.855994170237085, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.20978447794914246, "logits/rejected": -0.06448744237422943, "logps/chosen": -1.239942193031311, "logps/rejected": -1.4445959329605103, "loss": 1.6447, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.239942193031311, "rewards/margins": 0.20465371012687683, "rewards/rejected": -1.4445959329605103, "semantic_entropy": 0.8094766736030579, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 6.156600258979842, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.09966900199651718, "logits/rejected": 0.01168000977486372, "logps/chosen": -1.1446138620376587, "logps/rejected": -1.3221535682678223, "loss": 1.559, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1446138620376587, "rewards/margins": 0.1775399148464203, "rewards/rejected": -1.3221535682678223, "semantic_entropy": 0.8287068605422974, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 6.169631759672591, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.1631147563457489, "logits/rejected": -0.1024804562330246, "logps/chosen": -1.1909414529800415, "logps/rejected": -1.2281302213668823, "loss": 1.6129, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1909414529800415, "rewards/margins": 0.03718879818916321, "rewards/rejected": -1.2281302213668823, "semantic_entropy": 0.8439064025878906, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 3.9824276983535905, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.1436646431684494, "logits/rejected": -0.13075539469718933, "logps/chosen": -1.1404297351837158, "logps/rejected": -1.3118574619293213, "loss": 1.5656, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1404297351837158, "rewards/margins": 0.17142778635025024, "rewards/rejected": -1.3118574619293213, "semantic_entropy": 0.8504346609115601, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 4.48578481866457, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.14257480204105377, "logits/rejected": -0.06165775656700134, "logps/chosen": -1.3203927278518677, "logps/rejected": -1.4257984161376953, "loss": 1.717, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3203927278518677, "rewards/margins": 0.105405792593956, "rewards/rejected": -1.4257984161376953, "semantic_entropy": 0.7931500673294067, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 7.232690691089748, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.14047248661518097, "logits/rejected": -0.03672455996274948, "logps/chosen": -1.2762057781219482, "logps/rejected": -1.3535068035125732, "loss": 1.6799, "rewards/accuracies": 0.5, "rewards/chosen": -1.2762057781219482, "rewards/margins": 0.07730090618133545, "rewards/rejected": -1.3535068035125732, "semantic_entropy": 0.8072978854179382, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 9.547860842311033, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.1101255863904953, "logits/rejected": -0.12727369368076324, "logps/chosen": -1.1571381092071533, "logps/rejected": -1.3399074077606201, "loss": 1.5717, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1571381092071533, "rewards/margins": 0.18276932835578918, "rewards/rejected": -1.3399074077606201, "semantic_entropy": 0.8291651010513306, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 5.953921295852294, "learning_rate": 5.890726635828919e-07, "logits/chosen": 0.008559176698327065, "logits/rejected": 0.02435842715203762, "logps/chosen": -1.129023790359497, "logps/rejected": -1.2308628559112549, "loss": 1.543, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.129023790359497, "rewards/margins": 0.101839080452919, "rewards/rejected": -1.2308628559112549, "semantic_entropy": 0.8279472589492798, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 6.326862010766406, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.17562103271484375, "logits/rejected": -0.10477634519338608, "logps/chosen": -1.2848960161209106, "logps/rejected": -1.4800294637680054, "loss": 1.6739, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2848960161209106, "rewards/margins": 0.19513356685638428, "rewards/rejected": -1.4800294637680054, "semantic_entropy": 0.7780656814575195, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.19540680944919586, "eval_logits/rejected": 0.27146899700164795, "eval_logps/chosen": -1.2697910070419312, "eval_logps/rejected": -1.403380274772644, "eval_loss": 1.6729706525802612, "eval_rewards/accuracies": 0.5497032403945923, "eval_rewards/chosen": -1.2697910070419312, "eval_rewards/margins": 0.13358932733535767, "eval_rewards/rejected": -1.403380274772644, "eval_runtime": 34.1149, "eval_samples_per_second": 39.426, "eval_semantic_entropy": 0.8060900568962097, "eval_steps_per_second": 9.878, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 5.63038914404412, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.22994522750377655, "logits/rejected": -0.10492175817489624, "logps/chosen": -1.1260526180267334, "logps/rejected": -1.3412573337554932, "loss": 1.5401, "rewards/accuracies": 0.625, "rewards/chosen": -1.1260526180267334, "rewards/margins": 0.21520483493804932, "rewards/rejected": -1.3412573337554932, "semantic_entropy": 0.828183650970459, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 7.7273495547260795, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.11024744808673859, "logits/rejected": -0.09179636090993881, "logps/chosen": -1.2523460388183594, "logps/rejected": -1.4172759056091309, "loss": 1.6559, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2523460388183594, "rewards/margins": 0.1649298369884491, "rewards/rejected": -1.4172759056091309, "semantic_entropy": 0.8071457147598267, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 6.136780043097159, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.05301692336797714, "logits/rejected": 0.056801266968250275, "logps/chosen": -1.2442411184310913, "logps/rejected": -1.3706305027008057, "loss": 1.6556, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2442411184310913, "rewards/margins": 0.12638936936855316, "rewards/rejected": -1.3706305027008057, "semantic_entropy": 0.8226641416549683, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 4.9490660420076225, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.08142216503620148, "logits/rejected": 0.05183114483952522, "logps/chosen": -1.2240002155303955, "logps/rejected": -1.2105592489242554, "loss": 1.6393, "rewards/accuracies": 0.5, "rewards/chosen": -1.2240002155303955, "rewards/margins": -0.013441130518913269, "rewards/rejected": -1.2105592489242554, "semantic_entropy": 0.8305248022079468, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 5.557259810381784, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.12940673530101776, "logits/rejected": 0.009943870827555656, "logps/chosen": -1.2820829153060913, "logps/rejected": -1.4880571365356445, "loss": 1.6855, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2820829153060913, "rewards/margins": 0.20597422122955322, "rewards/rejected": -1.4880571365356445, "semantic_entropy": 0.806861400604248, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 4.998153483141493, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.10761232674121857, "logits/rejected": 0.016319667920470238, "logps/chosen": -1.2690792083740234, "logps/rejected": -1.2605798244476318, "loss": 1.6743, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.2690792083740234, "rewards/margins": -0.008499261923134327, "rewards/rejected": -1.2605798244476318, "semantic_entropy": 0.8103424906730652, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 7.239248506561627, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.12728750705718994, "logits/rejected": 0.014859259128570557, "logps/chosen": -1.19269597530365, "logps/rejected": -1.2871057987213135, "loss": 1.6101, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.19269597530365, "rewards/margins": 0.09440989047288895, "rewards/rejected": -1.2871057987213135, "semantic_entropy": 0.8347412943840027, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 6.036806100749923, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.12051887810230255, "logits/rejected": -0.02856467291712761, "logps/chosen": -1.226078748703003, "logps/rejected": -1.336825966835022, "loss": 1.632, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.226078748703003, "rewards/margins": 0.11074721813201904, "rewards/rejected": -1.336825966835022, "semantic_entropy": 0.8118170499801636, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 4.340986013640831, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.10340622812509537, "logits/rejected": -0.03779247775673866, "logps/chosen": -1.164251685142517, "logps/rejected": -1.346777319908142, "loss": 1.578, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.164251685142517, "rewards/margins": 0.18252554535865784, "rewards/rejected": -1.346777319908142, "semantic_entropy": 0.827580451965332, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 6.860114709054871, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.15930306911468506, "logits/rejected": -0.10065138339996338, "logps/chosen": -1.255277395248413, "logps/rejected": -1.3365130424499512, "loss": 1.6635, "rewards/accuracies": 0.46875, "rewards/chosen": -1.255277395248413, "rewards/margins": 0.08123566210269928, "rewards/rejected": -1.3365130424499512, "semantic_entropy": 0.8165020942687988, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 4.871834240558112, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.16191063821315765, "logits/rejected": -0.034408584237098694, "logps/chosen": -1.3010262250900269, "logps/rejected": -1.4511836767196655, "loss": 1.6979, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3010262250900269, "rewards/margins": 0.15015748143196106, "rewards/rejected": -1.4511836767196655, "semantic_entropy": 0.793840765953064, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 6.561531346907125, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.19866201281547546, "logits/rejected": -0.06085973232984543, "logps/chosen": -1.1741880178451538, "logps/rejected": -1.2669432163238525, "loss": 1.5966, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1741880178451538, "rewards/margins": 0.09275525063276291, "rewards/rejected": -1.2669432163238525, "semantic_entropy": 0.8448039889335632, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 5.741514287950989, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.24980394542217255, "logits/rejected": -0.14167962968349457, "logps/chosen": -1.1795849800109863, "logps/rejected": -1.3308684825897217, "loss": 1.5952, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1795849800109863, "rewards/margins": 0.15128347277641296, "rewards/rejected": -1.3308684825897217, "semantic_entropy": 0.8312327265739441, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 4.649324507811795, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.15841853618621826, "logits/rejected": -0.06766881048679352, "logps/chosen": -1.2464814186096191, "logps/rejected": -1.2519334554672241, "loss": 1.6533, "rewards/accuracies": 0.4375, "rewards/chosen": -1.2464814186096191, "rewards/margins": 0.005452039651572704, "rewards/rejected": -1.2519334554672241, "semantic_entropy": 0.8136681318283081, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 5.567999471488443, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.15640899538993835, "logits/rejected": -0.0978294238448143, "logps/chosen": -1.2400473356246948, "logps/rejected": -1.3229585886001587, "loss": 1.6456, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2400473356246948, "rewards/margins": 0.08291115611791611, "rewards/rejected": -1.3229585886001587, "semantic_entropy": 0.8111278414726257, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 5.975187258809538, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.04754520207643509, "logits/rejected": 0.045732222497463226, "logps/chosen": -1.1906394958496094, "logps/rejected": -1.3566877841949463, "loss": 1.6042, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1906394958496094, "rewards/margins": 0.16604813933372498, "rewards/rejected": -1.3566877841949463, "semantic_entropy": 0.8272072672843933, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 5.543436820704793, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.11546238511800766, "logits/rejected": -0.011519989930093288, "logps/chosen": -1.202824592590332, "logps/rejected": -1.3249739408493042, "loss": 1.6144, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.202824592590332, "rewards/margins": 0.1221492737531662, "rewards/rejected": -1.3249739408493042, "semantic_entropy": 0.823070228099823, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 5.025853264844424, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.2136080265045166, "logits/rejected": -0.06162614747881889, "logps/chosen": -1.2097876071929932, "logps/rejected": -1.3256707191467285, "loss": 1.6293, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2097876071929932, "rewards/margins": 0.11588303744792938, "rewards/rejected": -1.3256707191467285, "semantic_entropy": 0.8389533758163452, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 5.1027913478319595, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.1764424592256546, "logits/rejected": -0.06768536567687988, "logps/chosen": -1.1323297023773193, "logps/rejected": -1.235107660293579, "loss": 1.5548, "rewards/accuracies": 0.5, "rewards/chosen": -1.1323297023773193, "rewards/margins": 0.10277803242206573, "rewards/rejected": -1.235107660293579, "semantic_entropy": 0.8449912071228027, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 4.414575888852886, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.1007847934961319, "logits/rejected": -0.02534860372543335, "logps/chosen": -1.2302358150482178, "logps/rejected": -1.337963581085205, "loss": 1.6356, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2302358150482178, "rewards/margins": 0.10772774368524551, "rewards/rejected": -1.337963581085205, "semantic_entropy": 0.8107962608337402, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 4.908793390572466, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.03650009632110596, "logits/rejected": -0.015034017153084278, "logps/chosen": -1.182477355003357, "logps/rejected": -1.340631365776062, "loss": 1.6009, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.182477355003357, "rewards/margins": 0.15815414488315582, "rewards/rejected": -1.340631365776062, "semantic_entropy": 0.8367658853530884, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 5.247455686184667, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.09710849821567535, "logits/rejected": -0.04422349855303764, "logps/chosen": -1.165979027748108, "logps/rejected": -1.2971729040145874, "loss": 1.5811, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.165979027748108, "rewards/margins": 0.13119389116764069, "rewards/rejected": -1.2971729040145874, "semantic_entropy": 0.8301803469657898, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 6.68214584790981, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.09847624599933624, "logits/rejected": 0.03739653900265694, "logps/chosen": -1.263550877571106, "logps/rejected": -1.3496769666671753, "loss": 1.6652, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.263550877571106, "rewards/margins": 0.08612625300884247, "rewards/rejected": -1.3496769666671753, "semantic_entropy": 0.8033725619316101, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 3.9912579198805562, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.06070940941572189, "logits/rejected": 0.034014929085969925, "logps/chosen": -1.2295587062835693, "logps/rejected": -1.388777494430542, "loss": 1.6334, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2295587062835693, "rewards/margins": 0.15921886265277863, "rewards/rejected": -1.388777494430542, "semantic_entropy": 0.8076674342155457, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 6.043422009215785, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.06177800893783569, "logits/rejected": -0.0232731681317091, "logps/chosen": -1.143060326576233, "logps/rejected": -1.3920073509216309, "loss": 1.5632, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.143060326576233, "rewards/margins": 0.24894718825817108, "rewards/rejected": -1.3920073509216309, "semantic_entropy": 0.8402158617973328, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 6.7251335150506835, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.19400207698345184, "logits/rejected": -0.06920813769102097, "logps/chosen": -1.1665343046188354, "logps/rejected": -1.3956111669540405, "loss": 1.5817, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1665343046188354, "rewards/margins": 0.22907690703868866, "rewards/rejected": -1.3956111669540405, "semantic_entropy": 0.8304238319396973, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 5.711588197403849, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.1671774983406067, "logits/rejected": -0.04706890881061554, "logps/chosen": -1.221055269241333, "logps/rejected": -1.373512864112854, "loss": 1.6214, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.221055269241333, "rewards/margins": 0.15245762467384338, "rewards/rejected": -1.373512864112854, "semantic_entropy": 0.8006470799446106, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 6.019218278815665, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.14369963109493256, "logits/rejected": -0.0023625430185347795, "logps/chosen": -1.2207027673721313, "logps/rejected": -1.3831660747528076, "loss": 1.6255, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2207027673721313, "rewards/margins": 0.1624635010957718, "rewards/rejected": -1.3831660747528076, "semantic_entropy": 0.809648334980011, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 6.046497057566938, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.18315675854682922, "logits/rejected": -0.06661573052406311, "logps/chosen": -1.1895192861557007, "logps/rejected": -1.2774180173873901, "loss": 1.6094, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1895192861557007, "rewards/margins": 0.08789865672588348, "rewards/rejected": -1.2774180173873901, "semantic_entropy": 0.8398206830024719, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 6.070664134825884, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.05338507145643234, "logits/rejected": 0.1543077528476715, "logps/chosen": -1.1586867570877075, "logps/rejected": -1.3194737434387207, "loss": 1.5741, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1586867570877075, "rewards/margins": 0.1607869565486908, "rewards/rejected": -1.3194737434387207, "semantic_entropy": 0.830750584602356, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 7.365101806810655, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.07123441249132156, "logits/rejected": 0.010223140008747578, "logps/chosen": -1.1625137329101562, "logps/rejected": -1.3636242151260376, "loss": 1.568, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1625137329101562, "rewards/margins": 0.20111048221588135, "rewards/rejected": -1.3636242151260376, "semantic_entropy": 0.8109195828437805, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 6.868362056377428, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.1454911082983017, "logits/rejected": -0.047428540885448456, "logps/chosen": -1.2020363807678223, "logps/rejected": -1.332728624343872, "loss": 1.6157, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2020363807678223, "rewards/margins": 0.13069215416908264, "rewards/rejected": -1.332728624343872, "semantic_entropy": 0.8272560238838196, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 6.88718021938576, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.1258665919303894, "logits/rejected": -0.054121606051921844, "logps/chosen": -1.1625771522521973, "logps/rejected": -1.2729923725128174, "loss": 1.5824, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1625771522521973, "rewards/margins": 0.11041511595249176, "rewards/rejected": -1.2729923725128174, "semantic_entropy": 0.8395683169364929, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 6.314457334477912, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.13267606496810913, "logits/rejected": 0.005651143379509449, "logps/chosen": -1.2684270143508911, "logps/rejected": -1.3238356113433838, "loss": 1.6735, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2684270143508911, "rewards/margins": 0.05540867894887924, "rewards/rejected": -1.3238356113433838, "semantic_entropy": 0.8101409077644348, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 8.136866508207527, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.08514733612537384, "logits/rejected": 0.02358965016901493, "logps/chosen": -1.2275893688201904, "logps/rejected": -1.439321517944336, "loss": 1.629, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2275893688201904, "rewards/margins": 0.21173210442066193, "rewards/rejected": -1.439321517944336, "semantic_entropy": 0.8029065132141113, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 5.669220530424876, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.18848130106925964, "logits/rejected": -0.06446901708841324, "logps/chosen": -1.2636384963989258, "logps/rejected": -1.4012552499771118, "loss": 1.6643, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2636384963989258, "rewards/margins": 0.13761678338050842, "rewards/rejected": -1.4012552499771118, "semantic_entropy": 0.8012672662734985, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 4.258674299582184, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.14964158833026886, "logits/rejected": -0.005484253168106079, "logps/chosen": -1.287639856338501, "logps/rejected": -1.3286447525024414, "loss": 1.696, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.287639856338501, "rewards/margins": 0.04100499302148819, "rewards/rejected": -1.3286447525024414, "semantic_entropy": 0.8167705535888672, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 6.537220627414246, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.12154004722833633, "logits/rejected": 0.015939751639962196, "logps/chosen": -1.1688786745071411, "logps/rejected": -1.3592965602874756, "loss": 1.579, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1688786745071411, "rewards/margins": 0.19041800498962402, "rewards/rejected": -1.3592965602874756, "semantic_entropy": 0.8202179074287415, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 6.565318289967094, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.1225869208574295, "logits/rejected": -0.058536093682050705, "logps/chosen": -1.265282392501831, "logps/rejected": -1.4457156658172607, "loss": 1.6589, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.265282392501831, "rewards/margins": 0.18043319880962372, "rewards/rejected": -1.4457156658172607, "semantic_entropy": 0.7872107625007629, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 5.43647753478363, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.15391549468040466, "logits/rejected": -0.046058669686317444, "logps/chosen": -1.2547824382781982, "logps/rejected": -1.3729344606399536, "loss": 1.6613, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2547824382781982, "rewards/margins": 0.11815198510885239, "rewards/rejected": -1.3729344606399536, "semantic_entropy": 0.8129531145095825, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 5.636253695037929, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.20083267986774445, "logits/rejected": -0.08073526620864868, "logps/chosen": -1.152616262435913, "logps/rejected": -1.3062798976898193, "loss": 1.5602, "rewards/accuracies": 0.5625, "rewards/chosen": -1.152616262435913, "rewards/margins": 0.15366388857364655, "rewards/rejected": -1.3062798976898193, "semantic_entropy": 0.8152149319648743, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 6.994637866460833, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.17367422580718994, "logits/rejected": -0.07299056649208069, "logps/chosen": -1.214210033416748, "logps/rejected": -1.321740746498108, "loss": 1.6277, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.214210033416748, "rewards/margins": 0.1075306311249733, "rewards/rejected": -1.321740746498108, "semantic_entropy": 0.8269670605659485, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 8.488473547808432, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.004179264884442091, "logits/rejected": 0.06427419185638428, "logps/chosen": -1.18704092502594, "logps/rejected": -1.3910752534866333, "loss": 1.5989, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.18704092502594, "rewards/margins": 0.2040342539548874, "rewards/rejected": -1.3910752534866333, "semantic_entropy": 0.8237525224685669, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 4.397742700092215, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.17931561172008514, "logits/rejected": -0.04716377705335617, "logps/chosen": -1.1970975399017334, "logps/rejected": -1.2703498601913452, "loss": 1.608, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1970975399017334, "rewards/margins": 0.07325248420238495, "rewards/rejected": -1.2703498601913452, "semantic_entropy": 0.8217490315437317, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 5.622592093287334, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.11533653736114502, "logits/rejected": 0.001404840499162674, "logps/chosen": -1.1734718084335327, "logps/rejected": -1.2877283096313477, "loss": 1.5836, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1734718084335327, "rewards/margins": 0.11425647884607315, "rewards/rejected": -1.2877283096313477, "semantic_entropy": 0.8202381134033203, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 3.4372133090695494, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.10103825479745865, "logits/rejected": 0.019106391817331314, "logps/chosen": -1.2589111328125, "logps/rejected": -1.3125823736190796, "loss": 1.6691, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2589111328125, "rewards/margins": 0.05367133021354675, "rewards/rejected": -1.3125823736190796, "semantic_entropy": 0.8203374743461609, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 6.919056823663009, "learning_rate": 5.147931662540144e-07, "logits/chosen": 0.014481092803180218, "logits/rejected": 0.10071420669555664, "logps/chosen": -1.2425141334533691, "logps/rejected": -1.2807538509368896, "loss": 1.6515, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2425141334533691, "rewards/margins": 0.03823966532945633, "rewards/rejected": -1.2807538509368896, "semantic_entropy": 0.8180323839187622, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 8.08494858220377, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.09809155762195587, "logits/rejected": -0.05222964286804199, "logps/chosen": -1.1516083478927612, "logps/rejected": -1.3431200981140137, "loss": 1.5579, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1516083478927612, "rewards/margins": 0.1915116310119629, "rewards/rejected": -1.3431200981140137, "semantic_entropy": 0.8126410245895386, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 5.5181087992313795, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.10511163622140884, "logits/rejected": -0.02133854851126671, "logps/chosen": -1.173319697380066, "logps/rejected": -1.2604511976242065, "loss": 1.5859, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.173319697380066, "rewards/margins": 0.08713146299123764, "rewards/rejected": -1.2604511976242065, "semantic_entropy": 0.8251628875732422, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 6.665935783523218, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.14194393157958984, "logits/rejected": -0.007206745445728302, "logps/chosen": -1.2372273206710815, "logps/rejected": -1.408913016319275, "loss": 1.6379, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2372273206710815, "rewards/margins": 0.17168568074703217, "rewards/rejected": -1.408913016319275, "semantic_entropy": 0.8012622594833374, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 4.575527053326428, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.1321101188659668, "logits/rejected": -0.01879306696355343, "logps/chosen": -1.1764107942581177, "logps/rejected": -1.36277437210083, "loss": 1.5881, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1764107942581177, "rewards/margins": 0.18636366724967957, "rewards/rejected": -1.36277437210083, "semantic_entropy": 0.8234395980834961, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 4.9263743971929985, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.1430414617061615, "logits/rejected": 0.002384514780715108, "logps/chosen": -1.2190344333648682, "logps/rejected": -1.381971001625061, "loss": 1.6277, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2190344333648682, "rewards/margins": 0.16293638944625854, "rewards/rejected": -1.381971001625061, "semantic_entropy": 0.8173419833183289, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 5.229639935401469, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.03433236852288246, "logits/rejected": -0.011723564937710762, "logps/chosen": -1.215117335319519, "logps/rejected": -1.4475805759429932, "loss": 1.6178, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.215117335319519, "rewards/margins": 0.23246312141418457, "rewards/rejected": -1.4475805759429932, "semantic_entropy": 0.8053798675537109, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 5.092667988584575, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.17678381502628326, "logits/rejected": -0.07962271571159363, "logps/chosen": -1.2139376401901245, "logps/rejected": -1.3598401546478271, "loss": 1.6203, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2139376401901245, "rewards/margins": 0.14590251445770264, "rewards/rejected": -1.3598401546478271, "semantic_entropy": 0.8128005266189575, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 5.191873722835031, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.17590485513210297, "logits/rejected": -0.12840516865253448, "logps/chosen": -1.1468520164489746, "logps/rejected": -1.3217122554779053, "loss": 1.5567, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1468520164489746, "rewards/margins": 0.17486020922660828, "rewards/rejected": -1.3217122554779053, "semantic_entropy": 0.8197919726371765, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 5.0111570951761495, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.23295195400714874, "logits/rejected": -0.07315609604120255, "logps/chosen": -1.1449476480484009, "logps/rejected": -1.2894866466522217, "loss": 1.5605, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1449476480484009, "rewards/margins": 0.1445390284061432, "rewards/rejected": -1.2894866466522217, "semantic_entropy": 0.8311077952384949, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 4.712561461054052, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.05673236772418022, "logits/rejected": -0.054450999945402145, "logps/chosen": -1.146056890487671, "logps/rejected": -1.2999672889709473, "loss": 1.5615, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.146056890487671, "rewards/margins": 0.15391036868095398, "rewards/rejected": -1.2999672889709473, "semantic_entropy": 0.8308122754096985, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 4.866125573145966, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.0447673536837101, "logits/rejected": 0.014107504859566689, "logps/chosen": -1.1855405569076538, "logps/rejected": -1.2995439767837524, "loss": 1.5953, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1855405569076538, "rewards/margins": 0.11400334537029266, "rewards/rejected": -1.2995439767837524, "semantic_entropy": 0.819513201713562, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 10.207099967014551, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.10861162841320038, "logits/rejected": 0.0035353421699255705, "logps/chosen": -1.195389986038208, "logps/rejected": -1.324704885482788, "loss": 1.6081, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.195389986038208, "rewards/margins": 0.12931494414806366, "rewards/rejected": -1.324704885482788, "semantic_entropy": 0.8253867030143738, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 7.504944446841151, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.2492876499891281, "logits/rejected": -0.12856172025203705, "logps/chosen": -1.243750810623169, "logps/rejected": -1.3395717144012451, "loss": 1.6463, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.243750810623169, "rewards/margins": 0.09582091867923737, "rewards/rejected": -1.3395717144012451, "semantic_entropy": 0.805180549621582, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 6.063718593480017, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.15436768531799316, "logits/rejected": -0.0006494149565696716, "logps/chosen": -1.2472807168960571, "logps/rejected": -1.3590681552886963, "loss": 1.6494, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2472807168960571, "rewards/margins": 0.11178745329380035, "rewards/rejected": -1.3590681552886963, "semantic_entropy": 0.8043055534362793, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 5.273175905434649, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.10299321264028549, "logits/rejected": -0.007911767810583115, "logps/chosen": -1.2196804285049438, "logps/rejected": -1.3927323818206787, "loss": 1.6306, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2196804285049438, "rewards/margins": 0.1730518639087677, "rewards/rejected": -1.3927323818206787, "semantic_entropy": 0.8217862248420715, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 4.06982587067828, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.03970777988433838, "logits/rejected": 0.021127009764313698, "logps/chosen": -1.2253153324127197, "logps/rejected": -1.4284207820892334, "loss": 1.6228, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2253153324127197, "rewards/margins": 0.2031053602695465, "rewards/rejected": -1.4284207820892334, "semantic_entropy": 0.795028030872345, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 4.305002656277895, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.16761167347431183, "logits/rejected": -0.04943222552537918, "logps/chosen": -1.2858798503875732, "logps/rejected": -1.4015756845474243, "loss": 1.6823, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2858798503875732, "rewards/margins": 0.11569585651159286, "rewards/rejected": -1.4015756845474243, "semantic_entropy": 0.7928104400634766, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 6.672438752384622, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.09022007882595062, "logits/rejected": -0.00973374955356121, "logps/chosen": -1.2549588680267334, "logps/rejected": -1.4177494049072266, "loss": 1.6541, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2549588680267334, "rewards/margins": 0.16279056668281555, "rewards/rejected": -1.4177494049072266, "semantic_entropy": 0.798285961151123, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 5.847742975652068, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.0824538916349411, "logits/rejected": 0.02332053892314434, "logps/chosen": -1.2845314741134644, "logps/rejected": -1.407356858253479, "loss": 1.6859, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2845314741134644, "rewards/margins": 0.12282538414001465, "rewards/rejected": -1.407356858253479, "semantic_entropy": 0.8026834726333618, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 5.5488402368386485, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.042258985340595245, "logits/rejected": -0.0013355724513530731, "logps/chosen": -1.280870795249939, "logps/rejected": -1.3795171976089478, "loss": 1.6775, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.280870795249939, "rewards/margins": 0.09864632785320282, "rewards/rejected": -1.3795171976089478, "semantic_entropy": 0.7932527661323547, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 4.099034867813307, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.29895660281181335, "logits/rejected": -0.16551679372787476, "logps/chosen": -1.1626226902008057, "logps/rejected": -1.2347488403320312, "loss": 1.5759, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1626226902008057, "rewards/margins": 0.0721263661980629, "rewards/rejected": -1.2347488403320312, "semantic_entropy": 0.8265777826309204, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 5.579087611631961, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.06860321760177612, "logits/rejected": 0.04703530669212341, "logps/chosen": -1.2178609371185303, "logps/rejected": -1.2500449419021606, "loss": 1.6261, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2178609371185303, "rewards/margins": 0.0321841724216938, "rewards/rejected": -1.2500449419021606, "semantic_entropy": 0.8165380358695984, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 4.6236758995458365, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.21539707481861115, "logits/rejected": -0.11026833206415176, "logps/chosen": -1.2884395122528076, "logps/rejected": -1.4137952327728271, "loss": 1.6907, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2884395122528076, "rewards/margins": 0.12535569071769714, "rewards/rejected": -1.4137952327728271, "semantic_entropy": 0.8044666051864624, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 6.559079051177834, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.23214061558246613, "logits/rejected": -0.13806509971618652, "logps/chosen": -1.1555263996124268, "logps/rejected": -1.3288911581039429, "loss": 1.5635, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1555263996124268, "rewards/margins": 0.17336486279964447, "rewards/rejected": -1.3288911581039429, "semantic_entropy": 0.8158619999885559, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 6.329149217469804, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.1698877513408661, "logits/rejected": -0.1539454162120819, "logps/chosen": -1.1881452798843384, "logps/rejected": -1.3873047828674316, "loss": 1.5903, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1881452798843384, "rewards/margins": 0.19915971159934998, "rewards/rejected": -1.3873047828674316, "semantic_entropy": 0.8042497634887695, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 4.726041508662298, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.19326838850975037, "logits/rejected": -0.11986947059631348, "logps/chosen": -1.2500478029251099, "logps/rejected": -1.3514444828033447, "loss": 1.6526, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2500478029251099, "rewards/margins": 0.10139666497707367, "rewards/rejected": -1.3514444828033447, "semantic_entropy": 0.8051587343215942, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 4.598050463313362, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.1591026484966278, "logits/rejected": -0.07732214033603668, "logps/chosen": -1.229172945022583, "logps/rejected": -1.3371316194534302, "loss": 1.6334, "rewards/accuracies": 0.53125, "rewards/chosen": -1.229172945022583, "rewards/margins": 0.10795873403549194, "rewards/rejected": -1.3371316194534302, "semantic_entropy": 0.8085302114486694, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 6.029095422144384, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.19477275013923645, "logits/rejected": -0.1219252496957779, "logps/chosen": -1.1716020107269287, "logps/rejected": -1.3204153776168823, "loss": 1.5781, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1716020107269287, "rewards/margins": 0.14881326258182526, "rewards/rejected": -1.3204153776168823, "semantic_entropy": 0.8129401206970215, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 3.8956233365092072, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.07562275975942612, "logits/rejected": -0.02932235598564148, "logps/chosen": -1.2428587675094604, "logps/rejected": -1.412390947341919, "loss": 1.647, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2428587675094604, "rewards/margins": 0.16953222453594208, "rewards/rejected": -1.412390947341919, "semantic_entropy": 0.8082922101020813, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 5.909717564981051, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.11245963722467422, "logits/rejected": -0.09732633829116821, "logps/chosen": -1.132175087928772, "logps/rejected": -1.2739183902740479, "loss": 1.549, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.132175087928772, "rewards/margins": 0.14174345135688782, "rewards/rejected": -1.2739183902740479, "semantic_entropy": 0.8336782455444336, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 6.476439952452069, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.18529292941093445, "logits/rejected": -0.06773144006729126, "logps/chosen": -1.190319538116455, "logps/rejected": -1.3888801336288452, "loss": 1.6071, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.190319538116455, "rewards/margins": 0.19856056571006775, "rewards/rejected": -1.3888801336288452, "semantic_entropy": 0.8334620594978333, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 3.7052022123494672, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.2057032585144043, "logits/rejected": -0.11081378161907196, "logps/chosen": -1.1898534297943115, "logps/rejected": -1.4005017280578613, "loss": 1.5986, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1898534297943115, "rewards/margins": 0.2106482982635498, "rewards/rejected": -1.4005017280578613, "semantic_entropy": 0.8174898028373718, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 5.561979050896484, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.12655028700828552, "logits/rejected": 0.01055997982621193, "logps/chosen": -1.2290369272232056, "logps/rejected": -1.4747931957244873, "loss": 1.6298, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2290369272232056, "rewards/margins": 0.24575646221637726, "rewards/rejected": -1.4747931957244873, "semantic_entropy": 0.8014976382255554, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.15706905722618103, "eval_logits/rejected": 0.2306501567363739, "eval_logps/chosen": -1.2683799266815186, "eval_logps/rejected": -1.4016380310058594, "eval_loss": 1.6719552278518677, "eval_rewards/accuracies": 0.5482195615768433, "eval_rewards/chosen": -1.2683799266815186, "eval_rewards/margins": 0.1332581490278244, "eval_rewards/rejected": -1.4016380310058594, "eval_runtime": 34.1129, "eval_samples_per_second": 39.428, "eval_semantic_entropy": 0.8068575859069824, "eval_steps_per_second": 9.879, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 4.030609492345825, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.2222311943769455, "logits/rejected": -0.12223508208990097, "logps/chosen": -1.2507129907608032, "logps/rejected": -1.3963932991027832, "loss": 1.6544, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2507129907608032, "rewards/margins": 0.14568038284778595, "rewards/rejected": -1.3963932991027832, "semantic_entropy": 0.8073335886001587, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 6.799594237899346, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.21741971373558044, "logits/rejected": -0.13557752966880798, "logps/chosen": -1.2192527055740356, "logps/rejected": -1.3542286157608032, "loss": 1.6243, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2192527055740356, "rewards/margins": 0.13497599959373474, "rewards/rejected": -1.3542286157608032, "semantic_entropy": 0.8101743459701538, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 6.109447811257887, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.17854872345924377, "logits/rejected": -0.07427726686000824, "logps/chosen": -1.1879421472549438, "logps/rejected": -1.2928428649902344, "loss": 1.6037, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1879421472549438, "rewards/margins": 0.10490091145038605, "rewards/rejected": -1.2928428649902344, "semantic_entropy": 0.8316082954406738, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 4.693544488806758, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.1722884476184845, "logits/rejected": 0.0007740765577182174, "logps/chosen": -1.2750649452209473, "logps/rejected": -1.3824412822723389, "loss": 1.6835, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2750649452209473, "rewards/margins": 0.10737638175487518, "rewards/rejected": -1.3824412822723389, "semantic_entropy": 0.8169659376144409, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 6.5195095075345995, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.18015733361244202, "logits/rejected": -0.061840254813432693, "logps/chosen": -1.141385555267334, "logps/rejected": -1.324022650718689, "loss": 1.5632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.141385555267334, "rewards/margins": 0.18263721466064453, "rewards/rejected": -1.324022650718689, "semantic_entropy": 0.8435813784599304, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 7.411344164829411, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.1762259304523468, "logits/rejected": -0.05209894850850105, "logps/chosen": -1.189832091331482, "logps/rejected": -1.3646304607391357, "loss": 1.5985, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.189832091331482, "rewards/margins": 0.17479829490184784, "rewards/rejected": -1.3646304607391357, "semantic_entropy": 0.8172465562820435, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 6.249454271712939, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.16369444131851196, "logits/rejected": -0.12465399503707886, "logps/chosen": -1.2152215242385864, "logps/rejected": -1.3639843463897705, "loss": 1.6269, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2152215242385864, "rewards/margins": 0.14876294136047363, "rewards/rejected": -1.3639843463897705, "semantic_entropy": 0.8232955932617188, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 4.785132877559013, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.0706116110086441, "logits/rejected": 0.009426767006516457, "logps/chosen": -1.1597034931182861, "logps/rejected": -1.36921226978302, "loss": 1.5769, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1597034931182861, "rewards/margins": 0.2095087468624115, "rewards/rejected": -1.36921226978302, "semantic_entropy": 0.8344395756721497, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 6.049887797833276, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.2660774886608124, "logits/rejected": -0.15233579277992249, "logps/chosen": -1.1617317199707031, "logps/rejected": -1.3076413869857788, "loss": 1.5795, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1617317199707031, "rewards/margins": 0.14590972661972046, "rewards/rejected": -1.3076413869857788, "semantic_entropy": 0.8356338739395142, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 4.323210711916206, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.06329520791769028, "logits/rejected": 0.032438650727272034, "logps/chosen": -1.2195427417755127, "logps/rejected": -1.4048019647598267, "loss": 1.6321, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2195427417755127, "rewards/margins": 0.18525920808315277, "rewards/rejected": -1.4048019647598267, "semantic_entropy": 0.8251484632492065, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 6.627695564052719, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.0887155830860138, "logits/rejected": -0.023407921195030212, "logps/chosen": -1.3042999505996704, "logps/rejected": -1.3267024755477905, "loss": 1.7095, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3042999505996704, "rewards/margins": 0.022402595728635788, "rewards/rejected": -1.3267024755477905, "semantic_entropy": 0.8103273510932922, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 5.757735024886385, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.24507632851600647, "logits/rejected": -0.13562801480293274, "logps/chosen": -1.263668179512024, "logps/rejected": -1.3243052959442139, "loss": 1.6718, "rewards/accuracies": 0.53125, "rewards/chosen": -1.263668179512024, "rewards/margins": 0.06063701957464218, "rewards/rejected": -1.3243052959442139, "semantic_entropy": 0.8162403106689453, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 7.391297614423148, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.14886632561683655, "logits/rejected": -0.06549838930368423, "logps/chosen": -1.1530205011367798, "logps/rejected": -1.3352727890014648, "loss": 1.5639, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1530205011367798, "rewards/margins": 0.18225237727165222, "rewards/rejected": -1.3352727890014648, "semantic_entropy": 0.821846604347229, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 5.967592455163711, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.1192823201417923, "logits/rejected": 0.021567070856690407, "logps/chosen": -1.1176789999008179, "logps/rejected": -1.3391681909561157, "loss": 1.53, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1176789999008179, "rewards/margins": 0.22148919105529785, "rewards/rejected": -1.3391681909561157, "semantic_entropy": 0.8246389627456665, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 4.29101357646068, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.14104005694389343, "logits/rejected": 0.11650697141885757, "logps/chosen": -1.2440601587295532, "logps/rejected": -1.2841408252716064, "loss": 1.6583, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2440601587295532, "rewards/margins": 0.040080759674310684, "rewards/rejected": -1.2841408252716064, "semantic_entropy": 0.8285083770751953, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 7.617697270479064, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.13221105933189392, "logits/rejected": -0.06275731325149536, "logps/chosen": -1.1802763938903809, "logps/rejected": -1.3162603378295898, "loss": 1.598, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1802763938903809, "rewards/margins": 0.13598410785198212, "rewards/rejected": -1.3162603378295898, "semantic_entropy": 0.8355461955070496, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 5.244082096875538, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.17659877240657806, "logits/rejected": -0.004963183309882879, "logps/chosen": -1.179945707321167, "logps/rejected": -1.3663866519927979, "loss": 1.5873, "rewards/accuracies": 0.5625, "rewards/chosen": -1.179945707321167, "rewards/margins": 0.186441108584404, "rewards/rejected": -1.3663866519927979, "semantic_entropy": 0.8147540092468262, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 6.21446486779921, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.11461041122674942, "logits/rejected": -0.023071039468050003, "logps/chosen": -1.1895140409469604, "logps/rejected": -1.2419681549072266, "loss": 1.6036, "rewards/accuracies": 0.46875, "rewards/chosen": -1.1895140409469604, "rewards/margins": 0.05245404690504074, "rewards/rejected": -1.2419681549072266, "semantic_entropy": 0.8282513618469238, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 5.822419158417961, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.15038305521011353, "logits/rejected": -0.1075802817940712, "logps/chosen": -1.2781537771224976, "logps/rejected": -1.3294751644134521, "loss": 1.6816, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2781537771224976, "rewards/margins": 0.05132138729095459, "rewards/rejected": -1.3294751644134521, "semantic_entropy": 0.806951642036438, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 5.643391220674146, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.22628605365753174, "logits/rejected": -0.12314046919345856, "logps/chosen": -1.2772160768508911, "logps/rejected": -1.3669157028198242, "loss": 1.6738, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2772160768508911, "rewards/margins": 0.08969976007938385, "rewards/rejected": -1.3669157028198242, "semantic_entropy": 0.7931562662124634, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 6.738768877948948, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.22219625115394592, "logits/rejected": -0.11699037253856659, "logps/chosen": -1.2453372478485107, "logps/rejected": -1.3203930854797363, "loss": 1.6526, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2453372478485107, "rewards/margins": 0.07505574822425842, "rewards/rejected": -1.3203930854797363, "semantic_entropy": 0.8145149350166321, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 5.421139483378023, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.06719529628753662, "logits/rejected": -0.03887028247117996, "logps/chosen": -1.204026699066162, "logps/rejected": -1.3320423364639282, "loss": 1.6151, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.204026699066162, "rewards/margins": 0.12801559269428253, "rewards/rejected": -1.3320423364639282, "semantic_entropy": 0.822232723236084, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 5.4296710030815545, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.25129789113998413, "logits/rejected": -0.14234867691993713, "logps/chosen": -1.1753960847854614, "logps/rejected": -1.3111867904663086, "loss": 1.5894, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1753960847854614, "rewards/margins": 0.13579079508781433, "rewards/rejected": -1.3111867904663086, "semantic_entropy": 0.8280342221260071, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 4.945267570357546, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.15285399556159973, "logits/rejected": -0.006856948137283325, "logps/chosen": -1.200858473777771, "logps/rejected": -1.353650450706482, "loss": 1.6158, "rewards/accuracies": 0.59375, "rewards/chosen": -1.200858473777771, "rewards/margins": 0.15279200673103333, "rewards/rejected": -1.353650450706482, "semantic_entropy": 0.8299810290336609, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 6.175314163064279, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.25768908858299255, "logits/rejected": -0.10906098037958145, "logps/chosen": -1.2218049764633179, "logps/rejected": -1.3719913959503174, "loss": 1.6324, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2218049764633179, "rewards/margins": 0.15018633008003235, "rewards/rejected": -1.3719913959503174, "semantic_entropy": 0.8211399912834167, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 5.435453918857955, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.15011705458164215, "logits/rejected": -0.06333153694868088, "logps/chosen": -1.1230441331863403, "logps/rejected": -1.322990894317627, "loss": 1.5466, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1230441331863403, "rewards/margins": 0.19994691014289856, "rewards/rejected": -1.322990894317627, "semantic_entropy": 0.8471616506576538, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 7.409299605711857, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.1594366431236267, "logits/rejected": -0.02245226502418518, "logps/chosen": -1.2408745288848877, "logps/rejected": -1.3627347946166992, "loss": 1.6514, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2408745288848877, "rewards/margins": 0.12186039984226227, "rewards/rejected": -1.3627347946166992, "semantic_entropy": 0.8209710121154785, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 6.206736843510111, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.11758165061473846, "logits/rejected": 0.000500044203363359, "logps/chosen": -1.1889066696166992, "logps/rejected": -1.3342405557632446, "loss": 1.5986, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1889066696166992, "rewards/margins": 0.1453338861465454, "rewards/rejected": -1.3342405557632446, "semantic_entropy": 0.819352924823761, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 5.221874827286934, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.13973954319953918, "logits/rejected": -0.008200794458389282, "logps/chosen": -1.1194616556167603, "logps/rejected": -1.2684390544891357, "loss": 1.5371, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1194616556167603, "rewards/margins": 0.14897751808166504, "rewards/rejected": -1.2684390544891357, "semantic_entropy": 0.8353075981140137, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 6.082659565461534, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.259488046169281, "logits/rejected": -0.14523951709270477, "logps/chosen": -1.292336106300354, "logps/rejected": -1.3654285669326782, "loss": 1.6998, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.292336106300354, "rewards/margins": 0.07309224456548691, "rewards/rejected": -1.3654285669326782, "semantic_entropy": 0.8149253129959106, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 5.261518193808437, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.007036385126411915, "logits/rejected": 0.023227611556649208, "logps/chosen": -1.2631580829620361, "logps/rejected": -1.3087761402130127, "loss": 1.6642, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2631580829620361, "rewards/margins": 0.04561809077858925, "rewards/rejected": -1.3087761402130127, "semantic_entropy": 0.8021450042724609, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 6.805558653696994, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.13760831952095032, "logits/rejected": 0.03529058396816254, "logps/chosen": -1.1655330657958984, "logps/rejected": -1.2747875452041626, "loss": 1.5766, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1655330657958984, "rewards/margins": 0.10925441980361938, "rewards/rejected": -1.2747875452041626, "semantic_entropy": 0.8221963047981262, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 4.034864413971433, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.1979226917028427, "logits/rejected": -0.07164677232503891, "logps/chosen": -1.2039868831634521, "logps/rejected": -1.3445204496383667, "loss": 1.6146, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2039868831634521, "rewards/margins": 0.14053377509117126, "rewards/rejected": -1.3445204496383667, "semantic_entropy": 0.8212072253227234, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 4.150548498071516, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.15216335654258728, "logits/rejected": -0.027900153771042824, "logps/chosen": -1.1925677061080933, "logps/rejected": -1.3101913928985596, "loss": 1.6137, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1925677061080933, "rewards/margins": 0.11762356758117676, "rewards/rejected": -1.3101913928985596, "semantic_entropy": 0.8422799110412598, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 6.143413582312531, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.018914466723799706, "logits/rejected": 0.012826879508793354, "logps/chosen": -1.2293373346328735, "logps/rejected": -1.262089729309082, "loss": 1.6499, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2293373346328735, "rewards/margins": 0.0327523872256279, "rewards/rejected": -1.262089729309082, "semantic_entropy": 0.8411647081375122, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 5.346007692704446, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.21355560421943665, "logits/rejected": -0.09527961909770966, "logps/chosen": -1.2023805379867554, "logps/rejected": -1.3750451803207397, "loss": 1.6114, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2023805379867554, "rewards/margins": 0.17266473174095154, "rewards/rejected": -1.3750451803207397, "semantic_entropy": 0.8181254267692566, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 5.459434475610684, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.17607930302619934, "logits/rejected": -0.15211963653564453, "logps/chosen": -1.2619664669036865, "logps/rejected": -1.3837125301361084, "loss": 1.6697, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2619664669036865, "rewards/margins": 0.12174613773822784, "rewards/rejected": -1.3837125301361084, "semantic_entropy": 0.815564751625061, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 7.7057283295115875, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.1297137588262558, "logits/rejected": -0.04311042279005051, "logps/chosen": -1.2701561450958252, "logps/rejected": -1.2976572513580322, "loss": 1.6738, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2701561450958252, "rewards/margins": 0.027501001954078674, "rewards/rejected": -1.2976572513580322, "semantic_entropy": 0.8072202801704407, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 5.812303256812474, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.17803992331027985, "logits/rejected": -0.10949493944644928, "logps/chosen": -1.154779314994812, "logps/rejected": -1.3430436849594116, "loss": 1.5752, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.154779314994812, "rewards/margins": 0.18826429545879364, "rewards/rejected": -1.3430436849594116, "semantic_entropy": 0.8407546877861023, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 5.5090717915898555, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.13978919386863708, "logits/rejected": -0.04207531362771988, "logps/chosen": -1.2176260948181152, "logps/rejected": -1.2981302738189697, "loss": 1.6254, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2176260948181152, "rewards/margins": 0.0805041640996933, "rewards/rejected": -1.2981302738189697, "semantic_entropy": 0.8155018091201782, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 7.191145626150399, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.1555519998073578, "logits/rejected": -0.03399502485990524, "logps/chosen": -1.209133267402649, "logps/rejected": -1.3701066970825195, "loss": 1.6182, "rewards/accuracies": 0.53125, "rewards/chosen": -1.209133267402649, "rewards/margins": 0.1609734743833542, "rewards/rejected": -1.3701066970825195, "semantic_entropy": 0.8181806802749634, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 5.99190372570237, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.16096758842468262, "logits/rejected": 0.011413075029850006, "logps/chosen": -1.3497059345245361, "logps/rejected": -1.417412281036377, "loss": 1.7341, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3497059345245361, "rewards/margins": 0.0677061676979065, "rewards/rejected": -1.417412281036377, "semantic_entropy": 0.7687610983848572, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 5.511328779254243, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.14211739599704742, "logits/rejected": -0.06177808716893196, "logps/chosen": -1.194837212562561, "logps/rejected": -1.361581563949585, "loss": 1.6045, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.194837212562561, "rewards/margins": 0.16674433648586273, "rewards/rejected": -1.361581563949585, "semantic_entropy": 0.8193379640579224, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 7.364789263395226, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.08699332177639008, "logits/rejected": 0.007392974104732275, "logps/chosen": -1.2145850658416748, "logps/rejected": -1.4298367500305176, "loss": 1.6061, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2145850658416748, "rewards/margins": 0.2152515947818756, "rewards/rejected": -1.4298367500305176, "semantic_entropy": 0.7831075191497803, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 4.3787969221110625, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.15131120383739471, "logits/rejected": -0.12497417628765106, "logps/chosen": -1.1981700658798218, "logps/rejected": -1.3269027471542358, "loss": 1.615, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1981700658798218, "rewards/margins": 0.12873251736164093, "rewards/rejected": -1.3269027471542358, "semantic_entropy": 0.8337594866752625, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 6.087648022138966, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.15585334599018097, "logits/rejected": -0.05552041530609131, "logps/chosen": -1.2685312032699585, "logps/rejected": -1.3782851696014404, "loss": 1.665, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2685312032699585, "rewards/margins": 0.10975408554077148, "rewards/rejected": -1.3782851696014404, "semantic_entropy": 0.7930231094360352, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 7.543392857978031, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.16407601535320282, "logits/rejected": -0.10204075276851654, "logps/chosen": -1.223729133605957, "logps/rejected": -1.4076542854309082, "loss": 1.625, "rewards/accuracies": 0.59375, "rewards/chosen": -1.223729133605957, "rewards/margins": 0.18392536044120789, "rewards/rejected": -1.4076542854309082, "semantic_entropy": 0.8024584650993347, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 7.042123108168308, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.1879500299692154, "logits/rejected": -0.1410655528306961, "logps/chosen": -1.141558051109314, "logps/rejected": -1.309556245803833, "loss": 1.5546, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.141558051109314, "rewards/margins": 0.16799823939800262, "rewards/rejected": -1.309556245803833, "semantic_entropy": 0.8261731266975403, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 6.091780971379791, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.20251493155956268, "logits/rejected": -0.004118237178772688, "logps/chosen": -1.1863311529159546, "logps/rejected": -1.3247241973876953, "loss": 1.6006, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1863311529159546, "rewards/margins": 0.13839302957057953, "rewards/rejected": -1.3247241973876953, "semantic_entropy": 0.8285509943962097, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 7.430725656577851, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.2952393889427185, "logits/rejected": -0.14281456172466278, "logps/chosen": -1.2142690420150757, "logps/rejected": -1.4030681848526, "loss": 1.6131, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2142690420150757, "rewards/margins": 0.1887992024421692, "rewards/rejected": -1.4030681848526, "semantic_entropy": 0.7976645231246948, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 5.114175693583057, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.15127256512641907, "logits/rejected": -0.021896863356232643, "logps/chosen": -1.3812482357025146, "logps/rejected": -1.5103403329849243, "loss": 1.7581, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.3812482357025146, "rewards/margins": 0.1290920078754425, "rewards/rejected": -1.5103403329849243, "semantic_entropy": 0.7537614107131958, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 5.552643993407361, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.05006720870733261, "logits/rejected": 0.05411955714225769, "logps/chosen": -1.1849968433380127, "logps/rejected": -1.3443763256072998, "loss": 1.5913, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1849968433380127, "rewards/margins": 0.1593795120716095, "rewards/rejected": -1.3443763256072998, "semantic_entropy": 0.8126962780952454, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 4.927719893461688, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.22501492500305176, "logits/rejected": -0.11417822539806366, "logps/chosen": -1.1695407629013062, "logps/rejected": -1.3125121593475342, "loss": 1.5916, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1695407629013062, "rewards/margins": 0.14297160506248474, "rewards/rejected": -1.3125121593475342, "semantic_entropy": 0.8440596461296082, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 6.2838812283136924, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.2139137089252472, "logits/rejected": -0.13356271386146545, "logps/chosen": -1.2886669635772705, "logps/rejected": -1.3861716985702515, "loss": 1.6867, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2886669635772705, "rewards/margins": 0.09750493615865707, "rewards/rejected": -1.3861716985702515, "semantic_entropy": 0.7960596084594727, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 4.846291904369144, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.2536435127258301, "logits/rejected": -0.19954252243041992, "logps/chosen": -1.2243876457214355, "logps/rejected": -1.3743317127227783, "loss": 1.6312, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2243876457214355, "rewards/margins": 0.14994411170482635, "rewards/rejected": -1.3743317127227783, "semantic_entropy": 0.8136740922927856, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 6.264211145351103, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.10900802910327911, "logits/rejected": -0.08469494432210922, "logps/chosen": -1.247909426689148, "logps/rejected": -1.4390449523925781, "loss": 1.6456, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.247909426689148, "rewards/margins": 0.19113555550575256, "rewards/rejected": -1.4390449523925781, "semantic_entropy": 0.7954090237617493, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 4.171459339447132, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.16354867815971375, "logits/rejected": -0.07365956902503967, "logps/chosen": -1.1997108459472656, "logps/rejected": -1.340932846069336, "loss": 1.6076, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1997108459472656, "rewards/margins": 0.14122198522090912, "rewards/rejected": -1.340932846069336, "semantic_entropy": 0.8156986236572266, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 5.36808656236982, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.14484265446662903, "logits/rejected": -0.04628019034862518, "logps/chosen": -1.2837755680084229, "logps/rejected": -1.3195767402648926, "loss": 1.6935, "rewards/accuracies": 0.5, "rewards/chosen": -1.2837755680084229, "rewards/margins": 0.035801153630018234, "rewards/rejected": -1.3195767402648926, "semantic_entropy": 0.8193610906600952, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 5.841780489604164, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.24346332252025604, "logits/rejected": -0.10360114276409149, "logps/chosen": -1.2137044668197632, "logps/rejected": -1.3534233570098877, "loss": 1.6265, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2137044668197632, "rewards/margins": 0.13971878588199615, "rewards/rejected": -1.3534233570098877, "semantic_entropy": 0.8255079388618469, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 5.917866226071935, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.16130122542381287, "logits/rejected": -0.03269638493657112, "logps/chosen": -1.2572224140167236, "logps/rejected": -1.4397376775741577, "loss": 1.662, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2572224140167236, "rewards/margins": 0.18251541256904602, "rewards/rejected": -1.4397376775741577, "semantic_entropy": 0.8095139265060425, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 5.938773320369447, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.15208014845848083, "logits/rejected": -0.0583471953868866, "logps/chosen": -1.264249563217163, "logps/rejected": -1.4046415090560913, "loss": 1.6651, "rewards/accuracies": 0.5625, "rewards/chosen": -1.264249563217163, "rewards/margins": 0.14039184153079987, "rewards/rejected": -1.4046415090560913, "semantic_entropy": 0.8017934560775757, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 7.437046187474604, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.17367902398109436, "logits/rejected": -0.12024188041687012, "logps/chosen": -1.191589593887329, "logps/rejected": -1.32552969455719, "loss": 1.5998, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.191589593887329, "rewards/margins": 0.13394011557102203, "rewards/rejected": -1.32552969455719, "semantic_entropy": 0.816363513469696, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 5.016861920842301, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.16461606323719025, "logits/rejected": 0.0016112476587295532, "logps/chosen": -1.2526557445526123, "logps/rejected": -1.4273823499679565, "loss": 1.667, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2526557445526123, "rewards/margins": 0.1747264713048935, "rewards/rejected": -1.4273823499679565, "semantic_entropy": 0.8287761807441711, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 5.743447405245681, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.2051958590745926, "logits/rejected": -0.09191267937421799, "logps/chosen": -1.2507450580596924, "logps/rejected": -1.4062261581420898, "loss": 1.6476, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2507450580596924, "rewards/margins": 0.15548093616962433, "rewards/rejected": -1.4062261581420898, "semantic_entropy": 0.7936326265335083, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 3.822538284743097, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.12189755588769913, "logits/rejected": 0.045930854976177216, "logps/chosen": -1.2094333171844482, "logps/rejected": -1.3646732568740845, "loss": 1.621, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2094333171844482, "rewards/margins": 0.1552397906780243, "rewards/rejected": -1.3646732568740845, "semantic_entropy": 0.8230690956115723, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 5.642299323857447, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.13524536788463593, "logits/rejected": -0.07214657217264175, "logps/chosen": -1.1971880197525024, "logps/rejected": -1.3882826566696167, "loss": 1.6039, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1971880197525024, "rewards/margins": 0.19109483063220978, "rewards/rejected": -1.3882826566696167, "semantic_entropy": 0.8134153485298157, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 5.022108338218041, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.2366868257522583, "logits/rejected": -0.12684670090675354, "logps/chosen": -1.1533561944961548, "logps/rejected": -1.2362805604934692, "loss": 1.5668, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.1533561944961548, "rewards/margins": 0.0829242467880249, "rewards/rejected": -1.2362805604934692, "semantic_entropy": 0.8268747329711914, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 6.562619181454215, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.1783335953950882, "logits/rejected": 0.02293567545711994, "logps/chosen": -1.1941397190093994, "logps/rejected": -1.3688085079193115, "loss": 1.6013, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1941397190093994, "rewards/margins": 0.1746688038110733, "rewards/rejected": -1.3688085079193115, "semantic_entropy": 0.8144105672836304, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 6.264675632308237, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.31508368253707886, "logits/rejected": -0.09736363589763641, "logps/chosen": -1.2091516256332397, "logps/rejected": -1.27398681640625, "loss": 1.6298, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2091516256332397, "rewards/margins": 0.06483516842126846, "rewards/rejected": -1.27398681640625, "semantic_entropy": 0.8412978053092957, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 5.05903838466378, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.14795942604541779, "logits/rejected": -0.07155384868383408, "logps/chosen": -1.1945841312408447, "logps/rejected": -1.3057942390441895, "loss": 1.6216, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1945841312408447, "rewards/margins": 0.11120996624231339, "rewards/rejected": -1.3057942390441895, "semantic_entropy": 0.8539379239082336, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 5.225350345222494, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.23703277111053467, "logits/rejected": -0.11489404737949371, "logps/chosen": -1.1428353786468506, "logps/rejected": -1.3061351776123047, "loss": 1.5539, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1428353786468506, "rewards/margins": 0.16329970955848694, "rewards/rejected": -1.3061351776123047, "semantic_entropy": 0.8222171068191528, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 8.63783284070978, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.14546352624893188, "logits/rejected": -0.08400402963161469, "logps/chosen": -1.2439192533493042, "logps/rejected": -1.3931090831756592, "loss": 1.654, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2439192533493042, "rewards/margins": 0.14918965101242065, "rewards/rejected": -1.3931090831756592, "semantic_entropy": 0.8201648592948914, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 4.832301971395009, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.15857051312923431, "logits/rejected": -0.11423659324645996, "logps/chosen": -1.1624062061309814, "logps/rejected": -1.3315969705581665, "loss": 1.5772, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1624062061309814, "rewards/margins": 0.1691906452178955, "rewards/rejected": -1.3315969705581665, "semantic_entropy": 0.8296812772750854, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 6.163495417785928, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.20783457159996033, "logits/rejected": -0.09797810763120651, "logps/chosen": -1.2113168239593506, "logps/rejected": -1.3645415306091309, "loss": 1.6208, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2113168239593506, "rewards/margins": 0.15322482585906982, "rewards/rejected": -1.3645415306091309, "semantic_entropy": 0.8190408945083618, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 5.682176803651159, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.09180374443531036, "logits/rejected": 0.007614802569150925, "logps/chosen": -1.1562148332595825, "logps/rejected": -1.317445993423462, "loss": 1.5749, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1562148332595825, "rewards/margins": 0.16123108565807343, "rewards/rejected": -1.317445993423462, "semantic_entropy": 0.8374015092849731, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 4.99972286117323, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.15539932250976562, "logits/rejected": -0.11838625371456146, "logps/chosen": -1.1824909448623657, "logps/rejected": -1.3795429468154907, "loss": 1.5932, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1824909448623657, "rewards/margins": 0.1970519721508026, "rewards/rejected": -1.3795429468154907, "semantic_entropy": 0.8215069770812988, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 6.796175913320517, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.1097722202539444, "logits/rejected": -0.0737352967262268, "logps/chosen": -1.2802150249481201, "logps/rejected": -1.3931413888931274, "loss": 1.6675, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2802150249481201, "rewards/margins": 0.11292636394500732, "rewards/rejected": -1.3931413888931274, "semantic_entropy": 0.7745494246482849, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 6.79632770470215, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.13162429630756378, "logits/rejected": -0.06141994148492813, "logps/chosen": -1.2405445575714111, "logps/rejected": -1.3443615436553955, "loss": 1.647, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2405445575714111, "rewards/margins": 0.10381714254617691, "rewards/rejected": -1.3443615436553955, "semantic_entropy": 0.8129959106445312, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 4.764040465211182, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.2631062865257263, "logits/rejected": -0.08462729305028915, "logps/chosen": -1.1859188079833984, "logps/rejected": -1.3421556949615479, "loss": 1.5989, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1859188079833984, "rewards/margins": 0.1562369465827942, "rewards/rejected": -1.3421556949615479, "semantic_entropy": 0.8260003328323364, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 8.85953959080172, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.12442197650671005, "logits/rejected": 0.03357607498764992, "logps/chosen": -1.1649069786071777, "logps/rejected": -1.3341917991638184, "loss": 1.5782, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1649069786071777, "rewards/margins": 0.16928483545780182, "rewards/rejected": -1.3341917991638184, "semantic_entropy": 0.8265705108642578, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.038508813828229904, "eval_logits/rejected": 0.10378167033195496, "eval_logps/chosen": -1.2673684358596802, "eval_logps/rejected": -1.40103280544281, "eval_loss": 1.6708446741104126, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2673684358596802, "eval_rewards/margins": 0.1336645632982254, "eval_rewards/rejected": -1.40103280544281, "eval_runtime": 34.1071, "eval_samples_per_second": 39.435, "eval_semantic_entropy": 0.8066647052764893, "eval_steps_per_second": 9.881, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 6.482551124397992, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.2395254671573639, "logits/rejected": -0.21277913451194763, "logps/chosen": -1.233805775642395, "logps/rejected": -1.3033500909805298, "loss": 1.6466, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.233805775642395, "rewards/margins": 0.06954426318407059, "rewards/rejected": -1.3033500909805298, "semantic_entropy": 0.8255395889282227, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 4.534318610252998, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.1707824319601059, "logits/rejected": -0.07940541952848434, "logps/chosen": -1.1924688816070557, "logps/rejected": -1.3495910167694092, "loss": 1.5974, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1924688816070557, "rewards/margins": 0.15712225437164307, "rewards/rejected": -1.3495910167694092, "semantic_entropy": 0.8097764849662781, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 4.293140368488616, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.11060144007205963, "logits/rejected": -0.021043527871370316, "logps/chosen": -1.2954049110412598, "logps/rejected": -1.3638674020767212, "loss": 1.6968, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2954049110412598, "rewards/margins": 0.06846243143081665, "rewards/rejected": -1.3638674020767212, "semantic_entropy": 0.8028589487075806, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 8.112722215617302, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.1413435935974121, "logits/rejected": -0.08158606290817261, "logps/chosen": -1.2285503149032593, "logps/rejected": -1.301710844039917, "loss": 1.6333, "rewards/accuracies": 0.5, "rewards/chosen": -1.2285503149032593, "rewards/margins": 0.07316039502620697, "rewards/rejected": -1.301710844039917, "semantic_entropy": 0.8095226287841797, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 6.929849748346605, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.10768525302410126, "logits/rejected": -0.09338851273059845, "logps/chosen": -1.2079904079437256, "logps/rejected": -1.4147666692733765, "loss": 1.6127, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2079904079437256, "rewards/margins": 0.2067762166261673, "rewards/rejected": -1.4147666692733765, "semantic_entropy": 0.8094978332519531, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 5.251327183156706, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.19211602210998535, "logits/rejected": -0.15069589018821716, "logps/chosen": -1.2270386219024658, "logps/rejected": -1.3500269651412964, "loss": 1.6337, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2270386219024658, "rewards/margins": 0.12298832833766937, "rewards/rejected": -1.3500269651412964, "semantic_entropy": 0.8133300542831421, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 6.5882848774451706, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.21079416573047638, "logits/rejected": -0.14149264991283417, "logps/chosen": -1.2503912448883057, "logps/rejected": -1.375687837600708, "loss": 1.6511, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2503912448883057, "rewards/margins": 0.1252966821193695, "rewards/rejected": -1.375687837600708, "semantic_entropy": 0.8014103174209595, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 5.592801915864075, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.16907276213169098, "logits/rejected": -0.0674993246793747, "logps/chosen": -1.337431788444519, "logps/rejected": -1.3838446140289307, "loss": 1.7267, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.337431788444519, "rewards/margins": 0.04641289263963699, "rewards/rejected": -1.3838446140289307, "semantic_entropy": 0.7786122560501099, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 4.364277386942955, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.2151741087436676, "logits/rejected": -0.11096904426813126, "logps/chosen": -1.3248178958892822, "logps/rejected": -1.42057204246521, "loss": 1.7157, "rewards/accuracies": 0.5, "rewards/chosen": -1.3248178958892822, "rewards/margins": 0.0957542434334755, "rewards/rejected": -1.42057204246521, "semantic_entropy": 0.7817228436470032, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 4.8416290465701985, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.2221498191356659, "logits/rejected": -0.13794749975204468, "logps/chosen": -1.2023205757141113, "logps/rejected": -1.3233706951141357, "loss": 1.6128, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2023205757141113, "rewards/margins": 0.12105011940002441, "rewards/rejected": -1.3233706951141357, "semantic_entropy": 0.820940375328064, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 6.260679060562593, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.19244083762168884, "logits/rejected": -0.07393975555896759, "logps/chosen": -1.2655022144317627, "logps/rejected": -1.3712488412857056, "loss": 1.6708, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2655022144317627, "rewards/margins": 0.10574676096439362, "rewards/rejected": -1.3712488412857056, "semantic_entropy": 0.810564398765564, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 7.892391799766181, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.22627322375774384, "logits/rejected": -0.12204619497060776, "logps/chosen": -1.1559617519378662, "logps/rejected": -1.2586816549301147, "loss": 1.5675, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1559617519378662, "rewards/margins": 0.10271978378295898, "rewards/rejected": -1.2586816549301147, "semantic_entropy": 0.8231647610664368, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 6.575754831979405, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.15279237926006317, "logits/rejected": -0.04916970804333687, "logps/chosen": -1.1837831735610962, "logps/rejected": -1.3049086332321167, "loss": 1.5971, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1837831735610962, "rewards/margins": 0.12112554162740707, "rewards/rejected": -1.3049086332321167, "semantic_entropy": 0.8265365362167358, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 5.375923543346594, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.21697697043418884, "logits/rejected": -0.09994576126337051, "logps/chosen": -1.2299638986587524, "logps/rejected": -1.3828942775726318, "loss": 1.634, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2299638986587524, "rewards/margins": 0.15293069183826447, "rewards/rejected": -1.3828942775726318, "semantic_entropy": 0.8081141710281372, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 7.213305586639739, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.17418284714221954, "logits/rejected": -0.06238603591918945, "logps/chosen": -1.248923897743225, "logps/rejected": -1.479159951210022, "loss": 1.6506, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.248923897743225, "rewards/margins": 0.23023590445518494, "rewards/rejected": -1.479159951210022, "semantic_entropy": 0.8033756017684937, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 7.07555006943867, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.1886719912290573, "logits/rejected": -0.0970771461725235, "logps/chosen": -1.0972247123718262, "logps/rejected": -1.2721375226974487, "loss": 1.5176, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.0972247123718262, "rewards/margins": 0.17491288483142853, "rewards/rejected": -1.2721375226974487, "semantic_entropy": 0.840740978717804, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 6.200014868797879, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.2204626351594925, "logits/rejected": -0.16195020079612732, "logps/chosen": -1.2046582698822021, "logps/rejected": -1.3792825937271118, "loss": 1.6098, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2046582698822021, "rewards/margins": 0.1746242642402649, "rewards/rejected": -1.3792825937271118, "semantic_entropy": 0.8101860880851746, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 6.89406407570187, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.16586263477802277, "logits/rejected": -0.043767429888248444, "logps/chosen": -1.2317785024642944, "logps/rejected": -1.3680107593536377, "loss": 1.6333, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.2317785024642944, "rewards/margins": 0.13623221218585968, "rewards/rejected": -1.3680107593536377, "semantic_entropy": 0.802967369556427, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 4.735693998799008, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.13649600744247437, "logits/rejected": -0.11805985122919083, "logps/chosen": -1.2810300588607788, "logps/rejected": -1.475710153579712, "loss": 1.6737, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2810300588607788, "rewards/margins": 0.1946801245212555, "rewards/rejected": -1.475710153579712, "semantic_entropy": 0.7853351831436157, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 5.174744707830051, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.10247603803873062, "logits/rejected": -0.015486510470509529, "logps/chosen": -1.200918436050415, "logps/rejected": -1.3600919246673584, "loss": 1.61, "rewards/accuracies": 0.625, "rewards/chosen": -1.200918436050415, "rewards/margins": 0.15917348861694336, "rewards/rejected": -1.3600919246673584, "semantic_entropy": 0.8182055354118347, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 6.670138639530281, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.16784876585006714, "logits/rejected": -0.1122293695807457, "logps/chosen": -1.2634930610656738, "logps/rejected": -1.4016636610031128, "loss": 1.6716, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2634930610656738, "rewards/margins": 0.1381704956293106, "rewards/rejected": -1.4016636610031128, "semantic_entropy": 0.8161520957946777, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 5.474503597579559, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.07121769338846207, "logits/rejected": -0.03368829935789108, "logps/chosen": -1.2650954723358154, "logps/rejected": -1.3728530406951904, "loss": 1.6658, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2650954723358154, "rewards/margins": 0.10775750875473022, "rewards/rejected": -1.3728530406951904, "semantic_entropy": 0.8013364672660828, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 4.12747953299853, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.11438391357660294, "logits/rejected": -0.05041012167930603, "logps/chosen": -1.2487103939056396, "logps/rejected": -1.381676197052002, "loss": 1.6573, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2487103939056396, "rewards/margins": 0.1329658329486847, "rewards/rejected": -1.381676197052002, "semantic_entropy": 0.817214846611023, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 4.777629572627635, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.16425888240337372, "logits/rejected": -0.10339069366455078, "logps/chosen": -1.200781226158142, "logps/rejected": -1.352173924446106, "loss": 1.6024, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.200781226158142, "rewards/margins": 0.15139265358448029, "rewards/rejected": -1.352173924446106, "semantic_entropy": 0.8031978607177734, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 5.829597505798078, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.21849043667316437, "logits/rejected": -0.1395927369594574, "logps/chosen": -1.2062909603118896, "logps/rejected": -1.319591760635376, "loss": 1.6234, "rewards/accuracies": 0.5, "rewards/chosen": -1.2062909603118896, "rewards/margins": 0.11330103874206543, "rewards/rejected": -1.319591760635376, "semantic_entropy": 0.8342618942260742, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 6.077517250073789, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.11190253496170044, "logits/rejected": -0.009822065010666847, "logps/chosen": -1.1696685552597046, "logps/rejected": -1.3574388027191162, "loss": 1.5783, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1696685552597046, "rewards/margins": 0.187770277261734, "rewards/rejected": -1.3574388027191162, "semantic_entropy": 0.8172141909599304, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 5.755464900387589, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.09369830787181854, "logits/rejected": -0.03624532371759415, "logps/chosen": -1.3062700033187866, "logps/rejected": -1.3613829612731934, "loss": 1.7089, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3062700033187866, "rewards/margins": 0.05511295795440674, "rewards/rejected": -1.3613829612731934, "semantic_entropy": 0.8052226901054382, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 5.491574237922123, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.0747738778591156, "logits/rejected": -0.048963237553834915, "logps/chosen": -1.2854511737823486, "logps/rejected": -1.3538997173309326, "loss": 1.6865, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2854511737823486, "rewards/margins": 0.06844840198755264, "rewards/rejected": -1.3538997173309326, "semantic_entropy": 0.8021589517593384, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 4.689242383554069, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.15412834286689758, "logits/rejected": -0.06884375214576721, "logps/chosen": -1.2361555099487305, "logps/rejected": -1.354602336883545, "loss": 1.6528, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2361555099487305, "rewards/margins": 0.11844678223133087, "rewards/rejected": -1.354602336883545, "semantic_entropy": 0.8332095146179199, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 5.778444378975784, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.19410383701324463, "logits/rejected": -0.1257806122303009, "logps/chosen": -1.1432162523269653, "logps/rejected": -1.3668062686920166, "loss": 1.5632, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1432162523269653, "rewards/margins": 0.22359001636505127, "rewards/rejected": -1.3668062686920166, "semantic_entropy": 0.8399227261543274, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 5.395990044209341, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.17290571331977844, "logits/rejected": -0.09452417492866516, "logps/chosen": -1.1539366245269775, "logps/rejected": -1.4179812669754028, "loss": 1.5619, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1539366245269775, "rewards/margins": 0.2640445828437805, "rewards/rejected": -1.4179812669754028, "semantic_entropy": 0.8159976005554199, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 5.911322747002084, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.2552509009838104, "logits/rejected": -0.24411804974079132, "logps/chosen": -1.2113406658172607, "logps/rejected": -1.3777507543563843, "loss": 1.6179, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2113406658172607, "rewards/margins": 0.1664101779460907, "rewards/rejected": -1.3777507543563843, "semantic_entropy": 0.8131579160690308, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 6.911455402667682, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.12070800364017487, "logits/rejected": 0.02008051797747612, "logps/chosen": -1.183771014213562, "logps/rejected": -1.417751669883728, "loss": 1.5974, "rewards/accuracies": 0.625, "rewards/chosen": -1.183771014213562, "rewards/margins": 0.23398077487945557, "rewards/rejected": -1.417751669883728, "semantic_entropy": 0.8273506164550781, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 6.929319187977619, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.2543033957481384, "logits/rejected": -0.12824256718158722, "logps/chosen": -1.1335548162460327, "logps/rejected": -1.4909895658493042, "loss": 1.5449, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1335548162460327, "rewards/margins": 0.3574346899986267, "rewards/rejected": -1.4909895658493042, "semantic_entropy": 0.8227757215499878, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 4.267854236443263, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.19758227467536926, "logits/rejected": -0.15474224090576172, "logps/chosen": -1.1823774576187134, "logps/rejected": -1.3160388469696045, "loss": 1.592, "rewards/accuracies": 0.46875, "rewards/chosen": -1.1823774576187134, "rewards/margins": 0.13366149365901947, "rewards/rejected": -1.3160388469696045, "semantic_entropy": 0.8192998766899109, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 5.991885030124014, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.14701411128044128, "logits/rejected": -0.12248693406581879, "logps/chosen": -1.2217414379119873, "logps/rejected": -1.3971037864685059, "loss": 1.6245, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2217414379119873, "rewards/margins": 0.17536231875419617, "rewards/rejected": -1.3971037864685059, "semantic_entropy": 0.8054389953613281, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 5.815312792752397, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.1834588199853897, "logits/rejected": -0.09105970710515976, "logps/chosen": -1.2431914806365967, "logps/rejected": -1.4880024194717407, "loss": 1.6434, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2431914806365967, "rewards/margins": 0.24481067061424255, "rewards/rejected": -1.4880024194717407, "semantic_entropy": 0.8003460168838501, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 4.7538001904372935, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.22037498652935028, "logits/rejected": -0.0798591896891594, "logps/chosen": -1.1921221017837524, "logps/rejected": -1.3540624380111694, "loss": 1.5936, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1921221017837524, "rewards/margins": 0.16194020211696625, "rewards/rejected": -1.3540624380111694, "semantic_entropy": 0.8028610944747925, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 4.789472607346182, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.18207648396492004, "logits/rejected": -0.16313223540782928, "logps/chosen": -1.1706411838531494, "logps/rejected": -1.3130903244018555, "loss": 1.5862, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1706411838531494, "rewards/margins": 0.14244937896728516, "rewards/rejected": -1.3130903244018555, "semantic_entropy": 0.8312179446220398, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 6.514402007553702, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.15355579555034637, "logits/rejected": -0.05637284368276596, "logps/chosen": -1.2523047924041748, "logps/rejected": -1.450862169265747, "loss": 1.6474, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2523047924041748, "rewards/margins": 0.19855746626853943, "rewards/rejected": -1.450862169265747, "semantic_entropy": 0.7901880145072937, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 6.3263903867735465, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.11148805916309357, "logits/rejected": -0.018462086096405983, "logps/chosen": -1.1543951034545898, "logps/rejected": -1.359931468963623, "loss": 1.5718, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1543951034545898, "rewards/margins": 0.20553644001483917, "rewards/rejected": -1.359931468963623, "semantic_entropy": 0.8347604870796204, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 5.873980503388585, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.06418409198522568, "logits/rejected": 0.008335360325872898, "logps/chosen": -1.186737298965454, "logps/rejected": -1.3718688488006592, "loss": 1.5911, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.186737298965454, "rewards/margins": 0.18513131141662598, "rewards/rejected": -1.3718688488006592, "semantic_entropy": 0.8088175654411316, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 5.508516891516533, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.2559741139411926, "logits/rejected": -0.15002498030662537, "logps/chosen": -1.2175886631011963, "logps/rejected": -1.315505027770996, "loss": 1.6279, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2175886631011963, "rewards/margins": 0.09791664779186249, "rewards/rejected": -1.315505027770996, "semantic_entropy": 0.8206413984298706, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 7.6696329541020125, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.22907774150371552, "logits/rejected": -0.06115894764661789, "logps/chosen": -1.277761459350586, "logps/rejected": -1.2731486558914185, "loss": 1.6797, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.277761459350586, "rewards/margins": -0.0046127974055707455, "rewards/rejected": -1.2731486558914185, "semantic_entropy": 0.8038827776908875, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 6.021922629305392, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.19161252677440643, "logits/rejected": -0.1032009944319725, "logps/chosen": -1.1881550550460815, "logps/rejected": -1.2905912399291992, "loss": 1.5972, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1881550550460815, "rewards/margins": 0.10243628174066544, "rewards/rejected": -1.2905912399291992, "semantic_entropy": 0.8180972933769226, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 6.178012719810023, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.2568252682685852, "logits/rejected": -0.143551304936409, "logps/chosen": -1.2573485374450684, "logps/rejected": -1.4436380863189697, "loss": 1.6509, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2573485374450684, "rewards/margins": 0.18628954887390137, "rewards/rejected": -1.4436380863189697, "semantic_entropy": 0.7871571779251099, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 5.140311348874454, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.1734830141067505, "logits/rejected": -0.07045549154281616, "logps/chosen": -1.251528024673462, "logps/rejected": -1.3976876735687256, "loss": 1.6558, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.251528024673462, "rewards/margins": 0.14615947008132935, "rewards/rejected": -1.3976876735687256, "semantic_entropy": 0.808489203453064, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 6.6154511247097565, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.12721320986747742, "logits/rejected": -0.0483049601316452, "logps/chosen": -1.127101182937622, "logps/rejected": -1.4103493690490723, "loss": 1.5425, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.127101182937622, "rewards/margins": 0.28324824571609497, "rewards/rejected": -1.4103493690490723, "semantic_entropy": 0.8307526707649231, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 5.4904760187301465, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.294910192489624, "logits/rejected": -0.10399266332387924, "logps/chosen": -1.2637052536010742, "logps/rejected": -1.439330816268921, "loss": 1.6617, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2637052536010742, "rewards/margins": 0.1756257712841034, "rewards/rejected": -1.439330816268921, "semantic_entropy": 0.7959521412849426, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 5.510688174077247, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.1835429072380066, "logits/rejected": -0.05256185680627823, "logps/chosen": -1.2593345642089844, "logps/rejected": -1.3756139278411865, "loss": 1.6649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2593345642089844, "rewards/margins": 0.11627931892871857, "rewards/rejected": -1.3756139278411865, "semantic_entropy": 0.8110499382019043, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 6.223447691398862, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.14808690547943115, "logits/rejected": 0.003693693783134222, "logps/chosen": -1.1979299783706665, "logps/rejected": -1.4187183380126953, "loss": 1.6088, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1979299783706665, "rewards/margins": 0.22078831493854523, "rewards/rejected": -1.4187183380126953, "semantic_entropy": 0.8217847943305969, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 4.559489574288957, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.17574508488178253, "logits/rejected": -0.03929417207837105, "logps/chosen": -1.2104413509368896, "logps/rejected": -1.3232942819595337, "loss": 1.624, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2104413509368896, "rewards/margins": 0.11285281181335449, "rewards/rejected": -1.3232942819595337, "semantic_entropy": 0.8270261883735657, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 7.65623251312822, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.2637561559677124, "logits/rejected": -0.10861550271511078, "logps/chosen": -1.254930019378662, "logps/rejected": -1.3255680799484253, "loss": 1.6594, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.254930019378662, "rewards/margins": 0.07063815742731094, "rewards/rejected": -1.3255680799484253, "semantic_entropy": 0.8089202046394348, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 6.141321091564854, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.18153631687164307, "logits/rejected": -0.1639922559261322, "logps/chosen": -1.1136391162872314, "logps/rejected": -1.2336668968200684, "loss": 1.5421, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1136391162872314, "rewards/margins": 0.12002797424793243, "rewards/rejected": -1.2336668968200684, "semantic_entropy": 0.857016921043396, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 6.557154389995247, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.1902744472026825, "logits/rejected": -0.08390746265649796, "logps/chosen": -1.1837255954742432, "logps/rejected": -1.3694044351577759, "loss": 1.5825, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1837255954742432, "rewards/margins": 0.18567882478237152, "rewards/rejected": -1.3694044351577759, "semantic_entropy": 0.7975504398345947, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 5.08135124827265, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.18824735283851624, "logits/rejected": -0.06400458514690399, "logps/chosen": -1.1156243085861206, "logps/rejected": -1.2793570756912231, "loss": 1.5405, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1156243085861206, "rewards/margins": 0.16373273730278015, "rewards/rejected": -1.2793570756912231, "semantic_entropy": 0.8496578931808472, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 5.363320850477034, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.24391797184944153, "logits/rejected": -0.13319388031959534, "logps/chosen": -1.1656526327133179, "logps/rejected": -1.350420594215393, "loss": 1.5812, "rewards/accuracies": 0.5, "rewards/chosen": -1.1656526327133179, "rewards/margins": 0.1847679167985916, "rewards/rejected": -1.350420594215393, "semantic_entropy": 0.8310827016830444, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 5.600037932287894, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.15581141412258148, "logits/rejected": -0.08389084041118622, "logps/chosen": -1.1992270946502686, "logps/rejected": -1.3070600032806396, "loss": 1.604, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1992270946502686, "rewards/margins": 0.10783299058675766, "rewards/rejected": -1.3070600032806396, "semantic_entropy": 0.8095453381538391, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 6.301243983920328, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.2042999565601349, "logits/rejected": -0.014372768811881542, "logps/chosen": -1.1586798429489136, "logps/rejected": -1.305345058441162, "loss": 1.5669, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1586798429489136, "rewards/margins": 0.14666512608528137, "rewards/rejected": -1.305345058441162, "semantic_entropy": 0.8165403604507446, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 5.576184345036018, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.07925567775964737, "logits/rejected": -0.08834794908761978, "logps/chosen": -1.1824979782104492, "logps/rejected": -1.329949140548706, "loss": 1.5992, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1824979782104492, "rewards/margins": 0.14745107293128967, "rewards/rejected": -1.329949140548706, "semantic_entropy": 0.8333484530448914, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 5.016867476246942, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.17539267241954803, "logits/rejected": -0.09553857147693634, "logps/chosen": -1.1988788843154907, "logps/rejected": -1.3043853044509888, "loss": 1.6172, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1988788843154907, "rewards/margins": 0.10550644248723984, "rewards/rejected": -1.3043853044509888, "semantic_entropy": 0.8367107510566711, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 5.872866095686595, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.09808802604675293, "logits/rejected": -0.012594198808073997, "logps/chosen": -1.1075093746185303, "logps/rejected": -1.2916184663772583, "loss": 1.5245, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1075093746185303, "rewards/margins": 0.18410901725292206, "rewards/rejected": -1.2916184663772583, "semantic_entropy": 0.8339918255805969, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 4.120086934898274, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.2076595276594162, "logits/rejected": -0.08024019002914429, "logps/chosen": -1.2108917236328125, "logps/rejected": -1.4296541213989258, "loss": 1.6178, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2108917236328125, "rewards/margins": 0.21876242756843567, "rewards/rejected": -1.4296541213989258, "semantic_entropy": 0.8138198852539062, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 4.591815632551328, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.2165403813123703, "logits/rejected": -0.08812186121940613, "logps/chosen": -1.2249259948730469, "logps/rejected": -1.4245822429656982, "loss": 1.6327, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2249259948730469, "rewards/margins": 0.199656143784523, "rewards/rejected": -1.4245822429656982, "semantic_entropy": 0.8154486417770386, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 4.7338220119389645, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.18992513418197632, "logits/rejected": -0.09159987419843674, "logps/chosen": -1.1564725637435913, "logps/rejected": -1.3397786617279053, "loss": 1.5588, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1564725637435913, "rewards/margins": 0.18330605328083038, "rewards/rejected": -1.3397786617279053, "semantic_entropy": 0.8046928644180298, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 4.285139370278244, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.27960288524627686, "logits/rejected": -0.18352502584457397, "logps/chosen": -1.2469394207000732, "logps/rejected": -1.3242943286895752, "loss": 1.6541, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2469394207000732, "rewards/margins": 0.07735507190227509, "rewards/rejected": -1.3242943286895752, "semantic_entropy": 0.8142436742782593, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 5.923035655047226, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.1432248204946518, "logits/rejected": -0.10125571489334106, "logps/chosen": -1.1850011348724365, "logps/rejected": -1.401366114616394, "loss": 1.5916, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1850011348724365, "rewards/margins": 0.21636483073234558, "rewards/rejected": -1.401366114616394, "semantic_entropy": 0.8131051063537598, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 6.495244344118805, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.1723155826330185, "logits/rejected": -0.14587955176830292, "logps/chosen": -1.1003910303115845, "logps/rejected": -1.4250515699386597, "loss": 1.5197, "rewards/accuracies": 0.625, "rewards/chosen": -1.1003910303115845, "rewards/margins": 0.3246605396270752, "rewards/rejected": -1.4250515699386597, "semantic_entropy": 0.8385760188102722, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 6.196094117254832, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.17971453070640564, "logits/rejected": -0.0378548726439476, "logps/chosen": -1.2348908185958862, "logps/rejected": -1.334380865097046, "loss": 1.6387, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2348908185958862, "rewards/margins": 0.09949006885290146, "rewards/rejected": -1.334380865097046, "semantic_entropy": 0.8076288104057312, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 5.336551954494671, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.12708039581775665, "logits/rejected": -0.004563375376164913, "logps/chosen": -1.1842257976531982, "logps/rejected": -1.27142333984375, "loss": 1.6023, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1842257976531982, "rewards/margins": 0.08719761669635773, "rewards/rejected": -1.27142333984375, "semantic_entropy": 0.8360783457756042, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 6.374692590628831, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.24820616841316223, "logits/rejected": -0.18374893069267273, "logps/chosen": -1.137328863143921, "logps/rejected": -1.2954647541046143, "loss": 1.5524, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.137328863143921, "rewards/margins": 0.158135786652565, "rewards/rejected": -1.2954647541046143, "semantic_entropy": 0.8302084803581238, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 9.17952831490863, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.16721966862678528, "logits/rejected": -0.07405487447977066, "logps/chosen": -1.2248809337615967, "logps/rejected": -1.4471752643585205, "loss": 1.6201, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2248809337615967, "rewards/margins": 0.22229428589344025, "rewards/rejected": -1.4471752643585205, "semantic_entropy": 0.7905289530754089, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 5.413734546411877, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.1232418417930603, "logits/rejected": -0.040323495864868164, "logps/chosen": -1.1384468078613281, "logps/rejected": -1.373263955116272, "loss": 1.5577, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1384468078613281, "rewards/margins": 0.234817236661911, "rewards/rejected": -1.373263955116272, "semantic_entropy": 0.8385455012321472, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 5.279149671531881, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.19218100607395172, "logits/rejected": -0.11850804090499878, "logps/chosen": -1.1703581809997559, "logps/rejected": -1.3823697566986084, "loss": 1.57, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1703581809997559, "rewards/margins": 0.21201157569885254, "rewards/rejected": -1.3823697566986084, "semantic_entropy": 0.79924076795578, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 4.473288913062101, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.09848616272211075, "logits/rejected": -0.03416893631219864, "logps/chosen": -1.0744106769561768, "logps/rejected": -1.3005332946777344, "loss": 1.4797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0744106769561768, "rewards/margins": 0.22612233459949493, "rewards/rejected": -1.3005332946777344, "semantic_entropy": 0.8106128573417664, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 5.863968283550726, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.24405434727668762, "logits/rejected": -0.19618570804595947, "logps/chosen": -1.099838137626648, "logps/rejected": -1.3882933855056763, "loss": 1.507, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.099838137626648, "rewards/margins": 0.2884552776813507, "rewards/rejected": -1.3882933855056763, "semantic_entropy": 0.814230740070343, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 8.402875772972521, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.24784092605113983, "logits/rejected": -0.16681021451950073, "logps/chosen": -1.1715034246444702, "logps/rejected": -1.3564860820770264, "loss": 1.5817, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1715034246444702, "rewards/margins": 0.1849825084209442, "rewards/rejected": -1.3564860820770264, "semantic_entropy": 0.8203716278076172, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 4.569468078199057, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.2595081329345703, "logits/rejected": -0.15864713490009308, "logps/chosen": -1.1593282222747803, "logps/rejected": -1.2965118885040283, "loss": 1.575, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1593282222747803, "rewards/margins": 0.13718362152576447, "rewards/rejected": -1.2965118885040283, "semantic_entropy": 0.831413745880127, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 5.038396328435748, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.3314756453037262, "logits/rejected": -0.13507400453090668, "logps/chosen": -1.1881109476089478, "logps/rejected": -1.4228702783584595, "loss": 1.5896, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1881109476089478, "rewards/margins": 0.23475944995880127, "rewards/rejected": -1.4228702783584595, "semantic_entropy": 0.8029600977897644, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 6.711683599460851, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.20440669357776642, "logits/rejected": -0.1178549975156784, "logps/chosen": -1.1627848148345947, "logps/rejected": -1.3498769998550415, "loss": 1.5741, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1627848148345947, "rewards/margins": 0.18709222972393036, "rewards/rejected": -1.3498769998550415, "semantic_entropy": 0.8226519823074341, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.10342211276292801, "eval_logits/rejected": 0.17440342903137207, "eval_logps/chosen": -1.267951250076294, "eval_logps/rejected": -1.4048658609390259, "eval_loss": 1.6708178520202637, "eval_rewards/accuracies": 0.5497032403945923, "eval_rewards/chosen": -1.267951250076294, "eval_rewards/margins": 0.13691446185112, "eval_rewards/rejected": -1.4048658609390259, "eval_runtime": 34.4659, "eval_samples_per_second": 39.024, "eval_semantic_entropy": 0.8054755330085754, "eval_steps_per_second": 9.778, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 5.262532967531314, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.21820764243602753, "logits/rejected": -0.04783173277974129, "logps/chosen": -1.3216861486434937, "logps/rejected": -1.4393781423568726, "loss": 1.7201, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3216861486434937, "rewards/margins": 0.11769211292266846, "rewards/rejected": -1.4393781423568726, "semantic_entropy": 0.7967873811721802, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 4.219033429970568, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.21247649192810059, "logits/rejected": -0.142580047249794, "logps/chosen": -1.2338751554489136, "logps/rejected": -1.3403874635696411, "loss": 1.6346, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2338751554489136, "rewards/margins": 0.10651236772537231, "rewards/rejected": -1.3403874635696411, "semantic_entropy": 0.8013619184494019, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 5.2005812956713955, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.23554973304271698, "logits/rejected": -0.03494542837142944, "logps/chosen": -1.2127416133880615, "logps/rejected": -1.3317720890045166, "loss": 1.6238, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2127416133880615, "rewards/margins": 0.11903029680252075, "rewards/rejected": -1.3317720890045166, "semantic_entropy": 0.82215416431427, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 4.179060712653269, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.28321409225463867, "logits/rejected": -0.1724487543106079, "logps/chosen": -1.2476980686187744, "logps/rejected": -1.3455796241760254, "loss": 1.6605, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2476980686187744, "rewards/margins": 0.09788144379854202, "rewards/rejected": -1.3455796241760254, "semantic_entropy": 0.8257006406784058, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 5.758610801053121, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.19225016236305237, "logits/rejected": -0.11318112909793854, "logps/chosen": -1.1326768398284912, "logps/rejected": -1.2687228918075562, "loss": 1.5474, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1326768398284912, "rewards/margins": 0.1360461413860321, "rewards/rejected": -1.2687228918075562, "semantic_entropy": 0.8293469548225403, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 6.5725769413582364, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.24895811080932617, "logits/rejected": -0.18902724981307983, "logps/chosen": -1.1415627002716064, "logps/rejected": -1.3662025928497314, "loss": 1.5469, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1415627002716064, "rewards/margins": 0.22463993728160858, "rewards/rejected": -1.3662025928497314, "semantic_entropy": 0.8107425570487976, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 6.233722596330523, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.23054400086402893, "logits/rejected": -0.20186567306518555, "logps/chosen": -1.169611930847168, "logps/rejected": -1.2342936992645264, "loss": 1.5846, "rewards/accuracies": 0.5625, "rewards/chosen": -1.169611930847168, "rewards/margins": 0.06468170881271362, "rewards/rejected": -1.2342936992645264, "semantic_entropy": 0.8299460411071777, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 6.707237465826684, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.23418477177619934, "logits/rejected": -0.2039678394794464, "logps/chosen": -1.134570837020874, "logps/rejected": -1.232934832572937, "loss": 1.5619, "rewards/accuracies": 0.5625, "rewards/chosen": -1.134570837020874, "rewards/margins": 0.09836404025554657, "rewards/rejected": -1.232934832572937, "semantic_entropy": 0.8547365069389343, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 7.079081550659214, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.1887328028678894, "logits/rejected": -0.09386876225471497, "logps/chosen": -1.2358038425445557, "logps/rejected": -1.3405746221542358, "loss": 1.6411, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2358038425445557, "rewards/margins": 0.10477089881896973, "rewards/rejected": -1.3405746221542358, "semantic_entropy": 0.8106830716133118, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 3.9138947322126003, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.25412747263908386, "logits/rejected": -0.17609265446662903, "logps/chosen": -1.2161505222320557, "logps/rejected": -1.30418860912323, "loss": 1.6154, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2161505222320557, "rewards/margins": 0.08803816139698029, "rewards/rejected": -1.30418860912323, "semantic_entropy": 0.798462450504303, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 5.169053848322378, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.1513521671295166, "logits/rejected": -0.10855606943368912, "logps/chosen": -1.1978471279144287, "logps/rejected": -1.3494360446929932, "loss": 1.6158, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1978471279144287, "rewards/margins": 0.15158873796463013, "rewards/rejected": -1.3494360446929932, "semantic_entropy": 0.8359264135360718, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 6.747097512716577, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.06469866633415222, "logits/rejected": 0.004696385934948921, "logps/chosen": -1.1217443943023682, "logps/rejected": -1.2025887966156006, "loss": 1.5504, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1217443943023682, "rewards/margins": 0.08084437996149063, "rewards/rejected": -1.2025887966156006, "semantic_entropy": 0.857334315776825, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 6.098195333856992, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.28054842352867126, "logits/rejected": -0.21356305480003357, "logps/chosen": -1.197876214981079, "logps/rejected": -1.389706015586853, "loss": 1.6125, "rewards/accuracies": 0.5625, "rewards/chosen": -1.197876214981079, "rewards/margins": 0.19182994961738586, "rewards/rejected": -1.389706015586853, "semantic_entropy": 0.8292211294174194, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 6.276837218226126, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.18982744216918945, "logits/rejected": -0.03638936206698418, "logps/chosen": -1.1508163213729858, "logps/rejected": -1.3158237934112549, "loss": 1.5714, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1508163213729858, "rewards/margins": 0.16500720381736755, "rewards/rejected": -1.3158237934112549, "semantic_entropy": 0.8411868810653687, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 5.406347299136873, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.21322894096374512, "logits/rejected": -0.07341529428958893, "logps/chosen": -1.2403171062469482, "logps/rejected": -1.3609035015106201, "loss": 1.6457, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2403171062469482, "rewards/margins": 0.12058645486831665, "rewards/rejected": -1.3609035015106201, "semantic_entropy": 0.8106715083122253, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 5.431805880356793, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.19288824498653412, "logits/rejected": -0.08635319769382477, "logps/chosen": -1.2276254892349243, "logps/rejected": -1.3734530210494995, "loss": 1.6257, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2276254892349243, "rewards/margins": 0.14582760632038116, "rewards/rejected": -1.3734530210494995, "semantic_entropy": 0.7962437272071838, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 4.57459664315818, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.1827898472547531, "logits/rejected": -0.10113134235143661, "logps/chosen": -1.2043447494506836, "logps/rejected": -1.3055150508880615, "loss": 1.6029, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2043447494506836, "rewards/margins": 0.10117051750421524, "rewards/rejected": -1.3055150508880615, "semantic_entropy": 0.7971200942993164, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 4.201405252077814, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.2338356077671051, "logits/rejected": -0.08818233013153076, "logps/chosen": -1.1827020645141602, "logps/rejected": -1.3149657249450684, "loss": 1.5865, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1827020645141602, "rewards/margins": 0.13226349651813507, "rewards/rejected": -1.3149657249450684, "semantic_entropy": 0.8076278567314148, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 5.414823178839107, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.21907532215118408, "logits/rejected": -0.15696975588798523, "logps/chosen": -1.269371747970581, "logps/rejected": -1.3269436359405518, "loss": 1.6651, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.269371747970581, "rewards/margins": 0.05757167190313339, "rewards/rejected": -1.3269436359405518, "semantic_entropy": 0.7914665341377258, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 6.3137373860371815, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.16755308210849762, "logits/rejected": -0.07352401316165924, "logps/chosen": -1.110514521598816, "logps/rejected": -1.2984755039215088, "loss": 1.5169, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.110514521598816, "rewards/margins": 0.18796098232269287, "rewards/rejected": -1.2984755039215088, "semantic_entropy": 0.8126772046089172, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 4.750232879434187, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.20642590522766113, "logits/rejected": -0.04566860944032669, "logps/chosen": -1.2400540113449097, "logps/rejected": -1.3274204730987549, "loss": 1.646, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2400540113449097, "rewards/margins": 0.0873665064573288, "rewards/rejected": -1.3274204730987549, "semantic_entropy": 0.8118304014205933, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 6.780631854903201, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.14376579225063324, "logits/rejected": -0.013981973752379417, "logps/chosen": -1.1632236242294312, "logps/rejected": -1.2992104291915894, "loss": 1.5795, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1632236242294312, "rewards/margins": 0.13598696887493134, "rewards/rejected": -1.2992104291915894, "semantic_entropy": 0.832513689994812, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 4.249818208070201, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.15985305607318878, "logits/rejected": -0.1411779820919037, "logps/chosen": -1.1830618381500244, "logps/rejected": -1.392379641532898, "loss": 1.6002, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1830618381500244, "rewards/margins": 0.20931783318519592, "rewards/rejected": -1.392379641532898, "semantic_entropy": 0.8342801928520203, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 3.467255799881492, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.18735571205615997, "logits/rejected": -0.14499355852603912, "logps/chosen": -1.136539101600647, "logps/rejected": -1.3206449747085571, "loss": 1.551, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.136539101600647, "rewards/margins": 0.184105783700943, "rewards/rejected": -1.3206449747085571, "semantic_entropy": 0.8288604617118835, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 3.835352710443175, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.230129212141037, "logits/rejected": -0.07808873057365417, "logps/chosen": -1.1623951196670532, "logps/rejected": -1.280320644378662, "loss": 1.5745, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1623951196670532, "rewards/margins": 0.11792556941509247, "rewards/rejected": -1.280320644378662, "semantic_entropy": 0.8242538571357727, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 5.845494846114426, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.19133403897285461, "logits/rejected": -0.1287078708410263, "logps/chosen": -1.1727728843688965, "logps/rejected": -1.363059401512146, "loss": 1.5872, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1727728843688965, "rewards/margins": 0.19028660655021667, "rewards/rejected": -1.363059401512146, "semantic_entropy": 0.828804612159729, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 8.772404039630908, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.20310905575752258, "logits/rejected": -0.21289415657520294, "logps/chosen": -1.211479902267456, "logps/rejected": -1.3541817665100098, "loss": 1.611, "rewards/accuracies": 0.5, "rewards/chosen": -1.211479902267456, "rewards/margins": 0.14270183444023132, "rewards/rejected": -1.3541817665100098, "semantic_entropy": 0.7990657091140747, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 4.942988929624094, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.2749512493610382, "logits/rejected": -0.20915856957435608, "logps/chosen": -1.1415526866912842, "logps/rejected": -1.350078821182251, "loss": 1.5662, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1415526866912842, "rewards/margins": 0.20852604508399963, "rewards/rejected": -1.350078821182251, "semantic_entropy": 0.84922856092453, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 4.932696780900478, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.14148059487342834, "logits/rejected": -0.03152935579419136, "logps/chosen": -1.2343857288360596, "logps/rejected": -1.436859130859375, "loss": 1.6328, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2343857288360596, "rewards/margins": 0.20247332751750946, "rewards/rejected": -1.436859130859375, "semantic_entropy": 0.7968149185180664, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 5.197315965712173, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.15177489817142487, "logits/rejected": -0.08891390264034271, "logps/chosen": -1.1240689754486084, "logps/rejected": -1.312318205833435, "loss": 1.541, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1240689754486084, "rewards/margins": 0.1882493793964386, "rewards/rejected": -1.312318205833435, "semantic_entropy": 0.8338991403579712, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 6.7953451825331594, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.1090053915977478, "logits/rejected": -0.1383194476366043, "logps/chosen": -1.175878882408142, "logps/rejected": -1.3272706270217896, "loss": 1.5876, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.175878882408142, "rewards/margins": 0.1513918787240982, "rewards/rejected": -1.3272706270217896, "semantic_entropy": 0.8234698176383972, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 5.802609834250621, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.19636309146881104, "logits/rejected": -0.0740322470664978, "logps/chosen": -1.2193504571914673, "logps/rejected": -1.3553255796432495, "loss": 1.6198, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2193504571914673, "rewards/margins": 0.13597525656223297, "rewards/rejected": -1.3553255796432495, "semantic_entropy": 0.800805926322937, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 4.903012905087021, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.19576838612556458, "logits/rejected": -0.11823102086782455, "logps/chosen": -1.219238519668579, "logps/rejected": -1.3857297897338867, "loss": 1.6257, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.219238519668579, "rewards/margins": 0.1664913296699524, "rewards/rejected": -1.3857297897338867, "semantic_entropy": 0.8128414154052734, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 5.529583935177261, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.12003302574157715, "logits/rejected": -0.08424254506826401, "logps/chosen": -1.2000020742416382, "logps/rejected": -1.341172218322754, "loss": 1.6029, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2000020742416382, "rewards/margins": 0.14117002487182617, "rewards/rejected": -1.341172218322754, "semantic_entropy": 0.8057316541671753, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 4.181145179799494, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.21061989665031433, "logits/rejected": -0.12768778204917908, "logps/chosen": -1.1794354915618896, "logps/rejected": -1.310697317123413, "loss": 1.5974, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1794354915618896, "rewards/margins": 0.1312616914510727, "rewards/rejected": -1.310697317123413, "semantic_entropy": 0.8358890414237976, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 5.445545835893754, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.22739848494529724, "logits/rejected": -0.04953371733427048, "logps/chosen": -1.1299711465835571, "logps/rejected": -1.3490208387374878, "loss": 1.5501, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1299711465835571, "rewards/margins": 0.2190495729446411, "rewards/rejected": -1.3490208387374878, "semantic_entropy": 0.8402570486068726, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 5.466768220656771, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.23618383705615997, "logits/rejected": -0.23938719928264618, "logps/chosen": -1.0995723009109497, "logps/rejected": -1.3170347213745117, "loss": 1.5146, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0995723009109497, "rewards/margins": 0.217462420463562, "rewards/rejected": -1.3170347213745117, "semantic_entropy": 0.8300333023071289, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 7.396925244310004, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.10360310971736908, "logits/rejected": -0.013956268317997456, "logps/chosen": -1.1834025382995605, "logps/rejected": -1.2840938568115234, "loss": 1.5967, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1834025382995605, "rewards/margins": 0.10069123655557632, "rewards/rejected": -1.2840938568115234, "semantic_entropy": 0.8266922831535339, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 4.2912074723466445, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.29206061363220215, "logits/rejected": -0.16530589759349823, "logps/chosen": -1.2177801132202148, "logps/rejected": -1.3011841773986816, "loss": 1.6145, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2177801132202148, "rewards/margins": 0.08340400457382202, "rewards/rejected": -1.3011841773986816, "semantic_entropy": 0.7934310436248779, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 5.75216945832429, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.04362577572464943, "logits/rejected": -0.06727404147386551, "logps/chosen": -1.221893072128296, "logps/rejected": -1.3949968814849854, "loss": 1.6226, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.221893072128296, "rewards/margins": 0.17310376465320587, "rewards/rejected": -1.3949968814849854, "semantic_entropy": 0.8014314770698547, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 6.100699676445913, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.24663782119750977, "logits/rejected": -0.18678709864616394, "logps/chosen": -1.1628715991973877, "logps/rejected": -1.3810681104660034, "loss": 1.5749, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1628715991973877, "rewards/margins": 0.2181965857744217, "rewards/rejected": -1.3810681104660034, "semantic_entropy": 0.8240032196044922, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 4.740577643367922, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.19729050993919373, "logits/rejected": -0.12399884313344955, "logps/chosen": -1.1655830144882202, "logps/rejected": -1.3972151279449463, "loss": 1.5719, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1655830144882202, "rewards/margins": 0.23163215816020966, "rewards/rejected": -1.3972151279449463, "semantic_entropy": 0.8125913739204407, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 5.316139932010603, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.21291694045066833, "logits/rejected": -0.12076542526483536, "logps/chosen": -1.202652931213379, "logps/rejected": -1.3000690937042236, "loss": 1.5994, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.202652931213379, "rewards/margins": 0.09741628170013428, "rewards/rejected": -1.3000690937042236, "semantic_entropy": 0.7935939431190491, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 5.531101028825615, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.23271489143371582, "logits/rejected": -0.0714816004037857, "logps/chosen": -1.1969115734100342, "logps/rejected": -1.311009168624878, "loss": 1.6106, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1969115734100342, "rewards/margins": 0.11409779638051987, "rewards/rejected": -1.311009168624878, "semantic_entropy": 0.8273962140083313, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 4.844433804919581, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.23300214111804962, "logits/rejected": -0.09987245500087738, "logps/chosen": -1.2218164205551147, "logps/rejected": -1.4145606756210327, "loss": 1.6243, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2218164205551147, "rewards/margins": 0.19274425506591797, "rewards/rejected": -1.4145606756210327, "semantic_entropy": 0.8049126863479614, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 5.786542299130992, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.23867681622505188, "logits/rejected": -0.18256641924381256, "logps/chosen": -1.2325165271759033, "logps/rejected": -1.4765574932098389, "loss": 1.6365, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2325165271759033, "rewards/margins": 0.24404099583625793, "rewards/rejected": -1.4765574932098389, "semantic_entropy": 0.8080425262451172, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 3.9897008232782465, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.20769242942333221, "logits/rejected": -0.10469029098749161, "logps/chosen": -1.2589566707611084, "logps/rejected": -1.2831064462661743, "loss": 1.6645, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -1.2589566707611084, "rewards/margins": 0.024149714037775993, "rewards/rejected": -1.2831064462661743, "semantic_entropy": 0.8110135793685913, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 5.542025441821374, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.18456456065177917, "logits/rejected": -0.09351705014705658, "logps/chosen": -1.2053933143615723, "logps/rejected": -1.258929967880249, "loss": 1.6135, "rewards/accuracies": 0.5, "rewards/chosen": -1.2053933143615723, "rewards/margins": 0.05353670194745064, "rewards/rejected": -1.258929967880249, "semantic_entropy": 0.8161219358444214, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 5.661371043341674, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.2513275444507599, "logits/rejected": -0.15836261212825775, "logps/chosen": -1.2086917161941528, "logps/rejected": -1.243709921836853, "loss": 1.6232, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2086917161941528, "rewards/margins": 0.035018257796764374, "rewards/rejected": -1.243709921836853, "semantic_entropy": 0.8289461135864258, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 3.92151000069052, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.15918885171413422, "logits/rejected": -0.025776857510209084, "logps/chosen": -1.1895248889923096, "logps/rejected": -1.296006202697754, "loss": 1.5969, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1895248889923096, "rewards/margins": 0.10648126900196075, "rewards/rejected": -1.296006202697754, "semantic_entropy": 0.8147109746932983, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 7.062103775610182, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.22439053654670715, "logits/rejected": -0.031128834933042526, "logps/chosen": -1.2116504907608032, "logps/rejected": -1.366734504699707, "loss": 1.6154, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2116504907608032, "rewards/margins": 0.1550840586423874, "rewards/rejected": -1.366734504699707, "semantic_entropy": 0.8074728846549988, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 6.1567653530044755, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.18798746168613434, "logits/rejected": -0.08310097455978394, "logps/chosen": -1.129688024520874, "logps/rejected": -1.3669233322143555, "loss": 1.5418, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.129688024520874, "rewards/margins": 0.23723526298999786, "rewards/rejected": -1.3669233322143555, "semantic_entropy": 0.8243156671524048, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 4.905362525551958, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.28541994094848633, "logits/rejected": -0.1555914431810379, "logps/chosen": -1.1916757822036743, "logps/rejected": -1.320718765258789, "loss": 1.6055, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1916757822036743, "rewards/margins": 0.12904289364814758, "rewards/rejected": -1.320718765258789, "semantic_entropy": 0.8276355862617493, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 5.287022884091319, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.11748915910720825, "logits/rejected": 0.08395910263061523, "logps/chosen": -1.1970247030258179, "logps/rejected": -1.4883216619491577, "loss": 1.6021, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1970247030258179, "rewards/margins": 0.29129716753959656, "rewards/rejected": -1.4883216619491577, "semantic_entropy": 0.8101539611816406, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 6.973651722101997, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.18683718144893646, "logits/rejected": -0.13036464154720306, "logps/chosen": -1.2510545253753662, "logps/rejected": -1.3982536792755127, "loss": 1.6435, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2510545253753662, "rewards/margins": 0.1471991389989853, "rewards/rejected": -1.3982536792755127, "semantic_entropy": 0.784953236579895, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 4.7797810090607555, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.19713839888572693, "logits/rejected": -0.14190594851970673, "logps/chosen": -1.1513960361480713, "logps/rejected": -1.2622439861297607, "loss": 1.5654, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1513960361480713, "rewards/margins": 0.11084787547588348, "rewards/rejected": -1.2622439861297607, "semantic_entropy": 0.8280984163284302, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 9.59056704594612, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.15194591879844666, "logits/rejected": -0.16308292746543884, "logps/chosen": -1.1238676309585571, "logps/rejected": -1.3060160875320435, "loss": 1.5459, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1238676309585571, "rewards/margins": 0.18214842677116394, "rewards/rejected": -1.3060160875320435, "semantic_entropy": 0.8440696597099304, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 5.691399916102227, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.2572895288467407, "logits/rejected": -0.11548779904842377, "logps/chosen": -1.2302570343017578, "logps/rejected": -1.2968885898590088, "loss": 1.6382, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2302570343017578, "rewards/margins": 0.06663177907466888, "rewards/rejected": -1.2968885898590088, "semantic_entropy": 0.8158594369888306, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 8.321429480155631, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.21559301018714905, "logits/rejected": -0.19064505398273468, "logps/chosen": -1.1839215755462646, "logps/rejected": -1.352809190750122, "loss": 1.597, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1839215755462646, "rewards/margins": 0.16888760030269623, "rewards/rejected": -1.352809190750122, "semantic_entropy": 0.8261575698852539, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 6.514798662426732, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.20665176212787628, "logits/rejected": -0.12365027517080307, "logps/chosen": -1.1901861429214478, "logps/rejected": -1.2659374475479126, "loss": 1.6062, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1901861429214478, "rewards/margins": 0.07575125247240067, "rewards/rejected": -1.2659374475479126, "semantic_entropy": 0.8320629000663757, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 6.0809800458083965, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.2702236473560333, "logits/rejected": -0.17110756039619446, "logps/chosen": -1.1608952283859253, "logps/rejected": -1.3650672435760498, "loss": 1.5654, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1608952283859253, "rewards/margins": 0.20417198538780212, "rewards/rejected": -1.3650672435760498, "semantic_entropy": 0.8090604543685913, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 5.6068915168977425, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.2124275267124176, "logits/rejected": -0.05867360159754753, "logps/chosen": -1.183924913406372, "logps/rejected": -1.348034143447876, "loss": 1.5961, "rewards/accuracies": 0.53125, "rewards/chosen": -1.183924913406372, "rewards/margins": 0.16410934925079346, "rewards/rejected": -1.348034143447876, "semantic_entropy": 0.8242883682250977, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 4.06261049502757, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.18944670259952545, "logits/rejected": -0.07268358767032623, "logps/chosen": -1.1699378490447998, "logps/rejected": -1.2907711267471313, "loss": 1.5814, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1699378490447998, "rewards/margins": 0.12083326280117035, "rewards/rejected": -1.2907711267471313, "semantic_entropy": 0.8229478001594543, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 5.848032547140342, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.2091577798128128, "logits/rejected": -0.1202559694647789, "logps/chosen": -1.247551441192627, "logps/rejected": -1.2734215259552002, "loss": 1.6522, "rewards/accuracies": 0.53125, "rewards/chosen": -1.247551441192627, "rewards/margins": 0.02587016485631466, "rewards/rejected": -1.2734215259552002, "semantic_entropy": 0.8092119097709656, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 8.076897376439485, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.22406017780303955, "logits/rejected": -0.10291236639022827, "logps/chosen": -1.1667025089263916, "logps/rejected": -1.3511077165603638, "loss": 1.5739, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1667025089263916, "rewards/margins": 0.18440505862236023, "rewards/rejected": -1.3511077165603638, "semantic_entropy": 0.8143148422241211, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 4.14851246318551, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.20327946543693542, "logits/rejected": -0.12431325018405914, "logps/chosen": -1.2662365436553955, "logps/rejected": -1.3120231628417969, "loss": 1.674, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.2662365436553955, "rewards/margins": 0.045786596834659576, "rewards/rejected": -1.3120231628417969, "semantic_entropy": 0.8154833912849426, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 4.902783093730407, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.1769380271434784, "logits/rejected": -0.1343008577823639, "logps/chosen": -1.2226684093475342, "logps/rejected": -1.3501255512237549, "loss": 1.6109, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2226684093475342, "rewards/margins": 0.12745702266693115, "rewards/rejected": -1.3501255512237549, "semantic_entropy": 0.7764714360237122, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 5.877930112008186, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.19880566000938416, "logits/rejected": -0.06474629044532776, "logps/chosen": -1.0965871810913086, "logps/rejected": -1.3856254816055298, "loss": 1.5137, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0965871810913086, "rewards/margins": 0.2890383303165436, "rewards/rejected": -1.3856254816055298, "semantic_entropy": 0.8342844843864441, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 6.568306573021862, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.25061970949172974, "logits/rejected": -0.18322598934173584, "logps/chosen": -1.297231674194336, "logps/rejected": -1.2720223665237427, "loss": 1.7038, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.297231674194336, "rewards/margins": -0.02520928345620632, "rewards/rejected": -1.2720223665237427, "semantic_entropy": 0.8132265210151672, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 4.919986703480767, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.16978032886981964, "logits/rejected": -0.12163975089788437, "logps/chosen": -1.1873234510421753, "logps/rejected": -1.3641231060028076, "loss": 1.5981, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1873234510421753, "rewards/margins": 0.17679956555366516, "rewards/rejected": -1.3641231060028076, "semantic_entropy": 0.8215670585632324, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 5.353839862829944, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.22045393288135529, "logits/rejected": -0.15511895716190338, "logps/chosen": -1.2339824438095093, "logps/rejected": -1.3752912282943726, "loss": 1.6348, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2339824438095093, "rewards/margins": 0.14130881428718567, "rewards/rejected": -1.3752912282943726, "semantic_entropy": 0.8016121983528137, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 6.466243653013747, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.2883046865463257, "logits/rejected": -0.1176748052239418, "logps/chosen": -1.2859512567520142, "logps/rejected": -1.3422971963882446, "loss": 1.6827, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2859512567520142, "rewards/margins": 0.056345999240875244, "rewards/rejected": -1.3422971963882446, "semantic_entropy": 0.793555498123169, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 5.426638429429009, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.15013377368450165, "logits/rejected": -0.16058427095413208, "logps/chosen": -1.1430703401565552, "logps/rejected": -1.3370366096496582, "loss": 1.5618, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1430703401565552, "rewards/margins": 0.19396626949310303, "rewards/rejected": -1.3370366096496582, "semantic_entropy": 0.8374625444412231, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 8.01562259479993, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.17795519530773163, "logits/rejected": -0.06669540703296661, "logps/chosen": -1.0956206321716309, "logps/rejected": -1.4295997619628906, "loss": 1.5054, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.0956206321716309, "rewards/margins": 0.333979070186615, "rewards/rejected": -1.4295997619628906, "semantic_entropy": 0.8195252418518066, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 5.114993636992316, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.3046717941761017, "logits/rejected": -0.16285748779773712, "logps/chosen": -1.2190165519714355, "logps/rejected": -1.3673566579818726, "loss": 1.6188, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2190165519714355, "rewards/margins": 0.14834021031856537, "rewards/rejected": -1.3673566579818726, "semantic_entropy": 0.7996227145195007, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 5.068384617905955, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.21474358439445496, "logits/rejected": -0.08300717175006866, "logps/chosen": -1.2684152126312256, "logps/rejected": -1.346720814704895, "loss": 1.6785, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2684152126312256, "rewards/margins": 0.07830562442541122, "rewards/rejected": -1.346720814704895, "semantic_entropy": 0.8202077150344849, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 7.897552172086404, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.2530320882797241, "logits/rejected": -0.13249406218528748, "logps/chosen": -1.172287106513977, "logps/rejected": -1.4291865825653076, "loss": 1.5864, "rewards/accuracies": 0.59375, "rewards/chosen": -1.172287106513977, "rewards/margins": 0.25689950585365295, "rewards/rejected": -1.4291865825653076, "semantic_entropy": 0.8282469511032104, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 5.6228696227825266, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.22183719277381897, "logits/rejected": -0.12009850889444351, "logps/chosen": -1.1790809631347656, "logps/rejected": -1.297424077987671, "loss": 1.5908, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1790809631347656, "rewards/margins": 0.11834307760000229, "rewards/rejected": -1.297424077987671, "semantic_entropy": 0.8233474493026733, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 4.672280805861314, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.16721583902835846, "logits/rejected": -0.04607416316866875, "logps/chosen": -1.1949318647384644, "logps/rejected": -1.4283303022384644, "loss": 1.5989, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1949318647384644, "rewards/margins": 0.23339852690696716, "rewards/rejected": -1.4283303022384644, "semantic_entropy": 0.8078802824020386, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 4.87982211057563, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.2753719687461853, "logits/rejected": -0.15052953362464905, "logps/chosen": -1.1553093194961548, "logps/rejected": -1.3761399984359741, "loss": 1.5681, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1553093194961548, "rewards/margins": 0.2208307683467865, "rewards/rejected": -1.3761399984359741, "semantic_entropy": 0.82562655210495, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.06269948929548264, "eval_logits/rejected": 0.1309981346130371, "eval_logps/chosen": -1.2674616575241089, "eval_logps/rejected": -1.4043465852737427, "eval_loss": 1.6701992750167847, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2674616575241089, "eval_rewards/margins": 0.1368848830461502, "eval_rewards/rejected": -1.4043465852737427, "eval_runtime": 34.5801, "eval_samples_per_second": 38.895, "eval_semantic_entropy": 0.8052099347114563, "eval_steps_per_second": 9.745, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 5.430154815398098, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.282816618680954, "logits/rejected": -0.2480306625366211, "logps/chosen": -1.1731822490692139, "logps/rejected": -1.2733691930770874, "loss": 1.5947, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1731822490692139, "rewards/margins": 0.10018690675497055, "rewards/rejected": -1.2733691930770874, "semantic_entropy": 0.84296053647995, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 5.50862577039049, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.21883659064769745, "logits/rejected": -0.19455638527870178, "logps/chosen": -1.263965368270874, "logps/rejected": -1.4649038314819336, "loss": 1.668, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.263965368270874, "rewards/margins": 0.20093849301338196, "rewards/rejected": -1.4649038314819336, "semantic_entropy": 0.8080825805664062, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 5.406211921985467, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.18060298264026642, "logits/rejected": -0.05848909541964531, "logps/chosen": -1.2022631168365479, "logps/rejected": -1.3373029232025146, "loss": 1.615, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2022631168365479, "rewards/margins": 0.1350397765636444, "rewards/rejected": -1.3373029232025146, "semantic_entropy": 0.8255596160888672, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 4.80380101427645, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.2230709344148636, "logits/rejected": -0.16655409336090088, "logps/chosen": -1.2387495040893555, "logps/rejected": -1.2815996408462524, "loss": 1.6454, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2387495040893555, "rewards/margins": 0.0428500734269619, "rewards/rejected": -1.2815996408462524, "semantic_entropy": 0.8132556080818176, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 4.6160077360695775, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.21796850860118866, "logits/rejected": -0.12291719764471054, "logps/chosen": -1.160986065864563, "logps/rejected": -1.3658441305160522, "loss": 1.571, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.160986065864563, "rewards/margins": 0.20485815405845642, "rewards/rejected": -1.3658441305160522, "semantic_entropy": 0.8199388384819031, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 4.12751717753831, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.20688951015472412, "logits/rejected": -0.0779847502708435, "logps/chosen": -1.2488815784454346, "logps/rejected": -1.3742520809173584, "loss": 1.6467, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2488815784454346, "rewards/margins": 0.1253705620765686, "rewards/rejected": -1.3742520809173584, "semantic_entropy": 0.795565128326416, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 5.00403236456907, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.2896130084991455, "logits/rejected": -0.11116087436676025, "logps/chosen": -1.1329156160354614, "logps/rejected": -1.32205069065094, "loss": 1.5513, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1329156160354614, "rewards/margins": 0.1891350895166397, "rewards/rejected": -1.32205069065094, "semantic_entropy": 0.8368131518363953, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 5.833756665035829, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.2937382161617279, "logits/rejected": -0.13492749631404877, "logps/chosen": -1.238324761390686, "logps/rejected": -1.3073033094406128, "loss": 1.6446, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.238324761390686, "rewards/margins": 0.06897857040166855, "rewards/rejected": -1.3073033094406128, "semantic_entropy": 0.8124908208847046, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 5.0622863609835935, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.2762572765350342, "logits/rejected": -0.14250406622886658, "logps/chosen": -1.1859509944915771, "logps/rejected": -1.3219935894012451, "loss": 1.5933, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1859509944915771, "rewards/margins": 0.1360425055027008, "rewards/rejected": -1.3219935894012451, "semantic_entropy": 0.8147033452987671, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 6.7020879115831855, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.20272520184516907, "logits/rejected": -0.10933796316385269, "logps/chosen": -1.227804183959961, "logps/rejected": -1.3389074802398682, "loss": 1.6268, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.227804183959961, "rewards/margins": 0.11110343784093857, "rewards/rejected": -1.3389074802398682, "semantic_entropy": 0.7979435920715332, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 5.770595651998295, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.2581040859222412, "logits/rejected": -0.18633589148521423, "logps/chosen": -1.2250630855560303, "logps/rejected": -1.335514783859253, "loss": 1.6331, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2250630855560303, "rewards/margins": 0.1104515939950943, "rewards/rejected": -1.335514783859253, "semantic_entropy": 0.815984845161438, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 6.9486960169652585, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.2653730809688568, "logits/rejected": -0.07692326605319977, "logps/chosen": -1.1495521068572998, "logps/rejected": -1.3741111755371094, "loss": 1.5662, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1495521068572998, "rewards/margins": 0.2245592325925827, "rewards/rejected": -1.3741111755371094, "semantic_entropy": 0.8333722949028015, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 5.54478961019857, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.27518782019615173, "logits/rejected": -0.13236680626869202, "logps/chosen": -1.2456610202789307, "logps/rejected": -1.3502075672149658, "loss": 1.6594, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2456610202789307, "rewards/margins": 0.10454683005809784, "rewards/rejected": -1.3502075672149658, "semantic_entropy": 0.8275245428085327, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 7.336959582719817, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.14237284660339355, "logits/rejected": -0.08916251361370087, "logps/chosen": -1.1069893836975098, "logps/rejected": -1.2441356182098389, "loss": 1.5298, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1069893836975098, "rewards/margins": 0.13714632391929626, "rewards/rejected": -1.2441356182098389, "semantic_entropy": 0.8455877304077148, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 4.865108730288012, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.2485821694135666, "logits/rejected": -0.07005739212036133, "logps/chosen": -1.1445437669754028, "logps/rejected": -1.3136383295059204, "loss": 1.556, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1445437669754028, "rewards/margins": 0.16909447312355042, "rewards/rejected": -1.3136383295059204, "semantic_entropy": 0.8228176236152649, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 5.130994042360169, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.1911982148885727, "logits/rejected": -0.1425384134054184, "logps/chosen": -1.1005656719207764, "logps/rejected": -1.2331244945526123, "loss": 1.5311, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1005656719207764, "rewards/margins": 0.13255885243415833, "rewards/rejected": -1.2331244945526123, "semantic_entropy": 0.8609903454780579, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 4.841922698351759, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.22099187970161438, "logits/rejected": -0.130427747964859, "logps/chosen": -1.220674753189087, "logps/rejected": -1.3798946142196655, "loss": 1.6299, "rewards/accuracies": 0.625, "rewards/chosen": -1.220674753189087, "rewards/margins": 0.15921977162361145, "rewards/rejected": -1.3798946142196655, "semantic_entropy": 0.8183862566947937, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 5.416349130161093, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.2573382556438446, "logits/rejected": -0.07683748006820679, "logps/chosen": -1.1778900623321533, "logps/rejected": -1.3842337131500244, "loss": 1.5826, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1778900623321533, "rewards/margins": 0.20634369552135468, "rewards/rejected": -1.3842337131500244, "semantic_entropy": 0.8093346357345581, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 6.772656553945887, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.29118630290031433, "logits/rejected": -0.11850555986166, "logps/chosen": -1.1644620895385742, "logps/rejected": -1.3203295469284058, "loss": 1.5833, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1644620895385742, "rewards/margins": 0.1558675318956375, "rewards/rejected": -1.3203295469284058, "semantic_entropy": 0.8375909924507141, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 6.654398572902958, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.1077810749411583, "logits/rejected": -0.12036697566509247, "logps/chosen": -1.1974709033966064, "logps/rejected": -1.332602858543396, "loss": 1.6025, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1974709033966064, "rewards/margins": 0.1351320445537567, "rewards/rejected": -1.332602858543396, "semantic_entropy": 0.8099727630615234, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 5.860946576808548, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.26108574867248535, "logits/rejected": -0.14527593553066254, "logps/chosen": -1.181634545326233, "logps/rejected": -1.432959794998169, "loss": 1.5899, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.181634545326233, "rewards/margins": 0.2513253092765808, "rewards/rejected": -1.432959794998169, "semantic_entropy": 0.8164923787117004, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 7.149810640765493, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.2841007113456726, "logits/rejected": -0.17947736382484436, "logps/chosen": -1.1229379177093506, "logps/rejected": -1.2886290550231934, "loss": 1.5511, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1229379177093506, "rewards/margins": 0.16569112241268158, "rewards/rejected": -1.2886290550231934, "semantic_entropy": 0.856277585029602, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 4.682308244046834, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.24783697724342346, "logits/rejected": -0.1969202756881714, "logps/chosen": -1.205506443977356, "logps/rejected": -1.3832862377166748, "loss": 1.6068, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.205506443977356, "rewards/margins": 0.177779883146286, "rewards/rejected": -1.3832862377166748, "semantic_entropy": 0.8026573061943054, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 5.778244693215126, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.22860360145568848, "logits/rejected": -0.16646866500377655, "logps/chosen": -1.161747694015503, "logps/rejected": -1.4049632549285889, "loss": 1.5726, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.161747694015503, "rewards/margins": 0.24321556091308594, "rewards/rejected": -1.4049632549285889, "semantic_entropy": 0.8216179013252258, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 5.476623159983717, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.31037330627441406, "logits/rejected": -0.1653234362602234, "logps/chosen": -1.2942192554473877, "logps/rejected": -1.456968903541565, "loss": 1.6886, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2942192554473877, "rewards/margins": 0.16274967789649963, "rewards/rejected": -1.456968903541565, "semantic_entropy": 0.7888429760932922, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 4.706891726387634, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.258495569229126, "logits/rejected": -0.1904282420873642, "logps/chosen": -1.2231413125991821, "logps/rejected": -1.3420697450637817, "loss": 1.629, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2231413125991821, "rewards/margins": 0.11892850697040558, "rewards/rejected": -1.3420697450637817, "semantic_entropy": 0.811732292175293, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 4.546160369782997, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.11585620790719986, "logits/rejected": -0.03626435995101929, "logps/chosen": -1.1511014699935913, "logps/rejected": -1.2984007596969604, "loss": 1.5535, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1511014699935913, "rewards/margins": 0.1472991406917572, "rewards/rejected": -1.2984007596969604, "semantic_entropy": 0.8047046661376953, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 5.6578280544302695, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.12364192306995392, "logits/rejected": -0.0796326994895935, "logps/chosen": -1.1298457384109497, "logps/rejected": -1.2759674787521362, "loss": 1.5476, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1298457384109497, "rewards/margins": 0.14612190425395966, "rewards/rejected": -1.2759674787521362, "semantic_entropy": 0.8355566263198853, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 6.108695806380485, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.1997412145137787, "logits/rejected": -0.08534231036901474, "logps/chosen": -1.1362594366073608, "logps/rejected": -1.2839211225509644, "loss": 1.5547, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1362594366073608, "rewards/margins": 0.14766177535057068, "rewards/rejected": -1.2839211225509644, "semantic_entropy": 0.8368202447891235, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 6.6828480599136055, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.24874600768089294, "logits/rejected": -0.16255156695842743, "logps/chosen": -1.121525764465332, "logps/rejected": -1.3468575477600098, "loss": 1.5379, "rewards/accuracies": 0.59375, "rewards/chosen": -1.121525764465332, "rewards/margins": 0.22533181309700012, "rewards/rejected": -1.3468575477600098, "semantic_entropy": 0.8326648473739624, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 6.642757481431296, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.20943331718444824, "logits/rejected": -0.1547355204820633, "logps/chosen": -1.257939100265503, "logps/rejected": -1.3508199453353882, "loss": 1.6621, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.257939100265503, "rewards/margins": 0.09288088977336884, "rewards/rejected": -1.3508199453353882, "semantic_entropy": 0.80836021900177, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 4.98130216089787, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.23428842425346375, "logits/rejected": -0.09207314252853394, "logps/chosen": -1.150626301765442, "logps/rejected": -1.2726784944534302, "loss": 1.5699, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.150626301765442, "rewards/margins": 0.12205219268798828, "rewards/rejected": -1.2726784944534302, "semantic_entropy": 0.8385592699050903, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 5.1349221954934325, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.19507509469985962, "logits/rejected": -0.14641042053699493, "logps/chosen": -1.2396150827407837, "logps/rejected": -1.3563262224197388, "loss": 1.6418, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2396150827407837, "rewards/margins": 0.11671112477779388, "rewards/rejected": -1.3563262224197388, "semantic_entropy": 0.8043719530105591, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 5.128104064606291, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.1547878533601761, "logits/rejected": -0.028545428067445755, "logps/chosen": -1.1958935260772705, "logps/rejected": -1.3952209949493408, "loss": 1.6038, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1958935260772705, "rewards/margins": 0.1993274986743927, "rewards/rejected": -1.3952209949493408, "semantic_entropy": 0.815838634967804, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 4.480583906922373, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.12469349056482315, "logits/rejected": -0.09070415794849396, "logps/chosen": -1.1831550598144531, "logps/rejected": -1.3135663270950317, "loss": 1.5896, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1831550598144531, "rewards/margins": 0.1304110586643219, "rewards/rejected": -1.3135663270950317, "semantic_entropy": 0.8129767179489136, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 6.646314101775974, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.22599034011363983, "logits/rejected": -0.1874760091304779, "logps/chosen": -1.1131327152252197, "logps/rejected": -1.356627345085144, "loss": 1.5283, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1131327152252197, "rewards/margins": 0.2434946596622467, "rewards/rejected": -1.356627345085144, "semantic_entropy": 0.8304254412651062, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 4.676795597092353, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.18190020322799683, "logits/rejected": -0.07394681870937347, "logps/chosen": -1.2582340240478516, "logps/rejected": -1.335383653640747, "loss": 1.661, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2582340240478516, "rewards/margins": 0.07714969664812088, "rewards/rejected": -1.335383653640747, "semantic_entropy": 0.8054393529891968, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 5.254365326323938, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.29261714220046997, "logits/rejected": -0.14058782160282135, "logps/chosen": -1.137317180633545, "logps/rejected": -1.3422868251800537, "loss": 1.5551, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.137317180633545, "rewards/margins": 0.2049696445465088, "rewards/rejected": -1.3422868251800537, "semantic_entropy": 0.835522472858429, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 6.058185437612524, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.2721913754940033, "logits/rejected": -0.1735362708568573, "logps/chosen": -1.2033871412277222, "logps/rejected": -1.384864091873169, "loss": 1.6036, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.2033871412277222, "rewards/margins": 0.18147686123847961, "rewards/rejected": -1.384864091873169, "semantic_entropy": 0.8004916310310364, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 4.810609846265227, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.2398107498884201, "logits/rejected": -0.024045180529356003, "logps/chosen": -1.2440158128738403, "logps/rejected": -1.4549797773361206, "loss": 1.6404, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2440158128738403, "rewards/margins": 0.21096408367156982, "rewards/rejected": -1.4549797773361206, "semantic_entropy": 0.7928181886672974, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 6.341902887941187, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.185301274061203, "logits/rejected": -0.08672976493835449, "logps/chosen": -1.2151572704315186, "logps/rejected": -1.3925215005874634, "loss": 1.6243, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2151572704315186, "rewards/margins": 0.17736423015594482, "rewards/rejected": -1.3925215005874634, "semantic_entropy": 0.8182731866836548, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 5.896706935383898, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.22949771583080292, "logits/rejected": -0.09966824948787689, "logps/chosen": -1.1999315023422241, "logps/rejected": -1.3874129056930542, "loss": 1.6015, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1999315023422241, "rewards/margins": 0.1874813735485077, "rewards/rejected": -1.3874129056930542, "semantic_entropy": 0.8030966520309448, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 4.465385559971465, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.15486857295036316, "logits/rejected": -0.04246527701616287, "logps/chosen": -1.2221777439117432, "logps/rejected": -1.414550542831421, "loss": 1.6316, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2221777439117432, "rewards/margins": 0.1923731118440628, "rewards/rejected": -1.414550542831421, "semantic_entropy": 0.818783164024353, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 8.630764777969766, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.18179291486740112, "logits/rejected": -0.025497624650597572, "logps/chosen": -1.1169579029083252, "logps/rejected": -1.3336776494979858, "loss": 1.5385, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1169579029083252, "rewards/margins": 0.21671970188617706, "rewards/rejected": -1.3336776494979858, "semantic_entropy": 0.843165397644043, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 4.751841915493891, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.3168572783470154, "logits/rejected": -0.27677637338638306, "logps/chosen": -1.243691086769104, "logps/rejected": -1.3853175640106201, "loss": 1.6468, "rewards/accuracies": 0.53125, "rewards/chosen": -1.243691086769104, "rewards/margins": 0.14162659645080566, "rewards/rejected": -1.3853175640106201, "semantic_entropy": 0.8062641024589539, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 6.061326437559875, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.19238585233688354, "logits/rejected": -0.12667891383171082, "logps/chosen": -1.1414568424224854, "logps/rejected": -1.3716002702713013, "loss": 1.5538, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1414568424224854, "rewards/margins": 0.23014338314533234, "rewards/rejected": -1.3716002702713013, "semantic_entropy": 0.8246892094612122, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 5.777089629876477, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.11585505306720734, "logits/rejected": -0.0006043463945388794, "logps/chosen": -1.1516425609588623, "logps/rejected": -1.3429611921310425, "loss": 1.5597, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1516425609588623, "rewards/margins": 0.19131866097450256, "rewards/rejected": -1.3429611921310425, "semantic_entropy": 0.8160654306411743, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 5.7512235664488, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.16687771677970886, "logits/rejected": -0.08392693847417831, "logps/chosen": -1.187285304069519, "logps/rejected": -1.3756736516952515, "loss": 1.5869, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.187285304069519, "rewards/margins": 0.1883884072303772, "rewards/rejected": -1.3756736516952515, "semantic_entropy": 0.7992327213287354, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 6.287316839453035, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.18698559701442719, "logits/rejected": -0.07352995127439499, "logps/chosen": -1.168413758277893, "logps/rejected": -1.3713876008987427, "loss": 1.5764, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.168413758277893, "rewards/margins": 0.202973872423172, "rewards/rejected": -1.3713876008987427, "semantic_entropy": 0.8159362077713013, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 5.9368984255186215, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.23626060783863068, "logits/rejected": -0.03977738693356514, "logps/chosen": -1.2048670053482056, "logps/rejected": -1.434964895248413, "loss": 1.6068, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2048670053482056, "rewards/margins": 0.230097696185112, "rewards/rejected": -1.434964895248413, "semantic_entropy": 0.8038923144340515, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 5.853632218118439, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.24392414093017578, "logits/rejected": -0.12454120069742203, "logps/chosen": -1.1739752292633057, "logps/rejected": -1.2904012203216553, "loss": 1.5869, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1739752292633057, "rewards/margins": 0.11642596870660782, "rewards/rejected": -1.2904012203216553, "semantic_entropy": 0.8257750272750854, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 5.298231378039098, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.23954136669635773, "logits/rejected": -0.14991997182369232, "logps/chosen": -1.2384611368179321, "logps/rejected": -1.340154767036438, "loss": 1.6283, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2384611368179321, "rewards/margins": 0.10169367492198944, "rewards/rejected": -1.340154767036438, "semantic_entropy": 0.7796241044998169, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 5.790032211661871, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.20989234745502472, "logits/rejected": -0.1543906033039093, "logps/chosen": -1.2659891843795776, "logps/rejected": -1.4262335300445557, "loss": 1.6609, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2659891843795776, "rewards/margins": 0.16024437546730042, "rewards/rejected": -1.4262335300445557, "semantic_entropy": 0.7898926734924316, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 5.816384633900608, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.18056149780750275, "logits/rejected": -0.07796251773834229, "logps/chosen": -1.2366188764572144, "logps/rejected": -1.3517028093338013, "loss": 1.644, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2366188764572144, "rewards/margins": 0.1150839775800705, "rewards/rejected": -1.3517028093338013, "semantic_entropy": 0.8146642446517944, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 5.073956637530935, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.17053021490573883, "logits/rejected": -0.06061788275837898, "logps/chosen": -1.1419013738632202, "logps/rejected": -1.3210012912750244, "loss": 1.558, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1419013738632202, "rewards/margins": 0.1790998876094818, "rewards/rejected": -1.3210012912750244, "semantic_entropy": 0.8321198225021362, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 6.592151758433538, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.16448600590229034, "logits/rejected": -0.0765252485871315, "logps/chosen": -1.1948121786117554, "logps/rejected": -1.3244434595108032, "loss": 1.6127, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1948121786117554, "rewards/margins": 0.12963134050369263, "rewards/rejected": -1.3244434595108032, "semantic_entropy": 0.8357879519462585, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 4.030495214138565, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.2652263641357422, "logits/rejected": -0.11818347126245499, "logps/chosen": -1.2102594375610352, "logps/rejected": -1.3340270519256592, "loss": 1.6163, "rewards/accuracies": 0.5, "rewards/chosen": -1.2102594375610352, "rewards/margins": 0.12376763671636581, "rewards/rejected": -1.3340270519256592, "semantic_entropy": 0.8121445775032043, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 6.769194803642693, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.18318554759025574, "logits/rejected": -0.15048848092556, "logps/chosen": -1.2298216819763184, "logps/rejected": -1.3463466167449951, "loss": 1.634, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2298216819763184, "rewards/margins": 0.11652486026287079, "rewards/rejected": -1.3463466167449951, "semantic_entropy": 0.8082779049873352, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 5.364011130288799, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.17940878868103027, "logits/rejected": -0.025525590404868126, "logps/chosen": -1.2114226818084717, "logps/rejected": -1.336875081062317, "loss": 1.6235, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2114226818084717, "rewards/margins": 0.1254524290561676, "rewards/rejected": -1.336875081062317, "semantic_entropy": 0.8240588903427124, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 5.961989280040665, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.2501986622810364, "logits/rejected": -0.16463592648506165, "logps/chosen": -1.1703846454620361, "logps/rejected": -1.2990435361862183, "loss": 1.5933, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1703846454620361, "rewards/margins": 0.12865887582302094, "rewards/rejected": -1.2990435361862183, "semantic_entropy": 0.8457660675048828, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 8.495008831357168, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.1314721256494522, "logits/rejected": -0.11683490127325058, "logps/chosen": -1.2429084777832031, "logps/rejected": -1.3592987060546875, "loss": 1.6318, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2429084777832031, "rewards/margins": 0.11639020591974258, "rewards/rejected": -1.3592987060546875, "semantic_entropy": 0.7777156233787537, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 7.070831634397481, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.2140822857618332, "logits/rejected": -0.10100038349628448, "logps/chosen": -1.217376470565796, "logps/rejected": -1.36806321144104, "loss": 1.6216, "rewards/accuracies": 0.5625, "rewards/chosen": -1.217376470565796, "rewards/margins": 0.1506866067647934, "rewards/rejected": -1.36806321144104, "semantic_entropy": 0.8084825277328491, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 5.691130699992712, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.22983917593955994, "logits/rejected": -0.21817061305046082, "logps/chosen": -1.283363699913025, "logps/rejected": -1.4209480285644531, "loss": 1.6727, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.283363699913025, "rewards/margins": 0.1375843733549118, "rewards/rejected": -1.4209480285644531, "semantic_entropy": 0.7787033915519714, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 6.707115203721298, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.1817096471786499, "logits/rejected": -0.11873488128185272, "logps/chosen": -1.1363061666488647, "logps/rejected": -1.3477298021316528, "loss": 1.5544, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1363061666488647, "rewards/margins": 0.21142356097698212, "rewards/rejected": -1.3477298021316528, "semantic_entropy": 0.8362777829170227, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 5.783628171511241, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.24997857213020325, "logits/rejected": -0.20143404603004456, "logps/chosen": -1.2113721370697021, "logps/rejected": -1.3221945762634277, "loss": 1.6186, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2113721370697021, "rewards/margins": 0.11082251369953156, "rewards/rejected": -1.3221945762634277, "semantic_entropy": 0.814383864402771, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 6.58895943063552, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.1938858926296234, "logits/rejected": -0.10287405550479889, "logps/chosen": -1.115453839302063, "logps/rejected": -1.2714155912399292, "loss": 1.5259, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.115453839302063, "rewards/margins": 0.15596170723438263, "rewards/rejected": -1.2714155912399292, "semantic_entropy": 0.820944607257843, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 4.5475836140301285, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.19098015129566193, "logits/rejected": -0.24854807555675507, "logps/chosen": -1.2456705570220947, "logps/rejected": -1.3776724338531494, "loss": 1.6499, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2456705570220947, "rewards/margins": 0.1320018470287323, "rewards/rejected": -1.3776724338531494, "semantic_entropy": 0.8083675503730774, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 4.918093531290784, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.26043814420700073, "logits/rejected": -0.14872071146965027, "logps/chosen": -1.2334837913513184, "logps/rejected": -1.432740330696106, "loss": 1.6381, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2334837913513184, "rewards/margins": 0.1992565095424652, "rewards/rejected": -1.432740330696106, "semantic_entropy": 0.8091747164726257, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 6.659629301049917, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.18613910675048828, "logits/rejected": -0.16516457498073578, "logps/chosen": -1.2351375818252563, "logps/rejected": -1.4086172580718994, "loss": 1.634, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2351375818252563, "rewards/margins": 0.17347970604896545, "rewards/rejected": -1.4086172580718994, "semantic_entropy": 0.7977755665779114, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 4.041410203676996, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.1450665444135666, "logits/rejected": -0.0005016446230001748, "logps/chosen": -1.1782689094543457, "logps/rejected": -1.5038906335830688, "loss": 1.596, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1782689094543457, "rewards/margins": 0.32562169432640076, "rewards/rejected": -1.5038906335830688, "semantic_entropy": 0.8354153633117676, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 4.835159914007807, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.21535837650299072, "logits/rejected": -0.08020300418138504, "logps/chosen": -1.1191022396087646, "logps/rejected": -1.3830084800720215, "loss": 1.5358, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1191022396087646, "rewards/margins": 0.26390641927719116, "rewards/rejected": -1.3830084800720215, "semantic_entropy": 0.8333007097244263, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 4.521458584423979, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.2677205801010132, "logits/rejected": -0.15883246064186096, "logps/chosen": -1.125661849975586, "logps/rejected": -1.2857940196990967, "loss": 1.5449, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.125661849975586, "rewards/margins": 0.16013234853744507, "rewards/rejected": -1.2857940196990967, "semantic_entropy": 0.8385255932807922, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 6.448156693636901, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.24683931469917297, "logits/rejected": -0.11016325652599335, "logps/chosen": -1.2065963745117188, "logps/rejected": -1.2861545085906982, "loss": 1.6143, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2065963745117188, "rewards/margins": 0.07955806702375412, "rewards/rejected": -1.2861545085906982, "semantic_entropy": 0.8155028223991394, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 5.984391481949238, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.1949569284915924, "logits/rejected": -0.12112084776163101, "logps/chosen": -1.2212117910385132, "logps/rejected": -1.3881285190582275, "loss": 1.6343, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2212117910385132, "rewards/margins": 0.1669166088104248, "rewards/rejected": -1.3881285190582275, "semantic_entropy": 0.8262287378311157, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 3.86434953697175, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.11144611984491348, "logits/rejected": -0.021073434501886368, "logps/chosen": -1.1447874307632446, "logps/rejected": -1.3194307088851929, "loss": 1.567, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1447874307632446, "rewards/margins": 0.17464321851730347, "rewards/rejected": -1.3194307088851929, "semantic_entropy": 0.8445202112197876, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 4.18016435898079, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.18034487962722778, "logits/rejected": -0.11971686780452728, "logps/chosen": -1.1987321376800537, "logps/rejected": -1.3465774059295654, "loss": 1.6066, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1987321376800537, "rewards/margins": 0.14784519374370575, "rewards/rejected": -1.3465774059295654, "semantic_entropy": 0.8156407475471497, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 6.802206548719309, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.17430144548416138, "logits/rejected": -0.1352550983428955, "logps/chosen": -1.2139757871627808, "logps/rejected": -1.3222541809082031, "loss": 1.6205, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2139757871627808, "rewards/margins": 0.10827841609716415, "rewards/rejected": -1.3222541809082031, "semantic_entropy": 0.8130617141723633, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 4.924647418811335, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.27534228563308716, "logits/rejected": -0.16728226840496063, "logps/chosen": -1.227097749710083, "logps/rejected": -1.3663431406021118, "loss": 1.6355, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.227097749710083, "rewards/margins": 0.13924536108970642, "rewards/rejected": -1.3663431406021118, "semantic_entropy": 0.8168867230415344, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 7.304863329405773, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.16023625433444977, "logits/rejected": -0.11408082395792007, "logps/chosen": -1.2263009548187256, "logps/rejected": -1.3645062446594238, "loss": 1.6303, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2263009548187256, "rewards/margins": 0.13820533454418182, "rewards/rejected": -1.3645062446594238, "semantic_entropy": 0.8080469965934753, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 5.784574693198778, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.19207091629505157, "logits/rejected": -0.11240673065185547, "logps/chosen": -1.14096999168396, "logps/rejected": -1.3353478908538818, "loss": 1.5524, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.14096999168396, "rewards/margins": 0.19437818229198456, "rewards/rejected": -1.3353478908538818, "semantic_entropy": 0.8229475021362305, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.018750905990600586, "eval_logits/rejected": 0.08335219323635101, "eval_logps/chosen": -1.2672524452209473, "eval_logps/rejected": -1.4042669534683228, "eval_loss": 1.6700292825698853, "eval_rewards/accuracies": 0.5482195615768433, "eval_rewards/chosen": -1.2672524452209473, "eval_rewards/margins": 0.1370147168636322, "eval_rewards/rejected": -1.4042669534683228, "eval_runtime": 34.5255, "eval_samples_per_second": 38.957, "eval_semantic_entropy": 0.8052894473075867, "eval_steps_per_second": 9.761, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 7.838309824598802, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.26311224699020386, "logits/rejected": -0.1702595204114914, "logps/chosen": -1.2542366981506348, "logps/rejected": -1.4652049541473389, "loss": 1.6534, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2542366981506348, "rewards/margins": 0.21096821129322052, "rewards/rejected": -1.4652049541473389, "semantic_entropy": 0.7982694506645203, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 5.212984136276359, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.1574612408876419, "logits/rejected": -0.048284441232681274, "logps/chosen": -1.2648286819458008, "logps/rejected": -1.4075676202774048, "loss": 1.661, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2648286819458008, "rewards/margins": 0.14273887872695923, "rewards/rejected": -1.4075676202774048, "semantic_entropy": 0.7923511862754822, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 6.216676509332401, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.161361962556839, "logits/rejected": -0.07018058001995087, "logps/chosen": -1.0996425151824951, "logps/rejected": -1.423787236213684, "loss": 1.5169, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0996425151824951, "rewards/margins": 0.3241446912288666, "rewards/rejected": -1.423787236213684, "semantic_entropy": 0.8345838785171509, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 6.03997422483082, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.1661689728498459, "logits/rejected": -0.0944608598947525, "logps/chosen": -1.2619267702102661, "logps/rejected": -1.3789420127868652, "loss": 1.6636, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2619267702102661, "rewards/margins": 0.11701524257659912, "rewards/rejected": -1.3789420127868652, "semantic_entropy": 0.8034173846244812, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 5.566185493375155, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.16259567439556122, "logits/rejected": -0.08361367881298065, "logps/chosen": -1.1621887683868408, "logps/rejected": -1.2838950157165527, "loss": 1.5749, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1621887683868408, "rewards/margins": 0.1217062696814537, "rewards/rejected": -1.2838950157165527, "semantic_entropy": 0.8253543972969055, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 3.7207813561036005, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.20747077465057373, "logits/rejected": -0.08944211900234222, "logps/chosen": -1.2531126737594604, "logps/rejected": -1.3348289728164673, "loss": 1.6601, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2531126737594604, "rewards/margins": 0.08171630650758743, "rewards/rejected": -1.3348289728164673, "semantic_entropy": 0.8140729069709778, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 5.684695186815946, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.3198186457157135, "logits/rejected": -0.178703173995018, "logps/chosen": -1.1813348531723022, "logps/rejected": -1.2973575592041016, "loss": 1.6008, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1813348531723022, "rewards/margins": 0.11602268368005753, "rewards/rejected": -1.2973575592041016, "semantic_entropy": 0.8389140963554382, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 7.301879892619553, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.2445097267627716, "logits/rejected": -0.10317005217075348, "logps/chosen": -1.1999493837356567, "logps/rejected": -1.3213441371917725, "loss": 1.6081, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1999493837356567, "rewards/margins": 0.12139477580785751, "rewards/rejected": -1.3213441371917725, "semantic_entropy": 0.8163037300109863, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 5.52921409081991, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.22078566253185272, "logits/rejected": -0.11441943794488907, "logps/chosen": -1.1605533361434937, "logps/rejected": -1.36667799949646, "loss": 1.5769, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1605533361434937, "rewards/margins": 0.20612454414367676, "rewards/rejected": -1.36667799949646, "semantic_entropy": 0.8326366543769836, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 6.349888839091984, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.33861786127090454, "logits/rejected": -0.20907047390937805, "logps/chosen": -1.1506234407424927, "logps/rejected": -1.3096128702163696, "loss": 1.5713, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1506234407424927, "rewards/margins": 0.1589892953634262, "rewards/rejected": -1.3096128702163696, "semantic_entropy": 0.8413839340209961, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 5.013652824327247, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.18829232454299927, "logits/rejected": -0.059736013412475586, "logps/chosen": -1.1435438394546509, "logps/rejected": -1.275304913520813, "loss": 1.5627, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1435438394546509, "rewards/margins": 0.13176119327545166, "rewards/rejected": -1.275304913520813, "semantic_entropy": 0.8382665514945984, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 7.757787504804834, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.2371334284543991, "logits/rejected": -0.19555452466011047, "logps/chosen": -1.110109567642212, "logps/rejected": -1.4044098854064941, "loss": 1.5178, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.110109567642212, "rewards/margins": 0.2943003177642822, "rewards/rejected": -1.4044098854064941, "semantic_entropy": 0.8153454065322876, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 4.953909102290371, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.312411367893219, "logits/rejected": -0.24798598885536194, "logps/chosen": -1.180785059928894, "logps/rejected": -1.40126371383667, "loss": 1.5921, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.180785059928894, "rewards/margins": 0.2204788625240326, "rewards/rejected": -1.40126371383667, "semantic_entropy": 0.8226381540298462, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 9.857496487300056, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.17745932936668396, "logits/rejected": -0.07730675488710403, "logps/chosen": -1.1867878437042236, "logps/rejected": -1.2937209606170654, "loss": 1.5971, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1867878437042236, "rewards/margins": 0.10693307220935822, "rewards/rejected": -1.2937209606170654, "semantic_entropy": 0.8206270337104797, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 6.0171011951706594, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.0680459514260292, "logits/rejected": -0.010929008014500141, "logps/chosen": -1.1533907651901245, "logps/rejected": -1.437929391860962, "loss": 1.566, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1533907651901245, "rewards/margins": 0.28453850746154785, "rewards/rejected": -1.437929391860962, "semantic_entropy": 0.8252226114273071, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 5.109288351269742, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.138889342546463, "logits/rejected": -0.0562707781791687, "logps/chosen": -1.1587345600128174, "logps/rejected": -1.3460900783538818, "loss": 1.5712, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1587345600128174, "rewards/margins": 0.18735568225383759, "rewards/rejected": -1.3460900783538818, "semantic_entropy": 0.8248738050460815, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 6.259898525013245, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.140023335814476, "logits/rejected": -0.07762879133224487, "logps/chosen": -1.2594740390777588, "logps/rejected": -1.4607652425765991, "loss": 1.6466, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.2594740390777588, "rewards/margins": 0.20129111409187317, "rewards/rejected": -1.4607652425765991, "semantic_entropy": 0.7743476629257202, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 4.2660797575743015, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.2582811713218689, "logits/rejected": -0.1325262188911438, "logps/chosen": -1.1899445056915283, "logps/rejected": -1.3772531747817993, "loss": 1.6, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1899445056915283, "rewards/margins": 0.18730869889259338, "rewards/rejected": -1.3772531747817993, "semantic_entropy": 0.8201318979263306, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 5.48872254768201, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.21464118361473083, "logits/rejected": -0.13742205500602722, "logps/chosen": -1.2322580814361572, "logps/rejected": -1.3302807807922363, "loss": 1.6344, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2322580814361572, "rewards/margins": 0.09802266210317612, "rewards/rejected": -1.3302807807922363, "semantic_entropy": 0.8042311668395996, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 5.052493303668469, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.1607995331287384, "logits/rejected": -0.057098377496004105, "logps/chosen": -1.1783926486968994, "logps/rejected": -1.3661010265350342, "loss": 1.5857, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1783926486968994, "rewards/margins": 0.1877085268497467, "rewards/rejected": -1.3661010265350342, "semantic_entropy": 0.8145910501480103, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 4.4896707180987345, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.2395261824131012, "logits/rejected": -0.12110918760299683, "logps/chosen": -1.1718931198120117, "logps/rejected": -1.347839117050171, "loss": 1.5799, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1718931198120117, "rewards/margins": 0.17594608664512634, "rewards/rejected": -1.347839117050171, "semantic_entropy": 0.8160476684570312, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 5.09022320923381, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.243981271982193, "logits/rejected": -0.2738270163536072, "logps/chosen": -1.1903190612792969, "logps/rejected": -1.4210926294326782, "loss": 1.6035, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1903190612792969, "rewards/margins": 0.2307736575603485, "rewards/rejected": -1.4210926294326782, "semantic_entropy": 0.8264458775520325, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 6.261230619986042, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.18451471626758575, "logits/rejected": -0.11190980672836304, "logps/chosen": -1.106879711151123, "logps/rejected": -1.3452517986297607, "loss": 1.5232, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.106879711151123, "rewards/margins": 0.23837217688560486, "rewards/rejected": -1.3452517986297607, "semantic_entropy": 0.8325529098510742, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 7.2286288675038035, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.14138111472129822, "logits/rejected": -0.08882437646389008, "logps/chosen": -1.1970179080963135, "logps/rejected": -1.292048454284668, "loss": 1.6112, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1970179080963135, "rewards/margins": 0.09503050148487091, "rewards/rejected": -1.292048454284668, "semantic_entropy": 0.828389048576355, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 5.797910483904959, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.3514033854007721, "logits/rejected": -0.14897218346595764, "logps/chosen": -1.1621859073638916, "logps/rejected": -1.3483350276947021, "loss": 1.5719, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1621859073638916, "rewards/margins": 0.186149001121521, "rewards/rejected": -1.3483350276947021, "semantic_entropy": 0.8194893002510071, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 4.82443430403224, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.13451623916625977, "logits/rejected": -0.023087620735168457, "logps/chosen": -1.2260851860046387, "logps/rejected": -1.4703863859176636, "loss": 1.6263, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2260851860046387, "rewards/margins": 0.24430112540721893, "rewards/rejected": -1.4703863859176636, "semantic_entropy": 0.8004641532897949, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 4.7301509692288155, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.26739370822906494, "logits/rejected": -0.14884258806705475, "logps/chosen": -1.1885273456573486, "logps/rejected": -1.3071404695510864, "loss": 1.5931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1885273456573486, "rewards/margins": 0.11861300468444824, "rewards/rejected": -1.3071404695510864, "semantic_entropy": 0.8092037439346313, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 4.247546523299479, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.20447468757629395, "logits/rejected": -0.15082180500030518, "logps/chosen": -1.192589521408081, "logps/rejected": -1.2797870635986328, "loss": 1.6133, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.192589521408081, "rewards/margins": 0.08719761669635773, "rewards/rejected": -1.2797870635986328, "semantic_entropy": 0.8413387537002563, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 5.41142084123019, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.20720532536506653, "logits/rejected": -0.04938402771949768, "logps/chosen": -1.1850001811981201, "logps/rejected": -1.403842568397522, "loss": 1.5912, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1850001811981201, "rewards/margins": 0.21884234249591827, "rewards/rejected": -1.403842568397522, "semantic_entropy": 0.8124829530715942, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 5.663010647079669, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.22689147293567657, "logits/rejected": -0.13233539462089539, "logps/chosen": -1.1548818349838257, "logps/rejected": -1.362243413925171, "loss": 1.566, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1548818349838257, "rewards/margins": 0.20736172795295715, "rewards/rejected": -1.362243413925171, "semantic_entropy": 0.8221772909164429, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 5.4914839305953596, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.19999375939369202, "logits/rejected": -0.2165372371673584, "logps/chosen": -1.1714431047439575, "logps/rejected": -1.4042613506317139, "loss": 1.5773, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1714431047439575, "rewards/margins": 0.23281824588775635, "rewards/rejected": -1.4042613506317139, "semantic_entropy": 0.8116925358772278, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 5.4714560086000725, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.11451669037342072, "logits/rejected": -0.03737467899918556, "logps/chosen": -1.2169392108917236, "logps/rejected": -1.3454158306121826, "loss": 1.6241, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2169392108917236, "rewards/margins": 0.12847664952278137, "rewards/rejected": -1.3454158306121826, "semantic_entropy": 0.8143658638000488, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 5.539005587940148, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.22387132048606873, "logits/rejected": -0.12062165886163712, "logps/chosen": -1.1019034385681152, "logps/rejected": -1.2801936864852905, "loss": 1.527, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1019034385681152, "rewards/margins": 0.17829032242298126, "rewards/rejected": -1.2801936864852905, "semantic_entropy": 0.8501994013786316, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 6.452079659088734, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.1358400136232376, "logits/rejected": -0.08804251253604889, "logps/chosen": -1.147835373878479, "logps/rejected": -1.3965626955032349, "loss": 1.5697, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.147835373878479, "rewards/margins": 0.24872732162475586, "rewards/rejected": -1.3965626955032349, "semantic_entropy": 0.8436657190322876, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 5.1084716543428526, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.17675364017486572, "logits/rejected": -0.1460619866847992, "logps/chosen": -1.209539771080017, "logps/rejected": -1.3522628545761108, "loss": 1.6191, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.209539771080017, "rewards/margins": 0.14272302389144897, "rewards/rejected": -1.3522628545761108, "semantic_entropy": 0.8190957307815552, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 5.667515240784143, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.1065405011177063, "logits/rejected": -0.06168811395764351, "logps/chosen": -1.3162662982940674, "logps/rejected": -1.4139964580535889, "loss": 1.7147, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3162662982940674, "rewards/margins": 0.09773032367229462, "rewards/rejected": -1.4139964580535889, "semantic_entropy": 0.7968921661376953, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 6.365832950520623, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.1529506891965866, "logits/rejected": -0.02435176447033882, "logps/chosen": -1.1400558948516846, "logps/rejected": -1.3359276056289673, "loss": 1.5477, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1400558948516846, "rewards/margins": 0.19587165117263794, "rewards/rejected": -1.3359276056289673, "semantic_entropy": 0.8153068423271179, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 4.548254688635265, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.27473416924476624, "logits/rejected": -0.09734959900379181, "logps/chosen": -1.1419093608856201, "logps/rejected": -1.3564326763153076, "loss": 1.5631, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1419093608856201, "rewards/margins": 0.21452336013317108, "rewards/rejected": -1.3564326763153076, "semantic_entropy": 0.8423225283622742, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 6.521972104738788, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.23122099041938782, "logits/rejected": -0.22746233642101288, "logps/chosen": -1.1982905864715576, "logps/rejected": -1.3453247547149658, "loss": 1.6084, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1982905864715576, "rewards/margins": 0.14703437685966492, "rewards/rejected": -1.3453247547149658, "semantic_entropy": 0.8201590776443481, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 6.021195096847241, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.20530113577842712, "logits/rejected": -0.20220032334327698, "logps/chosen": -1.1813305616378784, "logps/rejected": -1.4230222702026367, "loss": 1.5857, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1813305616378784, "rewards/margins": 0.24169154465198517, "rewards/rejected": -1.4230222702026367, "semantic_entropy": 0.8087220191955566, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 6.877956493271813, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.16880105435848236, "logits/rejected": -0.1402072012424469, "logps/chosen": -1.1766383647918701, "logps/rejected": -1.3262929916381836, "loss": 1.5911, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1766383647918701, "rewards/margins": 0.14965465664863586, "rewards/rejected": -1.3262929916381836, "semantic_entropy": 0.8289486169815063, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 6.229257200252672, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.13708363473415375, "logits/rejected": -0.06658971309661865, "logps/chosen": -1.3100519180297852, "logps/rejected": -1.4620118141174316, "loss": 1.7091, "rewards/accuracies": 0.53125, "rewards/chosen": -1.3100519180297852, "rewards/margins": 0.15195997059345245, "rewards/rejected": -1.4620118141174316, "semantic_entropy": 0.7981749773025513, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 6.453739522684308, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.2447488009929657, "logits/rejected": -0.202292799949646, "logps/chosen": -1.175835371017456, "logps/rejected": -1.381829023361206, "loss": 1.5765, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.175835371017456, "rewards/margins": 0.20599377155303955, "rewards/rejected": -1.381829023361206, "semantic_entropy": 0.8012315630912781, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 6.06396534664486, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.25284430384635925, "logits/rejected": -0.10677488148212433, "logps/chosen": -1.1540048122406006, "logps/rejected": -1.275700330734253, "loss": 1.5736, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1540048122406006, "rewards/margins": 0.1216956228017807, "rewards/rejected": -1.275700330734253, "semantic_entropy": 0.8392844200134277, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 5.844508020045383, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.23845867812633514, "logits/rejected": -0.12692376971244812, "logps/chosen": -1.2270883321762085, "logps/rejected": -1.3483750820159912, "loss": 1.6372, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2270883321762085, "rewards/margins": 0.12128664553165436, "rewards/rejected": -1.3483750820159912, "semantic_entropy": 0.8203119039535522, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 5.782163538942205, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.17216728627681732, "logits/rejected": -0.1305660456418991, "logps/chosen": -1.1966090202331543, "logps/rejected": -1.289238691329956, "loss": 1.6146, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.1966090202331543, "rewards/margins": 0.09262975305318832, "rewards/rejected": -1.289238691329956, "semantic_entropy": 0.8360216021537781, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 7.04307722637663, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.21597878634929657, "logits/rejected": -0.1146397739648819, "logps/chosen": -1.2200103998184204, "logps/rejected": -1.3769346475601196, "loss": 1.6228, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2200103998184204, "rewards/margins": 0.15692415833473206, "rewards/rejected": -1.3769346475601196, "semantic_entropy": 0.8056038618087769, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 7.4702640906300335, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.009552930481731892, "logits/rejected": -0.06247756630182266, "logps/chosen": -1.1254466772079468, "logps/rejected": -1.3883439302444458, "loss": 1.5364, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1254466772079468, "rewards/margins": 0.262897253036499, "rewards/rejected": -1.3883439302444458, "semantic_entropy": 0.8218750953674316, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 5.043823656159, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.14745041728019714, "logits/rejected": -0.12443532049655914, "logps/chosen": -1.1473709344863892, "logps/rejected": -1.293113112449646, "loss": 1.5659, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1473709344863892, "rewards/margins": 0.14574211835861206, "rewards/rejected": -1.293113112449646, "semantic_entropy": 0.8371566534042358, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 7.471773386592667, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.19864864647388458, "logits/rejected": -0.16300702095031738, "logps/chosen": -1.065683126449585, "logps/rejected": -1.3315073251724243, "loss": 1.4914, "rewards/accuracies": 0.625, "rewards/chosen": -1.065683126449585, "rewards/margins": 0.26582401990890503, "rewards/rejected": -1.3315073251724243, "semantic_entropy": 0.8514785766601562, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 5.191673643267243, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.2630145847797394, "logits/rejected": -0.19936862587928772, "logps/chosen": -1.112653136253357, "logps/rejected": -1.303905725479126, "loss": 1.5252, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.112653136253357, "rewards/margins": 0.19125264883041382, "rewards/rejected": -1.303905725479126, "semantic_entropy": 0.8250961303710938, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 6.652747086233137, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.17713990807533264, "logits/rejected": -0.13936197757720947, "logps/chosen": -1.171979308128357, "logps/rejected": -1.3692396879196167, "loss": 1.5749, "rewards/accuracies": 0.59375, "rewards/chosen": -1.171979308128357, "rewards/margins": 0.19726040959358215, "rewards/rejected": -1.3692396879196167, "semantic_entropy": 0.8057729601860046, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 7.24907928245829, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.19944065809249878, "logits/rejected": -0.01121221762150526, "logps/chosen": -1.2501572370529175, "logps/rejected": -1.4698867797851562, "loss": 1.6456, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2501572370529175, "rewards/margins": 0.2197294682264328, "rewards/rejected": -1.4698867797851562, "semantic_entropy": 0.7908746004104614, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 5.789859477917714, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.09272589534521103, "logits/rejected": 0.034744732081890106, "logps/chosen": -1.125180959701538, "logps/rejected": -1.2798967361450195, "loss": 1.5484, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.125180959701538, "rewards/margins": 0.15471577644348145, "rewards/rejected": -1.2798967361450195, "semantic_entropy": 0.8464130163192749, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 5.379927743178426, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.1815638542175293, "logits/rejected": -0.12836329638957977, "logps/chosen": -1.1617991924285889, "logps/rejected": -1.2463396787643433, "loss": 1.5717, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.1617991924285889, "rewards/margins": 0.08454050868749619, "rewards/rejected": -1.2463396787643433, "semantic_entropy": 0.8198903799057007, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 5.454210209188993, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.2398330420255661, "logits/rejected": -0.13930942118167877, "logps/chosen": -1.2433769702911377, "logps/rejected": -1.371046781539917, "loss": 1.647, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2433769702911377, "rewards/margins": 0.12766972184181213, "rewards/rejected": -1.371046781539917, "semantic_entropy": 0.8071671724319458, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 5.470678858055292, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.29332485795021057, "logits/rejected": -0.20562238991260529, "logps/chosen": -1.095189094543457, "logps/rejected": -1.4198602437973022, "loss": 1.5211, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.095189094543457, "rewards/margins": 0.3246711790561676, "rewards/rejected": -1.4198602437973022, "semantic_entropy": 0.8517764210700989, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 6.344751055716861, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.28106459975242615, "logits/rejected": -0.16111283004283905, "logps/chosen": -1.2475659847259521, "logps/rejected": -1.4131131172180176, "loss": 1.6468, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2475659847259521, "rewards/margins": 0.16554707288742065, "rewards/rejected": -1.4131131172180176, "semantic_entropy": 0.7985636591911316, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 4.33807178447299, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.3282173275947571, "logits/rejected": -0.17481629550457, "logps/chosen": -1.1834831237792969, "logps/rejected": -1.4144530296325684, "loss": 1.5901, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1834831237792969, "rewards/margins": 0.23096971213817596, "rewards/rejected": -1.4144530296325684, "semantic_entropy": 0.8132593035697937, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 5.943973763905658, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.12029655277729034, "logits/rejected": -0.021431569010019302, "logps/chosen": -1.1262171268463135, "logps/rejected": -1.3706810474395752, "loss": 1.5456, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1262171268463135, "rewards/margins": 0.24446384608745575, "rewards/rejected": -1.3706810474395752, "semantic_entropy": 0.838672935962677, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 6.474960771204301, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.195916086435318, "logits/rejected": -0.06300166994333267, "logps/chosen": -1.1948484182357788, "logps/rejected": -1.3169810771942139, "loss": 1.6067, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1948484182357788, "rewards/margins": 0.12213277816772461, "rewards/rejected": -1.3169810771942139, "semantic_entropy": 0.8236440420150757, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 5.118926920743012, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.25815996527671814, "logits/rejected": -0.13134902715682983, "logps/chosen": -1.1541625261306763, "logps/rejected": -1.3163402080535889, "loss": 1.5613, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1541625261306763, "rewards/margins": 0.16217786073684692, "rewards/rejected": -1.3163402080535889, "semantic_entropy": 0.8141773343086243, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 9.265520978819074, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.2834155559539795, "logits/rejected": -0.11906103044748306, "logps/chosen": -1.1684099435806274, "logps/rejected": -1.3839528560638428, "loss": 1.5809, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1684099435806274, "rewards/margins": 0.21554288268089294, "rewards/rejected": -1.3839528560638428, "semantic_entropy": 0.8250047564506531, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 6.469541539925241, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.21147695183753967, "logits/rejected": -0.17885956168174744, "logps/chosen": -1.2880058288574219, "logps/rejected": -1.4623143672943115, "loss": 1.6865, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2880058288574219, "rewards/margins": 0.1743084192276001, "rewards/rejected": -1.4623143672943115, "semantic_entropy": 0.7969205379486084, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 6.415020652512225, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.3173529803752899, "logits/rejected": -0.13515403866767883, "logps/chosen": -1.1369807720184326, "logps/rejected": -1.3754149675369263, "loss": 1.5437, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1369807720184326, "rewards/margins": 0.23843412101268768, "rewards/rejected": -1.3754149675369263, "semantic_entropy": 0.81353759765625, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 4.811281784277279, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.3560958504676819, "logits/rejected": -0.19394418597221375, "logps/chosen": -1.1683796644210815, "logps/rejected": -1.3916758298873901, "loss": 1.5875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1683796644210815, "rewards/margins": 0.22329625487327576, "rewards/rejected": -1.3916758298873901, "semantic_entropy": 0.8382458686828613, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 7.226620088047714, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.14493972063064575, "logits/rejected": -0.09346255660057068, "logps/chosen": -1.164661169052124, "logps/rejected": -1.3571916818618774, "loss": 1.5776, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.164661169052124, "rewards/margins": 0.19253046810626984, "rewards/rejected": -1.3571916818618774, "semantic_entropy": 0.825888991355896, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 5.3452075199395, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.16351541876792908, "logits/rejected": -0.07005085051059723, "logps/chosen": -1.2592918872833252, "logps/rejected": -1.395971655845642, "loss": 1.6639, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.2592918872833252, "rewards/margins": 0.1366797685623169, "rewards/rejected": -1.395971655845642, "semantic_entropy": 0.8093145489692688, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 7.399162250564681, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.2341056764125824, "logits/rejected": -0.08554646372795105, "logps/chosen": -1.153937578201294, "logps/rejected": -1.2926714420318604, "loss": 1.568, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.153937578201294, "rewards/margins": 0.1387338638305664, "rewards/rejected": -1.2926714420318604, "semantic_entropy": 0.8280371427536011, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 4.047427730519607, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.2623864412307739, "logits/rejected": -0.17165681719779968, "logps/chosen": -1.1644560098648071, "logps/rejected": -1.3542258739471436, "loss": 1.5711, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1644560098648071, "rewards/margins": 0.18976984918117523, "rewards/rejected": -1.3542258739471436, "semantic_entropy": 0.8132050633430481, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 5.485021570013116, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.2755053639411926, "logits/rejected": -0.19668903946876526, "logps/chosen": -1.2260733842849731, "logps/rejected": -1.4181524515151978, "loss": 1.6218, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2260733842849731, "rewards/margins": 0.19207903742790222, "rewards/rejected": -1.4181524515151978, "semantic_entropy": 0.7915480732917786, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 5.126895561410886, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.2537384033203125, "logits/rejected": -0.20249919593334198, "logps/chosen": -1.1536331176757812, "logps/rejected": -1.4676556587219238, "loss": 1.5557, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1536331176757812, "rewards/margins": 0.3140224814414978, "rewards/rejected": -1.4676556587219238, "semantic_entropy": 0.8041528463363647, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 5.187207229614111, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.2422453910112381, "logits/rejected": -0.09306782484054565, "logps/chosen": -1.1506612300872803, "logps/rejected": -1.3426655530929565, "loss": 1.5702, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1506612300872803, "rewards/margins": 0.19200429320335388, "rewards/rejected": -1.3426655530929565, "semantic_entropy": 0.8389831781387329, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 7.395718333706245, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.20249783992767334, "logits/rejected": -0.10615064948797226, "logps/chosen": -1.2493163347244263, "logps/rejected": -1.3707681894302368, "loss": 1.6508, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2493163347244263, "rewards/margins": 0.1214517205953598, "rewards/rejected": -1.3707681894302368, "semantic_entropy": 0.8029763102531433, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 6.104802220257584, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.2825106978416443, "logits/rejected": -0.20105138421058655, "logps/chosen": -1.2427489757537842, "logps/rejected": -1.3232057094573975, "loss": 1.643, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2427489757537842, "rewards/margins": 0.0804567039012909, "rewards/rejected": -1.3232057094573975, "semantic_entropy": 0.8004171252250671, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 5.967222966708706, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.20327219367027283, "logits/rejected": -0.17749294638633728, "logps/chosen": -1.2504197359085083, "logps/rejected": -1.4823780059814453, "loss": 1.6502, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2504197359085083, "rewards/margins": 0.231958270072937, "rewards/rejected": -1.4823780059814453, "semantic_entropy": 0.7995802164077759, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 5.477327491188166, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.22447064518928528, "logits/rejected": -0.14304491877555847, "logps/chosen": -1.093876600265503, "logps/rejected": -1.281096339225769, "loss": 1.5093, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.093876600265503, "rewards/margins": 0.1872197389602661, "rewards/rejected": -1.281096339225769, "semantic_entropy": 0.8308781385421753, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 6.272429682707429, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.32161200046539307, "logits/rejected": -0.20424389839172363, "logps/chosen": -1.1848098039627075, "logps/rejected": -1.3006442785263062, "loss": 1.5919, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1848098039627075, "rewards/margins": 0.11583447456359863, "rewards/rejected": -1.3006442785263062, "semantic_entropy": 0.8142232894897461, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 5.923088088817108, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.23960113525390625, "logits/rejected": -0.1392134130001068, "logps/chosen": -1.1781275272369385, "logps/rejected": -1.3265058994293213, "loss": 1.5917, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1781275272369385, "rewards/margins": 0.1483784168958664, "rewards/rejected": -1.3265058994293213, "semantic_entropy": 0.8271776437759399, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 7.413959981881807, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.273266464471817, "logits/rejected": -0.1663859784603119, "logps/chosen": -1.1294550895690918, "logps/rejected": -1.2970006465911865, "loss": 1.537, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1294550895690918, "rewards/margins": 0.16754552721977234, "rewards/rejected": -1.2970006465911865, "semantic_entropy": 0.8150800466537476, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.027700087055563927, "eval_logits/rejected": 0.09298869967460632, "eval_logps/chosen": -1.2672092914581299, "eval_logps/rejected": -1.4042826890945435, "eval_loss": 1.6700152158737183, "eval_rewards/accuracies": 0.5482195615768433, "eval_rewards/chosen": -1.2672092914581299, "eval_rewards/margins": 0.13707335293293, "eval_rewards/rejected": -1.4042826890945435, "eval_runtime": 34.5991, "eval_samples_per_second": 38.874, "eval_semantic_entropy": 0.8053316473960876, "eval_steps_per_second": 9.74, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 4.899599493509731, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.16172626614570618, "logits/rejected": -0.15884217619895935, "logps/chosen": -1.187535285949707, "logps/rejected": -1.303769588470459, "loss": 1.6076, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -1.187535285949707, "rewards/margins": 0.11623434722423553, "rewards/rejected": -1.303769588470459, "semantic_entropy": 0.8400986790657043, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 6.19181042160228, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.22247116267681122, "logits/rejected": -0.1709560602903366, "logps/chosen": -1.171589970588684, "logps/rejected": -1.3086298704147339, "loss": 1.5851, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.171589970588684, "rewards/margins": 0.13704009354114532, "rewards/rejected": -1.3086298704147339, "semantic_entropy": 0.8270018696784973, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 5.457870675882319, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.22052256762981415, "logits/rejected": -0.15993133187294006, "logps/chosen": -1.1752756834030151, "logps/rejected": -1.3884080648422241, "loss": 1.5801, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1752756834030151, "rewards/margins": 0.21313218772411346, "rewards/rejected": -1.3884080648422241, "semantic_entropy": 0.8097451329231262, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 5.38246708761095, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.22449664771556854, "logits/rejected": -0.18037332594394684, "logps/chosen": -1.1507208347320557, "logps/rejected": -1.338881492614746, "loss": 1.555, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1507208347320557, "rewards/margins": 0.1881604939699173, "rewards/rejected": -1.338881492614746, "semantic_entropy": 0.8084988594055176, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 5.663109372790309, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.18718287348747253, "logits/rejected": -0.0982210785150528, "logps/chosen": -1.2385473251342773, "logps/rejected": -1.4307384490966797, "loss": 1.6401, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2385473251342773, "rewards/margins": 0.1921912133693695, "rewards/rejected": -1.4307384490966797, "semantic_entropy": 0.8031408190727234, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 8.764366407573233, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.3000984489917755, "logits/rejected": -0.20205077528953552, "logps/chosen": -1.197575569152832, "logps/rejected": -1.337348461151123, "loss": 1.6063, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.197575569152832, "rewards/margins": 0.13977280259132385, "rewards/rejected": -1.337348461151123, "semantic_entropy": 0.8175147771835327, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 4.953201467225806, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.25233954191207886, "logits/rejected": -0.19374088943004608, "logps/chosen": -1.215494155883789, "logps/rejected": -1.3425992727279663, "loss": 1.6222, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.215494155883789, "rewards/margins": 0.12710505723953247, "rewards/rejected": -1.3425992727279663, "semantic_entropy": 0.8134604692459106, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 5.624328725668429, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.22095008194446564, "logits/rejected": -0.05417835712432861, "logps/chosen": -1.1956743001937866, "logps/rejected": -1.3386344909667969, "loss": 1.6063, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.1956743001937866, "rewards/margins": 0.14296004176139832, "rewards/rejected": -1.3386344909667969, "semantic_entropy": 0.8211973309516907, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 5.057466709984263, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.13732822239398956, "logits/rejected": -0.09785889089107513, "logps/chosen": -1.1524379253387451, "logps/rejected": -1.2888638973236084, "loss": 1.5782, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1524379253387451, "rewards/margins": 0.1364259123802185, "rewards/rejected": -1.2888638973236084, "semantic_entropy": 0.851573646068573, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 5.454946319432153, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.2387726753950119, "logits/rejected": -0.1369660198688507, "logps/chosen": -1.2224401235580444, "logps/rejected": -1.3440228700637817, "loss": 1.6257, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.2224401235580444, "rewards/margins": 0.12158264964818954, "rewards/rejected": -1.3440228700637817, "semantic_entropy": 0.8065109252929688, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 5.089087559933949, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.17929542064666748, "logits/rejected": -0.009806843474507332, "logps/chosen": -1.2119605541229248, "logps/rejected": -1.2874178886413574, "loss": 1.6259, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2119605541229248, "rewards/margins": 0.07545743882656097, "rewards/rejected": -1.2874178886413574, "semantic_entropy": 0.8278552889823914, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 7.637695527985737, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.22138150036334991, "logits/rejected": -0.16821135580539703, "logps/chosen": -1.1913281679153442, "logps/rejected": -1.3632866144180298, "loss": 1.5948, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1913281679153442, "rewards/margins": 0.17195859551429749, "rewards/rejected": -1.3632866144180298, "semantic_entropy": 0.8068788647651672, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 5.5651965285197535, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.3304412066936493, "logits/rejected": -0.17530521750450134, "logps/chosen": -1.370415210723877, "logps/rejected": -1.4041099548339844, "loss": 1.7585, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.370415210723877, "rewards/margins": 0.033694807440042496, "rewards/rejected": -1.4041099548339844, "semantic_entropy": 0.7760937213897705, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 4.889086991984989, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.20290783047676086, "logits/rejected": -0.08997891843318939, "logps/chosen": -1.1350520849227905, "logps/rejected": -1.3978471755981445, "loss": 1.5573, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1350520849227905, "rewards/margins": 0.2627951204776764, "rewards/rejected": -1.3978471755981445, "semantic_entropy": 0.8444234728813171, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 6.333984098495397, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.19114695489406586, "logits/rejected": -0.06502260267734528, "logps/chosen": -1.2191637754440308, "logps/rejected": -1.3933669328689575, "loss": 1.626, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2191637754440308, "rewards/margins": 0.17420312762260437, "rewards/rejected": -1.3933669328689575, "semantic_entropy": 0.8136224746704102, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 5.581001385573074, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.19595910608768463, "logits/rejected": -0.07162782549858093, "logps/chosen": -1.2411977052688599, "logps/rejected": -1.3477356433868408, "loss": 1.651, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.2411977052688599, "rewards/margins": 0.10653793811798096, "rewards/rejected": -1.3477356433868408, "semantic_entropy": 0.8196479082107544, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 5.883085689610549, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.24358844757080078, "logits/rejected": -0.16975031793117523, "logps/chosen": -1.1667983531951904, "logps/rejected": -1.313002347946167, "loss": 1.5905, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1667983531951904, "rewards/margins": 0.1462039351463318, "rewards/rejected": -1.313002347946167, "semantic_entropy": 0.8474776148796082, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 4.292258717594069, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.21653184294700623, "logits/rejected": -0.11125503480434418, "logps/chosen": -1.2387261390686035, "logps/rejected": -1.354978322982788, "loss": 1.6485, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.2387261390686035, "rewards/margins": 0.11625204980373383, "rewards/rejected": -1.354978322982788, "semantic_entropy": 0.8195913434028625, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 3.6873072921783705, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.26647061109542847, "logits/rejected": -0.11514315754175186, "logps/chosen": -1.2834727764129639, "logps/rejected": -1.4842513799667358, "loss": 1.6776, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2834727764129639, "rewards/margins": 0.20077872276306152, "rewards/rejected": -1.4842513799667358, "semantic_entropy": 0.788171112537384, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 5.729172173848957, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.24200348556041718, "logits/rejected": -0.22081796824932098, "logps/chosen": -1.1831735372543335, "logps/rejected": -1.4513851404190063, "loss": 1.5897, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.1831735372543335, "rewards/margins": 0.2682115137577057, "rewards/rejected": -1.4513851404190063, "semantic_entropy": 0.8130677938461304, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 5.583041680165885, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.21722412109375, "logits/rejected": -0.10467751324176788, "logps/chosen": -1.2482264041900635, "logps/rejected": -1.4273207187652588, "loss": 1.6337, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2482264041900635, "rewards/margins": 0.17909429967403412, "rewards/rejected": -1.4273207187652588, "semantic_entropy": 0.7709903717041016, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 3.7884424586810477, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.33781111240386963, "logits/rejected": -0.1396912783384323, "logps/chosen": -1.1316759586334229, "logps/rejected": -1.3783786296844482, "loss": 1.5407, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1316759586334229, "rewards/margins": 0.24670283496379852, "rewards/rejected": -1.3783786296844482, "semantic_entropy": 0.8180691003799438, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 5.280322188272147, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.12690454721450806, "logits/rejected": -0.058228593319654465, "logps/chosen": -1.1722227334976196, "logps/rejected": -1.2653868198394775, "loss": 1.5915, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1722227334976196, "rewards/margins": 0.09316390752792358, "rewards/rejected": -1.2653868198394775, "semantic_entropy": 0.8385244607925415, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 4.6040573654282, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.14167170226573944, "logits/rejected": -0.09052537381649017, "logps/chosen": -1.1747121810913086, "logps/rejected": -1.261125922203064, "loss": 1.5964, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.1747121810913086, "rewards/margins": 0.08641383796930313, "rewards/rejected": -1.261125922203064, "semantic_entropy": 0.8433883786201477, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 5.478783648819646, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.18699513375759125, "logits/rejected": -0.09736774861812592, "logps/chosen": -1.2216894626617432, "logps/rejected": -1.4373347759246826, "loss": 1.6192, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2216894626617432, "rewards/margins": 0.21564534306526184, "rewards/rejected": -1.4373347759246826, "semantic_entropy": 0.7950273752212524, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 7.2166052036988875, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.13375705480575562, "logits/rejected": -0.09192383289337158, "logps/chosen": -1.1569154262542725, "logps/rejected": -1.3403160572052002, "loss": 1.5762, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1569154262542725, "rewards/margins": 0.1834004819393158, "rewards/rejected": -1.3403160572052002, "semantic_entropy": 0.838494598865509, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 6.187742909072841, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.1285972148180008, "logits/rejected": -0.1494758576154709, "logps/chosen": -1.1915875673294067, "logps/rejected": -1.3053762912750244, "loss": 1.6101, "rewards/accuracies": 0.53125, "rewards/chosen": -1.1915875673294067, "rewards/margins": 0.11378880590200424, "rewards/rejected": -1.3053762912750244, "semantic_entropy": 0.8370029330253601, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 5.308482599279001, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.16552288830280304, "logits/rejected": -0.1655363142490387, "logps/chosen": -1.219613790512085, "logps/rejected": -1.3235218524932861, "loss": 1.6217, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.219613790512085, "rewards/margins": 0.10390814393758774, "rewards/rejected": -1.3235218524932861, "semantic_entropy": 0.8042513132095337, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 6.116631072482262, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.32022494077682495, "logits/rejected": -0.19335010647773743, "logps/chosen": -1.1525671482086182, "logps/rejected": -1.3434569835662842, "loss": 1.5666, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1525671482086182, "rewards/margins": 0.19088973104953766, "rewards/rejected": -1.3434569835662842, "semantic_entropy": 0.8281334638595581, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 5.654100880875961, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.17696094512939453, "logits/rejected": -0.21474246680736542, "logps/chosen": -1.273454189300537, "logps/rejected": -1.4090838432312012, "loss": 1.6704, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.273454189300537, "rewards/margins": 0.13562971353530884, "rewards/rejected": -1.4090838432312012, "semantic_entropy": 0.7939369082450867, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 7.482228710930792, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.25565433502197266, "logits/rejected": -0.19009876251220703, "logps/chosen": -1.1828062534332275, "logps/rejected": -1.339215636253357, "loss": 1.5925, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.1828062534332275, "rewards/margins": 0.1564093381166458, "rewards/rejected": -1.339215636253357, "semantic_entropy": 0.8193303942680359, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 6.881155948008991, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.19264690577983856, "logits/rejected": -0.04387538880109787, "logps/chosen": -1.2032321691513062, "logps/rejected": -1.3728487491607666, "loss": 1.6021, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2032321691513062, "rewards/margins": 0.16961655020713806, "rewards/rejected": -1.3728487491607666, "semantic_entropy": 0.7978094816207886, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 5.683498620573821, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.2734878957271576, "logits/rejected": -0.15563836693763733, "logps/chosen": -1.201833963394165, "logps/rejected": -1.397862195968628, "loss": 1.615, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.201833963394165, "rewards/margins": 0.1960281878709793, "rewards/rejected": -1.397862195968628, "semantic_entropy": 0.8262723088264465, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 6.556909607176425, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.15590143203735352, "logits/rejected": -0.1370287984609604, "logps/chosen": -1.168755292892456, "logps/rejected": -1.2662416696548462, "loss": 1.5853, "rewards/accuracies": 0.5, "rewards/chosen": -1.168755292892456, "rewards/margins": 0.09748627990484238, "rewards/rejected": -1.2662416696548462, "semantic_entropy": 0.8331258893013, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 5.915395930278695, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.21506504714488983, "logits/rejected": -0.13724765181541443, "logps/chosen": -1.1483042240142822, "logps/rejected": -1.4054268598556519, "loss": 1.5546, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1483042240142822, "rewards/margins": 0.2571226954460144, "rewards/rejected": -1.4054268598556519, "semantic_entropy": 0.8125613927841187, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 4.19684401567249, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.22821882367134094, "logits/rejected": -0.06705351918935776, "logps/chosen": -1.3252593278884888, "logps/rejected": -1.442463994026184, "loss": 1.7214, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3252593278884888, "rewards/margins": 0.11720477044582367, "rewards/rejected": -1.442463994026184, "semantic_entropy": 0.7921825647354126, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 5.857597030471675, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.1760951280593872, "logits/rejected": -0.12244913727045059, "logps/chosen": -1.1747556924819946, "logps/rejected": -1.301001787185669, "loss": 1.5803, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1747556924819946, "rewards/margins": 0.12624609470367432, "rewards/rejected": -1.301001787185669, "semantic_entropy": 0.8110553622245789, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 5.0236651124053, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.17454347014427185, "logits/rejected": -0.09962751716375351, "logps/chosen": -1.2282720804214478, "logps/rejected": -1.3416638374328613, "loss": 1.6332, "rewards/accuracies": 0.53125, "rewards/chosen": -1.2282720804214478, "rewards/margins": 0.11339167505502701, "rewards/rejected": -1.3416638374328613, "semantic_entropy": 0.809842586517334, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 4.55705822248592, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.350921094417572, "logits/rejected": -0.20238156616687775, "logps/chosen": -1.1485497951507568, "logps/rejected": -1.347874402999878, "loss": 1.563, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1485497951507568, "rewards/margins": 0.1993245780467987, "rewards/rejected": -1.347874402999878, "semantic_entropy": 0.8288493156433105, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 5.749831879788882, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.23490352928638458, "logits/rejected": -0.03213455528020859, "logps/chosen": -1.2031171321868896, "logps/rejected": -1.5328325033187866, "loss": 1.6084, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2031171321868896, "rewards/margins": 0.3297153115272522, "rewards/rejected": -1.5328325033187866, "semantic_entropy": 0.8106520771980286, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 7.128384458193278, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.15077051520347595, "logits/rejected": -0.1360398530960083, "logps/chosen": -1.247150182723999, "logps/rejected": -1.4482719898223877, "loss": 1.6485, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.247150182723999, "rewards/margins": 0.2011217325925827, "rewards/rejected": -1.4482719898223877, "semantic_entropy": 0.8026644587516785, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 5.30403952420332, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.1441146582365036, "logits/rejected": -0.051567353308200836, "logps/chosen": -1.1867955923080444, "logps/rejected": -1.4536371231079102, "loss": 1.5895, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1867955923080444, "rewards/margins": 0.2668416202068329, "rewards/rejected": -1.4536371231079102, "semantic_entropy": 0.8054877519607544, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 3.7625207987198817, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.2413823902606964, "logits/rejected": -0.11689722537994385, "logps/chosen": -1.215753197669983, "logps/rejected": -1.372206449508667, "loss": 1.6259, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.215753197669983, "rewards/margins": 0.15645340085029602, "rewards/rejected": -1.372206449508667, "semantic_entropy": 0.8201963305473328, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 4.601994633831063, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.19543695449829102, "logits/rejected": -0.14571912586688995, "logps/chosen": -1.120548963546753, "logps/rejected": -1.2865253686904907, "loss": 1.5328, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.120548963546753, "rewards/margins": 0.165976420044899, "rewards/rejected": -1.2865253686904907, "semantic_entropy": 0.8245772123336792, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 7.668942096072226, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.19049115478992462, "logits/rejected": -0.10552265495061874, "logps/chosen": -1.2237884998321533, "logps/rejected": -1.4074777364730835, "loss": 1.6176, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.2237884998321533, "rewards/margins": 0.18368938565254211, "rewards/rejected": -1.4074777364730835, "semantic_entropy": 0.7876933813095093, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 4.737951486126168, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.23236480355262756, "logits/rejected": -0.10827644914388657, "logps/chosen": -1.1727113723754883, "logps/rejected": -1.4416927099227905, "loss": 1.5767, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1727113723754883, "rewards/margins": 0.26898136734962463, "rewards/rejected": -1.4416927099227905, "semantic_entropy": 0.808040976524353, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 6.86503610372617, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.3180335760116577, "logits/rejected": -0.23892898857593536, "logps/chosen": -1.1709237098693848, "logps/rejected": -1.4454119205474854, "loss": 1.5813, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1709237098693848, "rewards/margins": 0.2744879424571991, "rewards/rejected": -1.4454119205474854, "semantic_entropy": 0.8208174705505371, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 6.217947823551511, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.23997139930725098, "logits/rejected": -0.15597184002399445, "logps/chosen": -1.1805267333984375, "logps/rejected": -1.4628156423568726, "loss": 1.5846, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1805267333984375, "rewards/margins": 0.28228893876075745, "rewards/rejected": -1.4628156423568726, "semantic_entropy": 0.8082114458084106, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 4.658060817629073, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.05150661617517471, "logits/rejected": -0.05451994016766548, "logps/chosen": -1.1528609991073608, "logps/rejected": -1.4355922937393188, "loss": 1.562, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1528609991073608, "rewards/margins": 0.2827311158180237, "rewards/rejected": -1.4355922937393188, "semantic_entropy": 0.8183358311653137, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 5.313474360842343, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.1604050099849701, "logits/rejected": -0.07966902107000351, "logps/chosen": -1.2120118141174316, "logps/rejected": -1.454734206199646, "loss": 1.6253, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2120118141174316, "rewards/margins": 0.2427222728729248, "rewards/rejected": -1.454734206199646, "semantic_entropy": 0.8265057802200317, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 5.263906900736717, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.21598677337169647, "logits/rejected": -0.07660199701786041, "logps/chosen": -1.1829307079315186, "logps/rejected": -1.2785327434539795, "loss": 1.6092, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.1829307079315186, "rewards/margins": 0.09560209512710571, "rewards/rejected": -1.2785327434539795, "semantic_entropy": 0.8524999618530273, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 7.277123929595455, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.21047189831733704, "logits/rejected": -0.14302575588226318, "logps/chosen": -1.2014880180358887, "logps/rejected": -1.384842038154602, "loss": 1.598, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.2014880180358887, "rewards/margins": 0.1833539456129074, "rewards/rejected": -1.384842038154602, "semantic_entropy": 0.7930424213409424, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 5.99061201416753, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.29193368554115295, "logits/rejected": -0.15174652636051178, "logps/chosen": -1.2459696531295776, "logps/rejected": -1.3646684885025024, "loss": 1.6387, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2459696531295776, "rewards/margins": 0.11869869381189346, "rewards/rejected": -1.3646684885025024, "semantic_entropy": 0.7853687405586243, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 6.040636033982683, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.1758815348148346, "logits/rejected": -0.11016283929347992, "logps/chosen": -1.1320698261260986, "logps/rejected": -1.318982481956482, "loss": 1.5566, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1320698261260986, "rewards/margins": 0.1869126856327057, "rewards/rejected": -1.318982481956482, "semantic_entropy": 0.8489736318588257, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 5.946460120349657, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.1529126912355423, "logits/rejected": -0.20058736205101013, "logps/chosen": -1.099339246749878, "logps/rejected": -1.2741444110870361, "loss": 1.5167, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.099339246749878, "rewards/margins": 0.1748051941394806, "rewards/rejected": -1.2741444110870361, "semantic_entropy": 0.8347418904304504, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 5.268457411384769, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.1788414865732193, "logits/rejected": -0.1027057021856308, "logps/chosen": -1.201273798942566, "logps/rejected": -1.3226804733276367, "loss": 1.6023, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.201273798942566, "rewards/margins": 0.121406689286232, "rewards/rejected": -1.3226804733276367, "semantic_entropy": 0.8021144866943359, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 5.340562670008102, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.1456112265586853, "logits/rejected": -0.02332606166601181, "logps/chosen": -1.1777400970458984, "logps/rejected": -1.3314663171768188, "loss": 1.5993, "rewards/accuracies": 0.59375, "rewards/chosen": -1.1777400970458984, "rewards/margins": 0.1537262499332428, "rewards/rejected": -1.3314663171768188, "semantic_entropy": 0.8431594967842102, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 6.002227359079419, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.1981392800807953, "logits/rejected": -0.18424752354621887, "logps/chosen": -1.1819145679473877, "logps/rejected": -1.2320106029510498, "loss": 1.5966, "rewards/accuracies": 0.5, "rewards/chosen": -1.1819145679473877, "rewards/margins": 0.05009603500366211, "rewards/rejected": -1.2320106029510498, "semantic_entropy": 0.8293901681900024, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 5.694190040908527, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.28008538484573364, "logits/rejected": -0.11984242498874664, "logps/chosen": -1.1513967514038086, "logps/rejected": -1.3825325965881348, "loss": 1.57, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1513967514038086, "rewards/margins": 0.23113596439361572, "rewards/rejected": -1.3825325965881348, "semantic_entropy": 0.8372230529785156, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 4.780417792208094, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.2068876326084137, "logits/rejected": -0.1176435574889183, "logps/chosen": -1.2129509449005127, "logps/rejected": -1.267187476158142, "loss": 1.6202, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.2129509449005127, "rewards/margins": 0.054236579686403275, "rewards/rejected": -1.267187476158142, "semantic_entropy": 0.814551830291748, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 4.621668261063069, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.1435539722442627, "logits/rejected": -0.1537472903728485, "logps/chosen": -1.1971156597137451, "logps/rejected": -1.4037588834762573, "loss": 1.5942, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1971156597137451, "rewards/margins": 0.20664314925670624, "rewards/rejected": -1.4037588834762573, "semantic_entropy": 0.7941396832466125, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 6.538723178459022, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.206007719039917, "logits/rejected": -0.13320791721343994, "logps/chosen": -1.194725751876831, "logps/rejected": -1.4337600469589233, "loss": 1.6025, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.194725751876831, "rewards/margins": 0.23903420567512512, "rewards/rejected": -1.4337600469589233, "semantic_entropy": 0.8155485391616821, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 6.540071244271228, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.2856406569480896, "logits/rejected": -0.1857413351535797, "logps/chosen": -1.1494338512420654, "logps/rejected": -1.434762716293335, "loss": 1.5516, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1494338512420654, "rewards/margins": 0.28532877564430237, "rewards/rejected": -1.434762716293335, "semantic_entropy": 0.8043950200080872, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 5.534361163359473, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.1894369125366211, "logits/rejected": -0.15660791099071503, "logps/chosen": -1.133742094039917, "logps/rejected": -1.30770742893219, "loss": 1.5382, "rewards/accuracies": 0.5625, "rewards/chosen": -1.133742094039917, "rewards/margins": 0.17396536469459534, "rewards/rejected": -1.30770742893219, "semantic_entropy": 0.8089858293533325, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 4.522109833361041, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.2204488217830658, "logits/rejected": -0.1316094696521759, "logps/chosen": -1.1728408336639404, "logps/rejected": -1.4038156270980835, "loss": 1.5766, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1728408336639404, "rewards/margins": 0.23097486793994904, "rewards/rejected": -1.4038156270980835, "semantic_entropy": 0.8075426816940308, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 5.6006857865706134, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.2568807005882263, "logits/rejected": -0.08604659140110016, "logps/chosen": -1.279069185256958, "logps/rejected": -1.3520950078964233, "loss": 1.6843, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.279069185256958, "rewards/margins": 0.07302595674991608, "rewards/rejected": -1.3520950078964233, "semantic_entropy": 0.8105060458183289, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 6.299769030148249, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.2216963768005371, "logits/rejected": -0.20229634642601013, "logps/chosen": -1.1629399061203003, "logps/rejected": -1.247417688369751, "loss": 1.5728, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1629399061203003, "rewards/margins": 0.08447777479887009, "rewards/rejected": -1.247417688369751, "semantic_entropy": 0.8197963833808899, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 5.897952765604465, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.3149182200431824, "logits/rejected": -0.1898161917924881, "logps/chosen": -1.238563895225525, "logps/rejected": -1.3242768049240112, "loss": 1.6505, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.238563895225525, "rewards/margins": 0.08571288734674454, "rewards/rejected": -1.3242768049240112, "semantic_entropy": 0.8239692449569702, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 5.372930216972268, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.2110154628753662, "logits/rejected": -0.14126203954219818, "logps/chosen": -1.1208343505859375, "logps/rejected": -1.281507968902588, "loss": 1.5325, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1208343505859375, "rewards/margins": 0.160673588514328, "rewards/rejected": -1.281507968902588, "semantic_entropy": 0.8232558965682983, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 5.701645118144605, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.13407756388187408, "logits/rejected": -0.04441801831126213, "logps/chosen": -1.2582229375839233, "logps/rejected": -1.3782384395599365, "loss": 1.6573, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2582229375839233, "rewards/margins": 0.12001542747020721, "rewards/rejected": -1.3782384395599365, "semantic_entropy": 0.7980928421020508, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 7.1526600692166165, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.2314671277999878, "logits/rejected": -0.06888873130083084, "logps/chosen": -1.2859220504760742, "logps/rejected": -1.4090259075164795, "loss": 1.6783, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2859220504760742, "rewards/margins": 0.12310393154621124, "rewards/rejected": -1.4090259075164795, "semantic_entropy": 0.7847461104393005, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 5.594320510171007, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.17352885007858276, "logits/rejected": -0.18353821337223053, "logps/chosen": -1.2088223695755005, "logps/rejected": -1.2567827701568604, "loss": 1.6197, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2088223695755005, "rewards/margins": 0.04796028509736061, "rewards/rejected": -1.2567827701568604, "semantic_entropy": 0.8218268156051636, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 7.742941764713279, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.25590983033180237, "logits/rejected": -0.23918232321739197, "logps/chosen": -1.1864460706710815, "logps/rejected": -1.278072714805603, "loss": 1.604, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.1864460706710815, "rewards/margins": 0.09162665158510208, "rewards/rejected": -1.278072714805603, "semantic_entropy": 0.8350431323051453, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 6.434776357373966, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.18201963603496552, "logits/rejected": -0.09469863027334213, "logps/chosen": -1.1451067924499512, "logps/rejected": -1.3979992866516113, "loss": 1.5465, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1451067924499512, "rewards/margins": 0.2528926432132721, "rewards/rejected": -1.3979992866516113, "semantic_entropy": 0.8028466105461121, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 5.887551107429691, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.24026715755462646, "logits/rejected": -0.13791745901107788, "logps/chosen": -1.170651912689209, "logps/rejected": -1.385170817375183, "loss": 1.5846, "rewards/accuracies": 0.59375, "rewards/chosen": -1.170651912689209, "rewards/margins": 0.2145189791917801, "rewards/rejected": -1.385170817375183, "semantic_entropy": 0.8279166221618652, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 8.50097873680111, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.30358585715293884, "logits/rejected": -0.12270130217075348, "logps/chosen": -1.0993317365646362, "logps/rejected": -1.3860334157943726, "loss": 1.5212, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.0993317365646362, "rewards/margins": 0.2867015600204468, "rewards/rejected": -1.3860334157943726, "semantic_entropy": 0.8437032699584961, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 6.04507769570789, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.2897643744945526, "logits/rejected": -0.1677568256855011, "logps/chosen": -1.2193632125854492, "logps/rejected": -1.376584529876709, "loss": 1.6262, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.2193632125854492, "rewards/margins": 0.15722131729125977, "rewards/rejected": -1.376584529876709, "semantic_entropy": 0.813629150390625, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 6.056394848853814, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.2302633821964264, "logits/rejected": -0.20351533591747284, "logps/chosen": -1.2500874996185303, "logps/rejected": -1.321251630783081, "loss": 1.6453, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.2500874996185303, "rewards/margins": 0.0711643248796463, "rewards/rejected": -1.321251630783081, "semantic_entropy": 0.7904828190803528, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 5.103143337959364, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.22627322375774384, "logits/rejected": -0.146840900182724, "logps/chosen": -1.1231087446212769, "logps/rejected": -1.3206827640533447, "loss": 1.5439, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.1231087446212769, "rewards/margins": 0.19757387042045593, "rewards/rejected": -1.3206827640533447, "semantic_entropy": 0.8415719270706177, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 7.13577048936118, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.22940698266029358, "logits/rejected": -0.17437343299388885, "logps/chosen": -1.1686615943908691, "logps/rejected": -1.3803951740264893, "loss": 1.5699, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.1686615943908691, "rewards/margins": 0.21173350512981415, "rewards/rejected": -1.3803951740264893, "semantic_entropy": 0.8024426698684692, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.004835642874240875, "eval_logits/rejected": 0.068243607878685, "eval_logps/chosen": -1.2670739889144897, "eval_logps/rejected": -1.404219388961792, "eval_loss": 1.6698358058929443, "eval_rewards/accuracies": 0.5474777221679688, "eval_rewards/chosen": -1.2670739889144897, "eval_rewards/margins": 0.13714542984962463, "eval_rewards/rejected": -1.404219388961792, "eval_runtime": 34.3529, "eval_samples_per_second": 39.152, "eval_semantic_entropy": 0.8052493333816528, "eval_steps_per_second": 9.81, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 1.640899122824931, "train_runtime": 28500.0022, "train_samples_per_second": 6.294, "train_steps_per_second": 0.197 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }