{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9978142076502732, "eval_steps": 400, "global_step": 914, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01092896174863388, "grad_norm": 47.87060782291424, "learning_rate": 5.434782608695652e-08, "logits/chosen": -1.0122432708740234, "logits/rejected": -1.0073297023773193, "logps/chosen": -0.28066128492355347, "logps/rejected": -0.2858629524707794, "loss": 3.1518, "rewards/accuracies": 0.53125, "rewards/chosen": -2.806612968444824, "rewards/margins": 0.05201658606529236, "rewards/rejected": -2.8586294651031494, "semantic_entropy": 0.7517332434654236, "step": 5 }, { "epoch": 0.02185792349726776, "grad_norm": 63.59519845931534, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -1.0451396703720093, "logits/rejected": -0.9949606657028198, "logps/chosen": -0.25711697340011597, "logps/rejected": -0.27150270342826843, "loss": 3.1207, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.57116961479187, "rewards/margins": 0.14385755360126495, "rewards/rejected": -2.715027332305908, "semantic_entropy": 0.7098506689071655, "step": 10 }, { "epoch": 0.03278688524590164, "grad_norm": 52.932404991436066, "learning_rate": 1.6304347826086955e-07, "logits/chosen": -1.0101398229599, "logits/rejected": -0.9632788896560669, "logps/chosen": -0.2672443389892578, "logps/rejected": -0.2731854319572449, "loss": 3.1124, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.6724436283111572, "rewards/margins": 0.05941082164645195, "rewards/rejected": -2.731854200363159, "semantic_entropy": 0.7272862195968628, "step": 15 }, { "epoch": 0.04371584699453552, "grad_norm": 68.70297338794734, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.946621298789978, "logits/rejected": -0.8962594270706177, "logps/chosen": -0.2722616195678711, "logps/rejected": -0.2844754159450531, "loss": 3.1543, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.722616195678711, "rewards/margins": 0.12213809788227081, "rewards/rejected": -2.844754219055176, "semantic_entropy": 0.7445966601371765, "step": 20 }, { "epoch": 0.0546448087431694, "grad_norm": 34.23797136353184, "learning_rate": 2.717391304347826e-07, "logits/chosen": -0.9447389841079712, "logits/rejected": -0.8695358037948608, "logps/chosen": -0.27488625049591064, "logps/rejected": -0.29340118169784546, "loss": 3.1248, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.7488627433776855, "rewards/margins": 0.1851491630077362, "rewards/rejected": -2.934011936187744, "semantic_entropy": 0.753722071647644, "step": 25 }, { "epoch": 0.06557377049180328, "grad_norm": 56.95442636508264, "learning_rate": 3.260869565217391e-07, "logits/chosen": -1.0504213571548462, "logits/rejected": -0.9853544235229492, "logps/chosen": -0.26506370306015015, "logps/rejected": -0.2821282744407654, "loss": 3.1282, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.650637149810791, "rewards/margins": 0.1706458032131195, "rewards/rejected": -2.8212831020355225, "semantic_entropy": 0.7199792861938477, "step": 30 }, { "epoch": 0.07650273224043716, "grad_norm": 54.514089612724746, "learning_rate": 3.8043478260869567e-07, "logits/chosen": -1.0058822631835938, "logits/rejected": -0.9390825033187866, "logps/chosen": -0.2544824182987213, "logps/rejected": -0.2758719325065613, "loss": 3.1, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.5448241233825684, "rewards/margins": 0.2138955146074295, "rewards/rejected": -2.7587194442749023, "semantic_entropy": 0.714081346988678, "step": 35 }, { "epoch": 0.08743169398907104, "grad_norm": 61.13897060157166, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.9637517929077148, "logits/rejected": -0.9011168479919434, "logps/chosen": -0.28103750944137573, "logps/rejected": -0.29354166984558105, "loss": 3.1681, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.8103749752044678, "rewards/margins": 0.12504148483276367, "rewards/rejected": -2.9354166984558105, "semantic_entropy": 0.7535971999168396, "step": 40 }, { "epoch": 0.09836065573770492, "grad_norm": 29.50202425422368, "learning_rate": 4.891304347826087e-07, "logits/chosen": -1.011054515838623, "logits/rejected": -0.9284116625785828, "logps/chosen": -0.28203994035720825, "logps/rejected": -0.3046588599681854, "loss": 3.106, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.820399522781372, "rewards/margins": 0.2261890470981598, "rewards/rejected": -3.04658842086792, "semantic_entropy": 0.7553126811981201, "step": 45 }, { "epoch": 0.1092896174863388, "grad_norm": 60.818918802477036, "learning_rate": 5.434782608695652e-07, "logits/chosen": -0.9375956654548645, "logits/rejected": -0.8574072122573853, "logps/chosen": -0.2780763804912567, "logps/rejected": -0.28224700689315796, "loss": 3.1338, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -2.780763864517212, "rewards/margins": 0.04170636087656021, "rewards/rejected": -2.822470188140869, "semantic_entropy": 0.7434889078140259, "step": 50 }, { "epoch": 0.12021857923497267, "grad_norm": 34.29716426184461, "learning_rate": 5.978260869565217e-07, "logits/chosen": -0.9751367568969727, "logits/rejected": -0.8606834411621094, "logps/chosen": -0.2696499526500702, "logps/rejected": -0.29947254061698914, "loss": 3.0524, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.6964995861053467, "rewards/margins": 0.2982260584831238, "rewards/rejected": -2.9947257041931152, "semantic_entropy": 0.7428679466247559, "step": 55 }, { "epoch": 0.13114754098360656, "grad_norm": 32.36546820788893, "learning_rate": 6.521739130434782e-07, "logits/chosen": -1.0148303508758545, "logits/rejected": -0.9685667157173157, "logps/chosen": -0.25762075185775757, "logps/rejected": -0.2997520864009857, "loss": 3.0039, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.576206922531128, "rewards/margins": 0.42131391167640686, "rewards/rejected": -2.997521162033081, "semantic_entropy": 0.7362821102142334, "step": 60 }, { "epoch": 0.14207650273224043, "grad_norm": 47.86126164856308, "learning_rate": 7.065217391304348e-07, "logits/chosen": -1.002937912940979, "logits/rejected": -0.9363768696784973, "logps/chosen": -0.2962821125984192, "logps/rejected": -0.3176509141921997, "loss": 3.0992, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.9628207683563232, "rewards/margins": 0.2136881798505783, "rewards/rejected": -3.176509141921997, "semantic_entropy": 0.7823900580406189, "step": 65 }, { "epoch": 0.15300546448087432, "grad_norm": 83.46398772579433, "learning_rate": 7.608695652173913e-07, "logits/chosen": -0.9694533348083496, "logits/rejected": -0.9480490684509277, "logps/chosen": -0.2837492823600769, "logps/rejected": -0.3052641451358795, "loss": 3.0367, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.8374929428100586, "rewards/margins": 0.2151484489440918, "rewards/rejected": -3.0526413917541504, "semantic_entropy": 0.7394664883613586, "step": 70 }, { "epoch": 0.16393442622950818, "grad_norm": 35.83270782611293, "learning_rate": 8.152173913043478e-07, "logits/chosen": -0.9647692441940308, "logits/rejected": -0.9482067227363586, "logps/chosen": -0.2907211184501648, "logps/rejected": -0.33229631185531616, "loss": 3.0658, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.9072110652923584, "rewards/margins": 0.41575226187705994, "rewards/rejected": -3.322962999343872, "semantic_entropy": 0.7694975733757019, "step": 75 }, { "epoch": 0.17486338797814208, "grad_norm": 52.413564512749005, "learning_rate": 8.695652173913043e-07, "logits/chosen": -0.9714950323104858, "logits/rejected": -0.9107065200805664, "logps/chosen": -0.2882896065711975, "logps/rejected": -0.3103812336921692, "loss": 3.0244, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.8828964233398438, "rewards/margins": 0.22091606259346008, "rewards/rejected": -3.1038122177124023, "semantic_entropy": 0.7423045039176941, "step": 80 }, { "epoch": 0.18579234972677597, "grad_norm": 57.128124235325, "learning_rate": 9.239130434782608e-07, "logits/chosen": -0.9738727807998657, "logits/rejected": -0.9262188076972961, "logps/chosen": -0.29303327202796936, "logps/rejected": -0.337748646736145, "loss": 3.0267, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.9303324222564697, "rewards/margins": 0.44715413451194763, "rewards/rejected": -3.37748646736145, "semantic_entropy": 0.7571176290512085, "step": 85 }, { "epoch": 0.19672131147540983, "grad_norm": 39.74743242931724, "learning_rate": 9.782608695652173e-07, "logits/chosen": -1.046452283859253, "logits/rejected": -0.9666553735733032, "logps/chosen": -0.31861579418182373, "logps/rejected": -0.34951895475387573, "loss": 3.0463, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.186157703399658, "rewards/margins": 0.3090316653251648, "rewards/rejected": -3.495189666748047, "semantic_entropy": 0.8055832982063293, "step": 90 }, { "epoch": 0.20765027322404372, "grad_norm": 51.89832814789265, "learning_rate": 9.999671349822886e-07, "logits/chosen": -0.9848623275756836, "logits/rejected": -0.9856392741203308, "logps/chosen": -0.31298893690109253, "logps/rejected": -0.3401663601398468, "loss": 2.9541, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.1298892498016357, "rewards/margins": 0.2717742323875427, "rewards/rejected": -3.4016640186309814, "semantic_entropy": 0.7869037389755249, "step": 95 }, { "epoch": 0.2185792349726776, "grad_norm": 69.97139505648609, "learning_rate": 9.997663088532014e-07, "logits/chosen": -0.9892705678939819, "logits/rejected": -0.943418025970459, "logps/chosen": -0.35917508602142334, "logps/rejected": -0.4198976159095764, "loss": 2.9725, "rewards/accuracies": 0.5625, "rewards/chosen": -3.5917506217956543, "rewards/margins": 0.6072250008583069, "rewards/rejected": -4.198975563049316, "semantic_entropy": 0.834593653678894, "step": 100 }, { "epoch": 0.22950819672131148, "grad_norm": 43.67519297008509, "learning_rate": 9.9938298818292e-07, "logits/chosen": -1.0403445959091187, "logits/rejected": -1.0104751586914062, "logps/chosen": -0.32551589608192444, "logps/rejected": -0.38466745615005493, "loss": 2.9376, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.2551589012145996, "rewards/margins": 0.5915151834487915, "rewards/rejected": -3.8466744422912598, "semantic_entropy": 0.8123003840446472, "step": 105 }, { "epoch": 0.24043715846994534, "grad_norm": 54.4822346164963, "learning_rate": 9.98817312944725e-07, "logits/chosen": -1.0293775796890259, "logits/rejected": -1.0085766315460205, "logps/chosen": -0.34657078981399536, "logps/rejected": -0.44877204298973083, "loss": 2.9452, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.465707778930664, "rewards/margins": 1.0220123529434204, "rewards/rejected": -4.487720012664795, "semantic_entropy": 0.8509441614151001, "step": 110 }, { "epoch": 0.25136612021857924, "grad_norm": 53.517455700291855, "learning_rate": 9.98069489700446e-07, "logits/chosen": -1.0341802835464478, "logits/rejected": -0.9952918887138367, "logps/chosen": -0.3461839258670807, "logps/rejected": -0.4705514907836914, "loss": 2.8994, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.4618396759033203, "rewards/margins": 1.2436755895614624, "rewards/rejected": -4.705514907836914, "semantic_entropy": 0.8380171656608582, "step": 115 }, { "epoch": 0.26229508196721313, "grad_norm": 40.64481855809536, "learning_rate": 9.971397915250336e-07, "logits/chosen": -1.0739099979400635, "logits/rejected": -1.0038702487945557, "logps/chosen": -0.3547818958759308, "logps/rejected": -0.4196414053440094, "loss": 2.8774, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.547818660736084, "rewards/margins": 0.6485950350761414, "rewards/rejected": -4.196413993835449, "semantic_entropy": 0.8623871803283691, "step": 120 }, { "epoch": 0.273224043715847, "grad_norm": 144.95477211017723, "learning_rate": 9.960285579068417e-07, "logits/chosen": -0.9688740968704224, "logits/rejected": -0.9354850053787231, "logps/chosen": -0.383869469165802, "logps/rejected": -0.47563114762306213, "loss": 2.8716, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.8386943340301514, "rewards/margins": 0.9176166653633118, "rewards/rejected": -4.756311416625977, "semantic_entropy": 0.8745672106742859, "step": 125 }, { "epoch": 0.28415300546448086, "grad_norm": 47.745102969069876, "learning_rate": 9.94736194623663e-07, "logits/chosen": -0.9936184883117676, "logits/rejected": -0.9872056841850281, "logps/chosen": -0.4027808606624603, "logps/rejected": -0.5585031509399414, "loss": 2.8889, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -4.02780818939209, "rewards/margins": 1.5572230815887451, "rewards/rejected": -5.585031032562256, "semantic_entropy": 0.8549701571464539, "step": 130 }, { "epoch": 0.29508196721311475, "grad_norm": 43.036244798527335, "learning_rate": 9.932631735945526e-07, "logits/chosen": -1.018587350845337, "logits/rejected": -0.9396653175354004, "logps/chosen": -0.3934100568294525, "logps/rejected": -0.5400375127792358, "loss": 2.8008, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.934100389480591, "rewards/margins": 1.4662750959396362, "rewards/rejected": -5.4003753662109375, "semantic_entropy": 0.8907697796821594, "step": 135 }, { "epoch": 0.30601092896174864, "grad_norm": 51.334063125222045, "learning_rate": 9.916100327075037e-07, "logits/chosen": -1.0269070863723755, "logits/rejected": -0.9736196398735046, "logps/chosen": -0.43043556809425354, "logps/rejected": -0.6303533911705017, "loss": 2.5701, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -4.304355621337891, "rewards/margins": 1.9991786479949951, "rewards/rejected": -6.303534507751465, "semantic_entropy": 0.9288080930709839, "step": 140 }, { "epoch": 0.31693989071038253, "grad_norm": 69.99654341210723, "learning_rate": 9.89777375623032e-07, "logits/chosen": -0.9977472424507141, "logits/rejected": -0.9811614751815796, "logps/chosen": -0.44030895829200745, "logps/rejected": -0.5321138501167297, "loss": 2.7244, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.4030890464782715, "rewards/margins": 0.9180487394332886, "rewards/rejected": -5.321138381958008, "semantic_entropy": 0.932425856590271, "step": 145 }, { "epoch": 0.32786885245901637, "grad_norm": 51.74709430626173, "learning_rate": 9.877658715537428e-07, "logits/chosen": -1.0553128719329834, "logits/rejected": -1.0262110233306885, "logps/chosen": -0.5291231870651245, "logps/rejected": -0.7928577661514282, "loss": 2.6042, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.291232109069824, "rewards/margins": 2.637345552444458, "rewards/rejected": -7.928577423095703, "semantic_entropy": 0.9483098983764648, "step": 150 }, { "epoch": 0.33879781420765026, "grad_norm": 59.40984432787828, "learning_rate": 9.85576255019963e-07, "logits/chosen": -1.0320864915847778, "logits/rejected": -0.9819043278694153, "logps/chosen": -0.5477417707443237, "logps/rejected": -0.7481231093406677, "loss": 2.5957, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -5.477417945861816, "rewards/margins": 2.0038137435913086, "rewards/rejected": -7.481231689453125, "semantic_entropy": 0.9526890516281128, "step": 155 }, { "epoch": 0.34972677595628415, "grad_norm": 63.33344115210913, "learning_rate": 9.832093255815216e-07, "logits/chosen": -1.0814168453216553, "logits/rejected": -1.0304033756256104, "logps/chosen": -0.6954716444015503, "logps/rejected": -0.8502774238586426, "loss": 2.6238, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.954716682434082, "rewards/margins": 1.5480577945709229, "rewards/rejected": -8.502774238586426, "semantic_entropy": 0.9549511671066284, "step": 160 }, { "epoch": 0.36065573770491804, "grad_norm": 62.82535328280916, "learning_rate": 9.806659475457849e-07, "logits/chosen": -1.0839955806732178, "logits/rejected": -1.031585931777954, "logps/chosen": -0.7121194005012512, "logps/rejected": -0.8951581716537476, "loss": 2.5445, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -7.121194362640381, "rewards/margins": 1.8303883075714111, "rewards/rejected": -8.951581954956055, "semantic_entropy": 0.9896249771118164, "step": 165 }, { "epoch": 0.37158469945355194, "grad_norm": 65.61173370500529, "learning_rate": 9.779470496520441e-07, "logits/chosen": -1.0843085050582886, "logits/rejected": -1.0285215377807617, "logps/chosen": -0.7273966670036316, "logps/rejected": -0.9349418878555298, "loss": 2.5832, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.2739667892456055, "rewards/margins": 2.0754518508911133, "rewards/rejected": -9.349418640136719, "semantic_entropy": 0.9762886762619019, "step": 170 }, { "epoch": 0.3825136612021858, "grad_norm": 48.476659698357665, "learning_rate": 9.750536247323789e-07, "logits/chosen": -1.1571153402328491, "logits/rejected": -1.131704330444336, "logps/chosen": -0.8265604972839355, "logps/rejected": -0.9824529886245728, "loss": 2.4619, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -8.265604019165039, "rewards/margins": 1.5589253902435303, "rewards/rejected": -9.824529647827148, "semantic_entropy": 0.9426374435424805, "step": 175 }, { "epoch": 0.39344262295081966, "grad_norm": 55.305744786201686, "learning_rate": 9.719867293491144e-07, "logits/chosen": -1.1452279090881348, "logits/rejected": -1.1399190425872803, "logps/chosen": -0.8152974843978882, "logps/rejected": -1.16525137424469, "loss": 2.3679, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.152975082397461, "rewards/margins": 3.4995384216308594, "rewards/rejected": -11.65251350402832, "semantic_entropy": 0.9442623257637024, "step": 180 }, { "epoch": 0.40437158469945356, "grad_norm": 50.733966507742444, "learning_rate": 9.687474834090067e-07, "logits/chosen": -1.1547253131866455, "logits/rejected": -1.1736373901367188, "logps/chosen": -0.8491543531417847, "logps/rejected": -1.1844466924667358, "loss": 2.3318, "rewards/accuracies": 0.8125, "rewards/chosen": -8.491543769836426, "rewards/margins": 3.35292387008667, "rewards/rejected": -11.844468116760254, "semantic_entropy": 0.9556644558906555, "step": 185 }, { "epoch": 0.41530054644808745, "grad_norm": 62.277237758824675, "learning_rate": 9.653370697542987e-07, "logits/chosen": -1.162003755569458, "logits/rejected": -1.121468186378479, "logps/chosen": -0.8294251561164856, "logps/rejected": -1.1698486804962158, "loss": 2.3649, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -8.294252395629883, "rewards/margins": 3.4042346477508545, "rewards/rejected": -11.698487281799316, "semantic_entropy": 0.9534858465194702, "step": 190 }, { "epoch": 0.4262295081967213, "grad_norm": 62.09032006268862, "learning_rate": 9.617567337307935e-07, "logits/chosen": -1.1882003545761108, "logits/rejected": -1.1697113513946533, "logps/chosen": -0.9817994236946106, "logps/rejected": -1.3722710609436035, "loss": 2.4013, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.817992210388184, "rewards/margins": 3.904717206954956, "rewards/rejected": -13.722711563110352, "semantic_entropy": 0.9071667790412903, "step": 195 }, { "epoch": 0.4371584699453552, "grad_norm": 54.79531932098139, "learning_rate": 9.580077827331037e-07, "logits/chosen": -1.160315990447998, "logits/rejected": -1.0766620635986328, "logps/chosen": -0.8970209360122681, "logps/rejected": -1.2237987518310547, "loss": 2.3542, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -8.970209121704102, "rewards/margins": 3.267777919769287, "rewards/rejected": -12.237987518310547, "semantic_entropy": 0.9425733685493469, "step": 200 }, { "epoch": 0.44808743169398907, "grad_norm": 45.857415331803075, "learning_rate": 9.540915857272445e-07, "logits/chosen": -1.120792269706726, "logits/rejected": -1.1374807357788086, "logps/chosen": -0.7932685017585754, "logps/rejected": -1.1045658588409424, "loss": 2.2801, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -7.932684898376465, "rewards/margins": 3.112973690032959, "rewards/rejected": -11.045658111572266, "semantic_entropy": 0.9677651524543762, "step": 205 }, { "epoch": 0.45901639344262296, "grad_norm": 71.89691225680161, "learning_rate": 9.500095727507419e-07, "logits/chosen": -1.1540464162826538, "logits/rejected": -1.1580009460449219, "logps/chosen": -0.8536632657051086, "logps/rejected": -1.2688827514648438, "loss": 2.1643, "rewards/accuracies": 0.8125, "rewards/chosen": -8.536632537841797, "rewards/margins": 4.152195453643799, "rewards/rejected": -12.688827514648438, "semantic_entropy": 0.9133696556091309, "step": 210 }, { "epoch": 0.46994535519125685, "grad_norm": 61.886918880598415, "learning_rate": 9.457632343904402e-07, "logits/chosen": -1.1507601737976074, "logits/rejected": -1.0994901657104492, "logps/chosen": -0.891444981098175, "logps/rejected": -1.3195106983184814, "loss": 2.2496, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.914449691772461, "rewards/margins": 4.2806572914123535, "rewards/rejected": -13.195106506347656, "semantic_entropy": 0.943720817565918, "step": 215 }, { "epoch": 0.4808743169398907, "grad_norm": 48.894845818998725, "learning_rate": 9.413541212382004e-07, "logits/chosen": -1.2136586904525757, "logits/rejected": -1.1905956268310547, "logps/chosen": -0.9255884289741516, "logps/rejected": -1.2389224767684937, "loss": 2.2122, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -9.255884170532227, "rewards/margins": 3.133340358734131, "rewards/rejected": -12.389223098754883, "semantic_entropy": 0.9290882349014282, "step": 220 }, { "epoch": 0.4918032786885246, "grad_norm": 53.07969298601074, "learning_rate": 9.367838433246857e-07, "logits/chosen": -1.2239024639129639, "logits/rejected": -1.1851261854171753, "logps/chosen": -0.8761332631111145, "logps/rejected": -1.2777061462402344, "loss": 2.1765, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.761332511901855, "rewards/margins": 4.0157294273376465, "rewards/rejected": -12.777061462402344, "semantic_entropy": 0.9319503903388977, "step": 225 }, { "epoch": 0.5027322404371585, "grad_norm": 51.09299897041373, "learning_rate": 9.320540695314438e-07, "logits/chosen": -1.1558756828308105, "logits/rejected": -1.1598188877105713, "logps/chosen": -0.8811947703361511, "logps/rejected": -1.2912404537200928, "loss": 2.2098, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.811946868896484, "rewards/margins": 4.100456714630127, "rewards/rejected": -12.91240406036377, "semantic_entropy": 0.9310176968574524, "step": 230 }, { "epoch": 0.5136612021857924, "grad_norm": 63.259306143827835, "learning_rate": 9.271665269814983e-07, "logits/chosen": -1.188391923904419, "logits/rejected": -1.1512023210525513, "logps/chosen": -0.8918437957763672, "logps/rejected": -1.2489241361618042, "loss": 2.1333, "rewards/accuracies": 0.84375, "rewards/chosen": -8.918437957763672, "rewards/margins": 3.5708038806915283, "rewards/rejected": -12.489240646362305, "semantic_entropy": 0.9315102696418762, "step": 235 }, { "epoch": 0.5245901639344263, "grad_norm": 55.527350378129, "learning_rate": 9.221230004086721e-07, "logits/chosen": -1.2678356170654297, "logits/rejected": -1.2772780656814575, "logps/chosen": -0.8592067956924438, "logps/rejected": -1.3196837902069092, "loss": 2.0237, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -8.592068672180176, "rewards/margins": 4.604770660400391, "rewards/rejected": -13.19683837890625, "semantic_entropy": 0.9410519599914551, "step": 240 }, { "epoch": 0.5355191256830601, "grad_norm": 46.907821708328406, "learning_rate": 9.169253315058763e-07, "logits/chosen": -1.1692125797271729, "logits/rejected": -1.125632405281067, "logps/chosen": -0.905608057975769, "logps/rejected": -1.3867673873901367, "loss": 2.1096, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.056081771850586, "rewards/margins": 4.811593055725098, "rewards/rejected": -13.867673873901367, "semantic_entropy": 0.921157717704773, "step": 245 }, { "epoch": 0.546448087431694, "grad_norm": 47.861862507896085, "learning_rate": 9.11575418252596e-07, "logits/chosen": -1.232251763343811, "logits/rejected": -1.1941629648208618, "logps/chosen": -0.8441025614738464, "logps/rejected": -1.2240302562713623, "loss": 2.1618, "rewards/accuracies": 0.78125, "rewards/chosen": -8.441025733947754, "rewards/margins": 3.799276828765869, "rewards/rejected": -12.240303039550781, "semantic_entropy": 0.9252967834472656, "step": 250 }, { "epoch": 0.5573770491803278, "grad_norm": 54.40105116628037, "learning_rate": 9.060752142218257e-07, "logits/chosen": -1.213555932044983, "logits/rejected": -1.1773382425308228, "logps/chosen": -0.8959819078445435, "logps/rejected": -1.3679741621017456, "loss": 2.0365, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -8.959817886352539, "rewards/margins": 4.71992301940918, "rewards/rejected": -13.679742813110352, "semantic_entropy": 0.9322195053100586, "step": 255 }, { "epoch": 0.5683060109289617, "grad_norm": 43.449537508765815, "learning_rate": 9.004267278667031e-07, "logits/chosen": -1.1810890436172485, "logits/rejected": -1.1702289581298828, "logps/chosen": -0.8510452508926392, "logps/rejected": -1.3418259620666504, "loss": 2.011, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -8.510452270507812, "rewards/margins": 4.907806873321533, "rewards/rejected": -13.41826057434082, "semantic_entropy": 0.9143549203872681, "step": 260 }, { "epoch": 0.5792349726775956, "grad_norm": 46.818755419193465, "learning_rate": 8.946320217871025e-07, "logits/chosen": -1.1749790906906128, "logits/rejected": -1.1358766555786133, "logps/chosen": -0.855148434638977, "logps/rejected": -1.3291784524917603, "loss": 1.9976, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -8.551485061645508, "rewards/margins": 4.740299224853516, "rewards/rejected": -13.291783332824707, "semantic_entropy": 0.9298276901245117, "step": 265 }, { "epoch": 0.5901639344262295, "grad_norm": 95.7954675499385, "learning_rate": 8.886932119764565e-07, "logits/chosen": -1.1698591709136963, "logits/rejected": -1.1438281536102295, "logps/chosen": -0.8544471859931946, "logps/rejected": -1.377416968345642, "loss": 1.9774, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -8.544472694396973, "rewards/margins": 5.229698657989502, "rewards/rejected": -13.774169921875, "semantic_entropy": 0.9152740240097046, "step": 270 }, { "epoch": 0.6010928961748634, "grad_norm": 52.4740552697882, "learning_rate": 8.826124670490802e-07, "logits/chosen": -1.140944242477417, "logits/rejected": -1.0730197429656982, "logps/chosen": -0.8467117547988892, "logps/rejected": -1.2126039266586304, "loss": 1.9796, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -8.467116355895996, "rewards/margins": 3.658921718597412, "rewards/rejected": -12.126038551330566, "semantic_entropy": 0.933331310749054, "step": 275 }, { "epoch": 0.6120218579234973, "grad_norm": 54.704164373442616, "learning_rate": 8.763920074482809e-07, "logits/chosen": -1.102807879447937, "logits/rejected": -1.105039358139038, "logps/chosen": -0.8896454572677612, "logps/rejected": -1.4699008464813232, "loss": 1.9808, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.896454811096191, "rewards/margins": 5.802553176879883, "rewards/rejected": -14.699007987976074, "semantic_entropy": 0.8732292056083679, "step": 280 }, { "epoch": 0.6229508196721312, "grad_norm": 48.33002211252601, "learning_rate": 8.700341046355411e-07, "logits/chosen": -1.2859059572219849, "logits/rejected": -1.2477091550827026, "logps/chosen": -0.8521019220352173, "logps/rejected": -1.4364469051361084, "loss": 1.8954, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -8.52101993560791, "rewards/margins": 5.843448162078857, "rewards/rejected": -14.364468574523926, "semantic_entropy": 0.9044594764709473, "step": 285 }, { "epoch": 0.6338797814207651, "grad_norm": 62.5830858895554, "learning_rate": 8.635410802610723e-07, "logits/chosen": -1.2080810070037842, "logits/rejected": -1.1687798500061035, "logps/chosen": -0.8889066576957703, "logps/rejected": -1.4597949981689453, "loss": 1.9215, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -8.889066696166992, "rewards/margins": 5.708883762359619, "rewards/rejected": -14.59795093536377, "semantic_entropy": 0.903703510761261, "step": 290 }, { "epoch": 0.644808743169399, "grad_norm": 52.105468651247094, "learning_rate": 8.569153053160428e-07, "logits/chosen": -1.1924866437911987, "logits/rejected": -1.182565689086914, "logps/chosen": -0.9297744035720825, "logps/rejected": -1.5572900772094727, "loss": 1.8847, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.29774284362793, "rewards/margins": 6.2751569747924805, "rewards/rejected": -15.572900772094727, "semantic_entropy": 0.8886201977729797, "step": 295 }, { "epoch": 0.6557377049180327, "grad_norm": 43.97404227922028, "learning_rate": 8.501591992667849e-07, "logits/chosen": -1.2417964935302734, "logits/rejected": -1.2167500257492065, "logps/chosen": -0.9788614511489868, "logps/rejected": -1.5977232456207275, "loss": 1.9048, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -9.788614273071289, "rewards/margins": 6.188617706298828, "rewards/rejected": -15.977231979370117, "semantic_entropy": 0.8578527569770813, "step": 300 }, { "epoch": 0.6666666666666666, "grad_norm": 62.519761231188205, "learning_rate": 8.432752291713058e-07, "logits/chosen": -1.227373719215393, "logits/rejected": -1.1630009412765503, "logps/chosen": -0.9313735961914062, "logps/rejected": -1.6220667362213135, "loss": 1.876, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -9.313735008239746, "rewards/margins": 6.906930446624756, "rewards/rejected": -16.220666885375977, "semantic_entropy": 0.8703945875167847, "step": 305 }, { "epoch": 0.6775956284153005, "grad_norm": 47.74676931324823, "learning_rate": 8.362659087784152e-07, "logits/chosen": -1.1420575380325317, "logits/rejected": -1.1442222595214844, "logps/chosen": -0.921275794506073, "logps/rejected": -1.5964065790176392, "loss": 1.9255, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.212759017944336, "rewards/margins": 6.751306056976318, "rewards/rejected": -15.964065551757812, "semantic_entropy": 0.8867815732955933, "step": 310 }, { "epoch": 0.6885245901639344, "grad_norm": 48.12633140725401, "learning_rate": 8.291337976098067e-07, "logits/chosen": -1.1699371337890625, "logits/rejected": -1.1596167087554932, "logps/chosen": -0.9925182461738586, "logps/rejected": -1.4757254123687744, "loss": 1.8872, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.925182342529297, "rewards/margins": 4.832071781158447, "rewards/rejected": -14.757253646850586, "semantic_entropy": 0.8734658360481262, "step": 315 }, { "epoch": 0.6994535519125683, "grad_norm": 47.1038569824555, "learning_rate": 8.218815000254231e-07, "logits/chosen": -1.2591969966888428, "logits/rejected": -1.1927886009216309, "logps/chosen": -0.8629493713378906, "logps/rejected": -1.4769127368927002, "loss": 1.8067, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -8.629494667053223, "rewards/margins": 6.139632225036621, "rewards/rejected": -14.769126892089844, "semantic_entropy": 0.9108262062072754, "step": 320 }, { "epoch": 0.7103825136612022, "grad_norm": 56.67465709928985, "learning_rate": 8.145116642724485e-07, "logits/chosen": -1.2181096076965332, "logits/rejected": -1.189969778060913, "logps/chosen": -0.8706620335578918, "logps/rejected": -1.4245946407318115, "loss": 1.8061, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -8.706620216369629, "rewards/margins": 5.539328098297119, "rewards/rejected": -14.245946884155273, "semantic_entropy": 0.893680214881897, "step": 325 }, { "epoch": 0.7213114754098361, "grad_norm": 43.692074758430785, "learning_rate": 8.07026981518276e-07, "logits/chosen": -1.1343576908111572, "logits/rejected": -1.0772193670272827, "logps/chosen": -0.8813779950141907, "logps/rejected": -1.7738568782806396, "loss": 1.7373, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -8.813779830932617, "rewards/margins": 8.924787521362305, "rewards/rejected": -17.73856544494629, "semantic_entropy": 0.8537489771842957, "step": 330 }, { "epoch": 0.73224043715847, "grad_norm": 54.41817403205364, "learning_rate": 7.994301848678004e-07, "logits/chosen": -1.134152889251709, "logits/rejected": -1.063077449798584, "logps/chosen": -0.9365140199661255, "logps/rejected": -1.6991326808929443, "loss": 1.766, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.365139961242676, "rewards/margins": 7.626187324523926, "rewards/rejected": -16.9913272857666, "semantic_entropy": 0.8437296152114868, "step": 335 }, { "epoch": 0.7431693989071039, "grad_norm": 56.714537939738605, "learning_rate": 7.917240483654e-07, "logits/chosen": -1.1386888027191162, "logits/rejected": -1.0677882432937622, "logps/chosen": -0.9699214100837708, "logps/rejected": -1.7819700241088867, "loss": 1.8199, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -9.699213981628418, "rewards/margins": 8.12048625946045, "rewards/rejected": -17.819698333740234, "semantic_entropy": 0.8428508639335632, "step": 340 }, { "epoch": 0.7540983606557377, "grad_norm": 54.15768742157569, "learning_rate": 7.839113859819656e-07, "logits/chosen": -1.2082730531692505, "logits/rejected": -1.1757750511169434, "logps/chosen": -1.0214024782180786, "logps/rejected": -1.8994626998901367, "loss": 1.8236, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -10.214024543762207, "rewards/margins": 8.78060245513916, "rewards/rejected": -18.994626998901367, "semantic_entropy": 0.818555474281311, "step": 345 }, { "epoch": 0.7650273224043715, "grad_norm": 52.04532684140525, "learning_rate": 7.759950505873521e-07, "logits/chosen": -1.2180219888687134, "logits/rejected": -1.1834783554077148, "logps/chosen": -0.7670449018478394, "logps/rejected": -1.324202060699463, "loss": 1.7353, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -7.670449256896973, "rewards/margins": 5.571571350097656, "rewards/rejected": -13.242021560668945, "semantic_entropy": 0.9124476313591003, "step": 350 }, { "epoch": 0.7759562841530054, "grad_norm": 52.511907795888796, "learning_rate": 7.67977932908626e-07, "logits/chosen": -1.175022840499878, "logits/rejected": -1.1130549907684326, "logps/chosen": -0.8713346719741821, "logps/rejected": -1.66217839717865, "loss": 1.726, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -8.713346481323242, "rewards/margins": 7.908437252044678, "rewards/rejected": -16.621784210205078, "semantic_entropy": 0.8560686111450195, "step": 355 }, { "epoch": 0.7868852459016393, "grad_norm": 47.66801579095495, "learning_rate": 7.598629604744872e-07, "logits/chosen": -1.1504714488983154, "logits/rejected": -1.121519923210144, "logps/chosen": -1.078308343887329, "logps/rejected": -2.017784833908081, "loss": 1.687, "rewards/accuracies": 0.8125, "rewards/chosen": -10.783082008361816, "rewards/margins": 9.394767761230469, "rewards/rejected": -20.17784881591797, "semantic_entropy": 0.8011868596076965, "step": 360 }, { "epoch": 0.7978142076502732, "grad_norm": 78.73352396462461, "learning_rate": 7.516530965462539e-07, "logits/chosen": -1.2399051189422607, "logits/rejected": -1.2221591472625732, "logps/chosen": -0.869607150554657, "logps/rejected": -1.7532609701156616, "loss": 1.6969, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -8.696072578430176, "rewards/margins": 8.836538314819336, "rewards/rejected": -17.532609939575195, "semantic_entropy": 0.8715127110481262, "step": 365 }, { "epoch": 0.8087431693989071, "grad_norm": 52.51768985735217, "learning_rate": 7.433513390357989e-07, "logits/chosen": -1.2507340908050537, "logits/rejected": -1.187475562095642, "logps/chosen": -0.9717696905136108, "logps/rejected": -2.0153520107269287, "loss": 1.6488, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.717697143554688, "rewards/margins": 10.435824394226074, "rewards/rejected": -20.153522491455078, "semantic_entropy": 0.8269231915473938, "step": 370 }, { "epoch": 0.819672131147541, "grad_norm": 50.10941942498599, "learning_rate": 7.349607194108322e-07, "logits/chosen": -1.2848598957061768, "logits/rejected": -1.1889159679412842, "logps/chosen": -0.8790639638900757, "logps/rejected": -1.7771461009979248, "loss": 1.6703, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -8.790639877319336, "rewards/margins": 8.980820655822754, "rewards/rejected": -17.771459579467773, "semantic_entropy": 0.853074848651886, "step": 375 }, { "epoch": 0.8306010928961749, "grad_norm": 45.566081133100745, "learning_rate": 7.264843015879321e-07, "logits/chosen": -1.1421478986740112, "logits/rejected": -1.140625238418579, "logps/chosen": -0.9042370915412903, "logps/rejected": -1.7280666828155518, "loss": 1.541, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.04237174987793, "rewards/margins": 8.23829460144043, "rewards/rejected": -17.280664443969727, "semantic_entropy": 0.8745312690734863, "step": 380 }, { "epoch": 0.8415300546448088, "grad_norm": 59.00085660352214, "learning_rate": 7.17925180813725e-07, "logits/chosen": -1.2217355966567993, "logits/rejected": -1.159557580947876, "logps/chosen": -1.042198657989502, "logps/rejected": -2.1717679500579834, "loss": 1.7473, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -10.42198657989502, "rewards/margins": 11.295695304870605, "rewards/rejected": -21.717683792114258, "semantic_entropy": 0.8145696520805359, "step": 385 }, { "epoch": 0.8524590163934426, "grad_norm": 68.24919118342267, "learning_rate": 7.092864825346266e-07, "logits/chosen": -1.2256710529327393, "logits/rejected": -1.154592752456665, "logps/chosen": -0.8894011378288269, "logps/rejected": -2.0597283840179443, "loss": 1.5906, "rewards/accuracies": 0.875, "rewards/chosen": -8.894010543823242, "rewards/margins": 11.703274726867676, "rewards/rejected": -20.597286224365234, "semantic_entropy": 0.8356989026069641, "step": 390 }, { "epoch": 0.8633879781420765, "grad_norm": 52.86840793380424, "learning_rate": 7.005713612555545e-07, "logits/chosen": -1.1973850727081299, "logits/rejected": -1.15791654586792, "logps/chosen": -0.9084303975105286, "logps/rejected": -1.824072241783142, "loss": 1.5811, "rewards/accuracies": 0.84375, "rewards/chosen": -9.084303855895996, "rewards/margins": 9.156417846679688, "rewards/rejected": -18.240720748901367, "semantic_entropy": 0.863986611366272, "step": 395 }, { "epoch": 0.8743169398907104, "grad_norm": 54.969346083508704, "learning_rate": 6.917829993880302e-07, "logits/chosen": -1.1350136995315552, "logits/rejected": -1.078984022140503, "logps/chosen": -0.9205960035324097, "logps/rejected": -1.9763364791870117, "loss": 1.5778, "rewards/accuracies": 0.875, "rewards/chosen": -9.205960273742676, "rewards/margins": 10.557405471801758, "rewards/rejected": -19.763364791870117, "semantic_entropy": 0.8187274932861328, "step": 400 }, { "epoch": 0.8743169398907104, "eval_logits/chosen": -1.5077557563781738, "eval_logits/rejected": -1.432308554649353, "eval_logps/chosen": -0.868651807308197, "eval_logps/rejected": -1.8860282897949219, "eval_loss": 1.6372781991958618, "eval_rewards/accuracies": 0.8734939694404602, "eval_rewards/chosen": -8.686517715454102, "eval_rewards/margins": 10.173765182495117, "eval_rewards/rejected": -18.86028289794922, "eval_runtime": 37.7445, "eval_samples_per_second": 34.919, "eval_semantic_entropy": 0.8519198894500732, "eval_steps_per_second": 2.199, "step": 400 }, { "epoch": 0.8852459016393442, "grad_norm": 54.747379817385166, "learning_rate": 6.8292460608809e-07, "logits/chosen": -1.1865565776824951, "logits/rejected": -1.0789119005203247, "logps/chosen": -0.8656112551689148, "logps/rejected": -1.9079488515853882, "loss": 1.557, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -8.656112670898438, "rewards/margins": 10.423376083374023, "rewards/rejected": -19.07948875427246, "semantic_entropy": 0.8483451008796692, "step": 405 }, { "epoch": 0.8961748633879781, "grad_norm": 54.38709320329884, "learning_rate": 6.739994160844309e-07, "logits/chosen": -1.2001937627792358, "logits/rejected": -1.2109323740005493, "logps/chosen": -1.0198501348495483, "logps/rejected": -2.304253101348877, "loss": 1.5398, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -10.198502540588379, "rewards/margins": 12.844027519226074, "rewards/rejected": -23.042530059814453, "semantic_entropy": 0.7884197235107422, "step": 410 }, { "epoch": 0.907103825136612, "grad_norm": 58.8994587847891, "learning_rate": 6.650106884972176e-07, "logits/chosen": -1.2297394275665283, "logits/rejected": -1.2055060863494873, "logps/chosen": -0.8097732663154602, "logps/rejected": -2.0647740364074707, "loss": 1.6318, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -8.097734451293945, "rewards/margins": 12.550005912780762, "rewards/rejected": -20.647741317749023, "semantic_entropy": 0.8577386736869812, "step": 415 }, { "epoch": 0.9180327868852459, "grad_norm": 66.32923235150443, "learning_rate": 6.559617056479827e-07, "logits/chosen": -1.2397379875183105, "logits/rejected": -1.1944515705108643, "logps/chosen": -0.9744995832443237, "logps/rejected": -2.2359464168548584, "loss": 1.5364, "rewards/accuracies": 0.84375, "rewards/chosen": -9.7449951171875, "rewards/margins": 12.614469528198242, "rewards/rejected": -22.359464645385742, "semantic_entropy": 0.8098868131637573, "step": 420 }, { "epoch": 0.9289617486338798, "grad_norm": 64.90064469639756, "learning_rate": 6.468557718610559e-07, "logits/chosen": -1.2209162712097168, "logits/rejected": -1.169478178024292, "logps/chosen": -1.0786913633346558, "logps/rejected": -2.5019688606262207, "loss": 1.6058, "rewards/accuracies": 0.84375, "rewards/chosen": -10.786913871765137, "rewards/margins": 14.232770919799805, "rewards/rejected": -25.019685745239258, "semantic_entropy": 0.7745442390441895, "step": 425 }, { "epoch": 0.9398907103825137, "grad_norm": 65.90460986634548, "learning_rate": 6.376962122569567e-07, "logits/chosen": -1.1558514833450317, "logits/rejected": -1.1550347805023193, "logps/chosen": -0.6848023533821106, "logps/rejected": -1.8477531671524048, "loss": 1.3787, "rewards/accuracies": 0.9375, "rewards/chosen": -6.848023414611816, "rewards/margins": 11.629508972167969, "rewards/rejected": -18.4775333404541, "semantic_entropy": 0.8978629112243652, "step": 430 }, { "epoch": 0.9508196721311475, "grad_norm": 80.36478809238143, "learning_rate": 6.284863715381948e-07, "logits/chosen": -1.2516933679580688, "logits/rejected": -1.2447582483291626, "logps/chosen": -0.8717735409736633, "logps/rejected": -2.2636890411376953, "loss": 1.5367, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -8.717737197875977, "rewards/margins": 13.919151306152344, "rewards/rejected": -22.63688850402832, "semantic_entropy": 0.8273345828056335, "step": 435 }, { "epoch": 0.9617486338797814, "grad_norm": 79.39000046120883, "learning_rate": 6.192296127679192e-07, "logits/chosen": -1.1874706745147705, "logits/rejected": -1.1192582845687866, "logps/chosen": -0.9044081568717957, "logps/rejected": -2.0115015506744385, "loss": 1.5428, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -9.04408073425293, "rewards/margins": 11.070935249328613, "rewards/rejected": -20.11501693725586, "semantic_entropy": 0.8257206082344055, "step": 440 }, { "epoch": 0.9726775956284153, "grad_norm": 59.45278594899511, "learning_rate": 6.099293161418629e-07, "logits/chosen": -1.2240984439849854, "logits/rejected": -1.18662428855896, "logps/chosen": -0.6975774168968201, "logps/rejected": -1.919647216796875, "loss": 1.5818, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -6.97577428817749, "rewards/margins": 12.220699310302734, "rewards/rejected": -19.196474075317383, "semantic_entropy": 0.887184739112854, "step": 445 }, { "epoch": 0.9836065573770492, "grad_norm": 53.56869631451961, "learning_rate": 6.005888777540319e-07, "logits/chosen": -1.1677896976470947, "logits/rejected": -1.1477397680282593, "logps/chosen": -0.8627035021781921, "logps/rejected": -1.9724452495574951, "loss": 1.5352, "rewards/accuracies": 0.875, "rewards/chosen": -8.627036094665527, "rewards/margins": 11.097416877746582, "rewards/rejected": -19.72445297241211, "semantic_entropy": 0.8503534197807312, "step": 450 }, { "epoch": 0.994535519125683, "grad_norm": 75.11227313091236, "learning_rate": 5.912117083565873e-07, "logits/chosen": -1.1938502788543701, "logits/rejected": -1.1654444932937622, "logps/chosen": -1.1713725328445435, "logps/rejected": -2.3690249919891357, "loss": 1.5941, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -11.713726043701172, "rewards/margins": 11.976524353027344, "rewards/rejected": -23.690250396728516, "semantic_entropy": 0.7848092913627625, "step": 455 }, { "epoch": 1.005464480874317, "grad_norm": 48.44167575969943, "learning_rate": 5.818012321143773e-07, "logits/chosen": -1.2322055101394653, "logits/rejected": -1.1756855249404907, "logps/chosen": -0.8835703730583191, "logps/rejected": -2.2671618461608887, "loss": 1.3987, "rewards/accuracies": 0.875, "rewards/chosen": -8.835702896118164, "rewards/margins": 13.835916519165039, "rewards/rejected": -22.671619415283203, "semantic_entropy": 0.8247418403625488, "step": 460 }, { "epoch": 1.0163934426229508, "grad_norm": 47.683623908009125, "learning_rate": 5.723608853545684e-07, "logits/chosen": -1.2683448791503906, "logits/rejected": -1.2093217372894287, "logps/chosen": -0.8307113647460938, "logps/rejected": -2.3884284496307373, "loss": 1.1472, "rewards/accuracies": 0.9375, "rewards/chosen": -8.307112693786621, "rewards/margins": 15.577173233032227, "rewards/rejected": -23.88428497314453, "semantic_entropy": 0.8331409692764282, "step": 465 }, { "epoch": 1.0273224043715847, "grad_norm": 57.239399331005785, "learning_rate": 5.628941153118388e-07, "logits/chosen": -1.2552951574325562, "logits/rejected": -1.2222687005996704, "logps/chosen": -0.8629674911499023, "logps/rejected": -2.325558662414551, "loss": 1.1426, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -8.629674911499023, "rewards/margins": 14.625910758972168, "rewards/rejected": -23.25558853149414, "semantic_entropy": 0.8217577934265137, "step": 470 }, { "epoch": 1.0382513661202186, "grad_norm": 39.48804343487935, "learning_rate": 5.534043788695852e-07, "logits/chosen": -1.22693932056427, "logits/rejected": -1.1497706174850464, "logps/chosen": -0.7519802451133728, "logps/rejected": -2.1450114250183105, "loss": 1.0975, "rewards/accuracies": 0.9375, "rewards/chosen": -7.519803047180176, "rewards/margins": 13.930310249328613, "rewards/rejected": -21.450115203857422, "semantic_entropy": 0.8537012338638306, "step": 475 }, { "epoch": 1.0491803278688525, "grad_norm": 37.024988536485964, "learning_rate": 5.438951412976098e-07, "logits/chosen": -1.3238413333892822, "logits/rejected": -1.2577579021453857, "logps/chosen": -0.7658538818359375, "logps/rejected": -2.0598320960998535, "loss": 1.1533, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.658538818359375, "rewards/margins": 12.939779281616211, "rewards/rejected": -20.598318099975586, "semantic_entropy": 0.8649771809577942, "step": 480 }, { "epoch": 1.0601092896174864, "grad_norm": 42.1526889978167, "learning_rate": 5.34369874986742e-07, "logits/chosen": -1.2668297290802002, "logits/rejected": -1.1939513683319092, "logps/chosen": -0.8974517583847046, "logps/rejected": -2.424004077911377, "loss": 1.0247, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -8.974517822265625, "rewards/margins": 15.265522956848145, "rewards/rejected": -24.24004364013672, "semantic_entropy": 0.7897659540176392, "step": 485 }, { "epoch": 1.0710382513661203, "grad_norm": 52.525378226092165, "learning_rate": 5.248320581808619e-07, "logits/chosen": -1.2010338306427002, "logits/rejected": -1.1409817934036255, "logps/chosen": -0.7397095561027527, "logps/rejected": -2.3880066871643066, "loss": 1.1343, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.397095680236816, "rewards/margins": 16.48297119140625, "rewards/rejected": -23.88006591796875, "semantic_entropy": 0.8509289026260376, "step": 490 }, { "epoch": 1.0819672131147542, "grad_norm": 57.24209028140043, "learning_rate": 5.15285173706785e-07, "logits/chosen": -1.2966060638427734, "logits/rejected": -1.2440364360809326, "logps/chosen": -0.7074769139289856, "logps/rejected": -2.2080492973327637, "loss": 1.104, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.074769496917725, "rewards/margins": 15.00572681427002, "rewards/rejected": -22.080495834350586, "semantic_entropy": 0.862097442150116, "step": 495 }, { "epoch": 1.092896174863388, "grad_norm": 60.20969441966712, "learning_rate": 5.057327077024744e-07, "logits/chosen": -1.31562340259552, "logits/rejected": -1.2055505514144897, "logps/chosen": -0.7696375846862793, "logps/rejected": -2.1600234508514404, "loss": 1.0776, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.696375370025635, "rewards/margins": 13.903857231140137, "rewards/rejected": -21.600234985351562, "semantic_entropy": 0.8503168821334839, "step": 500 }, { "epoch": 1.1038251366120218, "grad_norm": 39.37970422474807, "learning_rate": 4.961781483440433e-07, "logits/chosen": -1.2652629613876343, "logits/rejected": -1.155110239982605, "logps/chosen": -0.7121917009353638, "logps/rejected": -2.2156224250793457, "loss": 1.0684, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.1219162940979, "rewards/margins": 15.034309387207031, "rewards/rejected": -22.156227111816406, "semantic_entropy": 0.856345534324646, "step": 505 }, { "epoch": 1.1147540983606556, "grad_norm": 53.63055077579748, "learning_rate": 4.866249845720132e-07, "logits/chosen": -1.2122000455856323, "logits/rejected": -1.1381186246871948, "logps/chosen": -0.7895854115486145, "logps/rejected": -2.1967644691467285, "loss": 1.1991, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -7.8958539962768555, "rewards/margins": 14.071792602539062, "rewards/rejected": -21.96764373779297, "semantic_entropy": 0.8369362950325012, "step": 510 }, { "epoch": 1.1256830601092895, "grad_norm": 45.3883880528581, "learning_rate": 4.770767048172948e-07, "logits/chosen": -1.2122347354888916, "logits/rejected": -1.149927020072937, "logps/chosen": -0.7574501633644104, "logps/rejected": -2.262672185897827, "loss": 1.0855, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.574501037597656, "rewards/margins": 15.052220344543457, "rewards/rejected": -22.62672233581543, "semantic_entropy": 0.8394317626953125, "step": 515 }, { "epoch": 1.1366120218579234, "grad_norm": 40.766203312385706, "learning_rate": 4.675367957273505e-07, "logits/chosen": -1.2204854488372803, "logits/rejected": -1.144971251487732, "logps/chosen": -0.7849557995796204, "logps/rejected": -2.2667272090911865, "loss": 1.0264, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.849558353424072, "rewards/margins": 14.817715644836426, "rewards/rejected": -22.66727066040039, "semantic_entropy": 0.8283472061157227, "step": 520 }, { "epoch": 1.1475409836065573, "grad_norm": 42.8963401742162, "learning_rate": 4.5800874089301455e-07, "logits/chosen": -1.261281132698059, "logits/rejected": -1.1677086353302002, "logps/chosen": -0.7403801679611206, "logps/rejected": -2.290158987045288, "loss": 0.9619, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.403802394866943, "rewards/margins": 15.497787475585938, "rewards/rejected": -22.901592254638672, "semantic_entropy": 0.8431955575942993, "step": 525 }, { "epoch": 1.1584699453551912, "grad_norm": 57.97538998117191, "learning_rate": 4.4849601957642285e-07, "logits/chosen": -1.174661636352539, "logits/rejected": -1.115818738937378, "logps/chosen": -0.7541646361351013, "logps/rejected": -2.2110159397125244, "loss": 1.0935, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.5416460037231445, "rewards/margins": 14.568511962890625, "rewards/rejected": -22.110157012939453, "semantic_entropy": 0.853602409362793, "step": 530 }, { "epoch": 1.169398907103825, "grad_norm": 56.26607000357583, "learning_rate": 4.390021054405286e-07, "logits/chosen": -1.240636944770813, "logits/rejected": -1.1869792938232422, "logps/chosen": -0.7534674406051636, "logps/rejected": -2.2876932621002197, "loss": 0.9657, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.534674167633057, "rewards/margins": 15.342257499694824, "rewards/rejected": -22.87693214416504, "semantic_entropy": 0.8402601480484009, "step": 535 }, { "epoch": 1.180327868852459, "grad_norm": 54.25638397035917, "learning_rate": 4.295304652806592e-07, "logits/chosen": -1.2079153060913086, "logits/rejected": -1.142287015914917, "logps/chosen": -0.611890971660614, "logps/rejected": -2.0176615715026855, "loss": 1.0051, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.11890983581543, "rewards/margins": 14.057706832885742, "rewards/rejected": -20.17661476135254, "semantic_entropy": 0.8606586456298828, "step": 540 }, { "epoch": 1.1912568306010929, "grad_norm": 44.34686440564056, "learning_rate": 4.200845577585826e-07, "logits/chosen": -1.2312743663787842, "logits/rejected": -1.1274607181549072, "logps/chosen": -0.6904948353767395, "logps/rejected": -2.0026180744171143, "loss": 1.0628, "rewards/accuracies": 0.9375, "rewards/chosen": -6.9049482345581055, "rewards/margins": 13.121232986450195, "rewards/rejected": -20.026180267333984, "semantic_entropy": 0.839868426322937, "step": 545 }, { "epoch": 1.2021857923497268, "grad_norm": 51.975486510086114, "learning_rate": 4.106678321395433e-07, "logits/chosen": -1.1899176836013794, "logits/rejected": -1.1200889348983765, "logps/chosen": -0.7009586095809937, "logps/rejected": -2.399099826812744, "loss": 0.9114, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -7.009586334228516, "rewards/margins": 16.98141098022461, "rewards/rejected": -23.990997314453125, "semantic_entropy": 0.8362213373184204, "step": 550 }, { "epoch": 1.2131147540983607, "grad_norm": 39.51029900614786, "learning_rate": 4.012837270327288e-07, "logits/chosen": -1.1518226861953735, "logits/rejected": -1.1040208339691162, "logps/chosen": -0.6657946705818176, "logps/rejected": -2.024448871612549, "loss": 1.0111, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.6579461097717285, "rewards/margins": 13.586542129516602, "rewards/rejected": -20.244487762451172, "semantic_entropy": 0.8607606887817383, "step": 555 }, { "epoch": 1.2240437158469946, "grad_norm": 47.0785790742371, "learning_rate": 3.9193566913562915e-07, "logits/chosen": -1.2187812328338623, "logits/rejected": -1.1253793239593506, "logps/chosen": -0.8078786730766296, "logps/rejected": -2.1750519275665283, "loss": 1.0263, "rewards/accuracies": 0.90625, "rewards/chosen": -8.078786849975586, "rewards/margins": 13.671732902526855, "rewards/rejected": -21.750518798828125, "semantic_entropy": 0.8194610476493835, "step": 560 }, { "epoch": 1.2349726775956285, "grad_norm": 44.31080037447064, "learning_rate": 3.826270719827435e-07, "logits/chosen": -1.2184025049209595, "logits/rejected": -1.1244232654571533, "logps/chosen": -0.7781059741973877, "logps/rejected": -2.595242977142334, "loss": 1.0496, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.781059265136719, "rewards/margins": 18.171369552612305, "rewards/rejected": -25.952428817749023, "semantic_entropy": 0.8032097816467285, "step": 565 }, { "epoch": 1.2459016393442623, "grad_norm": 57.680087176882985, "learning_rate": 3.7336133469909623e-07, "logits/chosen": -1.262069821357727, "logits/rejected": -1.203547477722168, "logps/chosen": -0.7461926341056824, "logps/rejected": -2.1672732830047607, "loss": 1.1028, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.461926460266113, "rewards/margins": 14.210809707641602, "rewards/rejected": -21.6727352142334, "semantic_entropy": 0.8577653169631958, "step": 570 }, { "epoch": 1.2568306010928962, "grad_norm": 46.59857147414731, "learning_rate": 3.64141840759012e-07, "logits/chosen": -1.1375811100006104, "logits/rejected": -1.0560975074768066, "logps/chosen": -0.6888304948806763, "logps/rejected": -2.229635238647461, "loss": 0.9418, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.8883056640625, "rewards/margins": 15.408047676086426, "rewards/rejected": -22.29635238647461, "semantic_entropy": 0.8547189831733704, "step": 575 }, { "epoch": 1.2677595628415301, "grad_norm": 70.16919238335676, "learning_rate": 3.549719567506076e-07, "logits/chosen": -1.1417677402496338, "logits/rejected": -1.1007084846496582, "logps/chosen": -0.746972918510437, "logps/rejected": -2.0715861320495605, "loss": 0.9986, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.469728946685791, "rewards/margins": 13.246131896972656, "rewards/rejected": -20.715862274169922, "semantic_entropy": 0.8440540432929993, "step": 580 }, { "epoch": 1.278688524590164, "grad_norm": 39.105942102294286, "learning_rate": 3.4585503114644996e-07, "logits/chosen": -1.2692724466323853, "logits/rejected": -1.1571121215820312, "logps/chosen": -0.7609504461288452, "logps/rejected": -2.3702054023742676, "loss": 1.0065, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.609503746032715, "rewards/margins": 16.092552185058594, "rewards/rejected": -23.70205307006836, "semantic_entropy": 0.8199702501296997, "step": 585 }, { "epoch": 1.289617486338798, "grad_norm": 35.90062312874268, "learning_rate": 3.3679439308082774e-07, "logits/chosen": -1.226792335510254, "logits/rejected": -1.176424264907837, "logps/chosen": -0.6281425356864929, "logps/rejected": -2.045499324798584, "loss": 0.9731, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.281425952911377, "rewards/margins": 14.173568725585938, "rewards/rejected": -20.454992294311523, "semantic_entropy": 0.8588122129440308, "step": 590 }, { "epoch": 1.3005464480874318, "grad_norm": 52.95598574128885, "learning_rate": 3.2779335113408646e-07, "logits/chosen": -1.233185052871704, "logits/rejected": -1.1640207767486572, "logps/chosen": -0.7508488297462463, "logps/rejected": -2.4652957916259766, "loss": 1.0038, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.508486747741699, "rewards/margins": 17.14447021484375, "rewards/rejected": -24.6529598236084, "semantic_entropy": 0.8177651166915894, "step": 595 }, { "epoch": 1.3114754098360657, "grad_norm": 40.0562568923892, "learning_rate": 3.1885519212446716e-07, "logits/chosen": -1.2854266166687012, "logits/rejected": -1.177534580230713, "logps/chosen": -0.6793255805969238, "logps/rejected": -2.2706198692321777, "loss": 0.9506, "rewards/accuracies": 0.96875, "rewards/chosen": -6.7932562828063965, "rewards/margins": 15.912942886352539, "rewards/rejected": -22.70619773864746, "semantic_entropy": 0.8524688482284546, "step": 600 }, { "epoch": 1.3224043715846996, "grad_norm": 56.41638001057574, "learning_rate": 3.0998317990789376e-07, "logits/chosen": -1.2670646905899048, "logits/rejected": -1.171144962310791, "logps/chosen": -0.6692796349525452, "logps/rejected": -1.934456467628479, "loss": 1.0026, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.692796230316162, "rewards/margins": 12.65176773071289, "rewards/rejected": -19.34456443786621, "semantic_entropy": 0.869337260723114, "step": 605 }, { "epoch": 1.3333333333333333, "grad_norm": 47.30494559887427, "learning_rate": 3.0118055418614295e-07, "logits/chosen": -1.3104336261749268, "logits/rejected": -1.213578224182129, "logps/chosen": -0.8171396255493164, "logps/rejected": -2.5085349082946777, "loss": 0.9846, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -8.17139720916748, "rewards/margins": 16.91395378112793, "rewards/rejected": -25.085350036621094, "semantic_entropy": 0.7933089733123779, "step": 610 }, { "epoch": 1.3442622950819672, "grad_norm": 55.65069108222119, "learning_rate": 2.9245052932383707e-07, "logits/chosen": -1.2602143287658691, "logits/rejected": -1.1212416887283325, "logps/chosen": -0.7733426094055176, "logps/rejected": -2.3373031616210938, "loss": 1.0585, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -7.733426570892334, "rewards/margins": 15.639605522155762, "rewards/rejected": -23.37303352355957, "semantic_entropy": 0.8259070515632629, "step": 615 }, { "epoch": 1.355191256830601, "grad_norm": 41.83594828022189, "learning_rate": 2.83796293174686e-07, "logits/chosen": -1.1642497777938843, "logits/rejected": -1.0947132110595703, "logps/chosen": -0.7484847903251648, "logps/rejected": -2.3808321952819824, "loss": 1.0132, "rewards/accuracies": 0.9375, "rewards/chosen": -7.4848480224609375, "rewards/margins": 16.323474884033203, "rewards/rejected": -23.808320999145508, "semantic_entropy": 0.8322114944458008, "step": 620 }, { "epoch": 1.366120218579235, "grad_norm": 45.85253729227267, "learning_rate": 2.7522100591741217e-07, "logits/chosen": -1.234703779220581, "logits/rejected": -1.1591752767562866, "logps/chosen": -0.6658716201782227, "logps/rejected": -2.3456645011901855, "loss": 0.9989, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.658716678619385, "rewards/margins": 16.797927856445312, "rewards/rejected": -23.45664405822754, "semantic_entropy": 0.8470379710197449, "step": 625 }, { "epoch": 1.3770491803278688, "grad_norm": 47.20204057866064, "learning_rate": 2.6672779890178046e-07, "logits/chosen": -1.163450002670288, "logits/rejected": -1.0469523668289185, "logps/chosen": -0.7807295918464661, "logps/rejected": -2.2187490463256836, "loss": 1.0123, "rewards/accuracies": 0.9375, "rewards/chosen": -7.807295322418213, "rewards/margins": 14.380197525024414, "rewards/rejected": -22.1874942779541, "semantic_entropy": 0.829529881477356, "step": 630 }, { "epoch": 1.3879781420765027, "grad_norm": 48.43553604807009, "learning_rate": 2.5831977350515454e-07, "logits/chosen": -1.1149486303329468, "logits/rejected": -1.0645884275436401, "logps/chosen": -0.7764806747436523, "logps/rejected": -2.346562385559082, "loss": 1.0361, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.764806270599365, "rewards/margins": 15.700818061828613, "rewards/rejected": -23.465625762939453, "semantic_entropy": 0.8258574604988098, "step": 635 }, { "epoch": 1.3989071038251366, "grad_norm": 50.20523377491874, "learning_rate": 2.500000000000001e-07, "logits/chosen": -1.2106841802597046, "logits/rejected": -1.164466142654419, "logps/chosen": -0.7233768105506897, "logps/rejected": -2.620008945465088, "loss": 0.932, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.233767032623291, "rewards/margins": 18.966323852539062, "rewards/rejected": -26.200092315673828, "semantic_entropy": 0.8185870051383972, "step": 640 }, { "epoch": 1.4098360655737705, "grad_norm": 50.15293641915176, "learning_rate": 2.4177151643274307e-07, "logits/chosen": -1.1696977615356445, "logits/rejected": -1.112188458442688, "logps/chosen": -0.7105950117111206, "logps/rejected": -2.4047422409057617, "loss": 0.9626, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.105950832366943, "rewards/margins": 16.941471099853516, "rewards/rejected": -24.047422409057617, "semantic_entropy": 0.8116961717605591, "step": 645 }, { "epoch": 1.4207650273224044, "grad_norm": 52.051060632639825, "learning_rate": 2.3363732751439923e-07, "logits/chosen": -1.2659627199172974, "logits/rejected": -1.178022027015686, "logps/chosen": -0.7824967503547668, "logps/rejected": -2.2903237342834473, "loss": 1.0342, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -7.824967861175537, "rewards/margins": 15.078269958496094, "rewards/rejected": -22.903236389160156, "semantic_entropy": 0.8222282528877258, "step": 650 }, { "epoch": 1.4316939890710383, "grad_norm": 104.74662245786296, "learning_rate": 2.2560040352337307e-07, "logits/chosen": -1.1930986642837524, "logits/rejected": -1.0961310863494873, "logps/chosen": -0.8049964904785156, "logps/rejected": -2.6303577423095703, "loss": 1.0368, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -8.049962997436523, "rewards/margins": 18.253612518310547, "rewards/rejected": -26.303577423095703, "semantic_entropy": 0.8019247055053711, "step": 655 }, { "epoch": 1.4426229508196722, "grad_norm": 74.14915143886914, "learning_rate": 2.1766367922083283e-07, "logits/chosen": -1.2195419073104858, "logits/rejected": -1.1510334014892578, "logps/chosen": -0.7229866981506348, "logps/rejected": -2.4508605003356934, "loss": 0.9204, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.229866981506348, "rewards/margins": 17.278736114501953, "rewards/rejected": -24.508602142333984, "semantic_entropy": 0.8276729583740234, "step": 660 }, { "epoch": 1.453551912568306, "grad_norm": 40.08916656671079, "learning_rate": 2.0983005277905347e-07, "logits/chosen": -1.25788152217865, "logits/rejected": -1.1829631328582764, "logps/chosen": -0.7363836765289307, "logps/rejected": -2.4085285663604736, "loss": 0.9793, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.363836765289307, "rewards/margins": 16.721446990966797, "rewards/rejected": -24.085285186767578, "semantic_entropy": 0.8287376165390015, "step": 665 }, { "epoch": 1.46448087431694, "grad_norm": 47.3989204733329, "learning_rate": 2.021023847231202e-07, "logits/chosen": -1.2234550714492798, "logits/rejected": -1.1443179845809937, "logps/chosen": -0.7974756956100464, "logps/rejected": -2.3043999671936035, "loss": 0.9905, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.974755764007568, "rewards/margins": 15.069241523742676, "rewards/rejected": -23.043996810913086, "semantic_entropy": 0.8342604637145996, "step": 670 }, { "epoch": 1.4754098360655736, "grad_norm": 108.78018960923754, "learning_rate": 1.94483496886381e-07, "logits/chosen": -1.1683439016342163, "logits/rejected": -1.1087901592254639, "logps/chosen": -0.6944879293441772, "logps/rejected": -2.433687925338745, "loss": 0.8989, "rewards/accuracies": 0.9375, "rewards/chosen": -6.944879055023193, "rewards/margins": 17.391998291015625, "rewards/rejected": -24.33687973022461, "semantic_entropy": 0.8319599032402039, "step": 675 }, { "epoch": 1.4863387978142075, "grad_norm": 60.19474027091482, "learning_rate": 1.869761713800254e-07, "logits/chosen": -1.2412843704223633, "logits/rejected": -1.1452839374542236, "logps/chosen": -0.831190288066864, "logps/rejected": -2.4966881275177, "loss": 1.0112, "rewards/accuracies": 0.9375, "rewards/chosen": -8.31190299987793, "rewards/margins": 16.654979705810547, "rewards/rejected": -24.966880798339844, "semantic_entropy": 0.800665020942688, "step": 680 }, { "epoch": 1.4972677595628414, "grad_norm": 45.288193362387666, "learning_rate": 1.7958314957717064e-07, "logits/chosen": -1.2326924800872803, "logits/rejected": -1.1884281635284424, "logps/chosen": -0.6524280309677124, "logps/rejected": -2.181318998336792, "loss": 0.9979, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -6.524280548095703, "rewards/margins": 15.288909912109375, "rewards/rejected": -21.813190460205078, "semantic_entropy": 0.8463915586471558, "step": 685 }, { "epoch": 1.5081967213114753, "grad_norm": 77.54652900736652, "learning_rate": 1.7230713111182164e-07, "logits/chosen": -1.2749425172805786, "logits/rejected": -1.1991561651229858, "logps/chosen": -0.6433757543563843, "logps/rejected": -2.4266154766082764, "loss": 0.9611, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.433757781982422, "rewards/margins": 17.8323974609375, "rewards/rejected": -24.266155242919922, "semantic_entropy": 0.8604837656021118, "step": 690 }, { "epoch": 1.5191256830601092, "grad_norm": 45.5200345735269, "learning_rate": 1.651507728930739e-07, "logits/chosen": -1.1950256824493408, "logits/rejected": -1.131256103515625, "logps/chosen": -0.6931561231613159, "logps/rejected": -2.161853551864624, "loss": 0.9934, "rewards/accuracies": 0.90625, "rewards/chosen": -6.931561470031738, "rewards/margins": 14.686975479125977, "rewards/rejected": -21.61853790283203, "semantic_entropy": 0.8436753153800964, "step": 695 }, { "epoch": 1.530054644808743, "grad_norm": 49.242008834049685, "learning_rate": 1.5811668813491696e-07, "logits/chosen": -1.3293455839157104, "logits/rejected": -1.2231751680374146, "logps/chosen": -0.7694125771522522, "logps/rejected": -2.4189977645874023, "loss": 0.978, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -7.694125175476074, "rewards/margins": 16.495851516723633, "rewards/rejected": -24.189977645874023, "semantic_entropy": 0.8082691431045532, "step": 700 }, { "epoch": 1.540983606557377, "grad_norm": 44.65399870377938, "learning_rate": 1.5120744540199343e-07, "logits/chosen": -1.2114274501800537, "logits/rejected": -1.1308143138885498, "logps/chosen": -0.7381525635719299, "logps/rejected": -2.3527631759643555, "loss": 0.9314, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.381524562835693, "rewards/margins": 16.146106719970703, "rewards/rejected": -23.527631759643555, "semantic_entropy": 0.8333342671394348, "step": 705 }, { "epoch": 1.5519125683060109, "grad_norm": 52.47246084045148, "learning_rate": 1.4442556767166369e-07, "logits/chosen": -1.2004725933074951, "logits/rejected": -1.1394346952438354, "logps/chosen": -0.7631191611289978, "logps/rejected": -2.4908859729766846, "loss": 1.0138, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -7.631192207336426, "rewards/margins": 17.277666091918945, "rewards/rejected": -24.908855438232422, "semantic_entropy": 0.8088520169258118, "step": 710 }, { "epoch": 1.5628415300546448, "grad_norm": 39.161062372274245, "learning_rate": 1.377735314127148e-07, "logits/chosen": -1.1989295482635498, "logits/rejected": -1.0892112255096436, "logps/chosen": -0.754266083240509, "logps/rejected": -2.3557486534118652, "loss": 0.9097, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.542661190032959, "rewards/margins": 16.014827728271484, "rewards/rejected": -23.5574893951416, "semantic_entropy": 0.8200591206550598, "step": 715 }, { "epoch": 1.5737704918032787, "grad_norm": 57.53753951235613, "learning_rate": 1.312537656810549e-07, "logits/chosen": -1.1801402568817139, "logits/rejected": -1.1305280923843384, "logps/chosen": -0.8796719312667847, "logps/rejected": -2.6609649658203125, "loss": 1.0603, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.79671859741211, "rewards/margins": 17.812931060791016, "rewards/rejected": -26.609649658203125, "semantic_entropy": 0.7918781042098999, "step": 720 }, { "epoch": 1.5846994535519126, "grad_norm": 51.68795375166876, "learning_rate": 1.2486865123271866e-07, "logits/chosen": -1.2510040998458862, "logits/rejected": -1.1513909101486206, "logps/chosen": -0.7905360460281372, "logps/rejected": -2.450331449508667, "loss": 0.988, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.905358791351318, "rewards/margins": 16.59795570373535, "rewards/rejected": -24.503314971923828, "semantic_entropy": 0.811559796333313, "step": 725 }, { "epoch": 1.5956284153005464, "grad_norm": 53.36439634728435, "learning_rate": 1.1862051965451214e-07, "logits/chosen": -1.2445173263549805, "logits/rejected": -1.1288838386535645, "logps/chosen": -0.7035760283470154, "logps/rejected": -2.4538397789001465, "loss": 0.9645, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.035760402679443, "rewards/margins": 17.502635955810547, "rewards/rejected": -24.53839683532715, "semantic_entropy": 0.8314288258552551, "step": 730 }, { "epoch": 1.6065573770491803, "grad_norm": 52.77186710891425, "learning_rate": 1.1251165251261047e-07, "logits/chosen": -1.1849864721298218, "logits/rejected": -1.111053466796875, "logps/chosen": -0.6819809675216675, "logps/rejected": -2.3596489429473877, "loss": 0.9183, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.819809913635254, "rewards/margins": 16.77667999267578, "rewards/rejected": -23.596487045288086, "semantic_entropy": 0.8518983721733093, "step": 735 }, { "epoch": 1.6174863387978142, "grad_norm": 51.04954161674348, "learning_rate": 1.0654428051942138e-07, "logits/chosen": -1.185575246810913, "logits/rejected": -1.1258459091186523, "logps/chosen": -0.8496238589286804, "logps/rejected": -2.4404985904693604, "loss": 1.0108, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -8.49623966217041, "rewards/margins": 15.908746719360352, "rewards/rejected": -24.404987335205078, "semantic_entropy": 0.8217931985855103, "step": 740 }, { "epoch": 1.6284153005464481, "grad_norm": 44.78590940359996, "learning_rate": 1.0072058271901978e-07, "logits/chosen": -1.1844556331634521, "logits/rejected": -1.096343994140625, "logps/chosen": -0.7650187611579895, "logps/rejected": -2.4417996406555176, "loss": 0.9889, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.6501874923706055, "rewards/margins": 16.767807006835938, "rewards/rejected": -24.41799545288086, "semantic_entropy": 0.8134136199951172, "step": 745 }, { "epoch": 1.639344262295082, "grad_norm": 41.35825995367568, "learning_rate": 9.504268569144763e-08, "logits/chosen": -1.2524887323379517, "logits/rejected": -1.1518092155456543, "logps/chosen": -0.6517141461372375, "logps/rejected": -2.495558977127075, "loss": 0.9019, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.517141819000244, "rewards/margins": 18.438446044921875, "rewards/rejected": -24.95558738708496, "semantic_entropy": 0.8249934911727905, "step": 750 }, { "epoch": 1.650273224043716, "grad_norm": 49.17981910139592, "learning_rate": 8.951266277617325e-08, "logits/chosen": -1.174800992012024, "logits/rejected": -1.0904661417007446, "logps/chosen": -0.6784438490867615, "logps/rejected": -2.281085968017578, "loss": 0.9285, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -6.784438133239746, "rewards/margins": 16.026418685913086, "rewards/rejected": -22.81085968017578, "semantic_entropy": 0.8071689605712891, "step": 755 }, { "epoch": 1.6612021857923498, "grad_norm": 55.44235074285089, "learning_rate": 8.413253331499049e-08, "logits/chosen": -1.2523894309997559, "logits/rejected": -1.1709582805633545, "logps/chosen": -0.7902460694313049, "logps/rejected": -2.353731155395508, "loss": 0.9701, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.90246057510376, "rewards/margins": 15.634851455688477, "rewards/rejected": -23.537311553955078, "semantic_entropy": 0.8497117757797241, "step": 760 }, { "epoch": 1.6721311475409837, "grad_norm": 46.939667617709574, "learning_rate": 7.8904261914637e-08, "logits/chosen": -1.2579504251480103, "logits/rejected": -1.2005599737167358, "logps/chosen": -0.7765697240829468, "logps/rejected": -2.3420188426971436, "loss": 1.0131, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.765698432922363, "rewards/margins": 15.654492378234863, "rewards/rejected": -23.420190811157227, "semantic_entropy": 0.8250833749771118, "step": 765 }, { "epoch": 1.6830601092896176, "grad_norm": 67.60864064581558, "learning_rate": 7.382975772939865e-08, "logits/chosen": -1.2617108821868896, "logits/rejected": -1.2064878940582275, "logps/chosen": -0.7011424899101257, "logps/rejected": -2.4052655696868896, "loss": 0.9795, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.011425971984863, "rewards/margins": 17.04123306274414, "rewards/rejected": -24.052656173706055, "semantic_entropy": 0.8459088206291199, "step": 770 }, { "epoch": 1.6939890710382515, "grad_norm": 68.25252330535682, "learning_rate": 6.891087376396315e-08, "logits/chosen": -1.1619203090667725, "logits/rejected": -1.1151115894317627, "logps/chosen": -0.6944946050643921, "logps/rejected": -2.123880624771118, "loss": 1.0529, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.9449462890625, "rewards/margins": 14.293858528137207, "rewards/rejected": -21.23880386352539, "semantic_entropy": 0.8555929064750671, "step": 775 }, { "epoch": 1.7049180327868854, "grad_norm": 58.94990698167926, "learning_rate": 6.414940619677734e-08, "logits/chosen": -1.21394944190979, "logits/rejected": -1.148568034172058, "logps/chosen": -0.7798916697502136, "logps/rejected": -2.334639072418213, "loss": 1.0831, "rewards/accuracies": 0.9375, "rewards/chosen": -7.798917293548584, "rewards/margins": 15.547472953796387, "rewards/rejected": -23.346389770507812, "semantic_entropy": 0.8230711221694946, "step": 780 }, { "epoch": 1.7158469945355193, "grad_norm": 54.978556677024066, "learning_rate": 5.954709372415523e-08, "logits/chosen": -1.2210636138916016, "logits/rejected": -1.134007453918457, "logps/chosen": -0.8276329040527344, "logps/rejected": -2.5226263999938965, "loss": 1.0036, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -8.27632999420166, "rewards/margins": 16.949934005737305, "rewards/rejected": -25.22626304626465, "semantic_entropy": 0.8026347160339355, "step": 785 }, { "epoch": 1.7267759562841531, "grad_norm": 58.44169076386046, "learning_rate": 5.5105616925376296e-08, "logits/chosen": -1.3411870002746582, "logits/rejected": -1.1771245002746582, "logps/chosen": -0.7094103097915649, "logps/rejected": -2.3087127208709717, "loss": 0.9863, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.094101905822754, "rewards/margins": 15.993026733398438, "rewards/rejected": -23.08713150024414, "semantic_entropy": 0.8216876983642578, "step": 790 }, { "epoch": 1.737704918032787, "grad_norm": 53.528578956238725, "learning_rate": 5.082659764900482e-08, "logits/chosen": -1.2835462093353271, "logits/rejected": -1.2009773254394531, "logps/chosen": -0.6398060917854309, "logps/rejected": -2.0710248947143555, "loss": 1.0059, "rewards/accuracies": 0.9375, "rewards/chosen": -6.3980607986450195, "rewards/margins": 14.312187194824219, "rewards/rejected": -20.710247039794922, "semantic_entropy": 0.8597515225410461, "step": 795 }, { "epoch": 1.748633879781421, "grad_norm": 59.57829603969701, "learning_rate": 4.6711598420656976e-08, "logits/chosen": -1.2482662200927734, "logits/rejected": -1.1601988077163696, "logps/chosen": -0.7208329439163208, "logps/rejected": -2.314363956451416, "loss": 0.9552, "rewards/accuracies": 0.9375, "rewards/chosen": -7.208329200744629, "rewards/margins": 15.935308456420898, "rewards/rejected": -23.143640518188477, "semantic_entropy": 0.8409850001335144, "step": 800 }, { "epoch": 1.748633879781421, "eval_logits/chosen": -1.5359925031661987, "eval_logits/rejected": -1.4433752298355103, "eval_logps/chosen": -0.8280417323112488, "eval_logps/rejected": -2.125033140182495, "eval_loss": 1.4401862621307373, "eval_rewards/accuracies": 0.8795180916786194, "eval_rewards/chosen": -8.280416488647461, "eval_rewards/margins": 12.969916343688965, "eval_rewards/rejected": -21.25033187866211, "eval_runtime": 33.6039, "eval_samples_per_second": 39.222, "eval_semantic_entropy": 0.8376908898353577, "eval_steps_per_second": 2.47, "step": 800 }, { "epoch": 1.7595628415300546, "grad_norm": 44.1368033825106, "learning_rate": 4.2762121872428615e-08, "logits/chosen": -1.2641065120697021, "logits/rejected": -1.2107889652252197, "logps/chosen": -0.6843208074569702, "logps/rejected": -2.0283682346343994, "loss": 1.0256, "rewards/accuracies": 0.96875, "rewards/chosen": -6.843208312988281, "rewards/margins": 13.440475463867188, "rewards/rejected": -20.28368377685547, "semantic_entropy": 0.8609482645988464, "step": 805 }, { "epoch": 1.7704918032786885, "grad_norm": 60.8116220315975, "learning_rate": 3.897961019419516e-08, "logits/chosen": -1.242765188217163, "logits/rejected": -1.111221194267273, "logps/chosen": -0.6914607882499695, "logps/rejected": -2.5515542030334473, "loss": 1.026, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.914607048034668, "rewards/margins": 18.600933074951172, "rewards/rejected": -25.515541076660156, "semantic_entropy": 0.8368776440620422, "step": 810 }, { "epoch": 1.7814207650273224, "grad_norm": 48.041636594141835, "learning_rate": 3.536544460698143e-08, "logits/chosen": -1.2581889629364014, "logits/rejected": -1.2215464115142822, "logps/chosen": -0.7543720006942749, "logps/rejected": -2.438751220703125, "loss": 1.0363, "rewards/accuracies": 0.9375, "rewards/chosen": -7.5437211990356445, "rewards/margins": 16.843791961669922, "rewards/rejected": -24.387516021728516, "semantic_entropy": 0.8024908304214478, "step": 815 }, { "epoch": 1.7923497267759563, "grad_norm": 46.211811466738105, "learning_rate": 3.192094485859526e-08, "logits/chosen": -1.2139607667922974, "logits/rejected": -1.1563109159469604, "logps/chosen": -0.7942629456520081, "logps/rejected": -2.2374846935272217, "loss": 0.9534, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -7.942629337310791, "rewards/margins": 14.432218551635742, "rewards/rejected": -22.374849319458008, "semantic_entropy": 0.8313804864883423, "step": 820 }, { "epoch": 1.8032786885245902, "grad_norm": 51.498446681364456, "learning_rate": 2.8647368741709367e-08, "logits/chosen": -1.307348608970642, "logits/rejected": -1.172135353088379, "logps/chosen": -0.8334323167800903, "logps/rejected": -2.4974188804626465, "loss": 0.9931, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -8.334322929382324, "rewards/margins": 16.63986587524414, "rewards/rejected": -24.974185943603516, "semantic_entropy": 0.7853243350982666, "step": 825 }, { "epoch": 1.814207650273224, "grad_norm": 65.74184699216715, "learning_rate": 2.5545911634565265e-08, "logits/chosen": -1.2999436855316162, "logits/rejected": -1.1716783046722412, "logps/chosen": -0.7435690760612488, "logps/rejected": -2.767209529876709, "loss": 0.9785, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.435690402984619, "rewards/margins": 20.236404418945312, "rewards/rejected": -27.672094345092773, "semantic_entropy": 0.8089765310287476, "step": 830 }, { "epoch": 1.825136612021858, "grad_norm": 53.69755289327063, "learning_rate": 2.261770606446983e-08, "logits/chosen": -1.3077576160430908, "logits/rejected": -1.2317638397216797, "logps/chosen": -0.7318333387374878, "logps/rejected": -1.9953196048736572, "loss": 0.9652, "rewards/accuracies": 0.9375, "rewards/chosen": -7.318333625793457, "rewards/margins": 12.634860038757324, "rewards/rejected": -19.95319366455078, "semantic_entropy": 0.8394795656204224, "step": 835 }, { "epoch": 1.8360655737704918, "grad_norm": 48.57345276974053, "learning_rate": 1.9863821294241522e-08, "logits/chosen": -1.2126185894012451, "logits/rejected": -1.10856032371521, "logps/chosen": -0.7022706866264343, "logps/rejected": -2.3867998123168945, "loss": 0.9824, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -7.022706508636475, "rewards/margins": 16.84528923034668, "rewards/rejected": -23.86799430847168, "semantic_entropy": 0.8377873301506042, "step": 840 }, { "epoch": 1.8469945355191257, "grad_norm": 47.58974510921998, "learning_rate": 1.7285262931759082e-08, "logits/chosen": -1.170081615447998, "logits/rejected": -1.1226613521575928, "logps/chosen": -0.709827721118927, "logps/rejected": -2.499692440032959, "loss": 1.0049, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.0982770919799805, "rewards/margins": 17.89864730834961, "rewards/rejected": -24.99692726135254, "semantic_entropy": 0.8213443756103516, "step": 845 }, { "epoch": 1.8579234972677594, "grad_norm": 43.861213908082526, "learning_rate": 1.4882972562753615e-08, "logits/chosen": -1.2278581857681274, "logits/rejected": -1.1186041831970215, "logps/chosen": -0.6293253898620605, "logps/rejected": -2.4325814247131348, "loss": 0.9317, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -6.293253421783447, "rewards/margins": 18.032560348510742, "rewards/rejected": -24.325815200805664, "semantic_entropy": 0.8304460644721985, "step": 850 }, { "epoch": 1.8688524590163933, "grad_norm": 46.25639636622948, "learning_rate": 1.2657827406979404e-08, "logits/chosen": -1.2755509614944458, "logits/rejected": -1.1995421648025513, "logps/chosen": -0.7046025991439819, "logps/rejected": -2.2888636589050293, "loss": 0.9631, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.046026706695557, "rewards/margins": 15.842610359191895, "rewards/rejected": -22.88863754272461, "semantic_entropy": 0.8367988467216492, "step": 855 }, { "epoch": 1.8797814207650272, "grad_norm": 43.742641732125044, "learning_rate": 1.0610639997888915e-08, "logits/chosen": -1.144809603691101, "logits/rejected": -1.0996748208999634, "logps/chosen": -0.6617113947868347, "logps/rejected": -2.071277141571045, "loss": 0.9799, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.6171135902404785, "rewards/margins": 14.095659255981445, "rewards/rejected": -20.712770462036133, "semantic_entropy": 0.8550912141799927, "step": 860 }, { "epoch": 1.890710382513661, "grad_norm": 43.8556750169825, "learning_rate": 8.742157885927804e-09, "logits/chosen": -1.264917016029358, "logits/rejected": -1.1865818500518799, "logps/chosen": -0.7975755333900452, "logps/rejected": -2.4832332134246826, "loss": 0.9288, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.975754737854004, "rewards/margins": 16.856576919555664, "rewards/rejected": -24.832332611083984, "semantic_entropy": 0.8138486742973328, "step": 865 }, { "epoch": 1.901639344262295, "grad_norm": 49.4352171489155, "learning_rate": 7.053063365559997e-09, "logits/chosen": -1.2424798011779785, "logits/rejected": -1.1954628229141235, "logps/chosen": -0.6465862393379211, "logps/rejected": -2.410433769226074, "loss": 0.8832, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -6.465862274169922, "rewards/margins": 17.638477325439453, "rewards/rejected": -24.104337692260742, "semantic_entropy": 0.8361645936965942, "step": 870 }, { "epoch": 1.9125683060109289, "grad_norm": 79.71459811079978, "learning_rate": 5.543973226120935e-09, "logits/chosen": -1.2222373485565186, "logits/rejected": -1.1502609252929688, "logps/chosen": -0.7222265005111694, "logps/rejected": -2.1863186359405518, "loss": 0.9862, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.222264289855957, "rewards/margins": 14.640920639038086, "rewards/rejected": -21.86318588256836, "semantic_entropy": 0.8562089800834656, "step": 875 }, { "epoch": 1.9234972677595628, "grad_norm": 53.36096318687935, "learning_rate": 4.215438526591064e-09, "logits/chosen": -1.2770297527313232, "logits/rejected": -1.2093579769134521, "logps/chosen": -0.6959497332572937, "logps/rejected": -2.2840352058410645, "loss": 0.9871, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.959497928619385, "rewards/margins": 15.880853652954102, "rewards/rejected": -22.840351104736328, "semantic_entropy": 0.849157452583313, "step": 880 }, { "epoch": 1.9344262295081966, "grad_norm": 38.81388371132877, "learning_rate": 3.0679443943712467e-09, "logits/chosen": -1.3255574703216553, "logits/rejected": -1.2370083332061768, "logps/chosen": -0.7685250639915466, "logps/rejected": -2.3793647289276123, "loss": 0.9499, "rewards/accuracies": 0.9375, "rewards/chosen": -7.685250759124756, "rewards/margins": 16.108396530151367, "rewards/rejected": -23.79364585876465, "semantic_entropy": 0.8148989677429199, "step": 885 }, { "epoch": 1.9453551912568305, "grad_norm": 43.52784854249128, "learning_rate": 2.1019098481337426e-09, "logits/chosen": -1.271645188331604, "logits/rejected": -1.1847755908966064, "logps/chosen": -0.7262202501296997, "logps/rejected": -2.480203151702881, "loss": 0.9648, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -7.262202262878418, "rewards/margins": 17.53982925415039, "rewards/rejected": -24.802032470703125, "semantic_entropy": 0.8072282671928406, "step": 890 }, { "epoch": 1.9562841530054644, "grad_norm": 54.276319170574524, "learning_rate": 1.3176876448135477e-09, "logits/chosen": -1.311767816543579, "logits/rejected": -1.1933305263519287, "logps/chosen": -0.8360783457756042, "logps/rejected": -2.5562148094177246, "loss": 1.0277, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -8.360783576965332, "rewards/margins": 17.201366424560547, "rewards/rejected": -25.562149047851562, "semantic_entropy": 0.8203535079956055, "step": 895 }, { "epoch": 1.9672131147540983, "grad_norm": 50.371762692382795, "learning_rate": 7.155641507955445e-10, "logits/chosen": -1.2078804969787598, "logits/rejected": -1.1214892864227295, "logps/chosen": -0.6584422588348389, "logps/rejected": -2.1391983032226562, "loss": 1.026, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.5844221115112305, "rewards/margins": 14.807560920715332, "rewards/rejected": -21.391983032226562, "semantic_entropy": 0.8509858250617981, "step": 900 }, { "epoch": 1.9781420765027322, "grad_norm": 55.00348286428405, "learning_rate": 2.957592373452056e-10, "logits/chosen": -1.2071561813354492, "logits/rejected": -1.1362513303756714, "logps/chosen": -0.719018280506134, "logps/rejected": -2.406873941421509, "loss": 0.9953, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -7.190183162689209, "rewards/margins": 16.87855339050293, "rewards/rejected": -24.068737030029297, "semantic_entropy": 0.8274633288383484, "step": 905 }, { "epoch": 1.989071038251366, "grad_norm": 43.167771529061575, "learning_rate": 5.842620032053824e-11, "logits/chosen": -1.2589218616485596, "logits/rejected": -1.189516544342041, "logps/chosen": -0.7029792666435242, "logps/rejected": -2.2207939624786377, "loss": 0.9075, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -7.029792785644531, "rewards/margins": 15.178146362304688, "rewards/rejected": -22.20793914794922, "semantic_entropy": 0.8508146405220032, "step": 910 }, { "epoch": 1.9978142076502732, "step": 914, "total_flos": 0.0, "train_loss": 1.6402891297830795, "train_runtime": 11806.3913, "train_samples_per_second": 9.92, "train_steps_per_second": 0.077 } ], "logging_steps": 5, "max_steps": 914, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }