{ "best_metric": 1.4484930038452148, "best_model_checkpoint": "saves/Falcon-7B-Instruct/lora/orpo-salt/checkpoint-1500", "epoch": 2.9969690846635686, "eval_steps": 500, "global_step": 1854, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01616488179430188, "grad_norm": 0.5467122793197632, "learning_rate": 4.999648198770648e-06, "logits/chosen": -14.078092575073242, "logits/rejected": -14.159353256225586, "logps/chosen": -1.7583353519439697, "logps/rejected": -1.8469493389129639, "loss": 1.8299, "odds_ratio_loss": 0.7155797481536865, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.17583352327346802, "rewards/margins": 0.008861413225531578, "rewards/rejected": -0.18469493091106415, "sft_loss": 1.7583353519439697, "step": 10 }, { "epoch": 0.03232976358860376, "grad_norm": 0.495731920003891, "learning_rate": 4.998578646361359e-06, "logits/chosen": -14.073513984680176, "logits/rejected": -14.144752502441406, "logps/chosen": -1.9236218929290771, "logps/rejected": -1.9451425075531006, "loss": 2.0003, "odds_ratio_loss": 0.766566812992096, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1923622190952301, "rewards/margins": 0.002152049448341131, "rewards/rejected": -0.19451424479484558, "sft_loss": 1.9236218929290771, "step": 20 }, { "epoch": 0.04849464538290564, "grad_norm": 0.6057537198066711, "learning_rate": 4.996791614004449e-06, "logits/chosen": -14.302851676940918, "logits/rejected": -14.224812507629395, "logps/chosen": -1.8387420177459717, "logps/rejected": -1.910175085067749, "loss": 1.9128, "odds_ratio_loss": 0.7409650087356567, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1838742196559906, "rewards/margins": 0.007143297698348761, "rewards/rejected": -0.1910175085067749, "sft_loss": 1.8387420177459717, "step": 30 }, { "epoch": 0.06465952717720752, "grad_norm": 0.5634093284606934, "learning_rate": 4.994287614855618e-06, "logits/chosen": -14.0798921585083, "logits/rejected": -14.19922161102295, "logps/chosen": -1.947654366493225, "logps/rejected": -1.9009010791778564, "loss": 2.0298, "odds_ratio_loss": 0.8212669491767883, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.19476543366909027, "rewards/margins": -0.004675320815294981, "rewards/rejected": -0.1900901347398758, "sft_loss": 1.947654366493225, "step": 40 }, { "epoch": 0.0808244089715094, "grad_norm": 0.7957186698913574, "learning_rate": 4.991067367951343e-06, "logits/chosen": -14.371423721313477, "logits/rejected": -14.266546249389648, "logps/chosen": -2.017087697982788, "logps/rejected": -2.0035624504089355, "loss": 2.0958, "odds_ratio_loss": 0.7871265411376953, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.20170876383781433, "rewards/margins": -0.0013525458052754402, "rewards/rejected": -0.20035621523857117, "sft_loss": 2.017087697982788, "step": 50 }, { "epoch": 0.09698929076581128, "grad_norm": 0.5418820381164551, "learning_rate": 4.987131798002389e-06, "logits/chosen": -14.21721076965332, "logits/rejected": -14.099153518676758, "logps/chosen": -1.8751760721206665, "logps/rejected": -1.8855310678482056, "loss": 1.9577, "odds_ratio_loss": 0.8254929780960083, "rewards/accuracies": 0.5, "rewards/chosen": -0.18751761317253113, "rewards/margins": 0.001035516383126378, "rewards/rejected": -0.188553124666214, "sft_loss": 1.8751760721206665, "step": 60 }, { "epoch": 0.11315417256011315, "grad_norm": 1.0633864402770996, "learning_rate": 4.982482035128285e-06, "logits/chosen": -14.105901718139648, "logits/rejected": -14.193835258483887, "logps/chosen": -2.0220446586608887, "logps/rejected": -1.9594541788101196, "loss": 2.1089, "odds_ratio_loss": 0.8683654069900513, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.20220446586608887, "rewards/margins": -0.00625905767083168, "rewards/rejected": -0.19594541192054749, "sft_loss": 2.0220446586608887, "step": 70 }, { "epoch": 0.12931905435441504, "grad_norm": 1.0158140659332275, "learning_rate": 4.9771194145328e-06, "logits/chosen": -14.075093269348145, "logits/rejected": -14.02421760559082, "logps/chosen": -1.6751682758331299, "logps/rejected": -1.7500627040863037, "loss": 1.7468, "odds_ratio_loss": 0.716758668422699, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.167516827583313, "rewards/margins": 0.00748945539817214, "rewards/rejected": -0.17500628530979156, "sft_loss": 1.6751682758331299, "step": 80 }, { "epoch": 0.1454839361487169, "grad_norm": 1.3243364095687866, "learning_rate": 4.971045476120532e-06, "logits/chosen": -14.14300537109375, "logits/rejected": -14.079290390014648, "logps/chosen": -1.8245623111724854, "logps/rejected": -1.760660171508789, "loss": 1.9067, "odds_ratio_loss": 0.8211291432380676, "rewards/accuracies": 0.4375, "rewards/chosen": -0.18245622515678406, "rewards/margins": -0.006390226539224386, "rewards/rejected": -0.17606601119041443, "sft_loss": 1.8245623111724854, "step": 90 }, { "epoch": 0.1616488179430188, "grad_norm": 0.7163342237472534, "learning_rate": 4.964261964054713e-06, "logits/chosen": -14.068964958190918, "logits/rejected": -14.082951545715332, "logps/chosen": -1.7527011632919312, "logps/rejected": -1.8138408660888672, "loss": 1.8297, "odds_ratio_loss": 0.7703070044517517, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.17527012526988983, "rewards/margins": 0.0061139510944485664, "rewards/rejected": -0.18138407170772552, "sft_loss": 1.7527011632919312, "step": 100 }, { "epoch": 0.17781369973732067, "grad_norm": 1.006773829460144, "learning_rate": 4.956770826256372e-06, "logits/chosen": -14.166906356811523, "logits/rejected": -14.120782852172852, "logps/chosen": -1.7077207565307617, "logps/rejected": -1.7365996837615967, "loss": 1.7844, "odds_ratio_loss": 0.7667573690414429, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.17077207565307617, "rewards/margins": 0.002887908834964037, "rewards/rejected": -0.17365998029708862, "sft_loss": 1.7077207565307617, "step": 110 }, { "epoch": 0.19397858153162256, "grad_norm": 0.8139289617538452, "learning_rate": 4.94857421384497e-06, "logits/chosen": -14.175407409667969, "logits/rejected": -14.165875434875488, "logps/chosen": -1.692577600479126, "logps/rejected": -1.8239320516586304, "loss": 1.7682, "odds_ratio_loss": 0.7562084794044495, "rewards/accuracies": 0.5, "rewards/chosen": -0.1692577451467514, "rewards/margins": 0.013135453686118126, "rewards/rejected": -0.18239320814609528, "sft_loss": 1.692577600479126, "step": 120 }, { "epoch": 0.21014346332592443, "grad_norm": 1.0950274467468262, "learning_rate": 4.939674480520701e-06, "logits/chosen": -14.055421829223633, "logits/rejected": -14.265202522277832, "logps/chosen": -1.65860915184021, "logps/rejected": -1.6671603918075562, "loss": 1.7352, "odds_ratio_loss": 0.7663736939430237, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.16586092114448547, "rewards/margins": 0.0008551125647500157, "rewards/rejected": -0.16671602427959442, "sft_loss": 1.65860915184021, "step": 130 }, { "epoch": 0.2263083451202263, "grad_norm": 0.6190826892852783, "learning_rate": 4.930074181888613e-06, "logits/chosen": -14.116220474243164, "logits/rejected": -14.158090591430664, "logps/chosen": -1.7475076913833618, "logps/rejected": -1.736114501953125, "loss": 1.8234, "odds_ratio_loss": 0.7589074373245239, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.17475078999996185, "rewards/margins": -0.0011393536115065217, "rewards/rejected": -0.17361143231391907, "sft_loss": 1.7475076913833618, "step": 140 }, { "epoch": 0.2424732269145282, "grad_norm": 0.8096482157707214, "learning_rate": 4.91977607472475e-06, "logits/chosen": -14.182394027709961, "logits/rejected": -14.252290725708008, "logps/chosen": -1.6399564743041992, "logps/rejected": -1.6184114217758179, "loss": 1.7178, "odds_ratio_loss": 0.778221607208252, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1639956533908844, "rewards/margins": -0.0021545083727687597, "rewards/rejected": -0.16184113919734955, "sft_loss": 1.6399564743041992, "step": 150 }, { "epoch": 0.2586381087088301, "grad_norm": 1.5372618436813354, "learning_rate": 4.908783116184534e-06, "logits/chosen": -14.110807418823242, "logits/rejected": -14.087692260742188, "logps/chosen": -1.613721489906311, "logps/rejected": -1.7073653936386108, "loss": 1.6837, "odds_ratio_loss": 0.6995801329612732, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1613721400499344, "rewards/margins": 0.009364412166178226, "rewards/rejected": -0.17073655128479004, "sft_loss": 1.613721489906311, "step": 160 }, { "epoch": 0.27480299050313195, "grad_norm": 1.0400787591934204, "learning_rate": 4.897098462953598e-06, "logits/chosen": -14.309249877929688, "logits/rejected": -14.144041061401367, "logps/chosen": -1.572377324104309, "logps/rejected": -1.679239273071289, "loss": 1.6438, "odds_ratio_loss": 0.7143967747688293, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.15723773837089539, "rewards/margins": 0.010686198249459267, "rewards/rejected": -0.1679239273071289, "sft_loss": 1.572377324104309, "step": 170 }, { "epoch": 0.2909678722974338, "grad_norm": 0.6752244234085083, "learning_rate": 4.884725470341331e-06, "logits/chosen": -14.362325668334961, "logits/rejected": -14.368985176086426, "logps/chosen": -1.5275907516479492, "logps/rejected": -1.6322838068008423, "loss": 1.5969, "odds_ratio_loss": 0.6928091645240784, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.15275909006595612, "rewards/margins": 0.01046929694712162, "rewards/rejected": -0.1632283627986908, "sft_loss": 1.5275907516479492, "step": 180 }, { "epoch": 0.3071327540917357, "grad_norm": 1.5551739931106567, "learning_rate": 4.871667691317377e-06, "logits/chosen": -14.23143196105957, "logits/rejected": -14.168081283569336, "logps/chosen": -1.5617109537124634, "logps/rejected": -1.516629934310913, "loss": 1.6442, "odds_ratio_loss": 0.8246932029724121, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.15617111325263977, "rewards/margins": -0.004508105106651783, "rewards/rejected": -0.15166299045085907, "sft_loss": 1.5617109537124634, "step": 190 }, { "epoch": 0.3232976358860376, "grad_norm": 4.873908519744873, "learning_rate": 4.857928875491392e-06, "logits/chosen": -14.317342758178711, "logits/rejected": -14.135493278503418, "logps/chosen": -1.4843647480010986, "logps/rejected": -1.5346746444702148, "loss": 1.5575, "odds_ratio_loss": 0.7314870953559875, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.14843648672103882, "rewards/margins": 0.005030992440879345, "rewards/rejected": -0.15346747636795044, "sft_loss": 1.4843647480010986, "step": 200 }, { "epoch": 0.33946251768033947, "grad_norm": 1.1008872985839844, "learning_rate": 4.843512968036314e-06, "logits/chosen": -13.899968147277832, "logits/rejected": -13.980463027954102, "logps/chosen": -1.4831616878509521, "logps/rejected": -1.464994192123413, "loss": 1.5606, "odds_ratio_loss": 0.7743188738822937, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1483161747455597, "rewards/margins": -0.0018167542293667793, "rewards/rejected": -0.1464994251728058, "sft_loss": 1.4831616878509521, "step": 210 }, { "epoch": 0.35562739947464134, "grad_norm": 2.111262083053589, "learning_rate": 4.828424108555486e-06, "logits/chosen": -14.277219772338867, "logits/rejected": -14.1966552734375, "logps/chosen": -1.5998783111572266, "logps/rejected": -1.7076078653335571, "loss": 1.6726, "odds_ratio_loss": 0.727408230304718, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.15998782217502594, "rewards/margins": 0.010772952809929848, "rewards/rejected": -0.17076078057289124, "sft_loss": 1.5998783111572266, "step": 220 }, { "epoch": 0.3717922812689432, "grad_norm": 0.6497421264648438, "learning_rate": 4.812666629893957e-06, "logits/chosen": -14.255824089050293, "logits/rejected": -14.233850479125977, "logps/chosen": -1.5216138362884521, "logps/rejected": -1.4904725551605225, "loss": 1.599, "odds_ratio_loss": 0.7741049528121948, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.1521613895893097, "rewards/margins": -0.0031141184736043215, "rewards/rejected": -0.14904727041721344, "sft_loss": 1.5216138362884521, "step": 230 }, { "epoch": 0.3879571630632451, "grad_norm": 1.4030089378356934, "learning_rate": 4.796245056894273e-06, "logits/chosen": -13.990198135375977, "logits/rejected": -14.032785415649414, "logps/chosen": -1.5593761205673218, "logps/rejected": -1.5817941427230835, "loss": 1.6382, "odds_ratio_loss": 0.7885618805885315, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.15593759715557098, "rewards/margins": 0.0022418068256229162, "rewards/rejected": -0.1581794172525406, "sft_loss": 1.5593761205673218, "step": 240 }, { "epoch": 0.404122044857547, "grad_norm": 1.03659987449646, "learning_rate": 4.779164105097148e-06, "logits/chosen": -14.23992919921875, "logits/rejected": -14.331039428710938, "logps/chosen": -1.4630193710327148, "logps/rejected": -1.6595561504364014, "loss": 1.5308, "odds_ratio_loss": 0.6777212023735046, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.14630195498466492, "rewards/margins": 0.019653689116239548, "rewards/rejected": -0.16595561802387238, "sft_loss": 1.4630193710327148, "step": 250 }, { "epoch": 0.42028692665184886, "grad_norm": 1.1558053493499756, "learning_rate": 4.761428679387373e-06, "logits/chosen": -14.19200611114502, "logits/rejected": -14.27843189239502, "logps/chosen": -1.4934606552124023, "logps/rejected": -1.5448919534683228, "loss": 1.5664, "odds_ratio_loss": 0.7296234369277954, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14934605360031128, "rewards/margins": 0.005143154412508011, "rewards/rejected": -0.154489204287529, "sft_loss": 1.4934606552124023, "step": 260 }, { "epoch": 0.4364518084461507, "grad_norm": 1.3478955030441284, "learning_rate": 4.7430438725853515e-06, "logits/chosen": -14.099308967590332, "logits/rejected": -14.247446060180664, "logps/chosen": -1.5219833850860596, "logps/rejected": -1.7108709812164307, "loss": 1.5916, "odds_ratio_loss": 0.6957148313522339, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.15219834446907043, "rewards/margins": 0.01888876222074032, "rewards/rejected": -0.1710870862007141, "sft_loss": 1.5219833850860596, "step": 270 }, { "epoch": 0.4526166902404526, "grad_norm": 1.0543924570083618, "learning_rate": 4.724014963984669e-06, "logits/chosen": -14.321874618530273, "logits/rejected": -14.308130264282227, "logps/chosen": -1.4753090143203735, "logps/rejected": -1.6179271936416626, "loss": 1.5473, "odds_ratio_loss": 0.7201633453369141, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.14753088355064392, "rewards/margins": 0.014261829666793346, "rewards/rejected": -0.16179272532463074, "sft_loss": 1.4753090143203735, "step": 280 }, { "epoch": 0.4687815720347545, "grad_norm": 1.6008622646331787, "learning_rate": 4.704347417836116e-06, "logits/chosen": -14.192815780639648, "logits/rejected": -14.182914733886719, "logps/chosen": -1.373263955116272, "logps/rejected": -1.4777114391326904, "loss": 1.4462, "odds_ratio_loss": 0.7295758128166199, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.13732638955116272, "rewards/margins": 0.010444764979183674, "rewards/rejected": -0.14777114987373352, "sft_loss": 1.373263955116272, "step": 290 }, { "epoch": 0.4849464538290564, "grad_norm": 1.0440045595169067, "learning_rate": 4.684046881778603e-06, "logits/chosen": -13.9605131149292, "logits/rejected": -14.021821975708008, "logps/chosen": -1.3839852809906006, "logps/rejected": -1.4472886323928833, "loss": 1.456, "odds_ratio_loss": 0.719718337059021, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.13839852809906006, "rewards/margins": 0.006330335047096014, "rewards/rejected": -0.1447288691997528, "sft_loss": 1.3839852809906006, "step": 300 }, { "epoch": 0.5011113356233583, "grad_norm": 0.8026280999183655, "learning_rate": 4.663119185217409e-06, "logits/chosen": -14.247451782226562, "logits/rejected": -14.332074165344238, "logps/chosen": -1.4372491836547852, "logps/rejected": -1.5869617462158203, "loss": 1.5057, "odds_ratio_loss": 0.684893012046814, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.1437249332666397, "rewards/margins": 0.014971258118748665, "rewards/rejected": -0.15869615972042084, "sft_loss": 1.4372491836547852, "step": 310 }, { "epoch": 0.5172762174176602, "grad_norm": 1.054210901260376, "learning_rate": 4.641570337650232e-06, "logits/chosen": -14.101099967956543, "logits/rejected": -14.234477043151855, "logps/chosen": -1.3175721168518066, "logps/rejected": -1.46291184425354, "loss": 1.3866, "odds_ratio_loss": 0.6904350519180298, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1317571997642517, "rewards/margins": 0.014533978886902332, "rewards/rejected": -0.14629118144512177, "sft_loss": 1.3175721168518066, "step": 320 }, { "epoch": 0.533441099211962, "grad_norm": 1.6171979904174805, "learning_rate": 4.61940652694154e-06, "logits/chosen": -14.107089042663574, "logits/rejected": -14.126917839050293, "logps/chosen": -1.5025255680084229, "logps/rejected": -1.4795392751693726, "loss": 1.5835, "odds_ratio_loss": 0.8096711039543152, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1502525359392166, "rewards/margins": -0.0022986275143921375, "rewards/rejected": -0.14795391261577606, "sft_loss": 1.5025255680084229, "step": 330 }, { "epoch": 0.5496059810062639, "grad_norm": 1.2122093439102173, "learning_rate": 4.596634117545689e-06, "logits/chosen": -14.346307754516602, "logits/rejected": -14.166845321655273, "logps/chosen": -1.5319068431854248, "logps/rejected": -1.624324083328247, "loss": 1.6054, "odds_ratio_loss": 0.735165536403656, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.15319068729877472, "rewards/margins": 0.00924170482903719, "rewards/rejected": -0.16243240237236023, "sft_loss": 1.5319068431854248, "step": 340 }, { "epoch": 0.5657708628005658, "grad_norm": 0.899023175239563, "learning_rate": 4.573259648679335e-06, "logits/chosen": -14.317461013793945, "logits/rejected": -14.103338241577148, "logps/chosen": -1.47697114944458, "logps/rejected": -1.648705244064331, "loss": 1.546, "odds_ratio_loss": 0.6902921199798584, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14769712090492249, "rewards/margins": 0.017173420637845993, "rewards/rejected": -0.16487054526805878, "sft_loss": 1.47697114944458, "step": 350 }, { "epoch": 0.5819357445948676, "grad_norm": 2.3687381744384766, "learning_rate": 4.549289832443663e-06, "logits/chosen": -14.142545700073242, "logits/rejected": -14.211145401000977, "logps/chosen": -1.4514472484588623, "logps/rejected": -1.5542781352996826, "loss": 1.5233, "odds_ratio_loss": 0.7186037302017212, "rewards/accuracies": 0.5, "rewards/chosen": -0.1451447308063507, "rewards/margins": 0.010283084586262703, "rewards/rejected": -0.15542782843112946, "sft_loss": 1.4514472484588623, "step": 360 }, { "epoch": 0.5981006263891695, "grad_norm": 1.039651870727539, "learning_rate": 4.524731551896978e-06, "logits/chosen": -14.117040634155273, "logits/rejected": -14.164260864257812, "logps/chosen": -1.3633731603622437, "logps/rejected": -1.4127264022827148, "loss": 1.4381, "odds_ratio_loss": 0.7473303079605103, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.13633732497692108, "rewards/margins": 0.004935313947498798, "rewards/rejected": -0.1412726640701294, "sft_loss": 1.3633731603622437, "step": 370 }, { "epoch": 0.6142655081834714, "grad_norm": 2.077622413635254, "learning_rate": 4.4995918590781925e-06, "logits/chosen": -14.212381362915039, "logits/rejected": -14.251853942871094, "logps/chosen": -1.3631454706192017, "logps/rejected": -1.4832844734191895, "loss": 1.437, "odds_ratio_loss": 0.7388315200805664, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1363145411014557, "rewards/margins": 0.012013902887701988, "rewards/rejected": -0.14832845330238342, "sft_loss": 1.3631454706192017, "step": 380 }, { "epoch": 0.6304303899777733, "grad_norm": 0.6616309881210327, "learning_rate": 4.473877972981797e-06, "logits/chosen": -14.166543960571289, "logits/rejected": -14.008458137512207, "logps/chosen": -1.414536476135254, "logps/rejected": -1.5125486850738525, "loss": 1.4849, "odds_ratio_loss": 0.7040683031082153, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.14145365357398987, "rewards/margins": 0.009801235981285572, "rewards/rejected": -0.15125489234924316, "sft_loss": 1.414536476135254, "step": 390 }, { "epoch": 0.6465952717720752, "grad_norm": 1.2422401905059814, "learning_rate": 4.447597277484894e-06, "logits/chosen": -14.10089111328125, "logits/rejected": -14.177225112915039, "logps/chosen": -1.3244436979293823, "logps/rejected": -1.434922456741333, "loss": 1.3936, "odds_ratio_loss": 0.6911473274230957, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.132444366812706, "rewards/margins": 0.011047879233956337, "rewards/rejected": -0.14349225163459778, "sft_loss": 1.3244436979293823, "step": 400 }, { "epoch": 0.6627601535663771, "grad_norm": 1.3308875560760498, "learning_rate": 4.42075731922687e-06, "logits/chosen": -14.254026412963867, "logits/rejected": -14.150421142578125, "logps/chosen": -1.4931491613388062, "logps/rejected": -1.5233150720596313, "loss": 1.5684, "odds_ratio_loss": 0.7521846890449524, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.14931491017341614, "rewards/margins": 0.0030165952630341053, "rewards/rejected": -0.15233151614665985, "sft_loss": 1.4931491613388062, "step": 410 }, { "epoch": 0.6789250353606789, "grad_norm": 1.4143937826156616, "learning_rate": 4.3933658054423465e-06, "logits/chosen": -14.156329154968262, "logits/rejected": -14.047518730163574, "logps/chosen": -1.338627576828003, "logps/rejected": -1.4370090961456299, "loss": 1.4095, "odds_ratio_loss": 0.70883709192276, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.13386276364326477, "rewards/margins": 0.009838144294917583, "rewards/rejected": -0.14370091259479523, "sft_loss": 1.338627576828003, "step": 420 }, { "epoch": 0.6950899171549808, "grad_norm": 2.3574774265289307, "learning_rate": 4.365430601748003e-06, "logits/chosen": -14.235176086425781, "logits/rejected": -14.395864486694336, "logps/chosen": -1.564626932144165, "logps/rejected": -1.5344398021697998, "loss": 1.6431, "odds_ratio_loss": 0.7849880456924438, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.15646269917488098, "rewards/margins": -0.0030187165830284357, "rewards/rejected": -0.15344397723674774, "sft_loss": 1.564626932144165, "step": 430 }, { "epoch": 0.7112547989492827, "grad_norm": 3.739943504333496, "learning_rate": 4.336959729883925e-06, "logits/chosen": -14.274754524230957, "logits/rejected": -14.191232681274414, "logps/chosen": -1.3745372295379639, "logps/rejected": -1.405700445175171, "loss": 1.4506, "odds_ratio_loss": 0.7607132196426392, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.13745373487472534, "rewards/margins": 0.0031163152307271957, "rewards/rejected": -0.1405700445175171, "sft_loss": 1.3745372295379639, "step": 440 }, { "epoch": 0.7274196807435845, "grad_norm": 0.9312599301338196, "learning_rate": 4.307961365410118e-06, "logits/chosen": -14.044285774230957, "logits/rejected": -14.011823654174805, "logps/chosen": -1.4385414123535156, "logps/rejected": -1.4718294143676758, "loss": 1.5134, "odds_ratio_loss": 0.7482468485832214, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.14385412633419037, "rewards/margins": 0.003328789724037051, "rewards/rejected": -0.14718294143676758, "sft_loss": 1.4385414123535156, "step": 450 }, { "epoch": 0.7435845625378864, "grad_norm": 1.4249197244644165, "learning_rate": 4.278443835358854e-06, "logits/chosen": -14.115106582641602, "logits/rejected": -14.075739860534668, "logps/chosen": -1.3712975978851318, "logps/rejected": -1.5527522563934326, "loss": 1.4406, "odds_ratio_loss": 0.6929912567138672, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1371297538280487, "rewards/margins": 0.018145468086004257, "rewards/rejected": -0.15527524054050446, "sft_loss": 1.3712975978851318, "step": 460 }, { "epoch": 0.7597494443321883, "grad_norm": 1.1615644693374634, "learning_rate": 4.248415615843523e-06, "logits/chosen": -14.288152694702148, "logits/rejected": -14.206695556640625, "logps/chosen": -1.4021141529083252, "logps/rejected": -1.416723370552063, "loss": 1.4775, "odds_ratio_loss": 0.7538274526596069, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.14021141827106476, "rewards/margins": 0.001460921368561685, "rewards/rejected": -0.14167232811450958, "sft_loss": 1.4021141529083252, "step": 470 }, { "epoch": 0.7759143261264903, "grad_norm": 1.276267409324646, "learning_rate": 4.217885329624666e-06, "logits/chosen": -14.302003860473633, "logits/rejected": -14.307230949401855, "logps/chosen": -1.346254587173462, "logps/rejected": -1.4862271547317505, "loss": 1.4137, "odds_ratio_loss": 0.6745720505714417, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13462546467781067, "rewards/margins": 0.013997259549796581, "rewards/rejected": -0.14862270653247833, "sft_loss": 1.346254587173462, "step": 480 }, { "epoch": 0.7920792079207921, "grad_norm": 1.6030430793762207, "learning_rate": 4.186861743633911e-06, "logits/chosen": -14.13404369354248, "logits/rejected": -14.251507759094238, "logps/chosen": -1.4151580333709717, "logps/rejected": -1.5721826553344727, "loss": 1.4904, "odds_ratio_loss": 0.7523505091667175, "rewards/accuracies": 0.5, "rewards/chosen": -0.1415158212184906, "rewards/margins": 0.015702461823821068, "rewards/rejected": -0.15721826255321503, "sft_loss": 1.4151580333709717, "step": 490 }, { "epoch": 0.808244089715094, "grad_norm": 1.7222312688827515, "learning_rate": 4.155353766456497e-06, "logits/chosen": -14.4000825881958, "logits/rejected": -14.304115295410156, "logps/chosen": -1.433506727218628, "logps/rejected": -1.535611867904663, "loss": 1.5005, "odds_ratio_loss": 0.6703948378562927, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14335067570209503, "rewards/margins": 0.010210518725216389, "rewards/rejected": -0.15356118977069855, "sft_loss": 1.433506727218628, "step": 500 }, { "epoch": 0.808244089715094, "eval_logits/chosen": -14.227585792541504, "eval_logits/rejected": -14.265686988830566, "eval_logps/chosen": -1.4436272382736206, "eval_logps/rejected": -1.4898087978363037, "eval_loss": 1.5202080011367798, "eval_odds_ratio_loss": 0.7658076882362366, "eval_rewards/accuracies": 0.48181816935539246, "eval_rewards/chosen": -0.1443627029657364, "eval_rewards/margins": 0.004618145525455475, "eval_rewards/rejected": -0.14898087084293365, "eval_runtime": 207.676, "eval_samples_per_second": 5.297, "eval_sft_loss": 1.4436272382736206, "eval_steps_per_second": 2.648, "step": 500 }, { "epoch": 0.8244089715093958, "grad_norm": 1.143004059791565, "learning_rate": 4.123370445773134e-06, "logits/chosen": -14.356025695800781, "logits/rejected": -14.339376449584961, "logps/chosen": -1.4154841899871826, "logps/rejected": -1.4348183870315552, "loss": 1.4927, "odds_ratio_loss": 0.7723585963249207, "rewards/accuracies": 0.5, "rewards/chosen": -0.14154842495918274, "rewards/margins": 0.001933417865075171, "rewards/rejected": -0.14348182082176208, "sft_loss": 1.4154841899871826, "step": 510 }, { "epoch": 0.8405738533036977, "grad_norm": 3.6751832962036133, "learning_rate": 4.090920965761906e-06, "logits/chosen": -14.4230375289917, "logits/rejected": -14.330423355102539, "logps/chosen": -1.4806926250457764, "logps/rejected": -1.4873076677322388, "loss": 1.559, "odds_ratio_loss": 0.7833209037780762, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.14806927740573883, "rewards/margins": 0.0006614929297938943, "rewards/rejected": -0.14873075485229492, "sft_loss": 1.4806926250457764, "step": 520 }, { "epoch": 0.8567387350979996, "grad_norm": 4.592033386230469, "learning_rate": 4.058014644460991e-06, "logits/chosen": -14.309356689453125, "logits/rejected": -14.266693115234375, "logps/chosen": -1.4232040643692017, "logps/rejected": -1.4629483222961426, "loss": 1.4967, "odds_ratio_loss": 0.7350074052810669, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1423204094171524, "rewards/margins": 0.003974422812461853, "rewards/rejected": -0.14629481732845306, "sft_loss": 1.4232040643692017, "step": 530 }, { "epoch": 0.8729036168923014, "grad_norm": 1.3515141010284424, "learning_rate": 4.024660931092939e-06, "logits/chosen": -14.12739086151123, "logits/rejected": -14.135973930358887, "logps/chosen": -1.4027074575424194, "logps/rejected": -1.5116406679153442, "loss": 1.4748, "odds_ratio_loss": 0.7212173938751221, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14027073979377747, "rewards/margins": 0.010893313214182854, "rewards/rejected": -0.15116406977176666, "sft_loss": 1.4027074575424194, "step": 540 }, { "epoch": 0.8890684986866033, "grad_norm": 3.3689217567443848, "learning_rate": 3.990869403351272e-06, "logits/chosen": -14.354001998901367, "logits/rejected": -14.225595474243164, "logps/chosen": -1.4652130603790283, "logps/rejected": -1.552912712097168, "loss": 1.5359, "odds_ratio_loss": 0.7067934274673462, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.14652130007743835, "rewards/margins": 0.008769966661930084, "rewards/rejected": -0.15529127418994904, "sft_loss": 1.4652130603790283, "step": 550 }, { "epoch": 0.9052333804809052, "grad_norm": 1.5204488039016724, "learning_rate": 3.956649764650206e-06, "logits/chosen": -14.487988471984863, "logits/rejected": -14.507904052734375, "logps/chosen": -1.4564487934112549, "logps/rejected": -1.5203144550323486, "loss": 1.5325, "odds_ratio_loss": 0.7608081102371216, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.1456448882818222, "rewards/margins": 0.006386570632457733, "rewards/rejected": -0.15203145146369934, "sft_loss": 1.4564487934112549, "step": 560 }, { "epoch": 0.9213982622752072, "grad_norm": 2.2319583892822266, "learning_rate": 3.92201184133826e-06, "logits/chosen": -14.393239974975586, "logits/rejected": -14.3502779006958, "logps/chosen": -1.3946270942687988, "logps/rejected": -1.444805383682251, "loss": 1.4679, "odds_ratio_loss": 0.7322729229927063, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13946272432804108, "rewards/margins": 0.005017831921577454, "rewards/rejected": -0.14448055624961853, "sft_loss": 1.3946270942687988, "step": 570 }, { "epoch": 0.937563144069509, "grad_norm": 1.4617536067962646, "learning_rate": 3.886965579876572e-06, "logits/chosen": -14.353238105773926, "logits/rejected": -14.260797500610352, "logps/chosen": -1.3793189525604248, "logps/rejected": -1.445691704750061, "loss": 1.4501, "odds_ratio_loss": 0.7080078125, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13793189823627472, "rewards/margins": 0.006637275218963623, "rewards/rejected": -0.14456915855407715, "sft_loss": 1.3793189525604248, "step": 580 }, { "epoch": 0.9537280258638109, "grad_norm": 1.2430846691131592, "learning_rate": 3.851521043982716e-06, "logits/chosen": -14.31140422821045, "logits/rejected": -14.404243469238281, "logps/chosen": -1.424002766609192, "logps/rejected": -1.4054510593414307, "loss": 1.4998, "odds_ratio_loss": 0.7578663229942322, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.14240026473999023, "rewards/margins": -0.0018551532411947846, "rewards/rejected": -0.14054511487483978, "sft_loss": 1.424002766609192, "step": 590 }, { "epoch": 0.9698929076581128, "grad_norm": 1.5072684288024902, "learning_rate": 3.81568841174086e-06, "logits/chosen": -14.169085502624512, "logits/rejected": -14.1954345703125, "logps/chosen": -1.4412424564361572, "logps/rejected": -1.4657504558563232, "loss": 1.5191, "odds_ratio_loss": 0.7788038849830627, "rewards/accuracies": 0.5, "rewards/chosen": -0.14412423968315125, "rewards/margins": 0.002450800035148859, "rewards/rejected": -0.14657504856586456, "sft_loss": 1.4412424564361572, "step": 600 }, { "epoch": 0.9860577894524146, "grad_norm": 1.2968331575393677, "learning_rate": 3.7794779726790664e-06, "logits/chosen": -14.130575180053711, "logits/rejected": -14.240781784057617, "logps/chosen": -1.3836543560028076, "logps/rejected": -1.457695722579956, "loss": 1.4561, "odds_ratio_loss": 0.7247332334518433, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.13836543262004852, "rewards/margins": 0.007404146250337362, "rewards/rejected": -0.14576958119869232, "sft_loss": 1.3836543560028076, "step": 610 }, { "epoch": 1.0022226712467166, "grad_norm": 4.868699550628662, "learning_rate": 3.7429001248146096e-06, "logits/chosen": -14.240348815917969, "logits/rejected": -14.297922134399414, "logps/chosen": -1.4243017435073853, "logps/rejected": -1.5530868768692017, "loss": 1.4924, "odds_ratio_loss": 0.680776059627533, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1424301713705063, "rewards/margins": 0.012878507375717163, "rewards/rejected": -0.15530869364738464, "sft_loss": 1.4243017435073853, "step": 620 }, { "epoch": 1.0183875530410185, "grad_norm": 0.8127214312553406, "learning_rate": 3.7059653716681227e-06, "logits/chosen": -14.380844116210938, "logits/rejected": -14.255830764770508, "logps/chosen": -1.4107029438018799, "logps/rejected": -1.521928071975708, "loss": 1.4861, "odds_ratio_loss": 0.7541464567184448, "rewards/accuracies": 0.5, "rewards/chosen": -0.14107032120227814, "rewards/margins": 0.01112250704318285, "rewards/rejected": -0.15219281613826752, "sft_loss": 1.4107029438018799, "step": 630 }, { "epoch": 1.0345524348353203, "grad_norm": 3.8503897190093994, "learning_rate": 3.668684319247463e-06, "logits/chosen": -14.447845458984375, "logits/rejected": -14.433076858520508, "logps/chosen": -1.367375135421753, "logps/rejected": -1.548612356185913, "loss": 1.4348, "odds_ratio_loss": 0.6741297841072083, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13673751056194305, "rewards/margins": 0.018123725429177284, "rewards/rejected": -0.1548612415790558, "sft_loss": 1.367375135421753, "step": 640 }, { "epoch": 1.0507173166296222, "grad_norm": 0.9416384100914001, "learning_rate": 3.6310676730021373e-06, "logits/chosen": -14.3724946975708, "logits/rejected": -14.455398559570312, "logps/chosen": -1.3245970010757446, "logps/rejected": -1.3460277318954468, "loss": 1.3979, "odds_ratio_loss": 0.7330806255340576, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13245970010757446, "rewards/margins": 0.002143078250810504, "rewards/rejected": -0.13460277020931244, "sft_loss": 1.3245970010757446, "step": 650 }, { "epoch": 1.066882198423924, "grad_norm": 2.8321056365966797, "learning_rate": 3.593126234749178e-06, "logits/chosen": -14.317327499389648, "logits/rejected": -14.38727855682373, "logps/chosen": -1.423680067062378, "logps/rejected": -1.4616180658340454, "loss": 1.4976, "odds_ratio_loss": 0.739305853843689, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14236800372600555, "rewards/margins": 0.0037938044406473637, "rewards/rejected": -0.14616182446479797, "sft_loss": 1.423680067062378, "step": 660 }, { "epoch": 1.083047080218226, "grad_norm": 0.9518349766731262, "learning_rate": 3.554870899571343e-06, "logits/chosen": -14.144752502441406, "logits/rejected": -14.251813888549805, "logps/chosen": -1.4052397012710571, "logps/rejected": -1.5265625715255737, "loss": 1.4767, "odds_ratio_loss": 0.7148950695991516, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1405239850282669, "rewards/margins": 0.012132286094129086, "rewards/rejected": -0.15265627205371857, "sft_loss": 1.4052397012710571, "step": 670 }, { "epoch": 1.0992119620125278, "grad_norm": 3.0823421478271484, "learning_rate": 3.5163126526885373e-06, "logits/chosen": -14.263737678527832, "logits/rejected": -14.341888427734375, "logps/chosen": -1.3758028745651245, "logps/rejected": -1.4713342189788818, "loss": 1.4506, "odds_ratio_loss": 0.748176097869873, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1375802755355835, "rewards/margins": 0.009553151205182076, "rewards/rejected": -0.14713343977928162, "sft_loss": 1.3758028745651245, "step": 680 }, { "epoch": 1.1153768438068297, "grad_norm": 1.1957412958145142, "learning_rate": 3.4774625663033484e-06, "logits/chosen": -14.262721061706543, "logits/rejected": -14.248212814331055, "logps/chosen": -1.4033539295196533, "logps/rejected": -1.4489859342575073, "loss": 1.4783, "odds_ratio_loss": 0.7493518590927124, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14033538103103638, "rewards/margins": 0.004563204478472471, "rewards/rejected": -0.14489860832691193, "sft_loss": 1.4033539295196533, "step": 690 }, { "epoch": 1.1315417256011315, "grad_norm": 1.0352710485458374, "learning_rate": 3.4383317964216067e-06, "logits/chosen": -14.168815612792969, "logits/rejected": -14.324069023132324, "logps/chosen": -1.3365106582641602, "logps/rejected": -1.3756332397460938, "loss": 1.4108, "odds_ratio_loss": 0.7429829835891724, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.13365106284618378, "rewards/margins": 0.0039122505113482475, "rewards/rejected": -0.1375633180141449, "sft_loss": 1.3365106582641602, "step": 700 }, { "epoch": 1.1477066073954334, "grad_norm": 2.4808411598205566, "learning_rate": 3.398931579648877e-06, "logits/chosen": -14.3150053024292, "logits/rejected": -14.531530380249023, "logps/chosen": -1.4491299390792847, "logps/rejected": -1.5492023229599, "loss": 1.5203, "odds_ratio_loss": 0.7113555669784546, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.14491300284862518, "rewards/margins": 0.010007232427597046, "rewards/rejected": -0.15492023527622223, "sft_loss": 1.4491299390792847, "step": 710 }, { "epoch": 1.1638714891897353, "grad_norm": 1.2726991176605225, "learning_rate": 3.359273229963813e-06, "logits/chosen": -14.357129096984863, "logits/rejected": -14.291903495788574, "logps/chosen": -1.3459408283233643, "logps/rejected": -1.3911712169647217, "loss": 1.421, "odds_ratio_loss": 0.750839114189148, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.13459408283233643, "rewards/margins": 0.004523060750216246, "rewards/rejected": -0.13911715149879456, "sft_loss": 1.3459408283233643, "step": 720 }, { "epoch": 1.1800363709840371, "grad_norm": 1.0978913307189941, "learning_rate": 3.319368135469285e-06, "logits/chosen": -14.36750602722168, "logits/rejected": -14.435731887817383, "logps/chosen": -1.3765571117401123, "logps/rejected": -1.4039866924285889, "loss": 1.4538, "odds_ratio_loss": 0.7719755172729492, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.13765572011470795, "rewards/margins": 0.002742946846410632, "rewards/rejected": -0.14039869606494904, "sft_loss": 1.3765571117401123, "step": 730 }, { "epoch": 1.196201252778339, "grad_norm": 2.1035361289978027, "learning_rate": 3.279227755122228e-06, "logits/chosen": -14.316058158874512, "logits/rejected": -14.294093132019043, "logps/chosen": -1.320318579673767, "logps/rejected": -1.5284496545791626, "loss": 1.3866, "odds_ratio_loss": 0.6632006764411926, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.13203184306621552, "rewards/margins": 0.020813116803765297, "rewards/rejected": -0.15284495055675507, "sft_loss": 1.320318579673767, "step": 740 }, { "epoch": 1.2123661345726409, "grad_norm": 3.223933696746826, "learning_rate": 3.2388636154431417e-06, "logits/chosen": -14.34916877746582, "logits/rejected": -14.280328750610352, "logps/chosen": -1.429145097732544, "logps/rejected": -1.5203419923782349, "loss": 1.502, "odds_ratio_loss": 0.7281750440597534, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1429145336151123, "rewards/margins": 0.009119677357375622, "rewards/rejected": -0.152034193277359, "sft_loss": 1.429145097732544, "step": 750 }, { "epoch": 1.2285310163669427, "grad_norm": 1.1619030237197876, "learning_rate": 3.198287307206192e-06, "logits/chosen": -14.091611862182617, "logits/rejected": -14.187002182006836, "logps/chosen": -1.4056107997894287, "logps/rejected": -1.442886233329773, "loss": 1.4829, "odds_ratio_loss": 0.7725043296813965, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1405610740184784, "rewards/margins": 0.003727543633431196, "rewards/rejected": -0.14428862929344177, "sft_loss": 1.4056107997894287, "step": 760 }, { "epoch": 1.2446958981612446, "grad_norm": 1.0456814765930176, "learning_rate": 3.157510482110856e-06, "logits/chosen": -14.408856391906738, "logits/rejected": -14.243043899536133, "logps/chosen": -1.3281633853912354, "logps/rejected": -1.3863494396209717, "loss": 1.4004, "odds_ratio_loss": 0.7221428751945496, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.132816344499588, "rewards/margins": 0.005818599369376898, "rewards/rejected": -0.13863493502140045, "sft_loss": 1.3281633853912354, "step": 770 }, { "epoch": 1.2608607799555465, "grad_norm": 1.2318408489227295, "learning_rate": 3.116544849436077e-06, "logits/chosen": -14.334813117980957, "logits/rejected": -14.20678997039795, "logps/chosen": -1.5153284072875977, "logps/rejected": -1.6125590801239014, "loss": 1.588, "odds_ratio_loss": 0.7266558408737183, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.151532843708992, "rewards/margins": 0.009723084978759289, "rewards/rejected": -0.16125592589378357, "sft_loss": 1.5153284072875977, "step": 780 }, { "epoch": 1.2770256617498483, "grad_norm": 1.3976880311965942, "learning_rate": 3.0754021726778848e-06, "logits/chosen": -14.33143138885498, "logits/rejected": -14.257779121398926, "logps/chosen": -1.3455626964569092, "logps/rejected": -1.4571717977523804, "loss": 1.4162, "odds_ratio_loss": 0.7065266370773315, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13455626368522644, "rewards/margins": 0.011160916648805141, "rewards/rejected": -0.14571718871593475, "sft_loss": 1.3455626964569092, "step": 790 }, { "epoch": 1.2931905435441502, "grad_norm": 0.7877367734909058, "learning_rate": 3.0340942661714463e-06, "logits/chosen": -14.352252006530762, "logits/rejected": -14.257513046264648, "logps/chosen": -1.4310262203216553, "logps/rejected": -1.4348089694976807, "loss": 1.5077, "odds_ratio_loss": 0.7662674188613892, "rewards/accuracies": 0.4375, "rewards/chosen": -0.14310263097286224, "rewards/margins": 0.00037826746120117605, "rewards/rejected": -0.14348089694976807, "sft_loss": 1.4310262203216553, "step": 800 }, { "epoch": 1.3093554253384523, "grad_norm": 1.265386939048767, "learning_rate": 2.992632991698512e-06, "logits/chosen": -14.194437980651855, "logits/rejected": -14.312055587768555, "logps/chosen": -1.3498046398162842, "logps/rejected": -1.4344502687454224, "loss": 1.4207, "odds_ratio_loss": 0.7088189721107483, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1349804699420929, "rewards/margins": 0.008464555256068707, "rewards/rejected": -0.14344502985477448, "sft_loss": 1.3498046398162842, "step": 810 }, { "epoch": 1.3255203071327541, "grad_norm": 1.7529423236846924, "learning_rate": 2.9510302550812537e-06, "logits/chosen": -14.307215690612793, "logits/rejected": -14.374090194702148, "logps/chosen": -1.3449764251708984, "logps/rejected": -1.5051848888397217, "loss": 1.4155, "odds_ratio_loss": 0.7051501274108887, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.13449765741825104, "rewards/margins": 0.016020851209759712, "rewards/rejected": -0.1505185067653656, "sft_loss": 1.3449764251708984, "step": 820 }, { "epoch": 1.341685188927056, "grad_norm": 3.534449815750122, "learning_rate": 2.9092980027634325e-06, "logits/chosen": -14.194910049438477, "logits/rejected": -14.260457038879395, "logps/chosen": -1.3157680034637451, "logps/rejected": -1.39622163772583, "loss": 1.3858, "odds_ratio_loss": 0.7005105018615723, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.131576806306839, "rewards/margins": 0.008045351132750511, "rewards/rejected": -0.13962216675281525, "sft_loss": 1.3157680034637451, "step": 830 }, { "epoch": 1.3578500707213579, "grad_norm": 1.6155622005462646, "learning_rate": 2.867448218379927e-06, "logits/chosen": -14.231335639953613, "logits/rejected": -14.248939514160156, "logps/chosen": -1.3620965480804443, "logps/rejected": -1.409558892250061, "loss": 1.4355, "odds_ratio_loss": 0.734248697757721, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1362096518278122, "rewards/margins": 0.0047462377697229385, "rewards/rejected": -0.14095589518547058, "sft_loss": 1.3620965480804443, "step": 840 }, { "epoch": 1.3740149525156597, "grad_norm": 4.540154933929443, "learning_rate": 2.825492919315559e-06, "logits/chosen": -14.306146621704102, "logits/rejected": -14.476399421691895, "logps/chosen": -1.4043729305267334, "logps/rejected": -1.4499131441116333, "loss": 1.4789, "odds_ratio_loss": 0.7450671195983887, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1404372900724411, "rewards/margins": 0.004554024897515774, "rewards/rejected": -0.14499132335186005, "sft_loss": 1.4043729305267334, "step": 850 }, { "epoch": 1.3901798343099616, "grad_norm": 1.2316781282424927, "learning_rate": 2.7834441532542482e-06, "logits/chosen": -14.352537155151367, "logits/rejected": -14.446965217590332, "logps/chosen": -1.3581891059875488, "logps/rejected": -1.4636138677597046, "loss": 1.4297, "odds_ratio_loss": 0.7155886888504028, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.13581891357898712, "rewards/margins": 0.01054247748106718, "rewards/rejected": -0.14636139571666718, "sft_loss": 1.3581891059875488, "step": 860 }, { "epoch": 1.4063447161042635, "grad_norm": 0.915081799030304, "learning_rate": 2.74131399471945e-06, "logits/chosen": -14.232261657714844, "logits/rejected": -14.369558334350586, "logps/chosen": -1.4017927646636963, "logps/rejected": -1.4412128925323486, "loss": 1.4755, "odds_ratio_loss": 0.7375406622886658, "rewards/accuracies": 0.5, "rewards/chosen": -0.14017929136753082, "rewards/margins": 0.003942002542316914, "rewards/rejected": -0.14412127435207367, "sft_loss": 1.4017927646636963, "step": 870 }, { "epoch": 1.4225095978985653, "grad_norm": 1.1700351238250732, "learning_rate": 2.6991145416068947e-06, "logits/chosen": -14.184051513671875, "logits/rejected": -14.361761093139648, "logps/chosen": -1.3888486623764038, "logps/rejected": -1.3866727352142334, "loss": 1.4645, "odds_ratio_loss": 0.7568970918655396, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.13888487219810486, "rewards/margins": -0.00021760519302915782, "rewards/rejected": -0.1386672556400299, "sft_loss": 1.3888486623764038, "step": 880 }, { "epoch": 1.4386744796928672, "grad_norm": 0.7416606545448303, "learning_rate": 2.6568579117106143e-06, "logits/chosen": -14.222585678100586, "logits/rejected": -14.173550605773926, "logps/chosen": -1.321872591972351, "logps/rejected": -1.451570749282837, "loss": 1.3933, "odds_ratio_loss": 0.7138932943344116, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13218727707862854, "rewards/margins": 0.012969812378287315, "rewards/rejected": -0.1451570689678192, "sft_loss": 1.321872591972351, "step": 890 }, { "epoch": 1.454839361487169, "grad_norm": 0.7456266283988953, "learning_rate": 2.6145562392432544e-06, "logits/chosen": -14.201733589172363, "logits/rejected": -14.159896850585938, "logps/chosen": -1.371537446975708, "logps/rejected": -1.4001505374908447, "loss": 1.4466, "odds_ratio_loss": 0.7501237392425537, "rewards/accuracies": 0.5, "rewards/chosen": -0.137153759598732, "rewards/margins": 0.002861298155039549, "rewards/rejected": -0.14001503586769104, "sft_loss": 1.371537446975708, "step": 900 }, { "epoch": 1.471004243281471, "grad_norm": 1.7800395488739014, "learning_rate": 2.5722216713516682e-06, "logits/chosen": -14.122312545776367, "logits/rejected": -14.1841402053833, "logps/chosen": -1.2916905879974365, "logps/rejected": -1.3739659786224365, "loss": 1.3653, "odds_ratio_loss": 0.7365130186080933, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1291690617799759, "rewards/margins": 0.008227519690990448, "rewards/rejected": -0.13739657402038574, "sft_loss": 1.2916905879974365, "step": 910 }, { "epoch": 1.4871691250757728, "grad_norm": 3.366191864013672, "learning_rate": 2.5298663646288064e-06, "logits/chosen": -14.279853820800781, "logits/rejected": -14.313766479492188, "logps/chosen": -1.3366254568099976, "logps/rejected": -1.4743283987045288, "loss": 1.4084, "odds_ratio_loss": 0.7178291082382202, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13366253674030304, "rewards/margins": 0.013770299032330513, "rewards/rejected": -0.1474328488111496, "sft_loss": 1.3366254568099976, "step": 920 }, { "epoch": 1.503334006870075, "grad_norm": 1.793541431427002, "learning_rate": 2.487502481622879e-06, "logits/chosen": -14.228408813476562, "logits/rejected": -14.142854690551758, "logps/chosen": -1.3270151615142822, "logps/rejected": -1.4341893196105957, "loss": 1.3983, "odds_ratio_loss": 0.7129431366920471, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13270151615142822, "rewards/margins": 0.010717417113482952, "rewards/rejected": -0.14341893792152405, "sft_loss": 1.3270151615142822, "step": 930 }, { "epoch": 1.5194988886643768, "grad_norm": 2.546449661254883, "learning_rate": 2.4451421873448253e-06, "logits/chosen": -14.15150260925293, "logits/rejected": -14.336977005004883, "logps/chosen": -1.431612253189087, "logps/rejected": -1.4608542919158936, "loss": 1.508, "odds_ratio_loss": 0.7637500762939453, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.14316122233867645, "rewards/margins": 0.002924212021753192, "rewards/rejected": -0.1460854411125183, "sft_loss": 1.431612253189087, "step": 940 }, { "epoch": 1.5356637704586786, "grad_norm": 2.0193891525268555, "learning_rate": 2.40279764577506e-06, "logits/chosen": -14.358665466308594, "logits/rejected": -14.505513191223145, "logps/chosen": -1.403634786605835, "logps/rejected": -1.4488627910614014, "loss": 1.48, "odds_ratio_loss": 0.7633059620857239, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.14036348462104797, "rewards/margins": 0.004522812552750111, "rewards/rejected": -0.14488628506660461, "sft_loss": 1.403634786605835, "step": 950 }, { "epoch": 1.5518286522529805, "grad_norm": 1.2108488082885742, "learning_rate": 2.3604810163705242e-06, "logits/chosen": -14.17876148223877, "logits/rejected": -14.2489652633667, "logps/chosen": -1.306792140007019, "logps/rejected": -1.3910942077636719, "loss": 1.377, "odds_ratio_loss": 0.7023099660873413, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1306792050600052, "rewards/margins": 0.00843021459877491, "rewards/rejected": -0.13910941779613495, "sft_loss": 1.306792140007019, "step": 960 }, { "epoch": 1.5679935340472824, "grad_norm": 1.9210587739944458, "learning_rate": 2.3182044505730364e-06, "logits/chosen": -14.331990242004395, "logits/rejected": -14.305018424987793, "logps/chosen": -1.2632302045822144, "logps/rejected": -1.3584424257278442, "loss": 1.3349, "odds_ratio_loss": 0.7163167595863342, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.12632302939891815, "rewards/margins": 0.009521213360130787, "rewards/rejected": -0.13584424555301666, "sft_loss": 1.2632302045822144, "step": 970 }, { "epoch": 1.5841584158415842, "grad_norm": 1.7603510618209839, "learning_rate": 2.275980088319941e-06, "logits/chosen": -14.362065315246582, "logits/rejected": -14.22284984588623, "logps/chosen": -1.269855260848999, "logps/rejected": -1.3405383825302124, "loss": 1.3406, "odds_ratio_loss": 0.7074419260025024, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.12698553502559662, "rewards/margins": 0.0070683010853827, "rewards/rejected": -0.13405382633209229, "sft_loss": 1.269855260848999, "step": 980 }, { "epoch": 1.600323297635886, "grad_norm": 1.6920086145401, "learning_rate": 2.2338200545580577e-06, "logits/chosen": -14.224035263061523, "logits/rejected": -14.358423233032227, "logps/chosen": -1.2658283710479736, "logps/rejected": -1.4482189416885376, "loss": 1.3345, "odds_ratio_loss": 0.6871744990348816, "rewards/accuracies": 0.5, "rewards/chosen": -0.12658283114433289, "rewards/margins": 0.01823904737830162, "rewards/rejected": -0.1448218822479248, "sft_loss": 1.2658283710479736, "step": 990 }, { "epoch": 1.616488179430188, "grad_norm": 1.0991649627685547, "learning_rate": 2.191736455761947e-06, "logits/chosen": -14.324908256530762, "logits/rejected": -14.3560209274292, "logps/chosen": -1.2651708126068115, "logps/rejected": -1.290913701057434, "loss": 1.3401, "odds_ratio_loss": 0.749754786491394, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.12651710212230682, "rewards/margins": 0.0025742852594703436, "rewards/rejected": -0.12909138202667236, "sft_loss": 1.2651708126068115, "step": 1000 }, { "epoch": 1.616488179430188, "eval_logits/chosen": -14.268522262573242, "eval_logits/rejected": -14.308253288269043, "eval_logps/chosen": -1.3874938488006592, "eval_logps/rejected": -1.4423273801803589, "eval_loss": 1.4635207653045654, "eval_odds_ratio_loss": 0.7602682709693909, "eval_rewards/accuracies": 0.48363634943962097, "eval_rewards/chosen": -0.1387493908405304, "eval_rewards/margins": 0.00548336049541831, "eval_rewards/rejected": -0.14423276484012604, "eval_runtime": 207.8962, "eval_samples_per_second": 5.291, "eval_sft_loss": 1.3874938488006592, "eval_steps_per_second": 2.646, "step": 1000 }, { "epoch": 1.6326530612244898, "grad_norm": 0.9229074716567993, "learning_rate": 2.1497413764574673e-06, "logits/chosen": -14.391751289367676, "logits/rejected": -14.302392959594727, "logps/chosen": -1.4207522869110107, "logps/rejected": -1.4941614866256714, "loss": 1.4937, "odds_ratio_loss": 0.7297941446304321, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.14207521080970764, "rewards/margins": 0.007340931333601475, "rewards/rejected": -0.14941613376140594, "sft_loss": 1.4207522869110107, "step": 1010 }, { "epoch": 1.6488179430187917, "grad_norm": 1.2489970922470093, "learning_rate": 2.1078468757516395e-06, "logits/chosen": -14.41105842590332, "logits/rejected": -14.309954643249512, "logps/chosen": -1.3737413883209229, "logps/rejected": -1.331855297088623, "loss": 1.453, "odds_ratio_loss": 0.7925962805747986, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1373741328716278, "rewards/margins": -0.004188609775155783, "rewards/rejected": -0.1331855207681656, "sft_loss": 1.3737413883209229, "step": 1020 }, { "epoch": 1.6649828248130936, "grad_norm": 0.9103444814682007, "learning_rate": 2.0660649838698145e-06, "logits/chosen": -14.60859203338623, "logits/rejected": -14.583990097045898, "logps/chosen": -1.3282297849655151, "logps/rejected": -1.4166333675384521, "loss": 1.3999, "odds_ratio_loss": 0.7163518071174622, "rewards/accuracies": 0.4375, "rewards/chosen": -0.13282299041748047, "rewards/margins": 0.008840366266667843, "rewards/rejected": -0.1416633427143097, "sft_loss": 1.3282297849655151, "step": 1030 }, { "epoch": 1.6811477066073954, "grad_norm": 1.1333231925964355, "learning_rate": 2.0244076987011284e-06, "logits/chosen": -14.382695198059082, "logits/rejected": -14.247182846069336, "logps/chosen": -1.3871229887008667, "logps/rejected": -1.5080008506774902, "loss": 1.4558, "odds_ratio_loss": 0.68644779920578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1387123018503189, "rewards/margins": 0.012087779119610786, "rewards/rejected": -0.15080007910728455, "sft_loss": 1.3871229887008667, "step": 1040 }, { "epoch": 1.6973125884016973, "grad_norm": 1.302032709121704, "learning_rate": 1.982886982353251e-06, "logits/chosen": -14.392558097839355, "logits/rejected": -14.241909980773926, "logps/chosen": -1.3640697002410889, "logps/rejected": -1.5009006261825562, "loss": 1.4359, "odds_ratio_loss": 0.7178789377212524, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.13640697300434113, "rewards/margins": 0.013683101162314415, "rewards/rejected": -0.1500900685787201, "sft_loss": 1.3640697002410889, "step": 1050 }, { "epoch": 1.7134774701959992, "grad_norm": 1.7859091758728027, "learning_rate": 1.941514757717392e-06, "logits/chosen": -14.138816833496094, "logits/rejected": -14.210226058959961, "logps/chosen": -1.3156766891479492, "logps/rejected": -1.4917762279510498, "loss": 1.3807, "odds_ratio_loss": 0.6497665643692017, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13156768679618835, "rewards/margins": 0.01760994642972946, "rewards/rejected": -0.1491776406764984, "sft_loss": 1.3156766891479492, "step": 1060 }, { "epoch": 1.729642351990301, "grad_norm": 2.0628256797790527, "learning_rate": 1.9003029050445953e-06, "logits/chosen": -14.267855644226074, "logits/rejected": -14.399972915649414, "logps/chosen": -1.402465581893921, "logps/rejected": -1.4434514045715332, "loss": 1.4747, "odds_ratio_loss": 0.7224588990211487, "rewards/accuracies": 0.5, "rewards/chosen": -0.14024657011032104, "rewards/margins": 0.004098571836948395, "rewards/rejected": -0.14434513449668884, "sft_loss": 1.402465581893921, "step": 1070 }, { "epoch": 1.745807233784603, "grad_norm": 1.5042709112167358, "learning_rate": 1.8592632585342523e-06, "logits/chosen": -14.195714950561523, "logits/rejected": -14.285571098327637, "logps/chosen": -1.3312032222747803, "logps/rejected": -1.412341833114624, "loss": 1.4047, "odds_ratio_loss": 0.7354634404182434, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1331203281879425, "rewards/margins": 0.008113870397210121, "rewards/rejected": -0.14123418927192688, "sft_loss": 1.3312032222747803, "step": 1080 }, { "epoch": 1.7619721155789048, "grad_norm": 3.4297995567321777, "learning_rate": 1.8184076029358527e-06, "logits/chosen": -14.20643138885498, "logits/rejected": -14.019030570983887, "logps/chosen": -1.2683379650115967, "logps/rejected": -1.2236586809158325, "loss": 1.3443, "odds_ratio_loss": 0.7591326832771301, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.12683378159999847, "rewards/margins": -0.00446792459115386, "rewards/rejected": -0.12236586958169937, "sft_loss": 1.2683379650115967, "step": 1090 }, { "epoch": 1.7781369973732066, "grad_norm": 1.0218937397003174, "learning_rate": 1.7777476701649318e-06, "logits/chosen": -14.1577730178833, "logits/rejected": -14.125236511230469, "logps/chosen": -1.3477040529251099, "logps/rejected": -1.391446828842163, "loss": 1.4231, "odds_ratio_loss": 0.7540372610092163, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13477042317390442, "rewards/margins": 0.004374279640614986, "rewards/rejected": -0.1391446888446808, "sft_loss": 1.3477040529251099, "step": 1100 }, { "epoch": 1.7943018791675085, "grad_norm": 1.4984055757522583, "learning_rate": 1.7372951359341925e-06, "logits/chosen": -14.369695663452148, "logits/rejected": -14.277885437011719, "logps/chosen": -1.2875721454620361, "logps/rejected": -1.3878809213638306, "loss": 1.3577, "odds_ratio_loss": 0.7012876272201538, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.12875720858573914, "rewards/margins": 0.01003087218850851, "rewards/rejected": -0.13878807425498962, "sft_loss": 1.2875721454620361, "step": 1110 }, { "epoch": 1.8104667609618104, "grad_norm": 3.3275625705718994, "learning_rate": 1.6970616164007547e-06, "logits/chosen": -14.229268074035645, "logits/rejected": -14.10546875, "logps/chosen": -1.364091396331787, "logps/rejected": -1.3946739435195923, "loss": 1.4435, "odds_ratio_loss": 0.7942220568656921, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13640913367271423, "rewards/margins": 0.0030582635663449764, "rewards/rejected": -0.13946738839149475, "sft_loss": 1.364091396331787, "step": 1120 }, { "epoch": 1.8266316427561122, "grad_norm": 2.735656976699829, "learning_rate": 1.6570586648305276e-06, "logits/chosen": -14.143117904663086, "logits/rejected": -14.2241849899292, "logps/chosen": -1.344879150390625, "logps/rejected": -1.493446707725525, "loss": 1.4182, "odds_ratio_loss": 0.733532726764679, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13448792695999146, "rewards/margins": 0.014856770634651184, "rewards/rejected": -0.14934466779232025, "sft_loss": 1.344879150390625, "step": 1130 }, { "epoch": 1.842796524550414, "grad_norm": 1.1568862199783325, "learning_rate": 1.6172977682806151e-06, "logits/chosen": -14.38661003112793, "logits/rejected": -14.517931938171387, "logps/chosen": -1.3603746891021729, "logps/rejected": -1.5093238353729248, "loss": 1.4288, "odds_ratio_loss": 0.68376624584198, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.1360374540090561, "rewards/margins": 0.014894920401275158, "rewards/rejected": -0.15093238651752472, "sft_loss": 1.3603746891021729, "step": 1140 }, { "epoch": 1.858961406344716, "grad_norm": 1.1773515939712524, "learning_rate": 1.5777903443007586e-06, "logits/chosen": -14.423624992370605, "logits/rejected": -14.032621383666992, "logps/chosen": -1.387117624282837, "logps/rejected": -1.4605300426483154, "loss": 1.4607, "odds_ratio_loss": 0.7362414598464966, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13871176540851593, "rewards/margins": 0.007341254502534866, "rewards/rejected": -0.1460530012845993, "sft_loss": 1.387117624282837, "step": 1150 }, { "epoch": 1.8751262881390178, "grad_norm": 1.5692604780197144, "learning_rate": 1.5385477376547226e-06, "logits/chosen": -14.410656929016113, "logits/rejected": -14.352084159851074, "logps/chosen": -1.3973274230957031, "logps/rejected": -1.4963886737823486, "loss": 1.4675, "odds_ratio_loss": 0.7020548582077026, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1397327482700348, "rewards/margins": 0.009906120598316193, "rewards/rejected": -0.14963887631893158, "sft_loss": 1.3973274230957031, "step": 1160 }, { "epoch": 1.89129116993332, "grad_norm": 3.0858218669891357, "learning_rate": 1.4995812170625845e-06, "logits/chosen": -14.365419387817383, "logits/rejected": -14.341082572937012, "logps/chosen": -1.4526535272598267, "logps/rejected": -1.5791641473770142, "loss": 1.5265, "odds_ratio_loss": 0.7380681037902832, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1452653706073761, "rewards/margins": 0.012651054188609123, "rewards/rejected": -0.15791639685630798, "sft_loss": 1.4526535272598267, "step": 1170 }, { "epoch": 1.9074560517276218, "grad_norm": 2.4256625175476074, "learning_rate": 1.4609019719648666e-06, "logits/chosen": -14.359014511108398, "logits/rejected": -14.343942642211914, "logps/chosen": -1.365081787109375, "logps/rejected": -1.4730589389801025, "loss": 1.4336, "odds_ratio_loss": 0.685504138469696, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13650815188884735, "rewards/margins": 0.010797703638672829, "rewards/rejected": -0.14730587601661682, "sft_loss": 1.365081787109375, "step": 1180 }, { "epoch": 1.9236209335219236, "grad_norm": 2.2215967178344727, "learning_rate": 1.42252110930943e-06, "logits/chosen": -14.144754409790039, "logits/rejected": -14.116401672363281, "logps/chosen": -1.2247555255889893, "logps/rejected": -1.2106770277023315, "loss": 1.3031, "odds_ratio_loss": 0.7834988832473755, "rewards/accuracies": 0.46875, "rewards/chosen": -0.12247554957866669, "rewards/margins": -0.0014078498352319002, "rewards/rejected": -0.12106770277023315, "sft_loss": 1.2247555255889893, "step": 1190 }, { "epoch": 1.9397858153162255, "grad_norm": 1.6026244163513184, "learning_rate": 1.3844496503620493e-06, "logits/chosen": -14.315832138061523, "logits/rejected": -14.499916076660156, "logps/chosen": -1.4833340644836426, "logps/rejected": -1.521794080734253, "loss": 1.5547, "odds_ratio_loss": 0.7132872343063354, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14833340048789978, "rewards/margins": 0.0038460283540189266, "rewards/rejected": -0.15217943489551544, "sft_loss": 1.4833340644836426, "step": 1200 }, { "epoch": 1.9559506971105274, "grad_norm": 1.1467649936676025, "learning_rate": 1.3466985275416081e-06, "logits/chosen": -14.316365242004395, "logits/rejected": -14.039219856262207, "logps/chosen": -1.4100277423858643, "logps/rejected": -1.4868837594985962, "loss": 1.4848, "odds_ratio_loss": 0.7481211423873901, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14100277423858643, "rewards/margins": 0.00768560403957963, "rewards/rejected": -0.14868836104869843, "sft_loss": 1.4100277423858643, "step": 1210 }, { "epoch": 1.9721155789048292, "grad_norm": 1.3261767625808716, "learning_rate": 1.309278581280791e-06, "logits/chosen": -14.425065994262695, "logits/rejected": -14.19542121887207, "logps/chosen": -1.258156418800354, "logps/rejected": -1.3927624225616455, "loss": 1.3258, "odds_ratio_loss": 0.6761429309844971, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.12581565976142883, "rewards/margins": 0.013460601679980755, "rewards/rejected": -0.13927623629570007, "sft_loss": 1.258156418800354, "step": 1220 }, { "epoch": 1.9882804606991311, "grad_norm": 0.8793450593948364, "learning_rate": 1.272200556913199e-06, "logits/chosen": -14.331692695617676, "logits/rejected": -14.390342712402344, "logps/chosen": -1.2902759313583374, "logps/rejected": -1.398531198501587, "loss": 1.3633, "odds_ratio_loss": 0.7302906513214111, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1290276050567627, "rewards/margins": 0.01082551758736372, "rewards/rejected": -0.1398531198501587, "sft_loss": 1.2902759313583374, "step": 1230 }, { "epoch": 2.004445342493433, "grad_norm": 2.07963228225708, "learning_rate": 1.2354751015877698e-06, "logits/chosen": -14.254411697387695, "logits/rejected": -14.420768737792969, "logps/chosen": -1.2709214687347412, "logps/rejected": -1.4514631032943726, "loss": 1.3403, "odds_ratio_loss": 0.6936594247817993, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.1270921379327774, "rewards/margins": 0.018054189160466194, "rewards/rejected": -0.14514632523059845, "sft_loss": 1.2709214687347412, "step": 1240 }, { "epoch": 2.020610224287735, "grad_norm": 2.574068069458008, "learning_rate": 1.1991127612113945e-06, "logits/chosen": -14.361371040344238, "logits/rejected": -14.495355606079102, "logps/chosen": -1.3789875507354736, "logps/rejected": -1.5034908056259155, "loss": 1.4475, "odds_ratio_loss": 0.6847060322761536, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1378987580537796, "rewards/margins": 0.012450330890715122, "rewards/rejected": -0.15034906566143036, "sft_loss": 1.3789875507354736, "step": 1250 }, { "epoch": 2.036775106082037, "grad_norm": 1.4936628341674805, "learning_rate": 1.1631239774206035e-06, "logits/chosen": -14.19866943359375, "logits/rejected": -14.191067695617676, "logps/chosen": -1.347879409790039, "logps/rejected": -1.4048999547958374, "loss": 1.4251, "odds_ratio_loss": 0.7725744247436523, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1347879320383072, "rewards/margins": 0.005702070891857147, "rewards/rejected": -0.14049001038074493, "sft_loss": 1.347879409790039, "step": 1260 }, { "epoch": 2.052939987876339, "grad_norm": 1.7168585062026978, "learning_rate": 1.1275190845831978e-06, "logits/chosen": -14.3424711227417, "logits/rejected": -14.3289213180542, "logps/chosen": -1.3685007095336914, "logps/rejected": -1.4727340936660767, "loss": 1.4389, "odds_ratio_loss": 0.7035232782363892, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.13685005903244019, "rewards/margins": 0.010423343628644943, "rewards/rejected": -0.14727340638637543, "sft_loss": 1.3685007095336914, "step": 1270 }, { "epoch": 2.0691048696706407, "grad_norm": 1.1820368766784668, "learning_rate": 1.0923083068306778e-06, "logits/chosen": -14.398675918579102, "logits/rejected": -14.118631362915039, "logps/chosen": -1.2939175367355347, "logps/rejected": -1.473049283027649, "loss": 1.3601, "odds_ratio_loss": 0.662093997001648, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.12939175963401794, "rewards/margins": 0.017913173884153366, "rewards/rejected": -0.14730492234230042, "sft_loss": 1.2939175367355347, "step": 1280 }, { "epoch": 2.0852697514649425, "grad_norm": 1.1745166778564453, "learning_rate": 1.0575017551223348e-06, "logits/chosen": -14.3531494140625, "logits/rejected": -14.198529243469238, "logps/chosen": -1.2511951923370361, "logps/rejected": -1.3217878341674805, "loss": 1.3224, "odds_ratio_loss": 0.7121993899345398, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12511952221393585, "rewards/margins": 0.007059249095618725, "rewards/rejected": -0.13217875361442566, "sft_loss": 1.2511951923370361, "step": 1290 }, { "epoch": 2.1014346332592444, "grad_norm": 0.894344687461853, "learning_rate": 1.023109424341833e-06, "logits/chosen": -14.153393745422363, "logits/rejected": -14.245986938476562, "logps/chosen": -1.3667266368865967, "logps/rejected": -1.42815363407135, "loss": 1.4394, "odds_ratio_loss": 0.727142333984375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13667264580726624, "rewards/margins": 0.006142704281955957, "rewards/rejected": -0.14281536638736725, "sft_loss": 1.3667266368865967, "step": 1300 }, { "epoch": 2.1175995150535463, "grad_norm": 1.5093544721603394, "learning_rate": 9.891411904271273e-07, "logits/chosen": -14.242596626281738, "logits/rejected": -14.327380180358887, "logps/chosen": -1.3282233476638794, "logps/rejected": -1.3852262496948242, "loss": 1.4007, "odds_ratio_loss": 0.7251249551773071, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.13282233476638794, "rewards/margins": 0.005700295325368643, "rewards/rejected": -0.13852263987064362, "sft_loss": 1.3282233476638794, "step": 1310 }, { "epoch": 2.133764396847848, "grad_norm": 0.8299040198326111, "learning_rate": 9.556068075345363e-07, "logits/chosen": -14.465705871582031, "logits/rejected": -14.254651069641113, "logps/chosen": -1.2607736587524414, "logps/rejected": -1.3249403238296509, "loss": 1.3327, "odds_ratio_loss": 0.7195707559585571, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12607736885547638, "rewards/margins": 0.006416681222617626, "rewards/rejected": -0.13249404728412628, "sft_loss": 1.2607736587524414, "step": 1320 }, { "epoch": 2.14992927864215, "grad_norm": 1.5431737899780273, "learning_rate": 9.225159052377838e-07, "logits/chosen": -14.418218612670898, "logits/rejected": -14.442914009094238, "logps/chosen": -1.369145393371582, "logps/rejected": -1.4892218112945557, "loss": 1.4395, "odds_ratio_loss": 0.7034425735473633, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.13691455125808716, "rewards/margins": 0.012007640674710274, "rewards/rejected": -0.1489221751689911, "sft_loss": 1.369145393371582, "step": 1330 }, { "epoch": 2.166094160436452, "grad_norm": 2.125438928604126, "learning_rate": 8.898779857628184e-07, "logits/chosen": -14.263992309570312, "logits/rejected": -14.439204216003418, "logps/chosen": -1.2737493515014648, "logps/rejected": -1.307660698890686, "loss": 1.3488, "odds_ratio_loss": 0.7507684826850891, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.12737493216991425, "rewards/margins": 0.003391148056834936, "rewards/rejected": -0.13076607882976532, "sft_loss": 1.2737493515014648, "step": 1340 }, { "epoch": 2.1822590422307537, "grad_norm": 1.0558884143829346, "learning_rate": 8.577024212591975e-07, "logits/chosen": -14.523656845092773, "logits/rejected": -14.395648002624512, "logps/chosen": -1.3369591236114502, "logps/rejected": -1.402151346206665, "loss": 1.4081, "odds_ratio_loss": 0.7112525701522827, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13369593024253845, "rewards/margins": 0.006519217975437641, "rewards/rejected": -0.14021514356136322, "sft_loss": 1.3369591236114502, "step": 1350 }, { "epoch": 2.1984239240250556, "grad_norm": 1.1882685422897339, "learning_rate": 8.259984511088276e-07, "logits/chosen": -14.409403800964355, "logits/rejected": -14.405116081237793, "logps/chosen": -1.3154635429382324, "logps/rejected": -1.4095304012298584, "loss": 1.3863, "odds_ratio_loss": 0.7081496715545654, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13154636323451996, "rewards/margins": 0.009406678378582, "rewards/rejected": -0.14095303416252136, "sft_loss": 1.3154635429382324, "step": 1360 }, { "epoch": 2.2145888058193575, "grad_norm": 1.6390233039855957, "learning_rate": 7.947751792728237e-07, "logits/chosen": -14.409843444824219, "logits/rejected": -14.329424858093262, "logps/chosen": -1.3204478025436401, "logps/rejected": -1.4512555599212646, "loss": 1.3901, "odds_ratio_loss": 0.6965182423591614, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13204479217529297, "rewards/margins": 0.013080772943794727, "rewards/rejected": -0.14512555301189423, "sft_loss": 1.3204478025436401, "step": 1370 }, { "epoch": 2.2307536876136593, "grad_norm": 1.7825186252593994, "learning_rate": 7.640415716772626e-07, "logits/chosen": -14.333005905151367, "logits/rejected": -14.429731369018555, "logps/chosen": -1.3603641986846924, "logps/rejected": -1.4518425464630127, "loss": 1.4331, "odds_ratio_loss": 0.7270913124084473, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13603642582893372, "rewards/margins": 0.009147830307483673, "rewards/rejected": -0.1451842486858368, "sft_loss": 1.3603641986846924, "step": 1380 }, { "epoch": 2.246918569407961, "grad_norm": 1.125680685043335, "learning_rate": 7.338064536385722e-07, "logits/chosen": -14.394281387329102, "logits/rejected": -14.345739364624023, "logps/chosen": -1.3667652606964111, "logps/rejected": -1.5295965671539307, "loss": 1.435, "odds_ratio_loss": 0.6821550130844116, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13667652010917664, "rewards/margins": 0.016283124685287476, "rewards/rejected": -0.1529596596956253, "sft_loss": 1.3667652606964111, "step": 1390 }, { "epoch": 2.263083451202263, "grad_norm": 1.7544102668762207, "learning_rate": 7.040785073292883e-07, "logits/chosen": -14.237360000610352, "logits/rejected": -14.33959674835205, "logps/chosen": -1.4276225566864014, "logps/rejected": -1.4824755191802979, "loss": 1.5027, "odds_ratio_loss": 0.750755786895752, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14276224374771118, "rewards/margins": 0.00548530463129282, "rewards/rejected": -0.14824756979942322, "sft_loss": 1.4276225566864014, "step": 1400 }, { "epoch": 2.279248332996565, "grad_norm": 1.7468085289001465, "learning_rate": 6.748662692849297e-07, "logits/chosen": -14.5598726272583, "logits/rejected": -14.531698226928711, "logps/chosen": -1.3492968082427979, "logps/rejected": -1.4934823513031006, "loss": 1.4184, "odds_ratio_loss": 0.6912583112716675, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.13492968678474426, "rewards/margins": 0.014418545179069042, "rewards/rejected": -0.14934822916984558, "sft_loss": 1.3492968082427979, "step": 1410 }, { "epoch": 2.295413214790867, "grad_norm": 3.2176520824432373, "learning_rate": 6.46178127952686e-07, "logits/chosen": -14.288836479187012, "logits/rejected": -14.204765319824219, "logps/chosen": -1.299232840538025, "logps/rejected": -1.4280776977539062, "loss": 1.3673, "odds_ratio_loss": 0.6802908182144165, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.12992329895496368, "rewards/margins": 0.01288448553532362, "rewards/rejected": -0.1428077667951584, "sft_loss": 1.299232840538025, "step": 1420 }, { "epoch": 2.3115780965851687, "grad_norm": 2.5991835594177246, "learning_rate": 6.180223212826289e-07, "logits/chosen": -14.347335815429688, "logits/rejected": -14.187026977539062, "logps/chosen": -1.2904529571533203, "logps/rejected": -1.3600698709487915, "loss": 1.362, "odds_ratio_loss": 0.7157233953475952, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1290452927350998, "rewards/margins": 0.006961710751056671, "rewards/rejected": -0.13600699603557587, "sft_loss": 1.2904529571533203, "step": 1430 }, { "epoch": 2.3277429783794705, "grad_norm": 0.8683578968048096, "learning_rate": 5.904069343621443e-07, "logits/chosen": -14.465449333190918, "logits/rejected": -14.325057983398438, "logps/chosen": -1.299377202987671, "logps/rejected": -1.401989459991455, "loss": 1.3706, "odds_ratio_loss": 0.7122213244438171, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.12993772327899933, "rewards/margins": 0.010261224582791328, "rewards/rejected": -0.14019893109798431, "sft_loss": 1.299377202987671, "step": 1440 }, { "epoch": 2.3439078601737724, "grad_norm": 1.7288964986801147, "learning_rate": 5.633398970942544e-07, "logits/chosen": -14.3145170211792, "logits/rejected": -14.42223834991455, "logps/chosen": -1.2952549457550049, "logps/rejected": -1.3960306644439697, "loss": 1.3675, "odds_ratio_loss": 0.7228525876998901, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.12952548265457153, "rewards/margins": 0.010077586397528648, "rewards/rejected": -0.13960307836532593, "sft_loss": 1.2952549457550049, "step": 1450 }, { "epoch": 2.3600727419680743, "grad_norm": 1.8580021858215332, "learning_rate": 5.368289819205069e-07, "logits/chosen": -14.319725036621094, "logits/rejected": -14.285405158996582, "logps/chosen": -1.2445900440216064, "logps/rejected": -1.3483976125717163, "loss": 1.3139, "odds_ratio_loss": 0.6927712559700012, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12445902824401855, "rewards/margins": 0.010380755178630352, "rewards/rejected": -0.13483977317810059, "sft_loss": 1.2445900440216064, "step": 1460 }, { "epoch": 2.376237623762376, "grad_norm": 2.3416638374328613, "learning_rate": 5.108818015890785e-07, "logits/chosen": -14.468851089477539, "logits/rejected": -14.461502075195312, "logps/chosen": -1.3592495918273926, "logps/rejected": -1.4990885257720947, "loss": 1.4311, "odds_ratio_loss": 0.7181252241134644, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.13592496514320374, "rewards/margins": 0.013983884826302528, "rewards/rejected": -0.14990884065628052, "sft_loss": 1.3592495918273926, "step": 1470 }, { "epoch": 2.392402505556678, "grad_norm": 1.5794059038162231, "learning_rate": 4.855058069687291e-07, "logits/chosen": -14.158782958984375, "logits/rejected": -14.074625015258789, "logps/chosen": -1.324530839920044, "logps/rejected": -1.366247296333313, "loss": 1.3974, "odds_ratio_loss": 0.7290586233139038, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1324530839920044, "rewards/margins": 0.004171643406152725, "rewards/rejected": -0.13662473857402802, "sft_loss": 1.324530839920044, "step": 1480 }, { "epoch": 2.40856738735098, "grad_norm": 2.1180176734924316, "learning_rate": 4.607082849092523e-07, "logits/chosen": -14.219759941101074, "logits/rejected": -14.182577133178711, "logps/chosen": -1.4282917976379395, "logps/rejected": -1.4976496696472168, "loss": 1.5016, "odds_ratio_loss": 0.7326869368553162, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14282917976379395, "rewards/margins": 0.006935800425708294, "rewards/rejected": -0.14976496994495392, "sft_loss": 1.4282917976379395, "step": 1490 }, { "epoch": 2.4247322691452817, "grad_norm": 2.495347738265991, "learning_rate": 4.3649635614901405e-07, "logits/chosen": -14.16241455078125, "logits/rejected": -14.45665168762207, "logps/chosen": -1.3701971769332886, "logps/rejected": -1.3534958362579346, "loss": 1.446, "odds_ratio_loss": 0.7579734921455383, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.13701972365379333, "rewards/margins": -0.0016701335553079844, "rewards/rejected": -0.1353495866060257, "sft_loss": 1.3701971769332886, "step": 1500 }, { "epoch": 2.4247322691452817, "eval_logits/chosen": -14.27784252166748, "eval_logits/rejected": -14.317824363708496, "eval_logps/chosen": -1.372594952583313, "eval_logps/rejected": -1.4290432929992676, "eval_loss": 1.4484930038452148, "eval_odds_ratio_loss": 0.7589808702468872, "eval_rewards/accuracies": 0.4809090793132782, "eval_rewards/chosen": -0.13725949823856354, "eval_rewards/margins": 0.005644842050969601, "eval_rewards/rejected": -0.1429043412208557, "eval_runtime": 396.2162, "eval_samples_per_second": 2.776, "eval_sft_loss": 1.372594952583313, "eval_steps_per_second": 1.388, "step": 1500 }, { "epoch": 2.4408971509395836, "grad_norm": 1.8667449951171875, "learning_rate": 4.128769732701973e-07, "logits/chosen": -14.2674560546875, "logits/rejected": -14.17170524597168, "logps/chosen": -1.3341007232666016, "logps/rejected": -1.4468257427215576, "loss": 1.4053, "odds_ratio_loss": 0.7120139002799988, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13341006636619568, "rewards/margins": 0.011272510513663292, "rewards/rejected": -0.14468258619308472, "sft_loss": 1.3341007232666016, "step": 1510 }, { "epoch": 2.4570620327338855, "grad_norm": 2.940946102142334, "learning_rate": 3.8985691870233046e-07, "logits/chosen": -14.28807258605957, "logits/rejected": -14.214245796203613, "logps/chosen": -1.3024286031723022, "logps/rejected": -1.4218701124191284, "loss": 1.3737, "odds_ratio_loss": 0.712692379951477, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.13024285435676575, "rewards/margins": 0.011944140307605267, "rewards/rejected": -0.1421869993209839, "sft_loss": 1.3024286031723022, "step": 1520 }, { "epoch": 2.4732269145281873, "grad_norm": 2.6948108673095703, "learning_rate": 3.6744280277467904e-07, "logits/chosen": -14.425226211547852, "logits/rejected": -14.381690979003906, "logps/chosen": -1.4246366024017334, "logps/rejected": -1.426334023475647, "loss": 1.5046, "odds_ratio_loss": 0.7999409437179565, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.14246365427970886, "rewards/margins": 0.00016971743025351316, "rewards/rejected": -0.14263339340686798, "sft_loss": 1.4246366024017334, "step": 1530 }, { "epoch": 2.489391796322489, "grad_norm": 1.6409363746643066, "learning_rate": 3.456410618180503e-07, "logits/chosen": -13.974553108215332, "logits/rejected": -14.2942533493042, "logps/chosen": -1.2257071733474731, "logps/rejected": -1.43178391456604, "loss": 1.2927, "odds_ratio_loss": 0.6698334217071533, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1225707158446312, "rewards/margins": 0.020607685670256615, "rewards/rejected": -0.14317841827869415, "sft_loss": 1.2257071733474731, "step": 1540 }, { "epoch": 2.5055566781167915, "grad_norm": 1.3992644548416138, "learning_rate": 3.244579563165753e-07, "logits/chosen": -14.36426830291748, "logits/rejected": -14.48327922821045, "logps/chosen": -1.2957897186279297, "logps/rejected": -1.4375650882720947, "loss": 1.3673, "odds_ratio_loss": 0.7152336239814758, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12957896292209625, "rewards/margins": 0.014177536591887474, "rewards/rejected": -0.14375647902488708, "sft_loss": 1.2957897186279297, "step": 1550 }, { "epoch": 2.521721559911093, "grad_norm": 0.9756754636764526, "learning_rate": 3.038995691099697e-07, "logits/chosen": -14.465911865234375, "logits/rejected": -14.273321151733398, "logps/chosen": -1.3624980449676514, "logps/rejected": -1.5072979927062988, "loss": 1.4344, "odds_ratio_loss": 0.7189978361129761, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13624981045722961, "rewards/margins": 0.014479981735348701, "rewards/rejected": -0.15072980523109436, "sft_loss": 1.3624980449676514, "step": 1560 }, { "epoch": 2.5378864417053952, "grad_norm": 2.6390867233276367, "learning_rate": 2.839718036468192e-07, "logits/chosen": -14.324618339538574, "logits/rejected": -14.362611770629883, "logps/chosen": -1.4562547206878662, "logps/rejected": -1.4829699993133545, "loss": 1.5307, "odds_ratio_loss": 0.7442874312400818, "rewards/accuracies": 0.5, "rewards/chosen": -0.14562548696994781, "rewards/margins": 0.0026715078856796026, "rewards/rejected": -0.1482969969511032, "sft_loss": 1.4562547206878662, "step": 1570 }, { "epoch": 2.5540513234996967, "grad_norm": 1.9648209810256958, "learning_rate": 2.646803822893723e-07, "logits/chosen": -14.38152027130127, "logits/rejected": -14.392126083374023, "logps/chosen": -1.4547812938690186, "logps/rejected": -1.4928423166275024, "loss": 1.5325, "odds_ratio_loss": 0.7773637175559998, "rewards/accuracies": 0.5, "rewards/chosen": -0.14547815918922424, "rewards/margins": 0.003806093242019415, "rewards/rejected": -0.1492842435836792, "sft_loss": 1.4547812938690186, "step": 1580 }, { "epoch": 2.570216205293999, "grad_norm": 1.1905252933502197, "learning_rate": 2.460308446703341e-07, "logits/chosen": -14.339777946472168, "logits/rejected": -14.1979398727417, "logps/chosen": -1.3354339599609375, "logps/rejected": -1.348439335823059, "loss": 1.4097, "odds_ratio_loss": 0.7425277829170227, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13354340195655823, "rewards/margins": 0.0013005301589146256, "rewards/rejected": -0.13484393060207367, "sft_loss": 1.3354339599609375, "step": 1590 }, { "epoch": 2.5863810870883004, "grad_norm": 4.711751461029053, "learning_rate": 2.2802854610213143e-07, "logits/chosen": -14.302705764770508, "logits/rejected": -14.19762134552002, "logps/chosen": -1.3138768672943115, "logps/rejected": -1.4147188663482666, "loss": 1.3864, "odds_ratio_loss": 0.7257053256034851, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13138769567012787, "rewards/margins": 0.010084209032356739, "rewards/rejected": -0.14147189259529114, "sft_loss": 1.3138768672943115, "step": 1600 }, { "epoch": 2.6025459688826027, "grad_norm": 4.042973518371582, "learning_rate": 2.106786560391072e-07, "logits/chosen": -14.2058744430542, "logits/rejected": -14.269085884094238, "logps/chosen": -1.3923499584197998, "logps/rejected": -1.3771612644195557, "loss": 1.4698, "odds_ratio_loss": 0.7747048139572144, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1392349898815155, "rewards/margins": -0.0015188835095614195, "rewards/rejected": -0.1377161294221878, "sft_loss": 1.3923499584197998, "step": 1610 }, { "epoch": 2.6187108506769046, "grad_norm": 1.3606544733047485, "learning_rate": 1.9398615659308255e-07, "logits/chosen": -14.2599515914917, "logits/rejected": -14.334997177124023, "logps/chosen": -1.3270127773284912, "logps/rejected": -1.3853967189788818, "loss": 1.3982, "odds_ratio_loss": 0.7119258046150208, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.13270129263401031, "rewards/margins": 0.005838391836732626, "rewards/rejected": -0.13853967189788818, "sft_loss": 1.3270127773284912, "step": 1620 }, { "epoch": 2.6348757324712064, "grad_norm": 1.4494473934173584, "learning_rate": 1.7795584110272184e-07, "logits/chosen": -14.470367431640625, "logits/rejected": -14.478838920593262, "logps/chosen": -1.3744457960128784, "logps/rejected": -1.4546699523925781, "loss": 1.4475, "odds_ratio_loss": 0.730518639087677, "rewards/accuracies": 0.46875, "rewards/chosen": -0.13744458556175232, "rewards/margins": 0.008022413589060307, "rewards/rejected": -0.14546698331832886, "sft_loss": 1.3744457960128784, "step": 1630 }, { "epoch": 2.6510406142655083, "grad_norm": 2.888951539993286, "learning_rate": 1.6259231275709636e-07, "logits/chosen": -14.41100788116455, "logits/rejected": -14.428006172180176, "logps/chosen": -1.3241318464279175, "logps/rejected": -1.318234920501709, "loss": 1.4028, "odds_ratio_loss": 0.7864112257957458, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13241317868232727, "rewards/margins": -0.000589700706768781, "rewards/rejected": -0.13182349503040314, "sft_loss": 1.3241318464279175, "step": 1640 }, { "epoch": 2.66720549605981, "grad_norm": 1.5565133094787598, "learning_rate": 1.478999832738548e-07, "logits/chosen": -14.382177352905273, "logits/rejected": -14.320945739746094, "logps/chosen": -1.297300934791565, "logps/rejected": -1.4187005758285522, "loss": 1.368, "odds_ratio_loss": 0.7067518830299377, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.12973010540008545, "rewards/margins": 0.01213997695595026, "rewards/rejected": -0.14187008142471313, "sft_loss": 1.297300934791565, "step": 1650 }, { "epoch": 2.683370377854112, "grad_norm": 2.0713951587677, "learning_rate": 1.338830716323769e-07, "logits/chosen": -14.337793350219727, "logits/rejected": -14.350440979003906, "logps/chosen": -1.3087949752807617, "logps/rejected": -1.350098967552185, "loss": 1.383, "odds_ratio_loss": 0.7419986724853516, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1308794915676117, "rewards/margins": 0.004130417015403509, "rewards/rejected": -0.13500989973545074, "sft_loss": 1.3087949752807617, "step": 1660 }, { "epoch": 2.699535259648414, "grad_norm": 2.8654770851135254, "learning_rate": 1.205456028622723e-07, "logits/chosen": -14.387499809265137, "logits/rejected": -14.384310722351074, "logps/chosen": -1.2575846910476685, "logps/rejected": -1.4380841255187988, "loss": 1.3249, "odds_ratio_loss": 0.6730828285217285, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12575848400592804, "rewards/margins": 0.018049929291009903, "rewards/rejected": -0.14380840957164764, "sft_loss": 1.2575846910476685, "step": 1670 }, { "epoch": 2.7157001414427158, "grad_norm": 2.644263505935669, "learning_rate": 1.0789140688756805e-07, "logits/chosen": -14.564410209655762, "logits/rejected": -14.484796524047852, "logps/chosen": -1.331872582435608, "logps/rejected": -1.4917659759521484, "loss": 1.3983, "odds_ratio_loss": 0.6643630862236023, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.13318723440170288, "rewards/margins": 0.015989361330866814, "rewards/rejected": -0.14917659759521484, "sft_loss": 1.331872582435608, "step": 1680 }, { "epoch": 2.7318650232370176, "grad_norm": 1.8434594869613647, "learning_rate": 9.592411742693098e-08, "logits/chosen": -14.349563598632812, "logits/rejected": -14.297950744628906, "logps/chosen": -1.284172773361206, "logps/rejected": -1.3313789367675781, "loss": 1.3598, "odds_ratio_loss": 0.7563740611076355, "rewards/accuracies": 0.46875, "rewards/chosen": -0.12841728329658508, "rewards/margins": 0.004720622207969427, "rewards/rejected": -0.13313789665699005, "sft_loss": 1.284172773361206, "step": 1690 }, { "epoch": 2.7480299050313195, "grad_norm": 0.9198280572891235, "learning_rate": 8.464717095022168e-08, "logits/chosen": -14.535560607910156, "logits/rejected": -14.29857349395752, "logps/chosen": -1.291333794593811, "logps/rejected": -1.4038417339324951, "loss": 1.3626, "odds_ratio_loss": 0.7129305601119995, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12913337349891663, "rewards/margins": 0.011250784620642662, "rewards/rejected": -0.14038416743278503, "sft_loss": 1.291333794593811, "step": 1700 }, { "epoch": 2.7641947868256214, "grad_norm": 1.85430908203125, "learning_rate": 7.406380569169841e-08, "logits/chosen": -14.304112434387207, "logits/rejected": -14.291776657104492, "logps/chosen": -1.3815504312515259, "logps/rejected": -1.3685299158096313, "loss": 1.4574, "odds_ratio_loss": 0.7585769891738892, "rewards/accuracies": 0.46875, "rewards/chosen": -0.13815505802631378, "rewards/margins": -0.0013020627666264772, "rewards/rejected": -0.13685297966003418, "sft_loss": 1.3815504312515259, "step": 1710 }, { "epoch": 2.7803596686199232, "grad_norm": 7.879937171936035, "learning_rate": 6.417706072013808e-08, "logits/chosen": -14.357699394226074, "logits/rejected": -14.520744323730469, "logps/chosen": -1.4151430130004883, "logps/rejected": -1.4842795133590698, "loss": 1.4887, "odds_ratio_loss": 0.7356118559837341, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14151428639888763, "rewards/margins": 0.006913675460964441, "rewards/rejected": -0.14842796325683594, "sft_loss": 1.4151430130004883, "step": 1720 }, { "epoch": 2.796524550414225, "grad_norm": 2.3623361587524414, "learning_rate": 5.498977506615294e-08, "logits/chosen": -14.438512802124023, "logits/rejected": -14.370248794555664, "logps/chosen": -1.4021018743515015, "logps/rejected": -1.3835337162017822, "loss": 1.4818, "odds_ratio_loss": 0.796977698802948, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.14021018147468567, "rewards/margins": -0.0018568048253655434, "rewards/rejected": -0.1383533775806427, "sft_loss": 1.4021018743515015, "step": 1730 }, { "epoch": 2.812689432208527, "grad_norm": 1.0650444030761719, "learning_rate": 4.6504586906947756e-08, "logits/chosen": -14.35010051727295, "logits/rejected": -14.401901245117188, "logps/chosen": -1.3507376909255981, "logps/rejected": -1.4280903339385986, "loss": 1.4204, "odds_ratio_loss": 0.6963773369789124, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13507376611232758, "rewards/margins": 0.007735266350209713, "rewards/rejected": -0.14280903339385986, "sft_loss": 1.3507376909255981, "step": 1740 }, { "epoch": 2.828854314002829, "grad_norm": 5.588193893432617, "learning_rate": 3.8723932808754914e-08, "logits/chosen": -14.620956420898438, "logits/rejected": -14.591873168945312, "logps/chosen": -1.4141243696212769, "logps/rejected": -1.4447482824325562, "loss": 1.4888, "odds_ratio_loss": 0.7466815710067749, "rewards/accuracies": 0.4375, "rewards/chosen": -0.1414124220609665, "rewards/margins": 0.003062391420826316, "rewards/rejected": -0.1444748193025589, "sft_loss": 1.4141243696212769, "step": 1750 }, { "epoch": 2.8450191957971307, "grad_norm": 2.8461813926696777, "learning_rate": 3.1650047027158014e-08, "logits/chosen": -14.406710624694824, "logits/rejected": -14.431941032409668, "logps/chosen": -1.3235969543457031, "logps/rejected": -1.378565788269043, "loss": 1.3941, "odds_ratio_loss": 0.7055075764656067, "rewards/accuracies": 0.5, "rewards/chosen": -0.13235969841480255, "rewards/margins": 0.005496888421475887, "rewards/rejected": -0.13785657286643982, "sft_loss": 1.3235969543457031, "step": 1760 }, { "epoch": 2.8611840775914326, "grad_norm": 1.4648724794387817, "learning_rate": 2.5284960865517848e-08, "logits/chosen": -14.247715950012207, "logits/rejected": -14.30573844909668, "logps/chosen": -1.2652337551116943, "logps/rejected": -1.3874812126159668, "loss": 1.3373, "odds_ratio_loss": 0.7210808992385864, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.12652337551116943, "rewards/margins": 0.012224750593304634, "rewards/rejected": -0.13874812424182892, "sft_loss": 1.2652337551116943, "step": 1770 }, { "epoch": 2.8773489593857344, "grad_norm": 1.2711795568466187, "learning_rate": 1.9630502091670388e-08, "logits/chosen": -14.345422744750977, "logits/rejected": -14.210649490356445, "logps/chosen": -1.3347010612487793, "logps/rejected": -1.4864898920059204, "loss": 1.4034, "odds_ratio_loss": 0.686531126499176, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.13347011804580688, "rewards/margins": 0.015178876928985119, "rewards/rejected": -0.14864897727966309, "sft_loss": 1.3347010612487793, "step": 1780 }, { "epoch": 2.8935138411800363, "grad_norm": 4.285287857055664, "learning_rate": 1.4688294413074677e-08, "logits/chosen": -14.240816116333008, "logits/rejected": -14.293863296508789, "logps/chosen": -1.2230440378189087, "logps/rejected": -1.3717424869537354, "loss": 1.2918, "odds_ratio_loss": 0.6871523857116699, "rewards/accuracies": 0.5, "rewards/chosen": -0.12230439484119415, "rewards/margins": 0.014869834296405315, "rewards/rejected": -0.13717423379421234, "sft_loss": 1.2230440378189087, "step": 1790 }, { "epoch": 2.909678722974338, "grad_norm": 1.111965298652649, "learning_rate": 1.0459757010556626e-08, "logits/chosen": -14.294512748718262, "logits/rejected": -14.2905912399292, "logps/chosen": -1.3162596225738525, "logps/rejected": -1.357807993888855, "loss": 1.3902, "odds_ratio_loss": 0.7398349046707153, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1316259652376175, "rewards/margins": 0.0041548521257936954, "rewards/rejected": -0.13578079640865326, "sft_loss": 1.3162596225738525, "step": 1800 }, { "epoch": 2.92584360476864, "grad_norm": 1.985671043395996, "learning_rate": 6.94610413078306e-09, "logits/chosen": -14.099322319030762, "logits/rejected": -14.289319038391113, "logps/chosen": -1.3942023515701294, "logps/rejected": -1.5463578701019287, "loss": 1.4669, "odds_ratio_loss": 0.7267955541610718, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13942024111747742, "rewards/margins": 0.015215557999908924, "rewards/rejected": -0.15463578701019287, "sft_loss": 1.3942023515701294, "step": 1810 }, { "epoch": 2.942008486562942, "grad_norm": 1.1975542306900024, "learning_rate": 4.14834473758563e-09, "logits/chosen": -14.166104316711426, "logits/rejected": -14.219152450561523, "logps/chosen": -1.2467665672302246, "logps/rejected": -1.3985602855682373, "loss": 1.3162, "odds_ratio_loss": 0.6939627528190613, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1246766597032547, "rewards/margins": 0.015179386362433434, "rewards/rejected": -0.13985604047775269, "sft_loss": 1.2467665672302246, "step": 1820 }, { "epoch": 2.9581733683572438, "grad_norm": 1.3036004304885864, "learning_rate": 2.067282222230349e-09, "logits/chosen": -14.375224113464355, "logits/rejected": -14.571484565734863, "logps/chosen": -1.326818585395813, "logps/rejected": -1.477850317955017, "loss": 1.3957, "odds_ratio_loss": 0.6886210441589355, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13268187642097473, "rewards/margins": 0.015103173442184925, "rewards/rejected": -0.1477850377559662, "sft_loss": 1.326818585395813, "step": 1830 }, { "epoch": 2.9743382501515456, "grad_norm": 6.394278049468994, "learning_rate": 7.035141727212979e-10, "logits/chosen": -14.3215913772583, "logits/rejected": -14.438852310180664, "logps/chosen": -1.256394386291504, "logps/rejected": -1.3541960716247559, "loss": 1.3287, "odds_ratio_loss": 0.7228869199752808, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1256394237279892, "rewards/margins": 0.009780170395970345, "rewards/rejected": -0.13541960716247559, "sft_loss": 1.256394386291504, "step": 1840 }, { "epoch": 2.9905031319458475, "grad_norm": 2.8705546855926514, "learning_rate": 5.743220219761592e-11, "logits/chosen": -14.366948127746582, "logits/rejected": -14.415715217590332, "logps/chosen": -1.3598301410675049, "logps/rejected": -1.40765380859375, "loss": 1.4375, "odds_ratio_loss": 0.7764675617218018, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13598300516605377, "rewards/margins": 0.004782381001859903, "rewards/rejected": -0.14076539874076843, "sft_loss": 1.3598301410675049, "step": 1850 }, { "epoch": 2.9969690846635686, "step": 1854, "total_flos": 1.9131711497471508e+18, "train_loss": 1.4823461713142765, "train_runtime": 22122.5243, "train_samples_per_second": 1.342, "train_steps_per_second": 0.084 } ], "logging_steps": 10, "max_steps": 1854, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.9131711497471508e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }