Falcon-7B-Instruct-ORPO-SALT / trainer_state.json
chchen's picture
End of training
f556a40 verified
{
"best_metric": 1.4484930038452148,
"best_model_checkpoint": "saves/Falcon-7B-Instruct/lora/orpo-salt/checkpoint-1500",
"epoch": 2.9969690846635686,
"eval_steps": 500,
"global_step": 1854,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01616488179430188,
"grad_norm": 0.5467122793197632,
"learning_rate": 4.999648198770648e-06,
"logits/chosen": -14.078092575073242,
"logits/rejected": -14.159353256225586,
"logps/chosen": -1.7583353519439697,
"logps/rejected": -1.8469493389129639,
"loss": 1.8299,
"odds_ratio_loss": 0.7155797481536865,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.17583352327346802,
"rewards/margins": 0.008861413225531578,
"rewards/rejected": -0.18469493091106415,
"sft_loss": 1.7583353519439697,
"step": 10
},
{
"epoch": 0.03232976358860376,
"grad_norm": 0.495731920003891,
"learning_rate": 4.998578646361359e-06,
"logits/chosen": -14.073513984680176,
"logits/rejected": -14.144752502441406,
"logps/chosen": -1.9236218929290771,
"logps/rejected": -1.9451425075531006,
"loss": 2.0003,
"odds_ratio_loss": 0.766566812992096,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1923622190952301,
"rewards/margins": 0.002152049448341131,
"rewards/rejected": -0.19451424479484558,
"sft_loss": 1.9236218929290771,
"step": 20
},
{
"epoch": 0.04849464538290564,
"grad_norm": 0.6057537198066711,
"learning_rate": 4.996791614004449e-06,
"logits/chosen": -14.302851676940918,
"logits/rejected": -14.224812507629395,
"logps/chosen": -1.8387420177459717,
"logps/rejected": -1.910175085067749,
"loss": 1.9128,
"odds_ratio_loss": 0.7409650087356567,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1838742196559906,
"rewards/margins": 0.007143297698348761,
"rewards/rejected": -0.1910175085067749,
"sft_loss": 1.8387420177459717,
"step": 30
},
{
"epoch": 0.06465952717720752,
"grad_norm": 0.5634093284606934,
"learning_rate": 4.994287614855618e-06,
"logits/chosen": -14.0798921585083,
"logits/rejected": -14.19922161102295,
"logps/chosen": -1.947654366493225,
"logps/rejected": -1.9009010791778564,
"loss": 2.0298,
"odds_ratio_loss": 0.8212669491767883,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.19476543366909027,
"rewards/margins": -0.004675320815294981,
"rewards/rejected": -0.1900901347398758,
"sft_loss": 1.947654366493225,
"step": 40
},
{
"epoch": 0.0808244089715094,
"grad_norm": 0.7957186698913574,
"learning_rate": 4.991067367951343e-06,
"logits/chosen": -14.371423721313477,
"logits/rejected": -14.266546249389648,
"logps/chosen": -2.017087697982788,
"logps/rejected": -2.0035624504089355,
"loss": 2.0958,
"odds_ratio_loss": 0.7871265411376953,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.20170876383781433,
"rewards/margins": -0.0013525458052754402,
"rewards/rejected": -0.20035621523857117,
"sft_loss": 2.017087697982788,
"step": 50
},
{
"epoch": 0.09698929076581128,
"grad_norm": 0.5418820381164551,
"learning_rate": 4.987131798002389e-06,
"logits/chosen": -14.21721076965332,
"logits/rejected": -14.099153518676758,
"logps/chosen": -1.8751760721206665,
"logps/rejected": -1.8855310678482056,
"loss": 1.9577,
"odds_ratio_loss": 0.8254929780960083,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.18751761317253113,
"rewards/margins": 0.001035516383126378,
"rewards/rejected": -0.188553124666214,
"sft_loss": 1.8751760721206665,
"step": 60
},
{
"epoch": 0.11315417256011315,
"grad_norm": 1.0633864402770996,
"learning_rate": 4.982482035128285e-06,
"logits/chosen": -14.105901718139648,
"logits/rejected": -14.193835258483887,
"logps/chosen": -2.0220446586608887,
"logps/rejected": -1.9594541788101196,
"loss": 2.1089,
"odds_ratio_loss": 0.8683654069900513,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.20220446586608887,
"rewards/margins": -0.00625905767083168,
"rewards/rejected": -0.19594541192054749,
"sft_loss": 2.0220446586608887,
"step": 70
},
{
"epoch": 0.12931905435441504,
"grad_norm": 1.0158140659332275,
"learning_rate": 4.9771194145328e-06,
"logits/chosen": -14.075093269348145,
"logits/rejected": -14.02421760559082,
"logps/chosen": -1.6751682758331299,
"logps/rejected": -1.7500627040863037,
"loss": 1.7468,
"odds_ratio_loss": 0.716758668422699,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.167516827583313,
"rewards/margins": 0.00748945539817214,
"rewards/rejected": -0.17500628530979156,
"sft_loss": 1.6751682758331299,
"step": 80
},
{
"epoch": 0.1454839361487169,
"grad_norm": 1.3243364095687866,
"learning_rate": 4.971045476120532e-06,
"logits/chosen": -14.14300537109375,
"logits/rejected": -14.079290390014648,
"logps/chosen": -1.8245623111724854,
"logps/rejected": -1.760660171508789,
"loss": 1.9067,
"odds_ratio_loss": 0.8211291432380676,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.18245622515678406,
"rewards/margins": -0.006390226539224386,
"rewards/rejected": -0.17606601119041443,
"sft_loss": 1.8245623111724854,
"step": 90
},
{
"epoch": 0.1616488179430188,
"grad_norm": 0.7163342237472534,
"learning_rate": 4.964261964054713e-06,
"logits/chosen": -14.068964958190918,
"logits/rejected": -14.082951545715332,
"logps/chosen": -1.7527011632919312,
"logps/rejected": -1.8138408660888672,
"loss": 1.8297,
"odds_ratio_loss": 0.7703070044517517,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.17527012526988983,
"rewards/margins": 0.0061139510944485664,
"rewards/rejected": -0.18138407170772552,
"sft_loss": 1.7527011632919312,
"step": 100
},
{
"epoch": 0.17781369973732067,
"grad_norm": 1.006773829460144,
"learning_rate": 4.956770826256372e-06,
"logits/chosen": -14.166906356811523,
"logits/rejected": -14.120782852172852,
"logps/chosen": -1.7077207565307617,
"logps/rejected": -1.7365996837615967,
"loss": 1.7844,
"odds_ratio_loss": 0.7667573690414429,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.17077207565307617,
"rewards/margins": 0.002887908834964037,
"rewards/rejected": -0.17365998029708862,
"sft_loss": 1.7077207565307617,
"step": 110
},
{
"epoch": 0.19397858153162256,
"grad_norm": 0.8139289617538452,
"learning_rate": 4.94857421384497e-06,
"logits/chosen": -14.175407409667969,
"logits/rejected": -14.165875434875488,
"logps/chosen": -1.692577600479126,
"logps/rejected": -1.8239320516586304,
"loss": 1.7682,
"odds_ratio_loss": 0.7562084794044495,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1692577451467514,
"rewards/margins": 0.013135453686118126,
"rewards/rejected": -0.18239320814609528,
"sft_loss": 1.692577600479126,
"step": 120
},
{
"epoch": 0.21014346332592443,
"grad_norm": 1.0950274467468262,
"learning_rate": 4.939674480520701e-06,
"logits/chosen": -14.055421829223633,
"logits/rejected": -14.265202522277832,
"logps/chosen": -1.65860915184021,
"logps/rejected": -1.6671603918075562,
"loss": 1.7352,
"odds_ratio_loss": 0.7663736939430237,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.16586092114448547,
"rewards/margins": 0.0008551125647500157,
"rewards/rejected": -0.16671602427959442,
"sft_loss": 1.65860915184021,
"step": 130
},
{
"epoch": 0.2263083451202263,
"grad_norm": 0.6190826892852783,
"learning_rate": 4.930074181888613e-06,
"logits/chosen": -14.116220474243164,
"logits/rejected": -14.158090591430664,
"logps/chosen": -1.7475076913833618,
"logps/rejected": -1.736114501953125,
"loss": 1.8234,
"odds_ratio_loss": 0.7589074373245239,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.17475078999996185,
"rewards/margins": -0.0011393536115065217,
"rewards/rejected": -0.17361143231391907,
"sft_loss": 1.7475076913833618,
"step": 140
},
{
"epoch": 0.2424732269145282,
"grad_norm": 0.8096482157707214,
"learning_rate": 4.91977607472475e-06,
"logits/chosen": -14.182394027709961,
"logits/rejected": -14.252290725708008,
"logps/chosen": -1.6399564743041992,
"logps/rejected": -1.6184114217758179,
"loss": 1.7178,
"odds_ratio_loss": 0.778221607208252,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.1639956533908844,
"rewards/margins": -0.0021545083727687597,
"rewards/rejected": -0.16184113919734955,
"sft_loss": 1.6399564743041992,
"step": 150
},
{
"epoch": 0.2586381087088301,
"grad_norm": 1.5372618436813354,
"learning_rate": 4.908783116184534e-06,
"logits/chosen": -14.110807418823242,
"logits/rejected": -14.087692260742188,
"logps/chosen": -1.613721489906311,
"logps/rejected": -1.7073653936386108,
"loss": 1.6837,
"odds_ratio_loss": 0.6995801329612732,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1613721400499344,
"rewards/margins": 0.009364412166178226,
"rewards/rejected": -0.17073655128479004,
"sft_loss": 1.613721489906311,
"step": 160
},
{
"epoch": 0.27480299050313195,
"grad_norm": 1.0400787591934204,
"learning_rate": 4.897098462953598e-06,
"logits/chosen": -14.309249877929688,
"logits/rejected": -14.144041061401367,
"logps/chosen": -1.572377324104309,
"logps/rejected": -1.679239273071289,
"loss": 1.6438,
"odds_ratio_loss": 0.7143967747688293,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.15723773837089539,
"rewards/margins": 0.010686198249459267,
"rewards/rejected": -0.1679239273071289,
"sft_loss": 1.572377324104309,
"step": 170
},
{
"epoch": 0.2909678722974338,
"grad_norm": 0.6752244234085083,
"learning_rate": 4.884725470341331e-06,
"logits/chosen": -14.362325668334961,
"logits/rejected": -14.368985176086426,
"logps/chosen": -1.5275907516479492,
"logps/rejected": -1.6322838068008423,
"loss": 1.5969,
"odds_ratio_loss": 0.6928091645240784,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.15275909006595612,
"rewards/margins": 0.01046929694712162,
"rewards/rejected": -0.1632283627986908,
"sft_loss": 1.5275907516479492,
"step": 180
},
{
"epoch": 0.3071327540917357,
"grad_norm": 1.5551739931106567,
"learning_rate": 4.871667691317377e-06,
"logits/chosen": -14.23143196105957,
"logits/rejected": -14.168081283569336,
"logps/chosen": -1.5617109537124634,
"logps/rejected": -1.516629934310913,
"loss": 1.6442,
"odds_ratio_loss": 0.8246932029724121,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.15617111325263977,
"rewards/margins": -0.004508105106651783,
"rewards/rejected": -0.15166299045085907,
"sft_loss": 1.5617109537124634,
"step": 190
},
{
"epoch": 0.3232976358860376,
"grad_norm": 4.873908519744873,
"learning_rate": 4.857928875491392e-06,
"logits/chosen": -14.317342758178711,
"logits/rejected": -14.135493278503418,
"logps/chosen": -1.4843647480010986,
"logps/rejected": -1.5346746444702148,
"loss": 1.5575,
"odds_ratio_loss": 0.7314870953559875,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.14843648672103882,
"rewards/margins": 0.005030992440879345,
"rewards/rejected": -0.15346747636795044,
"sft_loss": 1.4843647480010986,
"step": 200
},
{
"epoch": 0.33946251768033947,
"grad_norm": 1.1008872985839844,
"learning_rate": 4.843512968036314e-06,
"logits/chosen": -13.899968147277832,
"logits/rejected": -13.980463027954102,
"logps/chosen": -1.4831616878509521,
"logps/rejected": -1.464994192123413,
"loss": 1.5606,
"odds_ratio_loss": 0.7743188738822937,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1483161747455597,
"rewards/margins": -0.0018167542293667793,
"rewards/rejected": -0.1464994251728058,
"sft_loss": 1.4831616878509521,
"step": 210
},
{
"epoch": 0.35562739947464134,
"grad_norm": 2.111262083053589,
"learning_rate": 4.828424108555486e-06,
"logits/chosen": -14.277219772338867,
"logits/rejected": -14.1966552734375,
"logps/chosen": -1.5998783111572266,
"logps/rejected": -1.7076078653335571,
"loss": 1.6726,
"odds_ratio_loss": 0.727408230304718,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.15998782217502594,
"rewards/margins": 0.010772952809929848,
"rewards/rejected": -0.17076078057289124,
"sft_loss": 1.5998783111572266,
"step": 220
},
{
"epoch": 0.3717922812689432,
"grad_norm": 0.6497421264648438,
"learning_rate": 4.812666629893957e-06,
"logits/chosen": -14.255824089050293,
"logits/rejected": -14.233850479125977,
"logps/chosen": -1.5216138362884521,
"logps/rejected": -1.4904725551605225,
"loss": 1.599,
"odds_ratio_loss": 0.7741049528121948,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.1521613895893097,
"rewards/margins": -0.0031141184736043215,
"rewards/rejected": -0.14904727041721344,
"sft_loss": 1.5216138362884521,
"step": 230
},
{
"epoch": 0.3879571630632451,
"grad_norm": 1.4030089378356934,
"learning_rate": 4.796245056894273e-06,
"logits/chosen": -13.990198135375977,
"logits/rejected": -14.032785415649414,
"logps/chosen": -1.5593761205673218,
"logps/rejected": -1.5817941427230835,
"loss": 1.6382,
"odds_ratio_loss": 0.7885618805885315,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.15593759715557098,
"rewards/margins": 0.0022418068256229162,
"rewards/rejected": -0.1581794172525406,
"sft_loss": 1.5593761205673218,
"step": 240
},
{
"epoch": 0.404122044857547,
"grad_norm": 1.03659987449646,
"learning_rate": 4.779164105097148e-06,
"logits/chosen": -14.23992919921875,
"logits/rejected": -14.331039428710938,
"logps/chosen": -1.4630193710327148,
"logps/rejected": -1.6595561504364014,
"loss": 1.5308,
"odds_ratio_loss": 0.6777212023735046,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.14630195498466492,
"rewards/margins": 0.019653689116239548,
"rewards/rejected": -0.16595561802387238,
"sft_loss": 1.4630193710327148,
"step": 250
},
{
"epoch": 0.42028692665184886,
"grad_norm": 1.1558053493499756,
"learning_rate": 4.761428679387373e-06,
"logits/chosen": -14.19200611114502,
"logits/rejected": -14.27843189239502,
"logps/chosen": -1.4934606552124023,
"logps/rejected": -1.5448919534683228,
"loss": 1.5664,
"odds_ratio_loss": 0.7296234369277954,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14934605360031128,
"rewards/margins": 0.005143154412508011,
"rewards/rejected": -0.154489204287529,
"sft_loss": 1.4934606552124023,
"step": 260
},
{
"epoch": 0.4364518084461507,
"grad_norm": 1.3478955030441284,
"learning_rate": 4.7430438725853515e-06,
"logits/chosen": -14.099308967590332,
"logits/rejected": -14.247446060180664,
"logps/chosen": -1.5219833850860596,
"logps/rejected": -1.7108709812164307,
"loss": 1.5916,
"odds_ratio_loss": 0.6957148313522339,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.15219834446907043,
"rewards/margins": 0.01888876222074032,
"rewards/rejected": -0.1710870862007141,
"sft_loss": 1.5219833850860596,
"step": 270
},
{
"epoch": 0.4526166902404526,
"grad_norm": 1.0543924570083618,
"learning_rate": 4.724014963984669e-06,
"logits/chosen": -14.321874618530273,
"logits/rejected": -14.308130264282227,
"logps/chosen": -1.4753090143203735,
"logps/rejected": -1.6179271936416626,
"loss": 1.5473,
"odds_ratio_loss": 0.7201633453369141,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14753088355064392,
"rewards/margins": 0.014261829666793346,
"rewards/rejected": -0.16179272532463074,
"sft_loss": 1.4753090143203735,
"step": 280
},
{
"epoch": 0.4687815720347545,
"grad_norm": 1.6008622646331787,
"learning_rate": 4.704347417836116e-06,
"logits/chosen": -14.192815780639648,
"logits/rejected": -14.182914733886719,
"logps/chosen": -1.373263955116272,
"logps/rejected": -1.4777114391326904,
"loss": 1.4462,
"odds_ratio_loss": 0.7295758128166199,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13732638955116272,
"rewards/margins": 0.010444764979183674,
"rewards/rejected": -0.14777114987373352,
"sft_loss": 1.373263955116272,
"step": 290
},
{
"epoch": 0.4849464538290564,
"grad_norm": 1.0440045595169067,
"learning_rate": 4.684046881778603e-06,
"logits/chosen": -13.9605131149292,
"logits/rejected": -14.021821975708008,
"logps/chosen": -1.3839852809906006,
"logps/rejected": -1.4472886323928833,
"loss": 1.456,
"odds_ratio_loss": 0.719718337059021,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.13839852809906006,
"rewards/margins": 0.006330335047096014,
"rewards/rejected": -0.1447288691997528,
"sft_loss": 1.3839852809906006,
"step": 300
},
{
"epoch": 0.5011113356233583,
"grad_norm": 0.8026280999183655,
"learning_rate": 4.663119185217409e-06,
"logits/chosen": -14.247451782226562,
"logits/rejected": -14.332074165344238,
"logps/chosen": -1.4372491836547852,
"logps/rejected": -1.5869617462158203,
"loss": 1.5057,
"odds_ratio_loss": 0.684893012046814,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.1437249332666397,
"rewards/margins": 0.014971258118748665,
"rewards/rejected": -0.15869615972042084,
"sft_loss": 1.4372491836547852,
"step": 310
},
{
"epoch": 0.5172762174176602,
"grad_norm": 1.054210901260376,
"learning_rate": 4.641570337650232e-06,
"logits/chosen": -14.101099967956543,
"logits/rejected": -14.234477043151855,
"logps/chosen": -1.3175721168518066,
"logps/rejected": -1.46291184425354,
"loss": 1.3866,
"odds_ratio_loss": 0.6904350519180298,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1317571997642517,
"rewards/margins": 0.014533978886902332,
"rewards/rejected": -0.14629118144512177,
"sft_loss": 1.3175721168518066,
"step": 320
},
{
"epoch": 0.533441099211962,
"grad_norm": 1.6171979904174805,
"learning_rate": 4.61940652694154e-06,
"logits/chosen": -14.107089042663574,
"logits/rejected": -14.126917839050293,
"logps/chosen": -1.5025255680084229,
"logps/rejected": -1.4795392751693726,
"loss": 1.5835,
"odds_ratio_loss": 0.8096711039543152,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.1502525359392166,
"rewards/margins": -0.0022986275143921375,
"rewards/rejected": -0.14795391261577606,
"sft_loss": 1.5025255680084229,
"step": 330
},
{
"epoch": 0.5496059810062639,
"grad_norm": 1.2122093439102173,
"learning_rate": 4.596634117545689e-06,
"logits/chosen": -14.346307754516602,
"logits/rejected": -14.166845321655273,
"logps/chosen": -1.5319068431854248,
"logps/rejected": -1.624324083328247,
"loss": 1.6054,
"odds_ratio_loss": 0.735165536403656,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.15319068729877472,
"rewards/margins": 0.00924170482903719,
"rewards/rejected": -0.16243240237236023,
"sft_loss": 1.5319068431854248,
"step": 340
},
{
"epoch": 0.5657708628005658,
"grad_norm": 0.899023175239563,
"learning_rate": 4.573259648679335e-06,
"logits/chosen": -14.317461013793945,
"logits/rejected": -14.103338241577148,
"logps/chosen": -1.47697114944458,
"logps/rejected": -1.648705244064331,
"loss": 1.546,
"odds_ratio_loss": 0.6902921199798584,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14769712090492249,
"rewards/margins": 0.017173420637845993,
"rewards/rejected": -0.16487054526805878,
"sft_loss": 1.47697114944458,
"step": 350
},
{
"epoch": 0.5819357445948676,
"grad_norm": 2.3687381744384766,
"learning_rate": 4.549289832443663e-06,
"logits/chosen": -14.142545700073242,
"logits/rejected": -14.211145401000977,
"logps/chosen": -1.4514472484588623,
"logps/rejected": -1.5542781352996826,
"loss": 1.5233,
"odds_ratio_loss": 0.7186037302017212,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1451447308063507,
"rewards/margins": 0.010283084586262703,
"rewards/rejected": -0.15542782843112946,
"sft_loss": 1.4514472484588623,
"step": 360
},
{
"epoch": 0.5981006263891695,
"grad_norm": 1.039651870727539,
"learning_rate": 4.524731551896978e-06,
"logits/chosen": -14.117040634155273,
"logits/rejected": -14.164260864257812,
"logps/chosen": -1.3633731603622437,
"logps/rejected": -1.4127264022827148,
"loss": 1.4381,
"odds_ratio_loss": 0.7473303079605103,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.13633732497692108,
"rewards/margins": 0.004935313947498798,
"rewards/rejected": -0.1412726640701294,
"sft_loss": 1.3633731603622437,
"step": 370
},
{
"epoch": 0.6142655081834714,
"grad_norm": 2.077622413635254,
"learning_rate": 4.4995918590781925e-06,
"logits/chosen": -14.212381362915039,
"logits/rejected": -14.251853942871094,
"logps/chosen": -1.3631454706192017,
"logps/rejected": -1.4832844734191895,
"loss": 1.437,
"odds_ratio_loss": 0.7388315200805664,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1363145411014557,
"rewards/margins": 0.012013902887701988,
"rewards/rejected": -0.14832845330238342,
"sft_loss": 1.3631454706192017,
"step": 380
},
{
"epoch": 0.6304303899777733,
"grad_norm": 0.6616309881210327,
"learning_rate": 4.473877972981797e-06,
"logits/chosen": -14.166543960571289,
"logits/rejected": -14.008458137512207,
"logps/chosen": -1.414536476135254,
"logps/rejected": -1.5125486850738525,
"loss": 1.4849,
"odds_ratio_loss": 0.7040683031082153,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.14145365357398987,
"rewards/margins": 0.009801235981285572,
"rewards/rejected": -0.15125489234924316,
"sft_loss": 1.414536476135254,
"step": 390
},
{
"epoch": 0.6465952717720752,
"grad_norm": 1.2422401905059814,
"learning_rate": 4.447597277484894e-06,
"logits/chosen": -14.10089111328125,
"logits/rejected": -14.177225112915039,
"logps/chosen": -1.3244436979293823,
"logps/rejected": -1.434922456741333,
"loss": 1.3936,
"odds_ratio_loss": 0.6911473274230957,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.132444366812706,
"rewards/margins": 0.011047879233956337,
"rewards/rejected": -0.14349225163459778,
"sft_loss": 1.3244436979293823,
"step": 400
},
{
"epoch": 0.6627601535663771,
"grad_norm": 1.3308875560760498,
"learning_rate": 4.42075731922687e-06,
"logits/chosen": -14.254026412963867,
"logits/rejected": -14.150421142578125,
"logps/chosen": -1.4931491613388062,
"logps/rejected": -1.5233150720596313,
"loss": 1.5684,
"odds_ratio_loss": 0.7521846890449524,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.14931491017341614,
"rewards/margins": 0.0030165952630341053,
"rewards/rejected": -0.15233151614665985,
"sft_loss": 1.4931491613388062,
"step": 410
},
{
"epoch": 0.6789250353606789,
"grad_norm": 1.4143937826156616,
"learning_rate": 4.3933658054423465e-06,
"logits/chosen": -14.156329154968262,
"logits/rejected": -14.047518730163574,
"logps/chosen": -1.338627576828003,
"logps/rejected": -1.4370090961456299,
"loss": 1.4095,
"odds_ratio_loss": 0.70883709192276,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.13386276364326477,
"rewards/margins": 0.009838144294917583,
"rewards/rejected": -0.14370091259479523,
"sft_loss": 1.338627576828003,
"step": 420
},
{
"epoch": 0.6950899171549808,
"grad_norm": 2.3574774265289307,
"learning_rate": 4.365430601748003e-06,
"logits/chosen": -14.235176086425781,
"logits/rejected": -14.395864486694336,
"logps/chosen": -1.564626932144165,
"logps/rejected": -1.5344398021697998,
"loss": 1.6431,
"odds_ratio_loss": 0.7849880456924438,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.15646269917488098,
"rewards/margins": -0.0030187165830284357,
"rewards/rejected": -0.15344397723674774,
"sft_loss": 1.564626932144165,
"step": 430
},
{
"epoch": 0.7112547989492827,
"grad_norm": 3.739943504333496,
"learning_rate": 4.336959729883925e-06,
"logits/chosen": -14.274754524230957,
"logits/rejected": -14.191232681274414,
"logps/chosen": -1.3745372295379639,
"logps/rejected": -1.405700445175171,
"loss": 1.4506,
"odds_ratio_loss": 0.7607132196426392,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.13745373487472534,
"rewards/margins": 0.0031163152307271957,
"rewards/rejected": -0.1405700445175171,
"sft_loss": 1.3745372295379639,
"step": 440
},
{
"epoch": 0.7274196807435845,
"grad_norm": 0.9312599301338196,
"learning_rate": 4.307961365410118e-06,
"logits/chosen": -14.044285774230957,
"logits/rejected": -14.011823654174805,
"logps/chosen": -1.4385414123535156,
"logps/rejected": -1.4718294143676758,
"loss": 1.5134,
"odds_ratio_loss": 0.7482468485832214,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14385412633419037,
"rewards/margins": 0.003328789724037051,
"rewards/rejected": -0.14718294143676758,
"sft_loss": 1.4385414123535156,
"step": 450
},
{
"epoch": 0.7435845625378864,
"grad_norm": 1.4249197244644165,
"learning_rate": 4.278443835358854e-06,
"logits/chosen": -14.115106582641602,
"logits/rejected": -14.075739860534668,
"logps/chosen": -1.3712975978851318,
"logps/rejected": -1.5527522563934326,
"loss": 1.4406,
"odds_ratio_loss": 0.6929912567138672,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1371297538280487,
"rewards/margins": 0.018145468086004257,
"rewards/rejected": -0.15527524054050446,
"sft_loss": 1.3712975978851318,
"step": 460
},
{
"epoch": 0.7597494443321883,
"grad_norm": 1.1615644693374634,
"learning_rate": 4.248415615843523e-06,
"logits/chosen": -14.288152694702148,
"logits/rejected": -14.206695556640625,
"logps/chosen": -1.4021141529083252,
"logps/rejected": -1.416723370552063,
"loss": 1.4775,
"odds_ratio_loss": 0.7538274526596069,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.14021141827106476,
"rewards/margins": 0.001460921368561685,
"rewards/rejected": -0.14167232811450958,
"sft_loss": 1.4021141529083252,
"step": 470
},
{
"epoch": 0.7759143261264903,
"grad_norm": 1.276267409324646,
"learning_rate": 4.217885329624666e-06,
"logits/chosen": -14.302003860473633,
"logits/rejected": -14.307230949401855,
"logps/chosen": -1.346254587173462,
"logps/rejected": -1.4862271547317505,
"loss": 1.4137,
"odds_ratio_loss": 0.6745720505714417,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.13462546467781067,
"rewards/margins": 0.013997259549796581,
"rewards/rejected": -0.14862270653247833,
"sft_loss": 1.346254587173462,
"step": 480
},
{
"epoch": 0.7920792079207921,
"grad_norm": 1.6030430793762207,
"learning_rate": 4.186861743633911e-06,
"logits/chosen": -14.13404369354248,
"logits/rejected": -14.251507759094238,
"logps/chosen": -1.4151580333709717,
"logps/rejected": -1.5721826553344727,
"loss": 1.4904,
"odds_ratio_loss": 0.7523505091667175,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1415158212184906,
"rewards/margins": 0.015702461823821068,
"rewards/rejected": -0.15721826255321503,
"sft_loss": 1.4151580333709717,
"step": 490
},
{
"epoch": 0.808244089715094,
"grad_norm": 1.7222312688827515,
"learning_rate": 4.155353766456497e-06,
"logits/chosen": -14.4000825881958,
"logits/rejected": -14.304115295410156,
"logps/chosen": -1.433506727218628,
"logps/rejected": -1.535611867904663,
"loss": 1.5005,
"odds_ratio_loss": 0.6703948378562927,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14335067570209503,
"rewards/margins": 0.010210518725216389,
"rewards/rejected": -0.15356118977069855,
"sft_loss": 1.433506727218628,
"step": 500
},
{
"epoch": 0.808244089715094,
"eval_logits/chosen": -14.227585792541504,
"eval_logits/rejected": -14.265686988830566,
"eval_logps/chosen": -1.4436272382736206,
"eval_logps/rejected": -1.4898087978363037,
"eval_loss": 1.5202080011367798,
"eval_odds_ratio_loss": 0.7658076882362366,
"eval_rewards/accuracies": 0.48181816935539246,
"eval_rewards/chosen": -0.1443627029657364,
"eval_rewards/margins": 0.004618145525455475,
"eval_rewards/rejected": -0.14898087084293365,
"eval_runtime": 207.676,
"eval_samples_per_second": 5.297,
"eval_sft_loss": 1.4436272382736206,
"eval_steps_per_second": 2.648,
"step": 500
},
{
"epoch": 0.8244089715093958,
"grad_norm": 1.143004059791565,
"learning_rate": 4.123370445773134e-06,
"logits/chosen": -14.356025695800781,
"logits/rejected": -14.339376449584961,
"logps/chosen": -1.4154841899871826,
"logps/rejected": -1.4348183870315552,
"loss": 1.4927,
"odds_ratio_loss": 0.7723585963249207,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14154842495918274,
"rewards/margins": 0.001933417865075171,
"rewards/rejected": -0.14348182082176208,
"sft_loss": 1.4154841899871826,
"step": 510
},
{
"epoch": 0.8405738533036977,
"grad_norm": 3.6751832962036133,
"learning_rate": 4.090920965761906e-06,
"logits/chosen": -14.4230375289917,
"logits/rejected": -14.330423355102539,
"logps/chosen": -1.4806926250457764,
"logps/rejected": -1.4873076677322388,
"loss": 1.559,
"odds_ratio_loss": 0.7833209037780762,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.14806927740573883,
"rewards/margins": 0.0006614929297938943,
"rewards/rejected": -0.14873075485229492,
"sft_loss": 1.4806926250457764,
"step": 520
},
{
"epoch": 0.8567387350979996,
"grad_norm": 4.592033386230469,
"learning_rate": 4.058014644460991e-06,
"logits/chosen": -14.309356689453125,
"logits/rejected": -14.266693115234375,
"logps/chosen": -1.4232040643692017,
"logps/rejected": -1.4629483222961426,
"loss": 1.4967,
"odds_ratio_loss": 0.7350074052810669,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1423204094171524,
"rewards/margins": 0.003974422812461853,
"rewards/rejected": -0.14629481732845306,
"sft_loss": 1.4232040643692017,
"step": 530
},
{
"epoch": 0.8729036168923014,
"grad_norm": 1.3515141010284424,
"learning_rate": 4.024660931092939e-06,
"logits/chosen": -14.12739086151123,
"logits/rejected": -14.135973930358887,
"logps/chosen": -1.4027074575424194,
"logps/rejected": -1.5116406679153442,
"loss": 1.4748,
"odds_ratio_loss": 0.7212173938751221,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14027073979377747,
"rewards/margins": 0.010893313214182854,
"rewards/rejected": -0.15116406977176666,
"sft_loss": 1.4027074575424194,
"step": 540
},
{
"epoch": 0.8890684986866033,
"grad_norm": 3.3689217567443848,
"learning_rate": 3.990869403351272e-06,
"logits/chosen": -14.354001998901367,
"logits/rejected": -14.225595474243164,
"logps/chosen": -1.4652130603790283,
"logps/rejected": -1.552912712097168,
"loss": 1.5359,
"odds_ratio_loss": 0.7067934274673462,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.14652130007743835,
"rewards/margins": 0.008769966661930084,
"rewards/rejected": -0.15529127418994904,
"sft_loss": 1.4652130603790283,
"step": 550
},
{
"epoch": 0.9052333804809052,
"grad_norm": 1.5204488039016724,
"learning_rate": 3.956649764650206e-06,
"logits/chosen": -14.487988471984863,
"logits/rejected": -14.507904052734375,
"logps/chosen": -1.4564487934112549,
"logps/rejected": -1.5203144550323486,
"loss": 1.5325,
"odds_ratio_loss": 0.7608081102371216,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.1456448882818222,
"rewards/margins": 0.006386570632457733,
"rewards/rejected": -0.15203145146369934,
"sft_loss": 1.4564487934112549,
"step": 560
},
{
"epoch": 0.9213982622752072,
"grad_norm": 2.2319583892822266,
"learning_rate": 3.92201184133826e-06,
"logits/chosen": -14.393239974975586,
"logits/rejected": -14.3502779006958,
"logps/chosen": -1.3946270942687988,
"logps/rejected": -1.444805383682251,
"loss": 1.4679,
"odds_ratio_loss": 0.7322729229927063,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13946272432804108,
"rewards/margins": 0.005017831921577454,
"rewards/rejected": -0.14448055624961853,
"sft_loss": 1.3946270942687988,
"step": 570
},
{
"epoch": 0.937563144069509,
"grad_norm": 1.4617536067962646,
"learning_rate": 3.886965579876572e-06,
"logits/chosen": -14.353238105773926,
"logits/rejected": -14.260797500610352,
"logps/chosen": -1.3793189525604248,
"logps/rejected": -1.445691704750061,
"loss": 1.4501,
"odds_ratio_loss": 0.7080078125,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13793189823627472,
"rewards/margins": 0.006637275218963623,
"rewards/rejected": -0.14456915855407715,
"sft_loss": 1.3793189525604248,
"step": 580
},
{
"epoch": 0.9537280258638109,
"grad_norm": 1.2430846691131592,
"learning_rate": 3.851521043982716e-06,
"logits/chosen": -14.31140422821045,
"logits/rejected": -14.404243469238281,
"logps/chosen": -1.424002766609192,
"logps/rejected": -1.4054510593414307,
"loss": 1.4998,
"odds_ratio_loss": 0.7578663229942322,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14240026473999023,
"rewards/margins": -0.0018551532411947846,
"rewards/rejected": -0.14054511487483978,
"sft_loss": 1.424002766609192,
"step": 590
},
{
"epoch": 0.9698929076581128,
"grad_norm": 1.5072684288024902,
"learning_rate": 3.81568841174086e-06,
"logits/chosen": -14.169085502624512,
"logits/rejected": -14.1954345703125,
"logps/chosen": -1.4412424564361572,
"logps/rejected": -1.4657504558563232,
"loss": 1.5191,
"odds_ratio_loss": 0.7788038849830627,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14412423968315125,
"rewards/margins": 0.002450800035148859,
"rewards/rejected": -0.14657504856586456,
"sft_loss": 1.4412424564361572,
"step": 600
},
{
"epoch": 0.9860577894524146,
"grad_norm": 1.2968331575393677,
"learning_rate": 3.7794779726790664e-06,
"logits/chosen": -14.130575180053711,
"logits/rejected": -14.240781784057617,
"logps/chosen": -1.3836543560028076,
"logps/rejected": -1.457695722579956,
"loss": 1.4561,
"odds_ratio_loss": 0.7247332334518433,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.13836543262004852,
"rewards/margins": 0.007404146250337362,
"rewards/rejected": -0.14576958119869232,
"sft_loss": 1.3836543560028076,
"step": 610
},
{
"epoch": 1.0022226712467166,
"grad_norm": 4.868699550628662,
"learning_rate": 3.7429001248146096e-06,
"logits/chosen": -14.240348815917969,
"logits/rejected": -14.297922134399414,
"logps/chosen": -1.4243017435073853,
"logps/rejected": -1.5530868768692017,
"loss": 1.4924,
"odds_ratio_loss": 0.680776059627533,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1424301713705063,
"rewards/margins": 0.012878507375717163,
"rewards/rejected": -0.15530869364738464,
"sft_loss": 1.4243017435073853,
"step": 620
},
{
"epoch": 1.0183875530410185,
"grad_norm": 0.8127214312553406,
"learning_rate": 3.7059653716681227e-06,
"logits/chosen": -14.380844116210938,
"logits/rejected": -14.255830764770508,
"logps/chosen": -1.4107029438018799,
"logps/rejected": -1.521928071975708,
"loss": 1.4861,
"odds_ratio_loss": 0.7541464567184448,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14107032120227814,
"rewards/margins": 0.01112250704318285,
"rewards/rejected": -0.15219281613826752,
"sft_loss": 1.4107029438018799,
"step": 630
},
{
"epoch": 1.0345524348353203,
"grad_norm": 3.8503897190093994,
"learning_rate": 3.668684319247463e-06,
"logits/chosen": -14.447845458984375,
"logits/rejected": -14.433076858520508,
"logps/chosen": -1.367375135421753,
"logps/rejected": -1.548612356185913,
"loss": 1.4348,
"odds_ratio_loss": 0.6741297841072083,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.13673751056194305,
"rewards/margins": 0.018123725429177284,
"rewards/rejected": -0.1548612415790558,
"sft_loss": 1.367375135421753,
"step": 640
},
{
"epoch": 1.0507173166296222,
"grad_norm": 0.9416384100914001,
"learning_rate": 3.6310676730021373e-06,
"logits/chosen": -14.3724946975708,
"logits/rejected": -14.455398559570312,
"logps/chosen": -1.3245970010757446,
"logps/rejected": -1.3460277318954468,
"loss": 1.3979,
"odds_ratio_loss": 0.7330806255340576,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13245970010757446,
"rewards/margins": 0.002143078250810504,
"rewards/rejected": -0.13460277020931244,
"sft_loss": 1.3245970010757446,
"step": 650
},
{
"epoch": 1.066882198423924,
"grad_norm": 2.8321056365966797,
"learning_rate": 3.593126234749178e-06,
"logits/chosen": -14.317327499389648,
"logits/rejected": -14.38727855682373,
"logps/chosen": -1.423680067062378,
"logps/rejected": -1.4616180658340454,
"loss": 1.4976,
"odds_ratio_loss": 0.739305853843689,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14236800372600555,
"rewards/margins": 0.0037938044406473637,
"rewards/rejected": -0.14616182446479797,
"sft_loss": 1.423680067062378,
"step": 660
},
{
"epoch": 1.083047080218226,
"grad_norm": 0.9518349766731262,
"learning_rate": 3.554870899571343e-06,
"logits/chosen": -14.144752502441406,
"logits/rejected": -14.251813888549805,
"logps/chosen": -1.4052397012710571,
"logps/rejected": -1.5265625715255737,
"loss": 1.4767,
"odds_ratio_loss": 0.7148950695991516,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1405239850282669,
"rewards/margins": 0.012132286094129086,
"rewards/rejected": -0.15265627205371857,
"sft_loss": 1.4052397012710571,
"step": 670
},
{
"epoch": 1.0992119620125278,
"grad_norm": 3.0823421478271484,
"learning_rate": 3.5163126526885373e-06,
"logits/chosen": -14.263737678527832,
"logits/rejected": -14.341888427734375,
"logps/chosen": -1.3758028745651245,
"logps/rejected": -1.4713342189788818,
"loss": 1.4506,
"odds_ratio_loss": 0.748176097869873,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1375802755355835,
"rewards/margins": 0.009553151205182076,
"rewards/rejected": -0.14713343977928162,
"sft_loss": 1.3758028745651245,
"step": 680
},
{
"epoch": 1.1153768438068297,
"grad_norm": 1.1957412958145142,
"learning_rate": 3.4774625663033484e-06,
"logits/chosen": -14.262721061706543,
"logits/rejected": -14.248212814331055,
"logps/chosen": -1.4033539295196533,
"logps/rejected": -1.4489859342575073,
"loss": 1.4783,
"odds_ratio_loss": 0.7493518590927124,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14033538103103638,
"rewards/margins": 0.004563204478472471,
"rewards/rejected": -0.14489860832691193,
"sft_loss": 1.4033539295196533,
"step": 690
},
{
"epoch": 1.1315417256011315,
"grad_norm": 1.0352710485458374,
"learning_rate": 3.4383317964216067e-06,
"logits/chosen": -14.168815612792969,
"logits/rejected": -14.324069023132324,
"logps/chosen": -1.3365106582641602,
"logps/rejected": -1.3756332397460938,
"loss": 1.4108,
"odds_ratio_loss": 0.7429829835891724,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.13365106284618378,
"rewards/margins": 0.0039122505113482475,
"rewards/rejected": -0.1375633180141449,
"sft_loss": 1.3365106582641602,
"step": 700
},
{
"epoch": 1.1477066073954334,
"grad_norm": 2.4808411598205566,
"learning_rate": 3.398931579648877e-06,
"logits/chosen": -14.3150053024292,
"logits/rejected": -14.531530380249023,
"logps/chosen": -1.4491299390792847,
"logps/rejected": -1.5492023229599,
"loss": 1.5203,
"odds_ratio_loss": 0.7113555669784546,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.14491300284862518,
"rewards/margins": 0.010007232427597046,
"rewards/rejected": -0.15492023527622223,
"sft_loss": 1.4491299390792847,
"step": 710
},
{
"epoch": 1.1638714891897353,
"grad_norm": 1.2726991176605225,
"learning_rate": 3.359273229963813e-06,
"logits/chosen": -14.357129096984863,
"logits/rejected": -14.291903495788574,
"logps/chosen": -1.3459408283233643,
"logps/rejected": -1.3911712169647217,
"loss": 1.421,
"odds_ratio_loss": 0.750839114189148,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.13459408283233643,
"rewards/margins": 0.004523060750216246,
"rewards/rejected": -0.13911715149879456,
"sft_loss": 1.3459408283233643,
"step": 720
},
{
"epoch": 1.1800363709840371,
"grad_norm": 1.0978913307189941,
"learning_rate": 3.319368135469285e-06,
"logits/chosen": -14.36750602722168,
"logits/rejected": -14.435731887817383,
"logps/chosen": -1.3765571117401123,
"logps/rejected": -1.4039866924285889,
"loss": 1.4538,
"odds_ratio_loss": 0.7719755172729492,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.13765572011470795,
"rewards/margins": 0.002742946846410632,
"rewards/rejected": -0.14039869606494904,
"sft_loss": 1.3765571117401123,
"step": 730
},
{
"epoch": 1.196201252778339,
"grad_norm": 2.1035361289978027,
"learning_rate": 3.279227755122228e-06,
"logits/chosen": -14.316058158874512,
"logits/rejected": -14.294093132019043,
"logps/chosen": -1.320318579673767,
"logps/rejected": -1.5284496545791626,
"loss": 1.3866,
"odds_ratio_loss": 0.6632006764411926,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.13203184306621552,
"rewards/margins": 0.020813116803765297,
"rewards/rejected": -0.15284495055675507,
"sft_loss": 1.320318579673767,
"step": 740
},
{
"epoch": 1.2123661345726409,
"grad_norm": 3.223933696746826,
"learning_rate": 3.2388636154431417e-06,
"logits/chosen": -14.34916877746582,
"logits/rejected": -14.280328750610352,
"logps/chosen": -1.429145097732544,
"logps/rejected": -1.5203419923782349,
"loss": 1.502,
"odds_ratio_loss": 0.7281750440597534,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1429145336151123,
"rewards/margins": 0.009119677357375622,
"rewards/rejected": -0.152034193277359,
"sft_loss": 1.429145097732544,
"step": 750
},
{
"epoch": 1.2285310163669427,
"grad_norm": 1.1619030237197876,
"learning_rate": 3.198287307206192e-06,
"logits/chosen": -14.091611862182617,
"logits/rejected": -14.187002182006836,
"logps/chosen": -1.4056107997894287,
"logps/rejected": -1.442886233329773,
"loss": 1.4829,
"odds_ratio_loss": 0.7725043296813965,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.1405610740184784,
"rewards/margins": 0.003727543633431196,
"rewards/rejected": -0.14428862929344177,
"sft_loss": 1.4056107997894287,
"step": 760
},
{
"epoch": 1.2446958981612446,
"grad_norm": 1.0456814765930176,
"learning_rate": 3.157510482110856e-06,
"logits/chosen": -14.408856391906738,
"logits/rejected": -14.243043899536133,
"logps/chosen": -1.3281633853912354,
"logps/rejected": -1.3863494396209717,
"loss": 1.4004,
"odds_ratio_loss": 0.7221428751945496,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.132816344499588,
"rewards/margins": 0.005818599369376898,
"rewards/rejected": -0.13863493502140045,
"sft_loss": 1.3281633853912354,
"step": 770
},
{
"epoch": 1.2608607799555465,
"grad_norm": 1.2318408489227295,
"learning_rate": 3.116544849436077e-06,
"logits/chosen": -14.334813117980957,
"logits/rejected": -14.20678997039795,
"logps/chosen": -1.5153284072875977,
"logps/rejected": -1.6125590801239014,
"loss": 1.588,
"odds_ratio_loss": 0.7266558408737183,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.151532843708992,
"rewards/margins": 0.009723084978759289,
"rewards/rejected": -0.16125592589378357,
"sft_loss": 1.5153284072875977,
"step": 780
},
{
"epoch": 1.2770256617498483,
"grad_norm": 1.3976880311965942,
"learning_rate": 3.0754021726778848e-06,
"logits/chosen": -14.33143138885498,
"logits/rejected": -14.257779121398926,
"logps/chosen": -1.3455626964569092,
"logps/rejected": -1.4571717977523804,
"loss": 1.4162,
"odds_ratio_loss": 0.7065266370773315,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.13455626368522644,
"rewards/margins": 0.011160916648805141,
"rewards/rejected": -0.14571718871593475,
"sft_loss": 1.3455626964569092,
"step": 790
},
{
"epoch": 1.2931905435441502,
"grad_norm": 0.7877367734909058,
"learning_rate": 3.0340942661714463e-06,
"logits/chosen": -14.352252006530762,
"logits/rejected": -14.257513046264648,
"logps/chosen": -1.4310262203216553,
"logps/rejected": -1.4348089694976807,
"loss": 1.5077,
"odds_ratio_loss": 0.7662674188613892,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.14310263097286224,
"rewards/margins": 0.00037826746120117605,
"rewards/rejected": -0.14348089694976807,
"sft_loss": 1.4310262203216553,
"step": 800
},
{
"epoch": 1.3093554253384523,
"grad_norm": 1.265386939048767,
"learning_rate": 2.992632991698512e-06,
"logits/chosen": -14.194437980651855,
"logits/rejected": -14.312055587768555,
"logps/chosen": -1.3498046398162842,
"logps/rejected": -1.4344502687454224,
"loss": 1.4207,
"odds_ratio_loss": 0.7088189721107483,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1349804699420929,
"rewards/margins": 0.008464555256068707,
"rewards/rejected": -0.14344502985477448,
"sft_loss": 1.3498046398162842,
"step": 810
},
{
"epoch": 1.3255203071327541,
"grad_norm": 1.7529423236846924,
"learning_rate": 2.9510302550812537e-06,
"logits/chosen": -14.307215690612793,
"logits/rejected": -14.374090194702148,
"logps/chosen": -1.3449764251708984,
"logps/rejected": -1.5051848888397217,
"loss": 1.4155,
"odds_ratio_loss": 0.7051501274108887,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13449765741825104,
"rewards/margins": 0.016020851209759712,
"rewards/rejected": -0.1505185067653656,
"sft_loss": 1.3449764251708984,
"step": 820
},
{
"epoch": 1.341685188927056,
"grad_norm": 3.534449815750122,
"learning_rate": 2.9092980027634325e-06,
"logits/chosen": -14.194910049438477,
"logits/rejected": -14.260457038879395,
"logps/chosen": -1.3157680034637451,
"logps/rejected": -1.39622163772583,
"loss": 1.3858,
"odds_ratio_loss": 0.7005105018615723,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.131576806306839,
"rewards/margins": 0.008045351132750511,
"rewards/rejected": -0.13962216675281525,
"sft_loss": 1.3157680034637451,
"step": 830
},
{
"epoch": 1.3578500707213579,
"grad_norm": 1.6155622005462646,
"learning_rate": 2.867448218379927e-06,
"logits/chosen": -14.231335639953613,
"logits/rejected": -14.248939514160156,
"logps/chosen": -1.3620965480804443,
"logps/rejected": -1.409558892250061,
"loss": 1.4355,
"odds_ratio_loss": 0.734248697757721,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1362096518278122,
"rewards/margins": 0.0047462377697229385,
"rewards/rejected": -0.14095589518547058,
"sft_loss": 1.3620965480804443,
"step": 840
},
{
"epoch": 1.3740149525156597,
"grad_norm": 4.540154933929443,
"learning_rate": 2.825492919315559e-06,
"logits/chosen": -14.306146621704102,
"logits/rejected": -14.476399421691895,
"logps/chosen": -1.4043729305267334,
"logps/rejected": -1.4499131441116333,
"loss": 1.4789,
"odds_ratio_loss": 0.7450671195983887,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1404372900724411,
"rewards/margins": 0.004554024897515774,
"rewards/rejected": -0.14499132335186005,
"sft_loss": 1.4043729305267334,
"step": 850
},
{
"epoch": 1.3901798343099616,
"grad_norm": 1.2316781282424927,
"learning_rate": 2.7834441532542482e-06,
"logits/chosen": -14.352537155151367,
"logits/rejected": -14.446965217590332,
"logps/chosen": -1.3581891059875488,
"logps/rejected": -1.4636138677597046,
"loss": 1.4297,
"odds_ratio_loss": 0.7155886888504028,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.13581891357898712,
"rewards/margins": 0.01054247748106718,
"rewards/rejected": -0.14636139571666718,
"sft_loss": 1.3581891059875488,
"step": 860
},
{
"epoch": 1.4063447161042635,
"grad_norm": 0.915081799030304,
"learning_rate": 2.74131399471945e-06,
"logits/chosen": -14.232261657714844,
"logits/rejected": -14.369558334350586,
"logps/chosen": -1.4017927646636963,
"logps/rejected": -1.4412128925323486,
"loss": 1.4755,
"odds_ratio_loss": 0.7375406622886658,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14017929136753082,
"rewards/margins": 0.003942002542316914,
"rewards/rejected": -0.14412127435207367,
"sft_loss": 1.4017927646636963,
"step": 870
},
{
"epoch": 1.4225095978985653,
"grad_norm": 1.1700351238250732,
"learning_rate": 2.6991145416068947e-06,
"logits/chosen": -14.184051513671875,
"logits/rejected": -14.361761093139648,
"logps/chosen": -1.3888486623764038,
"logps/rejected": -1.3866727352142334,
"loss": 1.4645,
"odds_ratio_loss": 0.7568970918655396,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.13888487219810486,
"rewards/margins": -0.00021760519302915782,
"rewards/rejected": -0.1386672556400299,
"sft_loss": 1.3888486623764038,
"step": 880
},
{
"epoch": 1.4386744796928672,
"grad_norm": 0.7416606545448303,
"learning_rate": 2.6568579117106143e-06,
"logits/chosen": -14.222585678100586,
"logits/rejected": -14.173550605773926,
"logps/chosen": -1.321872591972351,
"logps/rejected": -1.451570749282837,
"loss": 1.3933,
"odds_ratio_loss": 0.7138932943344116,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.13218727707862854,
"rewards/margins": 0.012969812378287315,
"rewards/rejected": -0.1451570689678192,
"sft_loss": 1.321872591972351,
"step": 890
},
{
"epoch": 1.454839361487169,
"grad_norm": 0.7456266283988953,
"learning_rate": 2.6145562392432544e-06,
"logits/chosen": -14.201733589172363,
"logits/rejected": -14.159896850585938,
"logps/chosen": -1.371537446975708,
"logps/rejected": -1.4001505374908447,
"loss": 1.4466,
"odds_ratio_loss": 0.7501237392425537,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.137153759598732,
"rewards/margins": 0.002861298155039549,
"rewards/rejected": -0.14001503586769104,
"sft_loss": 1.371537446975708,
"step": 900
},
{
"epoch": 1.471004243281471,
"grad_norm": 1.7800395488739014,
"learning_rate": 2.5722216713516682e-06,
"logits/chosen": -14.122312545776367,
"logits/rejected": -14.1841402053833,
"logps/chosen": -1.2916905879974365,
"logps/rejected": -1.3739659786224365,
"loss": 1.3653,
"odds_ratio_loss": 0.7365130186080933,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1291690617799759,
"rewards/margins": 0.008227519690990448,
"rewards/rejected": -0.13739657402038574,
"sft_loss": 1.2916905879974365,
"step": 910
},
{
"epoch": 1.4871691250757728,
"grad_norm": 3.366191864013672,
"learning_rate": 2.5298663646288064e-06,
"logits/chosen": -14.279853820800781,
"logits/rejected": -14.313766479492188,
"logps/chosen": -1.3366254568099976,
"logps/rejected": -1.4743283987045288,
"loss": 1.4084,
"odds_ratio_loss": 0.7178291082382202,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13366253674030304,
"rewards/margins": 0.013770299032330513,
"rewards/rejected": -0.1474328488111496,
"sft_loss": 1.3366254568099976,
"step": 920
},
{
"epoch": 1.503334006870075,
"grad_norm": 1.793541431427002,
"learning_rate": 2.487502481622879e-06,
"logits/chosen": -14.228408813476562,
"logits/rejected": -14.142854690551758,
"logps/chosen": -1.3270151615142822,
"logps/rejected": -1.4341893196105957,
"loss": 1.3983,
"odds_ratio_loss": 0.7129431366920471,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.13270151615142822,
"rewards/margins": 0.010717417113482952,
"rewards/rejected": -0.14341893792152405,
"sft_loss": 1.3270151615142822,
"step": 930
},
{
"epoch": 1.5194988886643768,
"grad_norm": 2.546449661254883,
"learning_rate": 2.4451421873448253e-06,
"logits/chosen": -14.15150260925293,
"logits/rejected": -14.336977005004883,
"logps/chosen": -1.431612253189087,
"logps/rejected": -1.4608542919158936,
"loss": 1.508,
"odds_ratio_loss": 0.7637500762939453,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.14316122233867645,
"rewards/margins": 0.002924212021753192,
"rewards/rejected": -0.1460854411125183,
"sft_loss": 1.431612253189087,
"step": 940
},
{
"epoch": 1.5356637704586786,
"grad_norm": 2.0193891525268555,
"learning_rate": 2.40279764577506e-06,
"logits/chosen": -14.358665466308594,
"logits/rejected": -14.505513191223145,
"logps/chosen": -1.403634786605835,
"logps/rejected": -1.4488627910614014,
"loss": 1.48,
"odds_ratio_loss": 0.7633059620857239,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.14036348462104797,
"rewards/margins": 0.004522812552750111,
"rewards/rejected": -0.14488628506660461,
"sft_loss": 1.403634786605835,
"step": 950
},
{
"epoch": 1.5518286522529805,
"grad_norm": 1.2108488082885742,
"learning_rate": 2.3604810163705242e-06,
"logits/chosen": -14.17876148223877,
"logits/rejected": -14.2489652633667,
"logps/chosen": -1.306792140007019,
"logps/rejected": -1.3910942077636719,
"loss": 1.377,
"odds_ratio_loss": 0.7023099660873413,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1306792050600052,
"rewards/margins": 0.00843021459877491,
"rewards/rejected": -0.13910941779613495,
"sft_loss": 1.306792140007019,
"step": 960
},
{
"epoch": 1.5679935340472824,
"grad_norm": 1.9210587739944458,
"learning_rate": 2.3182044505730364e-06,
"logits/chosen": -14.331990242004395,
"logits/rejected": -14.305018424987793,
"logps/chosen": -1.2632302045822144,
"logps/rejected": -1.3584424257278442,
"loss": 1.3349,
"odds_ratio_loss": 0.7163167595863342,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.12632302939891815,
"rewards/margins": 0.009521213360130787,
"rewards/rejected": -0.13584424555301666,
"sft_loss": 1.2632302045822144,
"step": 970
},
{
"epoch": 1.5841584158415842,
"grad_norm": 1.7603510618209839,
"learning_rate": 2.275980088319941e-06,
"logits/chosen": -14.362065315246582,
"logits/rejected": -14.22284984588623,
"logps/chosen": -1.269855260848999,
"logps/rejected": -1.3405383825302124,
"loss": 1.3406,
"odds_ratio_loss": 0.7074419260025024,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.12698553502559662,
"rewards/margins": 0.0070683010853827,
"rewards/rejected": -0.13405382633209229,
"sft_loss": 1.269855260848999,
"step": 980
},
{
"epoch": 1.600323297635886,
"grad_norm": 1.6920086145401,
"learning_rate": 2.2338200545580577e-06,
"logits/chosen": -14.224035263061523,
"logits/rejected": -14.358423233032227,
"logps/chosen": -1.2658283710479736,
"logps/rejected": -1.4482189416885376,
"loss": 1.3345,
"odds_ratio_loss": 0.6871744990348816,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.12658283114433289,
"rewards/margins": 0.01823904737830162,
"rewards/rejected": -0.1448218822479248,
"sft_loss": 1.2658283710479736,
"step": 990
},
{
"epoch": 1.616488179430188,
"grad_norm": 1.0991649627685547,
"learning_rate": 2.191736455761947e-06,
"logits/chosen": -14.324908256530762,
"logits/rejected": -14.3560209274292,
"logps/chosen": -1.2651708126068115,
"logps/rejected": -1.290913701057434,
"loss": 1.3401,
"odds_ratio_loss": 0.749754786491394,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.12651710212230682,
"rewards/margins": 0.0025742852594703436,
"rewards/rejected": -0.12909138202667236,
"sft_loss": 1.2651708126068115,
"step": 1000
},
{
"epoch": 1.616488179430188,
"eval_logits/chosen": -14.268522262573242,
"eval_logits/rejected": -14.308253288269043,
"eval_logps/chosen": -1.3874938488006592,
"eval_logps/rejected": -1.4423273801803589,
"eval_loss": 1.4635207653045654,
"eval_odds_ratio_loss": 0.7602682709693909,
"eval_rewards/accuracies": 0.48363634943962097,
"eval_rewards/chosen": -0.1387493908405304,
"eval_rewards/margins": 0.00548336049541831,
"eval_rewards/rejected": -0.14423276484012604,
"eval_runtime": 207.8962,
"eval_samples_per_second": 5.291,
"eval_sft_loss": 1.3874938488006592,
"eval_steps_per_second": 2.646,
"step": 1000
},
{
"epoch": 1.6326530612244898,
"grad_norm": 0.9229074716567993,
"learning_rate": 2.1497413764574673e-06,
"logits/chosen": -14.391751289367676,
"logits/rejected": -14.302392959594727,
"logps/chosen": -1.4207522869110107,
"logps/rejected": -1.4941614866256714,
"loss": 1.4937,
"odds_ratio_loss": 0.7297941446304321,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.14207521080970764,
"rewards/margins": 0.007340931333601475,
"rewards/rejected": -0.14941613376140594,
"sft_loss": 1.4207522869110107,
"step": 1010
},
{
"epoch": 1.6488179430187917,
"grad_norm": 1.2489970922470093,
"learning_rate": 2.1078468757516395e-06,
"logits/chosen": -14.41105842590332,
"logits/rejected": -14.309954643249512,
"logps/chosen": -1.3737413883209229,
"logps/rejected": -1.331855297088623,
"loss": 1.453,
"odds_ratio_loss": 0.7925962805747986,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.1373741328716278,
"rewards/margins": -0.004188609775155783,
"rewards/rejected": -0.1331855207681656,
"sft_loss": 1.3737413883209229,
"step": 1020
},
{
"epoch": 1.6649828248130936,
"grad_norm": 0.9103444814682007,
"learning_rate": 2.0660649838698145e-06,
"logits/chosen": -14.60859203338623,
"logits/rejected": -14.583990097045898,
"logps/chosen": -1.3282297849655151,
"logps/rejected": -1.4166333675384521,
"loss": 1.3999,
"odds_ratio_loss": 0.7163518071174622,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.13282299041748047,
"rewards/margins": 0.008840366266667843,
"rewards/rejected": -0.1416633427143097,
"sft_loss": 1.3282297849655151,
"step": 1030
},
{
"epoch": 1.6811477066073954,
"grad_norm": 1.1333231925964355,
"learning_rate": 2.0244076987011284e-06,
"logits/chosen": -14.382695198059082,
"logits/rejected": -14.247182846069336,
"logps/chosen": -1.3871229887008667,
"logps/rejected": -1.5080008506774902,
"loss": 1.4558,
"odds_ratio_loss": 0.68644779920578,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1387123018503189,
"rewards/margins": 0.012087779119610786,
"rewards/rejected": -0.15080007910728455,
"sft_loss": 1.3871229887008667,
"step": 1040
},
{
"epoch": 1.6973125884016973,
"grad_norm": 1.302032709121704,
"learning_rate": 1.982886982353251e-06,
"logits/chosen": -14.392558097839355,
"logits/rejected": -14.241909980773926,
"logps/chosen": -1.3640697002410889,
"logps/rejected": -1.5009006261825562,
"loss": 1.4359,
"odds_ratio_loss": 0.7178789377212524,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.13640697300434113,
"rewards/margins": 0.013683101162314415,
"rewards/rejected": -0.1500900685787201,
"sft_loss": 1.3640697002410889,
"step": 1050
},
{
"epoch": 1.7134774701959992,
"grad_norm": 1.7859091758728027,
"learning_rate": 1.941514757717392e-06,
"logits/chosen": -14.138816833496094,
"logits/rejected": -14.210226058959961,
"logps/chosen": -1.3156766891479492,
"logps/rejected": -1.4917762279510498,
"loss": 1.3807,
"odds_ratio_loss": 0.6497665643692017,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.13156768679618835,
"rewards/margins": 0.01760994642972946,
"rewards/rejected": -0.1491776406764984,
"sft_loss": 1.3156766891479492,
"step": 1060
},
{
"epoch": 1.729642351990301,
"grad_norm": 2.0628256797790527,
"learning_rate": 1.9003029050445953e-06,
"logits/chosen": -14.267855644226074,
"logits/rejected": -14.399972915649414,
"logps/chosen": -1.402465581893921,
"logps/rejected": -1.4434514045715332,
"loss": 1.4747,
"odds_ratio_loss": 0.7224588990211487,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14024657011032104,
"rewards/margins": 0.004098571836948395,
"rewards/rejected": -0.14434513449668884,
"sft_loss": 1.402465581893921,
"step": 1070
},
{
"epoch": 1.745807233784603,
"grad_norm": 1.5042709112167358,
"learning_rate": 1.8592632585342523e-06,
"logits/chosen": -14.195714950561523,
"logits/rejected": -14.285571098327637,
"logps/chosen": -1.3312032222747803,
"logps/rejected": -1.412341833114624,
"loss": 1.4047,
"odds_ratio_loss": 0.7354634404182434,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1331203281879425,
"rewards/margins": 0.008113870397210121,
"rewards/rejected": -0.14123418927192688,
"sft_loss": 1.3312032222747803,
"step": 1080
},
{
"epoch": 1.7619721155789048,
"grad_norm": 3.4297995567321777,
"learning_rate": 1.8184076029358527e-06,
"logits/chosen": -14.20643138885498,
"logits/rejected": -14.019030570983887,
"logps/chosen": -1.2683379650115967,
"logps/rejected": -1.2236586809158325,
"loss": 1.3443,
"odds_ratio_loss": 0.7591326832771301,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.12683378159999847,
"rewards/margins": -0.00446792459115386,
"rewards/rejected": -0.12236586958169937,
"sft_loss": 1.2683379650115967,
"step": 1090
},
{
"epoch": 1.7781369973732066,
"grad_norm": 1.0218937397003174,
"learning_rate": 1.7777476701649318e-06,
"logits/chosen": -14.1577730178833,
"logits/rejected": -14.125236511230469,
"logps/chosen": -1.3477040529251099,
"logps/rejected": -1.391446828842163,
"loss": 1.4231,
"odds_ratio_loss": 0.7540372610092163,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13477042317390442,
"rewards/margins": 0.004374279640614986,
"rewards/rejected": -0.1391446888446808,
"sft_loss": 1.3477040529251099,
"step": 1100
},
{
"epoch": 1.7943018791675085,
"grad_norm": 1.4984055757522583,
"learning_rate": 1.7372951359341925e-06,
"logits/chosen": -14.369695663452148,
"logits/rejected": -14.277885437011719,
"logps/chosen": -1.2875721454620361,
"logps/rejected": -1.3878809213638306,
"loss": 1.3577,
"odds_ratio_loss": 0.7012876272201538,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.12875720858573914,
"rewards/margins": 0.01003087218850851,
"rewards/rejected": -0.13878807425498962,
"sft_loss": 1.2875721454620361,
"step": 1110
},
{
"epoch": 1.8104667609618104,
"grad_norm": 3.3275625705718994,
"learning_rate": 1.6970616164007547e-06,
"logits/chosen": -14.229268074035645,
"logits/rejected": -14.10546875,
"logps/chosen": -1.364091396331787,
"logps/rejected": -1.3946739435195923,
"loss": 1.4435,
"odds_ratio_loss": 0.7942220568656921,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13640913367271423,
"rewards/margins": 0.0030582635663449764,
"rewards/rejected": -0.13946738839149475,
"sft_loss": 1.364091396331787,
"step": 1120
},
{
"epoch": 1.8266316427561122,
"grad_norm": 2.735656976699829,
"learning_rate": 1.6570586648305276e-06,
"logits/chosen": -14.143117904663086,
"logits/rejected": -14.2241849899292,
"logps/chosen": -1.344879150390625,
"logps/rejected": -1.493446707725525,
"loss": 1.4182,
"odds_ratio_loss": 0.733532726764679,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13448792695999146,
"rewards/margins": 0.014856770634651184,
"rewards/rejected": -0.14934466779232025,
"sft_loss": 1.344879150390625,
"step": 1130
},
{
"epoch": 1.842796524550414,
"grad_norm": 1.1568862199783325,
"learning_rate": 1.6172977682806151e-06,
"logits/chosen": -14.38661003112793,
"logits/rejected": -14.517931938171387,
"logps/chosen": -1.3603746891021729,
"logps/rejected": -1.5093238353729248,
"loss": 1.4288,
"odds_ratio_loss": 0.68376624584198,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.1360374540090561,
"rewards/margins": 0.014894920401275158,
"rewards/rejected": -0.15093238651752472,
"sft_loss": 1.3603746891021729,
"step": 1140
},
{
"epoch": 1.858961406344716,
"grad_norm": 1.1773515939712524,
"learning_rate": 1.5777903443007586e-06,
"logits/chosen": -14.423624992370605,
"logits/rejected": -14.032621383666992,
"logps/chosen": -1.387117624282837,
"logps/rejected": -1.4605300426483154,
"loss": 1.4607,
"odds_ratio_loss": 0.7362414598464966,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13871176540851593,
"rewards/margins": 0.007341254502534866,
"rewards/rejected": -0.1460530012845993,
"sft_loss": 1.387117624282837,
"step": 1150
},
{
"epoch": 1.8751262881390178,
"grad_norm": 1.5692604780197144,
"learning_rate": 1.5385477376547226e-06,
"logits/chosen": -14.410656929016113,
"logits/rejected": -14.352084159851074,
"logps/chosen": -1.3973274230957031,
"logps/rejected": -1.4963886737823486,
"loss": 1.4675,
"odds_ratio_loss": 0.7020548582077026,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1397327482700348,
"rewards/margins": 0.009906120598316193,
"rewards/rejected": -0.14963887631893158,
"sft_loss": 1.3973274230957031,
"step": 1160
},
{
"epoch": 1.89129116993332,
"grad_norm": 3.0858218669891357,
"learning_rate": 1.4995812170625845e-06,
"logits/chosen": -14.365419387817383,
"logits/rejected": -14.341082572937012,
"logps/chosen": -1.4526535272598267,
"logps/rejected": -1.5791641473770142,
"loss": 1.5265,
"odds_ratio_loss": 0.7380681037902832,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1452653706073761,
"rewards/margins": 0.012651054188609123,
"rewards/rejected": -0.15791639685630798,
"sft_loss": 1.4526535272598267,
"step": 1170
},
{
"epoch": 1.9074560517276218,
"grad_norm": 2.4256625175476074,
"learning_rate": 1.4609019719648666e-06,
"logits/chosen": -14.359014511108398,
"logits/rejected": -14.343942642211914,
"logps/chosen": -1.365081787109375,
"logps/rejected": -1.4730589389801025,
"loss": 1.4336,
"odds_ratio_loss": 0.685504138469696,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13650815188884735,
"rewards/margins": 0.010797703638672829,
"rewards/rejected": -0.14730587601661682,
"sft_loss": 1.365081787109375,
"step": 1180
},
{
"epoch": 1.9236209335219236,
"grad_norm": 2.2215967178344727,
"learning_rate": 1.42252110930943e-06,
"logits/chosen": -14.144754409790039,
"logits/rejected": -14.116401672363281,
"logps/chosen": -1.2247555255889893,
"logps/rejected": -1.2106770277023315,
"loss": 1.3031,
"odds_ratio_loss": 0.7834988832473755,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.12247554957866669,
"rewards/margins": -0.0014078498352319002,
"rewards/rejected": -0.12106770277023315,
"sft_loss": 1.2247555255889893,
"step": 1190
},
{
"epoch": 1.9397858153162255,
"grad_norm": 1.6026244163513184,
"learning_rate": 1.3844496503620493e-06,
"logits/chosen": -14.315832138061523,
"logits/rejected": -14.499916076660156,
"logps/chosen": -1.4833340644836426,
"logps/rejected": -1.521794080734253,
"loss": 1.5547,
"odds_ratio_loss": 0.7132872343063354,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14833340048789978,
"rewards/margins": 0.0038460283540189266,
"rewards/rejected": -0.15217943489551544,
"sft_loss": 1.4833340644836426,
"step": 1200
},
{
"epoch": 1.9559506971105274,
"grad_norm": 1.1467649936676025,
"learning_rate": 1.3466985275416081e-06,
"logits/chosen": -14.316365242004395,
"logits/rejected": -14.039219856262207,
"logps/chosen": -1.4100277423858643,
"logps/rejected": -1.4868837594985962,
"loss": 1.4848,
"odds_ratio_loss": 0.7481211423873901,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14100277423858643,
"rewards/margins": 0.00768560403957963,
"rewards/rejected": -0.14868836104869843,
"sft_loss": 1.4100277423858643,
"step": 1210
},
{
"epoch": 1.9721155789048292,
"grad_norm": 1.3261767625808716,
"learning_rate": 1.309278581280791e-06,
"logits/chosen": -14.425065994262695,
"logits/rejected": -14.19542121887207,
"logps/chosen": -1.258156418800354,
"logps/rejected": -1.3927624225616455,
"loss": 1.3258,
"odds_ratio_loss": 0.6761429309844971,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.12581565976142883,
"rewards/margins": 0.013460601679980755,
"rewards/rejected": -0.13927623629570007,
"sft_loss": 1.258156418800354,
"step": 1220
},
{
"epoch": 1.9882804606991311,
"grad_norm": 0.8793450593948364,
"learning_rate": 1.272200556913199e-06,
"logits/chosen": -14.331692695617676,
"logits/rejected": -14.390342712402344,
"logps/chosen": -1.2902759313583374,
"logps/rejected": -1.398531198501587,
"loss": 1.3633,
"odds_ratio_loss": 0.7302906513214111,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1290276050567627,
"rewards/margins": 0.01082551758736372,
"rewards/rejected": -0.1398531198501587,
"sft_loss": 1.2902759313583374,
"step": 1230
},
{
"epoch": 2.004445342493433,
"grad_norm": 2.07963228225708,
"learning_rate": 1.2354751015877698e-06,
"logits/chosen": -14.254411697387695,
"logits/rejected": -14.420768737792969,
"logps/chosen": -1.2709214687347412,
"logps/rejected": -1.4514631032943726,
"loss": 1.3403,
"odds_ratio_loss": 0.6936594247817993,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.1270921379327774,
"rewards/margins": 0.018054189160466194,
"rewards/rejected": -0.14514632523059845,
"sft_loss": 1.2709214687347412,
"step": 1240
},
{
"epoch": 2.020610224287735,
"grad_norm": 2.574068069458008,
"learning_rate": 1.1991127612113945e-06,
"logits/chosen": -14.361371040344238,
"logits/rejected": -14.495355606079102,
"logps/chosen": -1.3789875507354736,
"logps/rejected": -1.5034908056259155,
"loss": 1.4475,
"odds_ratio_loss": 0.6847060322761536,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.1378987580537796,
"rewards/margins": 0.012450330890715122,
"rewards/rejected": -0.15034906566143036,
"sft_loss": 1.3789875507354736,
"step": 1250
},
{
"epoch": 2.036775106082037,
"grad_norm": 1.4936628341674805,
"learning_rate": 1.1631239774206035e-06,
"logits/chosen": -14.19866943359375,
"logits/rejected": -14.191067695617676,
"logps/chosen": -1.347879409790039,
"logps/rejected": -1.4048999547958374,
"loss": 1.4251,
"odds_ratio_loss": 0.7725744247436523,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.1347879320383072,
"rewards/margins": 0.005702070891857147,
"rewards/rejected": -0.14049001038074493,
"sft_loss": 1.347879409790039,
"step": 1260
},
{
"epoch": 2.052939987876339,
"grad_norm": 1.7168585062026978,
"learning_rate": 1.1275190845831978e-06,
"logits/chosen": -14.3424711227417,
"logits/rejected": -14.3289213180542,
"logps/chosen": -1.3685007095336914,
"logps/rejected": -1.4727340936660767,
"loss": 1.4389,
"odds_ratio_loss": 0.7035232782363892,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.13685005903244019,
"rewards/margins": 0.010423343628644943,
"rewards/rejected": -0.14727340638637543,
"sft_loss": 1.3685007095336914,
"step": 1270
},
{
"epoch": 2.0691048696706407,
"grad_norm": 1.1820368766784668,
"learning_rate": 1.0923083068306778e-06,
"logits/chosen": -14.398675918579102,
"logits/rejected": -14.118631362915039,
"logps/chosen": -1.2939175367355347,
"logps/rejected": -1.473049283027649,
"loss": 1.3601,
"odds_ratio_loss": 0.662093997001648,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.12939175963401794,
"rewards/margins": 0.017913173884153366,
"rewards/rejected": -0.14730492234230042,
"sft_loss": 1.2939175367355347,
"step": 1280
},
{
"epoch": 2.0852697514649425,
"grad_norm": 1.1745166778564453,
"learning_rate": 1.0575017551223348e-06,
"logits/chosen": -14.3531494140625,
"logits/rejected": -14.198529243469238,
"logps/chosen": -1.2511951923370361,
"logps/rejected": -1.3217878341674805,
"loss": 1.3224,
"odds_ratio_loss": 0.7121993899345398,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12511952221393585,
"rewards/margins": 0.007059249095618725,
"rewards/rejected": -0.13217875361442566,
"sft_loss": 1.2511951923370361,
"step": 1290
},
{
"epoch": 2.1014346332592444,
"grad_norm": 0.894344687461853,
"learning_rate": 1.023109424341833e-06,
"logits/chosen": -14.153393745422363,
"logits/rejected": -14.245986938476562,
"logps/chosen": -1.3667266368865967,
"logps/rejected": -1.42815363407135,
"loss": 1.4394,
"odds_ratio_loss": 0.727142333984375,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13667264580726624,
"rewards/margins": 0.006142704281955957,
"rewards/rejected": -0.14281536638736725,
"sft_loss": 1.3667266368865967,
"step": 1300
},
{
"epoch": 2.1175995150535463,
"grad_norm": 1.5093544721603394,
"learning_rate": 9.891411904271273e-07,
"logits/chosen": -14.242596626281738,
"logits/rejected": -14.327380180358887,
"logps/chosen": -1.3282233476638794,
"logps/rejected": -1.3852262496948242,
"loss": 1.4007,
"odds_ratio_loss": 0.7251249551773071,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.13282233476638794,
"rewards/margins": 0.005700295325368643,
"rewards/rejected": -0.13852263987064362,
"sft_loss": 1.3282233476638794,
"step": 1310
},
{
"epoch": 2.133764396847848,
"grad_norm": 0.8299040198326111,
"learning_rate": 9.556068075345363e-07,
"logits/chosen": -14.465705871582031,
"logits/rejected": -14.254651069641113,
"logps/chosen": -1.2607736587524414,
"logps/rejected": -1.3249403238296509,
"loss": 1.3327,
"odds_ratio_loss": 0.7195707559585571,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.12607736885547638,
"rewards/margins": 0.006416681222617626,
"rewards/rejected": -0.13249404728412628,
"sft_loss": 1.2607736587524414,
"step": 1320
},
{
"epoch": 2.14992927864215,
"grad_norm": 1.5431737899780273,
"learning_rate": 9.225159052377838e-07,
"logits/chosen": -14.418218612670898,
"logits/rejected": -14.442914009094238,
"logps/chosen": -1.369145393371582,
"logps/rejected": -1.4892218112945557,
"loss": 1.4395,
"odds_ratio_loss": 0.7034425735473633,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.13691455125808716,
"rewards/margins": 0.012007640674710274,
"rewards/rejected": -0.1489221751689911,
"sft_loss": 1.369145393371582,
"step": 1330
},
{
"epoch": 2.166094160436452,
"grad_norm": 2.125438928604126,
"learning_rate": 8.898779857628184e-07,
"logits/chosen": -14.263992309570312,
"logits/rejected": -14.439204216003418,
"logps/chosen": -1.2737493515014648,
"logps/rejected": -1.307660698890686,
"loss": 1.3488,
"odds_ratio_loss": 0.7507684826850891,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.12737493216991425,
"rewards/margins": 0.003391148056834936,
"rewards/rejected": -0.13076607882976532,
"sft_loss": 1.2737493515014648,
"step": 1340
},
{
"epoch": 2.1822590422307537,
"grad_norm": 1.0558884143829346,
"learning_rate": 8.577024212591975e-07,
"logits/chosen": -14.523656845092773,
"logits/rejected": -14.395648002624512,
"logps/chosen": -1.3369591236114502,
"logps/rejected": -1.402151346206665,
"loss": 1.4081,
"odds_ratio_loss": 0.7112525701522827,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.13369593024253845,
"rewards/margins": 0.006519217975437641,
"rewards/rejected": -0.14021514356136322,
"sft_loss": 1.3369591236114502,
"step": 1350
},
{
"epoch": 2.1984239240250556,
"grad_norm": 1.1882685422897339,
"learning_rate": 8.259984511088276e-07,
"logits/chosen": -14.409403800964355,
"logits/rejected": -14.405116081237793,
"logps/chosen": -1.3154635429382324,
"logps/rejected": -1.4095304012298584,
"loss": 1.3863,
"odds_ratio_loss": 0.7081496715545654,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.13154636323451996,
"rewards/margins": 0.009406678378582,
"rewards/rejected": -0.14095303416252136,
"sft_loss": 1.3154635429382324,
"step": 1360
},
{
"epoch": 2.2145888058193575,
"grad_norm": 1.6390233039855957,
"learning_rate": 7.947751792728237e-07,
"logits/chosen": -14.409843444824219,
"logits/rejected": -14.329424858093262,
"logps/chosen": -1.3204478025436401,
"logps/rejected": -1.4512555599212646,
"loss": 1.3901,
"odds_ratio_loss": 0.6965182423591614,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13204479217529297,
"rewards/margins": 0.013080772943794727,
"rewards/rejected": -0.14512555301189423,
"sft_loss": 1.3204478025436401,
"step": 1370
},
{
"epoch": 2.2307536876136593,
"grad_norm": 1.7825186252593994,
"learning_rate": 7.640415716772626e-07,
"logits/chosen": -14.333005905151367,
"logits/rejected": -14.429731369018555,
"logps/chosen": -1.3603641986846924,
"logps/rejected": -1.4518425464630127,
"loss": 1.4331,
"odds_ratio_loss": 0.7270913124084473,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13603642582893372,
"rewards/margins": 0.009147830307483673,
"rewards/rejected": -0.1451842486858368,
"sft_loss": 1.3603641986846924,
"step": 1380
},
{
"epoch": 2.246918569407961,
"grad_norm": 1.125680685043335,
"learning_rate": 7.338064536385722e-07,
"logits/chosen": -14.394281387329102,
"logits/rejected": -14.345739364624023,
"logps/chosen": -1.3667652606964111,
"logps/rejected": -1.5295965671539307,
"loss": 1.435,
"odds_ratio_loss": 0.6821550130844116,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13667652010917664,
"rewards/margins": 0.016283124685287476,
"rewards/rejected": -0.1529596596956253,
"sft_loss": 1.3667652606964111,
"step": 1390
},
{
"epoch": 2.263083451202263,
"grad_norm": 1.7544102668762207,
"learning_rate": 7.040785073292883e-07,
"logits/chosen": -14.237360000610352,
"logits/rejected": -14.33959674835205,
"logps/chosen": -1.4276225566864014,
"logps/rejected": -1.4824755191802979,
"loss": 1.5027,
"odds_ratio_loss": 0.750755786895752,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14276224374771118,
"rewards/margins": 0.00548530463129282,
"rewards/rejected": -0.14824756979942322,
"sft_loss": 1.4276225566864014,
"step": 1400
},
{
"epoch": 2.279248332996565,
"grad_norm": 1.7468085289001465,
"learning_rate": 6.748662692849297e-07,
"logits/chosen": -14.5598726272583,
"logits/rejected": -14.531698226928711,
"logps/chosen": -1.3492968082427979,
"logps/rejected": -1.4934823513031006,
"loss": 1.4184,
"odds_ratio_loss": 0.6912583112716675,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.13492968678474426,
"rewards/margins": 0.014418545179069042,
"rewards/rejected": -0.14934822916984558,
"sft_loss": 1.3492968082427979,
"step": 1410
},
{
"epoch": 2.295413214790867,
"grad_norm": 3.2176520824432373,
"learning_rate": 6.46178127952686e-07,
"logits/chosen": -14.288836479187012,
"logits/rejected": -14.204765319824219,
"logps/chosen": -1.299232840538025,
"logps/rejected": -1.4280776977539062,
"loss": 1.3673,
"odds_ratio_loss": 0.6802908182144165,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.12992329895496368,
"rewards/margins": 0.01288448553532362,
"rewards/rejected": -0.1428077667951584,
"sft_loss": 1.299232840538025,
"step": 1420
},
{
"epoch": 2.3115780965851687,
"grad_norm": 2.5991835594177246,
"learning_rate": 6.180223212826289e-07,
"logits/chosen": -14.347335815429688,
"logits/rejected": -14.187026977539062,
"logps/chosen": -1.2904529571533203,
"logps/rejected": -1.3600698709487915,
"loss": 1.362,
"odds_ratio_loss": 0.7157233953475952,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1290452927350998,
"rewards/margins": 0.006961710751056671,
"rewards/rejected": -0.13600699603557587,
"sft_loss": 1.2904529571533203,
"step": 1430
},
{
"epoch": 2.3277429783794705,
"grad_norm": 0.8683578968048096,
"learning_rate": 5.904069343621443e-07,
"logits/chosen": -14.465449333190918,
"logits/rejected": -14.325057983398438,
"logps/chosen": -1.299377202987671,
"logps/rejected": -1.401989459991455,
"loss": 1.3706,
"odds_ratio_loss": 0.7122213244438171,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.12993772327899933,
"rewards/margins": 0.010261224582791328,
"rewards/rejected": -0.14019893109798431,
"sft_loss": 1.299377202987671,
"step": 1440
},
{
"epoch": 2.3439078601737724,
"grad_norm": 1.7288964986801147,
"learning_rate": 5.633398970942544e-07,
"logits/chosen": -14.3145170211792,
"logits/rejected": -14.42223834991455,
"logps/chosen": -1.2952549457550049,
"logps/rejected": -1.3960306644439697,
"loss": 1.3675,
"odds_ratio_loss": 0.7228525876998901,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.12952548265457153,
"rewards/margins": 0.010077586397528648,
"rewards/rejected": -0.13960307836532593,
"sft_loss": 1.2952549457550049,
"step": 1450
},
{
"epoch": 2.3600727419680743,
"grad_norm": 1.8580021858215332,
"learning_rate": 5.368289819205069e-07,
"logits/chosen": -14.319725036621094,
"logits/rejected": -14.285405158996582,
"logps/chosen": -1.2445900440216064,
"logps/rejected": -1.3483976125717163,
"loss": 1.3139,
"odds_ratio_loss": 0.6927712559700012,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12445902824401855,
"rewards/margins": 0.010380755178630352,
"rewards/rejected": -0.13483977317810059,
"sft_loss": 1.2445900440216064,
"step": 1460
},
{
"epoch": 2.376237623762376,
"grad_norm": 2.3416638374328613,
"learning_rate": 5.108818015890785e-07,
"logits/chosen": -14.468851089477539,
"logits/rejected": -14.461502075195312,
"logps/chosen": -1.3592495918273926,
"logps/rejected": -1.4990885257720947,
"loss": 1.4311,
"odds_ratio_loss": 0.7181252241134644,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.13592496514320374,
"rewards/margins": 0.013983884826302528,
"rewards/rejected": -0.14990884065628052,
"sft_loss": 1.3592495918273926,
"step": 1470
},
{
"epoch": 2.392402505556678,
"grad_norm": 1.5794059038162231,
"learning_rate": 4.855058069687291e-07,
"logits/chosen": -14.158782958984375,
"logits/rejected": -14.074625015258789,
"logps/chosen": -1.324530839920044,
"logps/rejected": -1.366247296333313,
"loss": 1.3974,
"odds_ratio_loss": 0.7290586233139038,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1324530839920044,
"rewards/margins": 0.004171643406152725,
"rewards/rejected": -0.13662473857402802,
"sft_loss": 1.324530839920044,
"step": 1480
},
{
"epoch": 2.40856738735098,
"grad_norm": 2.1180176734924316,
"learning_rate": 4.607082849092523e-07,
"logits/chosen": -14.219759941101074,
"logits/rejected": -14.182577133178711,
"logps/chosen": -1.4282917976379395,
"logps/rejected": -1.4976496696472168,
"loss": 1.5016,
"odds_ratio_loss": 0.7326869368553162,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14282917976379395,
"rewards/margins": 0.006935800425708294,
"rewards/rejected": -0.14976496994495392,
"sft_loss": 1.4282917976379395,
"step": 1490
},
{
"epoch": 2.4247322691452817,
"grad_norm": 2.495347738265991,
"learning_rate": 4.3649635614901405e-07,
"logits/chosen": -14.16241455078125,
"logits/rejected": -14.45665168762207,
"logps/chosen": -1.3701971769332886,
"logps/rejected": -1.3534958362579346,
"loss": 1.446,
"odds_ratio_loss": 0.7579734921455383,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.13701972365379333,
"rewards/margins": -0.0016701335553079844,
"rewards/rejected": -0.1353495866060257,
"sft_loss": 1.3701971769332886,
"step": 1500
},
{
"epoch": 2.4247322691452817,
"eval_logits/chosen": -14.27784252166748,
"eval_logits/rejected": -14.317824363708496,
"eval_logps/chosen": -1.372594952583313,
"eval_logps/rejected": -1.4290432929992676,
"eval_loss": 1.4484930038452148,
"eval_odds_ratio_loss": 0.7589808702468872,
"eval_rewards/accuracies": 0.4809090793132782,
"eval_rewards/chosen": -0.13725949823856354,
"eval_rewards/margins": 0.005644842050969601,
"eval_rewards/rejected": -0.1429043412208557,
"eval_runtime": 396.2162,
"eval_samples_per_second": 2.776,
"eval_sft_loss": 1.372594952583313,
"eval_steps_per_second": 1.388,
"step": 1500
},
{
"epoch": 2.4408971509395836,
"grad_norm": 1.8667449951171875,
"learning_rate": 4.128769732701973e-07,
"logits/chosen": -14.2674560546875,
"logits/rejected": -14.17170524597168,
"logps/chosen": -1.3341007232666016,
"logps/rejected": -1.4468257427215576,
"loss": 1.4053,
"odds_ratio_loss": 0.7120139002799988,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13341006636619568,
"rewards/margins": 0.011272510513663292,
"rewards/rejected": -0.14468258619308472,
"sft_loss": 1.3341007232666016,
"step": 1510
},
{
"epoch": 2.4570620327338855,
"grad_norm": 2.940946102142334,
"learning_rate": 3.8985691870233046e-07,
"logits/chosen": -14.28807258605957,
"logits/rejected": -14.214245796203613,
"logps/chosen": -1.3024286031723022,
"logps/rejected": -1.4218701124191284,
"loss": 1.3737,
"odds_ratio_loss": 0.712692379951477,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13024285435676575,
"rewards/margins": 0.011944140307605267,
"rewards/rejected": -0.1421869993209839,
"sft_loss": 1.3024286031723022,
"step": 1520
},
{
"epoch": 2.4732269145281873,
"grad_norm": 2.6948108673095703,
"learning_rate": 3.6744280277467904e-07,
"logits/chosen": -14.425226211547852,
"logits/rejected": -14.381690979003906,
"logps/chosen": -1.4246366024017334,
"logps/rejected": -1.426334023475647,
"loss": 1.5046,
"odds_ratio_loss": 0.7999409437179565,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14246365427970886,
"rewards/margins": 0.00016971743025351316,
"rewards/rejected": -0.14263339340686798,
"sft_loss": 1.4246366024017334,
"step": 1530
},
{
"epoch": 2.489391796322489,
"grad_norm": 1.6409363746643066,
"learning_rate": 3.456410618180503e-07,
"logits/chosen": -13.974553108215332,
"logits/rejected": -14.2942533493042,
"logps/chosen": -1.2257071733474731,
"logps/rejected": -1.43178391456604,
"loss": 1.2927,
"odds_ratio_loss": 0.6698334217071533,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.1225707158446312,
"rewards/margins": 0.020607685670256615,
"rewards/rejected": -0.14317841827869415,
"sft_loss": 1.2257071733474731,
"step": 1540
},
{
"epoch": 2.5055566781167915,
"grad_norm": 1.3992644548416138,
"learning_rate": 3.244579563165753e-07,
"logits/chosen": -14.36426830291748,
"logits/rejected": -14.48327922821045,
"logps/chosen": -1.2957897186279297,
"logps/rejected": -1.4375650882720947,
"loss": 1.3673,
"odds_ratio_loss": 0.7152336239814758,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12957896292209625,
"rewards/margins": 0.014177536591887474,
"rewards/rejected": -0.14375647902488708,
"sft_loss": 1.2957897186279297,
"step": 1550
},
{
"epoch": 2.521721559911093,
"grad_norm": 0.9756754636764526,
"learning_rate": 3.038995691099697e-07,
"logits/chosen": -14.465911865234375,
"logits/rejected": -14.273321151733398,
"logps/chosen": -1.3624980449676514,
"logps/rejected": -1.5072979927062988,
"loss": 1.4344,
"odds_ratio_loss": 0.7189978361129761,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13624981045722961,
"rewards/margins": 0.014479981735348701,
"rewards/rejected": -0.15072980523109436,
"sft_loss": 1.3624980449676514,
"step": 1560
},
{
"epoch": 2.5378864417053952,
"grad_norm": 2.6390867233276367,
"learning_rate": 2.839718036468192e-07,
"logits/chosen": -14.324618339538574,
"logits/rejected": -14.362611770629883,
"logps/chosen": -1.4562547206878662,
"logps/rejected": -1.4829699993133545,
"loss": 1.5307,
"odds_ratio_loss": 0.7442874312400818,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14562548696994781,
"rewards/margins": 0.0026715078856796026,
"rewards/rejected": -0.1482969969511032,
"sft_loss": 1.4562547206878662,
"step": 1570
},
{
"epoch": 2.5540513234996967,
"grad_norm": 1.9648209810256958,
"learning_rate": 2.646803822893723e-07,
"logits/chosen": -14.38152027130127,
"logits/rejected": -14.392126083374023,
"logps/chosen": -1.4547812938690186,
"logps/rejected": -1.4928423166275024,
"loss": 1.5325,
"odds_ratio_loss": 0.7773637175559998,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.14547815918922424,
"rewards/margins": 0.003806093242019415,
"rewards/rejected": -0.1492842435836792,
"sft_loss": 1.4547812938690186,
"step": 1580
},
{
"epoch": 2.570216205293999,
"grad_norm": 1.1905252933502197,
"learning_rate": 2.460308446703341e-07,
"logits/chosen": -14.339777946472168,
"logits/rejected": -14.1979398727417,
"logps/chosen": -1.3354339599609375,
"logps/rejected": -1.348439335823059,
"loss": 1.4097,
"odds_ratio_loss": 0.7425277829170227,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13354340195655823,
"rewards/margins": 0.0013005301589146256,
"rewards/rejected": -0.13484393060207367,
"sft_loss": 1.3354339599609375,
"step": 1590
},
{
"epoch": 2.5863810870883004,
"grad_norm": 4.711751461029053,
"learning_rate": 2.2802854610213143e-07,
"logits/chosen": -14.302705764770508,
"logits/rejected": -14.19762134552002,
"logps/chosen": -1.3138768672943115,
"logps/rejected": -1.4147188663482666,
"loss": 1.3864,
"odds_ratio_loss": 0.7257053256034851,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13138769567012787,
"rewards/margins": 0.010084209032356739,
"rewards/rejected": -0.14147189259529114,
"sft_loss": 1.3138768672943115,
"step": 1600
},
{
"epoch": 2.6025459688826027,
"grad_norm": 4.042973518371582,
"learning_rate": 2.106786560391072e-07,
"logits/chosen": -14.2058744430542,
"logits/rejected": -14.269085884094238,
"logps/chosen": -1.3923499584197998,
"logps/rejected": -1.3771612644195557,
"loss": 1.4698,
"odds_ratio_loss": 0.7747048139572144,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1392349898815155,
"rewards/margins": -0.0015188835095614195,
"rewards/rejected": -0.1377161294221878,
"sft_loss": 1.3923499584197998,
"step": 1610
},
{
"epoch": 2.6187108506769046,
"grad_norm": 1.3606544733047485,
"learning_rate": 1.9398615659308255e-07,
"logits/chosen": -14.2599515914917,
"logits/rejected": -14.334997177124023,
"logps/chosen": -1.3270127773284912,
"logps/rejected": -1.3853967189788818,
"loss": 1.3982,
"odds_ratio_loss": 0.7119258046150208,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.13270129263401031,
"rewards/margins": 0.005838391836732626,
"rewards/rejected": -0.13853967189788818,
"sft_loss": 1.3270127773284912,
"step": 1620
},
{
"epoch": 2.6348757324712064,
"grad_norm": 1.4494473934173584,
"learning_rate": 1.7795584110272184e-07,
"logits/chosen": -14.470367431640625,
"logits/rejected": -14.478838920593262,
"logps/chosen": -1.3744457960128784,
"logps/rejected": -1.4546699523925781,
"loss": 1.4475,
"odds_ratio_loss": 0.730518639087677,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.13744458556175232,
"rewards/margins": 0.008022413589060307,
"rewards/rejected": -0.14546698331832886,
"sft_loss": 1.3744457960128784,
"step": 1630
},
{
"epoch": 2.6510406142655083,
"grad_norm": 2.888951539993286,
"learning_rate": 1.6259231275709636e-07,
"logits/chosen": -14.41100788116455,
"logits/rejected": -14.428006172180176,
"logps/chosen": -1.3241318464279175,
"logps/rejected": -1.318234920501709,
"loss": 1.4028,
"odds_ratio_loss": 0.7864112257957458,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.13241317868232727,
"rewards/margins": -0.000589700706768781,
"rewards/rejected": -0.13182349503040314,
"sft_loss": 1.3241318464279175,
"step": 1640
},
{
"epoch": 2.66720549605981,
"grad_norm": 1.5565133094787598,
"learning_rate": 1.478999832738548e-07,
"logits/chosen": -14.382177352905273,
"logits/rejected": -14.320945739746094,
"logps/chosen": -1.297300934791565,
"logps/rejected": -1.4187005758285522,
"loss": 1.368,
"odds_ratio_loss": 0.7067518830299377,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.12973010540008545,
"rewards/margins": 0.01213997695595026,
"rewards/rejected": -0.14187008142471313,
"sft_loss": 1.297300934791565,
"step": 1650
},
{
"epoch": 2.683370377854112,
"grad_norm": 2.0713951587677,
"learning_rate": 1.338830716323769e-07,
"logits/chosen": -14.337793350219727,
"logits/rejected": -14.350440979003906,
"logps/chosen": -1.3087949752807617,
"logps/rejected": -1.350098967552185,
"loss": 1.383,
"odds_ratio_loss": 0.7419986724853516,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1308794915676117,
"rewards/margins": 0.004130417015403509,
"rewards/rejected": -0.13500989973545074,
"sft_loss": 1.3087949752807617,
"step": 1660
},
{
"epoch": 2.699535259648414,
"grad_norm": 2.8654770851135254,
"learning_rate": 1.205456028622723e-07,
"logits/chosen": -14.387499809265137,
"logits/rejected": -14.384310722351074,
"logps/chosen": -1.2575846910476685,
"logps/rejected": -1.4380841255187988,
"loss": 1.3249,
"odds_ratio_loss": 0.6730828285217285,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.12575848400592804,
"rewards/margins": 0.018049929291009903,
"rewards/rejected": -0.14380840957164764,
"sft_loss": 1.2575846910476685,
"step": 1670
},
{
"epoch": 2.7157001414427158,
"grad_norm": 2.644263505935669,
"learning_rate": 1.0789140688756805e-07,
"logits/chosen": -14.564410209655762,
"logits/rejected": -14.484796524047852,
"logps/chosen": -1.331872582435608,
"logps/rejected": -1.4917659759521484,
"loss": 1.3983,
"odds_ratio_loss": 0.6643630862236023,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.13318723440170288,
"rewards/margins": 0.015989361330866814,
"rewards/rejected": -0.14917659759521484,
"sft_loss": 1.331872582435608,
"step": 1680
},
{
"epoch": 2.7318650232370176,
"grad_norm": 1.8434594869613647,
"learning_rate": 9.592411742693098e-08,
"logits/chosen": -14.349563598632812,
"logits/rejected": -14.297950744628906,
"logps/chosen": -1.284172773361206,
"logps/rejected": -1.3313789367675781,
"loss": 1.3598,
"odds_ratio_loss": 0.7563740611076355,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.12841728329658508,
"rewards/margins": 0.004720622207969427,
"rewards/rejected": -0.13313789665699005,
"sft_loss": 1.284172773361206,
"step": 1690
},
{
"epoch": 2.7480299050313195,
"grad_norm": 0.9198280572891235,
"learning_rate": 8.464717095022168e-08,
"logits/chosen": -14.535560607910156,
"logits/rejected": -14.29857349395752,
"logps/chosen": -1.291333794593811,
"logps/rejected": -1.4038417339324951,
"loss": 1.3626,
"odds_ratio_loss": 0.7129305601119995,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.12913337349891663,
"rewards/margins": 0.011250784620642662,
"rewards/rejected": -0.14038416743278503,
"sft_loss": 1.291333794593811,
"step": 1700
},
{
"epoch": 2.7641947868256214,
"grad_norm": 1.85430908203125,
"learning_rate": 7.406380569169841e-08,
"logits/chosen": -14.304112434387207,
"logits/rejected": -14.291776657104492,
"logps/chosen": -1.3815504312515259,
"logps/rejected": -1.3685299158096313,
"loss": 1.4574,
"odds_ratio_loss": 0.7585769891738892,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.13815505802631378,
"rewards/margins": -0.0013020627666264772,
"rewards/rejected": -0.13685297966003418,
"sft_loss": 1.3815504312515259,
"step": 1710
},
{
"epoch": 2.7803596686199232,
"grad_norm": 7.879937171936035,
"learning_rate": 6.417706072013808e-08,
"logits/chosen": -14.357699394226074,
"logits/rejected": -14.520744323730469,
"logps/chosen": -1.4151430130004883,
"logps/rejected": -1.4842795133590698,
"loss": 1.4887,
"odds_ratio_loss": 0.7356118559837341,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14151428639888763,
"rewards/margins": 0.006913675460964441,
"rewards/rejected": -0.14842796325683594,
"sft_loss": 1.4151430130004883,
"step": 1720
},
{
"epoch": 2.796524550414225,
"grad_norm": 2.3623361587524414,
"learning_rate": 5.498977506615294e-08,
"logits/chosen": -14.438512802124023,
"logits/rejected": -14.370248794555664,
"logps/chosen": -1.4021018743515015,
"logps/rejected": -1.3835337162017822,
"loss": 1.4818,
"odds_ratio_loss": 0.796977698802948,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": -0.14021018147468567,
"rewards/margins": -0.0018568048253655434,
"rewards/rejected": -0.1383533775806427,
"sft_loss": 1.4021018743515015,
"step": 1730
},
{
"epoch": 2.812689432208527,
"grad_norm": 1.0650444030761719,
"learning_rate": 4.6504586906947756e-08,
"logits/chosen": -14.35010051727295,
"logits/rejected": -14.401901245117188,
"logps/chosen": -1.3507376909255981,
"logps/rejected": -1.4280903339385986,
"loss": 1.4204,
"odds_ratio_loss": 0.6963773369789124,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13507376611232758,
"rewards/margins": 0.007735266350209713,
"rewards/rejected": -0.14280903339385986,
"sft_loss": 1.3507376909255981,
"step": 1740
},
{
"epoch": 2.828854314002829,
"grad_norm": 5.588193893432617,
"learning_rate": 3.8723932808754914e-08,
"logits/chosen": -14.620956420898438,
"logits/rejected": -14.591873168945312,
"logps/chosen": -1.4141243696212769,
"logps/rejected": -1.4447482824325562,
"loss": 1.4888,
"odds_ratio_loss": 0.7466815710067749,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.1414124220609665,
"rewards/margins": 0.003062391420826316,
"rewards/rejected": -0.1444748193025589,
"sft_loss": 1.4141243696212769,
"step": 1750
},
{
"epoch": 2.8450191957971307,
"grad_norm": 2.8461813926696777,
"learning_rate": 3.1650047027158014e-08,
"logits/chosen": -14.406710624694824,
"logits/rejected": -14.431941032409668,
"logps/chosen": -1.3235969543457031,
"logps/rejected": -1.378565788269043,
"loss": 1.3941,
"odds_ratio_loss": 0.7055075764656067,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.13235969841480255,
"rewards/margins": 0.005496888421475887,
"rewards/rejected": -0.13785657286643982,
"sft_loss": 1.3235969543457031,
"step": 1760
},
{
"epoch": 2.8611840775914326,
"grad_norm": 1.4648724794387817,
"learning_rate": 2.5284960865517848e-08,
"logits/chosen": -14.247715950012207,
"logits/rejected": -14.30573844909668,
"logps/chosen": -1.2652337551116943,
"logps/rejected": -1.3874812126159668,
"loss": 1.3373,
"odds_ratio_loss": 0.7210808992385864,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.12652337551116943,
"rewards/margins": 0.012224750593304634,
"rewards/rejected": -0.13874812424182892,
"sft_loss": 1.2652337551116943,
"step": 1770
},
{
"epoch": 2.8773489593857344,
"grad_norm": 1.2711795568466187,
"learning_rate": 1.9630502091670388e-08,
"logits/chosen": -14.345422744750977,
"logits/rejected": -14.210649490356445,
"logps/chosen": -1.3347010612487793,
"logps/rejected": -1.4864898920059204,
"loss": 1.4034,
"odds_ratio_loss": 0.686531126499176,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.13347011804580688,
"rewards/margins": 0.015178876928985119,
"rewards/rejected": -0.14864897727966309,
"sft_loss": 1.3347010612487793,
"step": 1780
},
{
"epoch": 2.8935138411800363,
"grad_norm": 4.285287857055664,
"learning_rate": 1.4688294413074677e-08,
"logits/chosen": -14.240816116333008,
"logits/rejected": -14.293863296508789,
"logps/chosen": -1.2230440378189087,
"logps/rejected": -1.3717424869537354,
"loss": 1.2918,
"odds_ratio_loss": 0.6871523857116699,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.12230439484119415,
"rewards/margins": 0.014869834296405315,
"rewards/rejected": -0.13717423379421234,
"sft_loss": 1.2230440378189087,
"step": 1790
},
{
"epoch": 2.909678722974338,
"grad_norm": 1.111965298652649,
"learning_rate": 1.0459757010556626e-08,
"logits/chosen": -14.294512748718262,
"logits/rejected": -14.2905912399292,
"logps/chosen": -1.3162596225738525,
"logps/rejected": -1.357807993888855,
"loss": 1.3902,
"odds_ratio_loss": 0.7398349046707153,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1316259652376175,
"rewards/margins": 0.0041548521257936954,
"rewards/rejected": -0.13578079640865326,
"sft_loss": 1.3162596225738525,
"step": 1800
},
{
"epoch": 2.92584360476864,
"grad_norm": 1.985671043395996,
"learning_rate": 6.94610413078306e-09,
"logits/chosen": -14.099322319030762,
"logits/rejected": -14.289319038391113,
"logps/chosen": -1.3942023515701294,
"logps/rejected": -1.5463578701019287,
"loss": 1.4669,
"odds_ratio_loss": 0.7267955541610718,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.13942024111747742,
"rewards/margins": 0.015215557999908924,
"rewards/rejected": -0.15463578701019287,
"sft_loss": 1.3942023515701294,
"step": 1810
},
{
"epoch": 2.942008486562942,
"grad_norm": 1.1975542306900024,
"learning_rate": 4.14834473758563e-09,
"logits/chosen": -14.166104316711426,
"logits/rejected": -14.219152450561523,
"logps/chosen": -1.2467665672302246,
"logps/rejected": -1.3985602855682373,
"loss": 1.3162,
"odds_ratio_loss": 0.6939627528190613,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1246766597032547,
"rewards/margins": 0.015179386362433434,
"rewards/rejected": -0.13985604047775269,
"sft_loss": 1.2467665672302246,
"step": 1820
},
{
"epoch": 2.9581733683572438,
"grad_norm": 1.3036004304885864,
"learning_rate": 2.067282222230349e-09,
"logits/chosen": -14.375224113464355,
"logits/rejected": -14.571484565734863,
"logps/chosen": -1.326818585395813,
"logps/rejected": -1.477850317955017,
"loss": 1.3957,
"odds_ratio_loss": 0.6886210441589355,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13268187642097473,
"rewards/margins": 0.015103173442184925,
"rewards/rejected": -0.1477850377559662,
"sft_loss": 1.326818585395813,
"step": 1830
},
{
"epoch": 2.9743382501515456,
"grad_norm": 6.394278049468994,
"learning_rate": 7.035141727212979e-10,
"logits/chosen": -14.3215913772583,
"logits/rejected": -14.438852310180664,
"logps/chosen": -1.256394386291504,
"logps/rejected": -1.3541960716247559,
"loss": 1.3287,
"odds_ratio_loss": 0.7228869199752808,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1256394237279892,
"rewards/margins": 0.009780170395970345,
"rewards/rejected": -0.13541960716247559,
"sft_loss": 1.256394386291504,
"step": 1840
},
{
"epoch": 2.9905031319458475,
"grad_norm": 2.8705546855926514,
"learning_rate": 5.743220219761592e-11,
"logits/chosen": -14.366948127746582,
"logits/rejected": -14.415715217590332,
"logps/chosen": -1.3598301410675049,
"logps/rejected": -1.40765380859375,
"loss": 1.4375,
"odds_ratio_loss": 0.7764675617218018,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.13598300516605377,
"rewards/margins": 0.004782381001859903,
"rewards/rejected": -0.14076539874076843,
"sft_loss": 1.3598301410675049,
"step": 1850
},
{
"epoch": 2.9969690846635686,
"step": 1854,
"total_flos": 1.9131711497471508e+18,
"train_loss": 1.4823461713142765,
"train_runtime": 22122.5243,
"train_samples_per_second": 1.342,
"train_steps_per_second": 0.084
}
],
"logging_steps": 10,
"max_steps": 1854,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.9131711497471508e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}