{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 1.6611914425325234, "learning_rate": 1.3054830287206268e-08, "logits/chosen": -2.909182548522949, "logits/rejected": -2.942319393157959, "logps/chosen": -202.1656494140625, "logps/rejected": -236.2765350341797, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6931732296943665, "epoch": 0.0, "grad_norm": 31.969198661929852, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.8677401542663574, "logits/rejected": -2.7863104343414307, "logps/chosen": -300.2490234375, "logps/rejected": -226.55227661132812, "loss": 0.7007, "positive_losses": 0.08082646876573563, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.00019778122077696025, "rewards/margins": -5.036979200667702e-05, "rewards/margins_max": 0.002247290452942252, "rewards/margins_min": -0.0025868744123727083, "rewards/margins_std": 0.0021279146894812584, "rewards/rejected": -0.0001474114542361349, "step": 10 }, { "dpo_losses": 0.6931561231613159, "epoch": 0.01, "grad_norm": 14.306452777137075, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.8997511863708496, "logits/rejected": -2.821716070175171, "logps/chosen": -342.60882568359375, "logps/rejected": -237.42715454101562, "loss": 0.6975, "positive_losses": 0.0433620922267437, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0008173284004442394, "rewards/margins": -1.608573984412942e-05, "rewards/margins_max": 0.0027120746672153473, "rewards/margins_min": -0.003053296823054552, "rewards/margins_std": 0.0025677671656012535, "rewards/rejected": 0.0008334142039529979, "step": 20 }, { "dpo_losses": 0.6928127408027649, "epoch": 0.01, "grad_norm": 6.76920412465167, "learning_rate": 3.9164490861618804e-07, "logits/chosen": -2.7987051010131836, "logits/rejected": -2.8019251823425293, "logps/chosen": -303.80767822265625, "logps/rejected": -266.1310729980469, "loss": 0.6945, "positive_losses": 0.018559837713837624, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004252296872437, "rewards/margins": 0.0006736738723702729, "rewards/margins_max": 0.004781021270900965, "rewards/margins_min": -0.0029487106949090958, "rewards/margins_std": 0.003470769850537181, "rewards/rejected": 0.003578622592613101, "step": 30 }, { "dpo_losses": 0.692796528339386, "epoch": 0.01, "grad_norm": 5.604010862243276, "learning_rate": 5.221932114882506e-07, "logits/chosen": -2.813326120376587, "logits/rejected": -2.8262887001037598, "logps/chosen": -280.18328857421875, "logps/rejected": -280.54071044921875, "loss": 0.6934, "positive_losses": 0.011556625366210938, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.008161318488419056, "rewards/margins": 0.0007072348380461335, "rewards/margins_max": 0.005385141354054213, "rewards/margins_min": -0.0038401507772505283, "rewards/margins_std": 0.004110351204872131, "rewards/rejected": 0.007454083301126957, "step": 40 }, { "dpo_losses": 0.6927393674850464, "epoch": 0.01, "grad_norm": 2.04497375713942, "learning_rate": 6.527415143603135e-07, "logits/chosen": -2.886263608932495, "logits/rejected": -2.8584060668945312, "logps/chosen": -293.516845703125, "logps/rejected": -290.35809326171875, "loss": 0.693, "positive_losses": 0.0017120360862463713, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.011908010579645634, "rewards/margins": 0.0008212241227738559, "rewards/margins_max": 0.005125211086124182, "rewards/margins_min": -0.00381028326228261, "rewards/margins_std": 0.004039672203361988, "rewards/rejected": 0.011086787097156048, "step": 50 }, { "dpo_losses": 0.6924117207527161, "epoch": 0.02, "grad_norm": 1.9405895334341494, "learning_rate": 7.832898172323761e-07, "logits/chosen": -2.821598768234253, "logits/rejected": -2.7612762451171875, "logps/chosen": -284.66741943359375, "logps/rejected": -250.3779296875, "loss": 0.6928, "positive_losses": 0.0, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.014016744680702686, "rewards/margins": 0.0014791989233344793, "rewards/margins_max": 0.0065080830827355385, "rewards/margins_min": -0.003556284587830305, "rewards/margins_std": 0.004505008924752474, "rewards/rejected": 0.012537546455860138, "step": 60 }, { "dpo_losses": 0.692319929599762, "epoch": 0.02, "grad_norm": 1.6259721635638693, "learning_rate": 9.138381201044387e-07, "logits/chosen": -2.8565468788146973, "logits/rejected": -2.826981782913208, "logps/chosen": -247.64559936523438, "logps/rejected": -229.2498779296875, "loss": 0.6926, "positive_losses": 0.0, "rewards/accuracies": 0.625, "rewards/chosen": 0.015192938968539238, "rewards/margins": 0.001666490687057376, "rewards/margins_max": 0.00816585123538971, "rewards/margins_min": -0.005111562553793192, "rewards/margins_std": 0.0060561723075807095, "rewards/rejected": 0.013526448979973793, "step": 70 }, { "dpo_losses": 0.6917449235916138, "epoch": 0.02, "grad_norm": 2.4954140563057763, "learning_rate": 1.0443864229765013e-06, "logits/chosen": -2.8182671070098877, "logits/rejected": -2.7784154415130615, "logps/chosen": -275.432373046875, "logps/rejected": -225.180419921875, "loss": 0.6922, "positive_losses": 0.0004261016729287803, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.016106154769659042, "rewards/margins": 0.002822594018653035, "rewards/margins_max": 0.011124899610877037, "rewards/margins_min": -0.004864133894443512, "rewards/margins_std": 0.007102040108293295, "rewards/rejected": 0.01328356098383665, "step": 80 }, { "dpo_losses": 0.6901671886444092, "epoch": 0.02, "grad_norm": 11.82878483855531, "learning_rate": 1.1749347258485642e-06, "logits/chosen": -2.879368543624878, "logits/rejected": -2.833066940307617, "logps/chosen": -322.00164794921875, "logps/rejected": -271.5631408691406, "loss": 0.6924, "positive_losses": 0.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02144201099872589, "rewards/margins": 0.0059935045428574085, "rewards/margins_max": 0.01698596030473709, "rewards/margins_min": -0.004721859935671091, "rewards/margins_std": 0.009729361161589622, "rewards/rejected": 0.015448507852852345, "step": 90 }, { "dpo_losses": 0.6903530359268188, "epoch": 0.03, "grad_norm": 2.1562864880999015, "learning_rate": 1.305483028720627e-06, "logits/chosen": -2.7181944847106934, "logits/rejected": -2.6738967895507812, "logps/chosen": -341.0799560546875, "logps/rejected": -240.3539581298828, "loss": 0.6911, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02394462376832962, "rewards/margins": 0.0056390054523944855, "rewards/margins_max": 0.019726106896996498, "rewards/margins_min": -0.006556331180036068, "rewards/margins_std": 0.01177787035703659, "rewards/rejected": 0.018305614590644836, "step": 100 }, { "epoch": 0.03, "eval_dpo_losses": 0.690110445022583, "eval_logits/chosen": -2.8053641319274902, "eval_logits/rejected": -2.766242742538452, "eval_logps/chosen": -282.0331115722656, "eval_logps/rejected": -256.6321105957031, "eval_loss": 0.6917737722396851, "eval_positive_losses": 0.009668411687016487, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.025603098794817924, "eval_rewards/margins": 0.006135226227343082, "eval_rewards/margins_max": 0.02824905700981617, "eval_rewards/margins_min": -0.013623973354697227, "eval_rewards/margins_std": 0.013793708756566048, "eval_rewards/rejected": 0.019467873498797417, "eval_runtime": 428.7568, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 100 }, { "dpo_losses": 0.6918027400970459, "epoch": 0.03, "grad_norm": 2.156851422013261, "learning_rate": 1.4360313315926894e-06, "logits/chosen": -2.8136510848999023, "logits/rejected": -2.823888063430786, "logps/chosen": -253.9114532470703, "logps/rejected": -245.44131469726562, "loss": 0.6909, "positive_losses": 0.0, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.023961525410413742, "rewards/margins": 0.002739057643339038, "rewards/margins_max": 0.017869509756565094, "rewards/margins_min": -0.011799002066254616, "rewards/margins_std": 0.01312586385756731, "rewards/rejected": 0.021222466602921486, "step": 110 }, { "dpo_losses": 0.6911963224411011, "epoch": 0.03, "grad_norm": 21.782948059706587, "learning_rate": 1.5665796344647521e-06, "logits/chosen": -2.802748203277588, "logits/rejected": -2.7897446155548096, "logps/chosen": -276.5313415527344, "logps/rejected": -233.54483032226562, "loss": 0.7017, "positive_losses": 0.20288391411304474, "rewards/accuracies": 0.6875, "rewards/chosen": 0.027138402685523033, "rewards/margins": 0.0040526604279875755, "rewards/margins_max": 0.02045324817299843, "rewards/margins_min": -0.015398552641272545, "rewards/margins_std": 0.01642894372344017, "rewards/rejected": 0.02308574505150318, "step": 120 }, { "dpo_losses": 0.6882020831108093, "epoch": 0.03, "grad_norm": 10.984128078327988, "learning_rate": 1.6971279373368146e-06, "logits/chosen": -2.8226253986358643, "logits/rejected": -2.7765445709228516, "logps/chosen": -260.9440612792969, "logps/rejected": -306.1463317871094, "loss": 0.6895, "positive_losses": 0.0018714905017986894, "rewards/accuracies": 0.5625, "rewards/chosen": 0.029916221275925636, "rewards/margins": 0.010501563549041748, "rewards/margins_max": 0.046757232397794724, "rewards/margins_min": -0.01352185569703579, "rewards/margins_std": 0.027795681729912758, "rewards/rejected": 0.01941465586423874, "step": 130 }, { "dpo_losses": 0.6875099539756775, "epoch": 0.04, "grad_norm": 1.8029906647620886, "learning_rate": 1.8276762402088774e-06, "logits/chosen": -2.8402276039123535, "logits/rejected": -2.7861874103546143, "logps/chosen": -259.905029296875, "logps/rejected": -230.09567260742188, "loss": 0.7114, "positive_losses": 0.06240863725543022, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03723273053765297, "rewards/margins": 0.011453449726104736, "rewards/margins_max": 0.03811323270201683, "rewards/margins_min": -0.010656825266778469, "rewards/margins_std": 0.021766941994428635, "rewards/rejected": 0.025779282674193382, "step": 140 }, { "dpo_losses": 0.688108503818512, "epoch": 0.04, "grad_norm": 1.8550180630691293, "learning_rate": 1.9582245430809403e-06, "logits/chosen": -2.8100781440734863, "logits/rejected": -2.791452646255493, "logps/chosen": -238.95901489257812, "logps/rejected": -254.30355834960938, "loss": 0.6919, "positive_losses": 0.016205215826630592, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0451044887304306, "rewards/margins": 0.010226741433143616, "rewards/margins_max": 0.03394794091582298, "rewards/margins_min": -0.010631146840751171, "rewards/margins_std": 0.02019248902797699, "rewards/rejected": 0.03487774729728699, "step": 150 }, { "dpo_losses": 0.6860537528991699, "epoch": 0.04, "grad_norm": 9.356915744700173, "learning_rate": 2.0887728459530026e-06, "logits/chosen": -2.7489659786224365, "logits/rejected": -2.7682833671569824, "logps/chosen": -269.04193115234375, "logps/rejected": -257.1709899902344, "loss": 0.6854, "positive_losses": 0.007275390438735485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05685564875602722, "rewards/margins": 0.014535295777022839, "rewards/margins_max": 0.0533498153090477, "rewards/margins_min": -0.021386949345469475, "rewards/margins_std": 0.033080536872148514, "rewards/rejected": 0.04232034832239151, "step": 160 }, { "dpo_losses": 0.6875606179237366, "epoch": 0.04, "grad_norm": 1.850999384368349, "learning_rate": 2.2193211488250653e-06, "logits/chosen": -2.7869582176208496, "logits/rejected": -2.797764539718628, "logps/chosen": -264.21929931640625, "logps/rejected": -230.4424591064453, "loss": 0.6905, "positive_losses": 0.022145461291074753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0651407465338707, "rewards/margins": 0.011593434028327465, "rewards/margins_max": 0.05123548582196236, "rewards/margins_min": -0.02727467194199562, "rewards/margins_std": 0.03492007032036781, "rewards/rejected": 0.053547315299510956, "step": 170 }, { "dpo_losses": 0.6835399866104126, "epoch": 0.05, "grad_norm": 9.104754000529407, "learning_rate": 2.3498694516971284e-06, "logits/chosen": -2.8353443145751953, "logits/rejected": -2.7759976387023926, "logps/chosen": -318.8158874511719, "logps/rejected": -278.764404296875, "loss": 0.6944, "positive_losses": 0.17639747262001038, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.07056266814470291, "rewards/margins": 0.019875146448612213, "rewards/margins_max": 0.06871607899665833, "rewards/margins_min": -0.030139094218611717, "rewards/margins_std": 0.04327835515141487, "rewards/rejected": 0.050687529146671295, "step": 180 }, { "dpo_losses": 0.6838158369064331, "epoch": 0.05, "grad_norm": 9.269298320661703, "learning_rate": 2.4804177545691907e-06, "logits/chosen": -2.835176706314087, "logits/rejected": -2.8169798851013184, "logps/chosen": -227.8518524169922, "logps/rejected": -219.61294555664062, "loss": 0.6869, "positive_losses": 0.031234169378876686, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07945264130830765, "rewards/margins": 0.019288327544927597, "rewards/margins_max": 0.06636989861726761, "rewards/margins_min": -0.024780582636594772, "rewards/margins_std": 0.04111936315894127, "rewards/rejected": 0.06016431376338005, "step": 190 }, { "dpo_losses": 0.6795304417610168, "epoch": 0.05, "grad_norm": 2.036685420364637, "learning_rate": 2.610966057441254e-06, "logits/chosen": -2.832998037338257, "logits/rejected": -2.7580273151397705, "logps/chosen": -308.475341796875, "logps/rejected": -245.78466796875, "loss": 0.6847, "positive_losses": 0.009261703118681908, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08847187459468842, "rewards/margins": 0.027999553829431534, "rewards/margins_max": 0.0806564912199974, "rewards/margins_min": -0.012813249602913857, "rewards/margins_std": 0.042584288865327835, "rewards/rejected": 0.06047232076525688, "step": 200 }, { "epoch": 0.05, "eval_dpo_losses": 0.6806074976921082, "eval_logits/chosen": -2.798675775527954, "eval_logits/rejected": -2.7591915130615234, "eval_logps/chosen": -276.15399169921875, "eval_logps/rejected": -252.74549865722656, "eval_loss": 0.6918641924858093, "eval_positive_losses": 0.031035613268613815, "eval_rewards/accuracies": 0.6710000038146973, "eval_rewards/chosen": 0.084394171833992, "eval_rewards/margins": 0.026059743016958237, "eval_rewards/margins_max": 0.1132497489452362, "eval_rewards/margins_min": -0.05120939016342163, "eval_rewards/margins_std": 0.05419847369194031, "eval_rewards/rejected": 0.05833442509174347, "eval_runtime": 428.0904, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 200 }, { "dpo_losses": 0.6805652379989624, "epoch": 0.05, "grad_norm": 2.5355264778356994, "learning_rate": 2.741514360313316e-06, "logits/chosen": -2.778172731399536, "logits/rejected": -2.7424604892730713, "logps/chosen": -288.6905212402344, "logps/rejected": -287.5398254394531, "loss": 0.6883, "positive_losses": 0.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0932592898607254, "rewards/margins": 0.026345139369368553, "rewards/margins_max": 0.10087727010250092, "rewards/margins_min": -0.03499458357691765, "rewards/margins_std": 0.05978749319911003, "rewards/rejected": 0.0669141560792923, "step": 210 }, { "dpo_losses": 0.682792067527771, "epoch": 0.06, "grad_norm": 1.7829356332394, "learning_rate": 2.872062663185379e-06, "logits/chosen": -2.7870593070983887, "logits/rejected": -2.750718116760254, "logps/chosen": -261.4877624511719, "logps/rejected": -268.4627990722656, "loss": 0.6804, "positive_losses": 0.0, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.10234057903289795, "rewards/margins": 0.021648995578289032, "rewards/margins_max": 0.07374467700719833, "rewards/margins_min": -0.03438428044319153, "rewards/margins_std": 0.04767972230911255, "rewards/rejected": 0.08069159090518951, "step": 220 }, { "dpo_losses": 0.6789739727973938, "epoch": 0.06, "grad_norm": 1.8845999689384942, "learning_rate": 3.0026109660574416e-06, "logits/chosen": -2.8217036724090576, "logits/rejected": -2.8107285499572754, "logps/chosen": -275.61614990234375, "logps/rejected": -252.70364379882812, "loss": 0.6815, "positive_losses": 0.05097904056310654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12186799943447113, "rewards/margins": 0.029651399701833725, "rewards/margins_max": 0.1046239361166954, "rewards/margins_min": -0.04132336378097534, "rewards/margins_std": 0.06577299535274506, "rewards/rejected": 0.0922165960073471, "step": 230 }, { "dpo_losses": 0.6752057075500488, "epoch": 0.06, "grad_norm": 2.0298898691454608, "learning_rate": 3.1331592689295043e-06, "logits/chosen": -2.7409098148345947, "logits/rejected": -2.6662404537200928, "logps/chosen": -274.1945495605469, "logps/rejected": -229.91552734375, "loss": 0.6731, "positive_losses": 0.002334022428840399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10710887610912323, "rewards/margins": 0.037525489926338196, "rewards/margins_max": 0.11302550882101059, "rewards/margins_min": -0.03477070480585098, "rewards/margins_std": 0.06618161499500275, "rewards/rejected": 0.06958337128162384, "step": 240 }, { "dpo_losses": 0.6733521223068237, "epoch": 0.07, "grad_norm": 7.746980632694252, "learning_rate": 3.263707571801567e-06, "logits/chosen": -2.8070075511932373, "logits/rejected": -2.7885661125183105, "logps/chosen": -259.63763427734375, "logps/rejected": -228.86135864257812, "loss": 0.6798, "positive_losses": 0.03568840026855469, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09187337756156921, "rewards/margins": 0.0415494330227375, "rewards/margins_max": 0.12277477979660034, "rewards/margins_min": -0.03311099857091904, "rewards/margins_std": 0.06869812309741974, "rewards/rejected": 0.05032395198941231, "step": 250 }, { "dpo_losses": 0.6689683198928833, "epoch": 0.07, "grad_norm": 20.958700350005678, "learning_rate": 3.3942558746736293e-06, "logits/chosen": -2.81817889213562, "logits/rejected": -2.772019863128662, "logps/chosen": -312.8612060546875, "logps/rejected": -309.1396484375, "loss": 0.6777, "positive_losses": 0.13976097106933594, "rewards/accuracies": 0.75, "rewards/chosen": 0.1171041950583458, "rewards/margins": 0.05053260922431946, "rewards/margins_max": 0.13087162375450134, "rewards/margins_min": -0.023819979280233383, "rewards/margins_std": 0.07033131271600723, "rewards/rejected": 0.06657158583402634, "step": 260 }, { "dpo_losses": 0.673005223274231, "epoch": 0.07, "grad_norm": 8.312133780995351, "learning_rate": 3.524804177545692e-06, "logits/chosen": -2.809727907180786, "logits/rejected": -2.7245917320251465, "logps/chosen": -302.3597106933594, "logps/rejected": -282.5569763183594, "loss": 0.6822, "positive_losses": 0.22031250596046448, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.12276358902454376, "rewards/margins": 0.04253315180540085, "rewards/margins_max": 0.13749028742313385, "rewards/margins_min": -0.0451325885951519, "rewards/margins_std": 0.08130183815956116, "rewards/rejected": 0.08023042976856232, "step": 270 }, { "dpo_losses": 0.6772290468215942, "epoch": 0.07, "grad_norm": 9.803365355327683, "learning_rate": 3.6553524804177547e-06, "logits/chosen": -2.886641502380371, "logits/rejected": -2.8426270484924316, "logps/chosen": -279.29400634765625, "logps/rejected": -256.7546691894531, "loss": 0.6867, "positive_losses": 0.09031333774328232, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.14377516508102417, "rewards/margins": 0.03387909010052681, "rewards/margins_max": 0.1268281638622284, "rewards/margins_min": -0.05632222443819046, "rewards/margins_std": 0.08138807117938995, "rewards/rejected": 0.10989607870578766, "step": 280 }, { "dpo_losses": 0.6649600863456726, "epoch": 0.08, "grad_norm": 11.360849822588373, "learning_rate": 3.7859007832898174e-06, "logits/chosen": -2.850799322128296, "logits/rejected": -2.8138887882232666, "logps/chosen": -282.0655212402344, "logps/rejected": -245.19259643554688, "loss": 0.6748, "positive_losses": 0.07300148159265518, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.15921849012374878, "rewards/margins": 0.05898389220237732, "rewards/margins_max": 0.1448855698108673, "rewards/margins_min": -0.019111331552267075, "rewards/margins_std": 0.07298921048641205, "rewards/rejected": 0.10023460537195206, "step": 290 }, { "dpo_losses": 0.6765052080154419, "epoch": 0.08, "grad_norm": 1.9811501318373503, "learning_rate": 3.9164490861618806e-06, "logits/chosen": -2.855132579803467, "logits/rejected": -2.780561923980713, "logps/chosen": -268.2986145019531, "logps/rejected": -228.91799926757812, "loss": 0.686, "positive_losses": 0.11273155361413956, "rewards/accuracies": 0.5625, "rewards/chosen": 0.12802688777446747, "rewards/margins": 0.03576163575053215, "rewards/margins_max": 0.15934522449970245, "rewards/margins_min": -0.04905729740858078, "rewards/margins_std": 0.09489398449659348, "rewards/rejected": 0.09226525574922562, "step": 300 }, { "epoch": 0.08, "eval_dpo_losses": 0.6693459749221802, "eval_logits/chosen": -2.7859046459198, "eval_logits/rejected": -2.747396945953369, "eval_logps/chosen": -269.94671630859375, "eval_logps/rejected": -249.0195770263672, "eval_loss": 0.6901127696037292, "eval_positive_losses": 0.08413992077112198, "eval_rewards/accuracies": 0.6949999928474426, "eval_rewards/chosen": 0.14646685123443604, "eval_rewards/margins": 0.05087343230843544, "eval_rewards/margins_max": 0.20712482929229736, "eval_rewards/margins_min": -0.09147688001394272, "eval_rewards/margins_std": 0.0988682359457016, "eval_rewards/rejected": 0.0955934152007103, "eval_runtime": 428.161, "eval_samples_per_second": 4.671, "eval_steps_per_second": 0.292, "step": 300 }, { "dpo_losses": 0.6752734184265137, "epoch": 0.08, "grad_norm": 2.2957750658047917, "learning_rate": 4.046997389033943e-06, "logits/chosen": -2.8415491580963135, "logits/rejected": -2.8036723136901855, "logps/chosen": -237.29660034179688, "logps/rejected": -239.52114868164062, "loss": 0.6832, "positive_losses": 0.04194068908691406, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.13866667449474335, "rewards/margins": 0.03833303973078728, "rewards/margins_max": 0.1446342021226883, "rewards/margins_min": -0.04681024327874184, "rewards/margins_std": 0.08503605425357819, "rewards/rejected": 0.10033364593982697, "step": 310 }, { "dpo_losses": 0.6736447215080261, "epoch": 0.08, "grad_norm": 4.724465451959904, "learning_rate": 4.177545691906005e-06, "logits/chosen": -2.8330652713775635, "logits/rejected": -2.797053813934326, "logps/chosen": -267.61407470703125, "logps/rejected": -243.7859344482422, "loss": 0.6854, "positive_losses": 0.23391112685203552, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15120115876197815, "rewards/margins": 0.04250707849860191, "rewards/margins_max": 0.1621485948562622, "rewards/margins_min": -0.06767591834068298, "rewards/margins_std": 0.10243155807256699, "rewards/rejected": 0.10869406163692474, "step": 320 }, { "dpo_losses": 0.6691862344741821, "epoch": 0.09, "grad_norm": 5.16089323962061, "learning_rate": 4.308093994778068e-06, "logits/chosen": -2.8560335636138916, "logits/rejected": -2.7866241931915283, "logps/chosen": -273.2575378417969, "logps/rejected": -278.06524658203125, "loss": 0.6763, "positive_losses": 0.0, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1741090714931488, "rewards/margins": 0.05146823078393936, "rewards/margins_max": 0.16819505393505096, "rewards/margins_min": -0.050195444375276566, "rewards/margins_std": 0.09703671187162399, "rewards/rejected": 0.12264084815979004, "step": 330 }, { "dpo_losses": 0.6649759411811829, "epoch": 0.09, "grad_norm": 6.243975978729356, "learning_rate": 4.4386422976501306e-06, "logits/chosen": -2.8070483207702637, "logits/rejected": -2.795396089553833, "logps/chosen": -281.8221130371094, "logps/rejected": -258.61187744140625, "loss": 0.6762, "positive_losses": 0.09027175605297089, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15640690922737122, "rewards/margins": 0.06027814745903015, "rewards/margins_max": 0.18680498003959656, "rewards/margins_min": -0.036148302257061005, "rewards/margins_std": 0.10083751380443573, "rewards/rejected": 0.09612873941659927, "step": 340 }, { "dpo_losses": 0.6607569456100464, "epoch": 0.09, "grad_norm": 13.972627856764554, "learning_rate": 4.569190600522193e-06, "logits/chosen": -2.6934661865234375, "logits/rejected": -2.6470305919647217, "logps/chosen": -265.8113098144531, "logps/rejected": -233.8306121826172, "loss": 0.7053, "positive_losses": 0.37513604760169983, "rewards/accuracies": 0.75, "rewards/chosen": 0.13258785009384155, "rewards/margins": 0.07012196630239487, "rewards/margins_max": 0.2063256800174713, "rewards/margins_min": -0.06749741733074188, "rewards/margins_std": 0.1223006621003151, "rewards/rejected": 0.06246587634086609, "step": 350 }, { "dpo_losses": 0.6704517006874084, "epoch": 0.09, "grad_norm": 8.96812984435849, "learning_rate": 4.699738903394257e-06, "logits/chosen": -2.8276827335357666, "logits/rejected": -2.8534798622131348, "logps/chosen": -274.4246520996094, "logps/rejected": -240.2241668701172, "loss": 0.6834, "positive_losses": 0.19836406409740448, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.14870551228523254, "rewards/margins": 0.049779582768678665, "rewards/margins_max": 0.1779724657535553, "rewards/margins_min": -0.07502292096614838, "rewards/margins_std": 0.11457856744527817, "rewards/rejected": 0.09892591834068298, "step": 360 }, { "dpo_losses": 0.676064670085907, "epoch": 0.1, "grad_norm": 1.9760803116991252, "learning_rate": 4.8302872062663196e-06, "logits/chosen": -2.7008068561553955, "logits/rejected": -2.6861443519592285, "logps/chosen": -259.9791564941406, "logps/rejected": -247.25051879882812, "loss": 0.6935, "positive_losses": 0.02337036095559597, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.16256669163703918, "rewards/margins": 0.03676387667655945, "rewards/margins_max": 0.13351775705814362, "rewards/margins_min": -0.05251342058181763, "rewards/margins_std": 0.08308423310518265, "rewards/rejected": 0.12580282986164093, "step": 370 }, { "dpo_losses": 0.6824192404747009, "epoch": 0.1, "grad_norm": 1.9243637346295395, "learning_rate": 4.9608355091383814e-06, "logits/chosen": -2.746785879135132, "logits/rejected": -2.6604361534118652, "logps/chosen": -239.18319702148438, "logps/rejected": -252.88662719726562, "loss": 0.6804, "positive_losses": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.15879032015800476, "rewards/margins": 0.025484349578619003, "rewards/margins_max": 0.1668674200773239, "rewards/margins_min": -0.09540258347988129, "rewards/margins_std": 0.11624189466238022, "rewards/rejected": 0.13330596685409546, "step": 380 }, { "dpo_losses": 0.6571207046508789, "epoch": 0.1, "grad_norm": 2.1382667192572367, "learning_rate": 4.9999488562447675e-06, "logits/chosen": -2.8185324668884277, "logits/rejected": -2.790360927581787, "logps/chosen": -274.35955810546875, "logps/rejected": -251.66928100585938, "loss": 0.7061, "positive_losses": 0.5538875460624695, "rewards/accuracies": 0.75, "rewards/chosen": 0.18151499330997467, "rewards/margins": 0.07630706578493118, "rewards/margins_max": 0.19101569056510925, "rewards/margins_min": -0.030439352616667747, "rewards/margins_std": 0.09843975305557251, "rewards/rejected": 0.10520792007446289, "step": 390 }, { "dpo_losses": 0.6789859533309937, "epoch": 0.1, "grad_norm": 7.956872563004055, "learning_rate": 4.999698361256577e-06, "logits/chosen": -2.7469451427459717, "logits/rejected": -2.71948504447937, "logps/chosen": -238.6767578125, "logps/rejected": -236.68798828125, "loss": 0.6944, "positive_losses": 0.21751323342323303, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.13562782108783722, "rewards/margins": 0.033287692815065384, "rewards/margins_max": 0.17778214812278748, "rewards/margins_min": -0.09985264390707016, "rewards/margins_std": 0.1263827383518219, "rewards/rejected": 0.10234012454748154, "step": 400 }, { "epoch": 0.1, "eval_dpo_losses": 0.663064181804657, "eval_logits/chosen": -2.7503912448883057, "eval_logits/rejected": -2.711548328399658, "eval_logps/chosen": -268.7827453613281, "eval_logps/rejected": -249.2730255126953, "eval_loss": 0.6911394596099854, "eval_positive_losses": 0.1509626805782318, "eval_rewards/accuracies": 0.7099999785423279, "eval_rewards/chosen": 0.15810656547546387, "eval_rewards/margins": 0.06504742801189423, "eval_rewards/margins_max": 0.24901820719242096, "eval_rewards/margins_min": -0.11125880479812622, "eval_rewards/margins_std": 0.11945176124572754, "eval_rewards/rejected": 0.09305915236473083, "eval_runtime": 428.2463, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.292, "step": 400 }, { "dpo_losses": 0.664069414138794, "epoch": 0.11, "grad_norm": 1.9598413572530216, "learning_rate": 4.999239142174581e-06, "logits/chosen": -2.775360345840454, "logits/rejected": -2.691457509994507, "logps/chosen": -267.9887390136719, "logps/rejected": -250.80062866210938, "loss": 0.7028, "positive_losses": 0.5063844919204712, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14695055782794952, "rewards/margins": 0.062417663633823395, "rewards/margins_max": 0.18425968289375305, "rewards/margins_min": -0.06093385070562363, "rewards/margins_std": 0.11020394414663315, "rewards/rejected": 0.08453289419412613, "step": 410 }, { "dpo_losses": 0.653849720954895, "epoch": 0.11, "grad_norm": 4.945979117109157, "learning_rate": 4.99857123734344e-06, "logits/chosen": -2.789283275604248, "logits/rejected": -2.7467398643493652, "logps/chosen": -277.5484313964844, "logps/rejected": -231.8037567138672, "loss": 0.6648, "positive_losses": 0.02366619184613228, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.1615566909313202, "rewards/margins": 0.08374477177858353, "rewards/margins_max": 0.2054201066493988, "rewards/margins_min": -0.03128126636147499, "rewards/margins_std": 0.10419468581676483, "rewards/rejected": 0.07781191915273666, "step": 420 }, { "dpo_losses": 0.6573296785354614, "epoch": 0.11, "grad_norm": 13.690638352217581, "learning_rate": 4.997694702533016e-06, "logits/chosen": -2.7743239402770996, "logits/rejected": -2.7178587913513184, "logps/chosen": -336.22369384765625, "logps/rejected": -252.02197265625, "loss": 0.6747, "positive_losses": 0.1470256745815277, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16634371876716614, "rewards/margins": 0.07746388018131256, "rewards/margins_max": 0.22486260533332825, "rewards/margins_min": -0.044031620025634766, "rewards/margins_std": 0.11824808269739151, "rewards/rejected": 0.08887985348701477, "step": 430 }, { "dpo_losses": 0.6622677445411682, "epoch": 0.12, "grad_norm": 1.8875260610233917, "learning_rate": 4.996609610933713e-06, "logits/chosen": -2.7872633934020996, "logits/rejected": -2.776425838470459, "logps/chosen": -258.63134765625, "logps/rejected": -246.9169921875, "loss": 0.6676, "positive_losses": 0.15967722237110138, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1545584499835968, "rewards/margins": 0.06730343401432037, "rewards/margins_max": 0.21810702979564667, "rewards/margins_min": -0.07433603703975677, "rewards/margins_std": 0.13059645891189575, "rewards/rejected": 0.08725499361753464, "step": 440 }, { "dpo_losses": 0.660641074180603, "epoch": 0.12, "grad_norm": 10.268091007199386, "learning_rate": 4.995316053150366e-06, "logits/chosen": -2.822967052459717, "logits/rejected": -2.783705234527588, "logps/chosen": -284.12713623046875, "logps/rejected": -260.09515380859375, "loss": 0.6735, "positive_losses": 0.07706870883703232, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.17116017639636993, "rewards/margins": 0.06977041810750961, "rewards/margins_max": 0.20091423392295837, "rewards/margins_min": -0.05889366939663887, "rewards/margins_std": 0.11523435264825821, "rewards/rejected": 0.10138972848653793, "step": 450 }, { "dpo_losses": 0.6438701748847961, "epoch": 0.12, "grad_norm": 5.215093736693054, "learning_rate": 4.9938141371946815e-06, "logits/chosen": -2.7379660606384277, "logits/rejected": -2.7176480293273926, "logps/chosen": -261.96063232421875, "logps/rejected": -243.8579864501953, "loss": 0.6708, "positive_losses": 0.4775467813014984, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1769762635231018, "rewards/margins": 0.10664983838796616, "rewards/margins_max": 0.2506755292415619, "rewards/margins_min": -0.03088602051138878, "rewards/margins_std": 0.12461646646261215, "rewards/rejected": 0.07032643258571625, "step": 460 }, { "dpo_losses": 0.6677502393722534, "epoch": 0.12, "grad_norm": 1.4687140440090638, "learning_rate": 4.992103988476206e-06, "logits/chosen": -2.6899399757385254, "logits/rejected": -2.72920560836792, "logps/chosen": -234.1622772216797, "logps/rejected": -235.1865234375, "loss": 0.6852, "positive_losses": 0.19793835282325745, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15648001432418823, "rewards/margins": 0.05565309524536133, "rewards/margins_max": 0.17241524159908295, "rewards/margins_min": -0.06818656623363495, "rewards/margins_std": 0.1089044064283371, "rewards/rejected": 0.1008269339799881, "step": 470 }, { "dpo_losses": 0.6689162254333496, "epoch": 0.13, "grad_norm": 6.083829188931725, "learning_rate": 4.990185749791866e-06, "logits/chosen": -2.7936508655548096, "logits/rejected": -2.7398219108581543, "logps/chosen": -275.41754150390625, "logps/rejected": -241.33541870117188, "loss": 0.6862, "positive_losses": 0.22376975417137146, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.17008516192436218, "rewards/margins": 0.05477757006883621, "rewards/margins_max": 0.20395350456237793, "rewards/margins_min": -0.09113912284374237, "rewards/margins_std": 0.13186687231063843, "rewards/rejected": 0.11530760675668716, "step": 480 }, { "dpo_losses": 0.6603975892066956, "epoch": 0.13, "grad_norm": 17.080156629500834, "learning_rate": 4.9880595813140395e-06, "logits/chosen": -2.729578733444214, "logits/rejected": -2.7126169204711914, "logps/chosen": -285.56719970703125, "logps/rejected": -253.8076171875, "loss": 0.6882, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1881304681301117, "rewards/margins": 0.07103969156742096, "rewards/margins_max": 0.21888594329357147, "rewards/margins_min": -0.046334654092788696, "rewards/margins_std": 0.11919162422418594, "rewards/rejected": 0.11709077656269073, "step": 490 }, { "dpo_losses": 0.6637958288192749, "epoch": 0.13, "grad_norm": 1.8083399023941662, "learning_rate": 4.985725660577184e-06, "logits/chosen": -2.7672886848449707, "logits/rejected": -2.735917568206787, "logps/chosen": -288.93804931640625, "logps/rejected": -259.6017150878906, "loss": 0.6923, "positive_losses": 0.08706741034984589, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19365164637565613, "rewards/margins": 0.06393040716648102, "rewards/margins_max": 0.19910098612308502, "rewards/margins_min": -0.0605672225356102, "rewards/margins_std": 0.11656157672405243, "rewards/rejected": 0.1297212392091751, "step": 500 }, { "epoch": 0.13, "eval_dpo_losses": 0.6647040247917175, "eval_logits/chosen": -2.724285125732422, "eval_logits/rejected": -2.6843347549438477, "eval_logps/chosen": -265.1090087890625, "eval_logps/rejected": -245.26019287109375, "eval_loss": 0.6787527799606323, "eval_positive_losses": 0.05962928384542465, "eval_rewards/accuracies": 0.6949999928474426, "eval_rewards/chosen": 0.19484417140483856, "eval_rewards/margins": 0.061656612902879715, "eval_rewards/margins_max": 0.25129368901252747, "eval_rewards/margins_min": -0.10771217942237854, "eval_rewards/margins_std": 0.11904539167881012, "eval_rewards/rejected": 0.13318756222724915, "eval_runtime": 428.3102, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.292, "step": 500 }, { "dpo_losses": 0.6538182497024536, "epoch": 0.13, "grad_norm": 1.8732684086967137, "learning_rate": 4.983184182463009e-06, "logits/chosen": -2.6488800048828125, "logits/rejected": -2.6324260234832764, "logps/chosen": -260.6108093261719, "logps/rejected": -240.75369262695312, "loss": 0.6678, "positive_losses": 0.11122193187475204, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19986779987812042, "rewards/margins": 0.08869300782680511, "rewards/margins_max": 0.28801050782203674, "rewards/margins_min": -0.06173459812998772, "rewards/margins_std": 0.15817862749099731, "rewards/rejected": 0.11117477715015411, "step": 510 }, { "dpo_losses": 0.6551252603530884, "epoch": 0.14, "grad_norm": 1.7478967016062716, "learning_rate": 4.980435359184203e-06, "logits/chosen": -2.7087385654449463, "logits/rejected": -2.661221981048584, "logps/chosen": -257.74530029296875, "logps/rejected": -240.1739959716797, "loss": 0.6886, "positive_losses": 0.26389384269714355, "rewards/accuracies": 0.75, "rewards/chosen": 0.16229435801506042, "rewards/margins": 0.0823858231306076, "rewards/margins_max": 0.22145190834999084, "rewards/margins_min": -0.06387770175933838, "rewards/margins_std": 0.1273488700389862, "rewards/rejected": 0.07990851253271103, "step": 520 }, { "dpo_losses": 0.6596701741218567, "epoch": 0.14, "grad_norm": 2.042964081913883, "learning_rate": 4.9774794202667236e-06, "logits/chosen": -2.783825397491455, "logits/rejected": -2.7646143436431885, "logps/chosen": -273.3974914550781, "logps/rejected": -257.3733215332031, "loss": 0.6823, "positive_losses": 0.06744994968175888, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1812540590763092, "rewards/margins": 0.071528360247612, "rewards/margins_max": 0.199395090341568, "rewards/margins_min": -0.04514295980334282, "rewards/margins_std": 0.10833747684955597, "rewards/rejected": 0.10972567647695541, "step": 530 }, { "dpo_losses": 0.665927529335022, "epoch": 0.14, "grad_norm": 1.9756807916397077, "learning_rate": 4.974316612530615e-06, "logits/chosen": -2.728533983230591, "logits/rejected": -2.7084076404571533, "logps/chosen": -248.6405029296875, "logps/rejected": -231.85354614257812, "loss": 0.6775, "positive_losses": 0.1546613723039627, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.190275639295578, "rewards/margins": 0.06172444671392441, "rewards/margins_max": 0.22966797649860382, "rewards/margins_min": -0.09864543378353119, "rewards/margins_std": 0.1446523368358612, "rewards/rejected": 0.12855121493339539, "step": 540 }, { "dpo_losses": 0.6500524878501892, "epoch": 0.14, "grad_norm": 1.71028968008198, "learning_rate": 4.970947200069416e-06, "logits/chosen": -2.826908588409424, "logits/rejected": -2.7796719074249268, "logps/chosen": -292.29608154296875, "logps/rejected": -250.68896484375, "loss": 0.6916, "positive_losses": 0.1571556031703949, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1823032796382904, "rewards/margins": 0.09319052845239639, "rewards/margins_max": 0.24011695384979248, "rewards/margins_min": -0.050476692616939545, "rewards/margins_std": 0.12862922251224518, "rewards/rejected": 0.08911273628473282, "step": 550 }, { "dpo_losses": 0.6723580360412598, "epoch": 0.15, "grad_norm": 1.8415686424485658, "learning_rate": 4.967371464228096e-06, "logits/chosen": -2.7671542167663574, "logits/rejected": -2.786092758178711, "logps/chosen": -279.84710693359375, "logps/rejected": -265.36932373046875, "loss": 0.6746, "positive_losses": 0.01470336876809597, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.19616767764091492, "rewards/margins": 0.04653048887848854, "rewards/margins_max": 0.19184985756874084, "rewards/margins_min": -0.09195338934659958, "rewards/margins_std": 0.12599508464336395, "rewards/rejected": 0.14963720738887787, "step": 560 }, { "dpo_losses": 0.660744309425354, "epoch": 0.15, "grad_norm": 1.7197400102842324, "learning_rate": 4.963589703579569e-06, "logits/chosen": -2.7402005195617676, "logits/rejected": -2.7011075019836426, "logps/chosen": -236.1312713623047, "logps/rejected": -220.7137451171875, "loss": 0.6725, "positive_losses": 0.0806148499250412, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.18730634450912476, "rewards/margins": 0.07002018392086029, "rewards/margins_max": 0.2193438708782196, "rewards/margins_min": -0.06707224994897842, "rewards/margins_std": 0.12361118942499161, "rewards/rejected": 0.11728618294000626, "step": 570 }, { "dpo_losses": 0.6621208786964417, "epoch": 0.15, "grad_norm": 16.76326761443846, "learning_rate": 4.9596022338997615e-06, "logits/chosen": -2.7995400428771973, "logits/rejected": -2.763375997543335, "logps/chosen": -280.8608703613281, "logps/rejected": -242.50106811523438, "loss": 0.6797, "positive_losses": 0.2067081481218338, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20165982842445374, "rewards/margins": 0.0678969994187355, "rewards/margins_max": 0.19827218353748322, "rewards/margins_min": -0.04111642390489578, "rewards/margins_std": 0.10833221673965454, "rewards/rejected": 0.13376283645629883, "step": 580 }, { "dpo_losses": 0.658281147480011, "epoch": 0.15, "grad_norm": 3.9625818575036393, "learning_rate": 4.955409388141243e-06, "logits/chosen": -2.760684013366699, "logits/rejected": -2.710430383682251, "logps/chosen": -261.4241638183594, "logps/rejected": -240.5662078857422, "loss": 0.6887, "positive_losses": 0.19717493653297424, "rewards/accuracies": 0.75, "rewards/chosen": 0.19780950248241425, "rewards/margins": 0.07692292332649231, "rewards/margins_max": 0.2365892380475998, "rewards/margins_min": -0.06682348251342773, "rewards/margins_std": 0.13651010394096375, "rewards/rejected": 0.12088658660650253, "step": 590 }, { "dpo_losses": 0.6623961329460144, "epoch": 0.16, "grad_norm": 1.5443859689987585, "learning_rate": 4.951011516405429e-06, "logits/chosen": -2.774395227432251, "logits/rejected": -2.745779514312744, "logps/chosen": -305.0125427246094, "logps/rejected": -267.03594970703125, "loss": 0.663, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21630334854125977, "rewards/margins": 0.06726609915494919, "rewards/margins_max": 0.20849759876728058, "rewards/margins_min": -0.08070281893014908, "rewards/margins_std": 0.12883268296718597, "rewards/rejected": 0.14903724193572998, "step": 600 }, { "epoch": 0.16, "eval_dpo_losses": 0.6606666445732117, "eval_logits/chosen": -2.7035694122314453, "eval_logits/rejected": -2.665982484817505, "eval_logps/chosen": -265.1740417480469, "eval_logps/rejected": -246.3229522705078, "eval_loss": 0.6892355680465698, "eval_positive_losses": 0.14829835295677185, "eval_rewards/accuracies": 0.6769999861717224, "eval_rewards/chosen": 0.19419392943382263, "eval_rewards/margins": 0.07163416594266891, "eval_rewards/margins_max": 0.3007630705833435, "eval_rewards/margins_min": -0.12855397164821625, "eval_rewards/margins_std": 0.14202702045440674, "eval_rewards/rejected": 0.12255976349115372, "eval_runtime": 428.4408, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 600 }, { "dpo_losses": 0.6442610621452332, "epoch": 0.16, "grad_norm": 9.046805004316873, "learning_rate": 4.946408985913344e-06, "logits/chosen": -2.7550129890441895, "logits/rejected": -2.678056478500366, "logps/chosen": -345.64093017578125, "logps/rejected": -259.5520324707031, "loss": 0.6606, "positive_losses": 0.055927276611328125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23318998515605927, "rewards/margins": 0.10763374716043472, "rewards/margins_max": 0.2763409912586212, "rewards/margins_min": -0.0661562830209732, "rewards/margins_std": 0.15407855808734894, "rewards/rejected": 0.12555620074272156, "step": 610 }, { "dpo_losses": 0.6608101725578308, "epoch": 0.16, "grad_norm": 2.8816620306621594, "learning_rate": 4.941602180974958e-06, "logits/chosen": -2.742089033126831, "logits/rejected": -2.7148919105529785, "logps/chosen": -278.1106872558594, "logps/rejected": -254.6637725830078, "loss": 0.6557, "positive_losses": 0.0004772186221089214, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.18525430560112, "rewards/margins": 0.07316222786903381, "rewards/margins_max": 0.2310808151960373, "rewards/margins_min": -0.09148738533258438, "rewards/margins_std": 0.14664871990680695, "rewards/rejected": 0.11209206283092499, "step": 620 }, { "dpo_losses": 0.649380624294281, "epoch": 0.16, "grad_norm": 1.9957136475335484, "learning_rate": 4.936591502957101e-06, "logits/chosen": -2.6965575218200684, "logits/rejected": -2.675640821456909, "logps/chosen": -249.79653930664062, "logps/rejected": -261.4026794433594, "loss": 0.7133, "positive_losses": 0.27226218581199646, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1899053305387497, "rewards/margins": 0.09622377902269363, "rewards/margins_max": 0.27748948335647583, "rewards/margins_min": -0.03962727636098862, "rewards/margins_std": 0.14250265061855316, "rewards/rejected": 0.09368153661489487, "step": 630 }, { "dpo_losses": 0.658961296081543, "epoch": 0.17, "grad_norm": 11.497970401956069, "learning_rate": 4.931377370249946e-06, "logits/chosen": -2.7044734954833984, "logits/rejected": -2.6543304920196533, "logps/chosen": -293.74298095703125, "logps/rejected": -247.9723358154297, "loss": 0.6902, "positive_losses": 0.3072100281715393, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.20657439529895782, "rewards/margins": 0.07905445247888565, "rewards/margins_max": 0.2459186315536499, "rewards/margins_min": -0.1058909073472023, "rewards/margins_std": 0.1593790501356125, "rewards/rejected": 0.12751996517181396, "step": 640 }, { "dpo_losses": 0.6644202470779419, "epoch": 0.17, "grad_norm": 9.998754805287007, "learning_rate": 4.925960218232073e-06, "logits/chosen": -2.684481143951416, "logits/rejected": -2.6854796409606934, "logps/chosen": -240.8351287841797, "logps/rejected": -239.85079956054688, "loss": 0.6814, "positive_losses": 0.13215656578540802, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1786859631538391, "rewards/margins": 0.06295709311962128, "rewards/margins_max": 0.20083513855934143, "rewards/margins_min": -0.07749038934707642, "rewards/margins_std": 0.12378038465976715, "rewards/rejected": 0.11572885513305664, "step": 650 }, { "dpo_losses": 0.6548426747322083, "epoch": 0.17, "grad_norm": 9.859053858923993, "learning_rate": 4.920340499234116e-06, "logits/chosen": -2.75348162651062, "logits/rejected": -2.735724449157715, "logps/chosen": -242.1502227783203, "logps/rejected": -245.6394500732422, "loss": 0.7021, "positive_losses": 0.3593042492866516, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21502117812633514, "rewards/margins": 0.08600465953350067, "rewards/margins_max": 0.2713359296321869, "rewards/margins_min": -0.10064806044101715, "rewards/margins_std": 0.16272717714309692, "rewards/rejected": 0.12901651859283447, "step": 660 }, { "dpo_losses": 0.6737793684005737, "epoch": 0.18, "grad_norm": 1.7701974587295215, "learning_rate": 4.914518682500995e-06, "logits/chosen": -2.7391796112060547, "logits/rejected": -2.6809182167053223, "logps/chosen": -255.51742553710938, "logps/rejected": -226.91268920898438, "loss": 0.7042, "positive_losses": 0.3297177255153656, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17366547882556915, "rewards/margins": 0.04646056145429611, "rewards/margins_max": 0.21436409652233124, "rewards/margins_min": -0.11843661963939667, "rewards/margins_std": 0.14836609363555908, "rewards/rejected": 0.12720489501953125, "step": 670 }, { "dpo_losses": 0.6623993515968323, "epoch": 0.18, "grad_norm": 1.7781013634691223, "learning_rate": 4.9084952541527315e-06, "logits/chosen": -2.5873992443084717, "logits/rejected": -2.566702365875244, "logps/chosen": -258.7884521484375, "logps/rejected": -231.9339141845703, "loss": 0.6795, "positive_losses": 0.30301570892333984, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20401909947395325, "rewards/margins": 0.06752722710371017, "rewards/margins_max": 0.22269508242607117, "rewards/margins_min": -0.07635542750358582, "rewards/margins_std": 0.1323748379945755, "rewards/rejected": 0.13649186491966248, "step": 680 }, { "dpo_losses": 0.662765622138977, "epoch": 0.18, "grad_norm": 1.7073095169049701, "learning_rate": 4.902270717143858e-06, "logits/chosen": -2.6526148319244385, "logits/rejected": -2.629845142364502, "logps/chosen": -268.54486083984375, "logps/rejected": -229.29171752929688, "loss": 0.654, "positive_losses": 0.007523536682128906, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18195727467536926, "rewards/margins": 0.06656062602996826, "rewards/margins_max": 0.20236501097679138, "rewards/margins_min": -0.05715782567858696, "rewards/margins_std": 0.11579607427120209, "rewards/rejected": 0.1153966411948204, "step": 690 }, { "dpo_losses": 0.6593031287193298, "epoch": 0.18, "grad_norm": 10.299610197228036, "learning_rate": 4.895845591221427e-06, "logits/chosen": -2.670139789581299, "logits/rejected": -2.638843059539795, "logps/chosen": -255.5277862548828, "logps/rejected": -218.6800994873047, "loss": 0.6784, "positive_losses": 0.43383750319480896, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17841890454292297, "rewards/margins": 0.07547406852245331, "rewards/margins_max": 0.23104026913642883, "rewards/margins_min": -0.07409827411174774, "rewards/margins_std": 0.1367238461971283, "rewards/rejected": 0.10294482856988907, "step": 700 }, { "epoch": 0.18, "eval_dpo_losses": 0.654996395111084, "eval_logits/chosen": -2.6624057292938232, "eval_logits/rejected": -2.6229429244995117, "eval_logps/chosen": -265.67559814453125, "eval_logps/rejected": -248.0891571044922, "eval_loss": 0.6934565901756287, "eval_positive_losses": 0.21421319246292114, "eval_rewards/accuracies": 0.6970000267028809, "eval_rewards/chosen": 0.18917834758758545, "eval_rewards/margins": 0.08428053557872772, "eval_rewards/margins_max": 0.32754144072532654, "eval_rewards/margins_min": -0.1273568570613861, "eval_rewards/margins_std": 0.15159085392951965, "eval_rewards/rejected": 0.10489779710769653, "eval_runtime": 428.06, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 700 }, { "dpo_losses": 0.6701689958572388, "epoch": 0.19, "grad_norm": 8.136188022777755, "learning_rate": 4.8892204128816e-06, "logits/chosen": -2.6619200706481934, "logits/rejected": -2.6518778800964355, "logps/chosen": -194.81417846679688, "logps/rejected": -204.11599731445312, "loss": 0.6688, "positive_losses": 0.038634538650512695, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16319485008716583, "rewards/margins": 0.04956919327378273, "rewards/margins_max": 0.175912007689476, "rewards/margins_min": -0.05440413951873779, "rewards/margins_std": 0.1039264053106308, "rewards/rejected": 0.1136256605386734, "step": 710 }, { "dpo_losses": 0.652790904045105, "epoch": 0.19, "grad_norm": 2.0698475441559903, "learning_rate": 4.882395735324864e-06, "logits/chosen": -2.6227266788482666, "logits/rejected": -2.596557140350342, "logps/chosen": -305.59320068359375, "logps/rejected": -292.477783203125, "loss": 0.6722, "positive_losses": 0.32846182584762573, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.20921404659748077, "rewards/margins": 0.09199421107769012, "rewards/margins_max": 0.3147227168083191, "rewards/margins_min": -0.09670272469520569, "rewards/margins_std": 0.18117788434028625, "rewards/rejected": 0.11721982061862946, "step": 720 }, { "dpo_losses": 0.6393214464187622, "epoch": 0.19, "grad_norm": 2.064984414732266, "learning_rate": 4.87537212840983e-06, "logits/chosen": -2.7411181926727295, "logits/rejected": -2.6920838356018066, "logps/chosen": -307.7519836425781, "logps/rejected": -273.3503112792969, "loss": 0.6824, "positive_losses": 0.33143624663352966, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2093723714351654, "rewards/margins": 0.11846580356359482, "rewards/margins_max": 0.2905295789241791, "rewards/margins_min": -0.041656799614429474, "rewards/margins_std": 0.15608441829681396, "rewards/rejected": 0.09090657532215118, "step": 730 }, { "dpo_losses": 0.6581794619560242, "epoch": 0.19, "grad_norm": 6.430319624056747, "learning_rate": 4.8681501786056545e-06, "logits/chosen": -2.698139190673828, "logits/rejected": -2.7497072219848633, "logps/chosen": -251.01333618164062, "logps/rejected": -292.71453857421875, "loss": 0.6999, "positive_losses": 0.4462181031703949, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1730455905199051, "rewards/margins": 0.07829009741544724, "rewards/margins_max": 0.2488713562488556, "rewards/margins_min": -0.09430457651615143, "rewards/margins_std": 0.15166929364204407, "rewards/rejected": 0.09475548565387726, "step": 740 }, { "dpo_losses": 0.656989336013794, "epoch": 0.2, "grad_norm": 2.0014309532421306, "learning_rate": 4.860730488943068e-06, "logits/chosen": -2.7123594284057617, "logits/rejected": -2.727226972579956, "logps/chosen": -281.8397521972656, "logps/rejected": -279.3072204589844, "loss": 0.6873, "positive_losses": 0.23088416457176208, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20445775985717773, "rewards/margins": 0.07950909435749054, "rewards/margins_max": 0.2337539941072464, "rewards/margins_min": -0.06165488809347153, "rewards/margins_std": 0.13100259006023407, "rewards/rejected": 0.1249486654996872, "step": 750 }, { "dpo_losses": 0.639510989189148, "epoch": 0.2, "grad_norm": 1.8298763786835541, "learning_rate": 4.853113678964022e-06, "logits/chosen": -2.767143487930298, "logits/rejected": -2.704211711883545, "logps/chosen": -256.5793151855469, "logps/rejected": -230.35009765625, "loss": 0.6468, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.2196684181690216, "rewards/margins": 0.11621483415365219, "rewards/margins_max": 0.26115891337394714, "rewards/margins_min": -0.009762251749634743, "rewards/margins_std": 0.12192968279123306, "rewards/rejected": 0.10345359891653061, "step": 760 }, { "dpo_losses": 0.6363869905471802, "epoch": 0.2, "grad_norm": 2.0467806495877507, "learning_rate": 4.845300384669958e-06, "logits/chosen": -2.7408857345581055, "logits/rejected": -2.6877858638763428, "logps/chosen": -298.6504821777344, "logps/rejected": -261.07952880859375, "loss": 0.6629, "positive_losses": 0.1998986303806305, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.214046448469162, "rewards/margins": 0.13136693835258484, "rewards/margins_max": 0.37038570642471313, "rewards/margins_min": -0.10493201017379761, "rewards/margins_std": 0.21196743845939636, "rewards/rejected": 0.08267951756715775, "step": 770 }, { "dpo_losses": 0.6489611864089966, "epoch": 0.2, "grad_norm": 5.9553260238661245, "learning_rate": 4.837291258468701e-06, "logits/chosen": -2.7537689208984375, "logits/rejected": -2.7151620388031006, "logps/chosen": -289.8448791503906, "logps/rejected": -253.50607299804688, "loss": 0.6994, "positive_losses": 0.34298810362815857, "rewards/accuracies": 0.75, "rewards/chosen": 0.19952091574668884, "rewards/margins": 0.09759555757045746, "rewards/margins_max": 0.2846040427684784, "rewards/margins_min": -0.08340279757976532, "rewards/margins_std": 0.16337545216083527, "rewards/rejected": 0.10192535072565079, "step": 780 }, { "dpo_losses": 0.6534120440483093, "epoch": 0.21, "grad_norm": 2.031892410809195, "learning_rate": 4.829086969119984e-06, "logits/chosen": -2.7431013584136963, "logits/rejected": -2.7174015045166016, "logps/chosen": -252.091552734375, "logps/rejected": -253.53713989257812, "loss": 0.7048, "positive_losses": 0.26349717378616333, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1834312379360199, "rewards/margins": 0.08911575376987457, "rewards/margins_max": 0.273346483707428, "rewards/margins_min": -0.08002379536628723, "rewards/margins_std": 0.15820558369159698, "rewards/rejected": 0.09431548416614532, "step": 790 }, { "dpo_losses": 0.6413323879241943, "epoch": 0.21, "grad_norm": 9.333042094818406, "learning_rate": 4.820688201679605e-06, "logits/chosen": -2.733985185623169, "logits/rejected": -2.703796863555908, "logps/chosen": -329.62115478515625, "logps/rejected": -263.99871826171875, "loss": 0.661, "positive_losses": 0.1969766616821289, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22037820518016815, "rewards/margins": 0.11573261022567749, "rewards/margins_max": 0.3472324013710022, "rewards/margins_min": -0.047318510711193085, "rewards/margins_std": 0.1790885031223297, "rewards/rejected": 0.10464560985565186, "step": 800 }, { "epoch": 0.21, "eval_dpo_losses": 0.6538042426109314, "eval_logits/chosen": -2.7244741916656494, "eval_logits/rejected": -2.6850404739379883, "eval_logps/chosen": -264.6508483886719, "eval_logps/rejected": -247.35475158691406, "eval_loss": 0.688547670841217, "eval_positive_losses": 0.17699968814849854, "eval_rewards/accuracies": 0.7020000219345093, "eval_rewards/chosen": 0.19942589104175568, "eval_rewards/margins": 0.08718385547399521, "eval_rewards/margins_max": 0.33879923820495605, "eval_rewards/margins_min": -0.12915974855422974, "eval_rewards/margins_std": 0.15493494272232056, "eval_rewards/rejected": 0.11224202811717987, "eval_runtime": 428.1578, "eval_samples_per_second": 4.671, "eval_steps_per_second": 0.292, "step": 800 }, { "dpo_losses": 0.6463054418563843, "epoch": 0.21, "grad_norm": 30.82180664153932, "learning_rate": 4.8120956574422315e-06, "logits/chosen": -2.770087957382202, "logits/rejected": -2.721579074859619, "logps/chosen": -267.48638916015625, "logps/rejected": -268.20855712890625, "loss": 0.6844, "positive_losses": 0.3319166302680969, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20016130805015564, "rewards/margins": 0.10353595018386841, "rewards/margins_max": 0.31145358085632324, "rewards/margins_min": -0.08192861080169678, "rewards/margins_std": 0.17669261991977692, "rewards/rejected": 0.09662538021802902, "step": 810 }, { "dpo_losses": 0.6368136405944824, "epoch": 0.21, "grad_norm": 2.0363461462731247, "learning_rate": 4.803310053882831e-06, "logits/chosen": -2.727980852127075, "logits/rejected": -2.6585936546325684, "logps/chosen": -241.8242645263672, "logps/rejected": -211.6393585205078, "loss": 0.7019, "positive_losses": 0.810762882232666, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.19861382246017456, "rewards/margins": 0.12479345500469208, "rewards/margins_max": 0.3012546896934509, "rewards/margins_min": -0.02588585577905178, "rewards/margins_std": 0.15163125097751617, "rewards/rejected": 0.07382034510374069, "step": 820 }, { "dpo_losses": 0.6578240394592285, "epoch": 0.22, "grad_norm": 1.949723111570459, "learning_rate": 4.794332124596775e-06, "logits/chosen": -2.742356538772583, "logits/rejected": -2.704735279083252, "logps/chosen": -309.84222412109375, "logps/rejected": -289.576904296875, "loss": 0.7202, "positive_losses": 0.6529260873794556, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.19200512766838074, "rewards/margins": 0.08338505029678345, "rewards/margins_max": 0.2779453992843628, "rewards/margins_min": -0.11795749515295029, "rewards/margins_std": 0.1819847971200943, "rewards/rejected": 0.10862010717391968, "step": 830 }, { "dpo_losses": 0.6642175912857056, "epoch": 0.22, "grad_norm": 9.685621904012176, "learning_rate": 4.785162619238575e-06, "logits/chosen": -2.7567312717437744, "logits/rejected": -2.720520257949829, "logps/chosen": -287.269775390625, "logps/rejected": -249.7572479248047, "loss": 0.6882, "positive_losses": 0.4141426086425781, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19045835733413696, "rewards/margins": 0.06596094369888306, "rewards/margins_max": 0.2540523409843445, "rewards/margins_min": -0.08973310887813568, "rewards/margins_std": 0.15320825576782227, "rewards/rejected": 0.1244974136352539, "step": 840 }, { "dpo_losses": 0.6543776392936707, "epoch": 0.22, "grad_norm": 15.955758440669381, "learning_rate": 4.775802303459288e-06, "logits/chosen": -2.7536678314208984, "logits/rejected": -2.708517551422119, "logps/chosen": -278.27459716796875, "logps/rejected": -261.2574157714844, "loss": 0.6632, "positive_losses": 0.0038405179511755705, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1954740583896637, "rewards/margins": 0.08532064408063889, "rewards/margins_max": 0.23452997207641602, "rewards/margins_min": -0.0665128082036972, "rewards/margins_std": 0.13811759650707245, "rewards/rejected": 0.11015341430902481, "step": 850 }, { "dpo_losses": 0.6685608625411987, "epoch": 0.23, "grad_norm": 11.169216111697574, "learning_rate": 4.766251958842589e-06, "logits/chosen": -2.730212926864624, "logits/rejected": -2.743756055831909, "logps/chosen": -207.6484832763672, "logps/rejected": -218.2662811279297, "loss": 0.6885, "positive_losses": 0.24169044196605682, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1786372810602188, "rewards/margins": 0.054984550923109055, "rewards/margins_max": 0.2055453062057495, "rewards/margins_min": -0.07690045982599258, "rewards/margins_std": 0.1269712895154953, "rewards/rejected": 0.12365271896123886, "step": 860 }, { "dpo_losses": 0.6498798131942749, "epoch": 0.23, "grad_norm": 2.056410382633826, "learning_rate": 4.7565123828395066e-06, "logits/chosen": -2.666647434234619, "logits/rejected": -2.6648640632629395, "logps/chosen": -301.1907958984375, "logps/rejected": -234.89401245117188, "loss": 0.6975, "positive_losses": 0.7158435583114624, "rewards/accuracies": 0.625, "rewards/chosen": 0.22358696162700653, "rewards/margins": 0.12362835556268692, "rewards/margins_max": 0.463988721370697, "rewards/margins_min": -0.09853404015302658, "rewards/margins_std": 0.26094773411750793, "rewards/rejected": 0.0999586284160614, "step": 870 }, { "dpo_losses": 0.6778437495231628, "epoch": 0.23, "grad_norm": 2.009156248724103, "learning_rate": 4.746584388701831e-06, "logits/chosen": -2.7310903072357178, "logits/rejected": -2.728564500808716, "logps/chosen": -251.1389617919922, "logps/rejected": -289.5425720214844, "loss": 0.6921, "positive_losses": 0.04077606275677681, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.18281050026416779, "rewards/margins": 0.03961994871497154, "rewards/margins_max": 0.21845373511314392, "rewards/margins_min": -0.16457554697990417, "rewards/margins_std": 0.1732816994190216, "rewards/rejected": 0.14319053292274475, "step": 880 }, { "dpo_losses": 0.6645732522010803, "epoch": 0.23, "grad_norm": 7.164405406869741, "learning_rate": 4.736468805414218e-06, "logits/chosen": -2.723555564880371, "logits/rejected": -2.708336353302002, "logps/chosen": -291.2392578125, "logps/rejected": -265.4895935058594, "loss": 0.6918, "positive_losses": 0.4872266352176666, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19292163848876953, "rewards/margins": 0.06546325981616974, "rewards/margins_max": 0.25236308574676514, "rewards/margins_min": -0.0974319577217102, "rewards/margins_std": 0.15806356072425842, "rewards/rejected": 0.1274583637714386, "step": 890 }, { "dpo_losses": 0.6708654165267944, "epoch": 0.24, "grad_norm": 1.846600801668145, "learning_rate": 4.7261664776249595e-06, "logits/chosen": -2.7252776622772217, "logits/rejected": -2.707298755645752, "logps/chosen": -245.03970336914062, "logps/rejected": -263.7660827636719, "loss": 0.6736, "positive_losses": 0.04109077528119087, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.19691479206085205, "rewards/margins": 0.05294477939605713, "rewards/margins_max": 0.24138686060905457, "rewards/margins_min": -0.1393672525882721, "rewards/margins_std": 0.16866567730903625, "rewards/rejected": 0.14396999776363373, "step": 900 }, { "epoch": 0.24, "eval_dpo_losses": 0.6556591987609863, "eval_logits/chosen": -2.7201335430145264, "eval_logits/rejected": -2.681368112564087, "eval_logps/chosen": -264.33880615234375, "eval_logps/rejected": -246.65931701660156, "eval_loss": 0.6827124357223511, "eval_positive_losses": 0.15759699046611786, "eval_rewards/accuracies": 0.6940000057220459, "eval_rewards/chosen": 0.20254585146903992, "eval_rewards/margins": 0.08334962278604507, "eval_rewards/margins_max": 0.33453845977783203, "eval_rewards/margins_min": -0.13353128731250763, "eval_rewards/margins_std": 0.15607841312885284, "eval_rewards/rejected": 0.11919621378183365, "eval_runtime": 428.1043, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 900 }, { "dpo_losses": 0.6529034376144409, "epoch": 0.24, "grad_norm": 11.924980908904399, "learning_rate": 4.715678265575463e-06, "logits/chosen": -2.732459783554077, "logits/rejected": -2.6982932090759277, "logps/chosen": -246.77554321289062, "logps/rejected": -207.98641967773438, "loss": 0.6815, "positive_losses": 0.10796146094799042, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.20487447082996368, "rewards/margins": 0.08838620036840439, "rewards/margins_max": 0.2575072646141052, "rewards/margins_min": -0.06950771808624268, "rewards/margins_std": 0.1464182436466217, "rewards/rejected": 0.1164882630109787, "step": 910 }, { "dpo_losses": 0.6596372723579407, "epoch": 0.24, "grad_norm": 8.42476750174722, "learning_rate": 4.705005045028415e-06, "logits/chosen": -2.6776070594787598, "logits/rejected": -2.6785919666290283, "logps/chosen": -262.73541259765625, "logps/rejected": -258.1957092285156, "loss": 0.6866, "positive_losses": 0.18081608414649963, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.19900774955749512, "rewards/margins": 0.07932322472333908, "rewards/margins_max": 0.27794817090034485, "rewards/margins_min": -0.09649928659200668, "rewards/margins_std": 0.16794349253177643, "rewards/rejected": 0.11968453228473663, "step": 920 }, { "dpo_losses": 0.6416321992874146, "epoch": 0.24, "grad_norm": 9.065324435269773, "learning_rate": 4.694147707194659e-06, "logits/chosen": -2.7121102809906006, "logits/rejected": -2.673405885696411, "logps/chosen": -307.6800231933594, "logps/rejected": -261.2682189941406, "loss": 0.6748, "positive_losses": 0.24066261947155, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.21907511353492737, "rewards/margins": 0.11731608957052231, "rewards/margins_max": 0.34327879548072815, "rewards/margins_min": -0.11537656933069229, "rewards/margins_std": 0.20834875106811523, "rewards/rejected": 0.10175903886556625, "step": 930 }, { "dpo_losses": 0.6601050496101379, "epoch": 0.25, "grad_norm": 19.494077125341175, "learning_rate": 4.683107158658782e-06, "logits/chosen": -2.758807897567749, "logits/rejected": -2.750800609588623, "logps/chosen": -257.49993896484375, "logps/rejected": -261.9332580566406, "loss": 0.7437, "positive_losses": 0.3766586184501648, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.19478636980056763, "rewards/margins": 0.07299786806106567, "rewards/margins_max": 0.23471426963806152, "rewards/margins_min": -0.09077923744916916, "rewards/margins_std": 0.14624808728694916, "rewards/rejected": 0.12178850173950195, "step": 940 }, { "dpo_losses": 0.6490548849105835, "epoch": 0.25, "grad_norm": 2.033985819119182, "learning_rate": 4.671884321303407e-06, "logits/chosen": -2.7479376792907715, "logits/rejected": -2.76665997505188, "logps/chosen": -247.3947296142578, "logps/rejected": -275.3429260253906, "loss": 0.6816, "positive_losses": 0.367349237203598, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.19129522144794464, "rewards/margins": 0.09990672767162323, "rewards/margins_max": 0.3187563121318817, "rewards/margins_min": -0.10026586055755615, "rewards/margins_std": 0.1873435080051422, "rewards/rejected": 0.09138850122690201, "step": 950 }, { "dpo_losses": 0.6453216075897217, "epoch": 0.25, "grad_norm": 1.9967649641449399, "learning_rate": 4.660480132232224e-06, "logits/chosen": -2.734900951385498, "logits/rejected": -2.645411252975464, "logps/chosen": -311.1207580566406, "logps/rejected": -248.730224609375, "loss": 0.6846, "positive_losses": 0.4737134873867035, "rewards/accuracies": 0.75, "rewards/chosen": 0.1866598129272461, "rewards/margins": 0.10710735619068146, "rewards/margins_max": 0.30022764205932617, "rewards/margins_min": -0.10734431445598602, "rewards/margins_std": 0.1846868395805359, "rewards/rejected": 0.07955245673656464, "step": 960 }, { "dpo_losses": 0.6419434547424316, "epoch": 0.25, "grad_norm": 13.359467516127022, "learning_rate": 4.6488955436917414e-06, "logits/chosen": -2.7888317108154297, "logits/rejected": -2.78301739692688, "logps/chosen": -292.08740234375, "logps/rejected": -263.8045349121094, "loss": 0.6857, "positive_losses": 0.4418838620185852, "rewards/accuracies": 0.75, "rewards/chosen": 0.18865352869033813, "rewards/margins": 0.11351042985916138, "rewards/margins_max": 0.2997412085533142, "rewards/margins_min": -0.07673824578523636, "rewards/margins_std": 0.16734425723552704, "rewards/rejected": 0.07514312863349915, "step": 970 }, { "dpo_losses": 0.6530637145042419, "epoch": 0.26, "grad_norm": 6.4673183165262085, "learning_rate": 4.6371315229917644e-06, "logits/chosen": -2.809450626373291, "logits/rejected": -2.80094313621521, "logps/chosen": -261.11688232421875, "logps/rejected": -223.17453002929688, "loss": 0.6668, "positive_losses": 0.17321071028709412, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.18977181613445282, "rewards/margins": 0.08781514316797256, "rewards/margins_max": 0.2462121695280075, "rewards/margins_min": -0.07525520026683807, "rewards/margins_std": 0.1442008763551712, "rewards/rejected": 0.10195668041706085, "step": 980 }, { "dpo_losses": 0.6565328240394592, "epoch": 0.26, "grad_norm": 8.967947362413515, "learning_rate": 4.625189052424638e-06, "logits/chosen": -2.6529622077941895, "logits/rejected": -2.6158525943756104, "logps/chosen": -274.95526123046875, "logps/rejected": -258.0525817871094, "loss": 0.836, "positive_losses": 3.846707582473755, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16130439937114716, "rewards/margins": 0.08392287790775299, "rewards/margins_max": 0.26575109362602234, "rewards/margins_min": -0.1301548182964325, "rewards/margins_std": 0.18032459914684296, "rewards/rejected": 0.07738152891397476, "step": 990 }, { "dpo_losses": 0.6670268774032593, "epoch": 0.26, "grad_norm": 11.870740909736584, "learning_rate": 4.613069129183218e-06, "logits/chosen": -2.7265543937683105, "logits/rejected": -2.713296890258789, "logps/chosen": -253.14404296875, "logps/rejected": -242.2244110107422, "loss": 0.6998, "positive_losses": 0.3402255177497864, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.184846431016922, "rewards/margins": 0.061308927834033966, "rewards/margins_max": 0.2561453878879547, "rewards/margins_min": -0.1173776239156723, "rewards/margins_std": 0.16720959544181824, "rewards/rejected": 0.12353750318288803, "step": 1000 }, { "epoch": 0.26, "eval_dpo_losses": 0.651651918888092, "eval_logits/chosen": -2.7190160751342773, "eval_logits/rejected": -2.683014392852783, "eval_logps/chosen": -264.2192077636719, "eval_logps/rejected": -247.42453002929688, "eval_loss": 0.6805807948112488, "eval_positive_losses": 0.21309128403663635, "eval_rewards/accuracies": 0.7070000171661377, "eval_rewards/chosen": 0.20374199748039246, "eval_rewards/margins": 0.09219793230295181, "eval_rewards/margins_max": 0.34986335039138794, "eval_rewards/margins_min": -0.133478045463562, "eval_rewards/margins_std": 0.16154462099075317, "eval_rewards/rejected": 0.11154407262802124, "eval_runtime": 428.3508, "eval_samples_per_second": 4.669, "eval_steps_per_second": 0.292, "step": 1000 }, { "dpo_losses": 0.6370615363121033, "epoch": 0.26, "grad_norm": 11.008320869094142, "learning_rate": 4.600772765277607e-06, "logits/chosen": -2.7868447303771973, "logits/rejected": -2.746675729751587, "logps/chosen": -266.67388916015625, "logps/rejected": -269.6067810058594, "loss": 0.7051, "positive_losses": 0.28719252347946167, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.19845828413963318, "rewards/margins": 0.202920600771904, "rewards/margins_max": 0.6908767819404602, "rewards/margins_min": -0.03486362472176552, "rewards/margins_std": 0.3422602117061615, "rewards/rejected": -0.0044622840359807014, "step": 1010 }, { "dpo_losses": 0.669637143611908, "epoch": 0.27, "grad_norm": 10.704096528671498, "learning_rate": 4.588300987450652e-06, "logits/chosen": -2.775867462158203, "logits/rejected": -2.7486350536346436, "logps/chosen": -265.61761474609375, "logps/rejected": -266.26904296875, "loss": 0.7002, "positive_losses": 0.5554599761962891, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.18071797490119934, "rewards/margins": 0.05442746728658676, "rewards/margins_max": 0.21861937642097473, "rewards/margins_min": -0.11456086486577988, "rewards/margins_std": 0.1543532758951187, "rewards/rejected": 0.12629050016403198, "step": 1020 }, { "dpo_losses": 0.6489515900611877, "epoch": 0.27, "grad_norm": 2.235231995457484, "learning_rate": 4.5756548370922136e-06, "logits/chosen": -2.7182843685150146, "logits/rejected": -2.717958927154541, "logps/chosen": -278.40966796875, "logps/rejected": -283.611328125, "loss": 0.6727, "positive_losses": 0.07070960849523544, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21360652148723602, "rewards/margins": 0.10068871825933456, "rewards/margins_max": 0.30906373262405396, "rewards/margins_min": -0.11298646777868271, "rewards/margins_std": 0.1882849931716919, "rewards/rejected": 0.11291780322790146, "step": 1030 }, { "dpo_losses": 0.640581488609314, "epoch": 0.27, "grad_norm": 1.838801840149293, "learning_rate": 4.562835370152206e-06, "logits/chosen": -2.682776927947998, "logits/rejected": -2.652805805206299, "logps/chosen": -254.6275634765625, "logps/rejected": -233.42239379882812, "loss": 0.6677, "positive_losses": 0.03793792799115181, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.22730672359466553, "rewards/margins": 0.11742307990789413, "rewards/margins_max": 0.32565218210220337, "rewards/margins_min": -0.05968831852078438, "rewards/margins_std": 0.17563822865486145, "rewards/rejected": 0.1098836287856102, "step": 1040 }, { "dpo_losses": 0.6529589891433716, "epoch": 0.27, "grad_norm": 19.3033959123777, "learning_rate": 4.54984365705243e-06, "logits/chosen": -2.706040859222412, "logits/rejected": -2.6679813861846924, "logps/chosen": -295.9115905761719, "logps/rejected": -298.58062744140625, "loss": 0.6748, "positive_losses": 0.3477066159248352, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2416692078113556, "rewards/margins": 0.0935961976647377, "rewards/margins_max": 0.3274371325969696, "rewards/margins_min": -0.13003945350646973, "rewards/margins_std": 0.201436847448349, "rewards/rejected": 0.14807303249835968, "step": 1050 }, { "dpo_losses": 0.627490222454071, "epoch": 0.28, "grad_norm": 1.928168930962888, "learning_rate": 4.536680782597191e-06, "logits/chosen": -2.7081384658813477, "logits/rejected": -2.6851656436920166, "logps/chosen": -305.4207458496094, "logps/rejected": -286.4194641113281, "loss": 0.6566, "positive_losses": 0.12177524715662003, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22412219643592834, "rewards/margins": 0.1465277522802353, "rewards/margins_max": 0.371377557516098, "rewards/margins_min": -0.0201462060213089, "rewards/margins_std": 0.17978012561798096, "rewards/rejected": 0.07759441435337067, "step": 1060 }, { "dpo_losses": 0.6365917921066284, "epoch": 0.28, "grad_norm": 8.350123567866095, "learning_rate": 4.523347845882718e-06, "logits/chosen": -2.75634503364563, "logits/rejected": -2.7587296962738037, "logps/chosen": -230.2615966796875, "logps/rejected": -232.2123565673828, "loss": 0.7026, "positive_losses": 0.3845987319946289, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.18936094641685486, "rewards/margins": 0.12512774765491486, "rewards/margins_max": 0.33139970898628235, "rewards/margins_min": -0.08519905805587769, "rewards/margins_std": 0.1838373988866806, "rewards/rejected": 0.06423317641019821, "step": 1070 }, { "dpo_losses": 0.655931830406189, "epoch": 0.28, "grad_norm": 1.8322517571395962, "learning_rate": 4.50984596020539e-06, "logits/chosen": -2.702406167984009, "logits/rejected": -2.5898425579071045, "logps/chosen": -267.80548095703125, "logps/rejected": -243.29324340820312, "loss": 0.7019, "positive_losses": 0.3977195620536804, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21579650044441223, "rewards/margins": 0.08627926558256149, "rewards/margins_max": 0.27977439761161804, "rewards/margins_min": -0.13393890857696533, "rewards/margins_std": 0.1862020194530487, "rewards/rejected": 0.12951722741127014, "step": 1080 }, { "dpo_losses": 0.6420449614524841, "epoch": 0.29, "grad_norm": 2.10960782496911, "learning_rate": 4.4961762529687745e-06, "logits/chosen": -2.7220356464385986, "logits/rejected": -2.690786600112915, "logps/chosen": -246.0760955810547, "logps/rejected": -221.6387176513672, "loss": 0.6464, "positive_losses": 0.06222038343548775, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.21960726380348206, "rewards/margins": 0.11451101303100586, "rewards/margins_max": 0.30883947014808655, "rewards/margins_min": -0.07992779463529587, "rewards/margins_std": 0.1761944591999054, "rewards/rejected": 0.105096235871315, "step": 1090 }, { "dpo_losses": 0.6517956852912903, "epoch": 0.29, "grad_norm": 6.732583641895375, "learning_rate": 4.482339865589492e-06, "logits/chosen": -2.6478145122528076, "logits/rejected": -2.6480424404144287, "logps/chosen": -278.41357421875, "logps/rejected": -245.4984130859375, "loss": 0.6943, "positive_losses": 0.2920181155204773, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2183549404144287, "rewards/margins": 0.09904515743255615, "rewards/margins_max": 0.34376025199890137, "rewards/margins_min": -0.08190792798995972, "rewards/margins_std": 0.19108134508132935, "rewards/rejected": 0.11930978298187256, "step": 1100 }, { "epoch": 0.29, "eval_dpo_losses": 0.6502726078033447, "eval_logits/chosen": -2.6978962421417236, "eval_logits/rejected": -2.6632769107818604, "eval_logps/chosen": -263.5789489746094, "eval_logps/rejected": -247.13441467285156, "eval_loss": 0.6807675957679749, "eval_positive_losses": 0.2124890238046646, "eval_rewards/accuracies": 0.7099999785423279, "eval_rewards/chosen": 0.21014489233493805, "eval_rewards/margins": 0.09569980204105377, "eval_rewards/margins_max": 0.3629496395587921, "eval_rewards/margins_min": -0.1371210664510727, "eval_rewards/margins_std": 0.16737282276153564, "eval_rewards/rejected": 0.11444510519504547, "eval_runtime": 429.0088, "eval_samples_per_second": 4.662, "eval_steps_per_second": 0.291, "step": 1100 }, { "dpo_losses": 0.6413752436637878, "epoch": 0.29, "grad_norm": 2.1748893183465925, "learning_rate": 4.468337953401909e-06, "logits/chosen": -2.738788604736328, "logits/rejected": -2.723175287246704, "logps/chosen": -265.66070556640625, "logps/rejected": -256.4477844238281, "loss": 0.6548, "positive_losses": 0.08787040412425995, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21674147248268127, "rewards/margins": 0.1143731027841568, "rewards/margins_max": 0.3031662106513977, "rewards/margins_min": -0.05679730698466301, "rewards/margins_std": 0.16097629070281982, "rewards/rejected": 0.10236841440200806, "step": 1110 }, { "dpo_losses": 0.6477451920509338, "epoch": 0.29, "grad_norm": 14.056073835531809, "learning_rate": 4.45417168556166e-06, "logits/chosen": -2.738856792449951, "logits/rejected": -2.695164918899536, "logps/chosen": -268.19818115234375, "logps/rejected": -223.90090942382812, "loss": 0.6943, "positive_losses": 0.43545445799827576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2281276285648346, "rewards/margins": 0.10189314186573029, "rewards/margins_max": 0.29055875539779663, "rewards/margins_min": -0.06951011717319489, "rewards/margins_std": 0.16105321049690247, "rewards/rejected": 0.1262345016002655, "step": 1120 }, { "dpo_losses": 0.6370912194252014, "epoch": 0.3, "grad_norm": 2.132578742663081, "learning_rate": 4.439842244948036e-06, "logits/chosen": -2.7594432830810547, "logits/rejected": -2.716683864593506, "logps/chosen": -284.3851318359375, "logps/rejected": -248.4437255859375, "loss": 0.6533, "positive_losses": 0.10260801017284393, "rewards/accuracies": 0.8125, "rewards/chosen": 0.21433600783348083, "rewards/margins": 0.1223423033952713, "rewards/margins_max": 0.29776710271835327, "rewards/margins_min": -0.0162980817258358, "rewards/margins_std": 0.14208626747131348, "rewards/rejected": 0.09199371933937073, "step": 1130 }, { "dpo_losses": 0.6386234760284424, "epoch": 0.3, "grad_norm": 16.81325418702227, "learning_rate": 4.425350828065204e-06, "logits/chosen": -2.7315094470977783, "logits/rejected": -2.699362277984619, "logps/chosen": -231.3442840576172, "logps/rejected": -217.38253784179688, "loss": 0.6649, "positive_losses": 0.36649513244628906, "rewards/accuracies": 0.75, "rewards/chosen": 0.20520725846290588, "rewards/margins": 0.12019307911396027, "rewards/margins_max": 0.32323652505874634, "rewards/margins_min": -0.058698803186416626, "rewards/margins_std": 0.16871722042560577, "rewards/rejected": 0.08501417934894562, "step": 1140 }, { "dpo_losses": 0.6301491856575012, "epoch": 0.3, "grad_norm": 2.1136794640555174, "learning_rate": 4.410698644942303e-06, "logits/chosen": -2.7129852771759033, "logits/rejected": -2.7118496894836426, "logps/chosen": -285.33026123046875, "logps/rejected": -243.83712768554688, "loss": 0.6547, "positive_losses": 0.145775705575943, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23448574542999268, "rewards/margins": 0.14068496227264404, "rewards/margins_max": 0.3842105567455292, "rewards/margins_min": -0.058680903166532516, "rewards/margins_std": 0.19865915179252625, "rewards/rejected": 0.09380079060792923, "step": 1150 }, { "dpo_losses": 0.644921600818634, "epoch": 0.3, "grad_norm": 2.066305637614414, "learning_rate": 4.395886919032406e-06, "logits/chosen": -2.648340940475464, "logits/rejected": -2.6257126331329346, "logps/chosen": -215.16940307617188, "logps/rejected": -204.62840270996094, "loss": 0.6784, "positive_losses": 0.661210298538208, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.18664805591106415, "rewards/margins": 0.10691721737384796, "rewards/margins_max": 0.3014773428440094, "rewards/margins_min": -0.06276218593120575, "rewards/margins_std": 0.1590488702058792, "rewards/rejected": 0.07973084598779678, "step": 1160 }, { "dpo_losses": 0.6426645517349243, "epoch": 0.31, "grad_norm": 2.084542368699148, "learning_rate": 4.380916887110366e-06, "logits/chosen": -2.7288904190063477, "logits/rejected": -2.689663887023926, "logps/chosen": -243.3949737548828, "logps/rejected": -254.04248046875, "loss": 0.6843, "positive_losses": 0.3210752606391907, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19764408469200134, "rewards/margins": 0.11319931596517563, "rewards/margins_max": 0.3080732226371765, "rewards/margins_min": -0.07896386086940765, "rewards/margins_std": 0.17322476208209991, "rewards/rejected": 0.08444477617740631, "step": 1170 }, { "dpo_losses": 0.6467557549476624, "epoch": 0.31, "grad_norm": 1.906405208084926, "learning_rate": 4.365789799169539e-06, "logits/chosen": -2.699143171310425, "logits/rejected": -2.661818027496338, "logps/chosen": -244.04592895507812, "logps/rejected": -226.03610229492188, "loss": 0.6639, "positive_losses": 0.2675541937351227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.20552949607372284, "rewards/margins": 0.10467071831226349, "rewards/margins_max": 0.30532926321029663, "rewards/margins_min": -0.07091771066188812, "rewards/margins_std": 0.16842308640480042, "rewards/rejected": 0.10085882246494293, "step": 1180 }, { "dpo_losses": 0.6682693958282471, "epoch": 0.31, "grad_norm": 15.821492047076443, "learning_rate": 4.350506918317416e-06, "logits/chosen": -2.6909573078155518, "logits/rejected": -2.681933641433716, "logps/chosen": -250.02426147460938, "logps/rejected": -231.6263885498047, "loss": 0.7676, "positive_losses": 1.3114748001098633, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20274274051189423, "rewards/margins": 0.059476565569639206, "rewards/margins_max": 0.23862798511981964, "rewards/margins_min": -0.12899455428123474, "rewards/margins_std": 0.16008736193180084, "rewards/rejected": 0.14326617121696472, "step": 1190 }, { "dpo_losses": 0.6668694615364075, "epoch": 0.31, "grad_norm": 1.8757632291628046, "learning_rate": 4.335069520670149e-06, "logits/chosen": -2.693537473678589, "logits/rejected": -2.671903133392334, "logps/chosen": -259.87274169921875, "logps/rejected": -235.5923614501953, "loss": 0.6761, "positive_losses": 0.11368007957935333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22806064784526825, "rewards/margins": 0.06635666638612747, "rewards/margins_max": 0.28212258219718933, "rewards/margins_min": -0.13651351630687714, "rewards/margins_std": 0.18843720853328705, "rewards/rejected": 0.16170397400856018, "step": 1200 }, { "epoch": 0.31, "eval_dpo_losses": 0.6510934233665466, "eval_logits/chosen": -2.6915640830993652, "eval_logits/rejected": -2.6573286056518555, "eval_logps/chosen": -263.0201110839844, "eval_logps/rejected": -246.42547607421875, "eval_loss": 0.6793044209480286, "eval_positive_losses": 0.18975259363651276, "eval_rewards/accuracies": 0.7110000252723694, "eval_rewards/chosen": 0.21573299169540405, "eval_rewards/margins": 0.09419818967580795, "eval_rewards/margins_max": 0.3704459071159363, "eval_rewards/margins_min": -0.13656333088874817, "eval_rewards/margins_std": 0.16917268931865692, "eval_rewards/rejected": 0.1215347945690155, "eval_runtime": 428.1461, "eval_samples_per_second": 4.671, "eval_steps_per_second": 0.292, "step": 1200 }, { "dpo_losses": 0.6602927446365356, "epoch": 0.32, "grad_norm": 1.772652069799707, "learning_rate": 4.319478895246e-06, "logits/chosen": -2.7384231090545654, "logits/rejected": -2.7090110778808594, "logps/chosen": -262.98516845703125, "logps/rejected": -229.734130859375, "loss": 0.6552, "positive_losses": 0.04911189153790474, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.23368725180625916, "rewards/margins": 0.0766754299402237, "rewards/margins_max": 0.3011917471885681, "rewards/margins_min": -0.13092352449893951, "rewards/margins_std": 0.19443733990192413, "rewards/rejected": 0.15701182186603546, "step": 1210 }, { "dpo_losses": 0.6473007202148438, "epoch": 0.32, "grad_norm": 1.963837565042628, "learning_rate": 4.303736343857704e-06, "logits/chosen": -2.7466750144958496, "logits/rejected": -2.720334529876709, "logps/chosen": -275.8536071777344, "logps/rejected": -256.60882568359375, "loss": 0.7014, "positive_losses": 0.2650478482246399, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21415309607982635, "rewards/margins": 0.10407302528619766, "rewards/margins_max": 0.32783380150794983, "rewards/margins_min": -0.06663999706506729, "rewards/margins_std": 0.17963366210460663, "rewards/rejected": 0.1100800633430481, "step": 1220 }, { "dpo_losses": 0.6583267450332642, "epoch": 0.32, "grad_norm": 1.7006431484971423, "learning_rate": 4.287843181003772e-06, "logits/chosen": -2.664750337600708, "logits/rejected": -2.6802029609680176, "logps/chosen": -206.2310028076172, "logps/rejected": -220.775390625, "loss": 0.6694, "positive_losses": 0.01693267747759819, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19193141162395477, "rewards/margins": 0.07765965163707733, "rewards/margins_max": 0.2502481937408447, "rewards/margins_min": -0.0685039535164833, "rewards/margins_std": 0.14312848448753357, "rewards/rejected": 0.11427175998687744, "step": 1230 }, { "dpo_losses": 0.6376180052757263, "epoch": 0.32, "grad_norm": 1.6513813363342713, "learning_rate": 4.27180073375873e-06, "logits/chosen": -2.627671003341675, "logits/rejected": -2.6573538780212402, "logps/chosen": -227.34194946289062, "logps/rejected": -215.28567504882812, "loss": 0.6641, "positive_losses": 0.25857123732566833, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.21966569125652313, "rewards/margins": 0.12344787269830704, "rewards/margins_max": 0.318206787109375, "rewards/margins_min": -0.02818525768816471, "rewards/margins_std": 0.15635992586612701, "rewards/rejected": 0.0962178111076355, "step": 1240 }, { "dpo_losses": 0.6496556401252747, "epoch": 0.33, "grad_norm": 2.4268975138989184, "learning_rate": 4.255610341662304e-06, "logits/chosen": -2.6240971088409424, "logits/rejected": -2.648489475250244, "logps/chosen": -251.15036010742188, "logps/rejected": -266.73590087890625, "loss": 0.7239, "positive_losses": 1.152416467666626, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.19764620065689087, "rewards/margins": 0.10158193111419678, "rewards/margins_max": 0.31982916593551636, "rewards/margins_min": -0.09628921747207642, "rewards/margins_std": 0.18527303636074066, "rewards/rejected": 0.09606426954269409, "step": 1250 }, { "dpo_losses": 0.6284223794937134, "epoch": 0.33, "grad_norm": 12.252363950000884, "learning_rate": 4.2392733566075764e-06, "logits/chosen": -2.7110676765441895, "logits/rejected": -2.7120535373687744, "logps/chosen": -227.1322021484375, "logps/rejected": -213.12319946289062, "loss": 0.7199, "positive_losses": 0.46507692337036133, "rewards/accuracies": 0.75, "rewards/chosen": 0.20356687903404236, "rewards/margins": 0.1450451761484146, "rewards/margins_max": 0.3519541621208191, "rewards/margins_min": -0.06976257264614105, "rewards/margins_std": 0.1865970492362976, "rewards/rejected": 0.05852172523736954, "step": 1260 }, { "dpo_losses": 0.6484035849571228, "epoch": 0.33, "grad_norm": 1.7312466700552855, "learning_rate": 4.2227911427280975e-06, "logits/chosen": -2.763777494430542, "logits/rejected": -2.7636969089508057, "logps/chosen": -290.2239074707031, "logps/rejected": -288.4922180175781, "loss": 0.6953, "positive_losses": 0.015831470489501953, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.23205384612083435, "rewards/margins": 0.1020529493689537, "rewards/margins_max": 0.3355284333229065, "rewards/margins_min": -0.08288715034723282, "rewards/margins_std": 0.18504774570465088, "rewards/rejected": 0.13000090420246124, "step": 1270 }, { "dpo_losses": 0.6495410203933716, "epoch": 0.33, "grad_norm": 10.634064521483323, "learning_rate": 4.206165076283983e-06, "logits/chosen": -2.647918701171875, "logits/rejected": -2.6600279808044434, "logps/chosen": -258.177978515625, "logps/rejected": -247.9029541015625, "loss": 0.6737, "positive_losses": 0.13902759552001953, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21103759109973907, "rewards/margins": 0.09863315522670746, "rewards/margins_max": 0.28625327348709106, "rewards/margins_min": -0.08542687445878983, "rewards/margins_std": 0.16910138726234436, "rewards/rejected": 0.11240440607070923, "step": 1280 }, { "dpo_losses": 0.6440567374229431, "epoch": 0.34, "grad_norm": 14.369690553098103, "learning_rate": 4.189396545546995e-06, "logits/chosen": -2.648498296737671, "logits/rejected": -2.643078327178955, "logps/chosen": -247.05038452148438, "logps/rejected": -245.91970825195312, "loss": 0.7193, "positive_losses": 0.5839151740074158, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2182537019252777, "rewards/margins": 0.12008634954690933, "rewards/margins_max": 0.3548484444618225, "rewards/margins_min": -0.09168001264333725, "rewards/margins_std": 0.20228728652000427, "rewards/rejected": 0.09816733002662659, "step": 1290 }, { "dpo_losses": 0.6662293672561646, "epoch": 0.34, "grad_norm": 1.8886050949204087, "learning_rate": 4.172486950684627e-06, "logits/chosen": -2.685309648513794, "logits/rejected": -2.6278042793273926, "logps/chosen": -181.18350219726562, "logps/rejected": -209.67837524414062, "loss": 0.6976, "positive_losses": 0.6295714378356934, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1815871298313141, "rewards/margins": 0.06246403604745865, "rewards/margins_max": 0.23650658130645752, "rewards/margins_min": -0.12367131561040878, "rewards/margins_std": 0.15985225141048431, "rewards/rejected": 0.11912310123443604, "step": 1300 }, { "epoch": 0.34, "eval_dpo_losses": 0.6535477042198181, "eval_logits/chosen": -2.664144277572632, "eval_logits/rejected": -2.628232717514038, "eval_logps/chosen": -262.81219482421875, "eval_logps/rejected": -245.60546875, "eval_loss": 0.6730425953865051, "eval_positive_losses": 0.11943159997463226, "eval_rewards/accuracies": 0.7080000042915344, "eval_rewards/chosen": 0.2178124189376831, "eval_rewards/margins": 0.08807788044214249, "eval_rewards/margins_max": 0.3433879613876343, "eval_rewards/margins_min": -0.13216106593608856, "eval_rewards/margins_std": 0.1594102680683136, "eval_rewards/rejected": 0.12973454594612122, "eval_runtime": 428.1365, "eval_samples_per_second": 4.671, "eval_steps_per_second": 0.292, "step": 1300 }, { "dpo_losses": 0.6625665426254272, "epoch": 0.34, "grad_norm": 14.739595008369198, "learning_rate": 4.155437703643182e-06, "logits/chosen": -2.6630187034606934, "logits/rejected": -2.617025852203369, "logps/chosen": -251.9992218017578, "logps/rejected": -239.4009246826172, "loss": 0.6807, "positive_losses": 0.28922000527381897, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20277170836925507, "rewards/margins": 0.07075655460357666, "rewards/margins_max": 0.2553870975971222, "rewards/margins_min": -0.10119247436523438, "rewards/margins_std": 0.16121159493923187, "rewards/rejected": 0.1320151537656784, "step": 1310 }, { "dpo_losses": 0.6581068634986877, "epoch": 0.35, "grad_norm": 15.239631505000478, "learning_rate": 4.138250228029882e-06, "logits/chosen": -2.713186502456665, "logits/rejected": -2.6854805946350098, "logps/chosen": -265.56597900390625, "logps/rejected": -242.75222778320312, "loss": 0.6827, "positive_losses": 0.1938123255968094, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22281205654144287, "rewards/margins": 0.08136356621980667, "rewards/margins_max": 0.2923230230808258, "rewards/margins_min": -0.1347009688615799, "rewards/margins_std": 0.1873144507408142, "rewards/rejected": 0.1414484828710556, "step": 1320 }, { "dpo_losses": 0.6390641927719116, "epoch": 0.35, "grad_norm": 1.8551365605185532, "learning_rate": 4.120925958993994e-06, "logits/chosen": -2.6785247325897217, "logits/rejected": -2.677243232727051, "logps/chosen": -270.0191955566406, "logps/rejected": -263.7103576660156, "loss": 0.6983, "positive_losses": 0.18178720772266388, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.23928770422935486, "rewards/margins": 0.1227588877081871, "rewards/margins_max": 0.32545411586761475, "rewards/margins_min": -0.08336476981639862, "rewards/margins_std": 0.18180139362812042, "rewards/rejected": 0.11652884632349014, "step": 1330 }, { "dpo_losses": 0.6413576602935791, "epoch": 0.35, "grad_norm": 3.235333729057778, "learning_rate": 4.103466343106999e-06, "logits/chosen": -2.545454502105713, "logits/rejected": -2.519524335861206, "logps/chosen": -324.9889221191406, "logps/rejected": -270.2016906738281, "loss": 0.6622, "positive_losses": 0.10343074798583984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.242496058344841, "rewards/margins": 0.1184769719839096, "rewards/margins_max": 0.3343360722064972, "rewards/margins_min": -0.08624882996082306, "rewards/margins_std": 0.1914907842874527, "rewards/rejected": 0.1240190863609314, "step": 1340 }, { "dpo_losses": 0.6509544849395752, "epoch": 0.35, "grad_norm": 8.931457611084193, "learning_rate": 4.085872838241797e-06, "logits/chosen": -2.7083029747009277, "logits/rejected": -2.661639451980591, "logps/chosen": -294.5940856933594, "logps/rejected": -282.7342224121094, "loss": 0.6798, "positive_losses": 0.41289058327674866, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2159956395626068, "rewards/margins": 0.09552866220474243, "rewards/margins_max": 0.32265013456344604, "rewards/margins_min": -0.10738413035869598, "rewards/margins_std": 0.18789581954479218, "rewards/rejected": 0.12046699225902557, "step": 1350 }, { "dpo_losses": 0.658233642578125, "epoch": 0.36, "grad_norm": 2.2726473664195037, "learning_rate": 4.06814691345098e-06, "logits/chosen": -2.6470699310302734, "logits/rejected": -2.6591906547546387, "logps/chosen": -254.8450469970703, "logps/rejected": -259.53704833984375, "loss": 0.6959, "positive_losses": 0.18143853545188904, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22105729579925537, "rewards/margins": 0.07999229431152344, "rewards/margins_max": 0.28359168767929077, "rewards/margins_min": -0.0992380753159523, "rewards/margins_std": 0.16937807202339172, "rewards/rejected": 0.14106498658657074, "step": 1360 }, { "dpo_losses": 0.645255982875824, "epoch": 0.36, "grad_norm": 1.8297135713643442, "learning_rate": 4.050290048844171e-06, "logits/chosen": -2.7332587242126465, "logits/rejected": -2.6860995292663574, "logps/chosen": -276.13128662109375, "logps/rejected": -242.00167846679688, "loss": 0.6569, "positive_losses": 0.025147819891572, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2587778866291046, "rewards/margins": 0.10525497049093246, "rewards/margins_max": 0.2734186351299286, "rewards/margins_min": -0.04697667807340622, "rewards/margins_std": 0.14240820705890656, "rewards/rejected": 0.15352290868759155, "step": 1370 }, { "dpo_losses": 0.6353882551193237, "epoch": 0.36, "grad_norm": 5.55038006142768, "learning_rate": 4.032303735464422e-06, "logits/chosen": -2.6713452339172363, "logits/rejected": -2.6468327045440674, "logps/chosen": -265.26678466796875, "logps/rejected": -232.63088989257812, "loss": 0.6595, "positive_losses": 0.20324555039405823, "rewards/accuracies": 0.75, "rewards/chosen": 0.2500496804714203, "rewards/margins": 0.12701056897640228, "rewards/margins_max": 0.3361486792564392, "rewards/margins_min": -0.03808742016553879, "rewards/margins_std": 0.16679345071315765, "rewards/rejected": 0.12303910404443741, "step": 1380 }, { "dpo_losses": 0.642040491104126, "epoch": 0.36, "grad_norm": 2.0037947655315964, "learning_rate": 4.014189475163727e-06, "logits/chosen": -2.737389326095581, "logits/rejected": -2.705068588256836, "logps/chosen": -235.0084991455078, "logps/rejected": -231.0500946044922, "loss": 0.6591, "positive_losses": 0.02664165571331978, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21402215957641602, "rewards/margins": 0.11353866010904312, "rewards/margins_max": 0.2948105037212372, "rewards/margins_min": -0.08596575260162354, "rewards/margins_std": 0.17296113073825836, "rewards/rejected": 0.1004834994673729, "step": 1390 }, { "dpo_losses": 0.6551258563995361, "epoch": 0.37, "grad_norm": 6.662737769487926, "learning_rate": 3.995948780477605e-06, "logits/chosen": -2.690420150756836, "logits/rejected": -2.6233978271484375, "logps/chosen": -256.7580261230469, "logps/rejected": -224.26107788085938, "loss": 0.7536, "positive_losses": 1.4171825647354126, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19240829348564148, "rewards/margins": 0.08824966102838516, "rewards/margins_max": 0.2761348783969879, "rewards/margins_min": -0.11071120202541351, "rewards/margins_std": 0.1715453863143921, "rewards/rejected": 0.10415863990783691, "step": 1400 }, { "epoch": 0.37, "eval_dpo_losses": 0.6471304297447205, "eval_logits/chosen": -2.65718150138855, "eval_logits/rejected": -2.621112108230591, "eval_logps/chosen": -263.3833312988281, "eval_logps/rejected": -247.75094604492188, "eval_loss": 0.7005280256271362, "eval_positive_losses": 0.31428229808807373, "eval_rewards/accuracies": 0.703000009059906, "eval_rewards/chosen": 0.2121007740497589, "eval_rewards/margins": 0.10382122546434402, "eval_rewards/margins_max": 0.3986285924911499, "eval_rewards/margins_min": -0.1530025601387024, "eval_rewards/margins_std": 0.1837586909532547, "eval_rewards/rejected": 0.10827956348657608, "eval_runtime": 427.9161, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.292, "step": 1400 }, { "dpo_losses": 0.6532198190689087, "epoch": 0.37, "grad_norm": 1.9163321603977244, "learning_rate": 3.977583174498816e-06, "logits/chosen": -2.616386890411377, "logits/rejected": -2.594870090484619, "logps/chosen": -212.79452514648438, "logps/rejected": -201.47752380371094, "loss": 0.6752, "positive_losses": 0.1494629830121994, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19361071288585663, "rewards/margins": 0.08965893089771271, "rewards/margins_max": 0.2625807225704193, "rewards/margins_min": -0.09533650428056717, "rewards/margins_std": 0.16135409474372864, "rewards/rejected": 0.10395178943872452, "step": 1410 }, { "dpo_losses": 0.6550511121749878, "epoch": 0.37, "grad_norm": 9.910783277980881, "learning_rate": 3.959094190750172e-06, "logits/chosen": -2.6816649436950684, "logits/rejected": -2.6005473136901855, "logps/chosen": -220.52005004882812, "logps/rejected": -170.9810791015625, "loss": 0.6839, "positive_losses": 0.1656380593776703, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20555934309959412, "rewards/margins": 0.0838155522942543, "rewards/margins_max": 0.25206801295280457, "rewards/margins_min": -0.0647808387875557, "rewards/margins_std": 0.14257046580314636, "rewards/rejected": 0.1217438131570816, "step": 1420 }, { "dpo_losses": 0.65045166015625, "epoch": 0.37, "grad_norm": 2.470307932975548, "learning_rate": 3.9404833730564975e-06, "logits/chosen": -2.6928744316101074, "logits/rejected": -2.687532663345337, "logps/chosen": -249.73452758789062, "logps/rejected": -229.1648406982422, "loss": 0.6868, "positive_losses": 0.3897302746772766, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18598373234272003, "rewards/margins": 0.09566141664981842, "rewards/margins_max": 0.2690790593624115, "rewards/margins_min": -0.0811094120144844, "rewards/margins_std": 0.15660539269447327, "rewards/rejected": 0.09032230079174042, "step": 1430 }, { "dpo_losses": 0.6608055830001831, "epoch": 0.38, "grad_norm": 6.588009618051499, "learning_rate": 3.921752275415712e-06, "logits/chosen": -2.652783155441284, "logits/rejected": -2.6326842308044434, "logps/chosen": -299.107421875, "logps/rejected": -297.75653076171875, "loss": 0.6803, "positive_losses": 0.592887818813324, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1909155696630478, "rewards/margins": 0.07359861582517624, "rewards/margins_max": 0.25505146384239197, "rewards/margins_min": -0.11408748477697372, "rewards/margins_std": 0.1656920462846756, "rewards/rejected": 0.11731694638729095, "step": 1440 }, { "dpo_losses": 0.6433164477348328, "epoch": 0.38, "grad_norm": 8.355582886960034, "learning_rate": 3.902902461869079e-06, "logits/chosen": -2.6364712715148926, "logits/rejected": -2.578061103820801, "logps/chosen": -302.90423583984375, "logps/rejected": -246.289306640625, "loss": 0.681, "positive_losses": 0.42677387595176697, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2319517880678177, "rewards/margins": 0.11193177849054337, "rewards/margins_max": 0.332444965839386, "rewards/margins_min": -0.07925325632095337, "rewards/margins_std": 0.18473513424396515, "rewards/rejected": 0.12001999467611313, "step": 1450 }, { "dpo_losses": 0.6398014426231384, "epoch": 0.38, "grad_norm": 1.871566171277863, "learning_rate": 3.883935506370605e-06, "logits/chosen": -2.661167621612549, "logits/rejected": -2.5897514820098877, "logps/chosen": -277.9273376464844, "logps/rejected": -250.48477172851562, "loss": 0.651, "positive_losses": 0.045377541333436966, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22294461727142334, "rewards/margins": 0.1170610636472702, "rewards/margins_max": 0.2914854884147644, "rewards/margins_min": -0.059104692190885544, "rewards/margins_std": 0.1563470959663391, "rewards/rejected": 0.10588352382183075, "step": 1460 }, { "dpo_losses": 0.6480587720870972, "epoch": 0.38, "grad_norm": 10.342228250269091, "learning_rate": 3.864852992655617e-06, "logits/chosen": -2.708836317062378, "logits/rejected": -2.6821327209472656, "logps/chosen": -267.07965087890625, "logps/rejected": -222.532958984375, "loss": 0.6606, "positive_losses": 0.34216421842575073, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.20954665541648865, "rewards/margins": 0.10420210659503937, "rewards/margins_max": 0.3442058563232422, "rewards/margins_min": -0.10880078375339508, "rewards/margins_std": 0.20500075817108154, "rewards/rejected": 0.10534457117319107, "step": 1470 }, { "dpo_losses": 0.6472574472427368, "epoch": 0.39, "grad_norm": 1.967146871691099, "learning_rate": 3.845656514108516e-06, "logits/chosen": -2.6916420459747314, "logits/rejected": -2.6357762813568115, "logps/chosen": -236.6306610107422, "logps/rejected": -241.1412353515625, "loss": 0.6832, "positive_losses": 0.531735897064209, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18670131266117096, "rewards/margins": 0.10241687297821045, "rewards/margins_max": 0.28860098123550415, "rewards/margins_min": -0.07669075578451157, "rewards/margins_std": 0.16355423629283905, "rewards/rejected": 0.08428442478179932, "step": 1480 }, { "dpo_losses": 0.6345597505569458, "epoch": 0.39, "grad_norm": 9.877404415176946, "learning_rate": 3.826347673629738e-06, "logits/chosen": -2.63944411277771, "logits/rejected": -2.6056394577026367, "logps/chosen": -269.664794921875, "logps/rejected": -252.0602569580078, "loss": 0.7134, "positive_losses": 0.5844457745552063, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2085011899471283, "rewards/margins": 0.13039520382881165, "rewards/margins_max": 0.3248598575592041, "rewards/margins_min": -0.060199182480573654, "rewards/margins_std": 0.17145588994026184, "rewards/rejected": 0.07810600847005844, "step": 1490 }, { "dpo_losses": 0.6581373810768127, "epoch": 0.39, "grad_norm": 1.655496173827722, "learning_rate": 3.8069280835019062e-06, "logits/chosen": -2.6964640617370605, "logits/rejected": -2.643976926803589, "logps/chosen": -222.6151885986328, "logps/rejected": -192.56661987304688, "loss": 0.6711, "positive_losses": 0.15156669914722443, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1984512060880661, "rewards/margins": 0.0814533680677414, "rewards/margins_max": 0.32450541853904724, "rewards/margins_min": -0.09028832614421844, "rewards/margins_std": 0.18675634264945984, "rewards/rejected": 0.1169978603720665, "step": 1500 }, { "epoch": 0.39, "eval_dpo_losses": 0.64888596534729, "eval_logits/chosen": -2.6356375217437744, "eval_logits/rejected": -2.5982918739318848, "eval_logps/chosen": -262.6917419433594, "eval_logps/rejected": -246.61277770996094, "eval_loss": 0.6917663216590881, "eval_positive_losses": 0.22132979333400726, "eval_rewards/accuracies": 0.7039999961853027, "eval_rewards/chosen": 0.21901659667491913, "eval_rewards/margins": 0.09935507923364639, "eval_rewards/margins_max": 0.3825626075267792, "eval_rewards/margins_min": -0.14513574540615082, "eval_rewards/margins_std": 0.1760016828775406, "eval_rewards/rejected": 0.11966153234243393, "eval_runtime": 428.4872, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 1500 }, { "dpo_losses": 0.6641441583633423, "epoch": 0.4, "grad_norm": 2.2737252489182502, "learning_rate": 3.7873993652552077e-06, "logits/chosen": -2.673140287399292, "logits/rejected": -2.6399130821228027, "logps/chosen": -282.4440002441406, "logps/rejected": -251.7005157470703, "loss": 0.6808, "positive_losses": 0.2769942283630371, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.208525612950325, "rewards/margins": 0.06747975200414658, "rewards/margins_max": 0.2461291253566742, "rewards/margins_min": -0.11353076994419098, "rewards/margins_std": 0.16327622532844543, "rewards/rejected": 0.14104586839675903, "step": 1510 }, { "dpo_losses": 0.6597987413406372, "epoch": 0.4, "grad_norm": 2.0541448420460746, "learning_rate": 3.7677631495319953e-06, "logits/chosen": -2.6400909423828125, "logits/rejected": -2.607898235321045, "logps/chosen": -241.0442657470703, "logps/rejected": -222.66433715820312, "loss": 0.6816, "positive_losses": 0.32232433557510376, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19642217457294464, "rewards/margins": 0.07600647211074829, "rewards/margins_max": 0.25835585594177246, "rewards/margins_min": -0.11334244161844254, "rewards/margins_std": 0.1659018099308014, "rewards/rejected": 0.12041568756103516, "step": 1520 }, { "dpo_losses": 0.6496556997299194, "epoch": 0.4, "grad_norm": 1.6912819197772817, "learning_rate": 3.748021075950633e-06, "logits/chosen": -2.6623635292053223, "logits/rejected": -2.6502346992492676, "logps/chosen": -257.75347900390625, "logps/rejected": -247.4079132080078, "loss": 0.6791, "positive_losses": 0.21904030442237854, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.21671731770038605, "rewards/margins": 0.09616968035697937, "rewards/margins_max": 0.26715341210365295, "rewards/margins_min": -0.07714874297380447, "rewards/margins_std": 0.1532919853925705, "rewards/rejected": 0.12054765224456787, "step": 1530 }, { "dpo_losses": 0.6511906385421753, "epoch": 0.4, "grad_norm": 16.54138726575801, "learning_rate": 3.7281747929685824e-06, "logits/chosen": -2.678647041320801, "logits/rejected": -2.6122212409973145, "logps/chosen": -326.34210205078125, "logps/rejected": -292.89617919921875, "loss": 0.6907, "positive_losses": 0.1314670592546463, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24261975288391113, "rewards/margins": 0.09828249365091324, "rewards/margins_max": 0.30110496282577515, "rewards/margins_min": -0.09005574136972427, "rewards/margins_std": 0.1740257441997528, "rewards/rejected": 0.1443372666835785, "step": 1540 }, { "dpo_losses": 0.6388503909111023, "epoch": 0.41, "grad_norm": 7.9614962006478205, "learning_rate": 3.7082259577447604e-06, "logits/chosen": -2.6226963996887207, "logits/rejected": -2.6585097312927246, "logps/chosen": -260.5569152832031, "logps/rejected": -260.112548828125, "loss": 0.6693, "positive_losses": 0.0032606124877929688, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24550947546958923, "rewards/margins": 0.12715184688568115, "rewards/margins_max": 0.3580287992954254, "rewards/margins_min": -0.05972598120570183, "rewards/margins_std": 0.18930241465568542, "rewards/rejected": 0.11835767328739166, "step": 1550 }, { "dpo_losses": 0.645304799079895, "epoch": 0.41, "grad_norm": 2.1291018894840246, "learning_rate": 3.6881762360011688e-06, "logits/chosen": -2.679312229156494, "logits/rejected": -2.6295382976531982, "logps/chosen": -253.4844207763672, "logps/rejected": -228.0343475341797, "loss": 0.694, "positive_losses": 0.2775608003139496, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.25411921739578247, "rewards/margins": 0.11943242698907852, "rewards/margins_max": 0.4344254434108734, "rewards/margins_min": -0.09446341544389725, "rewards/margins_std": 0.23939839005470276, "rewards/rejected": 0.13468676805496216, "step": 1560 }, { "dpo_losses": 0.6481470465660095, "epoch": 0.41, "grad_norm": 19.61157856144872, "learning_rate": 3.668027301883802e-06, "logits/chosen": -2.7096943855285645, "logits/rejected": -2.7046380043029785, "logps/chosen": -293.19866943359375, "logps/rejected": -256.16204833984375, "loss": 0.6653, "positive_losses": 0.23131971061229706, "rewards/accuracies": 0.75, "rewards/chosen": 0.21611352264881134, "rewards/margins": 0.10061807930469513, "rewards/margins_max": 0.28255659341812134, "rewards/margins_min": -0.09664733707904816, "rewards/margins_std": 0.1659117192029953, "rewards/rejected": 0.1154954582452774, "step": 1570 }, { "dpo_losses": 0.6671686172485352, "epoch": 0.41, "grad_norm": 8.99614463457168, "learning_rate": 3.64778083782286e-06, "logits/chosen": -2.6565983295440674, "logits/rejected": -2.6668851375579834, "logps/chosen": -275.6929626464844, "logps/rejected": -244.76864624023438, "loss": 0.6833, "positive_losses": 0.2777779698371887, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19542255997657776, "rewards/margins": 0.06095566228032112, "rewards/margins_max": 0.23151791095733643, "rewards/margins_min": -0.08547428995370865, "rewards/margins_std": 0.14268240332603455, "rewards/rejected": 0.13446690142154694, "step": 1580 }, { "dpo_losses": 0.6683533191680908, "epoch": 0.42, "grad_norm": 14.015471492679415, "learning_rate": 3.627438534392268e-06, "logits/chosen": -2.6828346252441406, "logits/rejected": -2.7137646675109863, "logps/chosen": -266.16461181640625, "logps/rejected": -254.8044891357422, "loss": 0.6805, "positive_losses": 0.3196195662021637, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.18736909329891205, "rewards/margins": 0.06001616641879082, "rewards/margins_max": 0.27491524815559387, "rewards/margins_min": -0.14767390489578247, "rewards/margins_std": 0.1888178288936615, "rewards/rejected": 0.12735293805599213, "step": 1590 }, { "dpo_losses": 0.6401321291923523, "epoch": 0.42, "grad_norm": 1.5944175818008404, "learning_rate": 3.607002090168506e-06, "logits/chosen": -2.6533164978027344, "logits/rejected": -2.6130764484405518, "logps/chosen": -241.86148071289062, "logps/rejected": -215.41580200195312, "loss": 0.7428, "positive_losses": 0.21368694305419922, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.23043449223041534, "rewards/margins": 0.11721567064523697, "rewards/margins_max": 0.3013015687465668, "rewards/margins_min": -0.0614323690533638, "rewards/margins_std": 0.16052599251270294, "rewards/rejected": 0.11321882903575897, "step": 1600 }, { "epoch": 0.42, "eval_dpo_losses": 0.6501025557518005, "eval_logits/chosen": -2.632763624191284, "eval_logits/rejected": -2.597917079925537, "eval_logps/chosen": -262.6611022949219, "eval_logps/rejected": -246.29571533203125, "eval_loss": 0.6866611242294312, "eval_positive_losses": 0.16520653665065765, "eval_rewards/accuracies": 0.7009999752044678, "eval_rewards/chosen": 0.21932312846183777, "eval_rewards/margins": 0.09649096429347992, "eval_rewards/margins_max": 0.37302011251449585, "eval_rewards/margins_min": -0.14475053548812866, "eval_rewards/margins_std": 0.17299990355968475, "eval_rewards/rejected": 0.12283217161893845, "eval_runtime": 428.1614, "eval_samples_per_second": 4.671, "eval_steps_per_second": 0.292, "step": 1600 }, { "dpo_losses": 0.6536516547203064, "epoch": 0.42, "grad_norm": 1.878714442624115, "learning_rate": 3.586473211588787e-06, "logits/chosen": -2.6485228538513184, "logits/rejected": -2.6389007568359375, "logps/chosen": -265.40814208984375, "logps/rejected": -233.7576904296875, "loss": 0.6639, "positive_losses": 0.14214439690113068, "rewards/accuracies": 0.6875, "rewards/chosen": 0.207864448428154, "rewards/margins": 0.09171368181705475, "rewards/margins_max": 0.3362428545951843, "rewards/margins_min": -0.09460137039422989, "rewards/margins_std": 0.19438976049423218, "rewards/rejected": 0.11615077406167984, "step": 1610 }, { "dpo_losses": 0.6428729891777039, "epoch": 0.42, "grad_norm": 2.2033140493675387, "learning_rate": 3.5658536128085623e-06, "logits/chosen": -2.584670305252075, "logits/rejected": -2.5773091316223145, "logps/chosen": -260.7598876953125, "logps/rejected": -268.07696533203125, "loss": 0.6476, "positive_losses": 0.1323743760585785, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2739534378051758, "rewards/margins": 0.11782930046319962, "rewards/margins_max": 0.36470386385917664, "rewards/margins_min": -0.07272644340991974, "rewards/margins_std": 0.1928330510854721, "rewards/rejected": 0.15612414479255676, "step": 1620 }, { "dpo_losses": 0.6407750844955444, "epoch": 0.43, "grad_norm": 6.196434583225216, "learning_rate": 3.545145015558399e-06, "logits/chosen": -2.6906514167785645, "logits/rejected": -2.6470489501953125, "logps/chosen": -269.22802734375, "logps/rejected": -256.2124938964844, "loss": 0.6687, "positive_losses": 0.28842735290527344, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2171546220779419, "rewards/margins": 0.11523237079381943, "rewards/margins_max": 0.2809963822364807, "rewards/margins_min": -0.055764008313417435, "rewards/margins_std": 0.15428276360034943, "rewards/rejected": 0.10192225128412247, "step": 1630 }, { "dpo_losses": 0.6404193043708801, "epoch": 0.43, "grad_norm": 2.1957397045270692, "learning_rate": 3.5243491490002056e-06, "logits/chosen": -2.7068564891815186, "logits/rejected": -2.6719727516174316, "logps/chosen": -272.58416748046875, "logps/rejected": -227.26211547851562, "loss": 0.6776, "positive_losses": 0.1865355521440506, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2290363609790802, "rewards/margins": 0.1175479143857956, "rewards/margins_max": 0.3120426833629608, "rewards/margins_min": -0.06566186249256134, "rewards/margins_std": 0.1665848195552826, "rewards/rejected": 0.1114884465932846, "step": 1640 }, { "dpo_losses": 0.6327579617500305, "epoch": 0.43, "grad_norm": 2.3248961927968943, "learning_rate": 3.503467749582857e-06, "logits/chosen": -2.653707981109619, "logits/rejected": -2.5981452465057373, "logps/chosen": -308.4657897949219, "logps/rejected": -257.87567138671875, "loss": 0.6781, "positive_losses": 0.5005988478660583, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.22237035632133484, "rewards/margins": 0.13833247125148773, "rewards/margins_max": 0.3445996940135956, "rewards/margins_min": -0.08490337431430817, "rewards/margins_std": 0.19539986550807953, "rewards/rejected": 0.0840378999710083, "step": 1650 }, { "dpo_losses": 0.6430984139442444, "epoch": 0.43, "grad_norm": 5.0279293119427955, "learning_rate": 3.4825025608971947e-06, "logits/chosen": -2.6000447273254395, "logits/rejected": -2.57517671585083, "logps/chosen": -235.9827117919922, "logps/rejected": -258.79815673828125, "loss": 0.6541, "positive_losses": 0.03592414781451225, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2132348269224167, "rewards/margins": 0.1130198985338211, "rewards/margins_max": 0.3298777937889099, "rewards/margins_min": -0.07961928844451904, "rewards/margins_std": 0.17789717018604279, "rewards/rejected": 0.10021491348743439, "step": 1660 }, { "dpo_losses": 0.6461843848228455, "epoch": 0.44, "grad_norm": 1.7008980006114085, "learning_rate": 3.4614553335304407e-06, "logits/chosen": -2.6518378257751465, "logits/rejected": -2.6103954315185547, "logps/chosen": -238.67007446289062, "logps/rejected": -220.99642944335938, "loss": 0.6762, "positive_losses": 0.42069491744041443, "rewards/accuracies": 0.75, "rewards/chosen": 0.23298203945159912, "rewards/margins": 0.10479624569416046, "rewards/margins_max": 0.28432926535606384, "rewards/margins_min": -0.09235044568777084, "rewards/margins_std": 0.17162147164344788, "rewards/rejected": 0.12818579375743866, "step": 1670 }, { "dpo_losses": 0.6569772362709045, "epoch": 0.44, "grad_norm": 14.248715526847992, "learning_rate": 3.4403278249200222e-06, "logits/chosen": -2.6170597076416016, "logits/rejected": -2.594456672668457, "logps/chosen": -244.982421875, "logps/rejected": -260.5910339355469, "loss": 0.6837, "positive_losses": 0.6187906265258789, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20160908997058868, "rewards/margins": 0.08317351341247559, "rewards/margins_max": 0.2904582917690277, "rewards/margins_min": -0.11839810758829117, "rewards/margins_std": 0.1846894472837448, "rewards/rejected": 0.1184355840086937, "step": 1680 }, { "dpo_losses": 0.6429563164710999, "epoch": 0.44, "grad_norm": 7.834677506500148, "learning_rate": 3.4191217992068293e-06, "logits/chosen": -2.5974044799804688, "logits/rejected": -2.604295015335083, "logps/chosen": -232.93344116210938, "logps/rejected": -248.7979736328125, "loss": 0.6826, "positive_losses": 0.513172447681427, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.20246489346027374, "rewards/margins": 0.11489985883235931, "rewards/margins_max": 0.3566764295101166, "rewards/margins_min": -0.07884009182453156, "rewards/margins_std": 0.19784900546073914, "rewards/rejected": 0.08756502717733383, "step": 1690 }, { "dpo_losses": 0.6462850570678711, "epoch": 0.44, "grad_norm": 13.119844132404689, "learning_rate": 3.3978390270879056e-06, "logits/chosen": -2.7017605304718018, "logits/rejected": -2.7095248699188232, "logps/chosen": -226.1155242919922, "logps/rejected": -251.32919311523438, "loss": 0.6593, "positive_losses": 0.1556541472673416, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20098766684532166, "rewards/margins": 0.1058509349822998, "rewards/margins_max": 0.2990228533744812, "rewards/margins_min": -0.09451510012149811, "rewards/margins_std": 0.17923924326896667, "rewards/rejected": 0.09513673931360245, "step": 1700 }, { "epoch": 0.44, "eval_dpo_losses": 0.6466652154922485, "eval_logits/chosen": -2.6614034175872803, "eval_logits/rejected": -2.6261913776397705, "eval_logps/chosen": -262.3858947753906, "eval_logps/rejected": -246.8526153564453, "eval_loss": 0.6785080432891846, "eval_positive_losses": 0.22281299531459808, "eval_rewards/accuracies": 0.7110000252723694, "eval_rewards/chosen": 0.22207526862621307, "eval_rewards/margins": 0.10481205582618713, "eval_rewards/margins_max": 0.39778730273246765, "eval_rewards/margins_min": -0.1483326256275177, "eval_rewards/margins_std": 0.18249405920505524, "eval_rewards/rejected": 0.11726321280002594, "eval_runtime": 427.9753, "eval_samples_per_second": 4.673, "eval_steps_per_second": 0.292, "step": 1700 }, { "dpo_losses": 0.6631157994270325, "epoch": 0.45, "grad_norm": 13.091006480923983, "learning_rate": 3.3764812856685995e-06, "logits/chosen": -2.752492904663086, "logits/rejected": -2.7280375957489014, "logps/chosen": -243.3994903564453, "logps/rejected": -230.54507446289062, "loss": 0.6786, "positive_losses": 0.2162889540195465, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22619624435901642, "rewards/margins": 0.07061522454023361, "rewards/margins_max": 0.2632225751876831, "rewards/margins_min": -0.11669723689556122, "rewards/margins_std": 0.16627001762390137, "rewards/rejected": 0.1555810272693634, "step": 1710 }, { "dpo_losses": 0.6593276858329773, "epoch": 0.45, "grad_norm": 2.1784248660099768, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -2.6774322986602783, "logits/rejected": -2.670872211456299, "logps/chosen": -237.01089477539062, "logps/rejected": -249.7379913330078, "loss": 0.6847, "positive_losses": 0.5006786584854126, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2009124457836151, "rewards/margins": 0.07849615067243576, "rewards/margins_max": 0.31795603036880493, "rewards/margins_min": -0.1156822070479393, "rewards/margins_std": 0.19167271256446838, "rewards/rejected": 0.12241628021001816, "step": 1720 }, { "dpo_losses": 0.6505664587020874, "epoch": 0.45, "grad_norm": 13.791308134245675, "learning_rate": 3.3335480345008907e-06, "logits/chosen": -2.6758742332458496, "logits/rejected": -2.640214443206787, "logps/chosen": -251.5399169921875, "logps/rejected": -258.8959045410156, "loss": 0.7027, "positive_losses": 0.018910503014922142, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1981344223022461, "rewards/margins": 0.0945214182138443, "rewards/margins_max": 0.2809906601905823, "rewards/margins_min": -0.058115411549806595, "rewards/margins_std": 0.15180954337120056, "rewards/rejected": 0.10361298173666, "step": 1730 }, { "dpo_losses": 0.6440737247467041, "epoch": 0.46, "grad_norm": 17.506469402688413, "learning_rate": 3.3119761096666055e-06, "logits/chosen": -2.607551097869873, "logits/rejected": -2.5610995292663574, "logps/chosen": -246.9075469970703, "logps/rejected": -207.48495483398438, "loss": 0.6828, "positive_losses": 0.12918797135353088, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22817876935005188, "rewards/margins": 0.11010940372943878, "rewards/margins_max": 0.3282391130924225, "rewards/margins_min": -0.06957204639911652, "rewards/margins_std": 0.1807454377412796, "rewards/rejected": 0.1180693507194519, "step": 1740 }, { "dpo_losses": 0.6257420182228088, "epoch": 0.46, "grad_norm": 5.253192475910705, "learning_rate": 3.290336385060832e-06, "logits/chosen": -2.7200748920440674, "logits/rejected": -2.7011024951934814, "logps/chosen": -293.35955810546875, "logps/rejected": -290.0611877441406, "loss": 0.6538, "positive_losses": 0.06569008529186249, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23800742626190186, "rewards/margins": 0.15084929764270782, "rewards/margins_max": 0.3628466725349426, "rewards/margins_min": -0.0599190779030323, "rewards/margins_std": 0.18702241778373718, "rewards/rejected": 0.08715813606977463, "step": 1750 }, { "dpo_losses": 0.6414699554443359, "epoch": 0.46, "grad_norm": 2.0931786284207314, "learning_rate": 3.268630667594348e-06, "logits/chosen": -2.6284737586975098, "logits/rejected": -2.5958545207977295, "logps/chosen": -246.00051879882812, "logps/rejected": -238.9696807861328, "loss": 0.6537, "positive_losses": 0.22344326972961426, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23227672278881073, "rewards/margins": 0.11926314979791641, "rewards/margins_max": 0.36400899291038513, "rewards/margins_min": -0.09165000170469284, "rewards/margins_std": 0.2008214294910431, "rewards/rejected": 0.11301358044147491, "step": 1760 }, { "dpo_losses": 0.6561053991317749, "epoch": 0.46, "grad_norm": 12.468139939453867, "learning_rate": 3.2468607696883147e-06, "logits/chosen": -2.6870360374450684, "logits/rejected": -2.665567398071289, "logps/chosen": -256.4646301269531, "logps/rejected": -250.0025177001953, "loss": 0.698, "positive_losses": 0.25985726714134216, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.218532532453537, "rewards/margins": 0.08526696264743805, "rewards/margins_max": 0.265904039144516, "rewards/margins_min": -0.08091627061367035, "rewards/margins_std": 0.15936391055583954, "rewards/rejected": 0.13326558470726013, "step": 1770 }, { "dpo_losses": 0.6368280649185181, "epoch": 0.47, "grad_norm": 11.775675540020119, "learning_rate": 3.225028509122944e-06, "logits/chosen": -2.6872940063476562, "logits/rejected": -2.5880813598632812, "logps/chosen": -300.2762756347656, "logps/rejected": -226.71914672851562, "loss": 0.6672, "positive_losses": 0.22082766890525818, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.24134385585784912, "rewards/margins": 0.12595918774604797, "rewards/margins_max": 0.31853508949279785, "rewards/margins_min": -0.07879041135311127, "rewards/margins_std": 0.17602954804897308, "rewards/rejected": 0.11538468301296234, "step": 1780 }, { "dpo_losses": 0.6583558320999146, "epoch": 0.47, "grad_norm": 2.0654924820537137, "learning_rate": 3.2031357088857083e-06, "logits/chosen": -2.714325189590454, "logits/rejected": -2.6391549110412598, "logps/chosen": -302.1825866699219, "logps/rejected": -247.6293487548828, "loss": 0.6977, "positive_losses": 0.5240899920463562, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2267806977033615, "rewards/margins": 0.08057653903961182, "rewards/margins_max": 0.28991439938545227, "rewards/margins_min": -0.10471781343221664, "rewards/margins_std": 0.17987985908985138, "rewards/rejected": 0.1462041437625885, "step": 1790 }, { "dpo_losses": 0.6564275622367859, "epoch": 0.47, "grad_norm": 2.227088625134092, "learning_rate": 3.181184197019127e-06, "logits/chosen": -2.657984972000122, "logits/rejected": -2.5912022590637207, "logps/chosen": -291.07904052734375, "logps/rejected": -244.7218780517578, "loss": 0.6856, "positive_losses": 0.2376052886247635, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.20932650566101074, "rewards/margins": 0.08089903742074966, "rewards/margins_max": 0.23547455668449402, "rewards/margins_min": -0.06425313651561737, "rewards/margins_std": 0.13408055901527405, "rewards/rejected": 0.12842747569084167, "step": 1800 }, { "epoch": 0.47, "eval_dpo_losses": 0.6504243612289429, "eval_logits/chosen": -2.6325595378875732, "eval_logits/rejected": -2.5972495079040527, "eval_logps/chosen": -261.41424560546875, "eval_logps/rejected": -245.0161590576172, "eval_loss": 0.6702442169189453, "eval_positive_losses": 0.13426439464092255, "eval_rewards/accuracies": 0.6980000138282776, "eval_rewards/chosen": 0.2317916303873062, "eval_rewards/margins": 0.09616386890411377, "eval_rewards/margins_max": 0.37600037455558777, "eval_rewards/margins_min": -0.14539237320423126, "eval_rewards/margins_std": 0.17481529712677002, "eval_rewards/rejected": 0.13562773168087006, "eval_runtime": 428.5251, "eval_samples_per_second": 4.667, "eval_steps_per_second": 0.292, "step": 1800 }, { "dpo_losses": 0.648065447807312, "epoch": 0.47, "grad_norm": 1.9905249745676243, "learning_rate": 3.159175806468126e-06, "logits/chosen": -2.6667561531066895, "logits/rejected": -2.604173183441162, "logps/chosen": -298.1353454589844, "logps/rejected": -280.3143615722656, "loss": 0.6757, "positive_losses": 0.18467631936073303, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22995993494987488, "rewards/margins": 0.10116241872310638, "rewards/margins_max": 0.3034105896949768, "rewards/margins_min": -0.09748566150665283, "rewards/margins_std": 0.17770439386367798, "rewards/rejected": 0.1287975013256073, "step": 1810 }, { "dpo_losses": 0.6494191884994507, "epoch": 0.48, "grad_norm": 8.822187145087618, "learning_rate": 3.1371123749269804e-06, "logits/chosen": -2.6063125133514404, "logits/rejected": -2.578479290008545, "logps/chosen": -256.0450439453125, "logps/rejected": -266.94189453125, "loss": 0.6798, "positive_losses": 0.39024466276168823, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.24025802314281464, "rewards/margins": 0.10246167331933975, "rewards/margins_max": 0.33869725465774536, "rewards/margins_min": -0.07729412615299225, "rewards/margins_std": 0.1838615983724594, "rewards/rejected": 0.1377963423728943, "step": 1820 }, { "dpo_losses": 0.6494563817977905, "epoch": 0.48, "grad_norm": 1.8413912021959202, "learning_rate": 3.114995744685877e-06, "logits/chosen": -2.646260976791382, "logits/rejected": -2.6059529781341553, "logps/chosen": -284.9770812988281, "logps/rejected": -316.1197204589844, "loss": 0.6638, "positive_losses": 0.3104667663574219, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24419507384300232, "rewards/margins": 0.10210222005844116, "rewards/margins_max": 0.34120216965675354, "rewards/margins_min": -0.10375956445932388, "rewards/margins_std": 0.20285239815711975, "rewards/rejected": 0.14209285378456116, "step": 1830 }, { "dpo_losses": 0.647398829460144, "epoch": 0.48, "grad_norm": 12.69858491478049, "learning_rate": 3.0928277624770743e-06, "logits/chosen": -2.650028705596924, "logits/rejected": -2.5910632610321045, "logps/chosen": -240.1370391845703, "logps/rejected": -220.29757690429688, "loss": 0.6698, "positive_losses": 0.24458742141723633, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21732616424560547, "rewards/margins": 0.10281439870595932, "rewards/margins_max": 0.3056146502494812, "rewards/margins_min": -0.07628180831670761, "rewards/margins_std": 0.17483511567115784, "rewards/rejected": 0.11451175063848495, "step": 1840 }, { "dpo_losses": 0.6395186185836792, "epoch": 0.48, "grad_norm": 2.0071692716628116, "learning_rate": 3.070610279320708e-06, "logits/chosen": -2.639828681945801, "logits/rejected": -2.5725197792053223, "logps/chosen": -242.0826873779297, "logps/rejected": -234.2345733642578, "loss": 0.6526, "positive_losses": 0.12014941871166229, "rewards/accuracies": 0.75, "rewards/chosen": 0.20422932505607605, "rewards/margins": 0.11995555460453033, "rewards/margins_max": 0.28611624240875244, "rewards/margins_min": -0.06965469568967819, "rewards/margins_std": 0.1571904718875885, "rewards/rejected": 0.08427377045154572, "step": 1850 }, { "dpo_losses": 0.6299269795417786, "epoch": 0.49, "grad_norm": 11.192479224306089, "learning_rate": 3.0483451503702264e-06, "logits/chosen": -2.6060972213745117, "logits/rejected": -2.5461373329162598, "logps/chosen": -283.3192443847656, "logps/rejected": -275.7360534667969, "loss": 0.6694, "positive_losses": 0.40090227127075195, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.23160043358802795, "rewards/margins": 0.15093091130256653, "rewards/margins_max": 0.452861487865448, "rewards/margins_min": -0.10485140979290009, "rewards/margins_std": 0.247911736369133, "rewards/rejected": 0.08066950738430023, "step": 1860 }, { "dpo_losses": 0.6297956705093384, "epoch": 0.49, "grad_norm": 1.9491342897024369, "learning_rate": 3.0260342347574916e-06, "logits/chosen": -2.612184524536133, "logits/rejected": -2.5798373222351074, "logps/chosen": -266.5758972167969, "logps/rejected": -257.8975830078125, "loss": 0.638, "positive_losses": 0.0, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.22928202152252197, "rewards/margins": 0.1414060890674591, "rewards/margins_max": 0.3423546850681305, "rewards/margins_min": -0.05438286066055298, "rewards/margins_std": 0.1759631633758545, "rewards/rejected": 0.08787593990564346, "step": 1870 }, { "dpo_losses": 0.633985698223114, "epoch": 0.49, "grad_norm": 25.29399641975962, "learning_rate": 3.0036793954375358e-06, "logits/chosen": -2.6048264503479004, "logits/rejected": -2.570802688598633, "logps/chosen": -242.4512176513672, "logps/rejected": -228.9194793701172, "loss": 0.6871, "positive_losses": 0.2523138225078583, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2291102111339569, "rewards/margins": 0.13221105933189392, "rewards/margins_max": 0.33114004135131836, "rewards/margins_min": -0.06668253242969513, "rewards/margins_std": 0.18370743095874786, "rewards/rejected": 0.09689915925264359, "step": 1880 }, { "dpo_losses": 0.6468071341514587, "epoch": 0.49, "grad_norm": 2.190082986905398, "learning_rate": 2.981282499033009e-06, "logits/chosen": -2.689648151397705, "logits/rejected": -2.6369049549102783, "logps/chosen": -289.0721740722656, "logps/rejected": -275.77740478515625, "loss": 0.6958, "positive_losses": 0.37820395827293396, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2109154462814331, "rewards/margins": 0.10500963032245636, "rewards/margins_max": 0.3085101246833801, "rewards/margins_min": -0.08139508962631226, "rewards/margins_std": 0.17975106835365295, "rewards/rejected": 0.10590583086013794, "step": 1890 }, { "dpo_losses": 0.6324842572212219, "epoch": 0.5, "grad_norm": 1.733292043939962, "learning_rate": 2.9588454156783163e-06, "logits/chosen": -2.688908338546753, "logits/rejected": -2.6539764404296875, "logps/chosen": -274.11370849609375, "logps/rejected": -247.70663452148438, "loss": 0.6552, "positive_losses": 0.31979283690452576, "rewards/accuracies": 0.75, "rewards/chosen": 0.22242672741413116, "rewards/margins": 0.1341570019721985, "rewards/margins_max": 0.3386608958244324, "rewards/margins_min": -0.04844938963651657, "rewards/margins_std": 0.1713506430387497, "rewards/rejected": 0.08826972544193268, "step": 1900 }, { "epoch": 0.5, "eval_dpo_losses": 0.6483587622642517, "eval_logits/chosen": -2.6118340492248535, "eval_logits/rejected": -2.5760679244995117, "eval_logps/chosen": -261.8096008300781, "eval_logps/rejected": -245.90634155273438, "eval_loss": 0.6742714643478394, "eval_positive_losses": 0.18548505008220673, "eval_rewards/accuracies": 0.6990000009536743, "eval_rewards/chosen": 0.22783830761909485, "eval_rewards/margins": 0.10111244767904282, "eval_rewards/margins_max": 0.39197680354118347, "eval_rewards/margins_min": -0.14937348663806915, "eval_rewards/margins_std": 0.1816234290599823, "eval_rewards/rejected": 0.12672588229179382, "eval_runtime": 428.2063, "eval_samples_per_second": 4.671, "eval_steps_per_second": 0.292, "step": 1900 }, { "dpo_losses": 0.6267801523208618, "epoch": 0.5, "grad_norm": 15.330194456057852, "learning_rate": 2.9363700188634597e-06, "logits/chosen": -2.6190261840820312, "logits/rejected": -2.625927209854126, "logps/chosen": -255.4626007080078, "logps/rejected": -255.8193359375, "loss": 0.6525, "positive_losses": 0.085240438580513, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.26029080152511597, "rewards/margins": 0.14714768528938293, "rewards/margins_max": 0.3583988547325134, "rewards/margins_min": -0.02962956205010414, "rewards/margins_std": 0.16915565729141235, "rewards/rejected": 0.11314307153224945, "step": 1910 }, { "dpo_losses": 0.6608460545539856, "epoch": 0.5, "grad_norm": 2.001834623969936, "learning_rate": 2.9138581852776053e-06, "logits/chosen": -2.610356569290161, "logits/rejected": -2.593571662902832, "logps/chosen": -240.2068634033203, "logps/rejected": -213.23709106445312, "loss": 0.6832, "positive_losses": 0.3589244782924652, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20095577836036682, "rewards/margins": 0.07470414787530899, "rewards/margins_max": 0.26924365758895874, "rewards/margins_min": -0.11080431938171387, "rewards/margins_std": 0.16561010479927063, "rewards/rejected": 0.12625160813331604, "step": 1920 }, { "dpo_losses": 0.6245434880256653, "epoch": 0.51, "grad_norm": 11.289943061462033, "learning_rate": 2.8913117946523805e-06, "logits/chosen": -2.691692590713501, "logits/rejected": -2.6207187175750732, "logps/chosen": -280.5381164550781, "logps/rejected": -236.1483154296875, "loss": 0.6727, "positive_losses": 0.04957924038171768, "rewards/accuracies": 0.75, "rewards/chosen": 0.24948105216026306, "rewards/margins": 0.15194857120513916, "rewards/margins_max": 0.365040123462677, "rewards/margins_min": -0.04834876209497452, "rewards/margins_std": 0.18084149062633514, "rewards/rejected": 0.0975324809551239, "step": 1930 }, { "dpo_losses": 0.6420565843582153, "epoch": 0.51, "grad_norm": 22.284367111904366, "learning_rate": 2.8687327296049126e-06, "logits/chosen": -2.6661148071289062, "logits/rejected": -2.6287002563476562, "logps/chosen": -273.7840881347656, "logps/rejected": -255.37850952148438, "loss": 0.6836, "positive_losses": 0.5944596529006958, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20418283343315125, "rewards/margins": 0.11511027812957764, "rewards/margins_max": 0.30449509620666504, "rewards/margins_min": -0.08485864102840424, "rewards/margins_std": 0.1740976870059967, "rewards/rejected": 0.0890725702047348, "step": 1940 }, { "dpo_losses": 0.6480592489242554, "epoch": 0.51, "grad_norm": 1.9088918795813237, "learning_rate": 2.8461228754806376e-06, "logits/chosen": -2.661724090576172, "logits/rejected": -2.6638152599334717, "logps/chosen": -252.10205078125, "logps/rejected": -240.0064697265625, "loss": 0.6628, "positive_losses": 0.27016982436180115, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22642064094543457, "rewards/margins": 0.10169925540685654, "rewards/margins_max": 0.32218581438064575, "rewards/margins_min": -0.07501425594091415, "rewards/margins_std": 0.17813637852668762, "rewards/rejected": 0.12472137063741684, "step": 1950 }, { "dpo_losses": 0.6339899301528931, "epoch": 0.51, "grad_norm": 2.0925300801985247, "learning_rate": 2.823484120195865e-06, "logits/chosen": -2.636782169342041, "logits/rejected": -2.610898494720459, "logps/chosen": -274.0904846191406, "logps/rejected": -233.5660858154297, "loss": 0.6656, "positive_losses": 0.0016654968494549394, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2515600323677063, "rewards/margins": 0.13343092799186707, "rewards/margins_max": 0.36472180485725403, "rewards/margins_min": -0.06343535333871841, "rewards/margins_std": 0.1956871747970581, "rewards/rejected": 0.11812911927700043, "step": 1960 }, { "dpo_losses": 0.6549708843231201, "epoch": 0.52, "grad_norm": 1.8122554482530602, "learning_rate": 2.8008183540801486e-06, "logits/chosen": -2.6718621253967285, "logits/rejected": -2.6593704223632812, "logps/chosen": -234.1223602294922, "logps/rejected": -219.40792846679688, "loss": 0.6895, "positive_losses": 0.42588481307029724, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2129441797733307, "rewards/margins": 0.08714379370212555, "rewards/margins_max": 0.2743634283542633, "rewards/margins_min": -0.10051698982715607, "rewards/margins_std": 0.16739001870155334, "rewards/rejected": 0.12580038607120514, "step": 1970 }, { "dpo_losses": 0.6716464161872864, "epoch": 0.52, "grad_norm": 11.150848876388851, "learning_rate": 2.7781274697184353e-06, "logits/chosen": -2.6749327182769775, "logits/rejected": -2.625612258911133, "logps/chosen": -221.8278045654297, "logps/rejected": -226.98898315429688, "loss": 0.6794, "positive_losses": 0.11149444431066513, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.20755262672901154, "rewards/margins": 0.050573475658893585, "rewards/margins_max": 0.23641857504844666, "rewards/margins_min": -0.10350849479436874, "rewards/margins_std": 0.15354886651039124, "rewards/rejected": 0.15697914361953735, "step": 1980 }, { "dpo_losses": 0.6682985424995422, "epoch": 0.52, "grad_norm": 1.8371998444460573, "learning_rate": 2.7554133617930397e-06, "logits/chosen": -2.6378262042999268, "logits/rejected": -2.545085906982422, "logps/chosen": -248.79019165039062, "logps/rejected": -216.0754852294922, "loss": 0.6744, "positive_losses": 0.17018738389015198, "rewards/accuracies": 0.625, "rewards/chosen": 0.2032647579908371, "rewards/margins": 0.05912317708134651, "rewards/margins_max": 0.236159086227417, "rewards/margins_min": -0.11596567928791046, "rewards/margins_std": 0.1587970107793808, "rewards/rejected": 0.1441415697336197, "step": 1990 }, { "dpo_losses": 0.6558287143707275, "epoch": 0.52, "grad_norm": 5.998555645346825, "learning_rate": 2.7326779269254363e-06, "logits/chosen": -2.6352341175079346, "logits/rejected": -2.6149191856384277, "logps/chosen": -226.8389434814453, "logps/rejected": -214.03012084960938, "loss": 0.6577, "positive_losses": 0.10618214309215546, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24029576778411865, "rewards/margins": 0.08520184457302094, "rewards/margins_max": 0.26697662472724915, "rewards/margins_min": -0.11164041608572006, "rewards/margins_std": 0.17001576721668243, "rewards/rejected": 0.1550939381122589, "step": 2000 }, { "epoch": 0.52, "eval_dpo_losses": 0.6460915207862854, "eval_logits/chosen": -2.624117136001587, "eval_logits/rejected": -2.586920738220215, "eval_logps/chosen": -261.489013671875, "eval_logps/rejected": -246.10638427734375, "eval_loss": 0.6748321652412415, "eval_positive_losses": 0.20357099175453186, "eval_rewards/accuracies": 0.7089999914169312, "eval_rewards/chosen": 0.23104406893253326, "eval_rewards/margins": 0.10631891340017319, "eval_rewards/margins_max": 0.4015742838382721, "eval_rewards/margins_min": -0.152598038315773, "eval_rewards/margins_std": 0.18525823950767517, "eval_rewards/rejected": 0.12472515553236008, "eval_runtime": 428.2412, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.292, "step": 2000 }, { "dpo_losses": 0.6374010443687439, "epoch": 0.53, "grad_norm": 1.973224943165519, "learning_rate": 2.7099230635178954e-06, "logits/chosen": -2.5919179916381836, "logits/rejected": -2.601430892944336, "logps/chosen": -241.382568359375, "logps/rejected": -238.89810180664062, "loss": 0.6445, "positive_losses": 0.02119159698486328, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.23006686568260193, "rewards/margins": 0.12360439449548721, "rewards/margins_max": 0.3254317045211792, "rewards/margins_min": -0.03285626322031021, "rewards/margins_std": 0.16100876033306122, "rewards/rejected": 0.1064625009894371, "step": 2010 }, { "dpo_losses": 0.6473142504692078, "epoch": 0.53, "grad_norm": 15.41456339467445, "learning_rate": 2.6871506715949608e-06, "logits/chosen": -2.728001832962036, "logits/rejected": -2.6899797916412354, "logps/chosen": -294.582763671875, "logps/rejected": -275.78729248046875, "loss": 0.6803, "positive_losses": 0.3745996356010437, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2238301932811737, "rewards/margins": 0.10699672996997833, "rewards/margins_max": 0.36772987246513367, "rewards/margins_min": -0.10426433384418488, "rewards/margins_std": 0.2051413357257843, "rewards/rejected": 0.11683347076177597, "step": 2020 }, { "dpo_losses": 0.6469787359237671, "epoch": 0.53, "grad_norm": 2.038889559723446, "learning_rate": 2.6643626526448063e-06, "logits/chosen": -2.690023422241211, "logits/rejected": -2.6673452854156494, "logps/chosen": -250.1510772705078, "logps/rejected": -255.42233276367188, "loss": 0.6965, "positive_losses": 0.2789936661720276, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2083875685930252, "rewards/margins": 0.10868855565786362, "rewards/margins_max": 0.35883814096450806, "rewards/margins_min": -0.13523946702480316, "rewards/margins_std": 0.2176806479692459, "rewards/rejected": 0.099699005484581, "step": 2030 }, { "dpo_losses": 0.627991259098053, "epoch": 0.53, "grad_norm": 2.0095529100790013, "learning_rate": 2.6415609094604562e-06, "logits/chosen": -2.543858051300049, "logits/rejected": -2.564131021499634, "logps/chosen": -284.96490478515625, "logps/rejected": -213.75936889648438, "loss": 0.6419, "positive_losses": 0.08691177517175674, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.235652893781662, "rewards/margins": 0.1479492485523224, "rewards/margins_max": 0.40268969535827637, "rewards/margins_min": -0.03910567983984947, "rewards/margins_std": 0.19945472478866577, "rewards/rejected": 0.08770367503166199, "step": 2040 }, { "dpo_losses": 0.6487405896186829, "epoch": 0.54, "grad_norm": 27.932688868454377, "learning_rate": 2.618747345980904e-06, "logits/chosen": -2.665897846221924, "logits/rejected": -2.6270029544830322, "logps/chosen": -238.196044921875, "logps/rejected": -246.7635498046875, "loss": 0.7043, "positive_losses": 0.5654325485229492, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.210309699177742, "rewards/margins": 0.1032426729798317, "rewards/margins_max": 0.32105982303619385, "rewards/margins_min": -0.10647524893283844, "rewards/margins_std": 0.19207589328289032, "rewards/rejected": 0.10706702619791031, "step": 2050 }, { "dpo_losses": 0.6493858695030212, "epoch": 0.54, "grad_norm": 2.2409215669673306, "learning_rate": 2.595923867132136e-06, "logits/chosen": -2.702725887298584, "logits/rejected": -2.6534149646759033, "logps/chosen": -281.5009460449219, "logps/rejected": -256.12164306640625, "loss": 0.6666, "positive_losses": 0.2502501606941223, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.26053109765052795, "rewards/margins": 0.10209240764379501, "rewards/margins_max": 0.3246293067932129, "rewards/margins_min": -0.10611675679683685, "rewards/margins_std": 0.19126805663108826, "rewards/rejected": 0.15843868255615234, "step": 2060 }, { "dpo_losses": 0.6561940312385559, "epoch": 0.54, "grad_norm": 1.9395106557743642, "learning_rate": 2.5730923786680672e-06, "logits/chosen": -2.672586441040039, "logits/rejected": -2.6671547889709473, "logps/chosen": -239.53213500976562, "logps/rejected": -263.83819580078125, "loss": 0.6554, "positive_losses": 0.08901993930339813, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22132229804992676, "rewards/margins": 0.0845700055360794, "rewards/margins_max": 0.28750890493392944, "rewards/margins_min": -0.09480021893978119, "rewards/margins_std": 0.17295077443122864, "rewards/rejected": 0.13675229251384735, "step": 2070 }, { "dpo_losses": 0.648390531539917, "epoch": 0.54, "grad_norm": 2.1118077695813278, "learning_rate": 2.5502547870114137e-06, "logits/chosen": -2.692939519882202, "logits/rejected": -2.6532864570617676, "logps/chosen": -245.56301879882812, "logps/rejected": -235.213623046875, "loss": 0.6669, "positive_losses": 0.24616627395153046, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2260979413986206, "rewards/margins": 0.10583852231502533, "rewards/margins_max": 0.3507748544216156, "rewards/margins_min": -0.14606128633022308, "rewards/margins_std": 0.22330446541309357, "rewards/rejected": 0.12025941908359528, "step": 2080 }, { "dpo_losses": 0.6537029147148132, "epoch": 0.55, "grad_norm": 2.312745261718066, "learning_rate": 2.527412999094507e-06, "logits/chosen": -2.614720106124878, "logits/rejected": -2.61167573928833, "logps/chosen": -245.9591064453125, "logps/rejected": -246.91757202148438, "loss": 0.6898, "positive_losses": 0.22916193306446075, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19518707692623138, "rewards/margins": 0.08987731486558914, "rewards/margins_max": 0.2740749418735504, "rewards/margins_min": -0.08669424802064896, "rewards/margins_std": 0.15994782745838165, "rewards/rejected": 0.10530976206064224, "step": 2090 }, { "dpo_losses": 0.6503337621688843, "epoch": 0.55, "grad_norm": 1.6356648623281171, "learning_rate": 2.504568922200064e-06, "logits/chosen": -2.6984810829162598, "logits/rejected": -2.667903423309326, "logps/chosen": -225.71798706054688, "logps/rejected": -187.17330932617188, "loss": 0.6695, "positive_losses": 0.14211463928222656, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2089039832353592, "rewards/margins": 0.09576747566461563, "rewards/margins_max": 0.26251569390296936, "rewards/margins_min": -0.05504865571856499, "rewards/margins_std": 0.1408441960811615, "rewards/rejected": 0.11313650757074356, "step": 2100 }, { "epoch": 0.55, "eval_dpo_losses": 0.6443433165550232, "eval_logits/chosen": -2.6403861045837402, "eval_logits/rejected": -2.6032521724700928, "eval_logps/chosen": -262.2980041503906, "eval_logps/rejected": -247.34201049804688, "eval_loss": 0.6841303706169128, "eval_positive_losses": 0.28419262170791626, "eval_rewards/accuracies": 0.7099999785423279, "eval_rewards/chosen": 0.22295419871807098, "eval_rewards/margins": 0.11058513075113297, "eval_rewards/margins_max": 0.42021989822387695, "eval_rewards/margins_min": -0.153671532869339, "eval_rewards/margins_std": 0.19153155386447906, "eval_rewards/rejected": 0.11236906796693802, "eval_runtime": 428.0985, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 2100 }, { "dpo_losses": 0.6568988561630249, "epoch": 0.55, "grad_norm": 1.7868341286072185, "learning_rate": 2.4817244638019333e-06, "logits/chosen": -2.661259174346924, "logits/rejected": -2.641444683074951, "logps/chosen": -235.8902587890625, "logps/rejected": -235.330322265625, "loss": 0.7136, "positive_losses": 0.02505035325884819, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2192874401807785, "rewards/margins": 0.08493863046169281, "rewards/margins_max": 0.3254045844078064, "rewards/margins_min": -0.10991616547107697, "rewards/margins_std": 0.1919499933719635, "rewards/rejected": 0.1343488246202469, "step": 2110 }, { "dpo_losses": 0.6377665996551514, "epoch": 0.55, "grad_norm": 21.985245351908034, "learning_rate": 2.4588815314058155e-06, "logits/chosen": -2.6296534538269043, "logits/rejected": -2.590580463409424, "logps/chosen": -226.78634643554688, "logps/rejected": -234.3419952392578, "loss": 0.6845, "positive_losses": 0.016133880242705345, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22431540489196777, "rewards/margins": 0.12120058387517929, "rewards/margins_max": 0.29898110032081604, "rewards/margins_min": -0.0451454222202301, "rewards/margins_std": 0.15597811341285706, "rewards/rejected": 0.10311480611562729, "step": 2120 }, { "dpo_losses": 0.6583911776542664, "epoch": 0.56, "grad_norm": 11.158834718963995, "learning_rate": 2.4360420323899922e-06, "logits/chosen": -2.6720199584960938, "logits/rejected": -2.664806842803955, "logps/chosen": -242.418701171875, "logps/rejected": -251.580078125, "loss": 0.6923, "positive_losses": 0.3454399108886719, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20674967765808105, "rewards/margins": 0.07989230006933212, "rewards/margins_max": 0.28146398067474365, "rewards/margins_min": -0.11776062101125717, "rewards/margins_std": 0.17787934839725494, "rewards/rejected": 0.12685738503932953, "step": 2130 }, { "dpo_losses": 0.641891598701477, "epoch": 0.56, "grad_norm": 22.608503028278708, "learning_rate": 2.4132078738460585e-06, "logits/chosen": -2.6954100131988525, "logits/rejected": -2.679792881011963, "logps/chosen": -255.9194793701172, "logps/rejected": -250.09262084960938, "loss": 0.67, "positive_losses": 0.2556828558444977, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.22262707352638245, "rewards/margins": 0.1144232302904129, "rewards/margins_max": 0.31138306856155396, "rewards/margins_min": -0.07543648779392242, "rewards/margins_std": 0.1741967797279358, "rewards/rejected": 0.10820382833480835, "step": 2140 }, { "dpo_losses": 0.6457602381706238, "epoch": 0.56, "grad_norm": 8.879370244695442, "learning_rate": 2.3903809624196826e-06, "logits/chosen": -2.6508166790008545, "logits/rejected": -2.6161789894104004, "logps/chosen": -260.8797912597656, "logps/rejected": -224.77035522460938, "loss": 0.6811, "positive_losses": 0.41312235593795776, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2238878309726715, "rewards/margins": 0.10860620439052582, "rewards/margins_max": 0.30983787775039673, "rewards/margins_min": -0.11345354467630386, "rewards/margins_std": 0.18995985388755798, "rewards/rejected": 0.11528158187866211, "step": 2150 }, { "dpo_losses": 0.6489660143852234, "epoch": 0.57, "grad_norm": 23.894288265582425, "learning_rate": 2.3675632041513978e-06, "logits/chosen": -2.5794270038604736, "logits/rejected": -2.571488618850708, "logps/chosen": -247.68661499023438, "logps/rejected": -241.1427001953125, "loss": 0.6863, "positive_losses": 0.3009759783744812, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19268381595611572, "rewards/margins": 0.0996984988451004, "rewards/margins_max": 0.30270710587501526, "rewards/margins_min": -0.09910142421722412, "rewards/margins_std": 0.1765524446964264, "rewards/rejected": 0.09298529475927353, "step": 2160 }, { "dpo_losses": 0.6239650249481201, "epoch": 0.57, "grad_norm": 14.773507733836224, "learning_rate": 2.3447565043174533e-06, "logits/chosen": -2.550816059112549, "logits/rejected": -2.582885265350342, "logps/chosen": -252.0146026611328, "logps/rejected": -252.2535858154297, "loss": 0.672, "positive_losses": 0.3954521119594574, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2451392114162445, "rewards/margins": 0.15671542286872864, "rewards/margins_max": 0.39923062920570374, "rewards/margins_min": -0.06341136246919632, "rewards/margins_std": 0.2062772810459137, "rewards/rejected": 0.08842380344867706, "step": 2170 }, { "dpo_losses": 0.6374545693397522, "epoch": 0.57, "grad_norm": 2.091495837749851, "learning_rate": 2.321962767270724e-06, "logits/chosen": -2.7370948791503906, "logits/rejected": -2.689626693725586, "logps/chosen": -287.5489501953125, "logps/rejected": -249.8207244873047, "loss": 0.6783, "positive_losses": 0.3480583131313324, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2280876636505127, "rewards/margins": 0.12556587159633636, "rewards/margins_max": 0.34966373443603516, "rewards/margins_min": -0.07942704856395721, "rewards/margins_std": 0.18983253836631775, "rewards/rejected": 0.10252177715301514, "step": 2180 }, { "dpo_losses": 0.6545939445495605, "epoch": 0.57, "grad_norm": 9.123808482513535, "learning_rate": 2.299183896281692e-06, "logits/chosen": -2.6965737342834473, "logits/rejected": -2.6977882385253906, "logps/chosen": -251.5290069580078, "logps/rejected": -271.16448974609375, "loss": 0.6629, "positive_losses": 0.02122955396771431, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20595040917396545, "rewards/margins": 0.08762118965387344, "rewards/margins_max": 0.2856506109237671, "rewards/margins_min": -0.11087393760681152, "rewards/margins_std": 0.1784714311361313, "rewards/rejected": 0.11832920461893082, "step": 2190 }, { "dpo_losses": 0.6504623889923096, "epoch": 0.58, "grad_norm": 2.1236901453664587, "learning_rate": 2.2764217933795297e-06, "logits/chosen": -2.6987946033477783, "logits/rejected": -2.6358554363250732, "logps/chosen": -271.6541442871094, "logps/rejected": -237.68222045898438, "loss": 0.6633, "positive_losses": 0.3033943176269531, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22934004664421082, "rewards/margins": 0.097927525639534, "rewards/margins_max": 0.30080220103263855, "rewards/margins_min": -0.12408483028411865, "rewards/margins_std": 0.18676115572452545, "rewards/rejected": 0.13141249120235443, "step": 2200 }, { "epoch": 0.58, "eval_dpo_losses": 0.6435040235519409, "eval_logits/chosen": -2.6383254528045654, "eval_logits/rejected": -2.6014163494110107, "eval_logps/chosen": -261.85888671875, "eval_logps/rejected": -247.10403442382812, "eval_loss": 0.6799082159996033, "eval_positive_losses": 0.2579602301120758, "eval_rewards/accuracies": 0.7139999866485596, "eval_rewards/chosen": 0.2273455262184143, "eval_rewards/margins": 0.1125965267419815, "eval_rewards/margins_max": 0.42541539669036865, "eval_rewards/margins_min": -0.15488092601299286, "eval_rewards/margins_std": 0.19319504499435425, "eval_rewards/rejected": 0.114749014377594, "eval_runtime": 428.0816, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 2200 }, { "dpo_losses": 0.6446993350982666, "epoch": 0.58, "grad_norm": 14.01389386257104, "learning_rate": 2.2536783591932786e-06, "logits/chosen": -2.656202793121338, "logits/rejected": -2.613140106201172, "logps/chosen": -239.73202514648438, "logps/rejected": -248.09237670898438, "loss": 0.6649, "positive_losses": 0.35535115003585815, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21086589992046356, "rewards/margins": 0.10729577392339706, "rewards/margins_max": 0.28793585300445557, "rewards/margins_min": -0.07084666192531586, "rewards/margins_std": 0.16439056396484375, "rewards/rejected": 0.1035701185464859, "step": 2210 }, { "dpo_losses": 0.6435825228691101, "epoch": 0.58, "grad_norm": 18.876870574218103, "learning_rate": 2.230955492793149e-06, "logits/chosen": -2.7256557941436768, "logits/rejected": -2.7092697620391846, "logps/chosen": -301.4788513183594, "logps/rejected": -264.95849609375, "loss": 0.6931, "positive_losses": 0.31521472334861755, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21002745628356934, "rewards/margins": 0.11338339000940323, "rewards/margins_max": 0.3429552912712097, "rewards/margins_min": -0.11895330995321274, "rewards/margins_std": 0.20564672350883484, "rewards/rejected": 0.0966440886259079, "step": 2220 }, { "dpo_losses": 0.6391919851303101, "epoch": 0.58, "grad_norm": 20.86574326362134, "learning_rate": 2.208255091531947e-06, "logits/chosen": -2.610348701477051, "logits/rejected": -2.6132984161376953, "logps/chosen": -228.4042205810547, "logps/rejected": -228.87814331054688, "loss": 0.6615, "positive_losses": 0.11838321387767792, "rewards/accuracies": 0.75, "rewards/chosen": 0.23576466739177704, "rewards/margins": 0.12023010104894638, "rewards/margins_max": 0.32014840841293335, "rewards/margins_min": -0.06382576376199722, "rewards/margins_std": 0.17094869911670685, "rewards/rejected": 0.11553458124399185, "step": 2230 }, { "dpo_losses": 0.6561200618743896, "epoch": 0.59, "grad_norm": 2.1399648720540925, "learning_rate": 2.1855790508866435e-06, "logits/chosen": -2.5838656425476074, "logits/rejected": -2.5281291007995605, "logps/chosen": -262.9925231933594, "logps/rejected": -244.24853515625, "loss": 0.7209, "positive_losses": 0.18396730720996857, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.25871050357818604, "rewards/margins": 0.10102052986621857, "rewards/margins_max": 0.3833063244819641, "rewards/margins_min": -0.18261994421482086, "rewards/margins_std": 0.25534966588020325, "rewards/rejected": 0.15768997371196747, "step": 2240 }, { "dpo_losses": 0.6441120505332947, "epoch": 0.59, "grad_norm": 6.884931569365359, "learning_rate": 2.162929264300107e-06, "logits/chosen": -2.6860594749450684, "logits/rejected": -2.6936264038085938, "logps/chosen": -262.0154113769531, "logps/rejected": -240.4822998046875, "loss": 0.7164, "positive_losses": 0.605613112449646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21775564551353455, "rewards/margins": 0.11152400076389313, "rewards/margins_max": 0.3520006239414215, "rewards/margins_min": -0.0792192816734314, "rewards/margins_std": 0.19621047377586365, "rewards/rejected": 0.10623165220022202, "step": 2250 }, { "dpo_losses": 0.6494182348251343, "epoch": 0.59, "grad_norm": 11.585645778736357, "learning_rate": 2.1403076230230006e-06, "logits/chosen": -2.573509693145752, "logits/rejected": -2.5338587760925293, "logps/chosen": -246.6294403076172, "logps/rejected": -238.9720001220703, "loss": 0.6875, "positive_losses": 0.24757306277751923, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21872088313102722, "rewards/margins": 0.09721089154481888, "rewards/margins_max": 0.2833651900291443, "rewards/margins_min": -0.1053292527794838, "rewards/margins_std": 0.1713966429233551, "rewards/rejected": 0.12150999158620834, "step": 2260 }, { "dpo_losses": 0.6332554817199707, "epoch": 0.59, "grad_norm": 2.0882530361771363, "learning_rate": 2.11771601595586e-06, "logits/chosen": -2.669936418533325, "logits/rejected": -2.6094093322753906, "logps/chosen": -242.7032928466797, "logps/rejected": -234.1455535888672, "loss": 0.6378, "positive_losses": 0.030338669195771217, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.26567015051841736, "rewards/margins": 0.13651952147483826, "rewards/margins_max": 0.3859143853187561, "rewards/margins_min": -0.07704669237136841, "rewards/margins_std": 0.20730972290039062, "rewards/rejected": 0.1291506141424179, "step": 2270 }, { "dpo_losses": 0.6492759585380554, "epoch": 0.6, "grad_norm": 10.500555841174924, "learning_rate": 2.0951563294913737e-06, "logits/chosen": -2.61448335647583, "logits/rejected": -2.598428249359131, "logps/chosen": -261.6297912597656, "logps/rejected": -271.35736083984375, "loss": 0.6854, "positive_losses": 0.5357402563095093, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23328149318695068, "rewards/margins": 0.10041890293359756, "rewards/margins_max": 0.30268600583076477, "rewards/margins_min": -0.10914675891399384, "rewards/margins_std": 0.185011625289917, "rewards/rejected": 0.13286259770393372, "step": 2280 }, { "dpo_losses": 0.6415624618530273, "epoch": 0.6, "grad_norm": 1.641243663580953, "learning_rate": 2.0726304473568693e-06, "logits/chosen": -2.65415620803833, "logits/rejected": -2.597093105316162, "logps/chosen": -235.9225616455078, "logps/rejected": -211.4453582763672, "loss": 0.6927, "positive_losses": 0.8288175463676453, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23576009273529053, "rewards/margins": 0.11631790548563004, "rewards/margins_max": 0.34672680497169495, "rewards/margins_min": -0.07067125290632248, "rewards/margins_std": 0.18884898722171783, "rewards/rejected": 0.1194421797990799, "step": 2290 }, { "dpo_losses": 0.65235835313797, "epoch": 0.6, "grad_norm": 1.9386519422396105, "learning_rate": 2.050140250457023e-06, "logits/chosen": -2.6757822036743164, "logits/rejected": -2.6740939617156982, "logps/chosen": -264.4683532714844, "logps/rejected": -268.44500732421875, "loss": 0.7136, "positive_losses": 0.49965667724609375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22599156200885773, "rewards/margins": 0.0977611392736435, "rewards/margins_max": 0.3241087794303894, "rewards/margins_min": -0.10137069225311279, "rewards/margins_std": 0.18976759910583496, "rewards/rejected": 0.12823040783405304, "step": 2300 }, { "epoch": 0.6, "eval_dpo_losses": 0.6443361639976501, "eval_logits/chosen": -2.6471359729766846, "eval_logits/rejected": -2.611751079559326, "eval_logps/chosen": -261.69073486328125, "eval_logps/rejected": -246.74459838867188, "eval_loss": 0.6781137585639954, "eval_positive_losses": 0.23762869834899902, "eval_rewards/accuracies": 0.7110000252723694, "eval_rewards/chosen": 0.2290269285440445, "eval_rewards/margins": 0.11068341135978699, "eval_rewards/margins_max": 0.4196871817111969, "eval_rewards/margins_min": -0.15320247411727905, "eval_rewards/margins_std": 0.19144602119922638, "eval_rewards/rejected": 0.1183435395359993, "eval_runtime": 428.0913, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 2300 }, { "dpo_losses": 0.642924964427948, "epoch": 0.6, "grad_norm": 1.8356161932105675, "learning_rate": 2.0276876167168042e-06, "logits/chosen": -2.6548080444335938, "logits/rejected": -2.614123821258545, "logps/chosen": -290.8428649902344, "logps/rejected": -231.98458862304688, "loss": 0.6622, "positive_losses": 0.28353041410446167, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22888973355293274, "rewards/margins": 0.11493362486362457, "rewards/margins_max": 0.37111008167266846, "rewards/margins_min": -0.08703051507472992, "rewards/margins_std": 0.20240816473960876, "rewards/rejected": 0.11395610868930817, "step": 2310 }, { "dpo_losses": 0.6534973978996277, "epoch": 0.61, "grad_norm": 4.820293111159344, "learning_rate": 2.0052744209246682e-06, "logits/chosen": -2.6986007690429688, "logits/rejected": -2.6542580127716064, "logps/chosen": -239.6178741455078, "logps/rejected": -219.450439453125, "loss": 0.6904, "positive_losses": 0.5139321088790894, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.23109444975852966, "rewards/margins": 0.09262574464082718, "rewards/margins_max": 0.323060542345047, "rewards/margins_min": -0.1246475949883461, "rewards/margins_std": 0.1977689117193222, "rewards/rejected": 0.13846872746944427, "step": 2320 }, { "dpo_losses": 0.6327471137046814, "epoch": 0.61, "grad_norm": 7.510093509178269, "learning_rate": 1.9829025345760127e-06, "logits/chosen": -2.6793408393859863, "logits/rejected": -2.6734347343444824, "logps/chosen": -279.198974609375, "logps/rejected": -287.8305969238281, "loss": 0.6621, "positive_losses": 0.3259936273097992, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2557789087295532, "rewards/margins": 0.13750512897968292, "rewards/margins_max": 0.40703850984573364, "rewards/margins_min": -0.06727553904056549, "rewards/margins_std": 0.21355748176574707, "rewards/rejected": 0.1182737797498703, "step": 2330 }, { "dpo_losses": 0.6526913046836853, "epoch": 0.61, "grad_norm": 4.227720768249378, "learning_rate": 1.9605738257169115e-06, "logits/chosen": -2.624246120452881, "logits/rejected": -2.602616786956787, "logps/chosen": -271.4979248046875, "logps/rejected": -267.3983459472656, "loss": 0.6765, "positive_losses": 0.3113154470920563, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21783605217933655, "rewards/margins": 0.0947721004486084, "rewards/margins_max": 0.33382928371429443, "rewards/margins_min": -0.13909652829170227, "rewards/margins_std": 0.21136465668678284, "rewards/rejected": 0.12306392192840576, "step": 2340 }, { "dpo_losses": 0.6590754389762878, "epoch": 0.62, "grad_norm": 4.172955662352638, "learning_rate": 1.9382901587881275e-06, "logits/chosen": -2.697080135345459, "logits/rejected": -2.6767849922180176, "logps/chosen": -266.29718017578125, "logps/rejected": -235.399169921875, "loss": 0.6694, "positive_losses": 0.1714244782924652, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22708387672901154, "rewards/margins": 0.08011853694915771, "rewards/margins_max": 0.2996049225330353, "rewards/margins_min": -0.13483865559101105, "rewards/margins_std": 0.19062037765979767, "rewards/rejected": 0.14696532487869263, "step": 2350 }, { "dpo_losses": 0.6451055407524109, "epoch": 0.62, "grad_norm": 7.03921180084103, "learning_rate": 1.916053394469437e-06, "logits/chosen": -2.6840641498565674, "logits/rejected": -2.635782480239868, "logps/chosen": -263.34197998046875, "logps/rejected": -206.8403778076172, "loss": 0.6896, "positive_losses": 0.5735152959823608, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21855726838111877, "rewards/margins": 0.12016957998275757, "rewards/margins_max": 0.3809802234172821, "rewards/margins_min": -0.12460510432720184, "rewards/margins_std": 0.22663649916648865, "rewards/rejected": 0.09838766604661942, "step": 2360 }, { "dpo_losses": 0.6624797582626343, "epoch": 0.62, "grad_norm": 2.3493769935869393, "learning_rate": 1.8938653895242604e-06, "logits/chosen": -2.708848476409912, "logits/rejected": -2.689735174179077, "logps/chosen": -226.92910766601562, "logps/rejected": -238.28622436523438, "loss": 0.6737, "positive_losses": 0.17334279417991638, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21128730475902557, "rewards/margins": 0.07327686995267868, "rewards/margins_max": 0.24936611950397491, "rewards/margins_min": -0.11990121752023697, "rewards/margins_std": 0.16652920842170715, "rewards/rejected": 0.1380104124546051, "step": 2370 }, { "dpo_losses": 0.6371510028839111, "epoch": 0.62, "grad_norm": 1.9093900988991668, "learning_rate": 1.8717279966446267e-06, "logits/chosen": -2.6955037117004395, "logits/rejected": -2.635916233062744, "logps/chosen": -276.50286865234375, "logps/rejected": -231.66708374023438, "loss": 0.6679, "positive_losses": 0.250906378030777, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23346546292304993, "rewards/margins": 0.12501740455627441, "rewards/margins_max": 0.32189956307411194, "rewards/margins_min": -0.06842408329248428, "rewards/margins_std": 0.17628346383571625, "rewards/rejected": 0.10844806581735611, "step": 2380 }, { "dpo_losses": 0.6528540849685669, "epoch": 0.63, "grad_norm": 12.358461985279623, "learning_rate": 1.8496430642964698e-06, "logits/chosen": -2.687997341156006, "logits/rejected": -2.675405979156494, "logps/chosen": -267.6132507324219, "logps/rejected": -262.12823486328125, "loss": 0.6581, "positive_losses": 0.11948716640472412, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2187456339597702, "rewards/margins": 0.09295535832643509, "rewards/margins_max": 0.318665087223053, "rewards/margins_min": -0.13442297279834747, "rewards/margins_std": 0.19662639498710632, "rewards/rejected": 0.1257902830839157, "step": 2390 }, { "dpo_losses": 0.6597181558609009, "epoch": 0.63, "grad_norm": 1.8895929845909043, "learning_rate": 1.827612436565286e-06, "logits/chosen": -2.6727089881896973, "logits/rejected": -2.613783359527588, "logps/chosen": -218.83389282226562, "logps/rejected": -250.94454956054688, "loss": 0.6631, "positive_losses": 0.18260526657104492, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22674863040447235, "rewards/margins": 0.07946915179491043, "rewards/margins_max": 0.2924432158470154, "rewards/margins_min": -0.11948784440755844, "rewards/margins_std": 0.1873387098312378, "rewards/rejected": 0.1472795009613037, "step": 2400 }, { "epoch": 0.63, "eval_dpo_losses": 0.6449704170227051, "eval_logits/chosen": -2.6430318355560303, "eval_logits/rejected": -2.6072418689727783, "eval_logps/chosen": -261.74786376953125, "eval_logps/rejected": -246.63014221191406, "eval_loss": 0.676893413066864, "eval_positive_losses": 0.22887668013572693, "eval_rewards/accuracies": 0.7080000042915344, "eval_rewards/chosen": 0.22845546901226044, "eval_rewards/margins": 0.10896759480237961, "eval_rewards/margins_max": 0.41343066096305847, "eval_rewards/margins_min": -0.1509072631597519, "eval_rewards/margins_std": 0.18821609020233154, "eval_rewards/rejected": 0.11948786675930023, "eval_runtime": 428.2634, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.292, "step": 2400 }, { "dpo_losses": 0.6545339822769165, "epoch": 0.63, "grad_norm": 7.772719030867673, "learning_rate": 1.8056379530021492e-06, "logits/chosen": -2.615975856781006, "logits/rejected": -2.6049745082855225, "logps/chosen": -228.251953125, "logps/rejected": -227.32797241210938, "loss": 0.6481, "positive_losses": 0.014975356869399548, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.21992447972297668, "rewards/margins": 0.08761879056692123, "rewards/margins_max": 0.2946820557117462, "rewards/margins_min": -0.10195348411798477, "rewards/margins_std": 0.17538979649543762, "rewards/rejected": 0.13230566680431366, "step": 2410 }, { "dpo_losses": 0.6384466886520386, "epoch": 0.63, "grad_norm": 8.770477746806417, "learning_rate": 1.7837214484701154e-06, "logits/chosen": -2.7008354663848877, "logits/rejected": -2.6350302696228027, "logps/chosen": -311.5066223144531, "logps/rejected": -258.36822509765625, "loss": 0.668, "positive_losses": 0.29362478852272034, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.27750059962272644, "rewards/margins": 0.12787646055221558, "rewards/margins_max": 0.3754945397377014, "rewards/margins_min": -0.09464956074953079, "rewards/margins_std": 0.20763537287712097, "rewards/rejected": 0.14962413907051086, "step": 2420 }, { "dpo_losses": 0.6517640948295593, "epoch": 0.64, "grad_norm": 2.1106932865174315, "learning_rate": 1.7618647529910043e-06, "logits/chosen": -2.703248977661133, "logits/rejected": -2.6583917140960693, "logps/chosen": -287.2205810546875, "logps/rejected": -274.2737731933594, "loss": 0.6766, "positive_losses": 0.004874801728874445, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2177753895521164, "rewards/margins": 0.09591395407915115, "rewards/margins_max": 0.3515348732471466, "rewards/margins_min": -0.12417320907115936, "rewards/margins_std": 0.2077832669019699, "rewards/rejected": 0.12186142057180405, "step": 2430 }, { "dpo_losses": 0.6442539691925049, "epoch": 0.64, "grad_norm": 9.177313522817762, "learning_rate": 1.7400696915925996e-06, "logits/chosen": -2.61651873588562, "logits/rejected": -2.55776309967041, "logps/chosen": -250.7611083984375, "logps/rejected": -226.8053741455078, "loss": 0.6548, "positive_losses": 0.12011022865772247, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.23287931084632874, "rewards/margins": 0.11137101799249649, "rewards/margins_max": 0.33335360884666443, "rewards/margins_min": -0.07477789372205734, "rewards/margins_std": 0.18021732568740845, "rewards/rejected": 0.12150830030441284, "step": 2440 }, { "dpo_losses": 0.6601482033729553, "epoch": 0.64, "grad_norm": 1.873232326137569, "learning_rate": 1.718338084156254e-06, "logits/chosen": -2.7020750045776367, "logits/rejected": -2.7036221027374268, "logps/chosen": -257.56304931640625, "logps/rejected": -280.86236572265625, "loss": 0.6887, "positive_losses": 0.27468910813331604, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.21189913153648376, "rewards/margins": 0.0761132761836052, "rewards/margins_max": 0.24091093242168427, "rewards/margins_min": -0.09531867504119873, "rewards/margins_std": 0.14899347722530365, "rewards/rejected": 0.13578586280345917, "step": 2450 }, { "dpo_losses": 0.6478050947189331, "epoch": 0.64, "grad_norm": 15.953977197794108, "learning_rate": 1.6966717452649372e-06, "logits/chosen": -2.688554286956787, "logits/rejected": -2.678035020828247, "logps/chosen": -252.1296844482422, "logps/rejected": -239.04464721679688, "loss": 0.683, "positive_losses": 0.2908807396888733, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21076655387878418, "rewards/margins": 0.10334186255931854, "rewards/margins_max": 0.32458725571632385, "rewards/margins_min": -0.0979321151971817, "rewards/margins_std": 0.19284126162528992, "rewards/rejected": 0.10742469877004623, "step": 2460 }, { "dpo_losses": 0.6468202471733093, "epoch": 0.65, "grad_norm": 1.9430355856596895, "learning_rate": 1.6750724840517103e-06, "logits/chosen": -2.649327039718628, "logits/rejected": -2.6505379676818848, "logps/chosen": -279.24298095703125, "logps/rejected": -266.89642333984375, "loss": 0.7082, "positive_losses": 0.6239467859268188, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21040575206279755, "rewards/margins": 0.10797951370477676, "rewards/margins_max": 0.30911627411842346, "rewards/margins_min": -0.08701352775096893, "rewards/margins_std": 0.18224899470806122, "rewards/rejected": 0.10242621600627899, "step": 2470 }, { "dpo_losses": 0.6477130651473999, "epoch": 0.65, "grad_norm": 15.76191478734966, "learning_rate": 1.6535421040486686e-06, "logits/chosen": -2.664402723312378, "logits/rejected": -2.6564764976501465, "logps/chosen": -260.77862548828125, "logps/rejected": -248.49453735351562, "loss": 0.677, "positive_losses": 0.32372361421585083, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22065123915672302, "rewards/margins": 0.1069595217704773, "rewards/margins_max": 0.33060309290885925, "rewards/margins_min": -0.11348260939121246, "rewards/margins_std": 0.19639793038368225, "rewards/rejected": 0.11369173228740692, "step": 2480 }, { "dpo_losses": 0.6175050139427185, "epoch": 0.65, "grad_norm": 17.532775703719967, "learning_rate": 1.6320824030363458e-06, "logits/chosen": -2.599806785583496, "logits/rejected": -2.523268938064575, "logps/chosen": -273.0903015136719, "logps/rejected": -238.984375, "loss": 0.6431, "positive_losses": 0.190023735165596, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2450869083404541, "rewards/margins": 0.1699949949979782, "rewards/margins_max": 0.40681809186935425, "rewards/margins_min": -0.07576446980237961, "rewards/margins_std": 0.21555539965629578, "rewards/rejected": 0.07509191334247589, "step": 2490 }, { "dpo_losses": 0.6297341585159302, "epoch": 0.65, "grad_norm": 8.013988662471801, "learning_rate": 1.6106951728936028e-06, "logits/chosen": -2.6417384147644043, "logits/rejected": -2.6619882583618164, "logps/chosen": -231.43295288085938, "logps/rejected": -252.7168426513672, "loss": 0.6884, "positive_losses": 0.8568565249443054, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22378802299499512, "rewards/margins": 0.14048174023628235, "rewards/margins_max": 0.3421075940132141, "rewards/margins_min": -0.038745298981666565, "rewards/margins_std": 0.17421843111515045, "rewards/rejected": 0.08330627530813217, "step": 2500 }, { "epoch": 0.65, "eval_dpo_losses": 0.6403517127037048, "eval_logits/chosen": -2.6412670612335205, "eval_logits/rejected": -2.606358528137207, "eval_logps/chosen": -262.11669921875, "eval_logps/rejected": -248.11026000976562, "eval_loss": 0.6854027509689331, "eval_positive_losses": 0.3215247392654419, "eval_rewards/accuracies": 0.7120000123977661, "eval_rewards/chosen": 0.22476711869239807, "eval_rewards/margins": 0.12008056044578552, "eval_rewards/margins_max": 0.44079744815826416, "eval_rewards/margins_min": -0.15831314027309418, "eval_rewards/margins_std": 0.2000243216753006, "eval_rewards/rejected": 0.10468658804893494, "eval_runtime": 427.7686, "eval_samples_per_second": 4.675, "eval_steps_per_second": 0.292, "step": 2500 }, { "dpo_losses": 0.6387171745300293, "epoch": 0.66, "grad_norm": 1.9671122687318612, "learning_rate": 1.5893821994479996e-06, "logits/chosen": -2.6180126667022705, "logits/rejected": -2.570876121520996, "logps/chosen": -238.55923461914062, "logps/rejected": -235.71737670898438, "loss": 0.6459, "positive_losses": 0.010045480914413929, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.22037789225578308, "rewards/margins": 0.1271388828754425, "rewards/margins_max": 0.3906629681587219, "rewards/margins_min": -0.11404719203710556, "rewards/margins_std": 0.22322943806648254, "rewards/rejected": 0.09323902428150177, "step": 2510 }, { "dpo_losses": 0.6344737410545349, "epoch": 0.66, "grad_norm": 12.848256009868312, "learning_rate": 1.5681452623266868e-06, "logits/chosen": -2.694092035293579, "logits/rejected": -2.657163143157959, "logps/chosen": -237.0204620361328, "logps/rejected": -257.2022399902344, "loss": 0.6847, "positive_losses": 0.2897499203681946, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21192964911460876, "rewards/margins": 0.13573183119297028, "rewards/margins_max": 0.39697641134262085, "rewards/margins_min": -0.08935762196779251, "rewards/margins_std": 0.223657488822937, "rewards/rejected": 0.07619784027338028, "step": 2520 }, { "dpo_losses": 0.6213490962982178, "epoch": 0.66, "grad_norm": 7.56286014928845, "learning_rate": 1.5469861348078014e-06, "logits/chosen": -2.6197333335876465, "logits/rejected": -2.6297221183776855, "logps/chosen": -255.9955291748047, "logps/rejected": -268.439697265625, "loss": 0.6403, "positive_losses": 0.1759118139743805, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.25826987624168396, "rewards/margins": 0.16589568555355072, "rewards/margins_max": 0.4661618173122406, "rewards/margins_min": -0.05714429169893265, "rewards/margins_std": 0.2331564724445343, "rewards/rejected": 0.09237419068813324, "step": 2530 }, { "dpo_losses": 0.651238739490509, "epoch": 0.66, "grad_norm": 17.91774946385872, "learning_rate": 1.5259065836724035e-06, "logits/chosen": -2.650686740875244, "logits/rejected": -2.6421239376068115, "logps/chosen": -231.24282836914062, "logps/rejected": -279.4007263183594, "loss": 0.6847, "positive_losses": 0.3626817762851715, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18830692768096924, "rewards/margins": 0.09621229022741318, "rewards/margins_max": 0.31938856840133667, "rewards/margins_min": -0.10441014915704727, "rewards/margins_std": 0.19326581060886383, "rewards/rejected": 0.09209464490413666, "step": 2540 }, { "dpo_losses": 0.640707790851593, "epoch": 0.67, "grad_norm": 9.327269446213379, "learning_rate": 1.5049083690569456e-06, "logits/chosen": -2.65415620803833, "logits/rejected": -2.6038031578063965, "logps/chosen": -226.2071533203125, "logps/rejected": -233.241455078125, "loss": 0.6716, "positive_losses": 0.41562312841415405, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2270263433456421, "rewards/margins": 0.1171361580491066, "rewards/margins_max": 0.3306456208229065, "rewards/margins_min": -0.0836900919675827, "rewards/margins_std": 0.18275295197963715, "rewards/rejected": 0.10989020019769669, "step": 2550 }, { "dpo_losses": 0.6367359161376953, "epoch": 0.67, "grad_norm": 2.0294225835552444, "learning_rate": 1.4839932443063057e-06, "logits/chosen": -2.576378345489502, "logits/rejected": -2.5502843856811523, "logps/chosen": -214.53225708007812, "logps/rejected": -193.76736450195312, "loss": 0.6614, "positive_losses": 0.5105085372924805, "rewards/accuracies": 0.75, "rewards/chosen": 0.2333839386701584, "rewards/margins": 0.12664873898029327, "rewards/margins_max": 0.3154265582561493, "rewards/margins_min": -0.07867036014795303, "rewards/margins_std": 0.17050711810588837, "rewards/rejected": 0.1067352145910263, "step": 2560 }, { "dpo_losses": 0.6389847993850708, "epoch": 0.67, "grad_norm": 12.980117265027586, "learning_rate": 1.4631629558273803e-06, "logits/chosen": -2.6802918910980225, "logits/rejected": -2.6632590293884277, "logps/chosen": -277.330810546875, "logps/rejected": -259.49267578125, "loss": 0.6689, "positive_losses": 0.12875232100486755, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22042131423950195, "rewards/margins": 0.1271112561225891, "rewards/margins_max": 0.34655025601387024, "rewards/margins_min": -0.1025533527135849, "rewards/margins_std": 0.2004118412733078, "rewards/rejected": 0.09331005066633224, "step": 2570 }, { "dpo_losses": 0.6375996470451355, "epoch": 0.68, "grad_norm": 1.9154082880451648, "learning_rate": 1.4424192429432657e-06, "logits/chosen": -2.6565616130828857, "logits/rejected": -2.60559344291687, "logps/chosen": -248.62783813476562, "logps/rejected": -255.2563018798828, "loss": 0.6649, "positive_losses": 0.3865188658237457, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.233999565243721, "rewards/margins": 0.12307735532522202, "rewards/margins_max": 0.3148532807826996, "rewards/margins_min": -0.07773791253566742, "rewards/margins_std": 0.17624323070049286, "rewards/rejected": 0.11092217266559601, "step": 2580 }, { "dpo_losses": 0.6365996599197388, "epoch": 0.68, "grad_norm": 4.128842962352004, "learning_rate": 1.421763837748016e-06, "logits/chosen": -2.681898593902588, "logits/rejected": -2.651426315307617, "logps/chosen": -265.7993469238281, "logps/rejected": -227.25900268554688, "loss": 0.7046, "positive_losses": 0.6643952131271362, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22455474734306335, "rewards/margins": 0.12930192053318024, "rewards/margins_max": 0.36015385389328003, "rewards/margins_min": -0.1021166443824768, "rewards/margins_std": 0.21218717098236084, "rewards/rejected": 0.09525284171104431, "step": 2590 }, { "dpo_losses": 0.6480204463005066, "epoch": 0.68, "grad_norm": 10.935780068452658, "learning_rate": 1.401198464962021e-06, "logits/chosen": -2.651763439178467, "logits/rejected": -2.638746500015259, "logps/chosen": -242.8926544189453, "logps/rejected": -274.14361572265625, "loss": 0.6701, "positive_losses": 0.24560967087745667, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20024743676185608, "rewards/margins": 0.10189042240381241, "rewards/margins_max": 0.31531795859336853, "rewards/margins_min": -0.08022803068161011, "rewards/margins_std": 0.17858561873435974, "rewards/rejected": 0.09835700690746307, "step": 2600 }, { "epoch": 0.68, "eval_dpo_losses": 0.6431987881660461, "eval_logits/chosen": -2.645789861679077, "eval_logits/rejected": -2.611599922180176, "eval_logps/chosen": -261.6951904296875, "eval_logps/rejected": -247.0383758544922, "eval_loss": 0.6817047595977783, "eval_positive_losses": 0.2660869061946869, "eval_rewards/accuracies": 0.7239999771118164, "eval_rewards/chosen": 0.22898218035697937, "eval_rewards/margins": 0.113576740026474, "eval_rewards/margins_max": 0.4344462454319, "eval_rewards/margins_min": -0.15536274015903473, "eval_rewards/margins_std": 0.19602753221988678, "eval_rewards/rejected": 0.11540544778108597, "eval_runtime": 427.7774, "eval_samples_per_second": 4.675, "eval_steps_per_second": 0.292, "step": 2600 }, { "dpo_losses": 0.637254536151886, "epoch": 0.68, "grad_norm": 12.47443858624476, "learning_rate": 1.3807248417879896e-06, "logits/chosen": -2.6484155654907227, "logits/rejected": -2.5987777709960938, "logps/chosen": -268.6302795410156, "logps/rejected": -212.509765625, "loss": 0.6782, "positive_losses": 0.20167112350463867, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2548355460166931, "rewards/margins": 0.12552940845489502, "rewards/margins_max": 0.3718295991420746, "rewards/margins_min": -0.07150840759277344, "rewards/margins_std": 0.2004489004611969, "rewards/rejected": 0.1293061524629593, "step": 2610 }, { "dpo_losses": 0.6588480472564697, "epoch": 0.69, "grad_norm": 2.203309288163017, "learning_rate": 1.3603446777675665e-06, "logits/chosen": -2.648198366165161, "logits/rejected": -2.640773296356201, "logps/chosen": -247.3656005859375, "logps/rejected": -243.3533172607422, "loss": 0.6802, "positive_losses": 0.26356926560401917, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20906881988048553, "rewards/margins": 0.0802852138876915, "rewards/margins_max": 0.29386550188064575, "rewards/margins_min": -0.11695988476276398, "rewards/margins_std": 0.1891327053308487, "rewards/rejected": 0.12878362834453583, "step": 2620 }, { "dpo_losses": 0.6410268545150757, "epoch": 0.69, "grad_norm": 35.622983427026256, "learning_rate": 1.3400596746385817e-06, "logits/chosen": -2.602980375289917, "logits/rejected": -2.643425941467285, "logps/chosen": -261.08648681640625, "logps/rejected": -260.27093505859375, "loss": 0.7158, "positive_losses": 1.012854814529419, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21067564189434052, "rewards/margins": 0.12065211683511734, "rewards/margins_max": 0.33560889959335327, "rewards/margins_min": -0.10677622258663177, "rewards/margins_std": 0.19985225796699524, "rewards/rejected": 0.09002353996038437, "step": 2630 }, { "dpo_losses": 0.6518687009811401, "epoch": 0.69, "grad_norm": 6.458894012068232, "learning_rate": 1.3198715261929587e-06, "logits/chosen": -2.7320096492767334, "logits/rejected": -2.717092514038086, "logps/chosen": -294.92437744140625, "logps/rejected": -248.4272003173828, "loss": 0.7169, "positive_losses": 0.7706745266914368, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.21491801738739014, "rewards/margins": 0.09523527324199677, "rewards/margins_max": 0.32567816972732544, "rewards/margins_min": -0.10061510652303696, "rewards/margins_std": 0.19867166876792908, "rewards/rejected": 0.11968272924423218, "step": 2640 }, { "dpo_losses": 0.6246224641799927, "epoch": 0.69, "grad_norm": 1.8372310545408177, "learning_rate": 1.2997819181352823e-06, "logits/chosen": -2.6638991832733154, "logits/rejected": -2.620840549468994, "logps/chosen": -238.86465454101562, "logps/rejected": -241.9066619873047, "loss": 0.648, "positive_losses": 0.20916476845741272, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.23800602555274963, "rewards/margins": 0.15638919174671173, "rewards/margins_max": 0.3884904980659485, "rewards/margins_min": -0.024152684956789017, "rewards/margins_std": 0.1865231841802597, "rewards/rejected": 0.0816168338060379, "step": 2650 }, { "dpo_losses": 0.6479545831680298, "epoch": 0.7, "grad_norm": 10.419345440065516, "learning_rate": 1.2797925279420454e-06, "logits/chosen": -2.727102756500244, "logits/rejected": -2.6972246170043945, "logps/chosen": -291.5232238769531, "logps/rejected": -232.34115600585938, "loss": 0.6486, "positive_losses": 0.0034275054931640625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.24116802215576172, "rewards/margins": 0.10435731709003448, "rewards/margins_max": 0.30152642726898193, "rewards/margins_min": -0.08927350491285324, "rewards/margins_std": 0.18111565709114075, "rewards/rejected": 0.13681069016456604, "step": 2660 }, { "dpo_losses": 0.6520140171051025, "epoch": 0.7, "grad_norm": 10.71310623216262, "learning_rate": 1.2599050247215764e-06, "logits/chosen": -2.759894847869873, "logits/rejected": -2.679072856903076, "logps/chosen": -255.1886749267578, "logps/rejected": -222.32943725585938, "loss": 0.6957, "positive_losses": 0.6127627491950989, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20489636063575745, "rewards/margins": 0.09360338747501373, "rewards/margins_max": 0.29037782549858093, "rewards/margins_min": -0.08301670104265213, "rewards/margins_std": 0.16996921598911285, "rewards/rejected": 0.11129295825958252, "step": 2670 }, { "dpo_losses": 0.6375089883804321, "epoch": 0.7, "grad_norm": 10.434048201797872, "learning_rate": 1.2401210690746705e-06, "logits/chosen": -2.698019504547119, "logits/rejected": -2.700742483139038, "logps/chosen": -299.0583801269531, "logps/rejected": -336.4329833984375, "loss": 0.7219, "positive_losses": 1.1617672443389893, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21842971444129944, "rewards/margins": 0.13291862607002258, "rewards/margins_max": 0.41158396005630493, "rewards/margins_min": -0.15379118919372559, "rewards/margins_std": 0.2509996294975281, "rewards/rejected": 0.08551109582185745, "step": 2680 }, { "dpo_losses": 0.6348380446434021, "epoch": 0.7, "grad_norm": 2.2058169104648453, "learning_rate": 1.2204423129559306e-06, "logits/chosen": -2.5682687759399414, "logits/rejected": -2.5628485679626465, "logps/chosen": -197.96792602539062, "logps/rejected": -222.6014862060547, "loss": 0.6793, "positive_losses": 0.20842599868774414, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21794691681861877, "rewards/margins": 0.13202880322933197, "rewards/margins_max": 0.34874072670936584, "rewards/margins_min": -0.07386180013418198, "rewards/margins_std": 0.1926882565021515, "rewards/rejected": 0.08591810613870621, "step": 2690 }, { "dpo_losses": 0.6509995460510254, "epoch": 0.71, "grad_norm": 16.022086158582685, "learning_rate": 1.20087039953583e-06, "logits/chosen": -2.643242359161377, "logits/rejected": -2.5987229347229004, "logps/chosen": -239.15554809570312, "logps/rejected": -214.74429321289062, "loss": 0.668, "positive_losses": 0.1304554045200348, "rewards/accuracies": 0.625, "rewards/chosen": 0.22552835941314697, "rewards/margins": 0.09569480270147324, "rewards/margins_max": 0.2902173399925232, "rewards/margins_min": -0.11860315501689911, "rewards/margins_std": 0.18064549565315247, "rewards/rejected": 0.12983354926109314, "step": 2700 }, { "epoch": 0.71, "eval_dpo_losses": 0.6440854072570801, "eval_logits/chosen": -2.6533379554748535, "eval_logits/rejected": -2.619645595550537, "eval_logps/chosen": -261.2966003417969, "eval_logps/rejected": -246.400390625, "eval_loss": 0.6771283149719238, "eval_positive_losses": 0.2208622395992279, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": 0.23296818137168884, "eval_rewards/margins": 0.11118274182081223, "eval_rewards/margins_max": 0.4213098883628845, "eval_rewards/margins_min": -0.15251097083091736, "eval_rewards/margins_std": 0.19105467200279236, "eval_rewards/rejected": 0.12178544700145721, "eval_runtime": 428.2103, "eval_samples_per_second": 4.671, "eval_steps_per_second": 0.292, "step": 2700 }, { "dpo_losses": 0.6561421155929565, "epoch": 0.71, "grad_norm": 12.403169878981542, "learning_rate": 1.181406963063507e-06, "logits/chosen": -2.743208169937134, "logits/rejected": -2.709289789199829, "logps/chosen": -266.05523681640625, "logps/rejected": -277.7951354980469, "loss": 0.7108, "positive_losses": 0.5917842984199524, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.22070738673210144, "rewards/margins": 0.08396445214748383, "rewards/margins_max": 0.2688206732273102, "rewards/margins_min": -0.09836752712726593, "rewards/margins_std": 0.1652737259864807, "rewards/rejected": 0.13674293458461761, "step": 2710 }, { "dpo_losses": 0.646786630153656, "epoch": 0.71, "grad_norm": 2.6027516806031588, "learning_rate": 1.1620536287303052e-06, "logits/chosen": -2.6333985328674316, "logits/rejected": -2.5747392177581787, "logps/chosen": -228.45947265625, "logps/rejected": -238.262939453125, "loss": 0.6666, "positive_losses": 0.027136946097016335, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.23899738490581512, "rewards/margins": 0.10501845180988312, "rewards/margins_max": 0.29407796263694763, "rewards/margins_min": -0.09125413000583649, "rewards/margins_std": 0.17288681864738464, "rewards/rejected": 0.133978933095932, "step": 2720 }, { "dpo_losses": 0.646081268787384, "epoch": 0.71, "grad_norm": 16.276106770703127, "learning_rate": 1.1428120125340717e-06, "logits/chosen": -2.6631503105163574, "logits/rejected": -2.6598093509674072, "logps/chosen": -254.3302764892578, "logps/rejected": -246.5015869140625, "loss": 0.6626, "positive_losses": 0.015617894940078259, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.27498504519462585, "rewards/margins": 0.11424392461776733, "rewards/margins_max": 0.40816107392311096, "rewards/margins_min": -0.10722239315509796, "rewards/margins_std": 0.22900240123271942, "rewards/rejected": 0.16074110567569733, "step": 2730 }, { "dpo_losses": 0.6563581228256226, "epoch": 0.72, "grad_norm": 9.278159503609219, "learning_rate": 1.123683721144223e-06, "logits/chosen": -2.663508892059326, "logits/rejected": -2.683802843093872, "logps/chosen": -224.25607299804688, "logps/rejected": -240.0520782470703, "loss": 0.7136, "positive_losses": 0.1441497802734375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.22323274612426758, "rewards/margins": 0.08508383482694626, "rewards/margins_max": 0.3114696741104126, "rewards/margins_min": -0.08833112567663193, "rewards/margins_std": 0.18358775973320007, "rewards/rejected": 0.13814887404441833, "step": 2740 }, { "dpo_losses": 0.6344123482704163, "epoch": 0.72, "grad_norm": 12.207456867580083, "learning_rate": 1.1046703517675848e-06, "logits/chosen": -2.725710868835449, "logits/rejected": -2.6906607151031494, "logps/chosen": -258.3102111816406, "logps/rejected": -234.1375274658203, "loss": 0.6738, "positive_losses": 0.17371778190135956, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.25737282633781433, "rewards/margins": 0.13390055298805237, "rewards/margins_max": 0.34902340173721313, "rewards/margins_min": -0.047223832458257675, "rewards/margins_std": 0.1819126307964325, "rewards/rejected": 0.12347228825092316, "step": 2750 }, { "dpo_losses": 0.6266162395477295, "epoch": 0.72, "grad_norm": 9.972333019022438, "learning_rate": 1.085773492015028e-06, "logits/chosen": -2.673086643218994, "logits/rejected": -2.61374568939209, "logps/chosen": -305.066162109375, "logps/rejected": -278.8713684082031, "loss": 0.6407, "positive_losses": 0.03099961206316948, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2706281542778015, "rewards/margins": 0.15120838582515717, "rewards/margins_max": 0.39720866084098816, "rewards/margins_min": -0.05999114364385605, "rewards/margins_std": 0.20378157496452332, "rewards/rejected": 0.11941979080438614, "step": 2760 }, { "dpo_losses": 0.6503585577011108, "epoch": 0.72, "grad_norm": 3.3210533123475434, "learning_rate": 1.0669947197689034e-06, "logits/chosen": -2.767747402191162, "logits/rejected": -2.721989631652832, "logps/chosen": -284.27728271484375, "logps/rejected": -252.5501708984375, "loss": 0.6769, "positive_losses": 0.05525505542755127, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2337963879108429, "rewards/margins": 0.09842909872531891, "rewards/margins_max": 0.3275780975818634, "rewards/margins_min": -0.10053189098834991, "rewards/margins_std": 0.1886182427406311, "rewards/rejected": 0.13536730408668518, "step": 2770 }, { "dpo_losses": 0.6543646454811096, "epoch": 0.73, "grad_norm": 1.7555621174530065, "learning_rate": 1.048335603051291e-06, "logits/chosen": -2.6742210388183594, "logits/rejected": -2.6266090869903564, "logps/chosen": -299.0511169433594, "logps/rejected": -228.11959838867188, "loss": 0.6762, "positive_losses": 0.36124539375305176, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21554644405841827, "rewards/margins": 0.08905548602342606, "rewards/margins_max": 0.3050028383731842, "rewards/margins_min": -0.1116354912519455, "rewards/margins_std": 0.1855059266090393, "rewards/rejected": 0.126490980386734, "step": 2780 }, { "dpo_losses": 0.6355771422386169, "epoch": 0.73, "grad_norm": 1.5128031151064185, "learning_rate": 1.0297976998930665e-06, "logits/chosen": -2.7675232887268066, "logits/rejected": -2.733736753463745, "logps/chosen": -301.5973205566406, "logps/rejected": -249.68814086914062, "loss": 0.6533, "positive_losses": 0.05606355518102646, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2681359052658081, "rewards/margins": 0.1323312371969223, "rewards/margins_max": 0.371121346950531, "rewards/margins_min": -0.0732019692659378, "rewards/margins_std": 0.19328762590885162, "rewards/rejected": 0.1358046680688858, "step": 2790 }, { "dpo_losses": 0.6555207371711731, "epoch": 0.73, "grad_norm": 7.699491498926473, "learning_rate": 1.0113825582038078e-06, "logits/chosen": -2.7008633613586426, "logits/rejected": -2.678898572921753, "logps/chosen": -244.8804931640625, "logps/rejected": -244.4199676513672, "loss": 0.6851, "positive_losses": 0.41886672377586365, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20459289848804474, "rewards/margins": 0.08580772578716278, "rewards/margins_max": 0.28134217858314514, "rewards/margins_min": -0.10736802965402603, "rewards/margins_std": 0.17606523633003235, "rewards/rejected": 0.11878518760204315, "step": 2800 }, { "epoch": 0.73, "eval_dpo_losses": 0.6430361270904541, "eval_logits/chosen": -2.661334991455078, "eval_logits/rejected": -2.6278324127197266, "eval_logps/chosen": -261.293701171875, "eval_logps/rejected": -246.66207885742188, "eval_loss": 0.677651584148407, "eval_positive_losses": 0.22990721464157104, "eval_rewards/accuracies": 0.7089999914169312, "eval_rewards/chosen": 0.23299722373485565, "eval_rewards/margins": 0.11382872611284256, "eval_rewards/margins_max": 0.42740339040756226, "eval_rewards/margins_min": -0.15496976673603058, "eval_rewards/margins_std": 0.1945992261171341, "eval_rewards/rejected": 0.1191684901714325, "eval_runtime": 428.0328, "eval_samples_per_second": 4.673, "eval_steps_per_second": 0.292, "step": 2800 }, { "dpo_losses": 0.6165627837181091, "epoch": 0.74, "grad_norm": 2.109208390049169, "learning_rate": 9.930917156425477e-07, "logits/chosen": -2.6779825687408447, "logits/rejected": -2.6695377826690674, "logps/chosen": -245.42239379882812, "logps/rejected": -220.6289520263672, "loss": 0.6648, "positive_losses": 0.18235059082508087, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.26049965620040894, "rewards/margins": 0.17864897847175598, "rewards/margins_max": 0.4070645272731781, "rewards/margins_min": -0.03810154274106026, "rewards/margins_std": 0.1952413022518158, "rewards/rejected": 0.08185064792633057, "step": 2810 }, { "dpo_losses": 0.6527791023254395, "epoch": 0.74, "grad_norm": 2.362274409427705, "learning_rate": 9.749266994893756e-07, "logits/chosen": -2.6672139167785645, "logits/rejected": -2.664942979812622, "logps/chosen": -241.8899383544922, "logps/rejected": -248.9877471923828, "loss": 0.6564, "positive_losses": 0.20571669936180115, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2259671688079834, "rewards/margins": 0.09094025939702988, "rewards/margins_max": 0.27354854345321655, "rewards/margins_min": -0.09114174544811249, "rewards/margins_std": 0.16183331608772278, "rewards/rejected": 0.13502691686153412, "step": 2820 }, { "dpo_losses": 0.6393716335296631, "epoch": 0.74, "grad_norm": 2.1587866637225996, "learning_rate": 9.56889026517913e-07, "logits/chosen": -2.6505894660949707, "logits/rejected": -2.636881113052368, "logps/chosen": -276.57061767578125, "logps/rejected": -363.47723388671875, "loss": 0.6445, "positive_losses": 0.06818590313196182, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.23265020549297333, "rewards/margins": 0.1187715083360672, "rewards/margins_max": 0.31650876998901367, "rewards/margins_min": -0.05719621106982231, "rewards/margins_std": 0.16603873670101166, "rewards/rejected": 0.11387868970632553, "step": 2830 }, { "dpo_losses": 0.6333634257316589, "epoch": 0.74, "grad_norm": 9.361244384340141, "learning_rate": 9.389802028686617e-07, "logits/chosen": -2.6403565406799316, "logits/rejected": -2.6060609817504883, "logps/chosen": -260.1311340332031, "logps/rejected": -231.0727081298828, "loss": 0.655, "positive_losses": 0.023256683722138405, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23338253796100616, "rewards/margins": 0.13441750407218933, "rewards/margins_max": 0.34804508090019226, "rewards/margins_min": -0.060067594051361084, "rewards/margins_std": 0.18399295210838318, "rewards/rejected": 0.09896502643823624, "step": 2840 }, { "dpo_losses": 0.6408575177192688, "epoch": 0.75, "grad_norm": 2.6338957869957857, "learning_rate": 9.212017239232427e-07, "logits/chosen": -2.713360548019409, "logits/rejected": -2.6990275382995605, "logps/chosen": -261.1534729003906, "logps/rejected": -248.7087860107422, "loss": 0.6963, "positive_losses": 0.2504243850708008, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22237715125083923, "rewards/margins": 0.12060409784317017, "rewards/margins_max": 0.37134578824043274, "rewards/margins_min": -0.14265871047973633, "rewards/margins_std": 0.2249237596988678, "rewards/rejected": 0.10177306830883026, "step": 2850 }, { "dpo_losses": 0.641190230846405, "epoch": 0.75, "grad_norm": 1.977161723741076, "learning_rate": 9.03555074179533e-07, "logits/chosen": -2.6907799243927, "logits/rejected": -2.655926465988159, "logps/chosen": -227.423583984375, "logps/rejected": -216.56454467773438, "loss": 0.6444, "positive_losses": 0.05986515432596207, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2274264544248581, "rewards/margins": 0.11544140428304672, "rewards/margins_max": 0.3044131100177765, "rewards/margins_min": -0.0820235162973404, "rewards/margins_std": 0.17438822984695435, "rewards/rejected": 0.11198506504297256, "step": 2860 }, { "dpo_losses": 0.6329208612442017, "epoch": 0.75, "grad_norm": 13.625303492534464, "learning_rate": 8.860417271277067e-07, "logits/chosen": -2.6437621116638184, "logits/rejected": -2.6126272678375244, "logps/chosen": -286.8416442871094, "logps/rejected": -271.61279296875, "loss": 0.6515, "positive_losses": 0.0965370163321495, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24304378032684326, "rewards/margins": 0.1386830061674118, "rewards/margins_max": 0.36370256543159485, "rewards/margins_min": -0.09242524206638336, "rewards/margins_std": 0.20286257565021515, "rewards/rejected": 0.10436077415943146, "step": 2870 }, { "dpo_losses": 0.6439909338951111, "epoch": 0.75, "grad_norm": 6.085725009327989, "learning_rate": 8.686631451272029e-07, "logits/chosen": -2.6746413707733154, "logits/rejected": -2.5927882194519043, "logps/chosen": -277.53564453125, "logps/rejected": -248.0998992919922, "loss": 0.6892, "positive_losses": 0.506631076335907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22289161384105682, "rewards/margins": 0.1165764331817627, "rewards/margins_max": 0.33958134055137634, "rewards/margins_min": -0.12362408638000488, "rewards/margins_std": 0.2088121473789215, "rewards/rejected": 0.10631519556045532, "step": 2880 }, { "dpo_losses": 0.6548301577568054, "epoch": 0.76, "grad_norm": 1.9717958896927266, "learning_rate": 8.514207792846168e-07, "logits/chosen": -2.6367344856262207, "logits/rejected": -2.5813980102539062, "logps/chosen": -269.0919494628906, "logps/rejected": -237.8687286376953, "loss": 0.6582, "positive_losses": 0.07601909339427948, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24029597640037537, "rewards/margins": 0.09323279559612274, "rewards/margins_max": 0.33396726846694946, "rewards/margins_min": -0.15189293026924133, "rewards/margins_std": 0.21695995330810547, "rewards/rejected": 0.147063210606575, "step": 2890 }, { "dpo_losses": 0.6318849325180054, "epoch": 0.76, "grad_norm": 14.571141376389916, "learning_rate": 8.343160693325356e-07, "logits/chosen": -2.6871140003204346, "logits/rejected": -2.6749329566955566, "logps/chosen": -278.9430847167969, "logps/rejected": -235.34912109375, "loss": 0.678, "positive_losses": 0.3711848258972168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.24064703285694122, "rewards/margins": 0.1461915671825409, "rewards/margins_max": 0.44787925481796265, "rewards/margins_min": -0.07638730853796005, "rewards/margins_std": 0.2327384501695633, "rewards/rejected": 0.09445545822381973, "step": 2900 }, { "epoch": 0.76, "eval_dpo_losses": 0.6402330994606018, "eval_logits/chosen": -2.6602489948272705, "eval_logits/rejected": -2.626877546310425, "eval_logps/chosen": -261.8085021972656, "eval_logps/rejected": -247.86146545410156, "eval_loss": 0.685627818107605, "eval_positive_losses": 0.29970258474349976, "eval_rewards/accuracies": 0.7110000252723694, "eval_rewards/chosen": 0.22784921526908875, "eval_rewards/margins": 0.12067471444606781, "eval_rewards/margins_max": 0.4461641013622284, "eval_rewards/margins_min": -0.1602570116519928, "eval_rewards/margins_std": 0.20283648371696472, "eval_rewards/rejected": 0.10717451572418213, "eval_runtime": 427.9539, "eval_samples_per_second": 4.673, "eval_steps_per_second": 0.292, "step": 2900 }, { "dpo_losses": 0.6342518329620361, "epoch": 0.76, "grad_norm": 9.158026304874936, "learning_rate": 8.173504435093174e-07, "logits/chosen": -2.6617798805236816, "logits/rejected": -2.6154887676239014, "logps/chosen": -235.50564575195312, "logps/rejected": -237.1990966796875, "loss": 0.6705, "positive_losses": 0.3194850981235504, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.22792664170265198, "rewards/margins": 0.1314457356929779, "rewards/margins_max": 0.3273685872554779, "rewards/margins_min": -0.05593591183423996, "rewards/margins_std": 0.17454983294010162, "rewards/rejected": 0.09648089855909348, "step": 2910 }, { "dpo_losses": 0.636628270149231, "epoch": 0.76, "grad_norm": 22.19845680165628, "learning_rate": 8.00525318439836e-07, "logits/chosen": -2.675938367843628, "logits/rejected": -2.6456830501556396, "logps/chosen": -234.3569793701172, "logps/rejected": -204.05677795410156, "loss": 0.6814, "positive_losses": 0.8898605108261108, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.23664262890815735, "rewards/margins": 0.1313386708498001, "rewards/margins_max": 0.37490588426589966, "rewards/margins_min": -0.09426429122686386, "rewards/margins_std": 0.20440106093883514, "rewards/rejected": 0.10530395805835724, "step": 2920 }, { "dpo_losses": 0.6341038942337036, "epoch": 0.77, "grad_norm": 18.19028684887985, "learning_rate": 7.838420990171927e-07, "logits/chosen": -2.587294101715088, "logits/rejected": -2.614553451538086, "logps/chosen": -259.9834289550781, "logps/rejected": -228.9385986328125, "loss": 0.7121, "positive_losses": 0.6251548528671265, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24708354473114014, "rewards/margins": 0.13857489824295044, "rewards/margins_max": 0.3844526410102844, "rewards/margins_min": -0.06038924306631088, "rewards/margins_std": 0.19883593916893005, "rewards/rejected": 0.10850866883993149, "step": 2930 }, { "dpo_losses": 0.6501592397689819, "epoch": 0.77, "grad_norm": 1.743798917679003, "learning_rate": 7.673021782854084e-07, "logits/chosen": -2.6355204582214355, "logits/rejected": -2.6447384357452393, "logps/chosen": -238.6925506591797, "logps/rejected": -237.84347534179688, "loss": 0.701, "positive_losses": 0.6593742370605469, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23624686896800995, "rewards/margins": 0.10006473958492279, "rewards/margins_max": 0.33041948080062866, "rewards/margins_min": -0.1267656683921814, "rewards/margins_std": 0.20058086514472961, "rewards/rejected": 0.13618211448192596, "step": 2940 }, { "dpo_losses": 0.637436032295227, "epoch": 0.77, "grad_norm": 13.226887824557881, "learning_rate": 7.509069373231039e-07, "logits/chosen": -2.7241337299346924, "logits/rejected": -2.6747238636016846, "logps/chosen": -249.4187469482422, "logps/rejected": -271.2737731933594, "loss": 0.6926, "positive_losses": 1.0058292150497437, "rewards/accuracies": 0.75, "rewards/chosen": 0.22392499446868896, "rewards/margins": 0.13012099266052246, "rewards/margins_max": 0.3944118320941925, "rewards/margins_min": -0.08947178721427917, "rewards/margins_std": 0.2141483724117279, "rewards/rejected": 0.09380398690700531, "step": 2950 }, { "dpo_losses": 0.647119402885437, "epoch": 0.77, "grad_norm": 1.8484562438375651, "learning_rate": 7.346577451281822e-07, "logits/chosen": -2.6706955432891846, "logits/rejected": -2.662426471710205, "logps/chosen": -258.4949951171875, "logps/rejected": -227.0917510986328, "loss": 0.6722, "positive_losses": 0.6746174097061157, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23804013431072235, "rewards/margins": 0.10887646675109863, "rewards/margins_max": 0.3489915430545807, "rewards/margins_min": -0.12331470102071762, "rewards/margins_std": 0.20819251239299774, "rewards/rejected": 0.12916366755962372, "step": 2960 }, { "dpo_losses": 0.6399433612823486, "epoch": 0.78, "grad_norm": 7.929735846912345, "learning_rate": 7.185559585035138e-07, "logits/chosen": -2.661951780319214, "logits/rejected": -2.7017343044281006, "logps/chosen": -248.2020721435547, "logps/rejected": -241.7464599609375, "loss": 0.6815, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.24719658493995667, "rewards/margins": 0.12132751941680908, "rewards/margins_max": 0.3393659293651581, "rewards/margins_min": -0.05899649113416672, "rewards/margins_std": 0.1788053661584854, "rewards/rejected": 0.12586906552314758, "step": 2970 }, { "dpo_losses": 0.6399902105331421, "epoch": 0.78, "grad_norm": 2.018031494961003, "learning_rate": 7.026029219436504e-07, "logits/chosen": -2.6252198219299316, "logits/rejected": -2.644059419631958, "logps/chosen": -279.79241943359375, "logps/rejected": -288.64154052734375, "loss": 0.6595, "positive_losses": 0.3226403295993805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23678548634052277, "rewards/margins": 0.1288241147994995, "rewards/margins_max": 0.40611523389816284, "rewards/margins_min": -0.08896999061107635, "rewards/margins_std": 0.21852633357048035, "rewards/rejected": 0.10796137899160385, "step": 2980 }, { "dpo_losses": 0.6523637771606445, "epoch": 0.78, "grad_norm": 2.1797487530595916, "learning_rate": 6.867999675225523e-07, "logits/chosen": -2.6289401054382324, "logits/rejected": -2.572408437728882, "logps/chosen": -280.0498352050781, "logps/rejected": -271.45501708984375, "loss": 0.6729, "positive_losses": 0.18320608139038086, "rewards/accuracies": 0.75, "rewards/chosen": 0.23342570662498474, "rewards/margins": 0.10582546144723892, "rewards/margins_max": 0.3280499279499054, "rewards/margins_min": -0.17500343918800354, "rewards/margins_std": 0.22897744178771973, "rewards/rejected": 0.12760025262832642, "step": 2990 }, { "dpo_losses": 0.6533046960830688, "epoch": 0.79, "grad_norm": 2.3628875656726143, "learning_rate": 6.711484147823663e-07, "logits/chosen": -2.6833419799804688, "logits/rejected": -2.662086009979248, "logps/chosen": -222.61648559570312, "logps/rejected": -240.6640167236328, "loss": 0.6605, "positive_losses": 0.03348231315612793, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21097604930400848, "rewards/margins": 0.09355119615793228, "rewards/margins_max": 0.32012492418289185, "rewards/margins_min": -0.07909585535526276, "rewards/margins_std": 0.18243499100208282, "rewards/rejected": 0.1174248605966568, "step": 3000 }, { "epoch": 0.79, "eval_dpo_losses": 0.6411991119384766, "eval_logits/chosen": -2.6605002880096436, "eval_logits/rejected": -2.627521514892578, "eval_logps/chosen": -261.4324035644531, "eval_logps/rejected": -247.2367401123047, "eval_loss": 0.6807260513305664, "eval_positive_losses": 0.24150457978248596, "eval_rewards/accuracies": 0.7160000205039978, "eval_rewards/chosen": 0.23161005973815918, "eval_rewards/margins": 0.11818789690732956, "eval_rewards/margins_max": 0.4379708766937256, "eval_rewards/margins_min": -0.1546909064054489, "eval_rewards/margins_std": 0.19860580563545227, "eval_rewards/rejected": 0.11342217773199081, "eval_runtime": 427.8032, "eval_samples_per_second": 4.675, "eval_steps_per_second": 0.292, "step": 3000 }, { "dpo_losses": 0.6328443288803101, "epoch": 0.79, "grad_norm": 2.0178014552248493, "learning_rate": 6.556495706232413e-07, "logits/chosen": -2.7266080379486084, "logits/rejected": -2.7250070571899414, "logps/chosen": -270.19879150390625, "logps/rejected": -247.5888671875, "loss": 0.6644, "positive_losses": 0.2839382290840149, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24732156097888947, "rewards/margins": 0.1365831345319748, "rewards/margins_max": 0.36095213890075684, "rewards/margins_min": -0.06557613611221313, "rewards/margins_std": 0.19293871521949768, "rewards/rejected": 0.11073843389749527, "step": 3010 }, { "dpo_losses": 0.6344121098518372, "epoch": 0.79, "grad_norm": 19.014779215475556, "learning_rate": 6.403047291942057e-07, "logits/chosen": -2.691561698913574, "logits/rejected": -2.63598370552063, "logps/chosen": -241.51620483398438, "logps/rejected": -209.51547241210938, "loss": 0.6889, "positive_losses": 0.2836257815361023, "rewards/accuracies": 0.75, "rewards/chosen": 0.2331659495830536, "rewards/margins": 0.13293863832950592, "rewards/margins_max": 0.3479847311973572, "rewards/margins_min": -0.08762186020612717, "rewards/margins_std": 0.19680854678153992, "rewards/rejected": 0.10022733360528946, "step": 3020 }, { "dpo_losses": 0.6390833854675293, "epoch": 0.79, "grad_norm": 1.9188208153793838, "learning_rate": 6.251151717851023e-07, "logits/chosen": -2.6485981941223145, "logits/rejected": -2.6350791454315186, "logps/chosen": -253.7571563720703, "logps/rejected": -280.71746826171875, "loss": 0.687, "positive_losses": 0.36080265045166016, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22402584552764893, "rewards/margins": 0.12664227187633514, "rewards/margins_max": 0.39223814010620117, "rewards/margins_min": -0.07834620773792267, "rewards/margins_std": 0.20787307620048523, "rewards/rejected": 0.09738355875015259, "step": 3030 }, { "dpo_losses": 0.6260613203048706, "epoch": 0.8, "grad_norm": 2.513409699444311, "learning_rate": 6.100821667196041e-07, "logits/chosen": -2.678619384765625, "logits/rejected": -2.6174044609069824, "logps/chosen": -287.0162658691406, "logps/rejected": -278.2513427734375, "loss": 0.6457, "positive_losses": 0.19703082740306854, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2569059133529663, "rewards/margins": 0.15052184462547302, "rewards/margins_max": 0.3674320578575134, "rewards/margins_min": -0.055698297917842865, "rewards/margins_std": 0.1886374056339264, "rewards/rejected": 0.10638407617807388, "step": 3040 }, { "dpo_losses": 0.6396051645278931, "epoch": 0.8, "grad_norm": 66.39852528600504, "learning_rate": 5.952069692493062e-07, "logits/chosen": -2.661994218826294, "logits/rejected": -2.6428751945495605, "logps/chosen": -248.02285766601562, "logps/rejected": -239.62319946289062, "loss": 0.7019, "positive_losses": 0.19167347252368927, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22153182327747345, "rewards/margins": 0.1235644593834877, "rewards/margins_max": 0.3312918543815613, "rewards/margins_min": -0.09882686287164688, "rewards/margins_std": 0.19070424139499664, "rewards/rejected": 0.09796737134456635, "step": 3050 }, { "dpo_losses": 0.6472259163856506, "epoch": 0.8, "grad_norm": 13.873807288791474, "learning_rate": 5.80490821448918e-07, "logits/chosen": -2.6815364360809326, "logits/rejected": -2.6650428771972656, "logps/chosen": -283.110107421875, "logps/rejected": -268.1537170410156, "loss": 0.6617, "positive_losses": 0.19559669494628906, "rewards/accuracies": 0.75, "rewards/chosen": 0.24070200324058533, "rewards/margins": 0.1051400676369667, "rewards/margins_max": 0.3200732171535492, "rewards/margins_min": -0.11534781754016876, "rewards/margins_std": 0.1950106918811798, "rewards/rejected": 0.13556192815303802, "step": 3060 }, { "dpo_losses": 0.6577891111373901, "epoch": 0.8, "grad_norm": 13.527340930009814, "learning_rate": 5.659349521125459e-07, "logits/chosen": -2.61413836479187, "logits/rejected": -2.600532054901123, "logps/chosen": -224.72509765625, "logps/rejected": -247.92626953125, "loss": 0.677, "positive_losses": 0.5455880165100098, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20303694903850555, "rewards/margins": 0.08140838146209717, "rewards/margins_max": 0.28243789076805115, "rewards/margins_min": -0.09992311894893646, "rewards/margins_std": 0.1716955602169037, "rewards/rejected": 0.12162858247756958, "step": 3070 }, { "dpo_losses": 0.6692142486572266, "epoch": 0.81, "grad_norm": 2.043028733957474, "learning_rate": 5.5154057665109e-07, "logits/chosen": -2.6322388648986816, "logits/rejected": -2.638415813446045, "logps/chosen": -229.92355346679688, "logps/rejected": -269.1875305175781, "loss": 0.6743, "positive_losses": 0.5757365226745605, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.2035880982875824, "rewards/margins": 0.06119891256093979, "rewards/margins_max": 0.285746693611145, "rewards/margins_min": -0.15262167155742645, "rewards/margins_std": 0.19626206159591675, "rewards/rejected": 0.1423891931772232, "step": 3080 }, { "dpo_losses": 0.6530777215957642, "epoch": 0.81, "grad_norm": 6.519199772036796, "learning_rate": 5.373088969907586e-07, "logits/chosen": -2.6646833419799805, "logits/rejected": -2.612794876098633, "logps/chosen": -255.7550811767578, "logps/rejected": -229.76272583007812, "loss": 0.7051, "positive_losses": 0.8610352277755737, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20456835627555847, "rewards/margins": 0.09238609671592712, "rewards/margins_max": 0.30011457204818726, "rewards/margins_min": -0.0869378000497818, "rewards/margins_std": 0.17641900479793549, "rewards/rejected": 0.11218225955963135, "step": 3090 }, { "dpo_losses": 0.6696368455886841, "epoch": 0.81, "grad_norm": 12.489859167790884, "learning_rate": 5.23241101472709e-07, "logits/chosen": -2.681021213531494, "logits/rejected": -2.673013687133789, "logps/chosen": -262.3744201660156, "logps/rejected": -271.0479736328125, "loss": 0.6874, "positive_losses": 0.26185646653175354, "rewards/accuracies": 0.625, "rewards/chosen": 0.23370583355426788, "rewards/margins": 0.0600501112639904, "rewards/margins_max": 0.2808021008968353, "rewards/margins_min": -0.1530933827161789, "rewards/margins_std": 0.19106140732765198, "rewards/rejected": 0.17365573346614838, "step": 3100 }, { "epoch": 0.81, "eval_dpo_losses": 0.6425239443778992, "eval_logits/chosen": -2.6493923664093018, "eval_logits/rejected": -2.6151318550109863, "eval_logps/chosen": -261.0995178222656, "eval_logps/rejected": -246.58517456054688, "eval_loss": 0.6752615571022034, "eval_positive_losses": 0.2061479240655899, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": 0.23493869602680206, "eval_rewards/margins": 0.11500106751918793, "eval_rewards/margins_max": 0.42997851967811584, "eval_rewards/margins_min": -0.15204735100269318, "eval_rewards/margins_std": 0.19510281085968018, "eval_rewards/rejected": 0.11993761360645294, "eval_runtime": 428.1959, "eval_samples_per_second": 4.671, "eval_steps_per_second": 0.292, "step": 3100 }, { "dpo_losses": 0.6559592485427856, "epoch": 0.81, "grad_norm": 2.2951095958795493, "learning_rate": 5.09338364753818e-07, "logits/chosen": -2.70573353767395, "logits/rejected": -2.6352028846740723, "logps/chosen": -266.19921875, "logps/rejected": -251.9840545654297, "loss": 0.6587, "positive_losses": 0.18008080124855042, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22073253989219666, "rewards/margins": 0.08792804181575775, "rewards/margins_max": 0.3271465301513672, "rewards/margins_min": -0.1252015233039856, "rewards/margins_std": 0.1997259557247162, "rewards/rejected": 0.1328045129776001, "step": 3110 }, { "dpo_losses": 0.6467105746269226, "epoch": 0.82, "grad_norm": 1.9154281415492576, "learning_rate": 4.956018477086005e-07, "logits/chosen": -2.6191787719726562, "logits/rejected": -2.5866799354553223, "logps/chosen": -271.4346923828125, "logps/rejected": -247.22811889648438, "loss": 0.6679, "positive_losses": 0.4603656232357025, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23231330513954163, "rewards/margins": 0.10724592208862305, "rewards/margins_max": 0.3365776538848877, "rewards/margins_min": -0.12125638872385025, "rewards/margins_std": 0.20351378619670868, "rewards/rejected": 0.1250673532485962, "step": 3120 }, { "dpo_losses": 0.6430977582931519, "epoch": 0.82, "grad_norm": 1.906469051094875, "learning_rate": 4.820326973322764e-07, "logits/chosen": -2.7109386920928955, "logits/rejected": -2.670637845993042, "logps/chosen": -241.487060546875, "logps/rejected": -210.2299041748047, "loss": 0.6443, "positive_losses": 0.1338365525007248, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22416770458221436, "rewards/margins": 0.11624391376972198, "rewards/margins_max": 0.360629141330719, "rewards/margins_min": -0.10840293020009995, "rewards/margins_std": 0.211382657289505, "rewards/rejected": 0.10792376846075058, "step": 3130 }, { "dpo_losses": 0.6356518864631653, "epoch": 0.82, "grad_norm": 1.9870086974369754, "learning_rate": 4.686320466449981e-07, "logits/chosen": -2.613452434539795, "logits/rejected": -2.6398723125457764, "logps/chosen": -216.00192260742188, "logps/rejected": -240.72891235351562, "loss": 0.6845, "positive_losses": 0.26034507155418396, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.23193475604057312, "rewards/margins": 0.12595371901988983, "rewards/margins_max": 0.2877231240272522, "rewards/margins_min": -0.03262107074260712, "rewards/margins_std": 0.14468979835510254, "rewards/rejected": 0.10598105192184448, "step": 3140 }, { "dpo_losses": 0.6281145811080933, "epoch": 0.82, "grad_norm": 9.798790168042391, "learning_rate": 4.554010145972418e-07, "logits/chosen": -2.5790274143218994, "logits/rejected": -2.563317060470581, "logps/chosen": -278.03118896484375, "logps/rejected": -295.5206298828125, "loss": 0.6563, "positive_losses": 0.2543970048427582, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.25272035598754883, "rewards/margins": 0.15435662865638733, "rewards/margins_max": 0.41291069984436035, "rewards/margins_min": -0.06888072192668915, "rewards/margins_std": 0.21114492416381836, "rewards/rejected": 0.0983637273311615, "step": 3150 }, { "dpo_losses": 0.6204236149787903, "epoch": 0.83, "grad_norm": 6.473582945600099, "learning_rate": 4.4234070597637455e-07, "logits/chosen": -2.71343731880188, "logits/rejected": -2.677605152130127, "logps/chosen": -279.60089111328125, "logps/rejected": -246.6046905517578, "loss": 0.6658, "positive_losses": 0.0, "rewards/accuracies": 0.8125, "rewards/chosen": 0.259508341550827, "rewards/margins": 0.163357213139534, "rewards/margins_max": 0.3819490373134613, "rewards/margins_min": -0.04566248506307602, "rewards/margins_std": 0.1943180114030838, "rewards/rejected": 0.09615114331245422, "step": 3160 }, { "dpo_losses": 0.6419295072555542, "epoch": 0.83, "grad_norm": 11.911500749177504, "learning_rate": 4.2945221131440783e-07, "logits/chosen": -2.6954774856567383, "logits/rejected": -2.675586223602295, "logps/chosen": -273.2368469238281, "logps/rejected": -255.10995483398438, "loss": 0.6614, "positive_losses": 0.35317736864089966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2160724401473999, "rewards/margins": 0.11563559621572495, "rewards/margins_max": 0.3120267689228058, "rewards/margins_min": -0.08002261817455292, "rewards/margins_std": 0.17242729663848877, "rewards/rejected": 0.10043685138225555, "step": 3170 }, { "dpo_losses": 0.6373859643936157, "epoch": 0.83, "grad_norm": 15.502172502424358, "learning_rate": 4.167366067969381e-07, "logits/chosen": -2.623873233795166, "logits/rejected": -2.6007158756256104, "logps/chosen": -269.0568542480469, "logps/rejected": -220.5035858154297, "loss": 0.6918, "positive_losses": 0.3878262937068939, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.25423291325569153, "rewards/margins": 0.12449681758880615, "rewards/margins_max": 0.3036009967327118, "rewards/margins_min": -0.08757523447275162, "rewards/margins_std": 0.17467781901359558, "rewards/rejected": 0.12973609566688538, "step": 3180 }, { "dpo_losses": 0.6364492774009705, "epoch": 0.83, "grad_norm": 1.986876525494331, "learning_rate": 4.041949541732826e-07, "logits/chosen": -2.6248703002929688, "logits/rejected": -2.5871098041534424, "logps/chosen": -259.7639465332031, "logps/rejected": -237.5189666748047, "loss": 0.6605, "positive_losses": 0.2175983488559723, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22980129718780518, "rewards/margins": 0.12821714580059052, "rewards/margins_max": 0.35198521614074707, "rewards/margins_min": -0.09533867239952087, "rewards/margins_std": 0.20564059913158417, "rewards/rejected": 0.10158412158489227, "step": 3190 }, { "dpo_losses": 0.6411651372909546, "epoch": 0.84, "grad_norm": 1.6812550881461255, "learning_rate": 3.9182830066782614e-07, "logits/chosen": -2.6221518516540527, "logits/rejected": -2.59946870803833, "logps/chosen": -249.2165069580078, "logps/rejected": -240.29556274414062, "loss": 0.6516, "positive_losses": 0.1823883056640625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2199111431837082, "rewards/margins": 0.11735031753778458, "rewards/margins_max": 0.3530258536338806, "rewards/margins_min": -0.0921543687582016, "rewards/margins_std": 0.1945345401763916, "rewards/rejected": 0.10256080329418182, "step": 3200 }, { "epoch": 0.84, "eval_dpo_losses": 0.6384575366973877, "eval_logits/chosen": -2.649825096130371, "eval_logits/rejected": -2.6158199310302734, "eval_logps/chosen": -261.75390625, "eval_logps/rejected": -248.2175750732422, "eval_loss": 0.6827730536460876, "eval_positive_losses": 0.3005758225917816, "eval_rewards/accuracies": 0.7160000205039978, "eval_rewards/chosen": 0.2283952236175537, "eval_rewards/margins": 0.12478169053792953, "eval_rewards/margins_max": 0.4527498781681061, "eval_rewards/margins_min": -0.15858082473278046, "eval_rewards/margins_std": 0.20521041750907898, "eval_rewards/rejected": 0.10361352562904358, "eval_runtime": 427.9924, "eval_samples_per_second": 4.673, "eval_steps_per_second": 0.292, "step": 3200 }, { "dpo_losses": 0.6365126371383667, "epoch": 0.84, "grad_norm": 2.1383141065978455, "learning_rate": 3.796376788925771e-07, "logits/chosen": -2.673523426055908, "logits/rejected": -2.652254104614258, "logps/chosen": -230.735595703125, "logps/rejected": -223.2124481201172, "loss": 0.6735, "positive_losses": 0.21147899329662323, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22603364288806915, "rewards/margins": 0.12993502616882324, "rewards/margins_max": 0.3829730153083801, "rewards/margins_min": -0.08001724630594254, "rewards/margins_std": 0.20853832364082336, "rewards/rejected": 0.09609860926866531, "step": 3210 }, { "dpo_losses": 0.6381786465644836, "epoch": 0.84, "grad_norm": 2.3358172637716947, "learning_rate": 3.676241067609465e-07, "logits/chosen": -2.612231731414795, "logits/rejected": -2.598639965057373, "logps/chosen": -268.4934387207031, "logps/rejected": -264.42913818359375, "loss": 0.6846, "positive_losses": 0.41847342252731323, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22133584320545197, "rewards/margins": 0.1236250028014183, "rewards/margins_max": 0.3461146950721741, "rewards/margins_min": -0.05468686297535896, "rewards/margins_std": 0.17791306972503662, "rewards/rejected": 0.09771083295345306, "step": 3220 }, { "dpo_losses": 0.66154545545578, "epoch": 0.85, "grad_norm": 2.3606999195058305, "learning_rate": 3.5578858740274976e-07, "logits/chosen": -2.627288341522217, "logits/rejected": -2.6127145290374756, "logps/chosen": -294.7670593261719, "logps/rejected": -310.2293701171875, "loss": 0.7285, "positive_losses": 0.8174301981925964, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.21198442578315735, "rewards/margins": 0.0844467282295227, "rewards/margins_max": 0.4018055498600006, "rewards/margins_min": -0.1648840606212616, "rewards/margins_std": 0.2497369796037674, "rewards/rejected": 0.12753772735595703, "step": 3230 }, { "dpo_losses": 0.6271528601646423, "epoch": 0.85, "grad_norm": 13.007862940968268, "learning_rate": 3.44132109080447e-07, "logits/chosen": -2.6953787803649902, "logits/rejected": -2.646010637283325, "logps/chosen": -231.9201202392578, "logps/rejected": -249.96530151367188, "loss": 0.6825, "positive_losses": 0.1917347013950348, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.21758997440338135, "rewards/margins": 0.15806236863136292, "rewards/margins_max": 0.4027267098426819, "rewards/margins_min": -0.05626615881919861, "rewards/margins_std": 0.21179640293121338, "rewards/rejected": 0.05952761694788933, "step": 3240 }, { "dpo_losses": 0.6535229086875916, "epoch": 0.85, "grad_norm": 11.488135810665579, "learning_rate": 3.3265564510662344e-07, "logits/chosen": -2.7149996757507324, "logits/rejected": -2.6980862617492676, "logps/chosen": -271.8984375, "logps/rejected": -253.8206787109375, "loss": 0.6967, "positive_losses": 0.38341885805130005, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.21455137431621552, "rewards/margins": 0.0899895653128624, "rewards/margins_max": 0.2641471028327942, "rewards/margins_min": -0.08759258687496185, "rewards/margins_std": 0.15608647465705872, "rewards/rejected": 0.12456182390451431, "step": 3250 }, { "dpo_losses": 0.6507695317268372, "epoch": 0.85, "grad_norm": 7.504993556538967, "learning_rate": 3.213601537627195e-07, "logits/chosen": -2.7060675621032715, "logits/rejected": -2.717250347137451, "logps/chosen": -244.5074005126953, "logps/rejected": -284.1248474121094, "loss": 0.6728, "positive_losses": 0.2510760426521301, "rewards/accuracies": 0.75, "rewards/chosen": 0.19894081354141235, "rewards/margins": 0.096902996301651, "rewards/margins_max": 0.2957373857498169, "rewards/margins_min": -0.10130006074905396, "rewards/margins_std": 0.17616146802902222, "rewards/rejected": 0.10203780978918076, "step": 3260 }, { "dpo_losses": 0.6447475552558899, "epoch": 0.86, "grad_norm": 11.609264594024141, "learning_rate": 3.1024657821901063e-07, "logits/chosen": -2.6828439235687256, "logits/rejected": -2.631220579147339, "logps/chosen": -236.44894409179688, "logps/rejected": -255.2611541748047, "loss": 0.6838, "positive_losses": 0.306144654750824, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21078269183635712, "rewards/margins": 0.11331923305988312, "rewards/margins_max": 0.3753519654273987, "rewards/margins_min": -0.13783003389835358, "rewards/margins_std": 0.2330664098262787, "rewards/rejected": 0.097463458776474, "step": 3270 }, { "dpo_losses": 0.6534501314163208, "epoch": 0.86, "grad_norm": 11.688866846461075, "learning_rate": 2.9931584645585654e-07, "logits/chosen": -2.7720062732696533, "logits/rejected": -2.679375171661377, "logps/chosen": -235.0417938232422, "logps/rejected": -235.50137329101562, "loss": 0.672, "positive_losses": 0.38392525911331177, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2245892584323883, "rewards/margins": 0.09525544196367264, "rewards/margins_max": 0.34492596983909607, "rewards/margins_min": -0.1276320219039917, "rewards/margins_std": 0.20781132578849792, "rewards/rejected": 0.12933377921581268, "step": 3280 }, { "dpo_losses": 0.616895318031311, "epoch": 0.86, "grad_norm": 2.188074962803304, "learning_rate": 2.885688711862136e-07, "logits/chosen": -2.6105411052703857, "logits/rejected": -2.6143670082092285, "logps/chosen": -268.9541931152344, "logps/rejected": -265.6374816894531, "loss": 0.6597, "positive_losses": 0.35630494356155396, "rewards/accuracies": 0.75, "rewards/chosen": 0.25052422285079956, "rewards/margins": 0.17335107922554016, "rewards/margins_max": 0.40427374839782715, "rewards/margins_min": -0.08141206204891205, "rewards/margins_std": 0.2202124148607254, "rewards/rejected": 0.07717315107584, "step": 3290 }, { "dpo_losses": 0.6404051780700684, "epoch": 0.86, "grad_norm": 8.593080140796566, "learning_rate": 2.7800654977942486e-07, "logits/chosen": -2.6817626953125, "logits/rejected": -2.6735479831695557, "logps/chosen": -310.8262634277344, "logps/rejected": -296.20501708984375, "loss": 0.6627, "positive_losses": 0.14009293913841248, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24962861835956573, "rewards/margins": 0.12139974534511566, "rewards/margins_max": 0.3643108010292053, "rewards/margins_min": -0.13667277991771698, "rewards/margins_std": 0.21952347457408905, "rewards/rejected": 0.12822887301445007, "step": 3300 }, { "epoch": 0.86, "eval_dpo_losses": 0.6403202414512634, "eval_logits/chosen": -2.652597188949585, "eval_logits/rejected": -2.6184253692626953, "eval_logps/chosen": -261.3397521972656, "eval_logps/rejected": -247.35203552246094, "eval_loss": 0.6772990822792053, "eval_positive_losses": 0.24060045182704926, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": 0.23253653943538666, "eval_rewards/margins": 0.12026768177747726, "eval_rewards/margins_max": 0.4419324994087219, "eval_rewards/margins_min": -0.15448962152004242, "eval_rewards/margins_std": 0.20027266442775726, "eval_rewards/rejected": 0.1122688427567482, "eval_runtime": 428.0713, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 3300 }, { "dpo_losses": 0.6264361143112183, "epoch": 0.87, "grad_norm": 39.82948273970552, "learning_rate": 2.6762976418628797e-07, "logits/chosen": -2.705967664718628, "logits/rejected": -2.658480405807495, "logps/chosen": -251.9584503173828, "logps/rejected": -254.8125, "loss": 0.6839, "positive_losses": 0.18140240013599396, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.22535867989063263, "rewards/margins": 0.15404847264289856, "rewards/margins_max": 0.436443567276001, "rewards/margins_min": -0.07473914325237274, "rewards/margins_std": 0.2287091761827469, "rewards/rejected": 0.07131022214889526, "step": 3310 }, { "dpo_losses": 0.6395207047462463, "epoch": 0.87, "grad_norm": 2.9691071947943692, "learning_rate": 2.5743938086541354e-07, "logits/chosen": -2.612696409225464, "logits/rejected": -2.59602689743042, "logps/chosen": -278.0087890625, "logps/rejected": -266.9355773925781, "loss": 0.6748, "positive_losses": 0.2753303647041321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.26316890120506287, "rewards/margins": 0.13156357407569885, "rewards/margins_max": 0.4129224419593811, "rewards/margins_min": -0.09041708707809448, "rewards/margins_std": 0.23072955012321472, "rewards/rejected": 0.1316053569316864, "step": 3320 }, { "dpo_losses": 0.6477687358856201, "epoch": 0.87, "grad_norm": 2.3269522356509023, "learning_rate": 2.4743625071087574e-07, "logits/chosen": -2.737248182296753, "logits/rejected": -2.750629425048828, "logps/chosen": -280.17864990234375, "logps/rejected": -276.9759216308594, "loss": 0.6466, "positive_losses": 0.0, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.24916020035743713, "rewards/margins": 0.10327957570552826, "rewards/margins_max": 0.3029175102710724, "rewards/margins_min": -0.09398408979177475, "rewards/margins_std": 0.17593896389007568, "rewards/rejected": 0.14588062465190887, "step": 3330 }, { "dpo_losses": 0.6357873678207397, "epoch": 0.87, "grad_norm": 1.9947758633325405, "learning_rate": 2.3762120898116498e-07, "logits/chosen": -2.656794548034668, "logits/rejected": -2.672175168991089, "logps/chosen": -243.54544067382812, "logps/rejected": -295.7755126953125, "loss": 0.65, "positive_losses": 0.1419471800327301, "rewards/accuracies": 0.75, "rewards/chosen": 0.22034494578838348, "rewards/margins": 0.13167402148246765, "rewards/margins_max": 0.3637886047363281, "rewards/margins_min": -0.0878521203994751, "rewards/margins_std": 0.201420858502388, "rewards/rejected": 0.08867089450359344, "step": 3340 }, { "dpo_losses": 0.6441482305526733, "epoch": 0.88, "grad_norm": 7.694467256086055, "learning_rate": 2.2799507522944048e-07, "logits/chosen": -2.7229669094085693, "logits/rejected": -2.647151231765747, "logps/chosen": -252.82571411132812, "logps/rejected": -221.85879516601562, "loss": 0.6769, "positive_losses": 0.35150521993637085, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21200843155384064, "rewards/margins": 0.11605236679315567, "rewards/margins_max": 0.3657509982585907, "rewards/margins_min": -0.10902712494134903, "rewards/margins_std": 0.21472206711769104, "rewards/rejected": 0.09595610946416855, "step": 3350 }, { "dpo_losses": 0.6233208775520325, "epoch": 0.88, "grad_norm": 1.767260106599493, "learning_rate": 2.1855865323510056e-07, "logits/chosen": -2.6158571243286133, "logits/rejected": -2.5889029502868652, "logps/chosen": -270.18914794921875, "logps/rejected": -277.93341064453125, "loss": 0.6692, "positive_losses": 0.1520233154296875, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23553188145160675, "rewards/margins": 0.16098877787590027, "rewards/margins_max": 0.4127779006958008, "rewards/margins_min": -0.05117069557309151, "rewards/margins_std": 0.2057846337556839, "rewards/rejected": 0.07454311102628708, "step": 3360 }, { "dpo_losses": 0.6466307044029236, "epoch": 0.88, "grad_norm": 2.0852991620424257, "learning_rate": 2.0931273093666575e-07, "logits/chosen": -2.713484287261963, "logits/rejected": -2.7137503623962402, "logps/chosen": -237.7594451904297, "logps/rejected": -243.6479949951172, "loss": 0.6619, "positive_losses": 0.06381092220544815, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22457580268383026, "rewards/margins": 0.10744913667440414, "rewards/margins_max": 0.3464711308479309, "rewards/margins_min": -0.09933777153491974, "rewards/margins_std": 0.20307926833629608, "rewards/rejected": 0.1171267032623291, "step": 3370 }, { "dpo_losses": 0.6486043334007263, "epoch": 0.88, "grad_norm": 1.5986555560433542, "learning_rate": 2.002580803659873e-07, "logits/chosen": -2.7386975288391113, "logits/rejected": -2.6531612873077393, "logps/chosen": -231.8615264892578, "logps/rejected": -220.4953155517578, "loss": 0.7043, "positive_losses": 0.03696479648351669, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20933715999126434, "rewards/margins": 0.10051582008600235, "rewards/margins_max": 0.30396851897239685, "rewards/margins_min": -0.06479805707931519, "rewards/margins_std": 0.16805866360664368, "rewards/rejected": 0.1088213324546814, "step": 3380 }, { "dpo_losses": 0.6446608304977417, "epoch": 0.89, "grad_norm": 1.9495041510241085, "learning_rate": 1.913954575837826e-07, "logits/chosen": -2.636171340942383, "logits/rejected": -2.6109492778778076, "logps/chosen": -255.878662109375, "logps/rejected": -243.5431671142578, "loss": 0.6607, "positive_losses": 0.33525413274765015, "rewards/accuracies": 0.625, "rewards/chosen": 0.21522343158721924, "rewards/margins": 0.11724289506673813, "rewards/margins_max": 0.403298944234848, "rewards/margins_min": -0.14402872323989868, "rewards/margins_std": 0.24462909996509552, "rewards/rejected": 0.09798052161931992, "step": 3390 }, { "dpo_losses": 0.6406092047691345, "epoch": 0.89, "grad_norm": 1.846940206289947, "learning_rate": 1.827256026165028e-07, "logits/chosen": -2.641024112701416, "logits/rejected": -2.635166883468628, "logps/chosen": -215.92178344726562, "logps/rejected": -237.233642578125, "loss": 0.6517, "positive_losses": 0.1393692046403885, "rewards/accuracies": 0.75, "rewards/chosen": 0.21265530586242676, "rewards/margins": 0.11898468434810638, "rewards/margins_max": 0.3562791049480438, "rewards/margins_min": -0.06635533273220062, "rewards/margins_std": 0.1890067160129547, "rewards/rejected": 0.09367059916257858, "step": 3400 }, { "epoch": 0.89, "eval_dpo_losses": 0.6386284828186035, "eval_logits/chosen": -2.6550769805908203, "eval_logits/rejected": -2.621270179748535, "eval_logps/chosen": -261.5968017578125, "eval_logps/rejected": -248.01812744140625, "eval_loss": 0.6813686490058899, "eval_positive_losses": 0.2865428030490875, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": 0.2299659103155136, "eval_rewards/margins": 0.12435787916183472, "eval_rewards/margins_max": 0.45185142755508423, "eval_rewards/margins_min": -0.1569340080022812, "eval_rewards/margins_std": 0.20452216267585754, "eval_rewards/rejected": 0.10560804605484009, "eval_runtime": 427.9107, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.292, "step": 3400 }, { "dpo_losses": 0.6420435905456543, "epoch": 0.89, "grad_norm": 1.6239890108112547, "learning_rate": 1.7424923939454274e-07, "logits/chosen": -2.5693392753601074, "logits/rejected": -2.581394672393799, "logps/chosen": -228.1303253173828, "logps/rejected": -247.8132781982422, "loss": 0.6825, "positive_losses": 0.7139572501182556, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18297092616558075, "rewards/margins": 0.11609435081481934, "rewards/margins_max": 0.33712631464004517, "rewards/margins_min": -0.06108611077070236, "rewards/margins_std": 0.17885836958885193, "rewards/rejected": 0.0668765977025032, "step": 3410 }, { "dpo_losses": 0.6098856329917908, "epoch": 0.9, "grad_norm": 9.593562320029054, "learning_rate": 1.6596707569179304e-07, "logits/chosen": -2.623298168182373, "logits/rejected": -2.6018214225769043, "logps/chosen": -285.25372314453125, "logps/rejected": -255.9405517578125, "loss": 0.6298, "positive_losses": 0.10523166507482529, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2724429666996002, "rewards/margins": 0.1895667314529419, "rewards/margins_max": 0.4633703827857971, "rewards/margins_min": -0.06719541549682617, "rewards/margins_std": 0.23646345734596252, "rewards/rejected": 0.08287624269723892, "step": 3420 }, { "dpo_losses": 0.6304863095283508, "epoch": 0.9, "grad_norm": 2.0355417541779164, "learning_rate": 1.578798030665385e-07, "logits/chosen": -2.647200345993042, "logits/rejected": -2.6434273719787598, "logps/chosen": -258.7545471191406, "logps/rejected": -249.9468536376953, "loss": 0.6541, "positive_losses": 0.08031348884105682, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2140689641237259, "rewards/margins": 0.139048233628273, "rewards/margins_max": 0.3309935927391052, "rewards/margins_min": -0.042954690754413605, "rewards/margins_std": 0.1671784669160843, "rewards/rejected": 0.07502072304487228, "step": 3430 }, { "dpo_losses": 0.641990065574646, "epoch": 0.9, "grad_norm": 1.828017663657493, "learning_rate": 1.499880968037165e-07, "logits/chosen": -2.6463229656219482, "logits/rejected": -2.642021417617798, "logps/chosen": -285.63006591796875, "logps/rejected": -275.5422058105469, "loss": 0.6635, "positive_losses": 0.28176501393318176, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24048736691474915, "rewards/margins": 0.11896850913763046, "rewards/margins_max": 0.3504177927970886, "rewards/margins_min": -0.10457517206668854, "rewards/margins_std": 0.2031029909849167, "rewards/rejected": 0.12151883542537689, "step": 3440 }, { "dpo_losses": 0.6230959892272949, "epoch": 0.9, "grad_norm": 11.307529769679473, "learning_rate": 1.4229261585852805e-07, "logits/chosen": -2.69089674949646, "logits/rejected": -2.675455093383789, "logps/chosen": -284.59722900390625, "logps/rejected": -256.7802734375, "loss": 0.6745, "positive_losses": 0.520799994468689, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2580224871635437, "rewards/margins": 0.15853682160377502, "rewards/margins_max": 0.36704161763191223, "rewards/margins_min": -0.04584568738937378, "rewards/margins_std": 0.18185365200042725, "rewards/rejected": 0.09948565810918808, "step": 3450 }, { "dpo_losses": 0.6113892197608948, "epoch": 0.91, "grad_norm": 2.303752127453406, "learning_rate": 1.3479400280141886e-07, "logits/chosen": -2.702341318130493, "logits/rejected": -2.6269218921661377, "logps/chosen": -282.23284912109375, "logps/rejected": -220.28604125976562, "loss": 0.645, "positive_losses": 0.3007164001464844, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.26262277364730835, "rewards/margins": 0.1838146150112152, "rewards/margins_max": 0.42686158418655396, "rewards/margins_min": -0.011207438074052334, "rewards/margins_std": 0.19332796335220337, "rewards/rejected": 0.07880813628435135, "step": 3460 }, { "dpo_losses": 0.6374378204345703, "epoch": 0.91, "grad_norm": 2.208238875570211, "learning_rate": 1.2749288376442044e-07, "logits/chosen": -2.6841673851013184, "logits/rejected": -2.6038734912872314, "logps/chosen": -267.94732666015625, "logps/rejected": -256.0314025878906, "loss": 0.6904, "positive_losses": 0.3077290952205658, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.20196840167045593, "rewards/margins": 0.12561708688735962, "rewards/margins_max": 0.3410743772983551, "rewards/margins_min": -0.06772379577159882, "rewards/margins_std": 0.18634767830371857, "rewards/rejected": 0.0763513371348381, "step": 3470 }, { "dpo_losses": 0.6477632522583008, "epoch": 0.91, "grad_norm": 2.0981624283802955, "learning_rate": 1.203898683888713e-07, "logits/chosen": -2.7335000038146973, "logits/rejected": -2.6686019897460938, "logps/chosen": -265.36053466796875, "logps/rejected": -269.66693115234375, "loss": 0.6875, "positive_losses": 0.46651220321655273, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22941572964191437, "rewards/margins": 0.10551927983760834, "rewards/margins_max": 0.316548228263855, "rewards/margins_min": -0.10447671264410019, "rewards/margins_std": 0.18978312611579895, "rewards/rejected": 0.12389643490314484, "step": 3480 }, { "dpo_losses": 0.6597553491592407, "epoch": 0.91, "grad_norm": 9.665018489245684, "learning_rate": 1.1348554977451132e-07, "logits/chosen": -2.646718978881836, "logits/rejected": -2.673459529876709, "logps/chosen": -228.7762908935547, "logps/rejected": -215.7211151123047, "loss": 0.6829, "positive_losses": 0.17726345360279083, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21256911754608154, "rewards/margins": 0.07763149589300156, "rewards/margins_max": 0.27114394307136536, "rewards/margins_min": -0.10825137794017792, "rewards/margins_std": 0.16942939162254333, "rewards/rejected": 0.1349376142024994, "step": 3490 }, { "dpo_losses": 0.6395228505134583, "epoch": 0.92, "grad_norm": 46.476112711094665, "learning_rate": 1.0678050442995802e-07, "logits/chosen": -2.725433111190796, "logits/rejected": -2.701233386993408, "logps/chosen": -287.6202087402344, "logps/rejected": -235.7355194091797, "loss": 0.7267, "positive_losses": 0.15096637606620789, "rewards/accuracies": 0.75, "rewards/chosen": 0.22031128406524658, "rewards/margins": 0.12081418186426163, "rewards/margins_max": 0.35709747672080994, "rewards/margins_min": -0.06519372016191483, "rewards/margins_std": 0.1876780092716217, "rewards/rejected": 0.09949707239866257, "step": 3500 }, { "epoch": 0.92, "eval_dpo_losses": 0.6385395526885986, "eval_logits/chosen": -2.655956983566284, "eval_logits/rejected": -2.6222283840179443, "eval_logps/chosen": -261.5743713378906, "eval_logps/rejected": -248.02084350585938, "eval_loss": 0.6810497045516968, "eval_positive_losses": 0.2879987061023712, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": 0.23019051551818848, "eval_rewards/margins": 0.12460958957672119, "eval_rewards/margins_max": 0.45358777046203613, "eval_rewards/margins_min": -0.15690042078495026, "eval_rewards/margins_std": 0.2050294280052185, "eval_rewards/rejected": 0.10558092594146729, "eval_runtime": 428.2463, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.292, "step": 3500 }, { "dpo_losses": 0.6398534178733826, "epoch": 0.92, "grad_norm": 2.27603190148628, "learning_rate": 1.0027529222456755e-07, "logits/chosen": -2.6275250911712646, "logits/rejected": -2.6174960136413574, "logps/chosen": -267.97479248046875, "logps/rejected": -248.0113067626953, "loss": 0.7017, "positive_losses": 0.6987945437431335, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2240476906299591, "rewards/margins": 0.12655410170555115, "rewards/margins_max": 0.37817585468292236, "rewards/margins_min": -0.0808626115322113, "rewards/margins_std": 0.21181420981884003, "rewards/rejected": 0.09749359637498856, "step": 3510 }, { "dpo_losses": 0.6397808194160461, "epoch": 0.92, "grad_norm": 11.618890284798946, "learning_rate": 9.397045634168766e-08, "logits/chosen": -2.6689958572387695, "logits/rejected": -2.6879935264587402, "logps/chosen": -251.0983428955078, "logps/rejected": -250.15884399414062, "loss": 0.7027, "positive_losses": 0.4100571572780609, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18390432000160217, "rewards/margins": 0.1212887167930603, "rewards/margins_max": 0.36527150869369507, "rewards/margins_min": -0.08835957199335098, "rewards/margins_std": 0.1976158767938614, "rewards/rejected": 0.06261558085680008, "step": 3520 }, { "dpo_losses": 0.6355238556861877, "epoch": 0.92, "grad_norm": 8.93058712249867, "learning_rate": 8.78665232332998e-08, "logits/chosen": -2.743079423904419, "logits/rejected": -2.7208328247070312, "logps/chosen": -273.70379638671875, "logps/rejected": -265.4845886230469, "loss": 0.6391, "positive_losses": 0.03184204176068306, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.25056618452072144, "rewards/margins": 0.1285247504711151, "rewards/margins_max": 0.36041417717933655, "rewards/margins_min": -0.034891076385974884, "rewards/margins_std": 0.17605885863304138, "rewards/rejected": 0.12204144150018692, "step": 3530 }, { "dpo_losses": 0.6398634910583496, "epoch": 0.93, "grad_norm": 14.409295875135173, "learning_rate": 8.196400257606208e-08, "logits/chosen": -2.6243643760681152, "logits/rejected": -2.5534205436706543, "logps/chosen": -280.9654846191406, "logps/rejected": -299.6298522949219, "loss": 0.7192, "positive_losses": 0.7538820505142212, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2547672390937805, "rewards/margins": 0.13398560881614685, "rewards/margins_max": 0.46419715881347656, "rewards/margins_min": -0.1198708787560463, "rewards/margins_std": 0.2631421685218811, "rewards/rejected": 0.12078163772821426, "step": 3540 }, { "dpo_losses": 0.6318740248680115, "epoch": 0.93, "grad_norm": 2.107180888315368, "learning_rate": 7.626338722875076e-08, "logits/chosen": -2.6094422340393066, "logits/rejected": -2.6372439861297607, "logps/chosen": -238.2021026611328, "logps/rejected": -258.7787170410156, "loss": 0.66, "positive_losses": 0.06889379024505615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2709522843360901, "rewards/margins": 0.14883530139923096, "rewards/margins_max": 0.41571635007858276, "rewards/margins_min": -0.08988544344902039, "rewards/margins_std": 0.2293560951948166, "rewards/rejected": 0.12211696058511734, "step": 3550 }, { "dpo_losses": 0.6310317516326904, "epoch": 0.93, "grad_norm": 10.46521774597658, "learning_rate": 7.076515319110688e-08, "logits/chosen": -2.6502716541290283, "logits/rejected": -2.582925319671631, "logps/chosen": -290.3277282714844, "logps/rejected": -261.40679931640625, "loss": 0.6553, "positive_losses": 0.20256996154785156, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.25210773944854736, "rewards/margins": 0.1431921422481537, "rewards/margins_max": 0.4087950587272644, "rewards/margins_min": -0.09110721200704575, "rewards/margins_std": 0.22172784805297852, "rewards/rejected": 0.10891561210155487, "step": 3560 }, { "dpo_losses": 0.6263138651847839, "epoch": 0.93, "grad_norm": 9.912305455265015, "learning_rate": 6.54697595640899e-08, "logits/chosen": -2.6590230464935303, "logits/rejected": -2.629793167114258, "logps/chosen": -300.0892639160156, "logps/rejected": -249.05899047851562, "loss": 0.6875, "positive_losses": 0.4945901930332184, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.221832275390625, "rewards/margins": 0.15287736058235168, "rewards/margins_max": 0.4320314824581146, "rewards/margins_min": -0.10347136110067368, "rewards/margins_std": 0.23440003395080566, "rewards/rejected": 0.06895491480827332, "step": 3570 }, { "dpo_losses": 0.628657341003418, "epoch": 0.94, "grad_norm": 7.1994219547929195, "learning_rate": 6.037764851154426e-08, "logits/chosen": -2.7059946060180664, "logits/rejected": -2.644545078277588, "logps/chosen": -265.8240966796875, "logps/rejected": -246.8155059814453, "loss": 0.6898, "positive_losses": 0.4449668824672699, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.22660474479198456, "rewards/margins": 0.14894968271255493, "rewards/margins_max": 0.3824352025985718, "rewards/margins_min": -0.10199352353811264, "rewards/margins_std": 0.21889081597328186, "rewards/rejected": 0.07765506953001022, "step": 3580 }, { "dpo_losses": 0.6285854578018188, "epoch": 0.94, "grad_norm": 6.918410697963323, "learning_rate": 5.548924522327748e-08, "logits/chosen": -2.676370143890381, "logits/rejected": -2.651085376739502, "logps/chosen": -249.1566162109375, "logps/rejected": -219.9067840576172, "loss": 0.6905, "positive_losses": 0.5990933179855347, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.23606470227241516, "rewards/margins": 0.15003365278244019, "rewards/margins_max": 0.4204083979129791, "rewards/margins_min": -0.09411970525979996, "rewards/margins_std": 0.22876787185668945, "rewards/rejected": 0.08603102713823318, "step": 3590 }, { "dpo_losses": 0.6493958234786987, "epoch": 0.94, "grad_norm": 2.531047577475979, "learning_rate": 5.0804957879556915e-08, "logits/chosen": -2.6386544704437256, "logits/rejected": -2.5956645011901855, "logps/chosen": -237.57080078125, "logps/rejected": -249.49801635742188, "loss": 0.6563, "positive_losses": 0.004039764404296875, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21744480729103088, "rewards/margins": 0.09995146840810776, "rewards/margins_max": 0.296904981136322, "rewards/margins_min": -0.12265890836715698, "rewards/margins_std": 0.18629953265190125, "rewards/rejected": 0.11749333143234253, "step": 3600 }, { "epoch": 0.94, "eval_dpo_losses": 0.6394082307815552, "eval_logits/chosen": -2.6555135250091553, "eval_logits/rejected": -2.621596097946167, "eval_logps/chosen": -261.41357421875, "eval_logps/rejected": -247.6491241455078, "eval_loss": 0.679019033908844, "eval_positive_losses": 0.2627328932285309, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": 0.23179861903190613, "eval_rewards/margins": 0.12250068038702011, "eval_rewards/margins_max": 0.4486891031265259, "eval_rewards/margins_min": -0.1550326645374298, "eval_rewards/margins_std": 0.20270268619060516, "eval_rewards/rejected": 0.10929791629314423, "eval_runtime": 428.0374, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 3600 }, { "dpo_losses": 0.6413955688476562, "epoch": 0.94, "grad_norm": 9.51063303929373, "learning_rate": 4.632517761702815e-08, "logits/chosen": -2.661477565765381, "logits/rejected": -2.654937982559204, "logps/chosen": -305.40875244140625, "logps/rejected": -247.637451171875, "loss": 0.6674, "positive_losses": 0.05891609191894531, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2322145253419876, "rewards/margins": 0.11494660377502441, "rewards/margins_max": 0.28751423954963684, "rewards/margins_min": -0.06896142661571503, "rewards/margins_std": 0.1589422971010208, "rewards/rejected": 0.11726789176464081, "step": 3610 }, { "dpo_losses": 0.6433061361312866, "epoch": 0.95, "grad_norm": 2.5528407523443044, "learning_rate": 4.205027849605359e-08, "logits/chosen": -2.7029664516448975, "logits/rejected": -2.6620147228240967, "logps/chosen": -262.38421630859375, "logps/rejected": -237.93930053710938, "loss": 0.6516, "positive_losses": 0.02631073072552681, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2335212230682373, "rewards/margins": 0.10934171825647354, "rewards/margins_max": 0.30177029967308044, "rewards/margins_min": -0.052206508815288544, "rewards/margins_std": 0.15611205995082855, "rewards/rejected": 0.12417948246002197, "step": 3620 }, { "dpo_losses": 0.6407767534255981, "epoch": 0.95, "grad_norm": 9.639408968460176, "learning_rate": 3.798061746947995e-08, "logits/chosen": -2.6105198860168457, "logits/rejected": -2.558924436569214, "logps/chosen": -228.0260772705078, "logps/rejected": -273.5794677734375, "loss": 0.6908, "positive_losses": 0.8591312170028687, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1914621740579605, "rewards/margins": 0.11739379167556763, "rewards/margins_max": 0.3272903561592102, "rewards/margins_min": -0.0655929371714592, "rewards/margins_std": 0.17901813983917236, "rewards/rejected": 0.07406838238239288, "step": 3630 }, { "dpo_losses": 0.643804669380188, "epoch": 0.95, "grad_norm": 3.736702293568718, "learning_rate": 3.411653435283158e-08, "logits/chosen": -2.647799253463745, "logits/rejected": -2.6162681579589844, "logps/chosen": -234.08847045898438, "logps/rejected": -210.5509796142578, "loss": 0.6757, "positive_losses": 0.05722751468420029, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2295902520418167, "rewards/margins": 0.11434964835643768, "rewards/margins_max": 0.35972321033477783, "rewards/margins_min": -0.10925455391407013, "rewards/margins_std": 0.20940427482128143, "rewards/rejected": 0.11524059623479843, "step": 3640 }, { "dpo_losses": 0.6531789898872375, "epoch": 0.96, "grad_norm": 1.915823488391317, "learning_rate": 3.04583517959367e-08, "logits/chosen": -2.6603918075561523, "logits/rejected": -2.6616523265838623, "logps/chosen": -273.0516052246094, "logps/rejected": -283.12347412109375, "loss": 0.6599, "positive_losses": 0.3513060510158539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.22649244964122772, "rewards/margins": 0.09476588666439056, "rewards/margins_max": 0.3478698134422302, "rewards/margins_min": -0.12968704104423523, "rewards/margins_std": 0.2150001972913742, "rewards/rejected": 0.13172657787799835, "step": 3650 }, { "dpo_losses": 0.6662554740905762, "epoch": 0.96, "grad_norm": 2.2316362361908206, "learning_rate": 2.7006375255985984e-08, "logits/chosen": -2.8110129833221436, "logits/rejected": -2.7716379165649414, "logps/chosen": -262.69708251953125, "logps/rejected": -261.35894775390625, "loss": 0.7064, "positive_losses": 0.4577966630458832, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19929836690425873, "rewards/margins": 0.06393896043300629, "rewards/margins_max": 0.24750575423240662, "rewards/margins_min": -0.1344698667526245, "rewards/margins_std": 0.16754016280174255, "rewards/rejected": 0.13535940647125244, "step": 3660 }, { "dpo_losses": 0.6305605173110962, "epoch": 0.96, "grad_norm": 1.8443417650134954, "learning_rate": 2.3760892972027328e-08, "logits/chosen": -2.6637253761291504, "logits/rejected": -2.6332812309265137, "logps/chosen": -275.3893127441406, "logps/rejected": -284.32098388671875, "loss": 0.6579, "positive_losses": 0.18694505095481873, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24530284106731415, "rewards/margins": 0.14238665997982025, "rewards/margins_max": 0.38903477787971497, "rewards/margins_min": -0.068825364112854, "rewards/margins_std": 0.20899328589439392, "rewards/rejected": 0.1029161810874939, "step": 3670 }, { "dpo_losses": 0.6498867273330688, "epoch": 0.96, "grad_norm": 5.458971796720751, "learning_rate": 2.072217594089765e-08, "logits/chosen": -2.6662886142730713, "logits/rejected": -2.630774974822998, "logps/chosen": -265.15081787109375, "logps/rejected": -256.91204833984375, "loss": 0.6753, "positive_losses": 0.18378598988056183, "rewards/accuracies": 0.625, "rewards/chosen": 0.20927441120147705, "rewards/margins": 0.09998832643032074, "rewards/margins_max": 0.3337637484073639, "rewards/margins_min": -0.13355764746665955, "rewards/margins_std": 0.20748178660869598, "rewards/rejected": 0.1092861071228981, "step": 3680 }, { "dpo_losses": 0.622007429599762, "epoch": 0.97, "grad_norm": 9.395134628492558, "learning_rate": 1.789047789459375e-08, "logits/chosen": -2.6741926670074463, "logits/rejected": -2.6397705078125, "logps/chosen": -325.18414306640625, "logps/rejected": -261.67279052734375, "loss": 0.6462, "positive_losses": 0.0, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.27494171261787415, "rewards/margins": 0.15975967049598694, "rewards/margins_max": 0.35449787974357605, "rewards/margins_min": -0.06289161741733551, "rewards/margins_std": 0.18452905118465424, "rewards/rejected": 0.11518202722072601, "step": 3690 }, { "dpo_losses": 0.6495968103408813, "epoch": 0.97, "grad_norm": 7.4822717311515134, "learning_rate": 1.5266035279088708e-08, "logits/chosen": -2.7605767250061035, "logits/rejected": -2.7201406955718994, "logps/chosen": -267.16839599609375, "logps/rejected": -241.4385528564453, "loss": 0.7039, "positive_losses": 0.7736231088638306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23643679916858673, "rewards/margins": 0.0986911952495575, "rewards/margins_max": 0.29774361848831177, "rewards/margins_min": -0.10798849165439606, "rewards/margins_std": 0.17969079315662384, "rewards/rejected": 0.13774561882019043, "step": 3700 }, { "epoch": 0.97, "eval_dpo_losses": 0.6395586729049683, "eval_logits/chosen": -2.6559033393859863, "eval_logits/rejected": -2.622006893157959, "eval_logps/chosen": -261.3918151855469, "eval_logps/rejected": -247.59274291992188, "eval_loss": 0.6790311336517334, "eval_positive_losses": 0.26341503858566284, "eval_rewards/accuracies": 0.7229999899864197, "eval_rewards/chosen": 0.23201590776443481, "eval_rewards/margins": 0.12215426564216614, "eval_rewards/margins_max": 0.44827672839164734, "eval_rewards/margins_min": -0.15502171218395233, "eval_rewards/margins_std": 0.20247788727283478, "eval_rewards/rejected": 0.10986167192459106, "eval_runtime": 428.2413, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.292, "step": 3700 }, { "dpo_losses": 0.6481695175170898, "epoch": 0.97, "grad_norm": 7.4982549544584405, "learning_rate": 1.2849067234584623e-08, "logits/chosen": -2.629378080368042, "logits/rejected": -2.583587169647217, "logps/chosen": -255.37991333007812, "logps/rejected": -237.39132690429688, "loss": 0.6597, "positive_losses": 0.1582089364528656, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.23376032710075378, "rewards/margins": 0.10453498363494873, "rewards/margins_max": 0.3104201853275299, "rewards/margins_min": -0.1114891991019249, "rewards/margins_std": 0.19313430786132812, "rewards/rejected": 0.12922534346580505, "step": 3710 }, { "dpo_losses": 0.6363897323608398, "epoch": 0.97, "grad_norm": 9.799345687509973, "learning_rate": 1.0639775577218625e-08, "logits/chosen": -2.66035532951355, "logits/rejected": -2.659759998321533, "logps/chosen": -239.5210418701172, "logps/rejected": -238.7107391357422, "loss": 0.6614, "positive_losses": 0.48925361037254333, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23430407047271729, "rewards/margins": 0.1289234459400177, "rewards/margins_max": 0.3533153533935547, "rewards/margins_min": -0.09572507441043854, "rewards/margins_std": 0.20401068031787872, "rewards/rejected": 0.10538060963153839, "step": 3720 }, { "dpo_losses": 0.641473114490509, "epoch": 0.98, "grad_norm": 12.708607221476539, "learning_rate": 8.638344782207486e-09, "logits/chosen": -2.6768765449523926, "logits/rejected": -2.6652908325195312, "logps/chosen": -299.50384521484375, "logps/rejected": -268.8082580566406, "loss": 0.7508, "positive_losses": 0.9163432121276855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.24856901168823242, "rewards/margins": 0.12247447669506073, "rewards/margins_max": 0.41335535049438477, "rewards/margins_min": -0.11577402055263519, "rewards/margins_std": 0.23423083126544952, "rewards/rejected": 0.1260945051908493, "step": 3730 }, { "dpo_losses": 0.6213124394416809, "epoch": 0.98, "grad_norm": 11.03160963471843, "learning_rate": 6.84494196844715e-09, "logits/chosen": -2.666469097137451, "logits/rejected": -2.62202525138855, "logps/chosen": -282.1092529296875, "logps/rejected": -265.4429931640625, "loss": 0.6529, "positive_losses": 0.3889961242675781, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.25011515617370605, "rewards/margins": 0.1748693734407425, "rewards/margins_max": 0.45166462659835815, "rewards/margins_min": -0.05564745515584946, "rewards/margins_std": 0.22870846092700958, "rewards/rejected": 0.07524577528238297, "step": 3740 }, { "dpo_losses": 0.6486204862594604, "epoch": 0.98, "grad_norm": 2.142027903370626, "learning_rate": 5.259716884556121e-09, "logits/chosen": -2.6220340728759766, "logits/rejected": -2.6580042839050293, "logps/chosen": -269.54071044921875, "logps/rejected": -257.8643798828125, "loss": 0.6949, "positive_losses": 1.0363675355911255, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23406870663166046, "rewards/margins": 0.10313974320888519, "rewards/margins_max": 0.3234516978263855, "rewards/margins_min": -0.11554668843746185, "rewards/margins_std": 0.19535276293754578, "rewards/rejected": 0.13092896342277527, "step": 3750 }, { "dpo_losses": 0.6417810320854187, "epoch": 0.98, "grad_norm": 1.9153348469624756, "learning_rate": 3.882801896372967e-09, "logits/chosen": -2.6521694660186768, "logits/rejected": -2.6701323986053467, "logps/chosen": -233.39639282226562, "logps/rejected": -241.3929443359375, "loss": 0.6517, "positive_losses": 0.23487205803394318, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2109147310256958, "rewards/margins": 0.11781202256679535, "rewards/margins_max": 0.3403708338737488, "rewards/margins_min": -0.07822667807340622, "rewards/margins_std": 0.18718595802783966, "rewards/rejected": 0.09310269355773926, "step": 3760 }, { "dpo_losses": 0.6552250385284424, "epoch": 0.99, "grad_norm": 15.349581416384067, "learning_rate": 2.7143119759026614e-09, "logits/chosen": -2.6792030334472656, "logits/rejected": -2.6174237728118896, "logps/chosen": -284.4708251953125, "logps/rejected": -291.76434326171875, "loss": 0.6763, "positive_losses": 0.43425750732421875, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2588556110858917, "rewards/margins": 0.09476854652166367, "rewards/margins_max": 0.3151422142982483, "rewards/margins_min": -0.13262644410133362, "rewards/margins_std": 0.2025650441646576, "rewards/rejected": 0.16408707201480865, "step": 3770 }, { "dpo_losses": 0.6605818867683411, "epoch": 0.99, "grad_norm": 2.0486200479718413, "learning_rate": 1.754344691717591e-09, "logits/chosen": -2.678449869155884, "logits/rejected": -2.6325161457061768, "logps/chosen": -249.8470458984375, "logps/rejected": -256.65533447265625, "loss": 0.6661, "positive_losses": 0.13493213057518005, "rewards/accuracies": 0.625, "rewards/chosen": 0.203067347407341, "rewards/margins": 0.08785489946603775, "rewards/margins_max": 0.3431921601295471, "rewards/margins_min": -0.2343141734600067, "rewards/margins_std": 0.2637536823749542, "rewards/rejected": 0.11521244049072266, "step": 3780 }, { "dpo_losses": 0.6521404385566711, "epoch": 0.99, "grad_norm": 18.55025604976165, "learning_rate": 1.0029802008096335e-09, "logits/chosen": -2.6939713954925537, "logits/rejected": -2.681117534637451, "logps/chosen": -215.4471893310547, "logps/rejected": -188.15237426757812, "loss": 0.7364, "positive_losses": 1.0411407947540283, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.19064785540103912, "rewards/margins": 0.09285817295312881, "rewards/margins_max": 0.2621470093727112, "rewards/margins_min": -0.07667995244264603, "rewards/margins_std": 0.14927421510219574, "rewards/rejected": 0.0977896898984909, "step": 3790 }, { "dpo_losses": 0.635759711265564, "epoch": 0.99, "grad_norm": 10.457828666598141, "learning_rate": 4.602812418974534e-10, "logits/chosen": -2.60920786857605, "logits/rejected": -2.6047613620758057, "logps/chosen": -262.1203918457031, "logps/rejected": -199.7135772705078, "loss": 0.6622, "positive_losses": 0.0, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2570125162601471, "rewards/margins": 0.13325463235378265, "rewards/margins_max": 0.39995330572128296, "rewards/margins_min": -0.04778273031115532, "rewards/margins_std": 0.20286540687084198, "rewards/rejected": 0.12375785410404205, "step": 3800 }, { "epoch": 0.99, "eval_dpo_losses": 0.6395210027694702, "eval_logits/chosen": -2.6544384956359863, "eval_logits/rejected": -2.620440721511841, "eval_logps/chosen": -261.3937683105469, "eval_logps/rejected": -247.60301208496094, "eval_loss": 0.6789272427558899, "eval_positive_losses": 0.26124632358551025, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": 0.23199646174907684, "eval_rewards/margins": 0.12223710119724274, "eval_rewards/margins_max": 0.4482419788837433, "eval_rewards/margins_min": -0.15490815043449402, "eval_rewards/margins_std": 0.20252487063407898, "eval_rewards/rejected": 0.1097593605518341, "eval_runtime": 428.0449, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 3800 }, { "dpo_losses": 0.6477853655815125, "epoch": 1.0, "grad_norm": 1.7682638113441669, "learning_rate": 1.2629313018819312e-10, "logits/chosen": -2.6831555366516113, "logits/rejected": -2.651272773742676, "logps/chosen": -259.72967529296875, "logps/rejected": -233.97537231445312, "loss": 0.6669, "positive_losses": 0.11028347164392471, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22155094146728516, "rewards/margins": 0.10187198221683502, "rewards/margins_max": 0.29284974932670593, "rewards/margins_min": -0.08088265359401703, "rewards/margins_std": 0.17286035418510437, "rewards/rejected": 0.11967895179986954, "step": 3810 }, { "dpo_losses": 0.6358076333999634, "epoch": 1.0, "grad_norm": 7.1167203747809475, "learning_rate": 1.0437535929996855e-12, "logits/chosen": -2.622563600540161, "logits/rejected": -2.609252452850342, "logps/chosen": -307.4974670410156, "logps/rejected": -206.5818328857422, "loss": 0.6681, "positive_losses": 0.5052299499511719, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.23733225464820862, "rewards/margins": 0.13700851798057556, "rewards/margins_max": 0.3782404065132141, "rewards/margins_min": -0.08631386607885361, "rewards/margins_std": 0.20959043502807617, "rewards/rejected": 0.10032373666763306, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.6795008113577053, "train_runtime": 46141.1523, "train_samples_per_second": 1.325, "train_steps_per_second": 0.083 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }