diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11728 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0283877878950187, + "eval_steps": 64, + "global_step": 480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0021424745581146223, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 12.50527462559239, + "learning_rate": 0.0, + "logits/chosen": 1.0446698665618896, + "logits/rejected": 0.9781918525695801, + "logps/accuracies": 0.25, + "logps/chosen": -270.4280700683594, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -270.4280700683594, + "logps/ref_rejected": -259.14373779296875, + "logps/rejected": -259.14373779296875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/grad_term": 0.02500000037252903, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.004284949116229245, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.856352015281924, + "learning_rate": 1.0679540942081149e-07, + "logits/chosen": 0.4414063096046448, + "logits/rejected": 0.32948625087738037, + "logps/accuracies": 0.5, + "logps/chosen": -318.21771240234375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -317.62615966796875, + "logps/ref_rejected": -221.48974609375, + "logps/rejected": -221.3629608154297, + "loss": 0.69, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02957625314593315, + "rewards/grad_term": 0.0254487507045269, + "rewards/margins": -0.03591585159301758, + "rewards/rejected": 0.006339598447084427, + "step": 2 + }, + { + "epoch": 0.006427423674343867, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 13.280048310716381, + "learning_rate": 1.6926671918114913e-07, + "logits/chosen": 1.0256762504577637, + "logits/rejected": 0.8810745477676392, + "logps/accuracies": 0.5, + "logps/chosen": -355.5387268066406, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -354.57049560546875, + "logps/ref_rejected": -364.0948486328125, + "logps/rejected": -363.83416748046875, + "loss": 0.6892, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0484108030796051, + "rewards/grad_term": 0.0257670097053051, + "rewards/margins": -0.061444856226444244, + "rewards/rejected": 0.013034057803452015, + "step": 3 + }, + { + "epoch": 0.00856989823245849, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.710494813385088, + "learning_rate": 2.1359081884162297e-07, + "logits/chosen": 1.1384323835372925, + "logits/rejected": 1.0655404329299927, + "logps/accuracies": 0.5, + "logps/chosen": -442.36578369140625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -442.594970703125, + "logps/ref_rejected": -345.18572998046875, + "logps/rejected": -344.8496398925781, + "loss": 0.6921, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0114593505859375, + "rewards/grad_term": 0.0250665545463562, + "rewards/margins": -0.005344009958207607, + "rewards/rejected": 0.016803361475467682, + "step": 4 + }, + { + "epoch": 0.010712372790573112, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 14.937463503106938, + "learning_rate": 2.479712615391807e-07, + "logits/chosen": 0.5404180288314819, + "logits/rejected": 0.45622000098228455, + "logps/accuracies": 0.0, + "logps/chosen": -413.39813232421875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -413.3487854003906, + "logps/ref_rejected": -304.2044982910156, + "logps/rejected": -304.66510009765625, + "loss": 0.691, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00246772775426507, + "rewards/grad_term": 0.024743197485804558, + "rewards/margins": 0.020562361925840378, + "rewards/rejected": -0.02303009107708931, + "step": 5 + }, + { + "epoch": 0.012854847348687734, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.517831245113905, + "learning_rate": 2.7606212860196063e-07, + "logits/chosen": 1.1012723445892334, + "logits/rejected": 0.7194727659225464, + "logps/accuracies": 0.5, + "logps/chosen": -258.80072021484375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -258.87176513671875, + "logps/ref_rejected": -247.64498901367188, + "logps/rejected": -247.63116455078125, + "loss": 0.6903, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0035531027242541313, + "rewards/grad_term": 0.024964267387986183, + "rewards/margins": 0.002861691638827324, + "rewards/rejected": 0.0006914124824106693, + "step": 6 + }, + { + "epoch": 0.014997321906802356, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 12.204086479352032, + "learning_rate": 2.9981261829067217e-07, + "logits/chosen": 1.0940355062484741, + "logits/rejected": 0.9565569162368774, + "logps/accuracies": 0.25, + "logps/chosen": -237.1595916748047, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -236.5460662841797, + "logps/ref_rejected": -215.79209899902344, + "logps/rejected": -215.83944702148438, + "loss": 0.6913, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.030676748603582382, + "rewards/grad_term": 0.025353606790304184, + "rewards/margins": -0.02830987237393856, + "rewards/rejected": -0.0023668762296438217, + "step": 7 + }, + { + "epoch": 0.01713979646491698, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.475151458421857, + "learning_rate": 3.2038622826243447e-07, + "logits/chosen": 0.967343270778656, + "logits/rejected": 0.9412274360656738, + "logps/accuracies": 0.5, + "logps/chosen": -279.015380859375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -279.0576171875, + "logps/ref_rejected": -273.6654968261719, + "logps/rejected": -273.8743591308594, + "loss": 0.6941, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0021114349365234375, + "rewards/grad_term": 0.02484307810664177, + "rewards/margins": 0.01255502738058567, + "rewards/rejected": -0.010443592444062233, + "step": 8 + }, + { + "epoch": 0.0192822710230316, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.580237634234056, + "learning_rate": 3.3853343836229826e-07, + "logits/chosen": 0.9568102359771729, + "logits/rejected": 0.9983876943588257, + "logps/accuracies": 0.5, + "logps/chosen": -331.88922119140625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -332.0152587890625, + "logps/ref_rejected": -332.083251953125, + "logps/rejected": -331.71844482421875, + "loss": 0.6903, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.00630226219072938, + "rewards/grad_term": 0.02514958195388317, + "rewards/margins": -0.011937713250517845, + "rewards/rejected": 0.018239974975585938, + "step": 9 + }, + { + "epoch": 0.021424745581146223, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 13.151098903300593, + "learning_rate": 3.5476667095999224e-07, + "logits/chosen": 1.000882863998413, + "logits/rejected": 0.9467081427574158, + "logps/accuracies": 0.5, + "logps/chosen": -320.8128662109375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -320.7702941894531, + "logps/ref_rejected": -298.6964111328125, + "logps/rejected": -298.3901672363281, + "loss": 0.6846, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0021286013070493937, + "rewards/grad_term": 0.025217382237315178, + "rewards/margins": -0.017440060153603554, + "rewards/rejected": 0.01531145628541708, + "step": 10 + }, + { + "epoch": 0.023567220139260846, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.233476785972572, + "learning_rate": 3.6945141607567076e-07, + "logits/chosen": 1.1362941265106201, + "logits/rejected": 1.0800057649612427, + "logps/accuracies": 0.5, + "logps/chosen": -398.43902587890625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -399.17889404296875, + "logps/ref_rejected": -400.2832946777344, + "logps/rejected": -400.0302429199219, + "loss": 0.6858, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.036993030458688736, + "rewards/grad_term": 0.024695834144949913, + "rewards/margins": 0.024341586977243423, + "rewards/rejected": 0.012651442550122738, + "step": 11 + }, + { + "epoch": 0.025709694697375468, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 12.317733670973258, + "learning_rate": 3.8285753802277215e-07, + "logits/chosen": 1.1415050029754639, + "logits/rejected": 0.6014249920845032, + "logps/accuracies": 0.0, + "logps/chosen": -187.37680053710938, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -187.9676513671875, + "logps/ref_rejected": -126.48046875, + "logps/rejected": -126.51386260986328, + "loss": 0.685, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0295425895601511, + "rewards/grad_term": 0.02461005374789238, + "rewards/margins": 0.03121213987469673, + "rewards/rejected": -0.0016695503145456314, + "step": 12 + }, + { + "epoch": 0.02785216925549009, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 13.848164526581762, + "learning_rate": 3.9518997473591026e-07, + "logits/chosen": 1.0744524002075195, + "logits/rejected": 0.8894139528274536, + "logps/accuracies": 0.0, + "logps/chosen": -277.2958984375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -278.2149658203125, + "logps/ref_rejected": -189.09963989257812, + "logps/rejected": -189.11630249023438, + "loss": 0.688, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04595203697681427, + "rewards/grad_term": 0.02441561222076416, + "rewards/margins": 0.04678481072187424, + "rewards/rejected": -0.0008327784016728401, + "step": 13 + }, + { + "epoch": 0.029994643813604713, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 12.272371420817562, + "learning_rate": 4.066080277114836e-07, + "logits/chosen": 1.0568749904632568, + "logits/rejected": 0.8716171979904175, + "logps/accuracies": 0.0, + "logps/chosen": -219.07720947265625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -219.25765991210938, + "logps/ref_rejected": -129.22840881347656, + "logps/rejected": -128.9102020263672, + "loss": 0.686, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.009022902697324753, + "rewards/grad_term": 0.025086142122745514, + "rewards/margins": -0.006888152565807104, + "rewards/rejected": 0.015911055728793144, + "step": 14 + }, + { + "epoch": 0.032137118371719335, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 12.428694983873564, + "learning_rate": 4.1723798072032976e-07, + "logits/chosen": 1.037872314453125, + "logits/rejected": 0.9692336320877075, + "logps/accuracies": 0.25, + "logps/chosen": -381.4244384765625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -382.68658447265625, + "logps/ref_rejected": -322.65679931640625, + "logps/rejected": -321.3078308105469, + "loss": 0.6845, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06310494244098663, + "rewards/grad_term": 0.02505427412688732, + "rewards/margins": -0.004344702698290348, + "rewards/rejected": 0.06744963675737381, + "step": 15 + }, + { + "epoch": 0.03427959292983396, + "flips/correct->correct": 1.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 12.620364972338749, + "learning_rate": 4.2718163768324594e-07, + "logits/chosen": 0.9468654990196228, + "logits/rejected": 1.170510172843933, + "logps/accuracies": 1.0, + "logps/chosen": -173.85870361328125, + "logps/ref_accuracies": 1.0, + "logps/ref_chosen": -174.64218139648438, + "logps/ref_rejected": -222.02438354492188, + "logps/rejected": -222.00958251953125, + "loss": 0.6788, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03917388990521431, + "rewards/grad_term": 0.024519937112927437, + "rewards/margins": 0.03843364864587784, + "rewards/rejected": 0.0007402412593364716, + "step": 16 + }, + { + "epoch": 0.03642206748794858, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.697678503172199, + "learning_rate": 4.3652226762368345e-07, + "logits/chosen": 1.0040562152862549, + "logits/rejected": 0.9608211517333984, + "logps/accuracies": 0.5, + "logps/chosen": -253.23019409179688, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -254.5650177001953, + "logps/ref_rejected": -273.43707275390625, + "logps/rejected": -273.1368713378906, + "loss": 0.6744, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06674174964427948, + "rewards/grad_term": 0.024354225024580956, + "rewards/margins": 0.051732055842876434, + "rewards/rejected": 0.015009691938757896, + "step": 17 + }, + { + "epoch": 0.0385645420460632, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.78637513369074, + "learning_rate": 4.4532884778310973e-07, + "logits/chosen": 1.0503087043762207, + "logits/rejected": 0.9166826009750366, + "logps/accuracies": 0.5, + "logps/chosen": -244.81471252441406, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -244.83123779296875, + "logps/ref_rejected": -240.30084228515625, + "logps/rejected": -239.78488159179688, + "loss": 0.6772, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0008262638584710658, + "rewards/grad_term": 0.0253120306879282, + "rewards/margins": -0.02497131936252117, + "rewards/rejected": 0.025797583162784576, + "step": 18 + }, + { + "epoch": 0.040707016604177824, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 11.706003949713145, + "learning_rate": 4.536591579881374e-07, + "logits/chosen": 1.015830159187317, + "logits/rejected": 0.9545015692710876, + "logps/accuracies": 0.25, + "logps/chosen": -264.29638671875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -266.071044921875, + "logps/ref_rejected": -236.97666931152344, + "logps/rejected": -236.77438354492188, + "loss": 0.6778, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08873386681079865, + "rewards/grad_term": 0.02401968464255333, + "rewards/margins": 0.07861967384815216, + "rewards/rejected": 0.01011419203132391, + "step": 19 + }, + { + "epoch": 0.04284949116229245, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.815048094751976, + "learning_rate": 4.615620803808037e-07, + "logits/chosen": 0.896416187286377, + "logits/rejected": 1.0262924432754517, + "logps/accuracies": 0.5, + "logps/chosen": -294.67327880859375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -296.34197998046875, + "logps/ref_rejected": -321.8549499511719, + "logps/rejected": -321.5665283203125, + "loss": 0.6743, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08343505859375, + "rewards/grad_term": 0.02414114400744438, + "rewards/margins": 0.06901436299085617, + "rewards/rejected": 0.014420699328184128, + "step": 20 + }, + { + "epoch": 0.04499196572040707, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 11.637316673579596, + "learning_rate": 4.6907933747182127e-07, + "logits/chosen": 0.8530393242835999, + "logits/rejected": 0.6534068584442139, + "logps/accuracies": 0.25, + "logps/chosen": -276.7626647949219, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -277.6459655761719, + "logps/ref_rejected": -269.14227294921875, + "logps/rejected": -270.58624267578125, + "loss": 0.6723, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04416485130786896, + "rewards/grad_term": 0.02355196699500084, + "rewards/margins": 0.11636309325695038, + "rewards/rejected": -0.07219824939966202, + "step": 21 + }, + { + "epoch": 0.04713444027852169, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.876590663836538, + "learning_rate": 4.762468254964823e-07, + "logits/chosen": 1.009377121925354, + "logits/rejected": 0.9447416067123413, + "logps/accuracies": 0.5, + "logps/chosen": -317.7878723144531, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -319.1816711425781, + "logps/ref_rejected": -311.2941589355469, + "logps/rejected": -311.8822021484375, + "loss": 0.671, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06969013810157776, + "rewards/grad_term": 0.023765018209815025, + "rewards/margins": 0.09909267723560333, + "rewards/rejected": -0.029402542859315872, + "step": 22 + }, + { + "epoch": 0.049276914836636314, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 12.586762655265133, + "learning_rate": 4.830956511375156e-07, + "logits/chosen": 1.203920841217041, + "logits/rejected": 1.1563231945037842, + "logps/accuracies": 0.0, + "logps/chosen": -436.044921875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -438.2389831542969, + "logps/ref_rejected": -385.3761901855469, + "logps/rejected": -385.66180419921875, + "loss": 0.6674, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10970421135425568, + "rewards/grad_term": 0.0234676580876112, + "rewards/margins": 0.12398529052734375, + "rewards/rejected": -0.014281081967055798, + "step": 23 + }, + { + "epoch": 0.051419389394750936, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 12.26080518238754, + "learning_rate": 4.896529474435837e-07, + "logits/chosen": 0.9966791868209839, + "logits/rejected": 0.9108967185020447, + "logps/accuracies": 0.25, + "logps/chosen": -307.80572509765625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -310.31866455078125, + "logps/ref_rejected": -252.79197692871094, + "logps/rejected": -252.8419189453125, + "loss": 0.6669, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12564696371555328, + "rewards/grad_term": 0.023410532623529434, + "rewards/margins": 0.12814360857009888, + "rewards/rejected": -0.002496624831110239, + "step": 24 + }, + { + "epoch": 0.05356186395286556, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.366243366820711, + "learning_rate": 4.959425230783614e-07, + "logits/chosen": 0.9740027785301208, + "logits/rejected": 0.8190696239471436, + "logps/accuracies": 0.5, + "logps/chosen": -320.2606506347656, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -321.4267272949219, + "logps/ref_rejected": -328.9568176269531, + "logps/rejected": -328.89703369140625, + "loss": 0.6599, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05830345302820206, + "rewards/grad_term": 0.024309232831001282, + "rewards/margins": 0.055313680320978165, + "rewards/rejected": 0.0029897689819335938, + "step": 25 + }, + { + "epoch": 0.05570433851098018, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 11.08055129360116, + "learning_rate": 5.019853841567218e-07, + "logits/chosen": 1.1111705303192139, + "logits/rejected": 0.6438971161842346, + "logps/accuracies": 0.25, + "logps/chosen": -187.502197265625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -188.0975341796875, + "logps/ref_rejected": -102.05082702636719, + "logps/rejected": -102.74217987060547, + "loss": 0.6499, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.029765892773866653, + "rewards/grad_term": 0.024199258536100388, + "rewards/margins": 0.06433363258838654, + "rewards/rejected": -0.034567736089229584, + "step": 26 + }, + { + "epoch": 0.0578468130690948, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.88218195606169, + "learning_rate": 5.078001575434473e-07, + "logits/chosen": 0.791816771030426, + "logits/rejected": 0.884813666343689, + "logps/accuracies": 0.5, + "logps/chosen": -189.6773223876953, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -190.02127075195312, + "logps/ref_rejected": -203.477783203125, + "logps/rejected": -203.25155639648438, + "loss": 0.6377, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017197083681821823, + "rewards/grad_term": 0.024926653131842613, + "rewards/margins": 0.00588593352586031, + "rewards/rejected": 0.011311152018606663, + "step": 27 + }, + { + "epoch": 0.059989287627209426, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 11.239511800333705, + "learning_rate": 5.134034371322951e-07, + "logits/chosen": 1.134675145149231, + "logits/rejected": 0.9631079435348511, + "logps/accuracies": 0.25, + "logps/chosen": -266.0494079589844, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -267.859619140625, + "logps/ref_rejected": -242.5032958984375, + "logps/rejected": -243.47943115234375, + "loss": 0.636, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09051056951284409, + "rewards/grad_term": 0.023272007703781128, + "rewards/margins": 0.13931767642498016, + "rewards/rejected": -0.04880712181329727, + "step": 28 + }, + { + "epoch": 0.06213176218532405, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 11.25145831338842, + "learning_rate": 5.188100693331704e-07, + "logits/chosen": 1.1358017921447754, + "logits/rejected": 0.5541727542877197, + "logps/accuracies": 0.0, + "logps/chosen": -365.2521057128906, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -368.4560546875, + "logps/ref_rejected": -241.02734375, + "logps/rejected": -240.83399963378906, + "loss": 0.639, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1601976454257965, + "rewards/grad_term": 0.0231227595359087, + "rewards/margins": 0.1505315899848938, + "rewards/rejected": 0.009666061028838158, + "step": 29 + }, + { + "epoch": 0.06427423674343867, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 10.7696981919874, + "learning_rate": 5.240333901411414e-07, + "logits/chosen": 1.0822885036468506, + "logits/rejected": 0.7618290185928345, + "logps/accuracies": 0.0, + "logps/chosen": -302.0892028808594, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -303.5124816894531, + "logps/ref_rejected": -231.44943237304688, + "logps/rejected": -231.66343688964844, + "loss": 0.6391, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07116356492042542, + "rewards/grad_term": 0.023980939760804176, + "rewards/margins": 0.08186331391334534, + "rewards/rejected": -0.010699748061597347, + "step": 30 + }, + { + "epoch": 0.0664167113015533, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 10.976869632876706, + "learning_rate": 5.2908542331884e-07, + "logits/chosen": 1.2087453603744507, + "logits/rejected": 0.821852445602417, + "logps/accuracies": 0.0, + "logps/chosen": -408.1747741699219, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -410.11907958984375, + "logps/ref_rejected": -321.87982177734375, + "logps/rejected": -322.6016540527344, + "loss": 0.6316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09721412509679794, + "rewards/grad_term": 0.023339958861470222, + "rewards/margins": 0.1333070695400238, + "rewards/rejected": -0.03609294816851616, + "step": 31 + }, + { + "epoch": 0.06855918585966791, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.55127291702564, + "learning_rate": 5.339770471040575e-07, + "logits/chosen": 1.0062335729599, + "logits/rejected": 0.8178822994232178, + "logps/accuracies": 0.25, + "logps/chosen": -251.82504272460938, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -256.1924743652344, + "logps/ref_rejected": -205.9676971435547, + "logps/rejected": -208.8406524658203, + "loss": 0.6257, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21837130188941956, + "rewards/grad_term": 0.020633019506931305, + "rewards/margins": 0.36201906204223633, + "rewards/rejected": -0.14364777505397797, + "step": 32 + }, + { + "epoch": 0.07070166041778254, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.480165494817408, + "learning_rate": 5.387181352568199e-07, + "logits/chosen": 0.5726549625396729, + "logits/rejected": 0.4411008358001709, + "logps/accuracies": 0.5, + "logps/chosen": -123.8817138671875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -125.0975570678711, + "logps/ref_rejected": -92.3137435913086, + "logps/rejected": -92.87496948242188, + "loss": 0.6302, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06079201400279999, + "rewards/grad_term": 0.02389412932097912, + "rewards/margins": 0.08885356783866882, + "rewards/rejected": -0.028061550110578537, + "step": 33 + }, + { + "epoch": 0.07284413497589716, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.769954405702693, + "learning_rate": 5.43317677044495e-07, + "logits/chosen": 0.8887495994567871, + "logits/rejected": 0.7524275779724121, + "logps/accuracies": 0.5, + "logps/chosen": -233.1437530517578, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -235.97918701171875, + "logps/ref_rejected": -232.49705505371094, + "logps/rejected": -233.62205505371094, + "loss": 0.6253, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1417713165283203, + "rewards/grad_term": 0.022557333111763, + "rewards/margins": 0.1980208307504654, + "rewards/rejected": -0.056249529123306274, + "step": 34 + }, + { + "epoch": 0.07498660953401179, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.153566860628073, + "learning_rate": 5.477838798298528e-07, + "logits/chosen": 0.9315862655639648, + "logits/rejected": 1.0292893648147583, + "logps/accuracies": 0.5, + "logps/chosen": -176.9609375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -178.0975799560547, + "logps/ref_rejected": -238.39404296875, + "logps/rejected": -238.72592163085938, + "loss": 0.6173, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05683126673102379, + "rewards/grad_term": 0.02408386766910553, + "rewards/margins": 0.07342477142810822, + "rewards/rejected": -0.016593504697084427, + "step": 35 + }, + { + "epoch": 0.0771290840921264, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.29602350540091, + "learning_rate": 5.521242572039213e-07, + "logits/chosen": 1.1964863538742065, + "logits/rejected": 1.0984711647033691, + "logps/accuracies": 0.5, + "logps/chosen": -322.5604248046875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -322.1260986328125, + "logps/ref_rejected": -287.303466796875, + "logps/rejected": -293.40313720703125, + "loss": 0.6001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02171630784869194, + "rewards/grad_term": 0.021510563790798187, + "rewards/margins": 0.28326815366744995, + "rewards/rejected": -0.3049844801425934, + "step": 36 + }, + { + "epoch": 0.07927155865024103, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.139112324828863, + "learning_rate": 5.563457050409681e-07, + "logits/chosen": 1.173073410987854, + "logits/rejected": 1.0843687057495117, + "logps/accuracies": 0.75, + "logps/chosen": -273.99566650390625, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -275.65478515625, + "logps/ref_rejected": -284.9312438964844, + "logps/rejected": -284.83282470703125, + "loss": 0.6029, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08295460045337677, + "rewards/grad_term": 0.024028297513723373, + "rewards/margins": 0.07803288102149963, + "rewards/rejected": 0.0049217212945222855, + "step": 37 + }, + { + "epoch": 0.08141403320835565, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 11.36236689748956, + "learning_rate": 5.604545674089489e-07, + "logits/chosen": 1.127380609512329, + "logits/rejected": 0.8692578077316284, + "logps/accuracies": 0.25, + "logps/chosen": -628.7327270507812, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -633.2546997070312, + "logps/ref_rejected": -454.33013916015625, + "logps/rejected": -454.3887939453125, + "loss": 0.6011, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22609534859657288, + "rewards/grad_term": 0.022296732291579247, + "rewards/margins": 0.22902806103229523, + "rewards/rejected": -0.0029327282682061195, + "step": 38 + }, + { + "epoch": 0.08355650776647028, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 10.40205719855862, + "learning_rate": 5.644566939170593e-07, + "logits/chosen": 1.1477112770080566, + "logits/rejected": 0.7197964191436768, + "logps/accuracies": 0.0, + "logps/chosen": -342.099609375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -345.7111511230469, + "logps/ref_rejected": -252.51214599609375, + "logps/rejected": -257.9091796875, + "loss": 0.5936, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18057651817798615, + "rewards/grad_term": 0.01955207623541355, + "rewards/margins": 0.45042720437049866, + "rewards/rejected": -0.2698506712913513, + "step": 39 + }, + { + "epoch": 0.0856989823245849, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.52698957163884, + "learning_rate": 5.683574898016152e-07, + "logits/chosen": 1.155718207359314, + "logits/rejected": 0.9540653228759766, + "logps/accuracies": 0.25, + "logps/chosen": -353.8412780761719, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -355.0787353515625, + "logps/ref_rejected": -313.3082580566406, + "logps/rejected": -315.2515869140625, + "loss": 0.5912, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0618743896484375, + "rewards/grad_term": 0.023020360618829727, + "rewards/margins": 0.15904179215431213, + "rewards/rejected": -0.09716740250587463, + "step": 40 + }, + { + "epoch": 0.08784145688269952, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 14.94332500203197, + "learning_rate": 5.721619598264776e-07, + "logits/chosen": 1.1048097610473633, + "logits/rejected": 0.8005577325820923, + "logps/accuracies": 0.0, + "logps/chosen": -317.59912109375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -313.39208984375, + "logps/ref_rejected": -253.1265411376953, + "logps/rejected": -262.5826416015625, + "loss": 0.5854, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2103523313999176, + "rewards/grad_term": 0.021769195795059204, + "rewards/margins": 0.2624519467353821, + "rewards/rejected": -0.4728042781352997, + "step": 41 + }, + { + "epoch": 0.08998393144081414, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.074479922271454, + "learning_rate": 5.758747468926328e-07, + "logits/chosen": 0.8858407735824585, + "logits/rejected": 0.7222996354103088, + "logps/accuracies": 0.5, + "logps/chosen": -298.509521484375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -300.38153076171875, + "logps/ref_rejected": -263.8077087402344, + "logps/rejected": -261.6027526855469, + "loss": 0.5645, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09360065311193466, + "rewards/grad_term": 0.025205716490745544, + "rewards/margins": -0.01664828509092331, + "rewards/rejected": 0.11024895310401917, + "step": 42 + }, + { + "epoch": 0.09212640599892877, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.322575540859543, + "learning_rate": 5.795001661041298e-07, + "logits/chosen": 1.0528746843338013, + "logits/rejected": 0.8013145923614502, + "logps/accuracies": 0.25, + "logps/chosen": -357.80035400390625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -354.52734375, + "logps/ref_rejected": -306.75, + "logps/rejected": -324.1343994140625, + "loss": 0.5605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16364938020706177, + "rewards/grad_term": 0.016722146421670914, + "rewards/margins": 0.7055709958076477, + "rewards/rejected": -0.8692203760147095, + "step": 43 + }, + { + "epoch": 0.09426888055704338, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.88902402136389, + "learning_rate": 5.830422349172938e-07, + "logits/chosen": 1.1213502883911133, + "logits/rejected": 0.7059850692749023, + "logps/accuracies": 0.25, + "logps/chosen": -306.6339416503906, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -308.36224365234375, + "logps/ref_rejected": -222.45750427246094, + "logps/rejected": -234.72048950195312, + "loss": 0.5277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08641643077135086, + "rewards/grad_term": 0.016946371644735336, + "rewards/margins": 0.6995644569396973, + "rewards/rejected": -0.6131480932235718, + "step": 44 + }, + { + "epoch": 0.09641135511515801, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.994736837402936, + "learning_rate": 5.865046999014789e-07, + "logits/chosen": 1.0356709957122803, + "logits/rejected": 0.9350219368934631, + "logps/accuracies": 0.25, + "logps/chosen": -449.03717041015625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -447.7042236328125, + "logps/ref_rejected": -399.0120544433594, + "logps/rejected": -408.2356262207031, + "loss": 0.537, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06664619594812393, + "rewards/grad_term": 0.0201456006616354, + "rewards/margins": 0.3945322036743164, + "rewards/rejected": -0.46117842197418213, + "step": 45 + }, + { + "epoch": 0.09855382967327263, + "flips/correct->correct": 1.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.684993785164789, + "learning_rate": 5.898910605583271e-07, + "logits/chosen": 0.8869616389274597, + "logits/rejected": 0.9591479301452637, + "logps/accuracies": 1.0, + "logps/chosen": -190.82815551757812, + "logps/ref_accuracies": 1.0, + "logps/ref_chosen": -184.7421112060547, + "logps/ref_rejected": -232.9263153076172, + "logps/rejected": -237.15127563476562, + "loss": 0.5444, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30430203676223755, + "rewards/grad_term": 0.026146598160266876, + "rewards/margins": -0.0930541530251503, + "rewards/rejected": -0.21124787628650665, + "step": 46 + }, + { + "epoch": 0.10069630423138726, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 10.511687293551553, + "learning_rate": 5.932045905791884e-07, + "logits/chosen": 0.9791369438171387, + "logits/rejected": 0.7344577312469482, + "logps/accuracies": 0.0, + "logps/chosen": -380.52587890625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -378.03131103515625, + "logps/ref_rejected": -319.94256591796875, + "logps/rejected": -337.8666687011719, + "loss": 0.5447, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12472762912511826, + "rewards/grad_term": 0.01607554219663143, + "rewards/margins": 0.7714786529541016, + "rewards/rejected": -0.896206259727478, + "step": 47 + }, + { + "epoch": 0.10283877878950187, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.055964810316027, + "learning_rate": 5.964483568643951e-07, + "logits/chosen": 0.8970568776130676, + "logits/rejected": 0.48237085342407227, + "logps/accuracies": 0.5, + "logps/chosen": -349.3356018066406, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -343.4019775390625, + "logps/ref_rejected": -257.5508117675781, + "logps/rejected": -269.60809326171875, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2966794967651367, + "rewards/grad_term": 0.021424874663352966, + "rewards/margins": 0.30618318915367126, + "rewards/rejected": -0.6028627157211304, + "step": 48 + }, + { + "epoch": 0.1049812533476165, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 10.617890748457961, + "learning_rate": 5.996252365813443e-07, + "logits/chosen": 0.9411455392837524, + "logits/rejected": 0.5716761350631714, + "logps/accuracies": 0.0, + "logps/chosen": -416.63934326171875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -415.9534606933594, + "logps/ref_rejected": -247.12747192382812, + "logps/rejected": -250.519287109375, + "loss": 0.5287, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03429294005036354, + "rewards/grad_term": 0.023422496393322945, + "rewards/margins": 0.1352972686290741, + "rewards/rejected": -0.16959019005298615, + "step": 49 + }, + { + "epoch": 0.10712372790573112, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 10.56249795289356, + "learning_rate": 6.02737932499173e-07, + "logits/chosen": 0.7994714975357056, + "logits/rejected": 0.42442572116851807, + "logps/accuracies": 0.0, + "logps/chosen": -441.199951171875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -438.83892822265625, + "logps/ref_rejected": -322.400146484375, + "logps/rejected": -338.1896667480469, + "loss": 0.5245, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11804847419261932, + "rewards/grad_term": 0.016969487071037292, + "rewards/margins": 0.6714283227920532, + "rewards/rejected": -0.789476752281189, + "step": 50 + }, + { + "epoch": 0.10926620246384575, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.886334758313449, + "learning_rate": 6.057889868048325e-07, + "logits/chosen": 1.0163636207580566, + "logits/rejected": 0.8965986967086792, + "logps/accuracies": 0.25, + "logps/chosen": -416.2098083496094, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -416.4342041015625, + "logps/ref_rejected": -324.2137145996094, + "logps/rejected": -339.7252502441406, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011221121996641159, + "rewards/grad_term": 0.01711445301771164, + "rewards/margins": 0.7867982983589172, + "rewards/rejected": -0.775577187538147, + "step": 51 + }, + { + "epoch": 0.11140867702196036, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.545934339053032, + "learning_rate": 6.087807935775333e-07, + "logits/chosen": 0.5185865759849548, + "logits/rejected": 0.3614073395729065, + "logps/accuracies": 0.25, + "logps/chosen": -229.86053466796875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -228.54452514648438, + "logps/ref_rejected": -195.83494567871094, + "logps/rejected": -206.37149047851562, + "loss": 0.5191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06580007076263428, + "rewards/grad_term": 0.019602250307798386, + "rewards/margins": 0.461027592420578, + "rewards/rejected": -0.5268276929855347, + "step": 52 + }, + { + "epoch": 0.11355115158007499, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.703023398000125, + "learning_rate": 6.117156100749175e-07, + "logits/chosen": 0.9196311235427856, + "logits/rejected": 0.8846197128295898, + "logps/accuracies": 0.25, + "logps/chosen": -436.0137023925781, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -429.5511474609375, + "logps/ref_rejected": -402.48614501953125, + "logps/rejected": -423.6464538574219, + "loss": 0.5158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32312700152397156, + "rewards/grad_term": 0.016992483288049698, + "rewards/margins": 0.7348867654800415, + "rewards/rejected": -1.0580137968063354, + "step": 53 + }, + { + "epoch": 0.1156936261381896, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.613596866402307, + "learning_rate": 6.145955669642588e-07, + "logits/chosen": 1.0864933729171753, + "logits/rejected": 1.0046788454055786, + "logps/accuracies": 0.75, + "logps/chosen": -408.01947021484375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -394.4793395996094, + "logps/ref_rejected": -375.3567199707031, + "logps/rejected": -408.60504150390625, + "loss": 0.5135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6770049929618835, + "rewards/grad_term": 0.013774032704532146, + "rewards/margins": 0.9854103326797485, + "rewards/rejected": -1.6624153852462769, + "step": 54 + }, + { + "epoch": 0.11783610069630424, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.495375001244799, + "learning_rate": 6.174226776148516e-07, + "logits/chosen": 0.8830907940864563, + "logits/rejected": 0.7350561618804932, + "logps/accuracies": 0.25, + "logps/chosen": -330.6471252441406, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -317.4583740234375, + "logps/ref_rejected": -287.886962890625, + "logps/rejected": -309.34942626953125, + "loss": 0.4821, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6594364643096924, + "rewards/grad_term": 0.020037367939949036, + "rewards/margins": 0.41368618607521057, + "rewards/rejected": -1.0731226205825806, + "step": 55 + }, + { + "epoch": 0.11997857525441885, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.135520472135779, + "learning_rate": 6.201988465531067e-07, + "logits/chosen": 0.6756561994552612, + "logits/rejected": 0.6564769744873047, + "logps/accuracies": 0.5, + "logps/chosen": -181.22642517089844, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -179.98342895507812, + "logps/ref_rejected": -201.45867919921875, + "logps/rejected": -211.14109802246094, + "loss": 0.5161, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06215019151568413, + "rewards/grad_term": 0.019981056451797485, + "rewards/margins": 0.421970933675766, + "rewards/rejected": -0.4841211438179016, + "step": 56 + }, + { + "epoch": 0.12212104981253348, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.52811690075287, + "learning_rate": 6.229258771692866e-07, + "logits/chosen": 0.7419092655181885, + "logits/rejected": 0.8074868321418762, + "logps/accuracies": 0.5, + "logps/chosen": -186.41204833984375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -182.34783935546875, + "logps/ref_rejected": -165.51622009277344, + "logps/rejected": -176.84945678710938, + "loss": 0.4952, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2032102644443512, + "rewards/grad_term": 0.02091275155544281, + "rewards/margins": 0.3634513318538666, + "rewards/rejected": -0.5666615962982178, + "step": 57 + }, + { + "epoch": 0.1242635243706481, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.647643500777985, + "learning_rate": 6.256054787539818e-07, + "logits/chosen": 1.005966067314148, + "logits/rejected": 0.8792574405670166, + "logps/accuracies": 0.25, + "logps/chosen": -421.94464111328125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -406.43414306640625, + "logps/ref_rejected": -377.0458068847656, + "logps/rejected": -405.022705078125, + "loss": 0.4958, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7755249738693237, + "rewards/grad_term": 0.017923269420862198, + "rewards/margins": 0.623319149017334, + "rewards/rejected": -1.3988441228866577, + "step": 58 + }, + { + "epoch": 0.12640599892876273, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.581762363344323, + "learning_rate": 6.282392729330889e-07, + "logits/chosen": 0.788644552230835, + "logits/rejected": 0.8738614916801453, + "logps/accuracies": 0.5, + "logps/chosen": -299.53521728515625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -293.8477783203125, + "logps/ref_rejected": -257.6317443847656, + "logps/rejected": -273.8648681640625, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.284370094537735, + "rewards/grad_term": 0.019201159477233887, + "rewards/margins": 0.5272856950759888, + "rewards/rejected": -0.8116558194160461, + "step": 59 + }, + { + "epoch": 0.12854847348687734, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.152545681984156, + "learning_rate": 6.308287995619528e-07, + "logits/chosen": 1.099388837814331, + "logits/rejected": 0.9928939342498779, + "logps/accuracies": 0.75, + "logps/chosen": -435.457763671875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -415.12860107421875, + "logps/ref_rejected": -380.19720458984375, + "logps/rejected": -426.2452392578125, + "loss": 0.5009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0164591073989868, + "rewards/grad_term": 0.011328812688589096, + "rewards/margins": 1.2859418392181396, + "rewards/rejected": -2.302400827407837, + "step": 60 + }, + { + "epoch": 0.13069094804499196, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.54221239337406, + "learning_rate": 6.33375522132322e-07, + "logits/chosen": 0.9639657735824585, + "logits/rejected": 0.8224814534187317, + "logps/accuracies": 0.25, + "logps/chosen": -348.4953918457031, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -334.9384765625, + "logps/ref_rejected": -316.9508056640625, + "logps/rejected": -356.79949951171875, + "loss": 0.4842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6778436899185181, + "rewards/grad_term": 0.011656711809337139, + "rewards/margins": 1.314591884613037, + "rewards/rejected": -1.9924354553222656, + "step": 61 + }, + { + "epoch": 0.1328334226031066, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 9.022752821926046, + "learning_rate": 6.358808327396516e-07, + "logits/chosen": 1.0160002708435059, + "logits/rejected": 0.616927981376648, + "logps/accuracies": 0.0, + "logps/chosen": -329.97100830078125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -312.87115478515625, + "logps/ref_rejected": -194.55523681640625, + "logps/rejected": -234.106201171875, + "loss": 0.4768, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8549936413764954, + "rewards/grad_term": 0.013443742878735065, + "rewards/margins": 1.1225550174713135, + "rewards/rejected": -1.9775487184524536, + "step": 62 + }, + { + "epoch": 0.13497589716122121, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.278955604267708, + "learning_rate": 6.383460566529704e-07, + "logits/chosen": 1.092294454574585, + "logits/rejected": 0.9165109395980835, + "logps/accuracies": 0.25, + "logps/chosen": -447.57257080078125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -433.4320983886719, + "logps/ref_rejected": -357.5963439941406, + "logps/rejected": -389.59075927734375, + "loss": 0.467, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7070247530937195, + "rewards/grad_term": 0.014869745820760727, + "rewards/margins": 0.8926937580108643, + "rewards/rejected": -1.5997185707092285, + "step": 63 + }, + { + "epoch": 0.13711837171933583, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.629680006671775, + "learning_rate": 6.407724565248689e-07, + "logits/chosen": 1.0533897876739502, + "logits/rejected": 0.7830870151519775, + "logps/accuracies": 0.5, + "logps/chosen": -331.41888427734375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -315.2283020019531, + "logps/ref_rejected": -263.9187316894531, + "logps/rejected": -279.4315185546875, + "loss": 0.4427, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8095288276672363, + "rewards/grad_term": 0.025076739490032196, + "rewards/margins": -0.03389042615890503, + "rewards/rejected": -0.7756383419036865, + "step": 64 + }, + { + "epoch": 0.13711837171933583, + "eval_flips/correct->correct": 0.1599999964237213, + "eval_flips/correct->incorrect": 0.0, + "eval_flips/incorrect->correct": 0.019999999552965164, + "eval_flips/incorrect->incorrect": 0.8199999928474426, + "eval_logits/chosen": 0.9344247579574585, + "eval_logits/rejected": 0.7796935439109802, + "eval_logps/accuracies": 0.18000000715255737, + "eval_logps/chosen": -337.09112548828125, + "eval_logps/ref_accuracies": 0.1599999964237213, + "eval_logps/ref_chosen": -323.51568603515625, + "eval_logps/ref_rejected": -258.70098876953125, + "eval_logps/rejected": -284.0068664550781, + "eval_loss": 0.4775756597518921, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -0.6787735819816589, + "eval_rewards/grad_term": 0.018939374014735222, + "eval_rewards/margins": 0.5865211486816406, + "eval_rewards/rejected": -1.2652947902679443, + "eval_runtime": 374.3115, + "eval_samples_per_second": 4.221, + "eval_steps_per_second": 0.134, + "step": 64 + }, + { + "epoch": 0.13926084627745045, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.614066894777759, + "learning_rate": 6.431612362750908e-07, + "logits/chosen": 0.8821598887443542, + "logits/rejected": 0.7168709635734558, + "logps/accuracies": 0.25, + "logps/chosen": -355.70806884765625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -342.8082275390625, + "logps/ref_rejected": -322.1506652832031, + "logps/rejected": -350.2712097167969, + "loss": 0.4728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6449924111366272, + "rewards/grad_term": 0.016398221254348755, + "rewards/margins": 0.7610346674919128, + "rewards/rejected": -1.40602707862854, + "step": 65 + }, + { + "epoch": 0.1414033208355651, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.855172230097049, + "learning_rate": 6.455135446776313e-07, + "logits/chosen": 0.7938796281814575, + "logits/rejected": 0.872632622718811, + "logps/accuracies": 0.5, + "logps/chosen": -310.7122802734375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -288.7535400390625, + "logps/ref_rejected": -302.68560791015625, + "logps/rejected": -338.1268615722656, + "loss": 0.4281, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.097936749458313, + "rewards/grad_term": 0.017016390338540077, + "rewards/margins": 0.6741248369216919, + "rewards/rejected": -1.7720615863800049, + "step": 66 + }, + { + "epoch": 0.1435457953936797, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 9.759871054620625, + "learning_rate": 6.478304786780968e-07, + "logits/chosen": 0.6220331192016602, + "logits/rejected": 0.5919966697692871, + "logps/accuracies": 0.0, + "logps/chosen": -368.77520751953125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -351.3057556152344, + "logps/ref_rejected": -261.40069580078125, + "logps/rejected": -304.46539306640625, + "loss": 0.475, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8734725713729858, + "rewards/grad_term": 0.014455066993832588, + "rewards/margins": 1.2797632217407227, + "rewards/rejected": -2.153235912322998, + "step": 67 + }, + { + "epoch": 0.14568826995179432, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.285309028552831, + "learning_rate": 6.501130864653065e-07, + "logits/chosen": 1.0729789733886719, + "logits/rejected": 1.0459753274917603, + "logps/accuracies": 0.5, + "logps/chosen": -303.7723388671875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -290.2609558105469, + "logps/ref_rejected": -262.08392333984375, + "logps/rejected": -290.4940185546875, + "loss": 0.4704, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6755678057670593, + "rewards/grad_term": 0.016569742932915688, + "rewards/margins": 0.7449362277984619, + "rewards/rejected": -1.420504093170166, + "step": 68 + }, + { + "epoch": 0.14783074450990893, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.513566366172148, + "learning_rate": 6.523623703186648e-07, + "logits/chosen": 0.894729495048523, + "logits/rejected": 1.0188794136047363, + "logps/accuracies": 0.75, + "logps/chosen": -317.3673095703125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -297.9476013183594, + "logps/ref_rejected": -303.1460876464844, + "logps/rejected": -348.55316162109375, + "loss": 0.4135, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9709864258766174, + "rewards/grad_term": 0.013456292450428009, + "rewards/margins": 1.2993673086166382, + "rewards/rejected": -2.2703537940979004, + "step": 69 + }, + { + "epoch": 0.14997321906802358, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.31200023242295, + "learning_rate": 6.545792892506645e-07, + "logits/chosen": 0.7371494174003601, + "logits/rejected": 0.9037774801254272, + "logps/accuracies": 0.5, + "logps/chosen": -312.41497802734375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -298.4285888671875, + "logps/ref_rejected": -268.10101318359375, + "logps/rejected": -295.3829345703125, + "loss": 0.3791, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6993191242218018, + "rewards/grad_term": 0.017996307462453842, + "rewards/margins": 0.6647781729698181, + "rewards/rejected": -1.3640973567962646, + "step": 70 + }, + { + "epoch": 0.1521156936261382, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.814713554670275, + "learning_rate": 6.567647614619587e-07, + "logits/chosen": 0.8867220878601074, + "logits/rejected": 0.8295634984970093, + "logps/accuracies": 0.5, + "logps/chosen": -343.9578552246094, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -321.64984130859375, + "logps/ref_rejected": -283.7974548339844, + "logps/rejected": -332.0108337402344, + "loss": 0.4731, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1153992414474487, + "rewards/grad_term": 0.012294553220272064, + "rewards/margins": 1.2952699661254883, + "rewards/rejected": -2.4106695652008057, + "step": 71 + }, + { + "epoch": 0.1542581681842528, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.137092152998001, + "learning_rate": 6.589196666247328e-07, + "logits/chosen": 0.9599927663803101, + "logits/rejected": 0.6932302713394165, + "logps/accuracies": 0.25, + "logps/chosen": -317.6519775390625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -303.6397705078125, + "logps/ref_rejected": -240.10572814941406, + "logps/rejected": -283.4850158691406, + "loss": 0.4246, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.700611412525177, + "rewards/grad_term": 0.012517021037638187, + "rewards/margins": 1.4683525562286377, + "rewards/rejected": -2.16896390914917, + "step": 72 + }, + { + "epoch": 0.15640064274236742, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.33751492174849, + "learning_rate": 6.610448480085853e-07, + "logits/chosen": 0.4570969045162201, + "logits/rejected": 0.4160915017127991, + "logps/accuracies": 0.25, + "logps/chosen": -212.59051513671875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -212.9862060546875, + "logps/ref_rejected": -181.0455322265625, + "logps/rejected": -190.78341674804688, + "loss": 0.4513, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01978490501642227, + "rewards/grad_term": 0.019215064123272896, + "rewards/margins": 0.5066791772842407, + "rewards/rejected": -0.48689424991607666, + "step": 73 + }, + { + "epoch": 0.15854311730048207, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.366156800768572, + "learning_rate": 6.631411144617796e-07, + "logits/chosen": 0.7912124395370483, + "logits/rejected": 0.7016277313232422, + "logps/accuracies": 0.5, + "logps/chosen": -337.2362365722656, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -318.22076416015625, + "logps/ref_rejected": -285.63134765625, + "logps/rejected": -356.1361999511719, + "loss": 0.3717, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9507730603218079, + "rewards/grad_term": 0.008385634049773216, + "rewards/margins": 2.574469804763794, + "rewards/rejected": -3.525242805480957, + "step": 74 + }, + { + "epoch": 0.16068559185859668, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.408007517521634, + "learning_rate": 6.652092422595104e-07, + "logits/chosen": 0.8023636937141418, + "logits/rejected": 0.7735366821289062, + "logps/accuracies": 0.75, + "logps/chosen": -306.0715026855469, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -275.71649169921875, + "logps/ref_rejected": -242.64198303222656, + "logps/rejected": -290.594970703125, + "loss": 0.3573, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5177501440048218, + "rewards/grad_term": 0.018539991229772568, + "rewards/margins": 0.8798991441726685, + "rewards/rejected": -2.3976492881774902, + "step": 75 + }, + { + "epoch": 0.1628280664167113, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.365050151926509, + "learning_rate": 6.672499768297604e-07, + "logits/chosen": 0.8398016691207886, + "logits/rejected": 0.8039026260375977, + "logps/accuracies": 0.25, + "logps/chosen": -338.2458801269531, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -300.6439208984375, + "logps/ref_rejected": -266.9490661621094, + "logps/rejected": -321.94207763671875, + "loss": 0.4334, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8800976276397705, + "rewards/grad_term": 0.016930393874645233, + "rewards/margins": 0.8695545196533203, + "rewards/rejected": -2.749652147293091, + "step": 76 + }, + { + "epoch": 0.1649705409748259, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.75229443985931, + "learning_rate": 6.692640343663431e-07, + "logits/chosen": 0.8696576356887817, + "logits/rejected": 0.876376211643219, + "logps/accuracies": 0.5, + "logps/chosen": -341.6177978515625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -313.14349365234375, + "logps/ref_rejected": -307.5328369140625, + "logps/rejected": -359.84185791015625, + "loss": 0.3678, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4237148761749268, + "rewards/grad_term": 0.014790714718401432, + "rewards/margins": 1.1917363405227661, + "rewards/rejected": -2.6154510974884033, + "step": 77 + }, + { + "epoch": 0.16711301553294056, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 8.819014293843647, + "learning_rate": 6.712521033378708e-07, + "logits/chosen": 1.0659058094024658, + "logits/rejected": 0.6623323559761047, + "logps/accuracies": 0.0, + "logps/chosen": -331.59368896484375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -292.36590576171875, + "logps/ref_rejected": -146.4788360595703, + "logps/rejected": -199.08897399902344, + "loss": 0.4175, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9613897800445557, + "rewards/grad_term": 0.020100167021155357, + "rewards/margins": 0.6691172122955322, + "rewards/rejected": -2.630506992340088, + "step": 78 + }, + { + "epoch": 0.16925549009105517, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.100776831917964, + "learning_rate": 6.732148459006032e-07, + "logits/chosen": 0.7964029312133789, + "logits/rejected": 0.6478776931762695, + "logps/accuracies": 0.25, + "logps/chosen": -320.44476318359375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -274.2988586425781, + "logps/ref_rejected": -246.58741760253906, + "logps/rejected": -303.3105773925781, + "loss": 0.394, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.3072948455810547, + "rewards/grad_term": 0.022354397922754288, + "rewards/margins": 0.5288637280464172, + "rewards/rejected": -2.8361587524414062, + "step": 79 + }, + { + "epoch": 0.1713979646491698, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.215506702371561, + "learning_rate": 6.751528992224267e-07, + "logits/chosen": 1.0231748819351196, + "logits/rejected": 0.9256105422973633, + "logps/accuracies": 0.5, + "logps/chosen": -371.5207824707031, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -323.4173583984375, + "logps/ref_rejected": -287.026611328125, + "logps/rejected": -385.26141357421875, + "loss": 0.358, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4051713943481445, + "rewards/grad_term": 0.008784506469964981, + "rewards/margins": 2.5065674781799316, + "rewards/rejected": -4.911738872528076, + "step": 80 + }, + { + "epoch": 0.1735404392072844, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.286461530064006, + "learning_rate": 6.770668767245965e-07, + "logits/chosen": 0.9782469868659973, + "logits/rejected": 0.6938868761062622, + "logps/accuracies": 0.5, + "logps/chosen": -289.8148498535156, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -267.3828125, + "logps/ref_rejected": -228.83135986328125, + "logps/rejected": -291.4294128417969, + "loss": 0.3489, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1216013431549072, + "rewards/grad_term": 0.012826542370021343, + "rewards/margins": 2.0083022117614746, + "rewards/rejected": -3.129903554916382, + "step": 81 + }, + { + "epoch": 0.17568291376539905, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.751825989493291, + "learning_rate": 6.789573692472892e-07, + "logits/chosen": 0.8200634717941284, + "logits/rejected": 0.9688655138015747, + "logps/accuracies": 0.5, + "logps/chosen": -390.98150634765625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -348.5020751953125, + "logps/ref_rejected": -350.07159423828125, + "logps/rejected": -411.6036682128906, + "loss": 0.393, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.123972177505493, + "rewards/grad_term": 0.014323998242616653, + "rewards/margins": 0.9526323080062866, + "rewards/rejected": -3.0766046047210693, + "step": 82 + }, + { + "epoch": 0.17782538832351366, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.216588835288912, + "learning_rate": 6.808249461445122e-07, + "logits/chosen": 1.072934627532959, + "logits/rejected": 0.911789059638977, + "logps/accuracies": 0.75, + "logps/chosen": -479.0382080078125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -419.0920104980469, + "logps/ref_rejected": -390.1745300292969, + "logps/rejected": -485.1689453125, + "loss": 0.3724, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.997309684753418, + "rewards/grad_term": 0.0095378328114748, + "rewards/margins": 1.7524113655090332, + "rewards/rejected": -4.749721050262451, + "step": 83 + }, + { + "epoch": 0.17996786288162828, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.25160372674251, + "learning_rate": 6.826701563134442e-07, + "logits/chosen": 0.8092617988586426, + "logits/rejected": 0.7646486163139343, + "logps/accuracies": 0.25, + "logps/chosen": -410.1537780761719, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -384.58575439453125, + "logps/ref_rejected": -304.9964904785156, + "logps/rejected": -354.8568115234375, + "loss": 0.3661, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2784030437469482, + "rewards/grad_term": 0.012526333332061768, + "rewards/margins": 1.2146127223968506, + "rewards/rejected": -2.493015766143799, + "step": 84 + }, + { + "epoch": 0.1821103374397429, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.118700174990803, + "learning_rate": 6.844935291628642e-07, + "logits/chosen": 1.0619235038757324, + "logits/rejected": 0.9984962344169617, + "logps/accuracies": 0.25, + "logps/chosen": -409.0619201660156, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -377.1236267089844, + "logps/ref_rejected": -326.3685302734375, + "logps/rejected": -405.38037109375, + "loss": 0.3594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.596914291381836, + "rewards/grad_term": 0.009607160463929176, + "rewards/margins": 2.353677988052368, + "rewards/rejected": -3.950592041015625, + "step": 85 + }, + { + "epoch": 0.18425281199785754, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.571514403852135, + "learning_rate": 6.862955755249413e-07, + "logits/chosen": 0.9191405773162842, + "logits/rejected": 0.801410436630249, + "logps/accuracies": 0.5, + "logps/chosen": -312.10894775390625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -278.7945556640625, + "logps/ref_rejected": -234.3353271484375, + "logps/rejected": -277.7581787109375, + "loss": 0.3311, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6657218933105469, + "rewards/grad_term": 0.019233139231801033, + "rewards/margins": 0.5054203271865845, + "rewards/rejected": -2.171142339706421, + "step": 86 + }, + { + "epoch": 0.18639528655597215, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.757747519851693, + "learning_rate": 6.880767885143194e-07, + "logits/chosen": 0.8738288283348083, + "logits/rejected": 0.684519350528717, + "logps/accuracies": 0.25, + "logps/chosen": -410.3459167480469, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -376.2248840332031, + "logps/ref_rejected": -312.1935729980469, + "logps/rejected": -370.3998107910156, + "loss": 0.3656, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7060527801513672, + "rewards/grad_term": 0.01283444557338953, + "rewards/margins": 1.2042596340179443, + "rewards/rejected": -2.9103124141693115, + "step": 87 + }, + { + "epoch": 0.18853776111408677, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.985961972583743, + "learning_rate": 6.898376443381053e-07, + "logits/chosen": 0.9711716771125793, + "logits/rejected": 0.8526138067245483, + "logps/accuracies": 0.25, + "logps/chosen": -403.53948974609375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -360.1213073730469, + "logps/ref_rejected": -311.68292236328125, + "logps/rejected": -381.7286376953125, + "loss": 0.3717, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.170908212661743, + "rewards/grad_term": 0.018295947462320328, + "rewards/margins": 1.3313778638839722, + "rewards/rejected": -3.502286195755005, + "step": 88 + }, + { + "epoch": 0.19068023567220138, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.683570806703395, + "learning_rate": 6.915786030600927e-07, + "logits/chosen": 0.843113899230957, + "logits/rejected": 0.7101360559463501, + "logps/accuracies": 0.5, + "logps/chosen": -428.8794250488281, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -389.9796447753906, + "logps/ref_rejected": -322.93255615234375, + "logps/rejected": -415.9242248535156, + "loss": 0.3549, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9449876546859741, + "rewards/grad_term": 0.004990034736692905, + "rewards/margins": 2.704596996307373, + "rewards/rejected": -4.649584770202637, + "step": 89 + }, + { + "epoch": 0.19282271023031602, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.498185901157015, + "learning_rate": 6.933001093222904e-07, + "logits/chosen": 0.9054147601127625, + "logits/rejected": 0.7073743939399719, + "logps/accuracies": 0.75, + "logps/chosen": -294.0045471191406, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -280.5016174316406, + "logps/ref_rejected": -278.77313232421875, + "logps/rejected": -316.6849060058594, + "loss": 0.3481, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6751471161842346, + "rewards/grad_term": 0.011688041500747204, + "rewards/margins": 1.2204415798187256, + "rewards/rejected": -1.895588755607605, + "step": 90 + }, + { + "epoch": 0.19496518478843064, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.490840097910208, + "learning_rate": 6.950025930265823e-07, + "logits/chosen": 0.8930804133415222, + "logits/rejected": 0.9076898097991943, + "logps/accuracies": 0.75, + "logps/chosen": -353.33563232421875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -318.02215576171875, + "logps/ref_rejected": -333.869140625, + "logps/rejected": -411.1112365722656, + "loss": 0.3822, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7656757831573486, + "rewards/grad_term": 0.006808947771787643, + "rewards/margins": 2.0964293479919434, + "rewards/rejected": -3.862105369567871, + "step": 91 + }, + { + "epoch": 0.19710765934654526, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.412294871691426, + "learning_rate": 6.966864699791386e-07, + "logits/chosen": 0.7138292789459229, + "logits/rejected": 0.6344163417816162, + "logps/accuracies": 0.75, + "logps/chosen": -284.339111328125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -267.2784423828125, + "logps/ref_rejected": -271.889892578125, + "logps/rejected": -323.4517822265625, + "loss": 0.3446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8530327081680298, + "rewards/grad_term": 0.008454471826553345, + "rewards/margins": 1.7250609397888184, + "rewards/rejected": -2.5780937671661377, + "step": 92 + }, + { + "epoch": 0.19925013390465987, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 7.851393127918492, + "learning_rate": 6.983521424999892e-07, + "logits/chosen": 1.1121702194213867, + "logits/rejected": 1.0464541912078857, + "logps/accuracies": 0.0, + "logps/chosen": -323.29986572265625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -300.96435546875, + "logps/ref_rejected": -263.5765686035156, + "logps/rejected": -319.5726318359375, + "loss": 0.3308, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1167749166488647, + "rewards/grad_term": 0.01233928557485342, + "rewards/margins": 1.683027982711792, + "rewards/rejected": -2.7998030185699463, + "step": 93 + }, + { + "epoch": 0.20139260846277451, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.7736296892204395, + "learning_rate": 7e-07, + "logits/chosen": 0.8960837125778198, + "logits/rejected": 0.761021614074707, + "logps/accuracies": 0.25, + "logps/chosen": -377.28021240234375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -330.466064453125, + "logps/ref_rejected": -286.0745849609375, + "logps/rejected": -351.7135009765625, + "loss": 0.296, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3407070636749268, + "rewards/grad_term": 0.015900662168860435, + "rewards/margins": 0.9412397742271423, + "rewards/rejected": -3.281947135925293, + "step": 94 + }, + { + "epoch": 0.20353508302088913, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.371159163521241, + "learning_rate": 7e-07, + "logits/chosen": 0.9651041030883789, + "logits/rejected": 0.7860502004623413, + "logps/accuracies": 0.25, + "logps/chosen": -425.8303527832031, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -374.81097412109375, + "logps/ref_rejected": -321.1005859375, + "logps/rejected": -400.951171875, + "loss": 0.3427, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5509684085845947, + "rewards/grad_term": 0.0106906583532691, + "rewards/margins": 1.4415616989135742, + "rewards/rejected": -3.99252986907959, + "step": 95 + }, + { + "epoch": 0.20567755757900374, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.177619063233454, + "learning_rate": 6.991646778042959e-07, + "logits/chosen": 0.7661604881286621, + "logits/rejected": 0.6532018780708313, + "logps/accuracies": 0.5, + "logps/chosen": -257.8993835449219, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -240.183349609375, + "logps/ref_rejected": -198.9567413330078, + "logps/rejected": -253.74404907226562, + "loss": 0.3882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8858024477958679, + "rewards/grad_term": 0.011164636351168156, + "rewards/margins": 1.8535633087158203, + "rewards/rejected": -2.739365816116333, + "step": 96 + }, + { + "epoch": 0.20782003213711836, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.250952702236187, + "learning_rate": 6.983293556085918e-07, + "logits/chosen": 0.8017215728759766, + "logits/rejected": 0.6558822393417358, + "logps/accuracies": 0.5, + "logps/chosen": -316.91131591796875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -272.19842529296875, + "logps/ref_rejected": -228.5543975830078, + "logps/rejected": -309.4710693359375, + "loss": 0.336, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.235644817352295, + "rewards/grad_term": 0.010754971764981747, + "rewards/margins": 1.810187816619873, + "rewards/rejected": -4.045832633972168, + "step": 97 + }, + { + "epoch": 0.209962506695233, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.58529126245538, + "learning_rate": 6.974940334128877e-07, + "logits/chosen": 0.8682336807250977, + "logits/rejected": 0.7378227710723877, + "logps/accuracies": 0.5, + "logps/chosen": -253.228515625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -229.12966918945312, + "logps/ref_rejected": -178.55392456054688, + "logps/rejected": -221.88421630859375, + "loss": 0.3899, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2049428224563599, + "rewards/grad_term": 0.014865662902593613, + "rewards/margins": 0.9615722894668579, + "rewards/rejected": -2.1665151119232178, + "step": 98 + }, + { + "epoch": 0.21210498125334762, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.572183229995163, + "learning_rate": 6.966587112171838e-07, + "logits/chosen": 0.8709318041801453, + "logits/rejected": 0.7755447626113892, + "logps/accuracies": 0.5, + "logps/chosen": -408.872802734375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -363.9078369140625, + "logps/ref_rejected": -299.89794921875, + "logps/rejected": -368.15155029296875, + "loss": 0.3652, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.2482473850250244, + "rewards/grad_term": 0.018564339727163315, + "rewards/margins": 1.1644330024719238, + "rewards/rejected": -3.4126803874969482, + "step": 99 + }, + { + "epoch": 0.21424745581146223, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.624545109237621, + "learning_rate": 6.958233890214797e-07, + "logits/chosen": 0.9372926354408264, + "logits/rejected": 0.8751659989356995, + "logps/accuracies": 0.5, + "logps/chosen": -474.0740966796875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -424.33819580078125, + "logps/ref_rejected": -390.84747314453125, + "logps/rejected": -506.8907165527344, + "loss": 0.3056, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4867947101593018, + "rewards/grad_term": 0.004845261108130217, + "rewards/margins": 3.3153672218322754, + "rewards/rejected": -5.802162170410156, + "step": 100 + }, + { + "epoch": 0.21638993036957685, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.067295914483129, + "learning_rate": 6.949880668257756e-07, + "logits/chosen": 0.9544820785522461, + "logits/rejected": 0.9464821815490723, + "logps/accuracies": 0.5, + "logps/chosen": -494.697509765625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -454.8968505859375, + "logps/ref_rejected": -400.18536376953125, + "logps/rejected": -466.2589111328125, + "loss": 0.358, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.990033745765686, + "rewards/grad_term": 0.013565966859459877, + "rewards/margins": 1.3136428594589233, + "rewards/rejected": -3.3036766052246094, + "step": 101 + }, + { + "epoch": 0.2185324049276915, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.960375031224479, + "learning_rate": 6.941527446300716e-07, + "logits/chosen": 0.8813360333442688, + "logits/rejected": 0.695158839225769, + "logps/accuracies": 0.5, + "logps/chosen": -291.07232666015625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -251.7209930419922, + "logps/ref_rejected": -190.36097717285156, + "logps/rejected": -257.9454345703125, + "loss": 0.3402, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9675672054290771, + "rewards/grad_term": 0.011471440084278584, + "rewards/margins": 1.411657452583313, + "rewards/rejected": -3.3792247772216797, + "step": 102 + }, + { + "epoch": 0.2206748794858061, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.26805795080682, + "learning_rate": 6.933174224343675e-07, + "logits/chosen": 0.8939443826675415, + "logits/rejected": 0.625817596912384, + "logps/accuracies": 0.75, + "logps/chosen": -145.8295440673828, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -133.74105834960938, + "logps/ref_rejected": -113.23294067382812, + "logps/rejected": -148.43218994140625, + "loss": 0.3759, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6044239401817322, + "rewards/grad_term": 0.014196785166859627, + "rewards/margins": 1.1555386781692505, + "rewards/rejected": -1.7599626779556274, + "step": 103 + }, + { + "epoch": 0.22281735404392072, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.93695871637079, + "learning_rate": 6.924821002386635e-07, + "logits/chosen": 1.0364937782287598, + "logits/rejected": 0.563789427280426, + "logps/accuracies": 0.5, + "logps/chosen": -199.2762451171875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -178.82568359375, + "logps/ref_rejected": -115.96835327148438, + "logps/rejected": -148.9056396484375, + "loss": 0.3053, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0225275754928589, + "rewards/grad_term": 0.019687440246343613, + "rewards/margins": 0.6243367195129395, + "rewards/rejected": -1.6468642950057983, + "step": 104 + }, + { + "epoch": 0.22495982860203534, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.863344087781872, + "learning_rate": 6.916467780429593e-07, + "logits/chosen": 0.869300901889801, + "logits/rejected": 0.7735204100608826, + "logps/accuracies": 0.25, + "logps/chosen": -459.5062561035156, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -410.751708984375, + "logps/ref_rejected": -351.19830322265625, + "logps/rejected": -436.1498718261719, + "loss": 0.3494, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4377286434173584, + "rewards/grad_term": 0.008553780615329742, + "rewards/margins": 1.8098492622375488, + "rewards/rejected": -4.247577667236328, + "step": 105 + }, + { + "epoch": 0.22710230316014998, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.525931151845846, + "learning_rate": 6.908114558472554e-07, + "logits/chosen": 0.9212764501571655, + "logits/rejected": 0.7540117502212524, + "logps/accuracies": 0.75, + "logps/chosen": -369.387939453125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -333.4745788574219, + "logps/ref_rejected": -299.4930419921875, + "logps/rejected": -394.82952880859375, + "loss": 0.3309, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7956693172454834, + "rewards/grad_term": 0.00758711900562048, + "rewards/margins": 2.9711527824401855, + "rewards/rejected": -4.76682186126709, + "step": 106 + }, + { + "epoch": 0.2292447777182646, + "flips/correct->correct": 1.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.335424081368634, + "learning_rate": 6.899761336515513e-07, + "logits/chosen": 0.7336133718490601, + "logits/rejected": 0.8765916228294373, + "logps/accuracies": 1.0, + "logps/chosen": -244.1851348876953, + "logps/ref_accuracies": 1.0, + "logps/ref_chosen": -227.0002899169922, + "logps/ref_rejected": -286.24835205078125, + "logps/rejected": -336.29425048828125, + "loss": 0.3244, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8592423796653748, + "rewards/grad_term": 0.009657299146056175, + "rewards/margins": 1.643053412437439, + "rewards/rejected": -2.502295970916748, + "step": 107 + }, + { + "epoch": 0.2313872522763792, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.973679697099532, + "learning_rate": 6.891408114558472e-07, + "logits/chosen": 0.9743479490280151, + "logits/rejected": 1.0019692182540894, + "logps/accuracies": 0.75, + "logps/chosen": -339.78680419921875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -297.22393798828125, + "logps/ref_rejected": -304.42578125, + "logps/rejected": -374.98779296875, + "loss": 0.2818, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1281425952911377, + "rewards/grad_term": 0.01099520642310381, + "rewards/margins": 1.3999576568603516, + "rewards/rejected": -3.5281002521514893, + "step": 108 + }, + { + "epoch": 0.23352972683449383, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.989727540849666, + "learning_rate": 6.883054892601431e-07, + "logits/chosen": 1.1120461225509644, + "logits/rejected": 0.7604467272758484, + "logps/accuracies": 0.5, + "logps/chosen": -414.3526916503906, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -393.2984619140625, + "logps/ref_rejected": -310.2208251953125, + "logps/rejected": -371.7498779296875, + "loss": 0.3362, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.052709937095642, + "rewards/grad_term": 0.010077744722366333, + "rewards/margins": 2.023743152618408, + "rewards/rejected": -3.07645320892334, + "step": 109 + }, + { + "epoch": 0.23567220139260847, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.51051941007351, + "learning_rate": 6.874701670644392e-07, + "logits/chosen": 0.8338203430175781, + "logits/rejected": 0.677147388458252, + "logps/accuracies": 0.5, + "logps/chosen": -344.638671875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -336.9551086425781, + "logps/ref_rejected": -251.71192932128906, + "logps/rejected": -278.19818115234375, + "loss": 0.3953, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3841773271560669, + "rewards/grad_term": 0.01569775864481926, + "rewards/margins": 0.9401355981826782, + "rewards/rejected": -1.3243129253387451, + "step": 110 + }, + { + "epoch": 0.2378146759507231, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.352097570099665, + "learning_rate": 6.866348448687351e-07, + "logits/chosen": 1.0819525718688965, + "logits/rejected": 0.9638932943344116, + "logps/accuracies": 0.5, + "logps/chosen": -340.05609130859375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -319.25897216796875, + "logps/ref_rejected": -289.39013671875, + "logps/rejected": -384.5931091308594, + "loss": 0.2425, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0398552417755127, + "rewards/grad_term": 0.007856637239456177, + "rewards/margins": 3.7202935218811035, + "rewards/rejected": -4.760149002075195, + "step": 111 + }, + { + "epoch": 0.2399571505088377, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.759463222456692, + "learning_rate": 6.85799522673031e-07, + "logits/chosen": 0.6596381664276123, + "logits/rejected": 0.4419824481010437, + "logps/accuracies": 0.25, + "logps/chosen": -287.6408996582031, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -243.9873046875, + "logps/ref_rejected": -173.07931518554688, + "logps/rejected": -247.95965576171875, + "loss": 0.3367, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1826789379119873, + "rewards/grad_term": 0.011957314796745777, + "rewards/margins": 1.5613383054733276, + "rewards/rejected": -3.7440171241760254, + "step": 112 + }, + { + "epoch": 0.24209962506695232, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.026439048323674, + "learning_rate": 6.849642004773269e-07, + "logits/chosen": 0.9305301904678345, + "logits/rejected": 0.8421428203582764, + "logps/accuracies": 0.25, + "logps/chosen": -441.80340576171875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -359.161865234375, + "logps/ref_rejected": -317.730712890625, + "logps/rejected": -430.5499267578125, + "loss": 0.2678, + "rewards/accuracies": 0.5, + "rewards/chosen": -4.132076263427734, + "rewards/grad_term": 0.01556328870356083, + "rewards/margins": 1.508885145187378, + "rewards/rejected": -5.640961647033691, + "step": 113 + }, + { + "epoch": 0.24424209962506696, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.200400573622296, + "learning_rate": 6.841288782816229e-07, + "logits/chosen": 0.957403838634491, + "logits/rejected": 0.7602821588516235, + "logps/accuracies": 0.25, + "logps/chosen": -317.9858703613281, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -303.8868713378906, + "logps/ref_rejected": -208.02256774902344, + "logps/rejected": -256.6693115234375, + "loss": 0.283, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7049498558044434, + "rewards/grad_term": 0.009732572361826897, + "rewards/margins": 1.7273874282836914, + "rewards/rejected": -2.4323372840881348, + "step": 114 + }, + { + "epoch": 0.24638457418318158, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.032492821291632, + "learning_rate": 6.832935560859188e-07, + "logits/chosen": 0.8847552537918091, + "logits/rejected": 0.8709891438484192, + "logps/accuracies": 0.75, + "logps/chosen": -275.5125427246094, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -239.18618774414062, + "logps/ref_rejected": -209.97958374023438, + "logps/rejected": -299.82904052734375, + "loss": 0.335, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.816316843032837, + "rewards/grad_term": 0.007570344489067793, + "rewards/margins": 2.6761562824249268, + "rewards/rejected": -4.492473125457764, + "step": 115 + }, + { + "epoch": 0.2485270487412962, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.054789967348844, + "learning_rate": 6.824582338902147e-07, + "logits/chosen": 0.9024197459220886, + "logits/rejected": 0.7947896718978882, + "logps/accuracies": 1.0, + "logps/chosen": -347.18023681640625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -307.41802978515625, + "logps/ref_rejected": -293.6680908203125, + "logps/rejected": -407.90380859375, + "loss": 0.3347, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9881106615066528, + "rewards/grad_term": 0.0060688708908855915, + "rewards/margins": 3.7236742973327637, + "rewards/rejected": -5.711784839630127, + "step": 116 + }, + { + "epoch": 0.25066952329941083, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 8.668010213567692, + "learning_rate": 6.816229116945108e-07, + "logits/chosen": 0.9976560473442078, + "logits/rejected": 0.6948249340057373, + "logps/accuracies": 0.0, + "logps/chosen": -401.39923095703125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -337.87274169921875, + "logps/ref_rejected": -240.39515686035156, + "logps/rejected": -330.98101806640625, + "loss": 0.3004, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.17632794380188, + "rewards/grad_term": 0.013613393530249596, + "rewards/margins": 1.3529647588729858, + "rewards/rejected": -4.529292583465576, + "step": 117 + }, + { + "epoch": 0.25281199785752545, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.847556657578917, + "learning_rate": 6.807875894988067e-07, + "logits/chosen": 0.9075788259506226, + "logits/rejected": 0.7977707982063293, + "logps/accuracies": 0.75, + "logps/chosen": -375.44140625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -321.0868225097656, + "logps/ref_rejected": -267.4314270019531, + "logps/rejected": -378.83599853515625, + "loss": 0.3172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7177300453186035, + "rewards/grad_term": 0.004429055377840996, + "rewards/margins": 2.8524982929229736, + "rewards/rejected": -5.570228099822998, + "step": 118 + }, + { + "epoch": 0.25495447241564007, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 8.68803296697522, + "learning_rate": 6.799522673031026e-07, + "logits/chosen": 0.895818293094635, + "logits/rejected": 0.575642466545105, + "logps/accuracies": 0.0, + "logps/chosen": -318.50885009765625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -287.44415283203125, + "logps/ref_rejected": -177.8115234375, + "logps/rejected": -243.67047119140625, + "loss": 0.3158, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5532350540161133, + "rewards/grad_term": 0.010060964152216911, + "rewards/margins": 1.739712119102478, + "rewards/rejected": -3.2929470539093018, + "step": 119 + }, + { + "epoch": 0.2570969469737547, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.719803160063796, + "learning_rate": 6.791169451073985e-07, + "logits/chosen": 0.7772294282913208, + "logits/rejected": 0.8315998315811157, + "logps/accuracies": 1.0, + "logps/chosen": -432.6302185058594, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -355.44903564453125, + "logps/ref_rejected": -386.0659484863281, + "logps/rejected": -523.24609375, + "loss": 0.2837, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.859060287475586, + "rewards/grad_term": 0.003882521763443947, + "rewards/margins": 2.9999477863311768, + "rewards/rejected": -6.859008312225342, + "step": 120 + }, + { + "epoch": 0.2592394215318693, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.238978607387634, + "learning_rate": 6.782816229116945e-07, + "logits/chosen": 0.8819637894630432, + "logits/rejected": 0.7054441571235657, + "logps/accuracies": 0.25, + "logps/chosen": -393.9786071777344, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -364.44207763671875, + "logps/ref_rejected": -345.252685546875, + "logps/rejected": -389.1627502441406, + "loss": 0.2768, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4768271446228027, + "rewards/grad_term": 0.01706988736987114, + "rewards/margins": 0.7186762094497681, + "rewards/rejected": -2.1955032348632812, + "step": 121 + }, + { + "epoch": 0.2613818960899839, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.838179827940348, + "learning_rate": 6.774463007159905e-07, + "logits/chosen": 0.7459653615951538, + "logits/rejected": 0.7466898560523987, + "logps/accuracies": 0.5, + "logps/chosen": -289.6307067871094, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -244.8365020751953, + "logps/ref_rejected": -223.6743927001953, + "logps/rejected": -318.4463195800781, + "loss": 0.273, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2397100925445557, + "rewards/grad_term": 0.010229886509478092, + "rewards/margins": 2.4988858699798584, + "rewards/rejected": -4.738595962524414, + "step": 122 + }, + { + "epoch": 0.2635243706480985, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.71899475513759, + "learning_rate": 6.766109785202863e-07, + "logits/chosen": 0.9420760869979858, + "logits/rejected": 0.8325188755989075, + "logps/accuracies": 0.75, + "logps/chosen": -358.3894348144531, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -302.154052734375, + "logps/ref_rejected": -271.524658203125, + "logps/rejected": -382.03302001953125, + "loss": 0.2531, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.811767339706421, + "rewards/grad_term": 0.010155830532312393, + "rewards/margins": 2.7136499881744385, + "rewards/rejected": -5.525417327880859, + "step": 123 + }, + { + "epoch": 0.2656668452062132, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.827729622567765, + "learning_rate": 6.757756563245823e-07, + "logits/chosen": 0.9883730411529541, + "logits/rejected": 0.8319672346115112, + "logps/accuracies": 0.5, + "logps/chosen": -461.412109375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -367.73486328125, + "logps/ref_rejected": -298.51544189453125, + "logps/rejected": -510.0968017578125, + "loss": 0.2583, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.683863162994385, + "rewards/grad_term": 0.0023802227806299925, + "rewards/margins": 5.895203590393066, + "rewards/rejected": -10.57906723022461, + "step": 124 + }, + { + "epoch": 0.2678093197643278, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 5.875715435058411, + "learning_rate": 6.749403341288783e-07, + "logits/chosen": 0.9549310207366943, + "logits/rejected": 0.8220203518867493, + "logps/accuracies": 0.25, + "logps/chosen": -292.9206237792969, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -250.8220672607422, + "logps/ref_rejected": -188.01498413085938, + "logps/rejected": -286.50128173828125, + "loss": 0.2039, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1049275398254395, + "rewards/grad_term": 0.006195048335939646, + "rewards/margins": 2.8193886280059814, + "rewards/rejected": -4.92431640625, + "step": 125 + }, + { + "epoch": 0.26995179432244243, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.463586322032903, + "learning_rate": 6.741050119331742e-07, + "logits/chosen": 0.9875746965408325, + "logits/rejected": 0.7808788418769836, + "logps/accuracies": 0.5, + "logps/chosen": -251.4386444091797, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -224.3441619873047, + "logps/ref_rejected": -220.13214111328125, + "logps/rejected": -279.40972900390625, + "loss": 0.3086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3547240495681763, + "rewards/grad_term": 0.009899970144033432, + "rewards/margins": 1.6091564893722534, + "rewards/rejected": -2.9638805389404297, + "step": 126 + }, + { + "epoch": 0.27209426888055704, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.995166618243903, + "learning_rate": 6.732696897374701e-07, + "logits/chosen": 1.0715935230255127, + "logits/rejected": 0.9156344532966614, + "logps/accuracies": 0.5, + "logps/chosen": -495.48419189453125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -411.56317138671875, + "logps/ref_rejected": -354.6976623535156, + "logps/rejected": -507.9322509765625, + "loss": 0.268, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.196050643920898, + "rewards/grad_term": 0.0026723581831902266, + "rewards/margins": 3.465679168701172, + "rewards/rejected": -7.66172981262207, + "step": 127 + }, + { + "epoch": 0.27423674343867166, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.959838246440706, + "learning_rate": 6.72434367541766e-07, + "logits/chosen": 0.9897805452346802, + "logits/rejected": 0.8163310289382935, + "logps/accuracies": 1.0, + "logps/chosen": -487.4618225097656, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -402.695556640625, + "logps/ref_rejected": -365.8092346191406, + "logps/rejected": -549.47314453125, + "loss": 0.303, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.238313674926758, + "rewards/grad_term": 0.0018401599954813719, + "rewards/margins": 4.944880485534668, + "rewards/rejected": -9.183194160461426, + "step": 128 + }, + { + "epoch": 0.27423674343867166, + "eval_flips/correct->correct": 0.1599999964237213, + "eval_flips/correct->incorrect": 0.0, + "eval_flips/incorrect->correct": 0.20000000298023224, + "eval_flips/incorrect->incorrect": 0.6399999856948853, + "eval_logits/chosen": 0.871035099029541, + "eval_logits/rejected": 0.7403469681739807, + "eval_logps/accuracies": 0.36000001430511475, + "eval_logps/chosen": -374.1800537109375, + "eval_logps/ref_accuracies": 0.1599999964237213, + "eval_logps/ref_chosen": -323.51568603515625, + "eval_logps/ref_rejected": -258.70098876953125, + "eval_logps/rejected": -354.16534423828125, + "eval_loss": 0.290331095457077, + "eval_rewards/accuracies": 0.7799999713897705, + "eval_rewards/chosen": -2.533219575881958, + "eval_rewards/grad_term": 0.011603965424001217, + "eval_rewards/margins": 2.2400009632110596, + "eval_rewards/rejected": -4.773220062255859, + "eval_runtime": 374.0534, + "eval_samples_per_second": 4.224, + "eval_steps_per_second": 0.134, + "step": 128 + }, + { + "epoch": 0.2763792179967863, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.320596690520534, + "learning_rate": 6.715990453460621e-07, + "logits/chosen": 0.959011435508728, + "logits/rejected": 0.8785006999969482, + "logps/accuracies": 1.0, + "logps/chosen": -253.6005859375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -211.31517028808594, + "logps/ref_rejected": -201.87469482421875, + "logps/rejected": -295.11090087890625, + "loss": 0.2446, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.114271402359009, + "rewards/grad_term": 0.01070532575249672, + "rewards/margins": 2.5475387573242188, + "rewards/rejected": -4.661810874938965, + "step": 129 + }, + { + "epoch": 0.2785216925549009, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.49663463971219, + "learning_rate": 6.707637231503579e-07, + "logits/chosen": 0.9510787129402161, + "logits/rejected": 0.6340673565864563, + "logps/accuracies": 0.25, + "logps/chosen": -353.7513427734375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -313.7836608886719, + "logps/ref_rejected": -263.53387451171875, + "logps/rejected": -344.3742980957031, + "loss": 0.2306, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9983829259872437, + "rewards/grad_term": 0.007957426831126213, + "rewards/margins": 2.043639659881592, + "rewards/rejected": -4.042022228240967, + "step": 130 + }, + { + "epoch": 0.2806641671130155, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 10.934812547934818, + "learning_rate": 6.699284009546539e-07, + "logits/chosen": 0.7116758823394775, + "logits/rejected": 0.6254409551620483, + "logps/accuracies": 0.0, + "logps/chosen": -361.2596740722656, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -345.98199462890625, + "logps/ref_rejected": -231.908447265625, + "logps/rejected": -297.0892028808594, + "loss": 0.2395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.763884425163269, + "rewards/grad_term": 0.008144252933561802, + "rewards/margins": 2.4951541423797607, + "rewards/rejected": -3.2590384483337402, + "step": 131 + }, + { + "epoch": 0.2828066416711302, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.274621875830415, + "learning_rate": 6.690930787589498e-07, + "logits/chosen": 0.9305391907691956, + "logits/rejected": 0.9035634994506836, + "logps/accuracies": 1.0, + "logps/chosen": -454.7461242675781, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -338.783447265625, + "logps/ref_rejected": -331.75201416015625, + "logps/rejected": -508.1329650878906, + "loss": 0.2374, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.79813289642334, + "rewards/grad_term": 0.0040429579094052315, + "rewards/margins": 3.020915985107422, + "rewards/rejected": -8.819048881530762, + "step": 132 + }, + { + "epoch": 0.2849491162292448, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.310190672888075, + "learning_rate": 6.682577565632458e-07, + "logits/chosen": 0.9652221202850342, + "logits/rejected": 0.8905255198478699, + "logps/accuracies": 0.75, + "logps/chosen": -474.09820556640625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -404.9132080078125, + "logps/ref_rejected": -397.7674560546875, + "logps/rejected": -542.0930786132812, + "loss": 0.2522, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.459249496459961, + "rewards/grad_term": 0.007244814652949572, + "rewards/margins": 3.7570319175720215, + "rewards/rejected": -7.216281414031982, + "step": 133 + }, + { + "epoch": 0.2870915907873594, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.377477290301309, + "learning_rate": 6.674224343675417e-07, + "logits/chosen": 0.9276644587516785, + "logits/rejected": 0.7834001779556274, + "logps/accuracies": 0.75, + "logps/chosen": -482.92828369140625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -378.0372009277344, + "logps/ref_rejected": -330.1478271484375, + "logps/rejected": -559.7905883789062, + "loss": 0.2727, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.244553089141846, + "rewards/grad_term": 0.001372232916764915, + "rewards/margins": 6.237585544586182, + "rewards/rejected": -11.482138633728027, + "step": 134 + }, + { + "epoch": 0.289234065345474, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.304875956111065, + "learning_rate": 6.665871121718377e-07, + "logits/chosen": 0.8576828837394714, + "logits/rejected": 0.6341058611869812, + "logps/accuracies": 0.5, + "logps/chosen": -337.80535888671875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -294.550537109375, + "logps/ref_rejected": -220.64215087890625, + "logps/rejected": -329.065673828125, + "loss": 0.2647, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.162740707397461, + "rewards/grad_term": 0.009863924235105515, + "rewards/margins": 3.2584362030029297, + "rewards/rejected": -5.421176910400391, + "step": 135 + }, + { + "epoch": 0.29137653990358864, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.625140040989452, + "learning_rate": 6.657517899761337e-07, + "logits/chosen": 0.8870478272438049, + "logits/rejected": 0.7576145529747009, + "logps/accuracies": 0.5, + "logps/chosen": -305.3637390136719, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -262.2385559082031, + "logps/ref_rejected": -227.50672912597656, + "logps/rejected": -304.53564453125, + "loss": 0.2461, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1562600135803223, + "rewards/grad_term": 0.010284369811415672, + "rewards/margins": 1.6951854228973389, + "rewards/rejected": -3.8514456748962402, + "step": 136 + }, + { + "epoch": 0.29351901446170325, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.018484141515877, + "learning_rate": 6.649164677804296e-07, + "logits/chosen": 0.9112040400505066, + "logits/rejected": 0.7212855815887451, + "logps/accuracies": 0.25, + "logps/chosen": -343.9112243652344, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -280.6199645996094, + "logps/ref_rejected": -210.87319946289062, + "logps/rejected": -326.63885498046875, + "loss": 0.2715, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.164562225341797, + "rewards/grad_term": 0.009311579167842865, + "rewards/margins": 2.62372088432312, + "rewards/rejected": -5.788283348083496, + "step": 137 + }, + { + "epoch": 0.29566148901981787, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.034703724953653, + "learning_rate": 6.640811455847255e-07, + "logits/chosen": 0.7124534845352173, + "logits/rejected": 0.6710624694824219, + "logps/accuracies": 0.5, + "logps/chosen": -434.5594482421875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -374.4966125488281, + "logps/ref_rejected": -310.2477722167969, + "logps/rejected": -434.73895263671875, + "loss": 0.2497, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0031418800354004, + "rewards/grad_term": 0.00873873382806778, + "rewards/margins": 3.221416473388672, + "rewards/rejected": -6.2245588302612305, + "step": 138 + }, + { + "epoch": 0.2978039635779325, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.812429176308449, + "learning_rate": 6.632458233890214e-07, + "logits/chosen": 0.9117505550384521, + "logits/rejected": 0.8502323031425476, + "logps/accuracies": 0.75, + "logps/chosen": -334.134765625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -272.14581298828125, + "logps/ref_rejected": -239.22955322265625, + "logps/rejected": -368.5514221191406, + "loss": 0.2585, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0994479656219482, + "rewards/grad_term": 0.0053825220093131065, + "rewards/margins": 3.366644859313965, + "rewards/rejected": -6.466092586517334, + "step": 139 + }, + { + "epoch": 0.29994643813604716, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.660588605349608, + "learning_rate": 6.624105011933175e-07, + "logits/chosen": 0.6605690717697144, + "logits/rejected": 0.8247154951095581, + "logps/accuracies": 0.5, + "logps/chosen": -329.8360900878906, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -290.978271484375, + "logps/ref_rejected": -246.50843811035156, + "logps/rejected": -299.4759521484375, + "loss": 0.2595, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.94289231300354, + "rewards/grad_term": 0.01796804741024971, + "rewards/margins": 0.705483078956604, + "rewards/rejected": -2.6483755111694336, + "step": 140 + }, + { + "epoch": 0.30208891269416177, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 10.598864233990223, + "learning_rate": 6.615751789976133e-07, + "logits/chosen": 0.6485729217529297, + "logits/rejected": 0.6901566982269287, + "logps/accuracies": 1.0, + "logps/chosen": -373.37396240234375, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -334.75689697265625, + "logps/ref_rejected": -381.53717041015625, + "logps/rejected": -493.22930908203125, + "loss": 0.2639, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.930854082107544, + "rewards/grad_term": 0.0024079105351120234, + "rewards/margins": 3.6537532806396484, + "rewards/rejected": -5.584607124328613, + "step": 141 + }, + { + "epoch": 0.3042313872522764, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.28171427184855, + "learning_rate": 6.607398568019093e-07, + "logits/chosen": 0.9653871655464172, + "logits/rejected": 0.6570479273796082, + "logps/accuracies": 0.5, + "logps/chosen": -340.83685302734375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -281.6165771484375, + "logps/ref_rejected": -203.56463623046875, + "logps/rejected": -293.328857421875, + "loss": 0.2356, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9610140323638916, + "rewards/grad_term": 0.010910983197391033, + "rewards/margins": 1.5271971225738525, + "rewards/rejected": -4.488211154937744, + "step": 142 + }, + { + "epoch": 0.306373861810391, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.851086611102091, + "learning_rate": 6.599045346062052e-07, + "logits/chosen": 0.9678068161010742, + "logits/rejected": 0.8588843941688538, + "logps/accuracies": 0.75, + "logps/chosen": -388.9454345703125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -324.3150634765625, + "logps/ref_rejected": -277.83184814453125, + "logps/rejected": -417.49212646484375, + "loss": 0.1983, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2315189838409424, + "rewards/grad_term": 0.004653441719710827, + "rewards/margins": 3.7514941692352295, + "rewards/rejected": -6.983013153076172, + "step": 143 + }, + { + "epoch": 0.3085163363685056, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.817695728935396, + "learning_rate": 6.590692124105012e-07, + "logits/chosen": 0.9185338616371155, + "logits/rejected": 0.8464267253875732, + "logps/accuracies": 0.75, + "logps/chosen": -355.24176025390625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -319.5701599121094, + "logps/ref_rejected": -291.1425476074219, + "logps/rejected": -393.3614196777344, + "loss": 0.2415, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7835807800292969, + "rewards/grad_term": 0.002678102580830455, + "rewards/margins": 3.327362060546875, + "rewards/rejected": -5.110942840576172, + "step": 144 + }, + { + "epoch": 0.31065881092662023, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 11.370643366730324, + "learning_rate": 6.582338902147971e-07, + "logits/chosen": 0.5740557909011841, + "logits/rejected": 0.39975208044052124, + "logps/accuracies": 0.25, + "logps/chosen": -487.64642333984375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -415.9956359863281, + "logps/ref_rejected": -316.01708984375, + "logps/rejected": -444.40020751953125, + "loss": 0.3096, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5825393199920654, + "rewards/grad_term": 0.013735326007008553, + "rewards/margins": 2.836615562438965, + "rewards/rejected": -6.419155120849609, + "step": 145 + }, + { + "epoch": 0.31280128548473485, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.25, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.223483927318776, + "learning_rate": 6.57398568019093e-07, + "logits/chosen": 0.7999095320701599, + "logits/rejected": 0.5960132479667664, + "logps/accuracies": 0.75, + "logps/chosen": -299.7486572265625, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -218.3001251220703, + "logps/ref_rejected": -201.54733276367188, + "logps/rejected": -300.24505615234375, + "loss": 0.2041, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.0724263191223145, + "rewards/grad_term": 0.02213701605796814, + "rewards/margins": 0.8624599575996399, + "rewards/rejected": -4.9348859786987305, + "step": 146 + }, + { + "epoch": 0.3149437600428495, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.579748942980807, + "learning_rate": 6.56563245823389e-07, + "logits/chosen": 0.926275372505188, + "logits/rejected": 0.8347383737564087, + "logps/accuracies": 0.75, + "logps/chosen": -363.90753173828125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -273.9169616699219, + "logps/ref_rejected": -306.166015625, + "logps/rejected": -483.9077453613281, + "loss": 0.257, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.4995293617248535, + "rewards/grad_term": 0.007000477984547615, + "rewards/margins": 4.387556076049805, + "rewards/rejected": -8.887085914611816, + "step": 147 + }, + { + "epoch": 0.31708623460096413, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.401901301955716, + "learning_rate": 6.557279236276849e-07, + "logits/chosen": 0.8783835172653198, + "logits/rejected": 0.748361349105835, + "logps/accuracies": 0.75, + "logps/chosen": -439.5329284667969, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -380.14080810546875, + "logps/ref_rejected": -311.24652099609375, + "logps/rejected": -461.9928283691406, + "loss": 0.2612, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.969606399536133, + "rewards/grad_term": 0.007177918218076229, + "rewards/margins": 4.567710876464844, + "rewards/rejected": -7.53731632232666, + "step": 148 + }, + { + "epoch": 0.31922870915907875, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 10.72349764158799, + "learning_rate": 6.548926014319809e-07, + "logits/chosen": 0.889015793800354, + "logits/rejected": 0.6200151443481445, + "logps/accuracies": 0.25, + "logps/chosen": -367.63616943359375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -328.658203125, + "logps/ref_rejected": -265.4963073730469, + "logps/rejected": -371.23028564453125, + "loss": 0.2875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9488979578018188, + "rewards/grad_term": 0.003809453221037984, + "rewards/margins": 3.337800979614258, + "rewards/rejected": -5.286699295043945, + "step": 149 + }, + { + "epoch": 0.32137118371719336, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.234046149517951, + "learning_rate": 6.540572792362768e-07, + "logits/chosen": 0.7394288182258606, + "logits/rejected": 0.7122300863265991, + "logps/accuracies": 0.75, + "logps/chosen": -405.5599365234375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -309.9801940917969, + "logps/ref_rejected": -281.0711975097656, + "logps/rejected": -448.6300048828125, + "loss": 0.2894, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.778985977172852, + "rewards/grad_term": 0.004231796134263277, + "rewards/margins": 3.5989530086517334, + "rewards/rejected": -8.377939224243164, + "step": 150 + }, + { + "epoch": 0.323513658275308, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.025800847575807, + "learning_rate": 6.532219570405727e-07, + "logits/chosen": 0.5386347770690918, + "logits/rejected": 0.35092538595199585, + "logps/accuracies": 0.5, + "logps/chosen": -216.28070068359375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -184.84176635742188, + "logps/ref_rejected": -179.14764404296875, + "logps/rejected": -240.31222534179688, + "loss": 0.2617, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5719472169876099, + "rewards/grad_term": 0.012875164858996868, + "rewards/margins": 1.4862821102142334, + "rewards/rejected": -3.058229446411133, + "step": 151 + }, + { + "epoch": 0.3256561328334226, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.899213029036904, + "learning_rate": 6.523866348448687e-07, + "logits/chosen": 0.9789618253707886, + "logits/rejected": 0.363411545753479, + "logps/accuracies": 0.5, + "logps/chosen": -279.868408203125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -243.69374084472656, + "logps/ref_rejected": -157.2808380126953, + "logps/rejected": -250.97132873535156, + "loss": 0.241, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8087329864501953, + "rewards/grad_term": 0.009951984509825706, + "rewards/margins": 2.875791072845459, + "rewards/rejected": -4.6845245361328125, + "step": 152 + }, + { + "epoch": 0.3277986073915372, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.25, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.604763643820585, + "learning_rate": 6.515513126491647e-07, + "logits/chosen": 0.8571368455886841, + "logits/rejected": 0.25795796513557434, + "logps/accuracies": 0.25, + "logps/chosen": -449.5162658691406, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -410.160400390625, + "logps/ref_rejected": -145.2788543701172, + "logps/rejected": -226.03286743164062, + "loss": 0.2287, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9677932262420654, + "rewards/grad_term": 0.015731465071439743, + "rewards/margins": 2.0699081420898438, + "rewards/rejected": -4.03770112991333, + "step": 153 + }, + { + "epoch": 0.3299410819496518, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.719226366200353, + "learning_rate": 6.507159904534606e-07, + "logits/chosen": 0.6871969103813171, + "logits/rejected": 0.4975748658180237, + "logps/accuracies": 0.25, + "logps/chosen": -325.2784423828125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -273.6944580078125, + "logps/ref_rejected": -270.9206237792969, + "logps/rejected": -343.53662109375, + "loss": 0.2694, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5791993141174316, + "rewards/grad_term": 0.015888353809714317, + "rewards/margins": 1.0516000986099243, + "rewards/rejected": -3.6307995319366455, + "step": 154 + }, + { + "epoch": 0.3320835565077665, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.321361613018943, + "learning_rate": 6.498806682577566e-07, + "logits/chosen": 0.703808069229126, + "logits/rejected": 0.6969115734100342, + "logps/accuracies": 1.0, + "logps/chosen": -476.7120361328125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -397.993896484375, + "logps/ref_rejected": -352.0650329589844, + "logps/rejected": -520.11376953125, + "loss": 0.1933, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.935906171798706, + "rewards/grad_term": 0.005700279027223587, + "rewards/margins": 4.466530799865723, + "rewards/rejected": -8.402437210083008, + "step": 155 + }, + { + "epoch": 0.3342260310658811, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.480714775934256, + "learning_rate": 6.490453460620525e-07, + "logits/chosen": 1.0573246479034424, + "logits/rejected": 0.7870907187461853, + "logps/accuracies": 0.25, + "logps/chosen": -498.836669921875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -427.01885986328125, + "logps/ref_rejected": -330.6030578613281, + "logps/rejected": -500.70513916015625, + "loss": 0.2106, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.590888023376465, + "rewards/grad_term": 0.0047126878052949905, + "rewards/margins": 4.914216995239258, + "rewards/rejected": -8.505105018615723, + "step": 156 + }, + { + "epoch": 0.33636850562399573, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.421964128197317, + "learning_rate": 6.482100238663484e-07, + "logits/chosen": 0.8880590796470642, + "logits/rejected": 0.7068474888801575, + "logps/accuracies": 0.75, + "logps/chosen": -279.9468994140625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -263.3426513671875, + "logps/ref_rejected": -203.75613403320312, + "logps/rejected": -287.427978515625, + "loss": 0.2597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8302121162414551, + "rewards/grad_term": 0.008612211793661118, + "rewards/margins": 3.353381395339966, + "rewards/rejected": -4.18359375, + "step": 157 + }, + { + "epoch": 0.33851098018211034, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.8752213374271, + "learning_rate": 6.473747016706444e-07, + "logits/chosen": 0.7713953852653503, + "logits/rejected": 0.7244459986686707, + "logps/accuracies": 1.0, + "logps/chosen": -343.670654296875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -289.24114990234375, + "logps/ref_rejected": -304.27593994140625, + "logps/rejected": -411.5351257324219, + "loss": 0.2123, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7214746475219727, + "rewards/grad_term": 0.007066743914037943, + "rewards/margins": 2.641486406326294, + "rewards/rejected": -5.362961292266846, + "step": 158 + }, + { + "epoch": 0.34065345474022496, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.961209126432363, + "learning_rate": 6.465393794749403e-07, + "logits/chosen": 0.4089430570602417, + "logits/rejected": 0.48331207036972046, + "logps/accuracies": 1.0, + "logps/chosen": -227.35739135742188, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -194.3919219970703, + "logps/ref_rejected": -197.94259643554688, + "logps/rejected": -291.18634033203125, + "loss": 0.2271, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.648273229598999, + "rewards/grad_term": 0.007467413786798716, + "rewards/margins": 3.0139148235321045, + "rewards/rejected": -4.6621880531311035, + "step": 159 + }, + { + "epoch": 0.3427959292983396, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.623215056313119, + "learning_rate": 6.457040572792363e-07, + "logits/chosen": 0.7995873093605042, + "logits/rejected": 0.8776368498802185, + "logps/accuracies": 0.75, + "logps/chosen": -625.0630493164062, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -586.06396484375, + "logps/ref_rejected": -436.4676818847656, + "logps/rejected": -507.0144348144531, + "loss": 0.1998, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9499545097351074, + "rewards/grad_term": 0.008956504985690117, + "rewards/margins": 1.5773828029632568, + "rewards/rejected": -3.527337074279785, + "step": 160 + }, + { + "epoch": 0.3449384038564542, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.887366061229715, + "learning_rate": 6.448687350835322e-07, + "logits/chosen": 0.7675126791000366, + "logits/rejected": 0.530870258808136, + "logps/accuracies": 0.5, + "logps/chosen": -353.36944580078125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -317.55523681640625, + "logps/ref_rejected": -319.5533447265625, + "logps/rejected": -376.0810546875, + "loss": 0.2781, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.790710687637329, + "rewards/grad_term": 0.01590893603861332, + "rewards/margins": 1.035674810409546, + "rewards/rejected": -2.826385498046875, + "step": 161 + }, + { + "epoch": 0.3470808784145688, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.945357899951022, + "learning_rate": 6.440334128878281e-07, + "logits/chosen": 0.586737871170044, + "logits/rejected": 0.4655018150806427, + "logps/accuracies": 0.5, + "logps/chosen": -474.0818786621094, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -397.549560546875, + "logps/ref_rejected": -303.9313049316406, + "logps/rejected": -453.242431640625, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.826618194580078, + "rewards/grad_term": 0.005110522732138634, + "rewards/margins": 3.6389386653900146, + "rewards/rejected": -7.465556621551514, + "step": 162 + }, + { + "epoch": 0.3492233529726835, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.421904565149697, + "learning_rate": 6.431980906921241e-07, + "logits/chosen": 0.8405193090438843, + "logits/rejected": 0.9586330652236938, + "logps/accuracies": 0.75, + "logps/chosen": -362.607666015625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -323.2601318359375, + "logps/ref_rejected": -350.115966796875, + "logps/rejected": -414.9684143066406, + "loss": 0.1903, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9673755168914795, + "rewards/grad_term": 0.014036407694220543, + "rewards/margins": 1.2752478122711182, + "rewards/rejected": -3.2426233291625977, + "step": 163 + }, + { + "epoch": 0.3513658275307981, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.31758932737487, + "learning_rate": 6.4236276849642e-07, + "logits/chosen": 0.9069796800613403, + "logits/rejected": 0.8120055198669434, + "logps/accuracies": 0.5, + "logps/chosen": -529.5686645507812, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -436.9043273925781, + "logps/ref_rejected": -385.4817199707031, + "logps/rejected": -571.4048461914062, + "loss": 0.2857, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.633216381072998, + "rewards/grad_term": 0.001609130296856165, + "rewards/margins": 4.662940502166748, + "rewards/rejected": -9.296156883239746, + "step": 164 + }, + { + "epoch": 0.3535083020889127, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 6.7907749776023, + "learning_rate": 6.41527446300716e-07, + "logits/chosen": 0.7551314830780029, + "logits/rejected": 0.6372643709182739, + "logps/accuracies": 0.25, + "logps/chosen": -594.386962890625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -496.920166015625, + "logps/ref_rejected": -391.0464172363281, + "logps/rejected": -584.2574462890625, + "loss": 0.2221, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.873340129852295, + "rewards/grad_term": 0.0031820686999708414, + "rewards/margins": 4.787212371826172, + "rewards/rejected": -9.660552978515625, + "step": 165 + }, + { + "epoch": 0.3556507766470273, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.42834345389666, + "learning_rate": 6.406921241050118e-07, + "logits/chosen": 0.8926582336425781, + "logits/rejected": 0.6021265983581543, + "logps/accuracies": 0.5, + "logps/chosen": -535.192138671875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -454.7031555175781, + "logps/ref_rejected": -355.3533935546875, + "logps/rejected": -487.2066650390625, + "loss": 0.2189, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.024447441101074, + "rewards/grad_term": 0.007553639821708202, + "rewards/margins": 2.56821608543396, + "rewards/rejected": -6.592663764953613, + "step": 166 + }, + { + "epoch": 0.35779325120514194, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.44062714406834, + "learning_rate": 6.398568019093079e-07, + "logits/chosen": 0.9791277647018433, + "logits/rejected": 0.7142946720123291, + "logps/accuracies": 0.5, + "logps/chosen": -324.64971923828125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -303.8039245605469, + "logps/ref_rejected": -202.87620544433594, + "logps/rejected": -278.5081481933594, + "loss": 0.2451, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0422909259796143, + "rewards/grad_term": 0.009761723689734936, + "rewards/margins": 2.7393064498901367, + "rewards/rejected": -3.781597137451172, + "step": 167 + }, + { + "epoch": 0.35993572576325655, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.985736298029532, + "learning_rate": 6.390214797136038e-07, + "logits/chosen": 0.6620911955833435, + "logits/rejected": 0.6493997573852539, + "logps/accuracies": 0.75, + "logps/chosen": -189.7664794921875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -157.066650390625, + "logps/ref_rejected": -141.00611877441406, + "logps/rejected": -213.98069763183594, + "loss": 0.2257, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6349915266036987, + "rewards/grad_term": 0.010831539519131184, + "rewards/margins": 2.013737201690674, + "rewards/rejected": -3.648728370666504, + "step": 168 + }, + { + "epoch": 0.36207820032137117, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.312051247371532, + "learning_rate": 6.381861575178997e-07, + "logits/chosen": 0.9859127998352051, + "logits/rejected": 0.7602252960205078, + "logps/accuracies": 1.0, + "logps/chosen": -367.61090087890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -327.2779541015625, + "logps/ref_rejected": -278.348876953125, + "logps/rejected": -416.47991943359375, + "loss": 0.2197, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.016645908355713, + "rewards/grad_term": 0.005300410091876984, + "rewards/margins": 4.88990592956543, + "rewards/rejected": -6.906552314758301, + "step": 169 + }, + { + "epoch": 0.3642206748794858, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.642763111324385, + "learning_rate": 6.373508353221956e-07, + "logits/chosen": 0.8475183248519897, + "logits/rejected": 0.7893280386924744, + "logps/accuracies": 1.0, + "logps/chosen": -466.7734375, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -354.476806640625, + "logps/ref_rejected": -379.0105285644531, + "logps/rejected": -591.1034545898438, + "loss": 0.2312, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.614831924438477, + "rewards/grad_term": 0.0010664674919098616, + "rewards/margins": 4.989813804626465, + "rewards/rejected": -10.604645729064941, + "step": 170 + }, + { + "epoch": 0.36636314943760045, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.565545454910559, + "learning_rate": 6.365155131264916e-07, + "logits/chosen": 0.7403796911239624, + "logits/rejected": 0.6863211393356323, + "logps/accuracies": 0.5, + "logps/chosen": -413.04949951171875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -341.1173400878906, + "logps/ref_rejected": -277.3340148925781, + "logps/rejected": -390.7886047363281, + "loss": 0.2088, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.596607208251953, + "rewards/grad_term": 0.006689072586596012, + "rewards/margins": 2.076122283935547, + "rewards/rejected": -5.672729015350342, + "step": 171 + }, + { + "epoch": 0.36850562399571507, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.792718661583799, + "learning_rate": 6.356801909307876e-07, + "logits/chosen": 1.0259259939193726, + "logits/rejected": 0.874252438545227, + "logps/accuracies": 0.75, + "logps/chosen": -411.4170837402344, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -338.65203857421875, + "logps/ref_rejected": -322.8931579589844, + "logps/rejected": -444.8133544921875, + "loss": 0.2075, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.638251543045044, + "rewards/grad_term": 0.00537948589771986, + "rewards/margins": 2.4577579498291016, + "rewards/rejected": -6.096009254455566, + "step": 172 + }, + { + "epoch": 0.3706480985538297, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.152790670378876, + "learning_rate": 6.348448687350834e-07, + "logits/chosen": 0.9457736015319824, + "logits/rejected": 0.7330727577209473, + "logps/accuracies": 0.5, + "logps/chosen": -449.04766845703125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -385.5958251953125, + "logps/ref_rejected": -290.0688171386719, + "logps/rejected": -419.5983581542969, + "loss": 0.2484, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1725926399230957, + "rewards/grad_term": 0.004974587354809046, + "rewards/margins": 3.3038861751556396, + "rewards/rejected": -6.476478576660156, + "step": 173 + }, + { + "epoch": 0.3727905731119443, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.514176538085659, + "learning_rate": 6.340095465393795e-07, + "logits/chosen": 0.6537495851516724, + "logits/rejected": 0.8065310716629028, + "logps/accuracies": 0.75, + "logps/chosen": -366.6497802734375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -316.4726867675781, + "logps/ref_rejected": -318.7170715332031, + "logps/rejected": -415.7598876953125, + "loss": 0.2316, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.508854389190674, + "rewards/grad_term": 0.017329072579741478, + "rewards/margins": 2.343287467956543, + "rewards/rejected": -4.852141857147217, + "step": 174 + }, + { + "epoch": 0.3749330476700589, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.348177913147394, + "learning_rate": 6.331742243436754e-07, + "logits/chosen": 0.8572670221328735, + "logits/rejected": 0.791947603225708, + "logps/accuracies": 0.75, + "logps/chosen": -487.83331298828125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -378.57891845703125, + "logps/ref_rejected": -352.18182373046875, + "logps/rejected": -550.4212646484375, + "loss": 0.1988, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4627180099487305, + "rewards/grad_term": 0.0037125912494957447, + "rewards/margins": 4.449254989624023, + "rewards/rejected": -9.911972045898438, + "step": 175 + }, + { + "epoch": 0.37707552222817353, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.515808366477856, + "learning_rate": 6.323389021479714e-07, + "logits/chosen": 0.8945661783218384, + "logits/rejected": 0.6957411170005798, + "logps/accuracies": 0.75, + "logps/chosen": -261.3265380859375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -211.19613647460938, + "logps/ref_rejected": -165.06219482421875, + "logps/rejected": -267.1302795410156, + "loss": 0.2924, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5065195560455322, + "rewards/grad_term": 0.011427883058786392, + "rewards/margins": 2.5968849658966064, + "rewards/rejected": -5.103404521942139, + "step": 176 + }, + { + "epoch": 0.37921799678628815, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.786001249289741, + "learning_rate": 6.315035799522672e-07, + "logits/chosen": 0.8739601969718933, + "logits/rejected": 0.669786274433136, + "logps/accuracies": 0.5, + "logps/chosen": -329.546142578125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -280.71832275390625, + "logps/ref_rejected": -243.82247924804688, + "logps/rejected": -331.6929931640625, + "loss": 0.3338, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.441390037536621, + "rewards/grad_term": 0.007867410778999329, + "rewards/margins": 1.952134609222412, + "rewards/rejected": -4.393524646759033, + "step": 177 + }, + { + "epoch": 0.38136047134440276, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.208154660206382, + "learning_rate": 6.306682577565633e-07, + "logits/chosen": 0.9574888944625854, + "logits/rejected": 0.8498983979225159, + "logps/accuracies": 1.0, + "logps/chosen": -543.029052734375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -424.92236328125, + "logps/ref_rejected": -352.2156066894531, + "logps/rejected": -606.2493286132812, + "loss": 0.182, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.90533447265625, + "rewards/grad_term": 8.483060082653537e-05, + "rewards/margins": 6.796352386474609, + "rewards/rejected": -12.70168685913086, + "step": 178 + }, + { + "epoch": 0.38350294590251743, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.640876069823683, + "learning_rate": 6.298329355608592e-07, + "logits/chosen": 0.9586235284805298, + "logits/rejected": 0.8088182210922241, + "logps/accuracies": 0.75, + "logps/chosen": -398.8780212402344, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -298.6359558105469, + "logps/ref_rejected": -262.9993896484375, + "logps/rejected": -415.80419921875, + "loss": 0.196, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.012104034423828, + "rewards/grad_term": 0.006371453404426575, + "rewards/margins": 2.6281375885009766, + "rewards/rejected": -7.640241622924805, + "step": 179 + }, + { + "epoch": 0.38564542046063205, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.553393120180624, + "learning_rate": 6.289976133651551e-07, + "logits/chosen": 1.0224734544754028, + "logits/rejected": 0.756576657295227, + "logps/accuracies": 0.75, + "logps/chosen": -353.6854553222656, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -302.3289794921875, + "logps/ref_rejected": -237.62245178222656, + "logps/rejected": -358.02716064453125, + "loss": 0.2158, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5678231716156006, + "rewards/grad_term": 0.005727603565901518, + "rewards/margins": 3.452413558959961, + "rewards/rejected": -6.020236492156982, + "step": 180 + }, + { + "epoch": 0.38778789501874666, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.447493546427074, + "learning_rate": 6.28162291169451e-07, + "logits/chosen": 1.0232813358306885, + "logits/rejected": 0.7466151714324951, + "logps/accuracies": 0.5, + "logps/chosen": -400.76641845703125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -331.197998046875, + "logps/ref_rejected": -238.2156524658203, + "logps/rejected": -412.0185546875, + "loss": 0.2115, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.478421449661255, + "rewards/grad_term": 0.0015466721961274743, + "rewards/margins": 5.211723804473877, + "rewards/rejected": -8.690145492553711, + "step": 181 + }, + { + "epoch": 0.3899303695768613, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.697644696654893, + "learning_rate": 6.27326968973747e-07, + "logits/chosen": 0.9457991123199463, + "logits/rejected": 0.7241038084030151, + "logps/accuracies": 0.5, + "logps/chosen": -442.8717041015625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -379.5626525878906, + "logps/ref_rejected": -298.47650146484375, + "logps/rejected": -431.15509033203125, + "loss": 0.2081, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.165452480316162, + "rewards/grad_term": 0.008516497910022736, + "rewards/margins": 3.4684762954711914, + "rewards/rejected": -6.6339287757873535, + "step": 182 + }, + { + "epoch": 0.3920728441349759, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.25, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.8108725640856, + "learning_rate": 6.26491646778043e-07, + "logits/chosen": 1.0096490383148193, + "logits/rejected": 0.8373015522956848, + "logps/accuracies": 0.75, + "logps/chosen": -351.55303955078125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -287.40643310546875, + "logps/ref_rejected": -275.7420654296875, + "logps/rejected": -407.6207275390625, + "loss": 0.2152, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2073283195495605, + "rewards/grad_term": 0.011961029842495918, + "rewards/margins": 3.3866024017333984, + "rewards/rejected": -6.593931198120117, + "step": 183 + }, + { + "epoch": 0.3942153186930905, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.474698775172937, + "learning_rate": 6.256563245823388e-07, + "logits/chosen": 0.6665371060371399, + "logits/rejected": 0.6652243137359619, + "logps/accuracies": 0.75, + "logps/chosen": -505.3188171386719, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -441.2266845703125, + "logps/ref_rejected": -416.076904296875, + "logps/rejected": -567.68310546875, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.204606056213379, + "rewards/grad_term": 0.001420854590833187, + "rewards/margins": 4.375702857971191, + "rewards/rejected": -7.58030891418457, + "step": 184 + }, + { + "epoch": 0.3963577932512051, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.943275839760174, + "learning_rate": 6.248210023866348e-07, + "logits/chosen": 0.9953018426895142, + "logits/rejected": 0.795741617679596, + "logps/accuracies": 0.5, + "logps/chosen": -452.40887451171875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -384.6745910644531, + "logps/ref_rejected": -307.6947326660156, + "logps/rejected": -431.2263488769531, + "loss": 0.2104, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.386714220046997, + "rewards/grad_term": 0.0036700021009892225, + "rewards/margins": 2.7898666858673096, + "rewards/rejected": -6.176580905914307, + "step": 185 + }, + { + "epoch": 0.39850026780931974, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.413011794039116, + "learning_rate": 6.239856801909308e-07, + "logits/chosen": 0.6716040968894958, + "logits/rejected": 0.619194507598877, + "logps/accuracies": 0.5, + "logps/chosen": -467.7266845703125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -381.2197570800781, + "logps/ref_rejected": -290.17254638671875, + "logps/rejected": -458.6622314453125, + "loss": 0.1671, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.325344562530518, + "rewards/grad_term": 0.004242981784045696, + "rewards/margins": 4.0991411209106445, + "rewards/rejected": -8.424485206604004, + "step": 186 + }, + { + "epoch": 0.4006427423674344, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.119460917704207, + "learning_rate": 6.231503579952267e-07, + "logits/chosen": 0.8225597143173218, + "logits/rejected": 0.6662357449531555, + "logps/accuracies": 0.5, + "logps/chosen": -469.7325439453125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -364.77740478515625, + "logps/ref_rejected": -268.07843017578125, + "logps/rejected": -472.31781005859375, + "loss": 0.1839, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.247758388519287, + "rewards/grad_term": 0.006101151462644339, + "rewards/margins": 4.964210033416748, + "rewards/rejected": -10.211968421936035, + "step": 187 + }, + { + "epoch": 0.40278521692554903, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 10.046352079716472, + "learning_rate": 6.223150357995226e-07, + "logits/chosen": 0.8952844142913818, + "logits/rejected": 0.6202220916748047, + "logps/accuracies": 1.0, + "logps/chosen": -472.1369323730469, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -377.0249938964844, + "logps/ref_rejected": -345.5789489746094, + "logps/rejected": -568.91357421875, + "loss": 0.2002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.755597114562988, + "rewards/grad_term": 0.0011726694647222757, + "rewards/margins": 6.411135673522949, + "rewards/rejected": -11.166732788085938, + "step": 188 + }, + { + "epoch": 0.40492769148366364, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.25, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.37154123816891, + "learning_rate": 6.214797136038185e-07, + "logits/chosen": 0.8080844879150391, + "logits/rejected": 0.843936026096344, + "logps/accuracies": 0.25, + "logps/chosen": -305.34619140625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -231.69813537597656, + "logps/ref_rejected": -215.05422973632812, + "logps/rejected": -299.5094299316406, + "loss": 0.2138, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.682403087615967, + "rewards/grad_term": 0.020120887085795403, + "rewards/margins": 0.5403570532798767, + "rewards/rejected": -4.222760200500488, + "step": 189 + }, + { + "epoch": 0.40707016604177826, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.174975913775362, + "learning_rate": 6.206443914081146e-07, + "logits/chosen": 0.7329360246658325, + "logits/rejected": 0.8288344144821167, + "logps/accuracies": 0.5, + "logps/chosen": -461.33984375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -360.3821105957031, + "logps/ref_rejected": -286.70745849609375, + "logps/rejected": -451.14569091796875, + "loss": 0.2014, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.047886848449707, + "rewards/grad_term": 0.013596764765679836, + "rewards/margins": 3.1740236282348633, + "rewards/rejected": -8.22191047668457, + "step": 190 + }, + { + "epoch": 0.4092126405998929, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.272281750209924, + "learning_rate": 6.198090692124104e-07, + "logits/chosen": 0.8074694275856018, + "logits/rejected": 0.7827705144882202, + "logps/accuracies": 0.5, + "logps/chosen": -419.7196960449219, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -305.22576904296875, + "logps/ref_rejected": -257.1751708984375, + "logps/rejected": -473.9278564453125, + "loss": 0.1775, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.724696159362793, + "rewards/grad_term": 0.008914883248507977, + "rewards/margins": 5.112937927246094, + "rewards/rejected": -10.837634086608887, + "step": 191 + }, + { + "epoch": 0.4113551151580075, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.921800868376472, + "learning_rate": 6.189737470167064e-07, + "logits/chosen": 0.8224954009056091, + "logits/rejected": 0.7576948404312134, + "logps/accuracies": 0.5, + "logps/chosen": -276.3191833496094, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -228.04379272460938, + "logps/ref_rejected": -168.54588317871094, + "logps/rejected": -262.92254638671875, + "loss": 0.221, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4137697219848633, + "rewards/grad_term": 0.011381752789020538, + "rewards/margins": 2.305063247680664, + "rewards/rejected": -4.718832969665527, + "step": 192 + }, + { + "epoch": 0.4113551151580075, + "eval_flips/correct->correct": 0.1599999964237213, + "eval_flips/correct->incorrect": 0.0, + "eval_flips/incorrect->correct": 0.3199999928474426, + "eval_flips/incorrect->incorrect": 0.5199999809265137, + "eval_logits/chosen": 0.8205481767654419, + "eval_logits/rejected": 0.7034481763839722, + "eval_logps/accuracies": 0.47999998927116394, + "eval_logps/chosen": -390.3917541503906, + "eval_logps/ref_accuracies": 0.1599999964237213, + "eval_logps/ref_chosen": -323.51568603515625, + "eval_logps/ref_rejected": -258.70098876953125, + "eval_logps/rejected": -389.7598876953125, + "eval_loss": 0.23252426087856293, + "eval_rewards/accuracies": 0.8600000143051147, + "eval_rewards/chosen": -3.3438057899475098, + "eval_rewards/grad_term": 0.009308630600571632, + "eval_rewards/margins": 3.2091403007507324, + "eval_rewards/rejected": -6.5529465675354, + "eval_runtime": 375.2407, + "eval_samples_per_second": 4.211, + "eval_steps_per_second": 0.133, + "step": 192 + }, + { + "epoch": 0.4134975897161221, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 12.176142217784117, + "learning_rate": 6.181384248210024e-07, + "logits/chosen": 0.9473562836647034, + "logits/rejected": 0.800918698310852, + "logps/accuracies": 1.0, + "logps/chosen": -362.5771484375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -297.9696350097656, + "logps/ref_rejected": -254.08636474609375, + "logps/rejected": -451.26666259765625, + "loss": 0.2425, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2303762435913086, + "rewards/grad_term": 0.0030712243169546127, + "rewards/margins": 6.628638744354248, + "rewards/rejected": -9.859014511108398, + "step": 193 + }, + { + "epoch": 0.4156400642742367, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.97633567390224, + "learning_rate": 6.173031026252983e-07, + "logits/chosen": 0.9676415324211121, + "logits/rejected": 0.9164015054702759, + "logps/accuracies": 0.5, + "logps/chosen": -323.4644470214844, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -252.95120239257812, + "logps/ref_rejected": -255.23983764648438, + "logps/rejected": -357.87921142578125, + "loss": 0.243, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.525662422180176, + "rewards/grad_term": 0.009022894315421581, + "rewards/margins": 1.606306791305542, + "rewards/rejected": -5.131969451904297, + "step": 194 + }, + { + "epoch": 0.4177825388323514, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.766543352301676, + "learning_rate": 6.164677804295942e-07, + "logits/chosen": 0.7897940278053284, + "logits/rejected": 0.8066399097442627, + "logps/accuracies": 0.5, + "logps/chosen": -236.746337890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -197.38284301757812, + "logps/ref_rejected": -189.28884887695312, + "logps/rejected": -234.21102905273438, + "loss": 0.1805, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9681750535964966, + "rewards/grad_term": 0.022014902904629707, + "rewards/margins": 0.2779344618320465, + "rewards/rejected": -2.2461094856262207, + "step": 195 + }, + { + "epoch": 0.419925013390466, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.405842493569402, + "learning_rate": 6.156324582338901e-07, + "logits/chosen": 1.0093345642089844, + "logits/rejected": 0.8334782719612122, + "logps/accuracies": 0.75, + "logps/chosen": -445.7866516113281, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -337.7933349609375, + "logps/ref_rejected": -296.8775329589844, + "logps/rejected": -510.7151794433594, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.399665832519531, + "rewards/grad_term": 0.0011895910138264298, + "rewards/margins": 5.2922163009643555, + "rewards/rejected": -10.69188117980957, + "step": 196 + }, + { + "epoch": 0.4220674879485806, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.920133033788117, + "learning_rate": 6.147971360381862e-07, + "logits/chosen": 0.787909209728241, + "logits/rejected": 0.75300532579422, + "logps/accuracies": 0.5, + "logps/chosen": -440.1323547363281, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -368.47412109375, + "logps/ref_rejected": -353.4744873046875, + "logps/rejected": -485.1199951171875, + "loss": 0.2506, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5829124450683594, + "rewards/grad_term": 0.012279342859983444, + "rewards/margins": 2.9993643760681152, + "rewards/rejected": -6.582277297973633, + "step": 197 + }, + { + "epoch": 0.42420996250669524, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.419878913528332, + "learning_rate": 6.13961813842482e-07, + "logits/chosen": 1.1179535388946533, + "logits/rejected": 0.9940972924232483, + "logps/accuracies": 1.0, + "logps/chosen": -321.8353271484375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -260.9544372558594, + "logps/ref_rejected": -232.75350952148438, + "logps/rejected": -376.96441650390625, + "loss": 0.1569, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0440444946289062, + "rewards/grad_term": 0.007131978403776884, + "rewards/margins": 4.166501045227051, + "rewards/rejected": -7.210545539855957, + "step": 198 + }, + { + "epoch": 0.42635243706480985, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.76694640949059, + "learning_rate": 6.13126491646778e-07, + "logits/chosen": 0.9783276915550232, + "logits/rejected": 0.6347091794013977, + "logps/accuracies": 0.5, + "logps/chosen": -333.9166564941406, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -295.6625671386719, + "logps/ref_rejected": -178.05490112304688, + "logps/rejected": -262.8484191894531, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9127050638198853, + "rewards/grad_term": 0.009310072287917137, + "rewards/margins": 2.3269693851470947, + "rewards/rejected": -4.2396745681762695, + "step": 199 + }, + { + "epoch": 0.42849491162292447, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.871363821825847, + "learning_rate": 6.122911694510739e-07, + "logits/chosen": 0.713058590888977, + "logits/rejected": 0.7966564893722534, + "logps/accuracies": 1.0, + "logps/chosen": -270.65655517578125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -239.4276580810547, + "logps/ref_rejected": -272.2807312011719, + "logps/rejected": -361.95782470703125, + "loss": 0.1946, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.561445713043213, + "rewards/grad_term": 0.006638450548052788, + "rewards/margins": 2.9224092960357666, + "rewards/rejected": -4.483855247497559, + "step": 200 + }, + { + "epoch": 0.4306373861810391, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.189710382812372, + "learning_rate": 6.1145584725537e-07, + "logits/chosen": 0.8107240200042725, + "logits/rejected": 0.6742185950279236, + "logps/accuracies": 0.75, + "logps/chosen": -461.14404296875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -375.26239013671875, + "logps/ref_rejected": -366.937744140625, + "logps/rejected": -556.6005859375, + "loss": 0.1953, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2940826416015625, + "rewards/grad_term": 0.002399621531367302, + "rewards/margins": 5.189059734344482, + "rewards/rejected": -9.483142852783203, + "step": 201 + }, + { + "epoch": 0.4327798607391537, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.25, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.981730687281297, + "learning_rate": 6.106205250596658e-07, + "logits/chosen": 0.8751631379127502, + "logits/rejected": 0.8072733879089355, + "logps/accuracies": 0.75, + "logps/chosen": -323.69873046875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -253.70254516601562, + "logps/ref_rejected": -235.48675537109375, + "logps/rejected": -390.3155517578125, + "loss": 0.2163, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4998087882995605, + "rewards/grad_term": 0.009065371006727219, + "rewards/margins": 4.241631031036377, + "rewards/rejected": -7.741440296173096, + "step": 202 + }, + { + "epoch": 0.43492233529726837, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.322543837095315, + "learning_rate": 6.097852028639618e-07, + "logits/chosen": 0.8359104990959167, + "logits/rejected": 0.7406368255615234, + "logps/accuracies": 1.0, + "logps/chosen": -576.8265380859375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -415.6806335449219, + "logps/ref_rejected": -388.8533935546875, + "logps/rejected": -631.3206787109375, + "loss": 0.2296, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.057294845581055, + "rewards/grad_term": 0.004110483452677727, + "rewards/margins": 4.066068649291992, + "rewards/rejected": -12.123363494873047, + "step": 203 + }, + { + "epoch": 0.437064809855383, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.923150092849891, + "learning_rate": 6.089498806682577e-07, + "logits/chosen": 0.9337953329086304, + "logits/rejected": 0.5036557912826538, + "logps/accuracies": 0.75, + "logps/chosen": -321.8887939453125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -268.6906433105469, + "logps/ref_rejected": -248.19271850585938, + "logps/rejected": -372.2651062011719, + "loss": 0.2665, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6599063873291016, + "rewards/grad_term": 0.006289066281169653, + "rewards/margins": 3.543713092803955, + "rewards/rejected": -6.203619480133057, + "step": 204 + }, + { + "epoch": 0.4392072844134976, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.602466187696242, + "learning_rate": 6.081145584725537e-07, + "logits/chosen": 0.799699604511261, + "logits/rejected": 0.7706651091575623, + "logps/accuracies": 1.0, + "logps/chosen": -431.94512939453125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -358.3432312011719, + "logps/ref_rejected": -336.4741516113281, + "logps/rejected": -577.3936157226562, + "loss": 0.2874, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6800947189331055, + "rewards/grad_term": 0.0002930064802058041, + "rewards/margins": 8.365878105163574, + "rewards/rejected": -12.04597282409668, + "step": 205 + }, + { + "epoch": 0.4413497589716122, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.272614970451562, + "learning_rate": 6.072792362768496e-07, + "logits/chosen": 0.9661082625389099, + "logits/rejected": 0.8308844566345215, + "logps/accuracies": 0.75, + "logps/chosen": -429.0587158203125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -331.1964111328125, + "logps/ref_rejected": -307.45654296875, + "logps/rejected": -492.7054138183594, + "loss": 0.1984, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.8931169509887695, + "rewards/grad_term": 0.0081776799634099, + "rewards/margins": 4.369326114654541, + "rewards/rejected": -9.262442588806152, + "step": 206 + }, + { + "epoch": 0.44349223352972683, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.721689104729112, + "learning_rate": 6.064439140811455e-07, + "logits/chosen": 0.9881528615951538, + "logits/rejected": 0.8519094586372375, + "logps/accuracies": 0.5, + "logps/chosen": -221.66915893554688, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -187.01608276367188, + "logps/ref_rejected": -149.39364624023438, + "logps/rejected": -215.79965209960938, + "loss": 0.2711, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.732654094696045, + "rewards/grad_term": 0.008831696584820747, + "rewards/margins": 1.587646245956421, + "rewards/rejected": -3.320300340652466, + "step": 207 + }, + { + "epoch": 0.44563470808784145, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.793737716453558, + "learning_rate": 6.056085918854416e-07, + "logits/chosen": 1.012654185295105, + "logits/rejected": 0.9926575422286987, + "logps/accuracies": 0.75, + "logps/chosen": -402.0953369140625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -353.88946533203125, + "logps/ref_rejected": -340.03839111328125, + "logps/rejected": -436.39117431640625, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4102942943573, + "rewards/grad_term": 0.004451955668628216, + "rewards/margins": 2.4073448181152344, + "rewards/rejected": -4.817638874053955, + "step": 208 + }, + { + "epoch": 0.44777718264595606, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.289495153079532, + "learning_rate": 6.047732696897374e-07, + "logits/chosen": 0.8617650270462036, + "logits/rejected": 0.6589657068252563, + "logps/accuracies": 0.5, + "logps/chosen": -439.2054443359375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -381.7887268066406, + "logps/ref_rejected": -318.25933837890625, + "logps/rejected": -461.6387634277344, + "loss": 0.171, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.870835304260254, + "rewards/grad_term": 0.004674965050071478, + "rewards/margins": 4.2981367111206055, + "rewards/rejected": -7.168972015380859, + "step": 209 + }, + { + "epoch": 0.4499196572040707, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.537792356133702, + "learning_rate": 6.039379474940334e-07, + "logits/chosen": 0.9008455276489258, + "logits/rejected": 0.6296284198760986, + "logps/accuracies": 0.5, + "logps/chosen": -330.61273193359375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -269.29754638671875, + "logps/ref_rejected": -172.04388427734375, + "logps/rejected": -298.81842041015625, + "loss": 0.196, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.065760612487793, + "rewards/grad_term": 0.009436404332518578, + "rewards/margins": 3.272966146469116, + "rewards/rejected": -6.338726997375488, + "step": 210 + }, + { + "epoch": 0.45206213176218535, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.509811900422134, + "learning_rate": 6.031026252983293e-07, + "logits/chosen": 0.9384500980377197, + "logits/rejected": 0.7233452200889587, + "logps/accuracies": 0.5, + "logps/chosen": -262.9141845703125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -203.38661193847656, + "logps/ref_rejected": -200.27188110351562, + "logps/rejected": -254.28463745117188, + "loss": 0.2868, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.976378917694092, + "rewards/grad_term": 0.027324385941028595, + "rewards/margins": -0.275741308927536, + "rewards/rejected": -2.7006378173828125, + "step": 211 + }, + { + "epoch": 0.45420460632029996, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.335767333746587, + "learning_rate": 6.022673031026253e-07, + "logits/chosen": 0.8327051401138306, + "logits/rejected": 0.7020426988601685, + "logps/accuracies": 0.75, + "logps/chosen": -380.3292236328125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -343.1072998046875, + "logps/ref_rejected": -288.2660217285156, + "logps/rejected": -439.3631591796875, + "loss": 0.1867, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8610966205596924, + "rewards/grad_term": 0.005783412139862776, + "rewards/margins": 5.693758964538574, + "rewards/rejected": -7.554856300354004, + "step": 212 + }, + { + "epoch": 0.4563470808784146, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.086663858024284, + "learning_rate": 6.014319809069212e-07, + "logits/chosen": 0.8908300995826721, + "logits/rejected": 0.8858309984207153, + "logps/accuracies": 0.5, + "logps/chosen": -195.6943359375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -161.84103393554688, + "logps/ref_rejected": -157.9658966064453, + "logps/rejected": -229.5589141845703, + "loss": 0.1993, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6926652193069458, + "rewards/grad_term": 0.008458103984594345, + "rewards/margins": 1.8869857788085938, + "rewards/rejected": -3.57965087890625, + "step": 213 + }, + { + "epoch": 0.4584895554365292, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.664284257646828, + "learning_rate": 6.005966587112171e-07, + "logits/chosen": 0.7465036511421204, + "logits/rejected": 0.6328434348106384, + "logps/accuracies": 0.25, + "logps/chosen": -285.3111267089844, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -251.16213989257812, + "logps/ref_rejected": -238.5145721435547, + "logps/rejected": -321.26043701171875, + "loss": 0.1983, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7074486017227173, + "rewards/grad_term": 0.007980713620781898, + "rewards/margins": 2.4298453330993652, + "rewards/rejected": -4.137293815612793, + "step": 214 + }, + { + "epoch": 0.4606320299946438, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.440221701148605, + "learning_rate": 5.997613365155131e-07, + "logits/chosen": 0.44053223729133606, + "logits/rejected": 0.45330262184143066, + "logps/accuracies": 0.5, + "logps/chosen": -552.835205078125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -504.29376220703125, + "logps/ref_rejected": -250.1544189453125, + "logps/rejected": -491.84893798828125, + "loss": 0.1717, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4270737171173096, + "rewards/grad_term": 0.0030039078556001186, + "rewards/margins": 9.657651901245117, + "rewards/rejected": -12.084726333618164, + "step": 215 + }, + { + "epoch": 0.4627745045527584, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.9857967128079475, + "learning_rate": 5.989260143198091e-07, + "logits/chosen": 0.7373754978179932, + "logits/rejected": 0.7481766939163208, + "logps/accuracies": 0.5, + "logps/chosen": -352.7110290527344, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -312.5104675292969, + "logps/ref_rejected": -294.700439453125, + "logps/rejected": -409.80865478515625, + "loss": 0.1965, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.010028600692749, + "rewards/grad_term": 0.008961998857557774, + "rewards/margins": 3.7453832626342773, + "rewards/rejected": -5.7554121017456055, + "step": 216 + }, + { + "epoch": 0.46491697911087304, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.188711876243962, + "learning_rate": 5.98090692124105e-07, + "logits/chosen": 0.9295454025268555, + "logits/rejected": 0.6259232759475708, + "logps/accuracies": 0.5, + "logps/chosen": -415.08636474609375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -358.3822021484375, + "logps/ref_rejected": -257.28253173828125, + "logps/rejected": -432.2508544921875, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8352086544036865, + "rewards/grad_term": 0.00315161794424057, + "rewards/margins": 5.913206577301025, + "rewards/rejected": -8.748414993286133, + "step": 217 + }, + { + "epoch": 0.46705945366898766, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.492749863968397, + "learning_rate": 5.972553699284009e-07, + "logits/chosen": 0.9154873490333557, + "logits/rejected": 0.7640275955200195, + "logps/accuracies": 0.5, + "logps/chosen": -264.8331604003906, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -208.0253448486328, + "logps/ref_rejected": -161.46011352539062, + "logps/rejected": -261.93133544921875, + "loss": 0.2141, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8403897285461426, + "rewards/grad_term": 0.009426168166100979, + "rewards/margins": 2.183171510696411, + "rewards/rejected": -5.023561477661133, + "step": 218 + }, + { + "epoch": 0.4692019282271023, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.503071502702135, + "learning_rate": 5.96420047732697e-07, + "logits/chosen": 0.7595028281211853, + "logits/rejected": 0.800428032875061, + "logps/accuracies": 1.0, + "logps/chosen": -582.7049560546875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -475.7439270019531, + "logps/ref_rejected": -485.4010009765625, + "logps/rejected": -718.46337890625, + "loss": 0.2029, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.348053932189941, + "rewards/grad_term": 0.0018055308610200882, + "rewards/margins": 6.3050642013549805, + "rewards/rejected": -11.653118133544922, + "step": 219 + }, + { + "epoch": 0.47134440278521694, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.843688764755232, + "learning_rate": 5.955847255369928e-07, + "logits/chosen": 0.773861825466156, + "logits/rejected": 0.7292585372924805, + "logps/accuracies": 0.75, + "logps/chosen": -431.9044494628906, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -366.72967529296875, + "logps/ref_rejected": -330.95587158203125, + "logps/rejected": -468.05487060546875, + "loss": 0.1818, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2587406635284424, + "rewards/grad_term": 0.006896655540913343, + "rewards/margins": 3.59621000289917, + "rewards/rejected": -6.854950904846191, + "step": 220 + }, + { + "epoch": 0.47348687734333156, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.079186736183265, + "learning_rate": 5.947494033412888e-07, + "logits/chosen": 0.9851402044296265, + "logits/rejected": 0.7796863317489624, + "logps/accuracies": 0.75, + "logps/chosen": -564.6864624023438, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -508.427734375, + "logps/ref_rejected": -418.2183837890625, + "logps/rejected": -642.1084594726562, + "loss": 0.2627, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8129372596740723, + "rewards/grad_term": 0.00010136763739865273, + "rewards/margins": 8.381568908691406, + "rewards/rejected": -11.19450569152832, + "step": 221 + }, + { + "epoch": 0.4756293519014462, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.596915507884203, + "learning_rate": 5.939140811455847e-07, + "logits/chosen": 0.9327103495597839, + "logits/rejected": 0.7600051164627075, + "logps/accuracies": 0.25, + "logps/chosen": -458.21246337890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -412.7664489746094, + "logps/ref_rejected": -339.1446533203125, + "logps/rejected": -441.3128662109375, + "loss": 0.2477, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2723002433776855, + "rewards/grad_term": 0.0055382088758051395, + "rewards/margins": 2.8361098766326904, + "rewards/rejected": -5.108409881591797, + "step": 222 + }, + { + "epoch": 0.4777718264595608, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.540065404325883, + "learning_rate": 5.930787589498806e-07, + "logits/chosen": 0.6721053123474121, + "logits/rejected": 0.5771878957748413, + "logps/accuracies": 0.5, + "logps/chosen": -351.98284912109375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -314.6383056640625, + "logps/ref_rejected": -232.02334594726562, + "logps/rejected": -344.7238464355469, + "loss": 0.1544, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.867226243019104, + "rewards/grad_term": 0.00492095947265625, + "rewards/margins": 3.7677993774414062, + "rewards/rejected": -5.635025978088379, + "step": 223 + }, + { + "epoch": 0.4799143010176754, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 6.681718039597231, + "learning_rate": 5.922434367541766e-07, + "logits/chosen": 0.9633818864822388, + "logits/rejected": 0.7339221239089966, + "logps/accuracies": 0.25, + "logps/chosen": -494.77099609375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -442.48052978515625, + "logps/ref_rejected": -331.5393981933594, + "logps/rejected": -480.627685546875, + "loss": 0.1871, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6145222187042236, + "rewards/grad_term": 0.000996602582745254, + "rewards/margins": 4.839890480041504, + "rewards/rejected": -7.454412937164307, + "step": 224 + }, + { + "epoch": 0.48205677557579, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.005924963769166, + "learning_rate": 5.914081145584725e-07, + "logits/chosen": 0.8806890845298767, + "logits/rejected": 0.6015447974205017, + "logps/accuracies": 0.25, + "logps/chosen": -366.54046630859375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -308.4888000488281, + "logps/ref_rejected": -246.00994873046875, + "logps/rejected": -355.2261047363281, + "loss": 0.1685, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.902583599090576, + "rewards/grad_term": 0.0076670260168612, + "rewards/margins": 2.558225631713867, + "rewards/rejected": -5.460808753967285, + "step": 225 + }, + { + "epoch": 0.48419925013390464, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.189114548174568, + "learning_rate": 5.905727923627685e-07, + "logits/chosen": 0.8432016968727112, + "logits/rejected": 0.4910334646701813, + "logps/accuracies": 0.5, + "logps/chosen": -529.419677734375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -399.1881103515625, + "logps/ref_rejected": -216.96324157714844, + "logps/rejected": -383.4134216308594, + "loss": 0.2154, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.511577606201172, + "rewards/grad_term": 0.013775240629911423, + "rewards/margins": 1.8109302520751953, + "rewards/rejected": -8.32250690460205, + "step": 226 + }, + { + "epoch": 0.4863417246920193, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.812465251320808, + "learning_rate": 5.897374701670644e-07, + "logits/chosen": 0.9737125039100647, + "logits/rejected": 0.8655239939689636, + "logps/accuracies": 0.75, + "logps/chosen": -478.7800598144531, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -391.8121643066406, + "logps/ref_rejected": -331.19952392578125, + "logps/rejected": -510.9104309082031, + "loss": 0.1759, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.348394393920898, + "rewards/grad_term": 0.0018389918841421604, + "rewards/margins": 4.637151718139648, + "rewards/rejected": -8.985546112060547, + "step": 227 + }, + { + "epoch": 0.4884841992501339, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.807007302887406, + "learning_rate": 5.889021479713604e-07, + "logits/chosen": 0.5967141389846802, + "logits/rejected": 0.588777482509613, + "logps/accuracies": 0.75, + "logps/chosen": -175.18948364257812, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -160.39163208007812, + "logps/ref_rejected": -131.2773895263672, + "logps/rejected": -182.36181640625, + "loss": 0.1968, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7398918867111206, + "rewards/grad_term": 0.008824177086353302, + "rewards/margins": 1.8143287897109985, + "rewards/rejected": -2.554220676422119, + "step": 228 + }, + { + "epoch": 0.49062667380824854, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.665149636062972, + "learning_rate": 5.880668257756563e-07, + "logits/chosen": 0.9138537645339966, + "logits/rejected": 0.8063441514968872, + "logps/accuracies": 0.5, + "logps/chosen": -377.0699768066406, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -307.6856384277344, + "logps/ref_rejected": -232.5096435546875, + "logps/rejected": -368.8970031738281, + "loss": 0.1393, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4692177772521973, + "rewards/grad_term": 0.00434906966984272, + "rewards/margins": 3.3501501083374023, + "rewards/rejected": -6.819368362426758, + "step": 229 + }, + { + "epoch": 0.49276914836636315, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.623147163579497, + "learning_rate": 5.872315035799522e-07, + "logits/chosen": 0.8319353461265564, + "logits/rejected": 0.7092019319534302, + "logps/accuracies": 0.5, + "logps/chosen": -471.31524658203125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -425.01544189453125, + "logps/ref_rejected": -352.10040283203125, + "logps/rejected": -474.41741943359375, + "loss": 0.1697, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3149900436401367, + "rewards/grad_term": 0.007428554352372885, + "rewards/margins": 3.8008623123168945, + "rewards/rejected": -6.115852355957031, + "step": 230 + }, + { + "epoch": 0.49491162292447777, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.671116246729667, + "learning_rate": 5.863961813842482e-07, + "logits/chosen": 0.7218674421310425, + "logits/rejected": 0.6530136466026306, + "logps/accuracies": 0.75, + "logps/chosen": -417.69482421875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -333.3896789550781, + "logps/ref_rejected": -285.8385009765625, + "logps/rejected": -459.2354431152344, + "loss": 0.1788, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.215256690979004, + "rewards/grad_term": 0.004520585294812918, + "rewards/margins": 4.454591274261475, + "rewards/rejected": -8.66984748840332, + "step": 231 + }, + { + "epoch": 0.4970540974825924, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.021075456821866, + "learning_rate": 5.855608591885441e-07, + "logits/chosen": 0.732083797454834, + "logits/rejected": 0.4362190365791321, + "logps/accuracies": 0.75, + "logps/chosen": -291.83062744140625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -250.48623657226562, + "logps/ref_rejected": -179.92550659179688, + "logps/rejected": -318.45306396484375, + "loss": 0.1443, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0672202110290527, + "rewards/grad_term": 0.002242325572296977, + "rewards/margins": 4.859157562255859, + "rewards/rejected": -6.92637825012207, + "step": 232 + }, + { + "epoch": 0.499196572040707, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.428848978245107, + "learning_rate": 5.847255369928401e-07, + "logits/chosen": 0.6398648023605347, + "logits/rejected": 0.5878071784973145, + "logps/accuracies": 0.75, + "logps/chosen": -290.29071044921875, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -244.7794647216797, + "logps/ref_rejected": -252.53553771972656, + "logps/rejected": -354.963134765625, + "loss": 0.2229, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.275561571121216, + "rewards/grad_term": 0.010291634127497673, + "rewards/margins": 2.8458194732666016, + "rewards/rejected": -5.121380805969238, + "step": 233 + }, + { + "epoch": 0.5013390465988217, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.246143439813943, + "learning_rate": 5.83890214797136e-07, + "logits/chosen": 0.9107778072357178, + "logits/rejected": 0.7426069378852844, + "logps/accuracies": 0.75, + "logps/chosen": -474.1260986328125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -367.388671875, + "logps/ref_rejected": -272.3711242675781, + "logps/rejected": -551.78857421875, + "loss": 0.1338, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.3368730545043945, + "rewards/grad_term": 0.0010807998478412628, + "rewards/margins": 8.633999824523926, + "rewards/rejected": -13.970873832702637, + "step": 234 + }, + { + "epoch": 0.5034815211569362, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 15.830907508822516, + "learning_rate": 5.83054892601432e-07, + "logits/chosen": 0.9917585849761963, + "logits/rejected": 0.7569248080253601, + "logps/accuracies": 1.0, + "logps/chosen": -554.43359375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -480.7333068847656, + "logps/ref_rejected": -434.622314453125, + "logps/rejected": -575.601318359375, + "loss": 0.1535, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.685014486312866, + "rewards/grad_term": 0.002452064771205187, + "rewards/margins": 3.3639354705810547, + "rewards/rejected": -7.0489501953125, + "step": 235 + }, + { + "epoch": 0.5056239957150509, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.346658607688704, + "learning_rate": 5.822195704057279e-07, + "logits/chosen": 0.623228132724762, + "logits/rejected": 0.5134543180465698, + "logps/accuracies": 0.75, + "logps/chosen": -271.30902099609375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -203.46876525878906, + "logps/ref_rejected": -188.93699645996094, + "logps/rejected": -337.220947265625, + "loss": 0.1585, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.392012596130371, + "rewards/grad_term": 0.009173048660159111, + "rewards/margins": 4.022184371948242, + "rewards/rejected": -7.4141974449157715, + "step": 236 + }, + { + "epoch": 0.5077664702731655, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.626683894288021, + "learning_rate": 5.813842482100238e-07, + "logits/chosen": 0.8421118855476379, + "logits/rejected": 0.7152860760688782, + "logps/accuracies": 0.5, + "logps/chosen": -363.5376892089844, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -290.9403381347656, + "logps/ref_rejected": -304.65911865234375, + "logps/rejected": -451.55059814453125, + "loss": 0.239, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.629868268966675, + "rewards/grad_term": 0.005975798238068819, + "rewards/margins": 3.7147045135498047, + "rewards/rejected": -7.344573020935059, + "step": 237 + }, + { + "epoch": 0.5099089448312801, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.218899504442438, + "learning_rate": 5.805489260143197e-07, + "logits/chosen": 0.7464509010314941, + "logits/rejected": 0.5703651309013367, + "logps/accuracies": 0.75, + "logps/chosen": -530.9554443359375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -405.5935363769531, + "logps/ref_rejected": -343.15496826171875, + "logps/rejected": -560.3563232421875, + "loss": 0.2231, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.268095970153809, + "rewards/grad_term": 0.007501318119466305, + "rewards/margins": 4.591974258422852, + "rewards/rejected": -10.86007022857666, + "step": 238 + }, + { + "epoch": 0.5120514193893948, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 5.019653658814029, + "learning_rate": 5.797136038186157e-07, + "logits/chosen": 0.8765060901641846, + "logits/rejected": 0.5701332688331604, + "logps/accuracies": 0.5, + "logps/chosen": -290.7047424316406, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -251.8502960205078, + "logps/ref_rejected": -161.183837890625, + "logps/rejected": -263.0386962890625, + "loss": 0.1661, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.942723035812378, + "rewards/grad_term": 0.005877365358173847, + "rewards/margins": 3.1500186920166016, + "rewards/rejected": -5.092741966247559, + "step": 239 + }, + { + "epoch": 0.5141938939475094, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 5.853619053843736, + "learning_rate": 5.788782816229117e-07, + "logits/chosen": 0.6634964346885681, + "logits/rejected": 0.6507644653320312, + "logps/accuracies": 1.0, + "logps/chosen": -361.76287841796875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -260.14697265625, + "logps/ref_rejected": -247.04840087890625, + "logps/rejected": -400.8096618652344, + "loss": 0.1472, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.080793857574463, + "rewards/grad_term": 0.00737812090665102, + "rewards/margins": 2.6072704792022705, + "rewards/rejected": -7.688064098358154, + "step": 240 + }, + { + "epoch": 0.516336368505624, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 13.513765248794778, + "learning_rate": 5.780429594272076e-07, + "logits/chosen": 0.8699438571929932, + "logits/rejected": 0.7703713774681091, + "logps/accuracies": 0.75, + "logps/chosen": -379.037353515625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -295.375, + "logps/ref_rejected": -275.28857421875, + "logps/rejected": -539.343017578125, + "loss": 0.2574, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1831183433532715, + "rewards/grad_term": 0.002768411999568343, + "rewards/margins": 9.019603729248047, + "rewards/rejected": -13.202722549438477, + "step": 241 + }, + { + "epoch": 0.5184788430637386, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.871277063040404, + "learning_rate": 5.772076372315036e-07, + "logits/chosen": 0.7803428769111633, + "logits/rejected": 0.6543869376182556, + "logps/accuracies": 0.5, + "logps/chosen": -564.8624267578125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -448.4819030761719, + "logps/ref_rejected": -379.95556640625, + "logps/rejected": -606.3104248046875, + "loss": 0.2026, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.81902551651001, + "rewards/grad_term": 0.00902615487575531, + "rewards/margins": 5.498717784881592, + "rewards/rejected": -11.317742347717285, + "step": 242 + }, + { + "epoch": 0.5206213176218533, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.769246851000191, + "learning_rate": 5.763723150357995e-07, + "logits/chosen": 0.6692153811454773, + "logits/rejected": 0.896048367023468, + "logps/accuracies": 0.75, + "logps/chosen": -439.7333679199219, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -363.28973388671875, + "logps/ref_rejected": -574.2049560546875, + "logps/rejected": -756.08203125, + "loss": 0.155, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8221817016601562, + "rewards/grad_term": 0.0039497376419603825, + "rewards/margins": 5.271674156188965, + "rewards/rejected": -9.093855857849121, + "step": 243 + }, + { + "epoch": 0.5227637921799678, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.5351054842566185, + "learning_rate": 5.755369928400955e-07, + "logits/chosen": 0.9571207761764526, + "logits/rejected": 0.7909256815910339, + "logps/accuracies": 0.75, + "logps/chosen": -415.9464111328125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -306.4901428222656, + "logps/ref_rejected": -293.0599060058594, + "logps/rejected": -455.033935546875, + "loss": 0.1491, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.472814559936523, + "rewards/grad_term": 0.005084376782178879, + "rewards/margins": 2.625887870788574, + "rewards/rejected": -8.098702430725098, + "step": 244 + }, + { + "epoch": 0.5249062667380825, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 16.74633297722825, + "learning_rate": 5.747016706443913e-07, + "logits/chosen": 0.8858977556228638, + "logits/rejected": 0.7780598998069763, + "logps/accuracies": 0.75, + "logps/chosen": -464.2574462890625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -314.8685302734375, + "logps/ref_rejected": -267.0308532714844, + "logps/rejected": -507.6094055175781, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.469447135925293, + "rewards/grad_term": 0.007684916723519564, + "rewards/margins": 4.559481620788574, + "rewards/rejected": -12.028928756713867, + "step": 245 + }, + { + "epoch": 0.527048741296197, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.317686067596801, + "learning_rate": 5.738663484486874e-07, + "logits/chosen": 0.30461055040359497, + "logits/rejected": 0.4746954143047333, + "logps/accuracies": 0.75, + "logps/chosen": -107.72335815429688, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -91.86939239501953, + "logps/ref_rejected": -89.95404052734375, + "logps/rejected": -137.3914031982422, + "loss": 0.22, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.792698323726654, + "rewards/grad_term": 0.012737632729113102, + "rewards/margins": 1.5791699886322021, + "rewards/rejected": -2.371868371963501, + "step": 246 + }, + { + "epoch": 0.5291912158543117, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.560628687601231, + "learning_rate": 5.730310262529833e-07, + "logits/chosen": 0.7160434722900391, + "logits/rejected": 0.5276747941970825, + "logps/accuracies": 1.0, + "logps/chosen": -328.2812805175781, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -279.67138671875, + "logps/ref_rejected": -251.840576171875, + "logps/rejected": -399.2860107421875, + "loss": 0.1576, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4304940700531006, + "rewards/grad_term": 0.0015676068142056465, + "rewards/margins": 4.941778659820557, + "rewards/rejected": -7.372272968292236, + "step": 247 + }, + { + "epoch": 0.5313336904124264, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.552825805844636, + "learning_rate": 5.721957040572792e-07, + "logits/chosen": 0.665590763092041, + "logits/rejected": 0.6970337629318237, + "logps/accuracies": 1.0, + "logps/chosen": -445.61358642578125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -375.84332275390625, + "logps/ref_rejected": -374.01043701171875, + "logps/rejected": -503.5439453125, + "loss": 0.1957, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4885122776031494, + "rewards/grad_term": 0.005766700953245163, + "rewards/margins": 2.9881629943847656, + "rewards/rejected": -6.476675510406494, + "step": 248 + }, + { + "epoch": 0.533476164970541, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 6.35131355845974, + "learning_rate": 5.713603818615751e-07, + "logits/chosen": 0.8978402614593506, + "logits/rejected": 0.5600339770317078, + "logps/accuracies": 0.25, + "logps/chosen": -461.3507385253906, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -393.03179931640625, + "logps/ref_rejected": -272.4175109863281, + "logps/rejected": -435.42462158203125, + "loss": 0.1457, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.415947675704956, + "rewards/grad_term": 0.004269158001989126, + "rewards/margins": 4.734410285949707, + "rewards/rejected": -8.150358200073242, + "step": 249 + }, + { + "epoch": 0.5356186395286556, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 17.580885514236424, + "learning_rate": 5.705250596658711e-07, + "logits/chosen": 0.6814495921134949, + "logits/rejected": 0.75849449634552, + "logps/accuracies": 0.75, + "logps/chosen": -353.9872131347656, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -253.66598510742188, + "logps/ref_rejected": -231.22552490234375, + "logps/rejected": -383.4581298828125, + "loss": 0.2176, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.016061305999756, + "rewards/grad_term": 0.007379300892353058, + "rewards/margins": 2.595566987991333, + "rewards/rejected": -7.611629009246826, + "step": 250 + }, + { + "epoch": 0.5377611140867702, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.1971499013479, + "learning_rate": 5.696897374701671e-07, + "logits/chosen": 0.6412093639373779, + "logits/rejected": 0.6849941611289978, + "logps/accuracies": 0.75, + "logps/chosen": -354.33367919921875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -277.5402526855469, + "logps/ref_rejected": -235.74111938476562, + "logps/rejected": -374.9162902832031, + "loss": 0.2137, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.839670181274414, + "rewards/grad_term": 0.01258145458996296, + "rewards/margins": 3.119089126586914, + "rewards/rejected": -6.958759307861328, + "step": 251 + }, + { + "epoch": 0.5399035886448849, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.69469276334864, + "learning_rate": 5.68854415274463e-07, + "logits/chosen": 0.8622183799743652, + "logits/rejected": 0.5919508337974548, + "logps/accuracies": 0.5, + "logps/chosen": -274.1978759765625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -238.06967163085938, + "logps/ref_rejected": -164.38018798828125, + "logps/rejected": -275.35992431640625, + "loss": 0.2049, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.806409478187561, + "rewards/grad_term": 0.010785759426653385, + "rewards/margins": 3.742577075958252, + "rewards/rejected": -5.548986434936523, + "step": 252 + }, + { + "epoch": 0.5420460632029994, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.021225410718323, + "learning_rate": 5.680190930787589e-07, + "logits/chosen": 0.7470685243606567, + "logits/rejected": 0.6984888911247253, + "logps/accuracies": 0.75, + "logps/chosen": -221.68431091308594, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -187.86117553710938, + "logps/ref_rejected": -150.86964416503906, + "logps/rejected": -256.9394836425781, + "loss": 0.1673, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6911565065383911, + "rewards/grad_term": 0.011116426438093185, + "rewards/margins": 3.612335205078125, + "rewards/rejected": -5.303491592407227, + "step": 253 + }, + { + "epoch": 0.5441885377611141, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.151951865155123, + "learning_rate": 5.671837708830549e-07, + "logits/chosen": 0.22945332527160645, + "logits/rejected": 0.5243977308273315, + "logps/accuracies": 0.5, + "logps/chosen": -282.75384521484375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -227.72982788085938, + "logps/ref_rejected": -320.92535400390625, + "logps/rejected": -430.75787353515625, + "loss": 0.1473, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7512013912200928, + "rewards/grad_term": 0.013240108266472816, + "rewards/margins": 2.740424394607544, + "rewards/rejected": -5.491625785827637, + "step": 254 + }, + { + "epoch": 0.5463310123192288, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.292561948011548, + "learning_rate": 5.663484486873508e-07, + "logits/chosen": 0.7049826979637146, + "logits/rejected": 0.7030065059661865, + "logps/accuracies": 1.0, + "logps/chosen": -465.6855773925781, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -391.0091552734375, + "logps/ref_rejected": -373.00775146484375, + "logps/rejected": -555.9122314453125, + "loss": 0.1945, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.733822822570801, + "rewards/grad_term": 0.001258535892702639, + "rewards/margins": 5.411401271820068, + "rewards/rejected": -9.145223617553711, + "step": 255 + }, + { + "epoch": 0.5484734868773433, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.25, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 12.415742091600887, + "learning_rate": 5.655131264916467e-07, + "logits/chosen": 0.9153692722320557, + "logits/rejected": 0.5475519895553589, + "logps/accuracies": 0.5, + "logps/chosen": -304.13330078125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -245.09902954101562, + "logps/ref_rejected": -193.71463012695312, + "logps/rejected": -393.2826843261719, + "loss": 0.2106, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.951713800430298, + "rewards/grad_term": 0.011270977556705475, + "rewards/margins": 7.026688575744629, + "rewards/rejected": -9.978402137756348, + "step": 256 + }, + { + "epoch": 0.5484734868773433, + "eval_flips/correct->correct": 0.14000000059604645, + "eval_flips/correct->incorrect": 0.019999999552965164, + "eval_flips/incorrect->correct": 0.4399999976158142, + "eval_flips/incorrect->incorrect": 0.4000000059604645, + "eval_logits/chosen": 0.7667725086212158, + "eval_logits/rejected": 0.6504298448562622, + "eval_logps/accuracies": 0.5799999833106995, + "eval_logps/chosen": -395.9922790527344, + "eval_logps/ref_accuracies": 0.1599999964237213, + "eval_logps/ref_chosen": -323.51568603515625, + "eval_logps/ref_rejected": -258.70098876953125, + "eval_logps/rejected": -407.676025390625, + "eval_loss": 0.19521716237068176, + "eval_rewards/accuracies": 0.8399999737739563, + "eval_rewards/chosen": -3.6238298416137695, + "eval_rewards/grad_term": 0.008681231178343296, + "eval_rewards/margins": 3.824923038482666, + "eval_rewards/rejected": -7.4487528800964355, + "eval_runtime": 372.955, + "eval_samples_per_second": 4.236, + "eval_steps_per_second": 0.134, + "step": 256 + }, + { + "epoch": 0.550615961435458, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.31965267361933, + "learning_rate": 5.646778042959426e-07, + "logits/chosen": 0.9101255536079407, + "logits/rejected": 0.8786407113075256, + "logps/accuracies": 0.75, + "logps/chosen": -516.9441528320312, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -426.1374206542969, + "logps/ref_rejected": -390.5966796875, + "logps/rejected": -595.52685546875, + "loss": 0.1809, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.540337085723877, + "rewards/grad_term": 0.0009564714273437858, + "rewards/margins": 5.706172466278076, + "rewards/rejected": -10.246509552001953, + "step": 257 + }, + { + "epoch": 0.5527584359935725, + "flips/correct->correct": 1.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.682059725515248, + "learning_rate": 5.638424821002387e-07, + "logits/chosen": 0.8498424291610718, + "logits/rejected": 0.8489320874214172, + "logps/accuracies": 1.0, + "logps/chosen": -413.9257507324219, + "logps/ref_accuracies": 1.0, + "logps/ref_chosen": -327.97869873046875, + "logps/ref_rejected": -369.13482666015625, + "logps/rejected": -582.4822998046875, + "loss": 0.169, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2973527908325195, + "rewards/grad_term": 0.005528903566300869, + "rewards/margins": 6.370021820068359, + "rewards/rejected": -10.667373657226562, + "step": 258 + }, + { + "epoch": 0.5549009105516872, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.707978512883248, + "learning_rate": 5.630071599045346e-07, + "logits/chosen": 0.7243056297302246, + "logits/rejected": 0.6144933104515076, + "logps/accuracies": 0.75, + "logps/chosen": -426.29638671875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -336.075439453125, + "logps/ref_rejected": -338.01348876953125, + "logps/rejected": -613.092041015625, + "loss": 0.1699, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.511048316955566, + "rewards/grad_term": 0.00518822530284524, + "rewards/margins": 9.242880821228027, + "rewards/rejected": -13.753929138183594, + "step": 259 + }, + { + "epoch": 0.5570433851098018, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.457819309849194, + "learning_rate": 5.621718377088305e-07, + "logits/chosen": 0.7747801542282104, + "logits/rejected": 0.6980942487716675, + "logps/accuracies": 1.0, + "logps/chosen": -333.2770080566406, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -263.5532531738281, + "logps/ref_rejected": -270.31927490234375, + "logps/rejected": -400.2752990722656, + "loss": 0.1812, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.486187696456909, + "rewards/grad_term": 0.014478763565421104, + "rewards/margins": 3.0116138458251953, + "rewards/rejected": -6.497801780700684, + "step": 260 + }, + { + "epoch": 0.5591858596679165, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 12.791539513426143, + "learning_rate": 5.613365155131265e-07, + "logits/chosen": 1.010023832321167, + "logits/rejected": 0.8089584708213806, + "logps/accuracies": 0.5, + "logps/chosen": -434.7796325683594, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -369.63201904296875, + "logps/ref_rejected": -312.5323791503906, + "logps/rejected": -433.6856994628906, + "loss": 0.2167, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2573814392089844, + "rewards/grad_term": 0.011615730822086334, + "rewards/margins": 2.800283908843994, + "rewards/rejected": -6.057665824890137, + "step": 261 + }, + { + "epoch": 0.561328334226031, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 5.761871953548487, + "learning_rate": 5.605011933174224e-07, + "logits/chosen": 0.7621825337409973, + "logits/rejected": 0.6671872138977051, + "logps/accuracies": 1.0, + "logps/chosen": -520.250732421875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -421.12890625, + "logps/ref_rejected": -388.18841552734375, + "logps/rejected": -616.8232421875, + "loss": 0.1195, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.956088066101074, + "rewards/grad_term": 0.0011122592259198427, + "rewards/margins": 6.475651741027832, + "rewards/rejected": -11.431740760803223, + "step": 262 + }, + { + "epoch": 0.5634708087841457, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.615576943912552, + "learning_rate": 5.596658711217183e-07, + "logits/chosen": 0.7528675198554993, + "logits/rejected": 0.5586297512054443, + "logps/accuracies": 1.0, + "logps/chosen": -235.1770477294922, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -193.3452606201172, + "logps/ref_rejected": -156.8057861328125, + "logps/rejected": -284.637939453125, + "loss": 0.1786, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.0915887355804443, + "rewards/grad_term": 0.012236223556101322, + "rewards/margins": 4.300019264221191, + "rewards/rejected": -6.391608238220215, + "step": 263 + }, + { + "epoch": 0.5656132833422604, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 5.642389085683032, + "learning_rate": 5.588305489260142e-07, + "logits/chosen": 0.7206822633743286, + "logits/rejected": 0.6255587339401245, + "logps/accuracies": 1.0, + "logps/chosen": -403.0487976074219, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -323.03924560546875, + "logps/ref_rejected": -263.4703369140625, + "logps/rejected": -451.4239196777344, + "loss": 0.1584, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.000478267669678, + "rewards/grad_term": 0.009540688246488571, + "rewards/margins": 5.3972015380859375, + "rewards/rejected": -9.397679328918457, + "step": 264 + }, + { + "epoch": 0.5677557579003749, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 14.507972697356939, + "learning_rate": 5.579952267303103e-07, + "logits/chosen": 0.8435995578765869, + "logits/rejected": 0.30664098262786865, + "logps/accuracies": 0.25, + "logps/chosen": -513.014404296875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -417.4500427246094, + "logps/ref_rejected": -269.4376220703125, + "logps/rejected": -427.7491760253906, + "loss": 0.2097, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.7782206535339355, + "rewards/grad_term": 0.010889217257499695, + "rewards/margins": 3.137356996536255, + "rewards/rejected": -7.9155778884887695, + "step": 265 + }, + { + "epoch": 0.5698982324584896, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.002273528640038, + "learning_rate": 5.571599045346062e-07, + "logits/chosen": 0.6676109433174133, + "logits/rejected": 0.7191418409347534, + "logps/accuracies": 0.75, + "logps/chosen": -496.98822021484375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -340.63250732421875, + "logps/ref_rejected": -327.24847412109375, + "logps/rejected": -582.9611206054688, + "loss": 0.1535, + "rewards/accuracies": 0.75, + "rewards/chosen": -7.817786693572998, + "rewards/grad_term": 0.011708484031260014, + "rewards/margins": 4.9678449630737305, + "rewards/rejected": -12.785632133483887, + "step": 266 + }, + { + "epoch": 0.5720407070166041, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.543702357825273, + "learning_rate": 5.563245823389021e-07, + "logits/chosen": 0.8063835501670837, + "logits/rejected": 0.6988131999969482, + "logps/accuracies": 0.75, + "logps/chosen": -403.5315856933594, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -330.3784484863281, + "logps/ref_rejected": -266.2269287109375, + "logps/rejected": -429.4187316894531, + "loss": 0.1285, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6576569080352783, + "rewards/grad_term": 0.006236384157091379, + "rewards/margins": 4.501932621002197, + "rewards/rejected": -8.159589767456055, + "step": 267 + }, + { + "epoch": 0.5741831815747188, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.10211268941143, + "learning_rate": 5.55489260143198e-07, + "logits/chosen": 0.6175810694694519, + "logits/rejected": 0.4528239965438843, + "logps/accuracies": 0.75, + "logps/chosen": -391.7451477050781, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -323.01617431640625, + "logps/ref_rejected": -281.732421875, + "logps/rejected": -443.7324523925781, + "loss": 0.1603, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4364490509033203, + "rewards/grad_term": 0.003603234887123108, + "rewards/margins": 4.663552284240723, + "rewards/rejected": -8.100001335144043, + "step": 268 + }, + { + "epoch": 0.5763256561328334, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.454583306377835, + "learning_rate": 5.546539379474941e-07, + "logits/chosen": 0.5018086433410645, + "logits/rejected": 0.3208431601524353, + "logps/accuracies": 0.75, + "logps/chosen": -292.32537841796875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -235.02024841308594, + "logps/ref_rejected": -239.4210968017578, + "logps/rejected": -393.904541015625, + "loss": 0.1477, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8652560710906982, + "rewards/grad_term": 0.005141068249940872, + "rewards/margins": 4.858916282653809, + "rewards/rejected": -7.724172115325928, + "step": 269 + }, + { + "epoch": 0.578468130690948, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.116562665988194, + "learning_rate": 5.5381861575179e-07, + "logits/chosen": 0.8009479641914368, + "logits/rejected": 0.5304053425788879, + "logps/accuracies": 0.75, + "logps/chosen": -567.357666015625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -447.6595458984375, + "logps/ref_rejected": -327.67388916015625, + "logps/rejected": -578.609619140625, + "loss": 0.181, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.984908580780029, + "rewards/grad_term": 0.001012351829558611, + "rewards/margins": 6.561877727508545, + "rewards/rejected": -12.546786308288574, + "step": 270 + }, + { + "epoch": 0.5806106052490627, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.286921940219483, + "learning_rate": 5.529832935560859e-07, + "logits/chosen": 0.8755187392234802, + "logits/rejected": 0.7794501781463623, + "logps/accuracies": 0.75, + "logps/chosen": -271.8689880371094, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -215.5362548828125, + "logps/ref_rejected": -208.42568969726562, + "logps/rejected": -326.4412536621094, + "loss": 0.1667, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8166375160217285, + "rewards/grad_term": 0.006757371127605438, + "rewards/margins": 3.084141254425049, + "rewards/rejected": -5.900778770446777, + "step": 271 + }, + { + "epoch": 0.5827530798071773, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.808675787712881, + "learning_rate": 5.521479713603818e-07, + "logits/chosen": 0.5810420513153076, + "logits/rejected": 0.5697520971298218, + "logps/accuracies": 0.75, + "logps/chosen": -252.84695434570312, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -222.0087890625, + "logps/ref_rejected": -220.40602111816406, + "logps/rejected": -321.25408935546875, + "loss": 0.1784, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5419079065322876, + "rewards/grad_term": 0.006449728738516569, + "rewards/margins": 3.500495433807373, + "rewards/rejected": -5.042403697967529, + "step": 272 + }, + { + "epoch": 0.584895554365292, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.461051225414454, + "learning_rate": 5.513126491646778e-07, + "logits/chosen": 0.6986079812049866, + "logits/rejected": 0.7241477370262146, + "logps/accuracies": 1.0, + "logps/chosen": -478.98333740234375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -346.74542236328125, + "logps/ref_rejected": -319.70550537109375, + "logps/rejected": -641.8858642578125, + "loss": 0.1217, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.61189603805542, + "rewards/grad_term": 0.004527165554463863, + "rewards/margins": 9.497122764587402, + "rewards/rejected": -16.109020233154297, + "step": 273 + }, + { + "epoch": 0.5870380289234065, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.28604619396496, + "learning_rate": 5.504773269689737e-07, + "logits/chosen": 0.9160727262496948, + "logits/rejected": 0.6226189732551575, + "logps/accuracies": 0.75, + "logps/chosen": -482.9158630371094, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -380.3866882324219, + "logps/ref_rejected": -318.1195068359375, + "logps/rejected": -563.2010498046875, + "loss": 0.1824, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.126457691192627, + "rewards/grad_term": 0.0009991895640268922, + "rewards/margins": 7.12761926651001, + "rewards/rejected": -12.254076957702637, + "step": 274 + }, + { + "epoch": 0.5891805034815212, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.25, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.96745166065974, + "learning_rate": 5.496420047732696e-07, + "logits/chosen": 0.7636332511901855, + "logits/rejected": 0.7924087643623352, + "logps/accuracies": 0.75, + "logps/chosen": -232.80300903320312, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -201.30563354492188, + "logps/ref_rejected": -200.0108184814453, + "logps/rejected": -281.12127685546875, + "loss": 0.19, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5748703479766846, + "rewards/grad_term": 0.011370973661541939, + "rewards/margins": 2.480652332305908, + "rewards/rejected": -4.055522918701172, + "step": 275 + }, + { + "epoch": 0.5913229780396357, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 10.556600598116974, + "learning_rate": 5.488066825775657e-07, + "logits/chosen": 0.3400154113769531, + "logits/rejected": 0.8462868332862854, + "logps/accuracies": 1.0, + "logps/chosen": -457.0469970703125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -380.0611877441406, + "logps/ref_rejected": -327.03466796875, + "logps/rejected": -536.0282592773438, + "loss": 0.171, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8492913246154785, + "rewards/grad_term": 0.0008406995330005884, + "rewards/margins": 6.6003899574279785, + "rewards/rejected": -10.449681282043457, + "step": 276 + }, + { + "epoch": 0.5934654525977504, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.78258861657989, + "learning_rate": 5.479713603818616e-07, + "logits/chosen": 0.5925794839859009, + "logits/rejected": 0.3707428276538849, + "logps/accuracies": 0.5, + "logps/chosen": -392.9906921386719, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -328.11785888671875, + "logps/ref_rejected": -267.7380676269531, + "logps/rejected": -379.7018737792969, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.243643283843994, + "rewards/grad_term": 0.007178822532296181, + "rewards/margins": 2.354548215866089, + "rewards/rejected": -5.598191261291504, + "step": 277 + }, + { + "epoch": 0.595607927155865, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.592007769382429, + "learning_rate": 5.471360381861575e-07, + "logits/chosen": 0.6001948714256287, + "logits/rejected": 0.4797150790691376, + "logps/accuracies": 0.75, + "logps/chosen": -329.23004150390625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -272.231201171875, + "logps/ref_rejected": -260.5545349121094, + "logps/rejected": -435.8937072753906, + "loss": 0.201, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8499412536621094, + "rewards/grad_term": 0.005573004484176636, + "rewards/margins": 5.917016506195068, + "rewards/rejected": -8.766958236694336, + "step": 278 + }, + { + "epoch": 0.5977504017139796, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.70464881667517, + "learning_rate": 5.463007159904534e-07, + "logits/chosen": 0.9618555307388306, + "logits/rejected": 0.7959021329879761, + "logps/accuracies": 0.75, + "logps/chosen": -419.58148193359375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -360.5795593261719, + "logps/ref_rejected": -305.4864196777344, + "logps/rejected": -494.42156982421875, + "loss": 0.1709, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9500961303710938, + "rewards/grad_term": 0.002942422404885292, + "rewards/margins": 6.496662616729736, + "rewards/rejected": -9.446758270263672, + "step": 279 + }, + { + "epoch": 0.5998928762720943, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 13.531014928298436, + "learning_rate": 5.454653937947494e-07, + "logits/chosen": 0.5968809127807617, + "logits/rejected": 0.6607197523117065, + "logps/accuracies": 0.75, + "logps/chosen": -369.2514343261719, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -323.9585266113281, + "logps/ref_rejected": -320.01959228515625, + "logps/rejected": -448.68682861328125, + "loss": 0.2235, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2646453380584717, + "rewards/grad_term": 0.004641966428607702, + "rewards/margins": 4.168717861175537, + "rewards/rejected": -6.433363437652588, + "step": 280 + }, + { + "epoch": 0.6020353508302089, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.57138907693324, + "learning_rate": 5.446300715990454e-07, + "logits/chosen": 1.1103699207305908, + "logits/rejected": 0.9430161714553833, + "logps/accuracies": 0.5, + "logps/chosen": -314.4698486328125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -291.67144775390625, + "logps/ref_rejected": -255.3843994140625, + "logps/rejected": -313.588134765625, + "loss": 0.1767, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.139919638633728, + "rewards/grad_term": 0.011935700662434101, + "rewards/margins": 1.7702679634094238, + "rewards/rejected": -2.9101874828338623, + "step": 281 + }, + { + "epoch": 0.6041778253883235, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.587776679123657, + "learning_rate": 5.437947494033412e-07, + "logits/chosen": 0.8917209506034851, + "logits/rejected": 0.7313340306282043, + "logps/accuracies": 0.5, + "logps/chosen": -615.8944702148438, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -534.5302734375, + "logps/ref_rejected": -437.1783447265625, + "logps/rejected": -592.5615844726562, + "loss": 0.2, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.068208694458008, + "rewards/grad_term": 0.002287252340465784, + "rewards/margins": 3.7009527683258057, + "rewards/rejected": -7.769161701202393, + "step": 282 + }, + { + "epoch": 0.6063202999464381, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.603440187653598, + "learning_rate": 5.429594272076372e-07, + "logits/chosen": 0.8125787377357483, + "logits/rejected": 0.5284969806671143, + "logps/accuracies": 0.75, + "logps/chosen": -394.273681640625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -301.3934326171875, + "logps/ref_rejected": -264.20989990234375, + "logps/rejected": -396.23590087890625, + "loss": 0.1553, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.644012451171875, + "rewards/grad_term": 0.012798898853361607, + "rewards/margins": 1.9572882652282715, + "rewards/rejected": -6.601301193237305, + "step": 283 + }, + { + "epoch": 0.6084627745045528, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.706538315869954, + "learning_rate": 5.421241050119332e-07, + "logits/chosen": 0.6700544357299805, + "logits/rejected": 0.6845322847366333, + "logps/accuracies": 1.0, + "logps/chosen": -373.3970031738281, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -308.53228759765625, + "logps/ref_rejected": -321.16717529296875, + "logps/rejected": -476.41796875, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2432353496551514, + "rewards/grad_term": 0.002791226841509342, + "rewards/margins": 4.5193047523498535, + "rewards/rejected": -7.762540340423584, + "step": 284 + }, + { + "epoch": 0.6106052490626673, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.616290838954344, + "learning_rate": 5.412887828162291e-07, + "logits/chosen": 0.8290910720825195, + "logits/rejected": 0.931686520576477, + "logps/accuracies": 1.0, + "logps/chosen": -446.44232177734375, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -383.1173400878906, + "logps/ref_rejected": -422.9083557128906, + "logps/rejected": -596.1305541992188, + "loss": 0.1758, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1662492752075195, + "rewards/grad_term": 0.005467691924422979, + "rewards/margins": 5.49485969543457, + "rewards/rejected": -8.66110897064209, + "step": 285 + }, + { + "epoch": 0.612747723620782, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.608396954924636, + "learning_rate": 5.40453460620525e-07, + "logits/chosen": 0.45172828435897827, + "logits/rejected": 0.7064520120620728, + "logps/accuracies": 1.0, + "logps/chosen": -322.91387939453125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -258.16265869140625, + "logps/ref_rejected": -297.7436218261719, + "logps/rejected": -475.2723388671875, + "loss": 0.1969, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.237560272216797, + "rewards/grad_term": 0.005947392899543047, + "rewards/margins": 5.638874530792236, + "rewards/rejected": -8.876434326171875, + "step": 286 + }, + { + "epoch": 0.6148901981788967, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.779231877934484, + "learning_rate": 5.39618138424821e-07, + "logits/chosen": 0.9248701333999634, + "logits/rejected": 0.8036876916885376, + "logps/accuracies": 0.5, + "logps/chosen": -467.22015380859375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -380.78204345703125, + "logps/ref_rejected": -339.566162109375, + "logps/rejected": -530.2546997070312, + "loss": 0.2517, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.321907043457031, + "rewards/grad_term": 0.002679330063983798, + "rewards/margins": 5.212520599365234, + "rewards/rejected": -9.534428596496582, + "step": 287 + }, + { + "epoch": 0.6170326727370112, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.667742293114383, + "learning_rate": 5.38782816229117e-07, + "logits/chosen": 0.5336281061172485, + "logits/rejected": 0.5964027643203735, + "logps/accuracies": 1.0, + "logps/chosen": -413.2911376953125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -360.906982421875, + "logps/ref_rejected": -325.00140380859375, + "logps/rejected": -499.9887390136719, + "loss": 0.2106, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6192078590393066, + "rewards/grad_term": 0.007026966195553541, + "rewards/margins": 6.1301589012146, + "rewards/rejected": -8.749366760253906, + "step": 288 + }, + { + "epoch": 0.6191751472951259, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.794004557760429, + "learning_rate": 5.379474940334129e-07, + "logits/chosen": 0.5764543414115906, + "logits/rejected": 0.49163103103637695, + "logps/accuracies": 1.0, + "logps/chosen": -346.34478759765625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -279.2290344238281, + "logps/ref_rejected": -275.8312072753906, + "logps/rejected": -466.09326171875, + "loss": 0.1684, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3557891845703125, + "rewards/grad_term": 0.00030989584047347307, + "rewards/margins": 6.157315254211426, + "rewards/rejected": -9.513103485107422, + "step": 289 + }, + { + "epoch": 0.6213176218532405, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.147575761036771, + "learning_rate": 5.371121718377088e-07, + "logits/chosen": 0.9949532747268677, + "logits/rejected": 0.8375188708305359, + "logps/accuracies": 0.75, + "logps/chosen": -450.9200134277344, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -368.46990966796875, + "logps/ref_rejected": -288.4126281738281, + "logps/rejected": -482.5291748046875, + "loss": 0.1457, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.122506141662598, + "rewards/grad_term": 0.0010359850712120533, + "rewards/margins": 5.583320140838623, + "rewards/rejected": -9.705825805664062, + "step": 290 + }, + { + "epoch": 0.6234600964113551, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.480445656722806, + "learning_rate": 5.362768496420047e-07, + "logits/chosen": 0.46533939242362976, + "logits/rejected": 0.4300745725631714, + "logps/accuracies": 1.0, + "logps/chosen": -365.7827453613281, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -291.9012145996094, + "logps/ref_rejected": -255.63128662109375, + "logps/rejected": -422.51983642578125, + "loss": 0.1266, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6940770149230957, + "rewards/grad_term": 0.004254742059856653, + "rewards/margins": 4.650350093841553, + "rewards/rejected": -8.344427108764648, + "step": 291 + }, + { + "epoch": 0.6256025709694697, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.156244989772967, + "learning_rate": 5.354415274463007e-07, + "logits/chosen": 0.9660448431968689, + "logits/rejected": 0.5434409379959106, + "logps/accuracies": 0.75, + "logps/chosen": -476.6251220703125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -414.31597900390625, + "logps/ref_rejected": -342.433837890625, + "logps/rejected": -571.8237915039062, + "loss": 0.1555, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.115457534790039, + "rewards/grad_term": 0.006694721523672342, + "rewards/margins": 8.354040145874023, + "rewards/rejected": -11.469497680664062, + "step": 292 + }, + { + "epoch": 0.6277450455275844, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 9.41012930540287, + "learning_rate": 5.346062052505966e-07, + "logits/chosen": 0.9397240877151489, + "logits/rejected": 0.7151464223861694, + "logps/accuracies": 0.5, + "logps/chosen": -548.965576171875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -421.5787353515625, + "logps/ref_rejected": -343.83221435546875, + "logps/rejected": -598.891357421875, + "loss": 0.1439, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.369344234466553, + "rewards/grad_term": 0.0019262685673311353, + "rewards/margins": 6.383614540100098, + "rewards/rejected": -12.752958297729492, + "step": 293 + }, + { + "epoch": 0.629887520085699, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 5.624072315857759, + "learning_rate": 5.337708830548926e-07, + "logits/chosen": 0.45759594440460205, + "logits/rejected": 0.47171294689178467, + "logps/accuracies": 0.5, + "logps/chosen": -386.42669677734375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -328.4192199707031, + "logps/ref_rejected": -252.86329650878906, + "logps/rejected": -425.3404541015625, + "loss": 0.1063, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9003729820251465, + "rewards/grad_term": 0.0017550851916894317, + "rewards/margins": 5.723484039306641, + "rewards/rejected": -8.623857498168945, + "step": 294 + }, + { + "epoch": 0.6320299946438136, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.713346436229937, + "learning_rate": 5.329355608591886e-07, + "logits/chosen": 0.7498375177383423, + "logits/rejected": 0.6682128310203552, + "logps/accuracies": 0.75, + "logps/chosen": -446.07830810546875, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -382.056884765625, + "logps/ref_rejected": -337.1555480957031, + "logps/rejected": -528.916015625, + "loss": 0.158, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.201070785522461, + "rewards/grad_term": 0.006979628466069698, + "rewards/margins": 6.386953830718994, + "rewards/rejected": -9.588025093078613, + "step": 295 + }, + { + "epoch": 0.6341724692019283, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.364585833613935, + "learning_rate": 5.321002386634845e-07, + "logits/chosen": 0.7363272905349731, + "logits/rejected": 0.6899486780166626, + "logps/accuracies": 0.75, + "logps/chosen": -605.6181030273438, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -478.4359130859375, + "logps/ref_rejected": -392.2603759765625, + "logps/rejected": -669.0337524414062, + "loss": 0.176, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.359109401702881, + "rewards/grad_term": 0.0019621604587882757, + "rewards/margins": 7.479557991027832, + "rewards/rejected": -13.838666915893555, + "step": 296 + }, + { + "epoch": 0.6363149437600428, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.505205942394765, + "learning_rate": 5.312649164677804e-07, + "logits/chosen": 0.5754671096801758, + "logits/rejected": 0.594577431678772, + "logps/accuracies": 0.75, + "logps/chosen": -347.17852783203125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -265.71417236328125, + "logps/ref_rejected": -234.78533935546875, + "logps/rejected": -441.4500732421875, + "loss": 0.1279, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.07321834564209, + "rewards/grad_term": 0.004567963071167469, + "rewards/margins": 6.260017395019531, + "rewards/rejected": -10.333235740661621, + "step": 297 + }, + { + "epoch": 0.6384574183181575, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 18.711798908689843, + "learning_rate": 5.304295942720763e-07, + "logits/chosen": 0.870806872844696, + "logits/rejected": 0.6316779851913452, + "logps/accuracies": 0.75, + "logps/chosen": -419.4470520019531, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -333.89263916015625, + "logps/ref_rejected": -268.04510498046875, + "logps/rejected": -475.2600402832031, + "loss": 0.1525, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.277721405029297, + "rewards/grad_term": 0.0027266484685242176, + "rewards/margins": 6.083026885986328, + "rewards/rejected": -10.360748291015625, + "step": 298 + }, + { + "epoch": 0.6405998928762721, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 13.094138352584997, + "learning_rate": 5.295942720763724e-07, + "logits/chosen": 0.4675275683403015, + "logits/rejected": 0.606252133846283, + "logps/accuracies": 0.75, + "logps/chosen": -437.78228759765625, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -361.85552978515625, + "logps/ref_rejected": -334.0460510253906, + "logps/rejected": -510.67462158203125, + "loss": 0.162, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7963359355926514, + "rewards/grad_term": 0.00101923244073987, + "rewards/margins": 5.035091876983643, + "rewards/rejected": -8.831427574157715, + "step": 299 + }, + { + "epoch": 0.6427423674343867, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 1.0, + "grad_norm": 10.604360471548478, + "learning_rate": 5.287589498806682e-07, + "logits/chosen": 0.7601810097694397, + "logits/rejected": 0.5254924297332764, + "logps/accuracies": 0.0, + "logps/chosen": -309.6020202636719, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -279.2956848144531, + "logps/ref_rejected": -154.2837371826172, + "logps/rejected": -275.95166015625, + "loss": 0.1947, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5153169631958008, + "rewards/grad_term": 0.00842749048024416, + "rewards/margins": 4.56807804107666, + "rewards/rejected": -6.083395004272461, + "step": 300 + }, + { + "epoch": 0.6448848419925013, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 16.672250976383555, + "learning_rate": 5.279236276849642e-07, + "logits/chosen": 1.0493147373199463, + "logits/rejected": 0.5671635270118713, + "logps/accuracies": 0.75, + "logps/chosen": -357.0266418457031, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -274.6451721191406, + "logps/ref_rejected": -204.9969482421875, + "logps/rejected": -309.4927978515625, + "loss": 0.2159, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.119072914123535, + "rewards/grad_term": 0.014873827807605267, + "rewards/margins": 1.1057183742523193, + "rewards/rejected": -5.224791049957275, + "step": 301 + }, + { + "epoch": 0.647027316550616, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.772181415056538, + "learning_rate": 5.270883054892601e-07, + "logits/chosen": 0.6905862092971802, + "logits/rejected": 0.6120530366897583, + "logps/accuracies": 1.0, + "logps/chosen": -301.63970947265625, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -240.38360595703125, + "logps/ref_rejected": -259.4972839355469, + "logps/rejected": -422.0617370605469, + "loss": 0.1314, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0628066062927246, + "rewards/grad_term": 0.002045161323621869, + "rewards/margins": 5.065417289733887, + "rewards/rejected": -8.128223419189453, + "step": 302 + }, + { + "epoch": 0.6491697911087306, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.52033177772429, + "learning_rate": 5.262529832935561e-07, + "logits/chosen": 0.5544182658195496, + "logits/rejected": 0.45686784386634827, + "logps/accuracies": 1.0, + "logps/chosen": -278.7823181152344, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -196.63429260253906, + "logps/ref_rejected": -185.2466583251953, + "logps/rejected": -335.3304443359375, + "loss": 0.1615, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1074018478393555, + "rewards/grad_term": 0.003845647443085909, + "rewards/margins": 3.3967883586883545, + "rewards/rejected": -7.504190444946289, + "step": 303 + }, + { + "epoch": 0.6513122656668452, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.017492432602673, + "learning_rate": 5.25417661097852e-07, + "logits/chosen": 0.8323764204978943, + "logits/rejected": 0.7166695594787598, + "logps/accuracies": 1.0, + "logps/chosen": -524.2388916015625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -408.53387451171875, + "logps/ref_rejected": -332.67181396484375, + "logps/rejected": -610.9110717773438, + "loss": 0.1781, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.785252571105957, + "rewards/grad_term": 8.712082490092143e-05, + "rewards/margins": 8.12671184539795, + "rewards/rejected": -13.911964416503906, + "step": 304 + }, + { + "epoch": 0.6534547402249599, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.355578424387678, + "learning_rate": 5.245823389021479e-07, + "logits/chosen": 0.6990536451339722, + "logits/rejected": 0.6334518790245056, + "logps/accuracies": 0.75, + "logps/chosen": -439.53704833984375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -357.226318359375, + "logps/ref_rejected": -357.77783203125, + "logps/rejected": -587.9480590820312, + "loss": 0.1535, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.115536689758301, + "rewards/grad_term": 7.772783283144236e-05, + "rewards/margins": 7.392976760864258, + "rewards/rejected": -11.508513450622559, + "step": 305 + }, + { + "epoch": 0.6555972147830744, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.205005797383153, + "learning_rate": 5.237470167064439e-07, + "logits/chosen": 0.8662493228912354, + "logits/rejected": 0.800337553024292, + "logps/accuracies": 0.75, + "logps/chosen": -347.75860595703125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -291.16888427734375, + "logps/ref_rejected": -224.49195861816406, + "logps/rejected": -400.5769958496094, + "loss": 0.1112, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.829484701156616, + "rewards/grad_term": 0.004376052878797054, + "rewards/margins": 5.974765777587891, + "rewards/rejected": -8.804250717163086, + "step": 306 + }, + { + "epoch": 0.6577396893411891, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 13.319843033839788, + "learning_rate": 5.229116945107398e-07, + "logits/chosen": 0.6567318439483643, + "logits/rejected": 0.7602465748786926, + "logps/accuracies": 1.0, + "logps/chosen": -388.985107421875, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -338.995361328125, + "logps/ref_rejected": -395.92230224609375, + "logps/rejected": -499.59552001953125, + "loss": 0.2511, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4994890689849854, + "rewards/grad_term": 0.0039435261860489845, + "rewards/margins": 2.684171438217163, + "rewards/rejected": -5.183660984039307, + "step": 307 + }, + { + "epoch": 0.6598821638993037, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.066998414831264, + "learning_rate": 5.220763723150358e-07, + "logits/chosen": 0.8940625786781311, + "logits/rejected": 0.903926432132721, + "logps/accuracies": 1.0, + "logps/chosen": -430.4771423339844, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -328.47900390625, + "logps/ref_rejected": -382.0928955078125, + "logps/rejected": -580.9340209960938, + "loss": 0.1645, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.099907875061035, + "rewards/grad_term": 0.0023625774774700403, + "rewards/margins": 4.842148780822754, + "rewards/rejected": -9.942056655883789, + "step": 308 + }, + { + "epoch": 0.6620246384574183, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.765055407846113, + "learning_rate": 5.212410501193317e-07, + "logits/chosen": 0.7993382215499878, + "logits/rejected": 0.6162198185920715, + "logps/accuracies": 0.75, + "logps/chosen": -455.43585205078125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -387.1122741699219, + "logps/ref_rejected": -310.28033447265625, + "logps/rejected": -487.04913330078125, + "loss": 0.1081, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4161789417266846, + "rewards/grad_term": 0.0006664180546067655, + "rewards/margins": 5.422262668609619, + "rewards/rejected": -8.838441848754883, + "step": 309 + }, + { + "epoch": 0.664167113015533, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 10.526420429572342, + "learning_rate": 5.204057279236276e-07, + "logits/chosen": 0.6536291241645813, + "logits/rejected": 0.7170487642288208, + "logps/accuracies": 1.0, + "logps/chosen": -343.4113464355469, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -269.9009704589844, + "logps/ref_rejected": -301.39697265625, + "logps/rejected": -437.5945129394531, + "loss": 0.1736, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6755175590515137, + "rewards/grad_term": 0.004929536487907171, + "rewards/margins": 3.1343586444854736, + "rewards/rejected": -6.809875965118408, + "step": 310 + }, + { + "epoch": 0.6663095875736476, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.317599300319795, + "learning_rate": 5.195704057279236e-07, + "logits/chosen": 0.7689659595489502, + "logits/rejected": 0.5763236284255981, + "logps/accuracies": 0.75, + "logps/chosen": -359.8375244140625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -294.6138916015625, + "logps/ref_rejected": -236.17083740234375, + "logps/rejected": -379.31927490234375, + "loss": 0.1194, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2611827850341797, + "rewards/grad_term": 0.003922306001186371, + "rewards/margins": 3.896237850189209, + "rewards/rejected": -7.157420635223389, + "step": 311 + }, + { + "epoch": 0.6684520621317622, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.491312588715147, + "learning_rate": 5.187350835322196e-07, + "logits/chosen": 0.6590454578399658, + "logits/rejected": 0.3621766269207001, + "logps/accuracies": 0.75, + "logps/chosen": -279.87823486328125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -219.8527374267578, + "logps/ref_rejected": -189.44700622558594, + "logps/rejected": -309.5084533691406, + "loss": 0.2011, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.001275062561035, + "rewards/grad_term": 0.0161147378385067, + "rewards/margins": 3.0017971992492676, + "rewards/rejected": -6.003072261810303, + "step": 312 + }, + { + "epoch": 0.6705945366898768, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.580563906265365, + "learning_rate": 5.178997613365155e-07, + "logits/chosen": 0.8489370942115784, + "logits/rejected": 0.7335869073867798, + "logps/accuracies": 0.75, + "logps/chosen": -423.52874755859375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -348.4372863769531, + "logps/ref_rejected": -324.8551025390625, + "logps/rejected": -563.6224975585938, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.754572629928589, + "rewards/grad_term": 0.00203010905534029, + "rewards/margins": 8.183797836303711, + "rewards/rejected": -11.938370704650879, + "step": 313 + }, + { + "epoch": 0.6727370112479915, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 8.743399053571068, + "learning_rate": 5.170644391408115e-07, + "logits/chosen": 0.940852165222168, + "logits/rejected": 0.7845810651779175, + "logps/accuracies": 0.25, + "logps/chosen": -512.4435424804688, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -435.47698974609375, + "logps/ref_rejected": -288.2704162597656, + "logps/rejected": -490.13836669921875, + "loss": 0.1653, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.84832763671875, + "rewards/grad_term": 0.00016607397992629558, + "rewards/margins": 6.24506950378418, + "rewards/rejected": -10.09339714050293, + "step": 314 + }, + { + "epoch": 0.674879485806106, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.878420297933516, + "learning_rate": 5.162291169451074e-07, + "logits/chosen": 0.7630524039268494, + "logits/rejected": 0.6900883913040161, + "logps/accuracies": 0.75, + "logps/chosen": -469.132080078125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -356.17041015625, + "logps/ref_rejected": -346.5165100097656, + "logps/rejected": -575.3904418945312, + "loss": 0.1708, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.648085594177246, + "rewards/grad_term": 0.0032002755906432867, + "rewards/margins": 5.795612335205078, + "rewards/rejected": -11.443696975708008, + "step": 315 + }, + { + "epoch": 0.6770219603642207, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.731907474004624, + "learning_rate": 5.153937947494033e-07, + "logits/chosen": 0.5569190979003906, + "logits/rejected": 0.6198952794075012, + "logps/accuracies": 1.0, + "logps/chosen": -371.6037292480469, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -293.22381591796875, + "logps/ref_rejected": -300.0587463378906, + "logps/rejected": -534.5460205078125, + "loss": 0.1858, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9189953804016113, + "rewards/grad_term": 0.00014798434858676046, + "rewards/margins": 7.8053669929504395, + "rewards/rejected": -11.72436237335205, + "step": 316 + }, + { + "epoch": 0.6791644349223352, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.47786260702623, + "learning_rate": 5.145584725536993e-07, + "logits/chosen": 0.8376766443252563, + "logits/rejected": 0.7569788098335266, + "logps/accuracies": 0.75, + "logps/chosen": -496.47161865234375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -384.5050048828125, + "logps/ref_rejected": -329.0260925292969, + "logps/rejected": -620.14892578125, + "loss": 0.1397, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.598330497741699, + "rewards/grad_term": 0.0014825062826275826, + "rewards/margins": 8.95781135559082, + "rewards/rejected": -14.556142807006836, + "step": 317 + }, + { + "epoch": 0.6813069094804499, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.306648680307601, + "learning_rate": 5.137231503579952e-07, + "logits/chosen": 0.6440654993057251, + "logits/rejected": 0.5563719868659973, + "logps/accuracies": 1.0, + "logps/chosen": -464.43914794921875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -345.64239501953125, + "logps/ref_rejected": -302.1645202636719, + "logps/rejected": -522.718505859375, + "loss": 0.1143, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.939836502075195, + "rewards/grad_term": 0.003235024632886052, + "rewards/margins": 5.087862968444824, + "rewards/rejected": -11.02769947052002, + "step": 318 + }, + { + "epoch": 0.6834493840385646, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.209754726841052, + "learning_rate": 5.128878281622912e-07, + "logits/chosen": 0.7397335171699524, + "logits/rejected": 0.6839704513549805, + "logps/accuracies": 0.75, + "logps/chosen": -304.3392639160156, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -254.27145385742188, + "logps/ref_rejected": -232.0565948486328, + "logps/rejected": -400.621337890625, + "loss": 0.179, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.503389835357666, + "rewards/grad_term": 0.005540979094803333, + "rewards/margins": 5.924847602844238, + "rewards/rejected": -8.428237915039062, + "step": 319 + }, + { + "epoch": 0.6855918585966791, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.033781332110546, + "learning_rate": 5.120525059665871e-07, + "logits/chosen": 0.7168871164321899, + "logits/rejected": 0.7381715774536133, + "logps/accuracies": 1.0, + "logps/chosen": -508.3505859375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -435.618896484375, + "logps/ref_rejected": -405.5649719238281, + "logps/rejected": -566.0382080078125, + "loss": 0.1406, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.636585235595703, + "rewards/grad_term": 0.0022972766309976578, + "rewards/margins": 4.387078762054443, + "rewards/rejected": -8.023663520812988, + "step": 320 + }, + { + "epoch": 0.6855918585966791, + "eval_flips/correct->correct": 0.14000000059604645, + "eval_flips/correct->incorrect": 0.019999999552965164, + "eval_flips/incorrect->correct": 0.5400000214576721, + "eval_flips/incorrect->incorrect": 0.30000001192092896, + "eval_logits/chosen": 0.7168383002281189, + "eval_logits/rejected": 0.599204957485199, + "eval_logps/accuracies": 0.6800000071525574, + "eval_logps/chosen": -391.7984619140625, + "eval_logps/ref_accuracies": 0.1599999964237213, + "eval_logps/ref_chosen": -323.51568603515625, + "eval_logps/ref_rejected": -258.70098876953125, + "eval_logps/rejected": -410.0682678222656, + "eval_loss": 0.17036853730678558, + "eval_rewards/accuracies": 0.8999999761581421, + "eval_rewards/chosen": -3.414141893386841, + "eval_rewards/grad_term": 0.007643704302608967, + "eval_rewards/margins": 4.154223442077637, + "eval_rewards/rejected": -7.568365097045898, + "eval_runtime": 374.585, + "eval_samples_per_second": 4.218, + "eval_steps_per_second": 0.133, + "step": 320 + }, + { + "epoch": 0.6877343331547938, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.190622967503248, + "learning_rate": 5.11217183770883e-07, + "logits/chosen": 0.8069337606430054, + "logits/rejected": 0.7580575942993164, + "logps/accuracies": 1.0, + "logps/chosen": -503.8867492675781, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -380.65765380859375, + "logps/ref_rejected": -327.27337646484375, + "logps/rejected": -596.4686279296875, + "loss": 0.1443, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.1614532470703125, + "rewards/grad_term": 0.00022571110457647592, + "rewards/margins": 7.298309326171875, + "rewards/rejected": -13.459762573242188, + "step": 321 + }, + { + "epoch": 0.6898768077129084, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 13.881500506001192, + "learning_rate": 5.10381861575179e-07, + "logits/chosen": 0.9965633749961853, + "logits/rejected": 0.7457981109619141, + "logps/accuracies": 0.75, + "logps/chosen": -474.2216491699219, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -421.9150390625, + "logps/ref_rejected": -353.48065185546875, + "logps/rejected": -548.97265625, + "loss": 0.1638, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6153299808502197, + "rewards/grad_term": 0.0007248412584885955, + "rewards/margins": 7.159272193908691, + "rewards/rejected": -9.774601936340332, + "step": 322 + }, + { + "epoch": 0.692019282271023, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.643673103869992, + "learning_rate": 5.095465393794749e-07, + "logits/chosen": 0.49757474660873413, + "logits/rejected": 0.41455477476119995, + "logps/accuracies": 0.5, + "logps/chosen": -357.2688293457031, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -293.4722900390625, + "logps/ref_rejected": -242.71273803710938, + "logps/rejected": -381.94488525390625, + "loss": 0.1431, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1898274421691895, + "rewards/grad_term": 0.0033772799652069807, + "rewards/margins": 3.771780490875244, + "rewards/rejected": -6.961607933044434, + "step": 323 + }, + { + "epoch": 0.6941617568291376, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.731140676983227, + "learning_rate": 5.087112171837709e-07, + "logits/chosen": 0.8970646858215332, + "logits/rejected": 0.7831761240959167, + "logps/accuracies": 1.0, + "logps/chosen": -555.628173828125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -494.30938720703125, + "logps/ref_rejected": -413.06451416015625, + "logps/rejected": -587.2745971679688, + "loss": 0.1652, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.065938711166382, + "rewards/grad_term": 0.002578580752015114, + "rewards/margins": 5.644565582275391, + "rewards/rejected": -8.710504531860352, + "step": 324 + }, + { + "epoch": 0.6963042313872523, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.5010699802203655, + "learning_rate": 5.078758949880667e-07, + "logits/chosen": 0.8070268630981445, + "logits/rejected": 0.6490368247032166, + "logps/accuracies": 1.0, + "logps/chosen": -577.15478515625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -460.09661865234375, + "logps/ref_rejected": -362.884033203125, + "logps/rejected": -633.048095703125, + "loss": 0.1418, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.852906227111816, + "rewards/grad_term": 0.00013052637223154306, + "rewards/margins": 7.655299186706543, + "rewards/rejected": -13.50820541381836, + "step": 325 + }, + { + "epoch": 0.698446705945367, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.546015605698586, + "learning_rate": 5.070405727923628e-07, + "logits/chosen": 0.5231800079345703, + "logits/rejected": 0.4889012575149536, + "logps/accuracies": 0.75, + "logps/chosen": -264.241455078125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -208.79318237304688, + "logps/ref_rejected": -202.1544189453125, + "logps/rejected": -345.9344787597656, + "loss": 0.1226, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7724146842956543, + "rewards/grad_term": 0.007728134281933308, + "rewards/margins": 4.41658878326416, + "rewards/rejected": -7.189002990722656, + "step": 326 + }, + { + "epoch": 0.7005891805034815, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 12.648714009679955, + "learning_rate": 5.062052505966587e-07, + "logits/chosen": 0.7718223333358765, + "logits/rejected": 0.7055037021636963, + "logps/accuracies": 0.75, + "logps/chosen": -459.9751892089844, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -358.0395812988281, + "logps/ref_rejected": -338.55364990234375, + "logps/rejected": -556.436279296875, + "loss": 0.1803, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.096780776977539, + "rewards/grad_term": 0.0066120820119977, + "rewards/margins": 5.797348976135254, + "rewards/rejected": -10.89413070678711, + "step": 327 + }, + { + "epoch": 0.7027316550615962, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.30205872449101, + "learning_rate": 5.053699284009546e-07, + "logits/chosen": 0.6625626683235168, + "logits/rejected": 0.6732759475708008, + "logps/accuracies": 0.75, + "logps/chosen": -315.85107421875, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -257.46136474609375, + "logps/ref_rejected": -273.6905212402344, + "logps/rejected": -384.72711181640625, + "loss": 0.1767, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.919485092163086, + "rewards/grad_term": 0.007900861091911793, + "rewards/margins": 2.6323447227478027, + "rewards/rejected": -5.551829814910889, + "step": 328 + }, + { + "epoch": 0.7048741296197107, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 10.156605830327571, + "learning_rate": 5.045346062052505e-07, + "logits/chosen": 0.7717033624649048, + "logits/rejected": 0.6601508855819702, + "logps/accuracies": 1.0, + "logps/chosen": -463.640380859375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -382.90325927734375, + "logps/ref_rejected": -358.9565734863281, + "logps/rejected": -566.164306640625, + "loss": 0.1797, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.036858558654785, + "rewards/grad_term": 0.0012353091733530164, + "rewards/margins": 6.323529243469238, + "rewards/rejected": -10.360387802124023, + "step": 329 + }, + { + "epoch": 0.7070166041778254, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.641900480934386, + "learning_rate": 5.036992840095465e-07, + "logits/chosen": 0.6714078187942505, + "logits/rejected": 0.5740436315536499, + "logps/accuracies": 1.0, + "logps/chosen": -453.4899597167969, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -362.24810791015625, + "logps/ref_rejected": -388.57666015625, + "logps/rejected": -607.985595703125, + "loss": 0.1501, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.562093257904053, + "rewards/grad_term": 0.011679948307573795, + "rewards/margins": 6.408352851867676, + "rewards/rejected": -10.97044563293457, + "step": 330 + }, + { + "epoch": 0.70915907873594, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.768091277395346, + "learning_rate": 5.028639618138425e-07, + "logits/chosen": 0.4849713146686554, + "logits/rejected": 0.3761657476425171, + "logps/accuracies": 0.75, + "logps/chosen": -472.632080078125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -394.3712158203125, + "logps/ref_rejected": -320.7559814453125, + "logps/rejected": -576.399658203125, + "loss": 0.1809, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9130444526672363, + "rewards/grad_term": 0.0002858239458873868, + "rewards/margins": 8.869141578674316, + "rewards/rejected": -12.782186508178711, + "step": 331 + }, + { + "epoch": 0.7113015532940546, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.361339079624875, + "learning_rate": 5.020286396181383e-07, + "logits/chosen": 0.7538321614265442, + "logits/rejected": 0.619208574295044, + "logps/accuracies": 0.5, + "logps/chosen": -538.201416015625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -406.4328308105469, + "logps/ref_rejected": -341.9276123046875, + "logps/rejected": -551.7299194335938, + "loss": 0.1106, + "rewards/accuracies": 0.75, + "rewards/chosen": -6.588429927825928, + "rewards/grad_term": 0.009247011505067348, + "rewards/margins": 3.9016823768615723, + "rewards/rejected": -10.4901123046875, + "step": 332 + }, + { + "epoch": 0.7134440278521692, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.211461201062549, + "learning_rate": 5.011933174224344e-07, + "logits/chosen": 0.9981447458267212, + "logits/rejected": 0.6901638507843018, + "logps/accuracies": 0.75, + "logps/chosen": -473.5837707519531, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -413.8009948730469, + "logps/ref_rejected": -332.0254211425781, + "logps/rejected": -507.0432434082031, + "loss": 0.1231, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9891393184661865, + "rewards/grad_term": 0.0005725694936700165, + "rewards/margins": 5.761752605438232, + "rewards/rejected": -8.75089168548584, + "step": 333 + }, + { + "epoch": 0.7155865024102839, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.170697657476269, + "learning_rate": 5.003579952267303e-07, + "logits/chosen": 0.826168954372406, + "logits/rejected": 0.6049452424049377, + "logps/accuracies": 0.5, + "logps/chosen": -268.6787109375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -210.28704833984375, + "logps/ref_rejected": -152.79354858398438, + "logps/rejected": -252.23291015625, + "loss": 0.1311, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.919583797454834, + "rewards/grad_term": 0.009706183336675167, + "rewards/margins": 2.0523836612701416, + "rewards/rejected": -4.9719672203063965, + "step": 334 + }, + { + "epoch": 0.7177289769683985, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.060208207763219, + "learning_rate": 4.995226730310263e-07, + "logits/chosen": 0.8894015550613403, + "logits/rejected": 0.6564974784851074, + "logps/accuracies": 1.0, + "logps/chosen": -500.45635986328125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -414.86962890625, + "logps/ref_rejected": -351.6375732421875, + "logps/rejected": -605.1538696289062, + "loss": 0.1278, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2793378829956055, + "rewards/grad_term": 0.0034387409687042236, + "rewards/margins": 8.396476745605469, + "rewards/rejected": -12.675814628601074, + "step": 335 + }, + { + "epoch": 0.7198714515265131, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.25, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.56000560457653, + "learning_rate": 4.986873508353221e-07, + "logits/chosen": 0.6553113460540771, + "logits/rejected": 0.6264476180076599, + "logps/accuracies": 0.75, + "logps/chosen": -260.0503234863281, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -188.68035888671875, + "logps/ref_rejected": -198.7422332763672, + "logps/rejected": -327.9527282714844, + "loss": 0.157, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.5684990882873535, + "rewards/grad_term": 0.014257272705435753, + "rewards/margins": 2.8920247554779053, + "rewards/rejected": -6.46052360534668, + "step": 336 + }, + { + "epoch": 0.7220139260846278, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.247249685460771, + "learning_rate": 4.978520286396182e-07, + "logits/chosen": 0.8244426250457764, + "logits/rejected": 0.6587880849838257, + "logps/accuracies": 0.75, + "logps/chosen": -550.476318359375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -413.40692138671875, + "logps/ref_rejected": -312.9790954589844, + "logps/rejected": -601.8764038085938, + "loss": 0.1166, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.853466987609863, + "rewards/grad_term": 0.0013105407124385238, + "rewards/margins": 7.591399192810059, + "rewards/rejected": -14.444866180419922, + "step": 337 + }, + { + "epoch": 0.7241564006427423, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.3962044900308355, + "learning_rate": 4.970167064439141e-07, + "logits/chosen": 0.36316683888435364, + "logits/rejected": 0.26318785548210144, + "logps/accuracies": 0.5, + "logps/chosen": -459.98480224609375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -393.48419189453125, + "logps/ref_rejected": -242.56094360351562, + "logps/rejected": -432.6356506347656, + "loss": 0.1479, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3250298500061035, + "rewards/grad_term": 0.0004948938149027526, + "rewards/margins": 6.178703784942627, + "rewards/rejected": -9.50373363494873, + "step": 338 + }, + { + "epoch": 0.726298875200857, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.583520613174011, + "learning_rate": 4.9618138424821e-07, + "logits/chosen": 0.6698114275932312, + "logits/rejected": 0.6079827547073364, + "logps/accuracies": 1.0, + "logps/chosen": -316.6778869628906, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -285.0715637207031, + "logps/ref_rejected": -286.4187316894531, + "logps/rejected": -387.62554931640625, + "loss": 0.1442, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5803159475326538, + "rewards/grad_term": 0.007696358487010002, + "rewards/margins": 3.480024814605713, + "rewards/rejected": -5.060340881347656, + "step": 339 + }, + { + "epoch": 0.7284413497589716, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.838033966465567, + "learning_rate": 4.953460620525059e-07, + "logits/chosen": 0.7049505710601807, + "logits/rejected": 0.6646623015403748, + "logps/accuracies": 1.0, + "logps/chosen": -391.0535888671875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -312.54132080078125, + "logps/ref_rejected": -331.8018798828125, + "logps/rejected": -517.3717041015625, + "loss": 0.146, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.925614833831787, + "rewards/grad_term": 0.007477066479623318, + "rewards/margins": 5.35287618637085, + "rewards/rejected": -9.278491020202637, + "step": 340 + }, + { + "epoch": 0.7305838243170862, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.719543349851092, + "learning_rate": 4.945107398568019e-07, + "logits/chosen": 0.6794430017471313, + "logits/rejected": 0.5714715719223022, + "logps/accuracies": 1.0, + "logps/chosen": -349.4491271972656, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -272.2052001953125, + "logps/ref_rejected": -253.03768920898438, + "logps/rejected": -437.6853942871094, + "loss": 0.1376, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.862197160720825, + "rewards/grad_term": 0.004704746417701244, + "rewards/margins": 5.3701887130737305, + "rewards/rejected": -9.232385635375977, + "step": 341 + }, + { + "epoch": 0.7327262988752009, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.451256306905016, + "learning_rate": 4.936754176610979e-07, + "logits/chosen": 0.6575830578804016, + "logits/rejected": 0.6638262271881104, + "logps/accuracies": 0.75, + "logps/chosen": -314.11114501953125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -257.2018737792969, + "logps/ref_rejected": -241.7303009033203, + "logps/rejected": -397.7080078125, + "loss": 0.1602, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.845463275909424, + "rewards/grad_term": 0.009044932201504707, + "rewards/margins": 4.953423023223877, + "rewards/rejected": -7.798886299133301, + "step": 342 + }, + { + "epoch": 0.7348687734333155, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.459693770757402, + "learning_rate": 4.928400954653937e-07, + "logits/chosen": 0.529110312461853, + "logits/rejected": 0.46302855014801025, + "logps/accuracies": 0.75, + "logps/chosen": -384.07061767578125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -316.386962890625, + "logps/ref_rejected": -298.5597839355469, + "logps/rejected": -452.8936767578125, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3841824531555176, + "rewards/grad_term": 0.0015116020804271102, + "rewards/margins": 4.332512378692627, + "rewards/rejected": -7.7166948318481445, + "step": 343 + }, + { + "epoch": 0.7370112479914301, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.920848833475498, + "learning_rate": 4.920047732696897e-07, + "logits/chosen": 0.7953596711158752, + "logits/rejected": 0.7348934412002563, + "logps/accuracies": 0.75, + "logps/chosen": -594.716552734375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -465.59210205078125, + "logps/ref_rejected": -436.11572265625, + "logps/rejected": -708.698974609375, + "loss": 0.1812, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.456222057342529, + "rewards/grad_term": 0.0025254576466977596, + "rewards/margins": 7.172940731048584, + "rewards/rejected": -13.629162788391113, + "step": 344 + }, + { + "epoch": 0.7391537225495447, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.924633562541802, + "learning_rate": 4.911694510739857e-07, + "logits/chosen": 0.5842578411102295, + "logits/rejected": 0.6216070652008057, + "logps/accuracies": 1.0, + "logps/chosen": -463.14154052734375, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -373.7330322265625, + "logps/ref_rejected": -408.91278076171875, + "logps/rejected": -685.0960693359375, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.470427513122559, + "rewards/grad_term": 0.0002844279515556991, + "rewards/margins": 9.338738441467285, + "rewards/rejected": -13.809165000915527, + "step": 345 + }, + { + "epoch": 0.7412961971076594, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 12.414755725942422, + "learning_rate": 4.903341288782816e-07, + "logits/chosen": 0.8328725099563599, + "logits/rejected": 0.7645402550697327, + "logps/accuracies": 0.75, + "logps/chosen": -414.1654968261719, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -368.20654296875, + "logps/ref_rejected": -308.8746337890625, + "logps/rejected": -470.432373046875, + "loss": 0.2259, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.297947883605957, + "rewards/grad_term": 0.00027060159482061863, + "rewards/margins": 5.779940605163574, + "rewards/rejected": -8.077888488769531, + "step": 346 + }, + { + "epoch": 0.7434386716657739, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.169966513004652, + "learning_rate": 4.894988066825775e-07, + "logits/chosen": 0.554535984992981, + "logits/rejected": 0.265200138092041, + "logps/accuracies": 0.75, + "logps/chosen": -356.061767578125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -315.3848876953125, + "logps/ref_rejected": -268.03143310546875, + "logps/rejected": -405.561767578125, + "loss": 0.1324, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.033844470977783, + "rewards/grad_term": 0.005571221467107534, + "rewards/margins": 4.842672348022461, + "rewards/rejected": -6.876516342163086, + "step": 347 + }, + { + "epoch": 0.7455811462238886, + "flips/correct->correct": 1.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.180681687024289, + "learning_rate": 4.886634844868734e-07, + "logits/chosen": 0.6118199229240417, + "logits/rejected": 0.6814651489257812, + "logps/accuracies": 1.0, + "logps/chosen": -246.84640502929688, + "logps/ref_accuracies": 1.0, + "logps/ref_chosen": -195.61514282226562, + "logps/ref_rejected": -214.57460021972656, + "logps/rejected": -361.24395751953125, + "loss": 0.1331, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5615625381469727, + "rewards/grad_term": 0.007279230747371912, + "rewards/margins": 4.77190637588501, + "rewards/rejected": -7.333469390869141, + "step": 348 + }, + { + "epoch": 0.7477236207820032, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.210731357114345, + "learning_rate": 4.878281622911695e-07, + "logits/chosen": 1.0106936693191528, + "logits/rejected": 0.45131033658981323, + "logps/accuracies": 0.5, + "logps/chosen": -454.847412109375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -324.7191162109375, + "logps/ref_rejected": -223.54873657226562, + "logps/rejected": -440.8221740722656, + "loss": 0.1168, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.506414890289307, + "rewards/grad_term": 0.009222909808158875, + "rewards/margins": 4.357255935668945, + "rewards/rejected": -10.86367130279541, + "step": 349 + }, + { + "epoch": 0.7498660953401178, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.362818321342788, + "learning_rate": 4.869928400954653e-07, + "logits/chosen": 0.586777925491333, + "logits/rejected": 0.5780112147331238, + "logps/accuracies": 1.0, + "logps/chosen": -262.08038330078125, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -232.11459350585938, + "logps/ref_rejected": -231.54193115234375, + "logps/rejected": -376.2002868652344, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4982905387878418, + "rewards/grad_term": 0.006848993711173534, + "rewards/margins": 5.734626770019531, + "rewards/rejected": -7.232917785644531, + "step": 350 + }, + { + "epoch": 0.7520085698982325, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.297264703165888, + "learning_rate": 4.861575178997613e-07, + "logits/chosen": 0.5312216877937317, + "logits/rejected": 0.5838853716850281, + "logps/accuracies": 1.0, + "logps/chosen": -411.0349426269531, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -309.9207763671875, + "logps/ref_rejected": -289.38275146484375, + "logps/rejected": -511.20758056640625, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.055708885192871, + "rewards/grad_term": 0.0036189735401421785, + "rewards/margins": 6.035533428192139, + "rewards/rejected": -11.091242790222168, + "step": 351 + }, + { + "epoch": 0.7541510444563471, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.720091390841414, + "learning_rate": 4.853221957040573e-07, + "logits/chosen": 0.8272877931594849, + "logits/rejected": 0.6150888204574585, + "logps/accuracies": 0.75, + "logps/chosen": -382.7689514160156, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -322.699951171875, + "logps/ref_rejected": -250.23941040039062, + "logps/rejected": -417.6976623535156, + "loss": 0.1311, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0034492015838623, + "rewards/grad_term": 0.0010191942565143108, + "rewards/margins": 5.369463920593262, + "rewards/rejected": -8.372913360595703, + "step": 352 + }, + { + "epoch": 0.7562935190144617, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.819345327687853, + "learning_rate": 4.844868735083532e-07, + "logits/chosen": 0.8716313242912292, + "logits/rejected": 0.7711046934127808, + "logps/accuracies": 0.75, + "logps/chosen": -352.4783020019531, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -285.69561767578125, + "logps/ref_rejected": -241.1641845703125, + "logps/rejected": -428.366943359375, + "loss": 0.131, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3391335010528564, + "rewards/grad_term": 0.006919885985553265, + "rewards/margins": 6.021005153656006, + "rewards/rejected": -9.360138893127441, + "step": 353 + }, + { + "epoch": 0.7584359935725763, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.544013532966439, + "learning_rate": 4.836515513126491e-07, + "logits/chosen": 0.8605263233184814, + "logits/rejected": 0.7548648715019226, + "logps/accuracies": 0.75, + "logps/chosen": -513.380126953125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -414.0445861816406, + "logps/ref_rejected": -390.1202392578125, + "logps/rejected": -633.4596557617188, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.966775417327881, + "rewards/grad_term": 0.005854703951627016, + "rewards/margins": 7.2001953125, + "rewards/rejected": -12.166970252990723, + "step": 354 + }, + { + "epoch": 0.760578468130691, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.840668647752036, + "learning_rate": 4.82816229116945e-07, + "logits/chosen": 0.49805212020874023, + "logits/rejected": 0.5557568669319153, + "logps/accuracies": 0.75, + "logps/chosen": -327.806884765625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -277.4596252441406, + "logps/ref_rejected": -261.7403869628906, + "logps/rejected": -415.97454833984375, + "loss": 0.083, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5173633098602295, + "rewards/grad_term": 0.0034056631848216057, + "rewards/margins": 5.194344997406006, + "rewards/rejected": -7.711708068847656, + "step": 355 + }, + { + "epoch": 0.7627209426888055, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.886486362606452, + "learning_rate": 4.819809069212411e-07, + "logits/chosen": 0.523609459400177, + "logits/rejected": 0.3943213224411011, + "logps/accuracies": 1.0, + "logps/chosen": -449.4044494628906, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -386.558349609375, + "logps/ref_rejected": -362.3239440917969, + "logps/rejected": -540.4671020507812, + "loss": 0.1275, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1423020362854004, + "rewards/grad_term": 0.000673395290505141, + "rewards/margins": 5.76485538482666, + "rewards/rejected": -8.907157897949219, + "step": 356 + }, + { + "epoch": 0.7648634172469202, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.772834608130855, + "learning_rate": 4.811455847255369e-07, + "logits/chosen": 0.7413110733032227, + "logits/rejected": 0.6600308418273926, + "logps/accuracies": 0.75, + "logps/chosen": -372.83050537109375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -308.9735107421875, + "logps/ref_rejected": -233.7576904296875, + "logps/rejected": -412.01788330078125, + "loss": 0.107, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.192850351333618, + "rewards/grad_term": 0.0023573609068989754, + "rewards/margins": 5.720157623291016, + "rewards/rejected": -8.913007736206055, + "step": 357 + }, + { + "epoch": 0.7670058918050349, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.586255491609359, + "learning_rate": 4.803102625298329e-07, + "logits/chosen": 0.7194008827209473, + "logits/rejected": 0.529259443283081, + "logps/accuracies": 0.25, + "logps/chosen": -470.25994873046875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -412.6962890625, + "logps/ref_rejected": -344.50018310546875, + "logps/rejected": -454.2938537597656, + "loss": 0.1335, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8781819343566895, + "rewards/grad_term": 0.006395381409674883, + "rewards/margins": 2.6115007400512695, + "rewards/rejected": -5.489683151245117, + "step": 358 + }, + { + "epoch": 0.7691483663631494, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.427948430933896, + "learning_rate": 4.794749403341288e-07, + "logits/chosen": 0.5997850298881531, + "logits/rejected": 0.39114004373550415, + "logps/accuracies": 0.5, + "logps/chosen": -450.5962219238281, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -360.58770751953125, + "logps/ref_rejected": -266.60211181640625, + "logps/rejected": -491.2527770996094, + "loss": 0.1088, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.500424861907959, + "rewards/grad_term": 0.0012759572127833962, + "rewards/margins": 6.7321085929870605, + "rewards/rejected": -11.232534408569336, + "step": 359 + }, + { + "epoch": 0.7712908409212641, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.10147872579214, + "learning_rate": 4.786396181384249e-07, + "logits/chosen": 0.6526762247085571, + "logits/rejected": 0.406318336725235, + "logps/accuracies": 0.75, + "logps/chosen": -362.89215087890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -327.1744079589844, + "logps/ref_rejected": -287.7540588378906, + "logps/rejected": -391.6522216796875, + "loss": 0.1822, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7858877182006836, + "rewards/grad_term": 0.003739753272384405, + "rewards/margins": 3.4090189933776855, + "rewards/rejected": -5.194906234741211, + "step": 360 + }, + { + "epoch": 0.7734333154793787, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.009801801350886, + "learning_rate": 4.778042959427207e-07, + "logits/chosen": 0.9286985993385315, + "logits/rejected": 0.75478196144104, + "logps/accuracies": 0.75, + "logps/chosen": -341.165771484375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -286.45068359375, + "logps/ref_rejected": -226.50819396972656, + "logps/rejected": -388.7208251953125, + "loss": 0.1502, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7357535362243652, + "rewards/grad_term": 0.005150636192411184, + "rewards/margins": 5.374879360198975, + "rewards/rejected": -8.11063289642334, + "step": 361 + }, + { + "epoch": 0.7755757900374933, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.079915978073272, + "learning_rate": 4.769689737470167e-07, + "logits/chosen": 0.15154126286506653, + "logits/rejected": 0.23431162536144257, + "logps/accuracies": 0.75, + "logps/chosen": -141.20352172851562, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -122.65444946289062, + "logps/ref_rejected": -115.94908142089844, + "logps/rejected": -179.49026489257812, + "loss": 0.1532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9274539351463318, + "rewards/grad_term": 0.005933484528213739, + "rewards/margins": 2.249605417251587, + "rewards/rejected": -3.1770591735839844, + "step": 362 + }, + { + "epoch": 0.7777182645956079, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.620781698654811, + "learning_rate": 4.7613365155131265e-07, + "logits/chosen": 0.6806411147117615, + "logits/rejected": 0.5877007842063904, + "logps/accuracies": 1.0, + "logps/chosen": -305.52105712890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -225.72100830078125, + "logps/ref_rejected": -230.88204956054688, + "logps/rejected": -416.0865783691406, + "loss": 0.1264, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9900035858154297, + "rewards/grad_term": 0.0007602861733175814, + "rewards/margins": 5.270223617553711, + "rewards/rejected": -9.26022720336914, + "step": 363 + }, + { + "epoch": 0.7798607391537226, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.162163225237173, + "learning_rate": 4.752983293556086e-07, + "logits/chosen": 0.3107702434062958, + "logits/rejected": 0.46145111322402954, + "logps/accuracies": 0.75, + "logps/chosen": -376.48138427734375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -298.7530517578125, + "logps/ref_rejected": -248.01364135742188, + "logps/rejected": -483.236572265625, + "loss": 0.1518, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8864171504974365, + "rewards/grad_term": 0.0027246952522546053, + "rewards/margins": 7.874729156494141, + "rewards/rejected": -11.761146545410156, + "step": 364 + }, + { + "epoch": 0.7820032137118371, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.682442207333185, + "learning_rate": 4.744630071599045e-07, + "logits/chosen": 0.8286511301994324, + "logits/rejected": 0.7338289618492126, + "logps/accuracies": 0.75, + "logps/chosen": -475.2028503417969, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -419.9889221191406, + "logps/ref_rejected": -337.7398681640625, + "logps/rejected": -524.1643676757812, + "loss": 0.1149, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7606959342956543, + "rewards/grad_term": 0.0009419883135706186, + "rewards/margins": 6.5605316162109375, + "rewards/rejected": -9.321227073669434, + "step": 365 + }, + { + "epoch": 0.7841456882699518, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.969521309776873, + "learning_rate": 4.736276849642005e-07, + "logits/chosen": 0.4039073884487152, + "logits/rejected": 0.4401341676712036, + "logps/accuracies": 0.75, + "logps/chosen": -332.14044189453125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -283.09283447265625, + "logps/ref_rejected": -206.31539916992188, + "logps/rejected": -350.80340576171875, + "loss": 0.1418, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4523794651031494, + "rewards/grad_term": 0.0033193090930581093, + "rewards/margins": 4.772021293640137, + "rewards/rejected": -7.224400997161865, + "step": 366 + }, + { + "epoch": 0.7862881628280665, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.54876490811146, + "learning_rate": 4.727923627684964e-07, + "logits/chosen": 0.3236476182937622, + "logits/rejected": 0.3070971667766571, + "logps/accuracies": 0.5, + "logps/chosen": -317.3150329589844, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -236.6580810546875, + "logps/ref_rejected": -158.20616149902344, + "logps/rejected": -292.539794921875, + "loss": 0.1308, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.0328474044799805, + "rewards/grad_term": 0.016018129885196686, + "rewards/margins": 2.683833599090576, + "rewards/rejected": -6.716681003570557, + "step": 367 + }, + { + "epoch": 0.788430637386181, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.13550622656799, + "learning_rate": 4.7195704057279233e-07, + "logits/chosen": 0.762974739074707, + "logits/rejected": 0.506515622138977, + "logps/accuracies": 0.5, + "logps/chosen": -476.596923828125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -387.86663818359375, + "logps/ref_rejected": -268.920166015625, + "logps/rejected": -499.3272705078125, + "loss": 0.1571, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4365129470825195, + "rewards/grad_term": 0.0013110407162457705, + "rewards/margins": 7.083842754364014, + "rewards/rejected": -11.520356178283691, + "step": 368 + }, + { + "epoch": 0.7905731119442957, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.573319632626368, + "learning_rate": 4.7112171837708825e-07, + "logits/chosen": 0.9512593150138855, + "logits/rejected": 0.7041115164756775, + "logps/accuracies": 0.75, + "logps/chosen": -443.4069519042969, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -363.83258056640625, + "logps/ref_rejected": -308.1307373046875, + "logps/rejected": -442.51458740234375, + "loss": 0.1507, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9787182807922363, + "rewards/grad_term": 0.005531268659979105, + "rewards/margins": 2.7404749393463135, + "rewards/rejected": -6.719193458557129, + "step": 369 + }, + { + "epoch": 0.7927155865024103, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 5.584409310189419, + "learning_rate": 4.7028639618138423e-07, + "logits/chosen": 0.6789947152137756, + "logits/rejected": 0.46235227584838867, + "logps/accuracies": 0.5, + "logps/chosen": -433.4268798828125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -329.707763671875, + "logps/ref_rejected": -254.446533203125, + "logps/rejected": -444.1405334472656, + "loss": 0.1189, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.185955047607422, + "rewards/grad_term": 0.010698116384446621, + "rewards/margins": 4.2987446784973145, + "rewards/rejected": -9.484700202941895, + "step": 370 + }, + { + "epoch": 0.7948580610605249, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.82663440003344, + "learning_rate": 4.694510739856802e-07, + "logits/chosen": 0.8684903383255005, + "logits/rejected": 0.864480197429657, + "logps/accuracies": 1.0, + "logps/chosen": -616.7009887695312, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -439.9715270996094, + "logps/ref_rejected": -483.77447509765625, + "logps/rejected": -717.4370727539062, + "loss": 0.1749, + "rewards/accuracies": 0.75, + "rewards/chosen": -8.83647346496582, + "rewards/grad_term": 0.009361391887068748, + "rewards/margins": 2.84665846824646, + "rewards/rejected": -11.68313217163086, + "step": 371 + }, + { + "epoch": 0.7970005356186395, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.700184569183158, + "learning_rate": 4.686157517899761e-07, + "logits/chosen": 0.3798208236694336, + "logits/rejected": 0.18705379962921143, + "logps/accuracies": 0.75, + "logps/chosen": -290.2025451660156, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -256.36285400390625, + "logps/ref_rejected": -187.69712829589844, + "logps/rejected": -312.7464599609375, + "loss": 0.1903, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6919848918914795, + "rewards/grad_term": 0.0075987353920936584, + "rewards/margins": 4.560481548309326, + "rewards/rejected": -6.252466201782227, + "step": 372 + }, + { + "epoch": 0.7991430101767542, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.246215247843911, + "learning_rate": 4.6778042959427206e-07, + "logits/chosen": 0.8069986701011658, + "logits/rejected": 0.7122650146484375, + "logps/accuracies": 1.0, + "logps/chosen": -267.30328369140625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -210.80979919433594, + "logps/ref_rejected": -209.55477905273438, + "logps/rejected": -342.04205322265625, + "loss": 0.1123, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8246755599975586, + "rewards/grad_term": 0.003649149788543582, + "rewards/margins": 3.799687385559082, + "rewards/rejected": -6.624362468719482, + "step": 373 + }, + { + "epoch": 0.8012854847348688, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.128291731066904, + "learning_rate": 4.6694510739856804e-07, + "logits/chosen": 0.8819460868835449, + "logits/rejected": 0.760127067565918, + "logps/accuracies": 0.75, + "logps/chosen": -614.6334228515625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -528.1080932617188, + "logps/ref_rejected": -390.6625061035156, + "logps/rejected": -600.1669921875, + "loss": 0.1455, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.326266288757324, + "rewards/grad_term": 0.0019579888321459293, + "rewards/margins": 6.148959159851074, + "rewards/rejected": -10.475224494934082, + "step": 374 + }, + { + "epoch": 0.8034279592929834, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 15.828594310560685, + "learning_rate": 4.661097852028639e-07, + "logits/chosen": 0.8489161729812622, + "logits/rejected": 0.753060519695282, + "logps/accuracies": 0.5, + "logps/chosen": -382.72735595703125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -323.9566650390625, + "logps/ref_rejected": -298.6670227050781, + "logps/rejected": -467.1494140625, + "loss": 0.1952, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.938535213470459, + "rewards/grad_term": 0.004120782017707825, + "rewards/margins": 5.485583305358887, + "rewards/rejected": -8.424118041992188, + "step": 375 + }, + { + "epoch": 0.8055704338510981, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 10.583125585454663, + "learning_rate": 4.652744630071599e-07, + "logits/chosen": 0.5661925673484802, + "logits/rejected": 0.3597102761268616, + "logps/accuracies": 0.5, + "logps/chosen": -306.2489318847656, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -252.90496826171875, + "logps/ref_rejected": -201.43569946289062, + "logps/rejected": -383.89324951171875, + "loss": 0.1964, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6671996116638184, + "rewards/grad_term": 0.0024612259585410357, + "rewards/margins": 6.455678462982178, + "rewards/rejected": -9.122878074645996, + "step": 376 + }, + { + "epoch": 0.8077129084092126, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.4392788570254815, + "learning_rate": 4.644391408114558e-07, + "logits/chosen": 0.3249109983444214, + "logits/rejected": 0.32942840456962585, + "logps/accuracies": 0.75, + "logps/chosen": -448.28216552734375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -387.504638671875, + "logps/ref_rejected": -307.7704162597656, + "logps/rejected": -473.29437255859375, + "loss": 0.1221, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0388755798339844, + "rewards/grad_term": 0.0016592949395999312, + "rewards/margins": 5.237322807312012, + "rewards/rejected": -8.276198387145996, + "step": 377 + }, + { + "epoch": 0.8098553829673273, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.329181119972007, + "learning_rate": 4.636038186157518e-07, + "logits/chosen": 0.404056191444397, + "logits/rejected": 0.6022278070449829, + "logps/accuracies": 1.0, + "logps/chosen": -382.3240051269531, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -305.3075256347656, + "logps/ref_rejected": -321.82904052734375, + "logps/rejected": -545.7882080078125, + "loss": 0.1446, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8508248329162598, + "rewards/grad_term": 0.0002711013949010521, + "rewards/margins": 7.347134590148926, + "rewards/rejected": -11.197959899902344, + "step": 378 + }, + { + "epoch": 0.8119978575254418, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.175949590736797, + "learning_rate": 4.6276849642004767e-07, + "logits/chosen": 0.751959502696991, + "logits/rejected": 0.5633019804954529, + "logps/accuracies": 1.0, + "logps/chosen": -502.3103942871094, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -433.314697265625, + "logps/ref_rejected": -358.736083984375, + "logps/rejected": -592.6541748046875, + "loss": 0.1823, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.449784278869629, + "rewards/grad_term": 4.590416210703552e-05, + "rewards/margins": 8.24611759185791, + "rewards/rejected": -11.695901870727539, + "step": 379 + }, + { + "epoch": 0.8141403320835565, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.61637016421897, + "learning_rate": 4.6193317422434364e-07, + "logits/chosen": 0.5796254873275757, + "logits/rejected": 0.6662420630455017, + "logps/accuracies": 1.0, + "logps/chosen": -412.05316162109375, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -339.7696533203125, + "logps/ref_rejected": -340.8929748535156, + "logps/rejected": -524.9725341796875, + "loss": 0.1719, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.61417555809021, + "rewards/grad_term": 0.0034167964477092028, + "rewards/margins": 5.589802265167236, + "rewards/rejected": -9.203977584838867, + "step": 380 + }, + { + "epoch": 0.8162828066416711, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.094201585504359, + "learning_rate": 4.610978520286396e-07, + "logits/chosen": 0.7151302695274353, + "logits/rejected": 0.6419375538825989, + "logps/accuracies": 1.0, + "logps/chosen": -305.9559326171875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -270.7724609375, + "logps/ref_rejected": -270.0223083496094, + "logps/rejected": -445.2619934082031, + "loss": 0.125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7591745853424072, + "rewards/grad_term": 0.0021109317895025015, + "rewards/margins": 7.002810001373291, + "rewards/rejected": -8.761983871459961, + "step": 381 + }, + { + "epoch": 0.8184252811997857, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 14.362881277342565, + "learning_rate": 4.602625298329356e-07, + "logits/chosen": 0.8437117338180542, + "logits/rejected": 0.7384243607521057, + "logps/accuracies": 0.5, + "logps/chosen": -461.12603759765625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -386.73773193359375, + "logps/ref_rejected": -304.27459716796875, + "logps/rejected": -513.4872436523438, + "loss": 0.1601, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7194161415100098, + "rewards/grad_term": 0.00258549302816391, + "rewards/margins": 6.741215705871582, + "rewards/rejected": -10.46063232421875, + "step": 382 + }, + { + "epoch": 0.8205677557579004, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.456611514133404, + "learning_rate": 4.594272076372315e-07, + "logits/chosen": 0.7477813363075256, + "logits/rejected": 0.3820553123950958, + "logps/accuracies": 0.75, + "logps/chosen": -507.67462158203125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -389.72613525390625, + "logps/ref_rejected": -367.64569091796875, + "logps/rejected": -579.4849243164062, + "loss": 0.1604, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.89742374420166, + "rewards/grad_term": 0.007863424718379974, + "rewards/margins": 4.694538593292236, + "rewards/rejected": -10.591961860656738, + "step": 383 + }, + { + "epoch": 0.822710230316015, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.8619420532382325, + "learning_rate": 4.5859188544152745e-07, + "logits/chosen": 0.7787905931472778, + "logits/rejected": 0.767812967300415, + "logps/accuracies": 0.75, + "logps/chosen": -384.25, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -312.675537109375, + "logps/ref_rejected": -270.84429931640625, + "logps/rejected": -437.036376953125, + "loss": 0.1155, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5787243843078613, + "rewards/grad_term": 0.003301040269434452, + "rewards/margins": 4.730880260467529, + "rewards/rejected": -8.30960464477539, + "step": 384 + }, + { + "epoch": 0.822710230316015, + "eval_flips/correct->correct": 0.14000000059604645, + "eval_flips/correct->incorrect": 0.019999999552965164, + "eval_flips/incorrect->correct": 0.5400000214576721, + "eval_flips/incorrect->incorrect": 0.30000001192092896, + "eval_logits/chosen": 0.6912816762924194, + "eval_logits/rejected": 0.5833743810653687, + "eval_logps/accuracies": 0.6800000071525574, + "eval_logps/chosen": -383.8902282714844, + "eval_logps/ref_accuracies": 0.1599999964237213, + "eval_logps/ref_chosen": -323.51568603515625, + "eval_logps/ref_rejected": -258.70098876953125, + "eval_logps/rejected": -404.5517578125, + "eval_loss": 0.15501657128334045, + "eval_rewards/accuracies": 0.8799999952316284, + "eval_rewards/chosen": -3.018728256225586, + "eval_rewards/grad_term": 0.00715669384226203, + "eval_rewards/margins": 4.273808002471924, + "eval_rewards/rejected": -7.292536735534668, + "eval_runtime": 372.7419, + "eval_samples_per_second": 4.239, + "eval_steps_per_second": 0.134, + "step": 384 + }, + { + "epoch": 0.8248527048741296, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 8.34443643722511, + "learning_rate": 4.577565632458234e-07, + "logits/chosen": 0.6030393242835999, + "logits/rejected": 0.5619946718215942, + "logps/accuracies": 0.5, + "logps/chosen": -445.7275695800781, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -341.169921875, + "logps/ref_rejected": -280.4991455078125, + "logps/rejected": -452.7607116699219, + "loss": 0.1376, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.227883815765381, + "rewards/grad_term": 0.003972820471972227, + "rewards/margins": 3.3851945400238037, + "rewards/rejected": -8.613078117370605, + "step": 385 + }, + { + "epoch": 0.8269951794322442, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.067319624450543, + "learning_rate": 4.569212410501193e-07, + "logits/chosen": 0.6664745211601257, + "logits/rejected": 0.6004454493522644, + "logps/accuracies": 0.5, + "logps/chosen": -461.43505859375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -375.47607421875, + "logps/ref_rejected": -329.8832092285156, + "logps/rejected": -491.4964599609375, + "loss": 0.1263, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.297948360443115, + "rewards/grad_term": 0.006247952580451965, + "rewards/margins": 3.7827141284942627, + "rewards/rejected": -8.080662727355957, + "step": 386 + }, + { + "epoch": 0.8291376539903589, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.42647077537532, + "learning_rate": 4.5608591885441523e-07, + "logits/chosen": 0.6861754655838013, + "logits/rejected": 0.39711299538612366, + "logps/accuracies": 0.75, + "logps/chosen": -266.0076904296875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -222.40213012695312, + "logps/ref_rejected": -151.0525665283203, + "logps/rejected": -257.8247985839844, + "loss": 0.1474, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1802759170532227, + "rewards/grad_term": 0.0029023890383541584, + "rewards/margins": 3.1583352088928223, + "rewards/rejected": -5.338611125946045, + "step": 387 + }, + { + "epoch": 0.8312801285484734, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.54998768600324, + "learning_rate": 4.552505966587112e-07, + "logits/chosen": 0.7813968658447266, + "logits/rejected": 0.675197184085846, + "logps/accuracies": 0.75, + "logps/chosen": -384.80474853515625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -335.5448913574219, + "logps/ref_rejected": -288.6382141113281, + "logps/rejected": -429.424072265625, + "loss": 0.105, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.462993621826172, + "rewards/grad_term": 0.004009037744253874, + "rewards/margins": 4.57629919052124, + "rewards/rejected": -7.039292335510254, + "step": 388 + }, + { + "epoch": 0.8334226031065881, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.3771391402744175, + "learning_rate": 4.544152744630072e-07, + "logits/chosen": 0.5336940884590149, + "logits/rejected": 0.61911940574646, + "logps/accuracies": 1.0, + "logps/chosen": -435.4273986816406, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -310.7679443359375, + "logps/ref_rejected": -220.2869110107422, + "logps/rejected": -539.0606079101562, + "loss": 0.1034, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.232973098754883, + "rewards/grad_term": 0.002168377162888646, + "rewards/margins": 9.705711364746094, + "rewards/rejected": -15.938684463500977, + "step": 389 + }, + { + "epoch": 0.8355650776647028, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.772490840913882, + "learning_rate": 4.5357995226730306e-07, + "logits/chosen": 0.7710881233215332, + "logits/rejected": 0.52399080991745, + "logps/accuracies": 0.75, + "logps/chosen": -394.009033203125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -299.70672607421875, + "logps/ref_rejected": -240.57080078125, + "logps/rejected": -437.76953125, + "loss": 0.0987, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.715115070343018, + "rewards/grad_term": 0.0006557138403877616, + "rewards/margins": 5.14482307434082, + "rewards/rejected": -9.85993766784668, + "step": 390 + }, + { + "epoch": 0.8377075522228173, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.038299673221383, + "learning_rate": 4.5274463007159904e-07, + "logits/chosen": 0.825120747089386, + "logits/rejected": 0.6199119687080383, + "logps/accuracies": 0.25, + "logps/chosen": -434.44329833984375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -317.6658630371094, + "logps/ref_rejected": -241.26890563964844, + "logps/rejected": -444.71636962890625, + "loss": 0.1355, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.838871002197266, + "rewards/grad_term": 0.001953305210918188, + "rewards/margins": 4.333502769470215, + "rewards/rejected": -10.172372817993164, + "step": 391 + }, + { + "epoch": 0.839850026780932, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.157799585200374, + "learning_rate": 4.5190930787589496e-07, + "logits/chosen": 0.7274588346481323, + "logits/rejected": 0.658560037612915, + "logps/accuracies": 1.0, + "logps/chosen": -377.17510986328125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -304.23382568359375, + "logps/ref_rejected": -287.71441650390625, + "logps/rejected": -464.94073486328125, + "loss": 0.1512, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6470634937286377, + "rewards/grad_term": 0.0007109759608283639, + "rewards/margins": 5.214252948760986, + "rewards/rejected": -8.861316680908203, + "step": 392 + }, + { + "epoch": 0.8419925013390466, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.274433302537343, + "learning_rate": 4.510739856801909e-07, + "logits/chosen": 0.5300111770629883, + "logits/rejected": 0.4925421476364136, + "logps/accuracies": 0.75, + "logps/chosen": -535.7554321289062, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -405.7460021972656, + "logps/ref_rejected": -341.57281494140625, + "logps/rejected": -637.828369140625, + "loss": 0.1401, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.500471115112305, + "rewards/grad_term": 0.0015781800029799342, + "rewards/margins": 8.312305450439453, + "rewards/rejected": -14.812776565551758, + "step": 393 + }, + { + "epoch": 0.8441349758971612, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.636233486579053, + "learning_rate": 4.502386634844868e-07, + "logits/chosen": 0.4728167653083801, + "logits/rejected": 0.7265413999557495, + "logps/accuracies": 1.0, + "logps/chosen": -329.78082275390625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -262.4573669433594, + "logps/ref_rejected": -250.28829956054688, + "logps/rejected": -432.49798583984375, + "loss": 0.1203, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3661720752716064, + "rewards/grad_term": 0.002081993967294693, + "rewards/margins": 5.744311332702637, + "rewards/rejected": -9.110483169555664, + "step": 394 + }, + { + "epoch": 0.8462774504552758, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.191471059477506, + "learning_rate": 4.494033412887828e-07, + "logits/chosen": 0.5292783975601196, + "logits/rejected": 0.2622219920158386, + "logps/accuracies": 0.75, + "logps/chosen": -296.090576171875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -242.09730529785156, + "logps/ref_rejected": -185.52487182617188, + "logps/rejected": -350.0084228515625, + "loss": 0.1594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6996638774871826, + "rewards/grad_term": 0.004369885195046663, + "rewards/margins": 5.5245137214660645, + "rewards/rejected": -8.224177360534668, + "step": 395 + }, + { + "epoch": 0.8484199250133905, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.697422074983217, + "learning_rate": 4.4856801909307877e-07, + "logits/chosen": 0.5663112998008728, + "logits/rejected": 0.41860711574554443, + "logps/accuracies": 0.75, + "logps/chosen": -370.9486083984375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -322.1562805175781, + "logps/ref_rejected": -241.59457397460938, + "logps/rejected": -408.6671142578125, + "loss": 0.1077, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.439617156982422, + "rewards/grad_term": 0.0069171166978776455, + "rewards/margins": 5.914010047912598, + "rewards/rejected": -8.353628158569336, + "step": 396 + }, + { + "epoch": 0.850562399571505, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 13.015020922003387, + "learning_rate": 4.4773269689737464e-07, + "logits/chosen": 0.7214323282241821, + "logits/rejected": 0.633575975894928, + "logps/accuracies": 1.0, + "logps/chosen": -596.1569213867188, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -488.6678161621094, + "logps/ref_rejected": -393.7802734375, + "logps/rejected": -751.7131958007812, + "loss": 0.2131, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.374455451965332, + "rewards/grad_term": 6.355544996949902e-07, + "rewards/margins": 12.522188186645508, + "rewards/rejected": -17.896644592285156, + "step": 397 + }, + { + "epoch": 0.8527048741296197, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 4.892748358005199, + "learning_rate": 4.468973747016706e-07, + "logits/chosen": 0.7805695533752441, + "logits/rejected": 0.6413770914077759, + "logps/accuracies": 1.0, + "logps/chosen": -511.5725402832031, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -438.80120849609375, + "logps/ref_rejected": -409.7392578125, + "logps/rejected": -677.0660400390625, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6385655403137207, + "rewards/grad_term": 2.607361057016533e-05, + "rewards/margins": 9.727773666381836, + "rewards/rejected": -13.366338729858398, + "step": 398 + }, + { + "epoch": 0.8548473486877344, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 12.448276262372078, + "learning_rate": 4.460620525059666e-07, + "logits/chosen": 0.7015663385391235, + "logits/rejected": 0.5971590280532837, + "logps/accuracies": 1.0, + "logps/chosen": -475.1459655761719, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -387.7801818847656, + "logps/ref_rejected": -357.5787353515625, + "logps/rejected": -657.3110961914062, + "loss": 0.1691, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.368288993835449, + "rewards/grad_term": 0.0006424501189030707, + "rewards/margins": 10.618330955505371, + "rewards/rejected": -14.98661994934082, + "step": 399 + }, + { + "epoch": 0.8569898232458489, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.52996684599927, + "learning_rate": 4.452267303102625e-07, + "logits/chosen": 0.5823007822036743, + "logits/rejected": 0.5328406095504761, + "logps/accuracies": 1.0, + "logps/chosen": -302.14312744140625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -234.80886840820312, + "logps/ref_rejected": -250.11843872070312, + "logps/rejected": -457.92333984375, + "loss": 0.1442, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3667118549346924, + "rewards/grad_term": 0.0009050341905094683, + "rewards/margins": 7.023532867431641, + "rewards/rejected": -10.390244483947754, + "step": 400 + }, + { + "epoch": 0.8591322978039636, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.537952351751538, + "learning_rate": 4.4439140811455845e-07, + "logits/chosen": 0.929060161113739, + "logits/rejected": 0.7344148755073547, + "logps/accuracies": 0.5, + "logps/chosen": -484.63690185546875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -397.1466064453125, + "logps/ref_rejected": -278.0077819824219, + "logps/rejected": -507.8617858886719, + "loss": 0.1082, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.374514102935791, + "rewards/grad_term": 0.0002766952384263277, + "rewards/margins": 7.11818790435791, + "rewards/rejected": -11.492701530456543, + "step": 401 + }, + { + "epoch": 0.8612747723620782, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.644686107983424, + "learning_rate": 4.435560859188544e-07, + "logits/chosen": 0.771916389465332, + "logits/rejected": 0.6290404200553894, + "logps/accuracies": 0.75, + "logps/chosen": -315.5869140625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -250.51466369628906, + "logps/ref_rejected": -214.51107788085938, + "logps/rejected": -375.64447021484375, + "loss": 0.1285, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.253613233566284, + "rewards/grad_term": 0.0008995306561701, + "rewards/margins": 4.803055763244629, + "rewards/rejected": -8.056669235229492, + "step": 402 + }, + { + "epoch": 0.8634172469201928, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.733257087221838, + "learning_rate": 4.4272076372315035e-07, + "logits/chosen": 0.7453795075416565, + "logits/rejected": 0.6315554976463318, + "logps/accuracies": 1.0, + "logps/chosen": -232.2364501953125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -196.71493530273438, + "logps/ref_rejected": -155.52903747558594, + "logps/rejected": -270.3780822753906, + "loss": 0.0766, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7760753631591797, + "rewards/grad_term": 0.002003555418923497, + "rewards/margins": 3.966376543045044, + "rewards/rejected": -5.7424516677856445, + "step": 403 + }, + { + "epoch": 0.8655597214783074, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.60971738924586, + "learning_rate": 4.418854415274462e-07, + "logits/chosen": 0.7506588697433472, + "logits/rejected": 0.6142828464508057, + "logps/accuracies": 1.0, + "logps/chosen": -397.05242919921875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -317.8343505859375, + "logps/ref_rejected": -303.29852294921875, + "logps/rejected": -455.8990783691406, + "loss": 0.1456, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9609031677246094, + "rewards/grad_term": 0.0018456345424056053, + "rewards/margins": 3.6691231727600098, + "rewards/rejected": -7.630025863647461, + "step": 404 + }, + { + "epoch": 0.8677021960364221, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.056320190778132, + "learning_rate": 4.410501193317422e-07, + "logits/chosen": 0.5372669696807861, + "logits/rejected": 0.5302236676216125, + "logps/accuracies": 0.75, + "logps/chosen": -419.73486328125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -331.661865234375, + "logps/ref_rejected": -297.73260498046875, + "logps/rejected": -506.001220703125, + "loss": 0.1351, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.403650283813477, + "rewards/grad_term": 0.0054626427590847015, + "rewards/margins": 6.0097808837890625, + "rewards/rejected": -10.413432121276855, + "step": 405 + }, + { + "epoch": 0.8698446705945367, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.174234002962357, + "learning_rate": 4.402147971360382e-07, + "logits/chosen": 0.7001669406890869, + "logits/rejected": 0.39515483379364014, + "logps/accuracies": 0.75, + "logps/chosen": -257.07257080078125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -186.89627075195312, + "logps/ref_rejected": -187.3160400390625, + "logps/rejected": -366.7100524902344, + "loss": 0.1342, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.508815288543701, + "rewards/grad_term": 0.0030996918212622404, + "rewards/margins": 5.4608845710754395, + "rewards/rejected": -8.96969985961914, + "step": 406 + }, + { + "epoch": 0.8719871451526513, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 5.813541027309185, + "learning_rate": 4.3937947494033416e-07, + "logits/chosen": 0.7106292247772217, + "logits/rejected": 0.17329132556915283, + "logps/accuracies": 0.5, + "logps/chosen": -406.671142578125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -309.2118835449219, + "logps/ref_rejected": -226.40023803710938, + "logps/rejected": -371.4858093261719, + "loss": 0.1001, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.872962474822998, + "rewards/grad_term": 0.007784712128341198, + "rewards/margins": 2.381316900253296, + "rewards/rejected": -7.254279136657715, + "step": 407 + }, + { + "epoch": 0.874129619710766, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 5.142665145862327, + "learning_rate": 4.3854415274463003e-07, + "logits/chosen": 0.6431873440742493, + "logits/rejected": 0.5117607116699219, + "logps/accuracies": 0.5, + "logps/chosen": -388.5247802734375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -360.99249267578125, + "logps/ref_rejected": -295.61029052734375, + "logps/rejected": -396.9288330078125, + "loss": 0.0839, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3766143321990967, + "rewards/grad_term": 0.008713779971003532, + "rewards/margins": 3.689311981201172, + "rewards/rejected": -5.0659260749816895, + "step": 408 + }, + { + "epoch": 0.8762720942688805, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.302023436651439, + "learning_rate": 4.37708830548926e-07, + "logits/chosen": 0.7255518436431885, + "logits/rejected": 0.6550572514533997, + "logps/accuracies": 1.0, + "logps/chosen": -503.82879638671875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -395.3870849609375, + "logps/ref_rejected": -354.33709716796875, + "logps/rejected": -625.1456298828125, + "loss": 0.1401, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.422085285186768, + "rewards/grad_term": 0.0003362699644640088, + "rewards/margins": 8.118339538574219, + "rewards/rejected": -13.540424346923828, + "step": 409 + }, + { + "epoch": 0.8784145688269952, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.292464980810114, + "learning_rate": 4.3687350835322194e-07, + "logits/chosen": 0.6224971413612366, + "logits/rejected": 0.7813397645950317, + "logps/accuracies": 1.0, + "logps/chosen": -246.21495056152344, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -178.90476989746094, + "logps/ref_rejected": -192.27903747558594, + "logps/rejected": -309.7457275390625, + "loss": 0.0869, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.365509033203125, + "rewards/grad_term": 0.005968024022877216, + "rewards/margins": 2.5078253746032715, + "rewards/rejected": -5.8733344078063965, + "step": 410 + }, + { + "epoch": 0.8805570433851098, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.994486636309479, + "learning_rate": 4.360381861575179e-07, + "logits/chosen": 0.5689117908477783, + "logits/rejected": 0.4422415494918823, + "logps/accuracies": 0.75, + "logps/chosen": -327.41168212890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -288.2694091796875, + "logps/ref_rejected": -235.6809844970703, + "logps/rejected": -354.2411804199219, + "loss": 0.1481, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9571146965026855, + "rewards/grad_term": 0.01035099197179079, + "rewards/margins": 3.9708945751190186, + "rewards/rejected": -5.928009033203125, + "step": 411 + }, + { + "epoch": 0.8826995179432244, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.858727469447434, + "learning_rate": 4.352028639618138e-07, + "logits/chosen": 0.758124589920044, + "logits/rejected": 0.7636011242866516, + "logps/accuracies": 1.0, + "logps/chosen": -433.85906982421875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -363.50360107421875, + "logps/ref_rejected": -367.9163513183594, + "logps/rejected": -580.310791015625, + "loss": 0.0963, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5177736282348633, + "rewards/grad_term": 0.0005504750879481435, + "rewards/margins": 7.101946830749512, + "rewards/rejected": -10.619720458984375, + "step": 412 + }, + { + "epoch": 0.884841992501339, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.354863325065816, + "learning_rate": 4.3436754176610977e-07, + "logits/chosen": 0.6459320783615112, + "logits/rejected": 0.5770057439804077, + "logps/accuracies": 1.0, + "logps/chosen": -574.0004272460938, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -458.69244384765625, + "logps/ref_rejected": -463.82720947265625, + "logps/rejected": -754.6375122070312, + "loss": 0.0889, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7653985023498535, + "rewards/grad_term": 0.00017192790983244777, + "rewards/margins": 8.775115966796875, + "rewards/rejected": -14.540514945983887, + "step": 413 + }, + { + "epoch": 0.8869844670594537, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.641937024128401, + "learning_rate": 4.3353221957040575e-07, + "logits/chosen": 0.6838860511779785, + "logits/rejected": 0.5406701564788818, + "logps/accuracies": 0.75, + "logps/chosen": -305.5446472167969, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -251.28353881835938, + "logps/ref_rejected": -207.93319702148438, + "logps/rejected": -345.9137268066406, + "loss": 0.1227, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7130556106567383, + "rewards/grad_term": 0.000992645276710391, + "rewards/margins": 4.185970306396484, + "rewards/rejected": -6.899025917053223, + "step": 414 + }, + { + "epoch": 0.8891269416175683, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.853463086288327, + "learning_rate": 4.326968973747016e-07, + "logits/chosen": 0.6326106786727905, + "logits/rejected": 0.5018441677093506, + "logps/accuracies": 1.0, + "logps/chosen": -350.4998779296875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -291.879150390625, + "logps/ref_rejected": -234.96595764160156, + "logps/rejected": -414.79351806640625, + "loss": 0.1072, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.931034564971924, + "rewards/grad_term": 0.007088256999850273, + "rewards/margins": 6.060344696044922, + "rewards/rejected": -8.991378784179688, + "step": 415 + }, + { + "epoch": 0.8912694161756829, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 10.353678579870046, + "learning_rate": 4.318615751789976e-07, + "logits/chosen": 0.7541974186897278, + "logits/rejected": 0.6350035667419434, + "logps/accuracies": 1.0, + "logps/chosen": -488.8894348144531, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -403.95233154296875, + "logps/ref_rejected": -311.3324890136719, + "logps/rejected": -574.9945678710938, + "loss": 0.1862, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.246854782104492, + "rewards/grad_term": 0.0008102880092337728, + "rewards/margins": 8.936247825622559, + "rewards/rejected": -13.18310260772705, + "step": 416 + }, + { + "epoch": 0.8934118907337976, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 10.150177255687934, + "learning_rate": 4.310262529832935e-07, + "logits/chosen": 0.7215501666069031, + "logits/rejected": 0.7029194235801697, + "logps/accuracies": 1.0, + "logps/chosen": -449.3460693359375, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -361.64422607421875, + "logps/ref_rejected": -362.3080139160156, + "logps/rejected": -613.3135375976562, + "loss": 0.1859, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.385092258453369, + "rewards/grad_term": 0.0001239069679286331, + "rewards/margins": 8.165184020996094, + "rewards/rejected": -12.550276756286621, + "step": 417 + }, + { + "epoch": 0.8955543652919121, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.057751158218558, + "learning_rate": 4.301909307875895e-07, + "logits/chosen": 0.49071842432022095, + "logits/rejected": 0.4239951968193054, + "logps/accuracies": 0.25, + "logps/chosen": -355.53564453125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -259.5860900878906, + "logps/ref_rejected": -194.45254516601562, + "logps/rejected": -348.5555725097656, + "loss": 0.1136, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.797478675842285, + "rewards/grad_term": 0.01176033727824688, + "rewards/margins": 2.9076716899871826, + "rewards/rejected": -7.705150127410889, + "step": 418 + }, + { + "epoch": 0.8976968398500268, + "flips/correct->correct": 0.75, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.128643742839769, + "learning_rate": 4.2935560859188537e-07, + "logits/chosen": 0.5078434944152832, + "logits/rejected": 0.3587588369846344, + "logps/accuracies": 0.75, + "logps/chosen": -334.2418518066406, + "logps/ref_accuracies": 0.75, + "logps/ref_chosen": -276.947998046875, + "logps/ref_rejected": -290.69976806640625, + "logps/rejected": -465.8039245605469, + "loss": 0.1086, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.864691972732544, + "rewards/grad_term": 0.001770102884620428, + "rewards/margins": 5.890515327453613, + "rewards/rejected": -8.755208015441895, + "step": 419 + }, + { + "epoch": 0.8998393144081414, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.460646869207045, + "learning_rate": 4.2852028639618135e-07, + "logits/chosen": 0.6151570081710815, + "logits/rejected": 0.5713327527046204, + "logps/accuracies": 0.75, + "logps/chosen": -357.33209228515625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -289.7698974609375, + "logps/ref_rejected": -239.04769897460938, + "logps/rejected": -366.888427734375, + "loss": 0.1259, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.378108501434326, + "rewards/grad_term": 0.004311670083552599, + "rewards/margins": 3.013929843902588, + "rewards/rejected": -6.392038345336914, + "step": 420 + }, + { + "epoch": 0.901981788966256, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.054391516220369, + "learning_rate": 4.2768496420047733e-07, + "logits/chosen": 0.6619023084640503, + "logits/rejected": 0.6487151980400085, + "logps/accuracies": 0.75, + "logps/chosen": -419.2369689941406, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -322.2930603027344, + "logps/ref_rejected": -283.63616943359375, + "logps/rejected": -483.73504638671875, + "loss": 0.1269, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.847194671630859, + "rewards/grad_term": 0.0035053249448537827, + "rewards/margins": 5.157749652862549, + "rewards/rejected": -10.004944801330566, + "step": 421 + }, + { + "epoch": 0.9041242635243707, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.627822881128733, + "learning_rate": 4.268496420047732e-07, + "logits/chosen": 0.7480889558792114, + "logits/rejected": 0.6404839754104614, + "logps/accuracies": 0.75, + "logps/chosen": -348.94525146484375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -275.64013671875, + "logps/ref_rejected": -239.08200073242188, + "logps/rejected": -421.51513671875, + "loss": 0.0891, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.665255546569824, + "rewards/grad_term": 0.008442794904112816, + "rewards/margins": 5.456399917602539, + "rewards/rejected": -9.121655464172363, + "step": 422 + }, + { + "epoch": 0.9062667380824853, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 7.335769347864956, + "learning_rate": 4.260143198090692e-07, + "logits/chosen": 0.6401829123497009, + "logits/rejected": 0.7815831899642944, + "logps/accuracies": 0.5, + "logps/chosen": -379.6376647949219, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -265.36968994140625, + "logps/ref_rejected": -255.16949462890625, + "logps/rejected": -414.13201904296875, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.713399410247803, + "rewards/grad_term": 0.007248271256685257, + "rewards/margins": 2.2347278594970703, + "rewards/rejected": -7.948126792907715, + "step": 423 + }, + { + "epoch": 0.9084092126405999, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 9.129137286638404, + "learning_rate": 4.2517899761336516e-07, + "logits/chosen": 0.6732430458068848, + "logits/rejected": 0.5181256532669067, + "logps/accuracies": 0.25, + "logps/chosen": -605.279541015625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -473.38079833984375, + "logps/ref_rejected": -406.4195861816406, + "logps/rejected": -641.9754638671875, + "loss": 0.1132, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.594937801361084, + "rewards/grad_term": 0.0023317153099924326, + "rewards/margins": 5.182857513427734, + "rewards/rejected": -11.777795791625977, + "step": 424 + }, + { + "epoch": 0.9105516871987145, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 14.734996244623405, + "learning_rate": 4.243436754176611e-07, + "logits/chosen": 0.48378825187683105, + "logits/rejected": 0.3249673843383789, + "logps/accuracies": 0.75, + "logps/chosen": -461.893310546875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -390.8454895019531, + "logps/ref_rejected": -320.3680419921875, + "logps/rejected": -476.0527038574219, + "loss": 0.1884, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.552391290664673, + "rewards/grad_term": 0.004285210277885199, + "rewards/margins": 4.231842041015625, + "rewards/rejected": -7.784233093261719, + "step": 425 + }, + { + "epoch": 0.9126941617568292, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.389398153896957, + "learning_rate": 4.23508353221957e-07, + "logits/chosen": 0.7646510004997253, + "logits/rejected": 0.669806957244873, + "logps/accuracies": 1.0, + "logps/chosen": -448.4755859375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -365.5307922363281, + "logps/ref_rejected": -346.1925354003906, + "logps/rejected": -590.5164184570312, + "loss": 0.1623, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.147238254547119, + "rewards/grad_term": 0.00027649421826936305, + "rewards/margins": 8.068957328796387, + "rewards/rejected": -12.216196060180664, + "step": 426 + }, + { + "epoch": 0.9148366363149437, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.370091633753265, + "learning_rate": 4.2267303102625293e-07, + "logits/chosen": 0.7482293844223022, + "logits/rejected": 0.6367194652557373, + "logps/accuracies": 1.0, + "logps/chosen": -521.5593872070312, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -416.5623779296875, + "logps/ref_rejected": -381.7167663574219, + "logps/rejected": -664.441162109375, + "loss": 0.1458, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.249850273132324, + "rewards/grad_term": 0.0006000981666147709, + "rewards/margins": 8.886370658874512, + "rewards/rejected": -14.136220932006836, + "step": 427 + }, + { + "epoch": 0.9169791108730584, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.038400084548858, + "learning_rate": 4.218377088305489e-07, + "logits/chosen": 0.8124886751174927, + "logits/rejected": 0.47504645586013794, + "logps/accuracies": 0.25, + "logps/chosen": -516.41015625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -438.93255615234375, + "logps/ref_rejected": -273.8423156738281, + "logps/rejected": -499.37445068359375, + "loss": 0.145, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8738789558410645, + "rewards/grad_term": 0.0022317732218652964, + "rewards/margins": 7.402729034423828, + "rewards/rejected": -11.276607513427734, + "step": 428 + }, + { + "epoch": 0.919121585431173, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.572014319270594, + "learning_rate": 4.210023866348449e-07, + "logits/chosen": 0.6831299066543579, + "logits/rejected": 0.6394398212432861, + "logps/accuracies": 1.0, + "logps/chosen": -486.3926086425781, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -410.18096923828125, + "logps/ref_rejected": -368.5516357421875, + "logps/rejected": -578.9801635742188, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8105835914611816, + "rewards/grad_term": 0.00010745471809059381, + "rewards/margins": 6.710842609405518, + "rewards/rejected": -10.521427154541016, + "step": 429 + }, + { + "epoch": 0.9212640599892876, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.689438797952636, + "learning_rate": 4.2016706443914076e-07, + "logits/chosen": 0.7243601083755493, + "logits/rejected": 0.6393451690673828, + "logps/accuracies": 1.0, + "logps/chosen": -562.4426879882812, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -483.3052978515625, + "logps/ref_rejected": -426.74920654296875, + "logps/rejected": -627.9085693359375, + "loss": 0.186, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9568703174591064, + "rewards/grad_term": 0.000488718505948782, + "rewards/margins": 6.101097106933594, + "rewards/rejected": -10.057967185974121, + "step": 430 + }, + { + "epoch": 0.9234065345474023, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 5.8721575355892615, + "learning_rate": 4.1933174224343674e-07, + "logits/chosen": 0.7541458010673523, + "logits/rejected": 0.6002135276794434, + "logps/accuracies": 1.0, + "logps/chosen": -398.8825378417969, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -333.1788330078125, + "logps/ref_rejected": -287.464599609375, + "logps/rejected": -493.71929931640625, + "loss": 0.0908, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.285184860229492, + "rewards/grad_term": 0.0002467437880113721, + "rewards/margins": 7.027547359466553, + "rewards/rejected": -10.312731742858887, + "step": 431 + }, + { + "epoch": 0.9255490091055169, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 5.73075697487602, + "learning_rate": 4.184964200477327e-07, + "logits/chosen": 0.5785739421844482, + "logits/rejected": 0.6086280345916748, + "logps/accuracies": 1.0, + "logps/chosen": -356.680419921875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -259.8116149902344, + "logps/ref_rejected": -233.67578125, + "logps/rejected": -508.74652099609375, + "loss": 0.0957, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.843441009521484, + "rewards/grad_term": 0.003247784450650215, + "rewards/margins": 8.91009521484375, + "rewards/rejected": -13.75353717803955, + "step": 432 + }, + { + "epoch": 0.9276914836636315, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.580143703823092, + "learning_rate": 4.176610978520286e-07, + "logits/chosen": 0.5819364190101624, + "logits/rejected": 0.6432737708091736, + "logps/accuracies": 0.75, + "logps/chosen": -228.54739379882812, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -149.87022399902344, + "logps/ref_rejected": -146.04185485839844, + "logps/rejected": -256.3289489746094, + "loss": 0.1975, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.933859348297119, + "rewards/grad_term": 0.014459378086030483, + "rewards/margins": 1.580496072769165, + "rewards/rejected": -5.514355659484863, + "step": 433 + }, + { + "epoch": 0.9298339582217461, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.738131094632276, + "learning_rate": 4.1682577565632457e-07, + "logits/chosen": 0.6606433391571045, + "logits/rejected": 0.7345594167709351, + "logps/accuracies": 0.75, + "logps/chosen": -257.4786376953125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -191.63543701171875, + "logps/ref_rejected": -178.1594696044922, + "logps/rejected": -321.8557434082031, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2921600341796875, + "rewards/grad_term": 0.003366851480677724, + "rewards/margins": 3.8926539421081543, + "rewards/rejected": -7.184814453125, + "step": 434 + }, + { + "epoch": 0.9319764327798608, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.358516114828291, + "learning_rate": 4.159904534606205e-07, + "logits/chosen": 0.7316790819168091, + "logits/rejected": 0.4060133099555969, + "logps/accuracies": 0.75, + "logps/chosen": -300.3427734375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -247.51315307617188, + "logps/ref_rejected": -180.51429748535156, + "logps/rejected": -308.9923400878906, + "loss": 0.1218, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6414809226989746, + "rewards/grad_term": 0.010488856583833694, + "rewards/margins": 3.7824203968048096, + "rewards/rejected": -6.423901081085205, + "step": 435 + }, + { + "epoch": 0.9341189073379753, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.691996285863658, + "learning_rate": 4.151551312649165e-07, + "logits/chosen": 0.7785161733627319, + "logits/rejected": 0.6878386735916138, + "logps/accuracies": 0.75, + "logps/chosen": -333.62005615234375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -292.43572998046875, + "logps/ref_rejected": -241.63511657714844, + "logps/rejected": -372.54913330078125, + "loss": 0.1926, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.059215545654297, + "rewards/grad_term": 0.0018129110103473067, + "rewards/margins": 4.486485481262207, + "rewards/rejected": -6.545701026916504, + "step": 436 + }, + { + "epoch": 0.93626138189609, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 6.403058396602235, + "learning_rate": 4.1431980906921235e-07, + "logits/chosen": 0.6294098496437073, + "logits/rejected": 0.4260585904121399, + "logps/accuracies": 1.0, + "logps/chosen": -306.869384765625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -250.37619018554688, + "logps/ref_rejected": -255.60894775390625, + "logps/rejected": -436.3756103515625, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.824659824371338, + "rewards/grad_term": 0.0016512804431840777, + "rewards/margins": 6.2136712074279785, + "rewards/rejected": -9.038331031799316, + "step": 437 + }, + { + "epoch": 0.9384038564542047, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 10.33335056458741, + "learning_rate": 4.1348448687350833e-07, + "logits/chosen": 0.5839954018592834, + "logits/rejected": 0.5881964564323425, + "logps/accuracies": 0.75, + "logps/chosen": -430.4659118652344, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -370.88543701171875, + "logps/ref_rejected": -322.64886474609375, + "logps/rejected": -474.2005615234375, + "loss": 0.1119, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9790241718292236, + "rewards/grad_term": 0.003231536131352186, + "rewards/margins": 4.598560810089111, + "rewards/rejected": -7.577584743499756, + "step": 438 + }, + { + "epoch": 0.9405463310123192, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.524534921674278, + "learning_rate": 4.126491646778043e-07, + "logits/chosen": 0.8089597225189209, + "logits/rejected": 0.6896387338638306, + "logps/accuracies": 1.0, + "logps/chosen": -643.2845458984375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -516.3448486328125, + "logps/ref_rejected": -450.1552734375, + "logps/rejected": -712.5276489257812, + "loss": 0.1106, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.346982955932617, + "rewards/grad_term": 0.0003070410166401416, + "rewards/margins": 6.771636486053467, + "rewards/rejected": -13.118619918823242, + "step": 439 + }, + { + "epoch": 0.9426888055704339, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.35386030961725, + "learning_rate": 4.118138424821002e-07, + "logits/chosen": 0.4550638198852539, + "logits/rejected": 0.11324800550937653, + "logps/accuracies": 0.75, + "logps/chosen": -256.8120422363281, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -193.059326171875, + "logps/ref_rejected": -195.65859985351562, + "logps/rejected": -337.16546630859375, + "loss": 0.1343, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1876347064971924, + "rewards/grad_term": 0.007398877292871475, + "rewards/margins": 3.8877077102661133, + "rewards/rejected": -7.075342655181885, + "step": 440 + }, + { + "epoch": 0.9448312801285484, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 11.730894891356659, + "learning_rate": 4.1097852028639616e-07, + "logits/chosen": 0.7687960267066956, + "logits/rejected": 0.6161290407180786, + "logps/accuracies": 1.0, + "logps/chosen": -429.12823486328125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -342.45672607421875, + "logps/ref_rejected": -322.35748291015625, + "logps/rejected": -519.225341796875, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.3335771560668945, + "rewards/grad_term": 0.0011160215362906456, + "rewards/margins": 5.5098161697387695, + "rewards/rejected": -9.843393325805664, + "step": 441 + }, + { + "epoch": 0.9469737546866631, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.1151510120037, + "learning_rate": 4.1014319809069213e-07, + "logits/chosen": 0.5211978554725647, + "logits/rejected": 0.4055239260196686, + "logps/accuracies": 0.75, + "logps/chosen": -252.66140747070312, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -219.652099609375, + "logps/ref_rejected": -156.125244140625, + "logps/rejected": -275.7467956542969, + "loss": 0.1075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6504652500152588, + "rewards/grad_term": 0.005798153579235077, + "rewards/margins": 4.330613136291504, + "rewards/rejected": -5.981078624725342, + "step": 442 + }, + { + "epoch": 0.9491162292447777, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.643922368612464, + "learning_rate": 4.0930787589498806e-07, + "logits/chosen": 0.7781935930252075, + "logits/rejected": 0.3824615180492401, + "logps/accuracies": 0.75, + "logps/chosen": -381.4145202636719, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -311.0098571777344, + "logps/ref_rejected": -217.48388671875, + "logps/rejected": -411.19970703125, + "loss": 0.0984, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.520233631134033, + "rewards/grad_term": 0.00042740529170259833, + "rewards/margins": 6.1655592918396, + "rewards/rejected": -9.685792922973633, + "step": 443 + }, + { + "epoch": 0.9512587038028923, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.91825980199573, + "learning_rate": 4.08472553699284e-07, + "logits/chosen": 0.6866825819015503, + "logits/rejected": 0.6368831992149353, + "logps/accuracies": 0.5, + "logps/chosen": -438.6324462890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -397.3633728027344, + "logps/ref_rejected": -329.361572265625, + "logps/rejected": -486.6770324707031, + "loss": 0.133, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.06345272064209, + "rewards/grad_term": 0.0025374325923621655, + "rewards/margins": 5.80232048034668, + "rewards/rejected": -7.865772724151611, + "step": 444 + }, + { + "epoch": 0.953401178361007, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 5.821241307927092, + "learning_rate": 4.076372315035799e-07, + "logits/chosen": 0.8077370524406433, + "logits/rejected": 0.5723898410797119, + "logps/accuracies": 0.75, + "logps/chosen": -629.1956787109375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -488.9486389160156, + "logps/ref_rejected": -358.0147705078125, + "logps/rejected": -686.6176147460938, + "loss": 0.1339, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.012351036071777, + "rewards/grad_term": 8.414402145717759e-06, + "rewards/margins": 9.417790412902832, + "rewards/rejected": -16.430139541625977, + "step": 445 + }, + { + "epoch": 0.9555436529191216, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.65211568963401, + "learning_rate": 4.068019093078759e-07, + "logits/chosen": 0.7759550213813782, + "logits/rejected": 0.7140344977378845, + "logps/accuracies": 1.0, + "logps/chosen": -505.0233459472656, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -406.8385009765625, + "logps/ref_rejected": -350.51312255859375, + "logps/rejected": -550.5768432617188, + "loss": 0.1625, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.909243106842041, + "rewards/grad_term": 0.0008991943905130029, + "rewards/margins": 5.0939459800720215, + "rewards/rejected": -10.003189086914062, + "step": 446 + }, + { + "epoch": 0.9576861274772362, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.561770484711239, + "learning_rate": 4.0596658711217187e-07, + "logits/chosen": 0.761772632598877, + "logits/rejected": 0.5069707632064819, + "logps/accuracies": 0.75, + "logps/chosen": -414.2353210449219, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -330.43389892578125, + "logps/ref_rejected": -286.1955261230469, + "logps/rejected": -513.2139892578125, + "loss": 0.1028, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.190073490142822, + "rewards/grad_term": 0.0025082218926399946, + "rewards/margins": 7.160850524902344, + "rewards/rejected": -11.350923538208008, + "step": 447 + }, + { + "epoch": 0.9598286020353508, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 11.50375604154419, + "learning_rate": 4.0513126491646774e-07, + "logits/chosen": 0.7668182849884033, + "logits/rejected": 0.6712717413902283, + "logps/accuracies": 0.75, + "logps/chosen": -451.71124267578125, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -316.8441467285156, + "logps/ref_rejected": -260.1050720214844, + "logps/rejected": -519.7904052734375, + "loss": 0.1387, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.743355751037598, + "rewards/grad_term": 0.0019724913872778416, + "rewards/margins": 6.240912437438965, + "rewards/rejected": -12.984268188476562, + "step": 448 + }, + { + "epoch": 0.9598286020353508, + "eval_flips/correct->correct": 0.14000000059604645, + "eval_flips/correct->incorrect": 0.019999999552965164, + "eval_flips/incorrect->correct": 0.5799999833106995, + "eval_flips/incorrect->incorrect": 0.25999999046325684, + "eval_logits/chosen": 0.6544824838638306, + "eval_logits/rejected": 0.5577883720397949, + "eval_logps/accuracies": 0.7200000286102295, + "eval_logps/chosen": -392.440185546875, + "eval_logps/ref_accuracies": 0.1599999964237213, + "eval_logps/ref_chosen": -323.51568603515625, + "eval_logps/ref_rejected": -258.70098876953125, + "eval_logps/rejected": -422.9569091796875, + "eval_loss": 0.13714653253555298, + "eval_rewards/accuracies": 0.9200000166893005, + "eval_rewards/chosen": -3.446227788925171, + "eval_rewards/grad_term": 0.006565955467522144, + "eval_rewards/margins": 4.766568660736084, + "eval_rewards/rejected": -8.212796211242676, + "eval_runtime": 373.2435, + "eval_samples_per_second": 4.233, + "eval_steps_per_second": 0.134, + "step": 448 + }, + { + "epoch": 0.9619710765934655, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.75, + "grad_norm": 7.143313955724084, + "learning_rate": 4.042959427207637e-07, + "logits/chosen": 0.6099262833595276, + "logits/rejected": 0.41160258650779724, + "logps/accuracies": 0.25, + "logps/chosen": -495.52197265625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -402.29571533203125, + "logps/ref_rejected": -238.91366577148438, + "logps/rejected": -411.18255615234375, + "loss": 0.0955, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.661313056945801, + "rewards/grad_term": 0.008577575907111168, + "rewards/margins": 3.9521327018737793, + "rewards/rejected": -8.613445281982422, + "step": 449 + }, + { + "epoch": 0.96411355115158, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.099073379071028, + "learning_rate": 4.0346062052505964e-07, + "logits/chosen": 0.7317103147506714, + "logits/rejected": 0.5711554884910583, + "logps/accuracies": 0.75, + "logps/chosen": -509.8515319824219, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -422.57989501953125, + "logps/ref_rejected": -335.5193176269531, + "logps/rejected": -622.3995971679688, + "loss": 0.1295, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.363581657409668, + "rewards/grad_term": 4.9737202061805874e-05, + "rewards/margins": 9.980432510375977, + "rewards/rejected": -14.344014167785645, + "step": 450 + }, + { + "epoch": 0.9662560257096947, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.501700427807891, + "learning_rate": 4.0262529832935557e-07, + "logits/chosen": 0.7478474974632263, + "logits/rejected": 0.6434618234634399, + "logps/accuracies": 1.0, + "logps/chosen": -299.7742004394531, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -228.0120849609375, + "logps/ref_rejected": -183.03488159179688, + "logps/rejected": -342.20123291015625, + "loss": 0.1458, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.588106632232666, + "rewards/grad_term": 0.003840662771835923, + "rewards/margins": 4.370211124420166, + "rewards/rejected": -7.958317279815674, + "step": 451 + }, + { + "epoch": 0.9683985002678093, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.391268423681992, + "learning_rate": 4.017899761336515e-07, + "logits/chosen": 0.4842509627342224, + "logits/rejected": 0.3438522517681122, + "logps/accuracies": 0.5, + "logps/chosen": -378.8311767578125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -306.450439453125, + "logps/ref_rejected": -239.88665771484375, + "logps/rejected": -436.4774475097656, + "loss": 0.1131, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.61903715133667, + "rewards/grad_term": 0.0038894114550203085, + "rewards/margins": 6.210503578186035, + "rewards/rejected": -9.829540252685547, + "step": 452 + }, + { + "epoch": 0.9705409748259239, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 13.121406224741564, + "learning_rate": 4.0095465393794747e-07, + "logits/chosen": 0.7076852321624756, + "logits/rejected": 0.5975925922393799, + "logps/accuracies": 0.75, + "logps/chosen": -533.9036254882812, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -425.0966796875, + "logps/ref_rejected": -362.5286560058594, + "logps/rejected": -631.1089477539062, + "loss": 0.162, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.4403462409973145, + "rewards/grad_term": 7.804081542417407e-05, + "rewards/margins": 7.988667011260986, + "rewards/rejected": -13.4290132522583, + "step": 453 + }, + { + "epoch": 0.9726834493840386, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 11.264528709557153, + "learning_rate": 4.0011933174224345e-07, + "logits/chosen": 0.5773014426231384, + "logits/rejected": 0.37737879157066345, + "logps/accuracies": 0.5, + "logps/chosen": -441.2142639160156, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -361.462890625, + "logps/ref_rejected": -243.72628784179688, + "logps/rejected": -388.4267272949219, + "loss": 0.1549, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9875683784484863, + "rewards/grad_term": 0.0044038868509233, + "rewards/margins": 3.2474539279937744, + "rewards/rejected": -7.23502254486084, + "step": 454 + }, + { + "epoch": 0.9748259239421532, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 17.35451032981818, + "learning_rate": 3.992840095465393e-07, + "logits/chosen": 0.7966763377189636, + "logits/rejected": 0.6966639161109924, + "logps/accuracies": 0.75, + "logps/chosen": -576.56640625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -470.6474304199219, + "logps/ref_rejected": -458.44146728515625, + "logps/rejected": -663.11572265625, + "loss": 0.2188, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.295950889587402, + "rewards/grad_term": 0.0004940250655636191, + "rewards/margins": 4.937762260437012, + "rewards/rejected": -10.233713150024414, + "step": 455 + }, + { + "epoch": 0.9769683985002678, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.682054456108627, + "learning_rate": 3.984486873508353e-07, + "logits/chosen": 0.9120834469795227, + "logits/rejected": 0.6336569786071777, + "logps/accuracies": 1.0, + "logps/chosen": -432.57769775390625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -369.40673828125, + "logps/ref_rejected": -295.18524169921875, + "logps/rejected": -496.9414978027344, + "loss": 0.1222, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1585474014282227, + "rewards/grad_term": 8.77337297424674e-05, + "rewards/margins": 6.929265022277832, + "rewards/rejected": -10.087812423706055, + "step": 456 + }, + { + "epoch": 0.9791108730583824, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.798800443633192, + "learning_rate": 3.976133651551313e-07, + "logits/chosen": 0.7685093879699707, + "logits/rejected": 0.6625791788101196, + "logps/accuracies": 0.75, + "logps/chosen": -364.49139404296875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -279.5849609375, + "logps/ref_rejected": -255.69351196289062, + "logps/rejected": -443.4805908203125, + "loss": 0.1135, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.245321750640869, + "rewards/grad_term": 0.007468358147889376, + "rewards/margins": 5.144031524658203, + "rewards/rejected": -9.389352798461914, + "step": 457 + }, + { + "epoch": 0.9812533476164971, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 10.393895759274342, + "learning_rate": 3.967780429594272e-07, + "logits/chosen": 0.7607497572898865, + "logits/rejected": 0.7056368589401245, + "logps/accuracies": 1.0, + "logps/chosen": -333.776123046875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -276.8760986328125, + "logps/ref_rejected": -238.05545043945312, + "logps/rejected": -385.234375, + "loss": 0.1466, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8449995517730713, + "rewards/grad_term": 0.010424750857055187, + "rewards/margins": 4.513948917388916, + "rewards/rejected": -7.358948230743408, + "step": 458 + }, + { + "epoch": 0.9833958221746116, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 13.14222406723579, + "learning_rate": 3.9594272076372313e-07, + "logits/chosen": 0.5317684412002563, + "logits/rejected": 0.3798009753227234, + "logps/accuracies": 0.5, + "logps/chosen": -388.8833923339844, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -284.9061584472656, + "logps/ref_rejected": -228.88584899902344, + "logps/rejected": -419.9346923828125, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1988606452941895, + "rewards/grad_term": 0.006651153787970543, + "rewards/margins": 4.35358190536499, + "rewards/rejected": -9.55244255065918, + "step": 459 + }, + { + "epoch": 0.9855382967327263, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 9.290117062157519, + "learning_rate": 3.9510739856801906e-07, + "logits/chosen": 0.7666717171669006, + "logits/rejected": 0.6038914918899536, + "logps/accuracies": 0.75, + "logps/chosen": -517.7335205078125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -393.77362060546875, + "logps/ref_rejected": -367.8737487792969, + "logps/rejected": -612.9761962890625, + "loss": 0.1298, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.197994232177734, + "rewards/grad_term": 0.002154412679374218, + "rewards/margins": 6.057126045227051, + "rewards/rejected": -12.255121231079102, + "step": 460 + }, + { + "epoch": 0.987680771290841, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 8.418543548161525, + "learning_rate": 3.9427207637231504e-07, + "logits/chosen": 0.4025576114654541, + "logits/rejected": 0.24721147119998932, + "logps/accuracies": 0.75, + "logps/chosen": -312.319091796875, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -264.7703857421875, + "logps/ref_rejected": -213.37884521484375, + "logps/rejected": -414.29150390625, + "loss": 0.1181, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3774335384368896, + "rewards/grad_term": 0.002074267016723752, + "rewards/margins": 7.668199062347412, + "rewards/rejected": -10.045632362365723, + "step": 461 + }, + { + "epoch": 0.9898232458489555, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 5.924066455521681, + "learning_rate": 3.934367541766109e-07, + "logits/chosen": 0.7529336810112, + "logits/rejected": 0.5446640253067017, + "logps/accuracies": 0.5, + "logps/chosen": -391.60406494140625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -341.4349365234375, + "logps/ref_rejected": -242.84326171875, + "logps/rejected": -389.7578125, + "loss": 0.0904, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5084564685821533, + "rewards/grad_term": 0.0086339320987463, + "rewards/margins": 4.837271690368652, + "rewards/rejected": -7.345727920532227, + "step": 462 + }, + { + "epoch": 0.9919657204070702, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.341264657659469, + "learning_rate": 3.926014319809069e-07, + "logits/chosen": 0.5941088199615479, + "logits/rejected": 0.5042127370834351, + "logps/accuracies": 1.0, + "logps/chosen": -501.0787353515625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -397.529541015625, + "logps/ref_rejected": -341.83441162109375, + "logps/rejected": -633.3097534179688, + "loss": 0.1559, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.177460193634033, + "rewards/grad_term": 4.0835613617673516e-05, + "rewards/margins": 9.396307945251465, + "rewards/rejected": -14.573768615722656, + "step": 463 + }, + { + "epoch": 0.9941081949651848, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.207024818309272, + "learning_rate": 3.9176610978520286e-07, + "logits/chosen": 0.3791959285736084, + "logits/rejected": 0.4412694573402405, + "logps/accuracies": 1.0, + "logps/chosen": -288.90289306640625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -218.5885772705078, + "logps/ref_rejected": -224.43006896972656, + "logps/rejected": -435.8794860839844, + "loss": 0.1242, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5157151222229004, + "rewards/grad_term": 0.0030132310930639505, + "rewards/margins": 7.056755065917969, + "rewards/rejected": -10.572470664978027, + "step": 464 + }, + { + "epoch": 0.9962506695232994, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.607745217245667, + "learning_rate": 3.9093078758949884e-07, + "logits/chosen": 0.5457050800323486, + "logits/rejected": 0.5041043162345886, + "logps/accuracies": 1.0, + "logps/chosen": -283.53179931640625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -227.90573120117188, + "logps/ref_rejected": -220.95458984375, + "logps/rejected": -366.6824951171875, + "loss": 0.1246, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.781303882598877, + "rewards/grad_term": 0.003597520524635911, + "rewards/margins": 4.50508975982666, + "rewards/rejected": -7.286393642425537, + "step": 465 + }, + { + "epoch": 0.998393144081414, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 7.293270758501063, + "learning_rate": 3.900954653937947e-07, + "logits/chosen": 0.6813356280326843, + "logits/rejected": 0.6149817109107971, + "logps/accuracies": 0.75, + "logps/chosen": -369.541748046875, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -312.1705017089844, + "logps/ref_rejected": -293.1113586425781, + "logps/rejected": -475.52325439453125, + "loss": 0.1107, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8685624599456787, + "rewards/grad_term": 0.005431050434708595, + "rewards/margins": 6.25203275680542, + "rewards/rejected": -9.120595932006836, + "step": 466 + }, + { + "epoch": 1.0005356186395287, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 7.806449837480457, + "learning_rate": 3.892601431980907e-07, + "logits/chosen": 0.5919859409332275, + "logits/rejected": 0.5162959694862366, + "logps/accuracies": 1.0, + "logps/chosen": -398.9145812988281, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -307.1631774902344, + "logps/ref_rejected": -271.4671630859375, + "logps/rejected": -484.1536865234375, + "loss": 0.0956, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5875701904296875, + "rewards/grad_term": 0.0008761522476561368, + "rewards/margins": 6.046757221221924, + "rewards/rejected": -10.63432788848877, + "step": 467 + }, + { + "epoch": 1.0026780931976433, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 3.522329323680681, + "learning_rate": 3.884248210023866e-07, + "logits/chosen": 0.6553293466567993, + "logits/rejected": 0.5018194913864136, + "logps/accuracies": 0.75, + "logps/chosen": -324.5142517089844, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -261.5387878417969, + "logps/ref_rejected": -222.56907653808594, + "logps/rejected": -426.75311279296875, + "loss": 0.0507, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1487741470336914, + "rewards/grad_term": 0.0020364022348076105, + "rewards/margins": 7.060428619384766, + "rewards/rejected": -10.20920181274414, + "step": 468 + }, + { + "epoch": 1.004820567755758, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 6.612470098819514, + "learning_rate": 3.8758949880668254e-07, + "logits/chosen": 0.6158171892166138, + "logits/rejected": 0.5050429105758667, + "logps/accuracies": 0.5, + "logps/chosen": -406.8590087890625, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -289.06756591796875, + "logps/ref_rejected": -229.218994140625, + "logps/rejected": -454.9029846191406, + "loss": 0.1037, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.889570713043213, + "rewards/grad_term": 0.0034782271832227707, + "rewards/margins": 5.39462947845459, + "rewards/rejected": -11.284200668334961, + "step": 469 + }, + { + "epoch": 1.0069630423138725, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 3.539220855844499, + "learning_rate": 3.8675417661097847e-07, + "logits/chosen": 0.7073222994804382, + "logits/rejected": 0.671281635761261, + "logps/accuracies": 0.75, + "logps/chosen": -430.43096923828125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -342.07391357421875, + "logps/ref_rejected": -324.9189453125, + "logps/rejected": -591.3267211914062, + "loss": 0.041, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.417852401733398, + "rewards/grad_term": 0.00558136124163866, + "rewards/margins": 8.902535438537598, + "rewards/rejected": -13.320388793945312, + "step": 470 + }, + { + "epoch": 1.0091055168719871, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 6.354661123726489, + "learning_rate": 3.8591885441527445e-07, + "logits/chosen": 0.7619997262954712, + "logits/rejected": 0.706304669380188, + "logps/accuracies": 0.75, + "logps/chosen": -384.338623046875, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -320.0787658691406, + "logps/ref_rejected": -228.5266876220703, + "logps/rejected": -378.6219482421875, + "loss": 0.0982, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.212991237640381, + "rewards/grad_term": 0.0007301281439140439, + "rewards/margins": 4.291770935058594, + "rewards/rejected": -7.504762649536133, + "step": 471 + }, + { + "epoch": 1.0112479914301018, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 4.790880655224275, + "learning_rate": 3.8508353221957043e-07, + "logits/chosen": 0.7498035430908203, + "logits/rejected": 0.518231987953186, + "logps/accuracies": 0.75, + "logps/chosen": -450.41552734375, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -375.4889221191406, + "logps/ref_rejected": -315.0843200683594, + "logps/rejected": -485.2691650390625, + "loss": 0.0852, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7463297843933105, + "rewards/grad_term": 0.0010887248208746314, + "rewards/margins": 4.762913227081299, + "rewards/rejected": -8.50924301147461, + "step": 472 + }, + { + "epoch": 1.0133904659882165, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 5.7481678574942885, + "learning_rate": 3.842482100238663e-07, + "logits/chosen": 0.5814411640167236, + "logits/rejected": 0.48274481296539307, + "logps/accuracies": 1.0, + "logps/chosen": -477.7788391113281, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -356.0843200683594, + "logps/ref_rejected": -339.63299560546875, + "logps/rejected": -642.8599853515625, + "loss": 0.1017, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.084725856781006, + "rewards/grad_term": 1.688380325504113e-05, + "rewards/margins": 9.07662296295166, + "rewards/rejected": -15.16135025024414, + "step": 473 + }, + { + "epoch": 1.015532940546331, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.361744125881987, + "learning_rate": 3.834128878281623e-07, + "logits/chosen": 0.9029494524002075, + "logits/rejected": 0.8422863483428955, + "logps/accuracies": 1.0, + "logps/chosen": -642.5513916015625, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -501.9047546386719, + "logps/ref_rejected": -415.24755859375, + "logps/rejected": -698.61962890625, + "loss": 0.104, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.0323286056518555, + "rewards/grad_term": 0.001934091211296618, + "rewards/margins": 7.1362762451171875, + "rewards/rejected": -14.16860580444336, + "step": 474 + }, + { + "epoch": 1.0176754151044456, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 4.64410207259052, + "learning_rate": 3.8257756563245826e-07, + "logits/chosen": 0.5650697946548462, + "logits/rejected": 0.5173856019973755, + "logps/accuracies": 1.0, + "logps/chosen": -540.633056640625, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -457.04205322265625, + "logps/ref_rejected": -435.2861633300781, + "logps/rejected": -643.3292846679688, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.1795477867126465, + "rewards/grad_term": 0.0007545886328443885, + "rewards/margins": 6.2226080894470215, + "rewards/rejected": -10.402155876159668, + "step": 475 + }, + { + "epoch": 1.0198178896625603, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.25, + "flips/incorrect->incorrect": 0.25, + "grad_norm": 4.573143939908089, + "learning_rate": 3.817422434367542e-07, + "logits/chosen": 0.5865468978881836, + "logits/rejected": 0.7002366185188293, + "logps/accuracies": 0.75, + "logps/chosen": -433.32452392578125, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -351.2342529296875, + "logps/ref_rejected": -321.6819152832031, + "logps/rejected": -513.435791015625, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.104512691497803, + "rewards/grad_term": 0.003012165194377303, + "rewards/margins": 5.483182430267334, + "rewards/rejected": -9.587695121765137, + "step": 476 + }, + { + "epoch": 1.021960364220675, + "flips/correct->correct": 0.25, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.75, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 8.11310764037292, + "learning_rate": 3.809069212410501e-07, + "logits/chosen": 0.4245867133140564, + "logits/rejected": 0.4632042348384857, + "logps/accuracies": 1.0, + "logps/chosen": -426.9747314453125, + "logps/ref_accuracies": 0.25, + "logps/ref_chosen": -358.86248779296875, + "logps/ref_rejected": -292.428466796875, + "logps/rejected": -462.8052673339844, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4056124687194824, + "rewards/grad_term": 0.001515088020823896, + "rewards/margins": 5.113227844238281, + "rewards/rejected": -8.518840789794922, + "step": 477 + }, + { + "epoch": 1.0241028387787896, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.5, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 4.791373581896772, + "learning_rate": 3.8007159904534603e-07, + "logits/chosen": 0.5890440940856934, + "logits/rejected": 0.36139726638793945, + "logps/accuracies": 0.5, + "logps/chosen": -490.9143371582031, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -361.0188903808594, + "logps/ref_rejected": -289.797119140625, + "logps/rejected": -517.291259765625, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.494773864746094, + "rewards/grad_term": 0.0016336999833583832, + "rewards/margins": 4.879931449890137, + "rewards/rejected": -11.37470531463623, + "step": 478 + }, + { + "epoch": 1.026245313336904, + "flips/correct->correct": 0.0, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 1.0, + "flips/incorrect->incorrect": 0.0, + "grad_norm": 9.419742623980941, + "learning_rate": 3.79236276849642e-07, + "logits/chosen": 0.6677709817886353, + "logits/rejected": 0.6186258792877197, + "logps/accuracies": 1.0, + "logps/chosen": -357.1605224609375, + "logps/ref_accuracies": 0.0, + "logps/ref_chosen": -280.62677001953125, + "logps/ref_rejected": -218.89462280273438, + "logps/rejected": -433.9444274902344, + "loss": 0.0807, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.826688766479492, + "rewards/grad_term": 0.004192608408629894, + "rewards/margins": 6.925801753997803, + "rewards/rejected": -10.752490043640137, + "step": 479 + }, + { + "epoch": 1.0283877878950187, + "flips/correct->correct": 0.5, + "flips/correct->incorrect": 0.0, + "flips/incorrect->correct": 0.0, + "flips/incorrect->incorrect": 0.5, + "grad_norm": 5.500558735593387, + "learning_rate": 3.784009546539379e-07, + "logits/chosen": 0.5895904302597046, + "logits/rejected": 0.5445250868797302, + "logps/accuracies": 0.5, + "logps/chosen": -155.37521362304688, + "logps/ref_accuracies": 0.5, + "logps/ref_chosen": -109.69457244873047, + "logps/ref_rejected": -98.66912078857422, + "logps/rejected": -187.35279846191406, + "loss": 0.0968, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.284031867980957, + "rewards/grad_term": 0.0074999695643782616, + "rewards/margins": 2.1501517295837402, + "rewards/rejected": -4.4341840744018555, + "step": 480 + } + ], + "logging_steps": 1, + "max_steps": 932, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 96, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}