diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6297 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 100, + "global_step": 4164, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007204610951008645, + "grad_norm": 14.58157977905889, + "learning_rate": 1.199040767386091e-10, + "logits/chosen": -1.901450514793396, + "logits/rejected": -1.9076323509216309, + "logps/chosen": -0.8524526953697205, + "logps/rejected": -0.9626365900039673, + "loss": 1.1927, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.704905390739441, + "rewards/margins": 0.22036786377429962, + "rewards/rejected": -1.9252731800079346, + "step": 1 + }, + { + "epoch": 0.007204610951008645, + "grad_norm": 17.76736608782741, + "learning_rate": 1.199040767386091e-09, + "logits/chosen": -2.0206170082092285, + "logits/rejected": -2.0063347816467285, + "logps/chosen": -1.0049196481704712, + "logps/rejected": -1.1093952655792236, + "loss": 1.2168, + "rewards/accuracies": 0.5208333134651184, + "rewards/chosen": -2.0098392963409424, + "rewards/margins": 0.2089509218931198, + "rewards/rejected": -2.2187905311584473, + "step": 10 + }, + { + "epoch": 0.01440922190201729, + "grad_norm": 22.614753087644292, + "learning_rate": 2.398081534772182e-09, + "logits/chosen": -2.026459217071533, + "logits/rejected": -2.0231809616088867, + "logps/chosen": -1.051859736442566, + "logps/rejected": -1.1832743883132935, + "loss": 1.1863, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.103719472885132, + "rewards/margins": 0.2628290057182312, + "rewards/rejected": -2.366548776626587, + "step": 20 + }, + { + "epoch": 0.021613832853025938, + "grad_norm": 17.824346372572926, + "learning_rate": 3.597122302158273e-09, + "logits/chosen": -1.981697678565979, + "logits/rejected": -1.9744222164154053, + "logps/chosen": -1.053879976272583, + "logps/rejected": -1.1511423587799072, + "loss": 1.2353, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.107759952545166, + "rewards/margins": 0.19452433288097382, + "rewards/rejected": -2.3022847175598145, + "step": 30 + }, + { + "epoch": 0.02881844380403458, + "grad_norm": 19.247706292689507, + "learning_rate": 4.796163069544364e-09, + "logits/chosen": -2.0287587642669678, + "logits/rejected": -2.028596878051758, + "logps/chosen": -1.0359481573104858, + "logps/rejected": -1.1375384330749512, + "loss": 1.2355, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0718963146209717, + "rewards/margins": 0.20318038761615753, + "rewards/rejected": -2.2750768661499023, + "step": 40 + }, + { + "epoch": 0.03602305475504323, + "grad_norm": 14.992901360893413, + "learning_rate": 5.995203836930456e-09, + "logits/chosen": -1.962505578994751, + "logits/rejected": -1.9632362127304077, + "logps/chosen": -0.9416370391845703, + "logps/rejected": -1.0078415870666504, + "loss": 1.2545, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.8832740783691406, + "rewards/margins": 0.13240887224674225, + "rewards/rejected": -2.015683174133301, + "step": 50 + }, + { + "epoch": 0.043227665706051875, + "grad_norm": 21.508515110852976, + "learning_rate": 7.194244604316546e-09, + "logits/chosen": -2.0391106605529785, + "logits/rejected": -2.034660816192627, + "logps/chosen": -1.0891697406768799, + "logps/rejected": -1.145775556564331, + "loss": 1.2676, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.1783394813537598, + "rewards/margins": 0.11321155726909637, + "rewards/rejected": -2.291551113128662, + "step": 60 + }, + { + "epoch": 0.05043227665706052, + "grad_norm": 20.688044326046224, + "learning_rate": 8.393285371702639e-09, + "logits/chosen": -2.029348373413086, + "logits/rejected": -2.016831636428833, + "logps/chosen": -1.1090962886810303, + "logps/rejected": -1.204714059829712, + "loss": 1.226, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2181925773620605, + "rewards/margins": 0.19123554229736328, + "rewards/rejected": -2.409428119659424, + "step": 70 + }, + { + "epoch": 0.05763688760806916, + "grad_norm": 24.41033214526541, + "learning_rate": 9.592326139088728e-09, + "logits/chosen": -2.046764850616455, + "logits/rejected": -2.043759822845459, + "logps/chosen": -1.166001558303833, + "logps/rejected": -1.237687110900879, + "loss": 1.2535, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.332003116607666, + "rewards/margins": 0.1433708667755127, + "rewards/rejected": -2.475374221801758, + "step": 80 + }, + { + "epoch": 0.06484149855907781, + "grad_norm": 15.594986746473925, + "learning_rate": 1.0791366906474819e-08, + "logits/chosen": -2.0026838779449463, + "logits/rejected": -2.00419545173645, + "logps/chosen": -1.0416425466537476, + "logps/rejected": -1.148652195930481, + "loss": 1.215, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.083285093307495, + "rewards/margins": 0.2140192985534668, + "rewards/rejected": -2.297304391860962, + "step": 90 + }, + { + "epoch": 0.07204610951008646, + "grad_norm": 19.00699314951204, + "learning_rate": 1.1990407673860912e-08, + "logits/chosen": -2.040858268737793, + "logits/rejected": -2.0346200466156006, + "logps/chosen": -1.0072879791259766, + "logps/rejected": -1.1140906810760498, + "loss": 1.2176, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.014575958251953, + "rewards/margins": 0.21360567212104797, + "rewards/rejected": -2.2281813621520996, + "step": 100 + }, + { + "epoch": 0.0792507204610951, + "grad_norm": 16.51858878389513, + "learning_rate": 1.3189448441247003e-08, + "logits/chosen": -1.9792842864990234, + "logits/rejected": -1.9680954217910767, + "logps/chosen": -1.0292143821716309, + "logps/rejected": -1.1284914016723633, + "loss": 1.2285, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0584287643432617, + "rewards/margins": 0.19855372607707977, + "rewards/rejected": -2.2569828033447266, + "step": 110 + }, + { + "epoch": 0.08645533141210375, + "grad_norm": 18.233760151089655, + "learning_rate": 1.4388489208633092e-08, + "logits/chosen": -1.972887396812439, + "logits/rejected": -1.9710506200790405, + "logps/chosen": -0.9646250009536743, + "logps/rejected": -1.0660240650177002, + "loss": 1.2089, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9292500019073486, + "rewards/margins": 0.20279808342456818, + "rewards/rejected": -2.1320481300354004, + "step": 120 + }, + { + "epoch": 0.0936599423631124, + "grad_norm": 17.354185009707173, + "learning_rate": 1.5587529976019183e-08, + "logits/chosen": -2.062894105911255, + "logits/rejected": -2.0622401237487793, + "logps/chosen": -1.0803730487823486, + "logps/rejected": -1.1523029804229736, + "loss": 1.2547, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.1607460975646973, + "rewards/margins": 0.14385986328125, + "rewards/rejected": -2.3046059608459473, + "step": 130 + }, + { + "epoch": 0.10086455331412104, + "grad_norm": 20.84722939621763, + "learning_rate": 1.6786570743405277e-08, + "logits/chosen": -1.9781713485717773, + "logits/rejected": -1.971671462059021, + "logps/chosen": -0.9779410362243652, + "logps/rejected": -1.1225957870483398, + "loss": 1.1689, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9558820724487305, + "rewards/margins": 0.28930944204330444, + "rewards/rejected": -2.2451915740966797, + "step": 140 + }, + { + "epoch": 0.10806916426512968, + "grad_norm": 20.178997351363016, + "learning_rate": 1.7985611510791365e-08, + "logits/chosen": -1.9949369430541992, + "logits/rejected": -1.990666389465332, + "logps/chosen": -1.0193713903427124, + "logps/rejected": -1.136603593826294, + "loss": 1.2076, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.038742780685425, + "rewards/margins": 0.23446419835090637, + "rewards/rejected": -2.273207187652588, + "step": 150 + }, + { + "epoch": 0.11527377521613832, + "grad_norm": 17.43558543499963, + "learning_rate": 1.9184652278177456e-08, + "logits/chosen": -2.002195358276367, + "logits/rejected": -1.9960581064224243, + "logps/chosen": -0.948249340057373, + "logps/rejected": -1.0968583822250366, + "loss": 1.1513, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.896498680114746, + "rewards/margins": 0.2972180247306824, + "rewards/rejected": -2.1937167644500732, + "step": 160 + }, + { + "epoch": 0.12247838616714697, + "grad_norm": 22.73064399272494, + "learning_rate": 2.038369304556355e-08, + "logits/chosen": -2.005837917327881, + "logits/rejected": -1.9983062744140625, + "logps/chosen": -1.0370620489120483, + "logps/rejected": -1.1609737873077393, + "loss": 1.2056, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0741240978240967, + "rewards/margins": 0.24782316386699677, + "rewards/rejected": -2.3219475746154785, + "step": 170 + }, + { + "epoch": 0.12968299711815562, + "grad_norm": 23.537767016698364, + "learning_rate": 2.1582733812949638e-08, + "logits/chosen": -2.0367612838745117, + "logits/rejected": -2.029956817626953, + "logps/chosen": -1.02077317237854, + "logps/rejected": -1.1086028814315796, + "loss": 1.2477, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.04154634475708, + "rewards/margins": 0.1756592094898224, + "rewards/rejected": -2.217205762863159, + "step": 180 + }, + { + "epoch": 0.13688760806916425, + "grad_norm": 23.18810653891807, + "learning_rate": 2.278177458033573e-08, + "logits/chosen": -2.077205181121826, + "logits/rejected": -2.0750718116760254, + "logps/chosen": -0.9699970483779907, + "logps/rejected": -1.065187692642212, + "loss": 1.2125, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9399940967559814, + "rewards/margins": 0.19038136303424835, + "rewards/rejected": -2.130375385284424, + "step": 190 + }, + { + "epoch": 0.1440922190201729, + "grad_norm": 22.445024845318002, + "learning_rate": 2.3980815347721823e-08, + "logits/chosen": -2.0375380516052246, + "logits/rejected": -2.034369945526123, + "logps/chosen": -1.026186227798462, + "logps/rejected": -1.1526433229446411, + "loss": 1.1878, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.052372455596924, + "rewards/margins": 0.2529138922691345, + "rewards/rejected": -2.3052866458892822, + "step": 200 + }, + { + "epoch": 0.15129682997118155, + "grad_norm": 21.106523936582494, + "learning_rate": 2.517985611510791e-08, + "logits/chosen": -2.036905288696289, + "logits/rejected": -2.0340917110443115, + "logps/chosen": -1.073853611946106, + "logps/rejected": -1.150638461112976, + "loss": 1.2507, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.147707223892212, + "rewards/margins": 0.1535697877407074, + "rewards/rejected": -2.301276922225952, + "step": 210 + }, + { + "epoch": 0.1585014409221902, + "grad_norm": 15.517023295570512, + "learning_rate": 2.6378896882494006e-08, + "logits/chosen": -1.9886398315429688, + "logits/rejected": -1.9846597909927368, + "logps/chosen": -1.0078786611557007, + "logps/rejected": -1.1769925355911255, + "loss": 1.1505, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0157573223114014, + "rewards/margins": 0.3382275700569153, + "rewards/rejected": -2.353985071182251, + "step": 220 + }, + { + "epoch": 0.16570605187319884, + "grad_norm": 17.085486504816398, + "learning_rate": 2.7577937649880097e-08, + "logits/chosen": -2.0190815925598145, + "logits/rejected": -2.0195024013519287, + "logps/chosen": -1.01227605342865, + "logps/rejected": -1.1264681816101074, + "loss": 1.2015, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0245521068573, + "rewards/margins": 0.2283841073513031, + "rewards/rejected": -2.252936363220215, + "step": 230 + }, + { + "epoch": 0.1729106628242075, + "grad_norm": 22.24970353019487, + "learning_rate": 2.8776978417266184e-08, + "logits/chosen": -2.0530002117156982, + "logits/rejected": -2.0478739738464355, + "logps/chosen": -1.0617554187774658, + "logps/rejected": -1.1395084857940674, + "loss": 1.2618, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1235108375549316, + "rewards/margins": 0.15550628304481506, + "rewards/rejected": -2.2790169715881348, + "step": 240 + }, + { + "epoch": 0.18011527377521613, + "grad_norm": 19.11674829624308, + "learning_rate": 2.997601918465228e-08, + "logits/chosen": -1.9721157550811768, + "logits/rejected": -1.968205451965332, + "logps/chosen": -1.0830333232879639, + "logps/rejected": -1.1736047267913818, + "loss": 1.2384, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1660666465759277, + "rewards/margins": 0.18114277720451355, + "rewards/rejected": -2.3472094535827637, + "step": 250 + }, + { + "epoch": 0.1873198847262248, + "grad_norm": 21.26207716688974, + "learning_rate": 3.1175059952038366e-08, + "logits/chosen": -1.9892946481704712, + "logits/rejected": -1.997536063194275, + "logps/chosen": -1.1055234670639038, + "logps/rejected": -1.2160685062408447, + "loss": 1.2139, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.2110469341278076, + "rewards/margins": 0.22109034657478333, + "rewards/rejected": -2.4321370124816895, + "step": 260 + }, + { + "epoch": 0.19452449567723343, + "grad_norm": 20.68682788684347, + "learning_rate": 3.237410071942446e-08, + "logits/chosen": -2.064192295074463, + "logits/rejected": -2.0562119483947754, + "logps/chosen": -1.0712614059448242, + "logps/rejected": -1.2003023624420166, + "loss": 1.1803, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1425228118896484, + "rewards/margins": 0.2580817937850952, + "rewards/rejected": -2.400604724884033, + "step": 270 + }, + { + "epoch": 0.2017291066282421, + "grad_norm": 25.11253609162999, + "learning_rate": 3.3573141486810555e-08, + "logits/chosen": -2.008389472961426, + "logits/rejected": -2.0066072940826416, + "logps/chosen": -0.9357258677482605, + "logps/rejected": -1.049773097038269, + "loss": 1.1981, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.871451735496521, + "rewards/margins": 0.22809453308582306, + "rewards/rejected": -2.099546194076538, + "step": 280 + }, + { + "epoch": 0.20893371757925072, + "grad_norm": 21.796986905635144, + "learning_rate": 3.477218225419664e-08, + "logits/chosen": -2.0430212020874023, + "logits/rejected": -2.044867992401123, + "logps/chosen": -1.0136518478393555, + "logps/rejected": -1.1080281734466553, + "loss": 1.2347, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.027303695678711, + "rewards/margins": 0.1887526512145996, + "rewards/rejected": -2.2160563468933105, + "step": 290 + }, + { + "epoch": 0.21613832853025935, + "grad_norm": 20.372987042015918, + "learning_rate": 3.597122302158273e-08, + "logits/chosen": -2.0230350494384766, + "logits/rejected": -2.0147769451141357, + "logps/chosen": -1.0902057886123657, + "logps/rejected": -1.191245436668396, + "loss": 1.2137, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1804115772247314, + "rewards/margins": 0.20207944512367249, + "rewards/rejected": -2.382490873336792, + "step": 300 + }, + { + "epoch": 0.22334293948126802, + "grad_norm": 18.57882925465559, + "learning_rate": 3.717026378896883e-08, + "logits/chosen": -1.9549649953842163, + "logits/rejected": -1.9548736810684204, + "logps/chosen": -1.0871379375457764, + "logps/rejected": -1.1725897789001465, + "loss": 1.2377, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1742758750915527, + "rewards/margins": 0.17090332508087158, + "rewards/rejected": -2.345179557800293, + "step": 310 + }, + { + "epoch": 0.23054755043227665, + "grad_norm": 15.975684555438873, + "learning_rate": 3.836930455635491e-08, + "logits/chosen": -2.0300118923187256, + "logits/rejected": -2.0213980674743652, + "logps/chosen": -1.0087685585021973, + "logps/rejected": -1.1406135559082031, + "loss": 1.1934, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0175371170043945, + "rewards/margins": 0.2636898159980774, + "rewards/rejected": -2.2812271118164062, + "step": 320 + }, + { + "epoch": 0.2377521613832853, + "grad_norm": 15.772574632019396, + "learning_rate": 3.9568345323741003e-08, + "logits/chosen": -2.0156402587890625, + "logits/rejected": -2.0179450511932373, + "logps/chosen": -1.0460145473480225, + "logps/rejected": -1.069695234298706, + "loss": 1.3364, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -2.092029094696045, + "rewards/margins": 0.04736141860485077, + "rewards/rejected": -2.139390468597412, + "step": 330 + }, + { + "epoch": 0.24495677233429394, + "grad_norm": 18.38190578321181, + "learning_rate": 4.07673860911271e-08, + "logits/chosen": -2.0608153343200684, + "logits/rejected": -2.055126667022705, + "logps/chosen": -1.0875434875488281, + "logps/rejected": -1.167794108390808, + "loss": 1.2366, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1750869750976562, + "rewards/margins": 0.16050121188163757, + "rewards/rejected": -2.335588216781616, + "step": 340 + }, + { + "epoch": 0.2521613832853026, + "grad_norm": 19.343882527589155, + "learning_rate": 4.1966426858513185e-08, + "logits/chosen": -1.9883911609649658, + "logits/rejected": -1.9827582836151123, + "logps/chosen": -0.9889104962348938, + "logps/rejected": -1.1158192157745361, + "loss": 1.1858, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.9778209924697876, + "rewards/margins": 0.25381720066070557, + "rewards/rejected": -2.2316384315490723, + "step": 350 + }, + { + "epoch": 0.25936599423631124, + "grad_norm": 21.595830091643787, + "learning_rate": 4.3165467625899276e-08, + "logits/chosen": -1.9964408874511719, + "logits/rejected": -1.9924728870391846, + "logps/chosen": -1.0861265659332275, + "logps/rejected": -1.2027567625045776, + "loss": 1.1971, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.172253131866455, + "rewards/margins": 0.23326051235198975, + "rewards/rejected": -2.4055135250091553, + "step": 360 + }, + { + "epoch": 0.2665706051873199, + "grad_norm": 18.205487526741695, + "learning_rate": 4.4364508393285374e-08, + "logits/chosen": -2.007871389389038, + "logits/rejected": -2.007930278778076, + "logps/chosen": -1.05240797996521, + "logps/rejected": -1.1806955337524414, + "loss": 1.1777, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.10481595993042, + "rewards/margins": 0.25657495856285095, + "rewards/rejected": -2.361391067504883, + "step": 370 + }, + { + "epoch": 0.2737752161383285, + "grad_norm": 16.5239346092299, + "learning_rate": 4.556354916067146e-08, + "logits/chosen": -2.0331404209136963, + "logits/rejected": -2.0373125076293945, + "logps/chosen": -1.0126136541366577, + "logps/rejected": -1.0856488943099976, + "loss": 1.2688, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0252273082733154, + "rewards/margins": 0.14607074856758118, + "rewards/rejected": -2.171297788619995, + "step": 380 + }, + { + "epoch": 0.28097982708933716, + "grad_norm": 15.274744597058485, + "learning_rate": 4.676258992805755e-08, + "logits/chosen": -2.0328099727630615, + "logits/rejected": -2.0266880989074707, + "logps/chosen": -1.0222868919372559, + "logps/rejected": -1.1483510732650757, + "loss": 1.1822, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0445737838745117, + "rewards/margins": 0.2521281838417053, + "rewards/rejected": -2.2967021465301514, + "step": 390 + }, + { + "epoch": 0.2881844380403458, + "grad_norm": 19.044775104319285, + "learning_rate": 4.796163069544365e-08, + "logits/chosen": -2.0326080322265625, + "logits/rejected": -2.0330114364624023, + "logps/chosen": -0.9962165951728821, + "logps/rejected": -1.049239993095398, + "loss": 1.2716, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9924331903457642, + "rewards/margins": 0.106046661734581, + "rewards/rejected": -2.098479986190796, + "step": 400 + }, + { + "epoch": 0.2953890489913545, + "grad_norm": 18.59587302480316, + "learning_rate": 4.916067146282973e-08, + "logits/chosen": -2.0307698249816895, + "logits/rejected": -2.028846025466919, + "logps/chosen": -1.0742970705032349, + "logps/rejected": -1.1461079120635986, + "loss": 1.2611, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.1485941410064697, + "rewards/margins": 0.1436215192079544, + "rewards/rejected": -2.2922158241271973, + "step": 410 + }, + { + "epoch": 0.3025936599423631, + "grad_norm": 16.815742736953002, + "learning_rate": 4.999992091672379e-08, + "logits/chosen": -2.0101230144500732, + "logits/rejected": -2.0144143104553223, + "logps/chosen": -1.0453675985336304, + "logps/rejected": -1.1239204406738281, + "loss": 1.2427, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.0907351970672607, + "rewards/margins": 0.1571054756641388, + "rewards/rejected": -2.2478408813476562, + "step": 420 + }, + { + "epoch": 0.30979827089337175, + "grad_norm": 17.75452942549568, + "learning_rate": 4.999851500573209e-08, + "logits/chosen": -1.9898436069488525, + "logits/rejected": -1.9907649755477905, + "logps/chosen": -1.0584254264831543, + "logps/rejected": -1.0997257232666016, + "loss": 1.3009, + "rewards/accuracies": 0.46875, + "rewards/chosen": -2.1168508529663086, + "rewards/margins": 0.08260075747966766, + "rewards/rejected": -2.199451446533203, + "step": 430 + }, + { + "epoch": 0.3170028818443804, + "grad_norm": 15.929802428494124, + "learning_rate": 4.999535180235972e-08, + "logits/chosen": -1.9864879846572876, + "logits/rejected": -1.9866752624511719, + "logps/chosen": -1.0216079950332642, + "logps/rejected": -1.143937110900879, + "loss": 1.1961, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0432159900665283, + "rewards/margins": 0.2446581870317459, + "rewards/rejected": -2.287874221801758, + "step": 440 + }, + { + "epoch": 0.3242074927953891, + "grad_norm": 17.924032675250256, + "learning_rate": 4.9990431528966836e-08, + "logits/chosen": -2.0104360580444336, + "logits/rejected": -2.006624221801758, + "logps/chosen": -1.1455620527267456, + "logps/rejected": -1.1853464841842651, + "loss": 1.3022, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.291124105453491, + "rewards/margins": 0.07956884801387787, + "rewards/rejected": -2.3706929683685303, + "step": 450 + }, + { + "epoch": 0.3314121037463977, + "grad_norm": 24.17220460895587, + "learning_rate": 4.9983754531428326e-08, + "logits/chosen": -2.0079081058502197, + "logits/rejected": -2.00258731842041, + "logps/chosen": -1.1706523895263672, + "logps/rejected": -1.2871944904327393, + "loss": 1.2011, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.3413047790527344, + "rewards/margins": 0.23308411240577698, + "rewards/rejected": -2.5743889808654785, + "step": 460 + }, + { + "epoch": 0.33861671469740634, + "grad_norm": 22.958829143377635, + "learning_rate": 4.997532127910954e-08, + "logits/chosen": -2.04119873046875, + "logits/rejected": -2.029076099395752, + "logps/chosen": -1.100618839263916, + "logps/rejected": -1.202358365058899, + "loss": 1.2196, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.201237678527832, + "rewards/margins": 0.20347890257835388, + "rewards/rejected": -2.404716730117798, + "step": 470 + }, + { + "epoch": 0.345821325648415, + "grad_norm": 21.220771734165737, + "learning_rate": 4.996513236483331e-08, + "logits/chosen": -2.10054087638855, + "logits/rejected": -2.090383768081665, + "logps/chosen": -0.9847520589828491, + "logps/rejected": -1.1071968078613281, + "loss": 1.1835, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.9695041179656982, + "rewards/margins": 0.24488914012908936, + "rewards/rejected": -2.2143936157226562, + "step": 480 + }, + { + "epoch": 0.3530259365994236, + "grad_norm": 18.815281247411335, + "learning_rate": 4.9953188504838225e-08, + "logits/chosen": -2.023686408996582, + "logits/rejected": -2.0228590965270996, + "logps/chosen": -0.9880355596542358, + "logps/rejected": -1.1021173000335693, + "loss": 1.1932, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.9760711193084717, + "rewards/margins": 0.22816362977027893, + "rewards/rejected": -2.2042346000671387, + "step": 490 + }, + { + "epoch": 0.36023054755043227, + "grad_norm": 18.652142843140656, + "learning_rate": 4.993949053872834e-08, + "logits/chosen": -2.0161242485046387, + "logits/rejected": -2.0025501251220703, + "logps/chosen": -1.012613296508789, + "logps/rejected": -1.140053391456604, + "loss": 1.1806, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.025226593017578, + "rewards/margins": 0.25488021969795227, + "rewards/rejected": -2.280106782913208, + "step": 500 + }, + { + "epoch": 0.36743515850144093, + "grad_norm": 19.243693238325108, + "learning_rate": 4.9924039429414086e-08, + "logits/chosen": -2.087251663208008, + "logits/rejected": -2.080854654312134, + "logps/chosen": -1.0440865755081177, + "logps/rejected": -1.158582091331482, + "loss": 1.2076, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0881731510162354, + "rewards/margins": 0.2289910614490509, + "rewards/rejected": -2.317164182662964, + "step": 510 + }, + { + "epoch": 0.3746397694524496, + "grad_norm": 16.084908380302842, + "learning_rate": 4.990683626304467e-08, + "logits/chosen": -2.010878801345825, + "logits/rejected": -2.009476900100708, + "logps/chosen": -1.1068270206451416, + "logps/rejected": -1.2030669450759888, + "loss": 1.2196, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.213654041290283, + "rewards/margins": 0.19247998297214508, + "rewards/rejected": -2.4061338901519775, + "step": 520 + }, + { + "epoch": 0.3818443804034582, + "grad_norm": 17.74791604713886, + "learning_rate": 4.9887882248931646e-08, + "logits/chosen": -1.9751720428466797, + "logits/rejected": -1.9651315212249756, + "logps/chosen": -0.9842063188552856, + "logps/rejected": -1.0612623691558838, + "loss": 1.25, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9684126377105713, + "rewards/margins": 0.1541123390197754, + "rewards/rejected": -2.1225247383117676, + "step": 530 + }, + { + "epoch": 0.38904899135446686, + "grad_norm": 22.74397828503106, + "learning_rate": 4.986717871946393e-08, + "logits/chosen": -2.0001473426818848, + "logits/rejected": -1.9932626485824585, + "logps/chosen": -1.0306423902511597, + "logps/rejected": -1.132361650466919, + "loss": 1.2207, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0612847805023193, + "rewards/margins": 0.20343880355358124, + "rewards/rejected": -2.264723300933838, + "step": 540 + }, + { + "epoch": 0.3962536023054755, + "grad_norm": 17.28890965818364, + "learning_rate": 4.984472713001416e-08, + "logits/chosen": -1.9692100286483765, + "logits/rejected": -1.9698715209960938, + "logps/chosen": -1.0003505945205688, + "logps/rejected": -1.0772594213485718, + "loss": 1.2685, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.0007011890411377, + "rewards/margins": 0.1538175493478775, + "rewards/rejected": -2.1545188426971436, + "step": 550 + }, + { + "epoch": 0.4034582132564842, + "grad_norm": 17.119812277596985, + "learning_rate": 4.982052905883637e-08, + "logits/chosen": -2.0280909538269043, + "logits/rejected": -2.0286812782287598, + "logps/chosen": -1.0809977054595947, + "logps/rejected": -1.1807363033294678, + "loss": 1.2255, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.1619954109191895, + "rewards/margins": 0.1994771659374237, + "rewards/rejected": -2.3614726066589355, + "step": 560 + }, + { + "epoch": 0.4106628242074928, + "grad_norm": 16.296045615559738, + "learning_rate": 4.979458620695505e-08, + "logits/chosen": -2.0341217517852783, + "logits/rejected": -2.0200934410095215, + "logps/chosen": -1.0948175191879272, + "logps/rejected": -1.2078857421875, + "loss": 1.2102, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.1896350383758545, + "rewards/margins": 0.22613653540611267, + "rewards/rejected": -2.415771484375, + "step": 570 + }, + { + "epoch": 0.41786743515850144, + "grad_norm": 19.547805034076156, + "learning_rate": 4.976690039804555e-08, + "logits/chosen": -2.0328009128570557, + "logits/rejected": -2.03126859664917, + "logps/chosen": -0.9873042106628418, + "logps/rejected": -1.0673751831054688, + "loss": 1.2467, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9746084213256836, + "rewards/margins": 0.16014157235622406, + "rewards/rejected": -2.1347503662109375, + "step": 580 + }, + { + "epoch": 0.4250720461095101, + "grad_norm": 21.43040480262364, + "learning_rate": 4.973747357830592e-08, + "logits/chosen": -2.020263195037842, + "logits/rejected": -2.0205166339874268, + "logps/chosen": -1.0274614095687866, + "logps/rejected": -1.164903998374939, + "loss": 1.1672, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0549228191375732, + "rewards/margins": 0.2748851776123047, + "rewards/rejected": -2.329807996749878, + "step": 590 + }, + { + "epoch": 0.4322766570605187, + "grad_norm": 19.58018590074545, + "learning_rate": 4.970630781632009e-08, + "logits/chosen": -2.076049566268921, + "logits/rejected": -2.072026491165161, + "logps/chosen": -1.0331029891967773, + "logps/rejected": -1.1752078533172607, + "loss": 1.1687, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0662059783935547, + "rewards/margins": 0.2842100262641907, + "rewards/rejected": -2.3504157066345215, + "step": 600 + }, + { + "epoch": 0.43948126801152737, + "grad_norm": 21.116047152924747, + "learning_rate": 4.967340530291242e-08, + "logits/chosen": -2.02950382232666, + "logits/rejected": -2.01965594291687, + "logps/chosen": -1.09267258644104, + "logps/rejected": -1.150689959526062, + "loss": 1.2681, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.18534517288208, + "rewards/margins": 0.11603420972824097, + "rewards/rejected": -2.301379919052124, + "step": 610 + }, + { + "epoch": 0.44668587896253603, + "grad_norm": 24.714444992958434, + "learning_rate": 4.9638768350993755e-08, + "logits/chosen": -2.0273375511169434, + "logits/rejected": -2.019911289215088, + "logps/chosen": -0.9958856701850891, + "logps/rejected": -1.082914113998413, + "loss": 1.2353, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.9917713403701782, + "rewards/margins": 0.1740569919347763, + "rewards/rejected": -2.165828227996826, + "step": 620 + }, + { + "epoch": 0.4538904899135447, + "grad_norm": 20.802905839756523, + "learning_rate": 4.9602399395398786e-08, + "logits/chosen": -2.040907859802246, + "logits/rejected": -2.040799379348755, + "logps/chosen": -1.0272337198257446, + "logps/rejected": -1.1544668674468994, + "loss": 1.1828, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0544674396514893, + "rewards/margins": 0.2544659674167633, + "rewards/rejected": -2.308933734893799, + "step": 630 + }, + { + "epoch": 0.4610951008645533, + "grad_norm": 16.14908066968672, + "learning_rate": 4.9564300992714914e-08, + "logits/chosen": -1.9591153860092163, + "logits/rejected": -1.9602371454238892, + "logps/chosen": -1.0113928318023682, + "logps/rejected": -1.1170381307601929, + "loss": 1.2104, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.0227856636047363, + "rewards/margins": 0.211290642619133, + "rewards/rejected": -2.2340762615203857, + "step": 640 + }, + { + "epoch": 0.46829971181556196, + "grad_norm": 21.97651048317045, + "learning_rate": 4.952447582110253e-08, + "logits/chosen": -2.0540075302124023, + "logits/rejected": -2.039557933807373, + "logps/chosen": -1.037952184677124, + "logps/rejected": -1.117681860923767, + "loss": 1.2477, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.075904369354248, + "rewards/margins": 0.15945938229560852, + "rewards/rejected": -2.235363721847534, + "step": 650 + }, + { + "epoch": 0.4755043227665706, + "grad_norm": 23.865036653626944, + "learning_rate": 4.948292668010676e-08, + "logits/chosen": -2.033937454223633, + "logits/rejected": -2.0348212718963623, + "logps/chosen": -1.0879608392715454, + "logps/rejected": -1.1745898723602295, + "loss": 1.2453, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.175921678543091, + "rewards/margins": 0.17325839400291443, + "rewards/rejected": -2.349179744720459, + "step": 660 + }, + { + "epoch": 0.4827089337175792, + "grad_norm": 20.3891833338485, + "learning_rate": 4.943965649046064e-08, + "logits/chosen": -2.00368332862854, + "logits/rejected": -1.994360327720642, + "logps/chosen": -1.0627825260162354, + "logps/rejected": -1.1664403676986694, + "loss": 1.2155, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.1255650520324707, + "rewards/margins": 0.2073155641555786, + "rewards/rejected": -2.332880735397339, + "step": 670 + }, + { + "epoch": 0.4899135446685879, + "grad_norm": 19.075566809881007, + "learning_rate": 4.9394668293879835e-08, + "logits/chosen": -1.9593700170516968, + "logits/rejected": -1.950269341468811, + "logps/chosen": -1.0373146533966064, + "logps/rejected": -1.1066360473632812, + "loss": 1.2628, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.074629306793213, + "rewards/margins": 0.13864275813102722, + "rewards/rejected": -2.2132720947265625, + "step": 680 + }, + { + "epoch": 0.49711815561959655, + "grad_norm": 24.96158275365681, + "learning_rate": 4.93479652528488e-08, + "logits/chosen": -2.0235979557037354, + "logits/rejected": -2.0184168815612793, + "logps/chosen": -1.1050894260406494, + "logps/rejected": -1.2094438076019287, + "loss": 1.226, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.210178852081299, + "rewards/margins": 0.20870868861675262, + "rewards/rejected": -2.4188876152038574, + "step": 690 + }, + { + "epoch": 0.5043227665706052, + "grad_norm": 20.196803734367005, + "learning_rate": 4.929955065039848e-08, + "logits/chosen": -2.0198333263397217, + "logits/rejected": -2.014254093170166, + "logps/chosen": -1.0191075801849365, + "logps/rejected": -1.1514732837677002, + "loss": 1.183, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.038215160369873, + "rewards/margins": 0.2647314965724945, + "rewards/rejected": -2.3029465675354004, + "step": 700 + }, + { + "epoch": 0.5115273775216138, + "grad_norm": 19.12481684007211, + "learning_rate": 4.92494278898755e-08, + "logits/chosen": -1.9860668182373047, + "logits/rejected": -1.9829334020614624, + "logps/chosen": -0.8973162770271301, + "logps/rejected": -1.0221717357635498, + "loss": 1.1965, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.7946325540542603, + "rewards/margins": 0.2497110813856125, + "rewards/rejected": -2.0443434715270996, + "step": 710 + }, + { + "epoch": 0.5187319884726225, + "grad_norm": 18.885298336896575, + "learning_rate": 4.9197600494702955e-08, + "logits/chosen": -2.0109028816223145, + "logits/rejected": -2.0047824382781982, + "logps/chosen": -1.042280912399292, + "logps/rejected": -1.1657941341400146, + "loss": 1.1849, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.084561824798584, + "rewards/margins": 0.2470264732837677, + "rewards/rejected": -2.3315882682800293, + "step": 720 + }, + { + "epoch": 0.5259365994236311, + "grad_norm": 20.51392204287621, + "learning_rate": 4.9144072108132725e-08, + "logits/chosen": -2.0101022720336914, + "logits/rejected": -1.9990341663360596, + "logps/chosen": -1.0220484733581543, + "logps/rejected": -1.1054035425186157, + "loss": 1.2509, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0440969467163086, + "rewards/margins": 0.16670992970466614, + "rewards/rejected": -2.2108070850372314, + "step": 730 + }, + { + "epoch": 0.5331412103746398, + "grad_norm": 17.784848123993633, + "learning_rate": 4.908884649298937e-08, + "logits/chosen": -2.0006046295166016, + "logits/rejected": -2.0075409412384033, + "logps/chosen": -1.0191365480422974, + "logps/rejected": -1.079302430152893, + "loss": 1.2835, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -2.0382730960845947, + "rewards/margins": 0.12033157050609589, + "rewards/rejected": -2.158604860305786, + "step": 740 + }, + { + "epoch": 0.5403458213256485, + "grad_norm": 23.05051802988211, + "learning_rate": 4.903192753140557e-08, + "logits/chosen": -2.0201849937438965, + "logits/rejected": -2.0148415565490723, + "logps/chosen": -1.1010781526565552, + "logps/rejected": -1.1906977891921997, + "loss": 1.2389, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.2021563053131104, + "rewards/margins": 0.1792391687631607, + "rewards/rejected": -2.3813955783843994, + "step": 750 + }, + { + "epoch": 0.547550432276657, + "grad_norm": 19.69096268460173, + "learning_rate": 4.897331922454931e-08, + "logits/chosen": -1.9745019674301147, + "logits/rejected": -1.9782997369766235, + "logps/chosen": -1.0038034915924072, + "logps/rejected": -1.1137539148330688, + "loss": 1.216, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0076069831848145, + "rewards/margins": 0.21990080177783966, + "rewards/rejected": -2.2275078296661377, + "step": 760 + }, + { + "epoch": 0.5547550432276657, + "grad_norm": 20.603497657086212, + "learning_rate": 4.891302569234256e-08, + "logits/chosen": -1.9754327535629272, + "logits/rejected": -1.978355050086975, + "logps/chosen": -0.9768314361572266, + "logps/rejected": -1.1296206712722778, + "loss": 1.1631, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9536628723144531, + "rewards/margins": 0.30557847023010254, + "rewards/rejected": -2.2592413425445557, + "step": 770 + }, + { + "epoch": 0.5619596541786743, + "grad_norm": 22.064873844463353, + "learning_rate": 4.8851051173171656e-08, + "logits/chosen": -1.9894813299179077, + "logits/rejected": -1.988013505935669, + "logps/chosen": -1.0405890941619873, + "logps/rejected": -1.1219489574432373, + "loss": 1.2394, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0811781883239746, + "rewards/margins": 0.1627195179462433, + "rewards/rejected": -2.2438979148864746, + "step": 780 + }, + { + "epoch": 0.569164265129683, + "grad_norm": 17.44205394859882, + "learning_rate": 4.87874000235894e-08, + "logits/chosen": -2.015829563140869, + "logits/rejected": -2.010009765625, + "logps/chosen": -1.075958490371704, + "logps/rejected": -1.2331421375274658, + "loss": 1.1593, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.151916980743408, + "rewards/margins": 0.3143673837184906, + "rewards/rejected": -2.4662842750549316, + "step": 790 + }, + { + "epoch": 0.5763688760806917, + "grad_norm": 19.511392524751066, + "learning_rate": 4.872207671800876e-08, + "logits/chosen": -2.0384624004364014, + "logits/rejected": -2.0348961353302, + "logps/chosen": -1.0448932647705078, + "logps/rejected": -1.1220283508300781, + "loss": 1.2573, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0897865295410156, + "rewards/margins": 0.1542699635028839, + "rewards/rejected": -2.2440567016601562, + "step": 800 + }, + { + "epoch": 0.5835734870317003, + "grad_norm": 16.00392428501834, + "learning_rate": 4.865508584838841e-08, + "logits/chosen": -2.020700693130493, + "logits/rejected": -2.0231757164001465, + "logps/chosen": -1.0133837461471558, + "logps/rejected": -1.103539228439331, + "loss": 1.2329, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.0267674922943115, + "rewards/margins": 0.18031062185764313, + "rewards/rejected": -2.207078456878662, + "step": 810 + }, + { + "epoch": 0.590778097982709, + "grad_norm": 21.25089832156162, + "learning_rate": 4.858643212390985e-08, + "logits/chosen": -2.02546763420105, + "logits/rejected": -2.015793561935425, + "logps/chosen": -1.029206395149231, + "logps/rejected": -1.115379810333252, + "loss": 1.2494, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.058412790298462, + "rewards/margins": 0.1723467856645584, + "rewards/rejected": -2.230759620666504, + "step": 820 + }, + { + "epoch": 0.5979827089337176, + "grad_norm": 18.36102437546264, + "learning_rate": 4.851612037064643e-08, + "logits/chosen": -1.9968347549438477, + "logits/rejected": -1.9947017431259155, + "logps/chosen": -0.9606590270996094, + "logps/rejected": -1.0800572633743286, + "loss": 1.2043, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9213180541992188, + "rewards/margins": 0.2387961596250534, + "rewards/rejected": -2.1601145267486572, + "step": 830 + }, + { + "epoch": 0.6051873198847262, + "grad_norm": 15.958170019881827, + "learning_rate": 4.8444155531224065e-08, + "logits/chosen": -2.0284435749053955, + "logits/rejected": -2.028543472290039, + "logps/chosen": -1.0880385637283325, + "logps/rejected": -1.1600936651229858, + "loss": 1.2625, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.176077127456665, + "rewards/margins": 0.14411017298698425, + "rewards/rejected": -2.3201873302459717, + "step": 840 + }, + { + "epoch": 0.6123919308357348, + "grad_norm": 15.447742706439062, + "learning_rate": 4.8370542664473805e-08, + "logits/chosen": -2.034883499145508, + "logits/rejected": -2.029095411300659, + "logps/chosen": -1.0500915050506592, + "logps/rejected": -1.1546186208724976, + "loss": 1.2247, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1001830101013184, + "rewards/margins": 0.20905427634716034, + "rewards/rejected": -2.309237241744995, + "step": 850 + }, + { + "epoch": 0.6195965417867435, + "grad_norm": 17.975604985927344, + "learning_rate": 4.829528694507624e-08, + "logits/chosen": -2.0074715614318848, + "logits/rejected": -2.003307342529297, + "logps/chosen": -1.1618878841400146, + "logps/rejected": -1.2185773849487305, + "loss": 1.2792, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.3237757682800293, + "rewards/margins": 0.11337918043136597, + "rewards/rejected": -2.437154769897461, + "step": 860 + }, + { + "epoch": 0.6268011527377522, + "grad_norm": 20.210558301206753, + "learning_rate": 4.821839366319768e-08, + "logits/chosen": -2.0488550662994385, + "logits/rejected": -2.0428953170776367, + "logps/chosen": -1.004872441291809, + "logps/rejected": -1.1229604482650757, + "loss": 1.1965, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.009744882583618, + "rewards/margins": 0.2361760139465332, + "rewards/rejected": -2.2459208965301514, + "step": 870 + }, + { + "epoch": 0.6340057636887608, + "grad_norm": 19.796801319824315, + "learning_rate": 4.813986822411833e-08, + "logits/chosen": -2.0358963012695312, + "logits/rejected": -2.0338990688323975, + "logps/chosen": -1.0155360698699951, + "logps/rejected": -1.0795470476150513, + "loss": 1.2675, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.0310721397399902, + "rewards/margins": 0.12802186608314514, + "rewards/rejected": -2.1590940952301025, + "step": 880 + }, + { + "epoch": 0.6412103746397695, + "grad_norm": 19.799887695381567, + "learning_rate": 4.805971614785231e-08, + "logits/chosen": -2.0680534839630127, + "logits/rejected": -2.0668766498565674, + "logps/chosen": -1.015794038772583, + "logps/rejected": -1.1115624904632568, + "loss": 1.2198, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.031588077545166, + "rewards/margins": 0.19153663516044617, + "rewards/rejected": -2.2231249809265137, + "step": 890 + }, + { + "epoch": 0.6484149855907781, + "grad_norm": 20.13934417953023, + "learning_rate": 4.797794306875963e-08, + "logits/chosen": -1.9745270013809204, + "logits/rejected": -1.9761186838150024, + "logps/chosen": -1.1419258117675781, + "logps/rejected": -1.2145435810089111, + "loss": 1.2677, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.2838516235351562, + "rewards/margins": 0.14523524045944214, + "rewards/rejected": -2.4290871620178223, + "step": 900 + }, + { + "epoch": 0.6556195965417867, + "grad_norm": 20.244043065077097, + "learning_rate": 4.7894554735150076e-08, + "logits/chosen": -1.9853111505508423, + "logits/rejected": -1.9889112710952759, + "logps/chosen": -1.0432493686676025, + "logps/rejected": -1.1088694334030151, + "loss": 1.2634, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.086498737335205, + "rewards/margins": 0.13123992085456848, + "rewards/rejected": -2.2177388668060303, + "step": 910 + }, + { + "epoch": 0.6628242074927954, + "grad_norm": 23.228641230109883, + "learning_rate": 4.7809557008879185e-08, + "logits/chosen": -2.0164051055908203, + "logits/rejected": -2.0110771656036377, + "logps/chosen": -0.9736042022705078, + "logps/rejected": -1.061522126197815, + "loss": 1.2385, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9472084045410156, + "rewards/margins": 0.17583578824996948, + "rewards/rejected": -2.12304425239563, + "step": 920 + }, + { + "epoch": 0.670028818443804, + "grad_norm": 18.047096637073768, + "learning_rate": 4.772295586493613e-08, + "logits/chosen": -2.056691884994507, + "logits/rejected": -2.0538461208343506, + "logps/chosen": -1.0346996784210205, + "logps/rejected": -1.1515998840332031, + "loss": 1.1922, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.069399356842041, + "rewards/margins": 0.23380064964294434, + "rewards/rejected": -2.3031997680664062, + "step": 930 + }, + { + "epoch": 0.6772334293948127, + "grad_norm": 19.617510099326854, + "learning_rate": 4.763475739102374e-08, + "logits/chosen": -2.0092320442199707, + "logits/rejected": -2.015103816986084, + "logps/chosen": -1.1271753311157227, + "logps/rejected": -1.1942684650421143, + "loss": 1.2568, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2543506622314453, + "rewards/margins": 0.13418647646903992, + "rewards/rejected": -2.3885369300842285, + "step": 940 + }, + { + "epoch": 0.6844380403458213, + "grad_norm": 15.419611013936972, + "learning_rate": 4.754496778713054e-08, + "logits/chosen": -1.9684407711029053, + "logits/rejected": -1.9725189208984375, + "logps/chosen": -1.011788249015808, + "logps/rejected": -1.1345211267471313, + "loss": 1.2007, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.023576498031616, + "rewards/margins": 0.24546551704406738, + "rewards/rejected": -2.2690422534942627, + "step": 950 + }, + { + "epoch": 0.69164265129683, + "grad_norm": 21.26727298981788, + "learning_rate": 4.7453593365094926e-08, + "logits/chosen": -2.0424587726593018, + "logits/rejected": -2.0415501594543457, + "logps/chosen": -1.0493271350860596, + "logps/rejected": -1.1593568325042725, + "loss": 1.2079, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.098654270172119, + "rewards/margins": 0.2200593203306198, + "rewards/rejected": -2.318713665008545, + "step": 960 + }, + { + "epoch": 0.6988472622478387, + "grad_norm": 21.287126597379253, + "learning_rate": 4.736064054816145e-08, + "logits/chosen": -2.0447025299072266, + "logits/rejected": -2.040843963623047, + "logps/chosen": -0.9683746099472046, + "logps/rejected": -1.0945460796356201, + "loss": 1.1791, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9367492198944092, + "rewards/margins": 0.25234299898147583, + "rewards/rejected": -2.1890921592712402, + "step": 970 + }, + { + "epoch": 0.7060518731988472, + "grad_norm": 17.20400787363182, + "learning_rate": 4.726611587052933e-08, + "logits/chosen": -1.9701964855194092, + "logits/rejected": -1.9697643518447876, + "logps/chosen": -1.1084095239639282, + "logps/rejected": -1.2358492612838745, + "loss": 1.1795, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2168190479278564, + "rewards/margins": 0.25487983226776123, + "rewards/rejected": -2.471698522567749, + "step": 980 + }, + { + "epoch": 0.7132564841498559, + "grad_norm": 22.283752817420396, + "learning_rate": 4.71700259768931e-08, + "logits/chosen": -2.0302042961120605, + "logits/rejected": -2.027015447616577, + "logps/chosen": -1.1091606616973877, + "logps/rejected": -1.2056801319122314, + "loss": 1.234, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.2183213233947754, + "rewards/margins": 0.19303929805755615, + "rewards/rejected": -2.411360263824463, + "step": 990 + }, + { + "epoch": 0.7204610951008645, + "grad_norm": 19.81391793524057, + "learning_rate": 4.707237762197549e-08, + "logits/chosen": -2.0068042278289795, + "logits/rejected": -2.0036396980285645, + "logps/chosen": -1.0078332424163818, + "logps/rejected": -1.1272451877593994, + "loss": 1.2127, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0156664848327637, + "rewards/margins": 0.23882417380809784, + "rewards/rejected": -2.254490375518799, + "step": 1000 + }, + { + "epoch": 0.7276657060518732, + "grad_norm": 23.576990411457878, + "learning_rate": 4.697317767005265e-08, + "logits/chosen": -2.0253381729125977, + "logits/rejected": -2.0218160152435303, + "logps/chosen": -1.0018768310546875, + "logps/rejected": -1.0942353010177612, + "loss": 1.2568, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.003753662109375, + "rewards/margins": 0.18471679091453552, + "rewards/rejected": -2.1884706020355225, + "step": 1010 + }, + { + "epoch": 0.7348703170028819, + "grad_norm": 17.3299238751784, + "learning_rate": 4.6872433094471577e-08, + "logits/chosen": -2.0205771923065186, + "logits/rejected": -2.0157430171966553, + "logps/chosen": -1.0319100618362427, + "logps/rejected": -1.1281745433807373, + "loss": 1.2108, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0638201236724854, + "rewards/margins": 0.19252923130989075, + "rewards/rejected": -2.2563490867614746, + "step": 1020 + }, + { + "epoch": 0.7420749279538905, + "grad_norm": 16.502866342573686, + "learning_rate": 4.677015097715994e-08, + "logits/chosen": -1.9677197933197021, + "logits/rejected": -1.9671709537506104, + "logps/chosen": -1.0225335359573364, + "logps/rejected": -1.1552186012268066, + "loss": 1.1991, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.045067071914673, + "rewards/margins": 0.26537007093429565, + "rewards/rejected": -2.3104372024536133, + "step": 1030 + }, + { + "epoch": 0.7492795389048992, + "grad_norm": 17.494781512319758, + "learning_rate": 4.666633850812825e-08, + "logits/chosen": -2.0190138816833496, + "logits/rejected": -2.0128960609436035, + "logps/chosen": -1.0130321979522705, + "logps/rejected": -1.0945827960968018, + "loss": 1.2371, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.026064395904541, + "rewards/margins": 0.163101464509964, + "rewards/rejected": -2.1891655921936035, + "step": 1040 + }, + { + "epoch": 0.7564841498559077, + "grad_norm": 17.52008871289576, + "learning_rate": 4.656100298496439e-08, + "logits/chosen": -1.9722799062728882, + "logits/rejected": -1.9686037302017212, + "logps/chosen": -0.9384390711784363, + "logps/rejected": -1.0692068338394165, + "loss": 1.1854, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.8768781423568726, + "rewards/margins": 0.2615353763103485, + "rewards/rejected": -2.138413667678833, + "step": 1050 + }, + { + "epoch": 0.7636887608069164, + "grad_norm": 17.92122527797355, + "learning_rate": 4.6454151812320715e-08, + "logits/chosen": -2.002856969833374, + "logits/rejected": -1.9969203472137451, + "logps/chosen": -1.0393486022949219, + "logps/rejected": -1.1481153964996338, + "loss": 1.2176, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0786972045898438, + "rewards/margins": 0.21753337979316711, + "rewards/rejected": -2.2962307929992676, + "step": 1060 + }, + { + "epoch": 0.770893371757925, + "grad_norm": 20.81341856841254, + "learning_rate": 4.6345792501393434e-08, + "logits/chosen": -2.0019800662994385, + "logits/rejected": -2.0001296997070312, + "logps/chosen": -1.074857234954834, + "logps/rejected": -1.201908826828003, + "loss": 1.2046, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.149714469909668, + "rewards/margins": 0.25410330295562744, + "rewards/rejected": -2.403817653656006, + "step": 1070 + }, + { + "epoch": 0.7780979827089337, + "grad_norm": 20.6783251824387, + "learning_rate": 4.6235932669394676e-08, + "logits/chosen": -2.026952028274536, + "logits/rejected": -2.02778697013855, + "logps/chosen": -1.087725043296814, + "logps/rejected": -1.1969935894012451, + "loss": 1.218, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.175450086593628, + "rewards/margins": 0.2185373604297638, + "rewards/rejected": -2.3939871788024902, + "step": 1080 + }, + { + "epoch": 0.7853025936599424, + "grad_norm": 24.297736541402326, + "learning_rate": 4.612458003901698e-08, + "logits/chosen": -2.035487174987793, + "logits/rejected": -2.0278096199035645, + "logps/chosen": -1.1088950634002686, + "logps/rejected": -1.2112846374511719, + "loss": 1.2279, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.217790126800537, + "rewards/margins": 0.2047790288925171, + "rewards/rejected": -2.4225692749023438, + "step": 1090 + }, + { + "epoch": 0.792507204610951, + "grad_norm": 23.424012674763112, + "learning_rate": 4.6011742437890476e-08, + "logits/chosen": -2.028799533843994, + "logits/rejected": -2.023322582244873, + "logps/chosen": -1.0456212759017944, + "logps/rejected": -1.179602026939392, + "loss": 1.1772, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.091242551803589, + "rewards/margins": 0.2679617702960968, + "rewards/rejected": -2.359204053878784, + "step": 1100 + }, + { + "epoch": 0.7997118155619597, + "grad_norm": 16.84111836422693, + "learning_rate": 4.589742779803259e-08, + "logits/chosen": -2.0229039192199707, + "logits/rejected": -2.015812397003174, + "logps/chosen": -1.008448839187622, + "logps/rejected": -1.1298694610595703, + "loss": 1.1935, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.016897678375244, + "rewards/margins": 0.2428409308195114, + "rewards/rejected": -2.2597389221191406, + "step": 1110 + }, + { + "epoch": 0.8069164265129684, + "grad_norm": 18.417073733768728, + "learning_rate": 4.5781644155290486e-08, + "logits/chosen": -1.9799926280975342, + "logits/rejected": -1.9722731113433838, + "logps/chosen": -1.0475791692733765, + "logps/rejected": -1.1082106828689575, + "loss": 1.2711, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.095158338546753, + "rewards/margins": 0.12126290798187256, + "rewards/rejected": -2.216421365737915, + "step": 1120 + }, + { + "epoch": 0.8141210374639769, + "grad_norm": 18.07389739764431, + "learning_rate": 4.566439964877613e-08, + "logits/chosen": -2.0103983879089355, + "logits/rejected": -2.0063552856445312, + "logps/chosen": -0.9987322092056274, + "logps/rejected": -1.0854665040969849, + "loss": 1.2431, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.9974644184112549, + "rewards/margins": 0.17346863448619843, + "rewards/rejected": -2.1709330081939697, + "step": 1130 + }, + { + "epoch": 0.8213256484149856, + "grad_norm": 16.27730585147678, + "learning_rate": 4.554570252029421e-08, + "logits/chosen": -2.0533928871154785, + "logits/rejected": -2.0521607398986816, + "logps/chosen": -1.0483338832855225, + "logps/rejected": -1.164975881576538, + "loss": 1.1998, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.096667766571045, + "rewards/margins": 0.2332839071750641, + "rewards/rejected": -2.329951763153076, + "step": 1140 + }, + { + "epoch": 0.8285302593659942, + "grad_norm": 17.887214280151717, + "learning_rate": 4.542556111376274e-08, + "logits/chosen": -2.045485496520996, + "logits/rejected": -2.039069175720215, + "logps/chosen": -1.0746331214904785, + "logps/rejected": -1.1668939590454102, + "loss": 1.2386, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.149266242980957, + "rewards/margins": 0.1845216453075409, + "rewards/rejected": -2.3337879180908203, + "step": 1150 + }, + { + "epoch": 0.8357348703170029, + "grad_norm": 23.056739915075795, + "learning_rate": 4.5303983874626506e-08, + "logits/chosen": -1.9926433563232422, + "logits/rejected": -1.9910094738006592, + "logps/chosen": -1.0387976169586182, + "logps/rejected": -1.1165183782577515, + "loss": 1.2645, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.0775952339172363, + "rewards/margins": 0.15544185042381287, + "rewards/rejected": -2.233036756515503, + "step": 1160 + }, + { + "epoch": 0.8429394812680115, + "grad_norm": 20.255328662941217, + "learning_rate": 4.518097934926339e-08, + "logits/chosen": -1.9955031871795654, + "logits/rejected": -1.9868882894515991, + "logps/chosen": -1.01637601852417, + "logps/rejected": -1.1265536546707153, + "loss": 1.2046, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.03275203704834, + "rewards/margins": 0.22035527229309082, + "rewards/rejected": -2.2531073093414307, + "step": 1170 + }, + { + "epoch": 0.8501440922190202, + "grad_norm": 22.820031947974105, + "learning_rate": 4.505655618438363e-08, + "logits/chosen": -1.9624055624008179, + "logits/rejected": -1.958373785018921, + "logps/chosen": -1.0602306127548218, + "logps/rejected": -1.1650116443634033, + "loss": 1.2288, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1204612255096436, + "rewards/margins": 0.20956222712993622, + "rewards/rejected": -2.3300232887268066, + "step": 1180 + }, + { + "epoch": 0.8573487031700289, + "grad_norm": 17.32188484657031, + "learning_rate": 4.4930723126421945e-08, + "logits/chosen": -2.052605152130127, + "logits/rejected": -2.045747995376587, + "logps/chosen": -1.0718796253204346, + "logps/rejected": -1.1474034786224365, + "loss": 1.2515, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.143759250640869, + "rewards/margins": 0.1510476917028427, + "rewards/rejected": -2.294806957244873, + "step": 1190 + }, + { + "epoch": 0.8645533141210374, + "grad_norm": 22.29345313385636, + "learning_rate": 4.48034890209227e-08, + "logits/chosen": -1.9834630489349365, + "logits/rejected": -1.9713430404663086, + "logps/chosen": -1.0877046585083008, + "logps/rejected": -1.1743061542510986, + "loss": 1.2302, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.1754093170166016, + "rewards/margins": 0.17320279777050018, + "rewards/rejected": -2.3486123085021973, + "step": 1200 + }, + { + "epoch": 0.8717579250720461, + "grad_norm": 18.453910031808036, + "learning_rate": 4.4674862811918155e-08, + "logits/chosen": -1.9687246084213257, + "logits/rejected": -1.9770466089248657, + "logps/chosen": -0.9387677907943726, + "logps/rejected": -1.091802954673767, + "loss": 1.1595, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.8775355815887451, + "rewards/margins": 0.30607035756111145, + "rewards/rejected": -2.183605909347534, + "step": 1210 + }, + { + "epoch": 0.8789625360230547, + "grad_norm": 17.380965389789868, + "learning_rate": 4.454485354129966e-08, + "logits/chosen": -1.9993393421173096, + "logits/rejected": -1.9949222803115845, + "logps/chosen": -1.0104951858520508, + "logps/rejected": -1.115613579750061, + "loss": 1.2192, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.0209903717041016, + "rewards/margins": 0.21023674309253693, + "rewards/rejected": -2.231227159500122, + "step": 1220 + }, + { + "epoch": 0.8861671469740634, + "grad_norm": 17.212065292460622, + "learning_rate": 4.4413470348182124e-08, + "logits/chosen": -1.9702112674713135, + "logits/rejected": -1.957925796508789, + "logps/chosen": -0.9851275682449341, + "logps/rejected": -1.076827883720398, + "loss": 1.2315, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.9702551364898682, + "rewards/margins": 0.1834007203578949, + "rewards/rejected": -2.153655767440796, + "step": 1230 + }, + { + "epoch": 0.8933717579250721, + "grad_norm": 21.132247964323447, + "learning_rate": 4.42807224682615e-08, + "logits/chosen": -1.9841238260269165, + "logits/rejected": -1.9821048974990845, + "logps/chosen": -0.9365342855453491, + "logps/rejected": -1.0724506378173828, + "loss": 1.1805, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.8730685710906982, + "rewards/margins": 0.2718326449394226, + "rewards/rejected": -2.1449012756347656, + "step": 1240 + }, + { + "epoch": 0.9005763688760807, + "grad_norm": 18.771464131402308, + "learning_rate": 4.4146619233165604e-08, + "logits/chosen": -2.0202784538269043, + "logits/rejected": -2.022472858428955, + "logps/chosen": -1.0653743743896484, + "logps/rejected": -1.2193849086761475, + "loss": 1.1677, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.130748748779297, + "rewards/margins": 0.3080212473869324, + "rewards/rejected": -2.438769817352295, + "step": 1250 + }, + { + "epoch": 0.9077809798270894, + "grad_norm": 24.969245977085052, + "learning_rate": 4.4011170069798126e-08, + "logits/chosen": -2.016045331954956, + "logits/rejected": -2.0211358070373535, + "logps/chosen": -1.1183704137802124, + "logps/rejected": -1.2435810565948486, + "loss": 1.1935, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.236740827560425, + "rewards/margins": 0.2504214644432068, + "rewards/rejected": -2.4871621131896973, + "step": 1260 + }, + { + "epoch": 0.9149855907780979, + "grad_norm": 17.906289261915187, + "learning_rate": 4.387438449967594e-08, + "logits/chosen": -1.981329321861267, + "logits/rejected": -1.9747323989868164, + "logps/chosen": -0.966105580329895, + "logps/rejected": -1.086094856262207, + "loss": 1.1911, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.93221116065979, + "rewards/margins": 0.2399786412715912, + "rewards/rejected": -2.172189712524414, + "step": 1270 + }, + { + "epoch": 0.9221902017291066, + "grad_norm": 21.194952039998515, + "learning_rate": 4.373627213825983e-08, + "logits/chosen": -2.0677618980407715, + "logits/rejected": -2.063303232192993, + "logps/chosen": -1.0270278453826904, + "logps/rejected": -1.1622191667556763, + "loss": 1.1834, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.054055690765381, + "rewards/margins": 0.27038270235061646, + "rewards/rejected": -2.3244383335113525, + "step": 1280 + }, + { + "epoch": 0.9293948126801153, + "grad_norm": 16.751477190296892, + "learning_rate": 4.359684269427848e-08, + "logits/chosen": -2.038684368133545, + "logits/rejected": -2.03769588470459, + "logps/chosen": -0.9954586029052734, + "logps/rejected": -1.0992056131362915, + "loss": 1.2108, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9909172058105469, + "rewards/margins": 0.20749418437480927, + "rewards/rejected": -2.198411226272583, + "step": 1290 + }, + { + "epoch": 0.9365994236311239, + "grad_norm": 23.518080524189983, + "learning_rate": 4.34561059690461e-08, + "logits/chosen": -2.0750319957733154, + "logits/rejected": -2.0769741535186768, + "logps/chosen": -1.048097014427185, + "logps/rejected": -1.1116466522216797, + "loss": 1.2718, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.09619402885437, + "rewards/margins": 0.12709912657737732, + "rewards/rejected": -2.2232933044433594, + "step": 1300 + }, + { + "epoch": 0.9438040345821326, + "grad_norm": 21.385571779096182, + "learning_rate": 4.3314071855773314e-08, + "logits/chosen": -2.0412631034851074, + "logits/rejected": -2.0419461727142334, + "logps/chosen": -0.9842621684074402, + "logps/rejected": -1.079594612121582, + "loss": 1.2226, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9685243368148804, + "rewards/margins": 0.19066500663757324, + "rewards/rejected": -2.159189224243164, + "step": 1310 + }, + { + "epoch": 0.9510086455331412, + "grad_norm": 20.47235892448916, + "learning_rate": 4.3170750338871806e-08, + "logits/chosen": -2.015406847000122, + "logits/rejected": -2.0090079307556152, + "logps/chosen": -1.077161431312561, + "logps/rejected": -1.2194509506225586, + "loss": 1.167, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.154322862625122, + "rewards/margins": 0.28457918763160706, + "rewards/rejected": -2.438901901245117, + "step": 1320 + }, + { + "epoch": 0.9582132564841499, + "grad_norm": 14.760860419836652, + "learning_rate": 4.3026151493252414e-08, + "logits/chosen": -2.04630446434021, + "logits/rejected": -2.0420799255371094, + "logps/chosen": -1.0609397888183594, + "logps/rejected": -1.182420253753662, + "loss": 1.1998, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1218795776367188, + "rewards/margins": 0.24296097457408905, + "rewards/rejected": -2.364840507507324, + "step": 1330 + }, + { + "epoch": 0.9654178674351584, + "grad_norm": 25.51297506847031, + "learning_rate": 4.2880285483616895e-08, + "logits/chosen": -2.006889820098877, + "logits/rejected": -2.007575750350952, + "logps/chosen": -1.0171369314193726, + "logps/rejected": -1.1325743198394775, + "loss": 1.2089, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.034273862838745, + "rewards/margins": 0.23087477684020996, + "rewards/rejected": -2.265148639678955, + "step": 1340 + }, + { + "epoch": 0.9726224783861671, + "grad_norm": 16.088184668896073, + "learning_rate": 4.273316256374342e-08, + "logits/chosen": -1.940446138381958, + "logits/rejected": -1.9386374950408936, + "logps/chosen": -1.0138260126113892, + "logps/rejected": -1.0874342918395996, + "loss": 1.2632, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.0276520252227783, + "rewards/margins": 0.1472165733575821, + "rewards/rejected": -2.174868583679199, + "step": 1350 + }, + { + "epoch": 0.9798270893371758, + "grad_norm": 16.07758733928266, + "learning_rate": 4.258479307576576e-08, + "logits/chosen": -1.9868743419647217, + "logits/rejected": -1.9846910238265991, + "logps/chosen": -0.9640612602233887, + "logps/rejected": -1.0554001331329346, + "loss": 1.2393, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9281225204467773, + "rewards/margins": 0.18267770111560822, + "rewards/rejected": -2.110800266265869, + "step": 1360 + }, + { + "epoch": 0.9870317002881844, + "grad_norm": 21.299668948105722, + "learning_rate": 4.243518744944626e-08, + "logits/chosen": -2.015906572341919, + "logits/rejected": -2.0112671852111816, + "logps/chosen": -1.0006954669952393, + "logps/rejected": -1.1211137771606445, + "loss": 1.189, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0013909339904785, + "rewards/margins": 0.24083688855171204, + "rewards/rejected": -2.242227554321289, + "step": 1370 + }, + { + "epoch": 0.9942363112391931, + "grad_norm": 20.892427881150027, + "learning_rate": 4.22843562014427e-08, + "logits/chosen": -1.9761593341827393, + "logits/rejected": -1.9725821018218994, + "logps/chosen": -1.0507876873016357, + "logps/rejected": -1.1257398128509521, + "loss": 1.2494, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.1015753746032715, + "rewards/margins": 0.14990456402301788, + "rewards/rejected": -2.2514796257019043, + "step": 1380 + }, + { + "epoch": 1.0014409221902016, + "grad_norm": 27.91445112679855, + "learning_rate": 4.2132309934569e-08, + "logits/chosen": -2.0479187965393066, + "logits/rejected": -2.048383951187134, + "logps/chosen": -1.0160915851593018, + "logps/rejected": -1.1285258531570435, + "loss": 1.211, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0321831703186035, + "rewards/margins": 0.22486881911754608, + "rewards/rejected": -2.257051706314087, + "step": 1390 + }, + { + "epoch": 1.0086455331412103, + "grad_norm": 18.441960013603726, + "learning_rate": 4.197905933704989e-08, + "logits/chosen": -1.9482128620147705, + "logits/rejected": -1.9455543756484985, + "logps/chosen": -1.0604915618896484, + "logps/rejected": -1.1943556070327759, + "loss": 1.2012, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.120983123779297, + "rewards/margins": 0.2677280306816101, + "rewards/rejected": -2.3887112140655518, + "step": 1400 + }, + { + "epoch": 1.015850144092219, + "grad_norm": 23.66473994264621, + "learning_rate": 4.1824615181769577e-08, + "logits/chosen": -1.9916549921035767, + "logits/rejected": -1.9958763122558594, + "logps/chosen": -1.0126399993896484, + "logps/rejected": -1.138517141342163, + "loss": 1.2028, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.025279998779297, + "rewards/margins": 0.25175410509109497, + "rewards/rejected": -2.277034282684326, + "step": 1410 + }, + { + "epoch": 1.0230547550432276, + "grad_norm": 18.457666274318385, + "learning_rate": 4.1668988325514434e-08, + "logits/chosen": -2.015357494354248, + "logits/rejected": -2.0103044509887695, + "logps/chosen": -1.1170918941497803, + "logps/rejected": -1.232860803604126, + "loss": 1.2242, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.2341837882995605, + "rewards/margins": 0.23153769969940186, + "rewards/rejected": -2.465721607208252, + "step": 1420 + }, + { + "epoch": 1.0302593659942363, + "grad_norm": 21.01670306409276, + "learning_rate": 4.1512189708209844e-08, + "logits/chosen": -2.0597169399261475, + "logits/rejected": -2.058657169342041, + "logps/chosen": -0.9408125877380371, + "logps/rejected": -1.027007818222046, + "loss": 1.2466, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8816251754760742, + "rewards/margins": 0.17239060997962952, + "rewards/rejected": -2.054015636444092, + "step": 1430 + }, + { + "epoch": 1.037463976945245, + "grad_norm": 22.302085677116274, + "learning_rate": 4.1354230352151143e-08, + "logits/chosen": -2.0084290504455566, + "logits/rejected": -2.0017716884613037, + "logps/chosen": -1.1378480195999146, + "logps/rejected": -1.2201100587844849, + "loss": 1.2575, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.275696039199829, + "rewards/margins": 0.1645239144563675, + "rewards/rejected": -2.4402201175689697, + "step": 1440 + }, + { + "epoch": 1.0446685878962536, + "grad_norm": 16.946791101993835, + "learning_rate": 4.119512136122882e-08, + "logits/chosen": -2.07346773147583, + "logits/rejected": -2.0827276706695557, + "logps/chosen": -0.9949871897697449, + "logps/rejected": -1.1448405981063843, + "loss": 1.1711, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9899743795394897, + "rewards/margins": 0.2997070550918579, + "rewards/rejected": -2.2896811962127686, + "step": 1450 + }, + { + "epoch": 1.0518731988472623, + "grad_norm": 15.526038466199521, + "learning_rate": 4.103487392014795e-08, + "logits/chosen": -1.9936816692352295, + "logits/rejected": -1.9814279079437256, + "logps/chosen": -1.0004615783691406, + "logps/rejected": -1.1593652963638306, + "loss": 1.1449, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0009231567382812, + "rewards/margins": 0.31780725717544556, + "rewards/rejected": -2.318730592727661, + "step": 1460 + }, + { + "epoch": 1.059077809798271, + "grad_norm": 16.90228466492342, + "learning_rate": 4.087349929364192e-08, + "logits/chosen": -2.027029514312744, + "logits/rejected": -2.017503261566162, + "logps/chosen": -0.9608215093612671, + "logps/rejected": -1.091578722000122, + "loss": 1.1869, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.9216430187225342, + "rewards/margins": 0.261514276266098, + "rewards/rejected": -2.183157444000244, + "step": 1470 + }, + { + "epoch": 1.0662824207492796, + "grad_norm": 17.442053462217785, + "learning_rate": 4.0711008825680645e-08, + "logits/chosen": -1.9791135787963867, + "logits/rejected": -1.978002905845642, + "logps/chosen": -1.006446123123169, + "logps/rejected": -1.1246168613433838, + "loss": 1.2067, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.012892246246338, + "rewards/margins": 0.23634123802185059, + "rewards/rejected": -2.2492337226867676, + "step": 1480 + }, + { + "epoch": 1.0734870317002883, + "grad_norm": 19.86972344245805, + "learning_rate": 4.054741393867306e-08, + "logits/chosen": -1.994312047958374, + "logits/rejected": -1.9914157390594482, + "logps/chosen": -1.1115689277648926, + "logps/rejected": -1.1622049808502197, + "loss": 1.2879, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.223137855529785, + "rewards/margins": 0.10127194970846176, + "rewards/rejected": -2.3244099617004395, + "step": 1490 + }, + { + "epoch": 1.080691642651297, + "grad_norm": 18.714554731415358, + "learning_rate": 4.038272613266419e-08, + "logits/chosen": -2.0033118724823, + "logits/rejected": -1.9902782440185547, + "logps/chosen": -1.0098048448562622, + "logps/rejected": -1.1201963424682617, + "loss": 1.2025, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0196096897125244, + "rewards/margins": 0.2207828313112259, + "rewards/rejected": -2.2403926849365234, + "step": 1500 + }, + { + "epoch": 1.0878962536023056, + "grad_norm": 18.243834119172774, + "learning_rate": 4.0216956984526784e-08, + "logits/chosen": -2.0470855236053467, + "logits/rejected": -2.049050807952881, + "logps/chosen": -1.0156313180923462, + "logps/rejected": -1.1249277591705322, + "loss": 1.2154, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0312626361846924, + "rewards/margins": 0.2185930460691452, + "rewards/rejected": -2.2498555183410645, + "step": 1510 + }, + { + "epoch": 1.0951008645533142, + "grad_norm": 16.15363780677068, + "learning_rate": 4.0050118147147446e-08, + "logits/chosen": -1.9841066598892212, + "logits/rejected": -1.9844478368759155, + "logps/chosen": -1.0981109142303467, + "logps/rejected": -1.1102923154830933, + "loss": 1.3395, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -2.1962218284606934, + "rewards/margins": 0.024362847208976746, + "rewards/rejected": -2.2205846309661865, + "step": 1520 + }, + { + "epoch": 1.1023054755043227, + "grad_norm": 17.76262200063469, + "learning_rate": 3.988222134860755e-08, + "logits/chosen": -2.029658317565918, + "logits/rejected": -2.020962953567505, + "logps/chosen": -0.9501702189445496, + "logps/rejected": -1.1164584159851074, + "loss": 1.1391, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.9003404378890991, + "rewards/margins": 0.33257636427879333, + "rewards/rejected": -2.232916831970215, + "step": 1530 + }, + { + "epoch": 1.1095100864553313, + "grad_norm": 23.81823709896279, + "learning_rate": 3.9713278391358724e-08, + "logits/chosen": -2.0359702110290527, + "logits/rejected": -2.0298221111297607, + "logps/chosen": -1.0248148441314697, + "logps/rejected": -1.1481475830078125, + "loss": 1.1877, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.0496296882629395, + "rewards/margins": 0.24666526913642883, + "rewards/rejected": -2.296295166015625, + "step": 1540 + }, + { + "epoch": 1.11671469740634, + "grad_norm": 17.830908151102502, + "learning_rate": 3.954330115139328e-08, + "logits/chosen": -2.015063762664795, + "logits/rejected": -2.0099833011627197, + "logps/chosen": -1.0277677774429321, + "logps/rejected": -1.1327736377716064, + "loss": 1.2221, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0555355548858643, + "rewards/margins": 0.21001163125038147, + "rewards/rejected": -2.265547275543213, + "step": 1550 + }, + { + "epoch": 1.1239193083573487, + "grad_norm": 25.68887769776998, + "learning_rate": 3.937230157740931e-08, + "logits/chosen": -2.070219039916992, + "logits/rejected": -2.064025640487671, + "logps/chosen": -1.0480725765228271, + "logps/rejected": -1.1831330060958862, + "loss": 1.1827, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0961451530456543, + "rewards/margins": 0.27012091875076294, + "rewards/rejected": -2.3662660121917725, + "step": 1560 + }, + { + "epoch": 1.1311239193083573, + "grad_norm": 16.154765963959324, + "learning_rate": 3.920029168997077e-08, + "logits/chosen": -2.0501182079315186, + "logits/rejected": -2.048215389251709, + "logps/chosen": -1.0040074586868286, + "logps/rejected": -1.1317455768585205, + "loss": 1.1863, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0080149173736572, + "rewards/margins": 0.25547635555267334, + "rewards/rejected": -2.263491153717041, + "step": 1570 + }, + { + "epoch": 1.138328530259366, + "grad_norm": 29.765979988811136, + "learning_rate": 3.9027283580662476e-08, + "logits/chosen": -2.0178141593933105, + "logits/rejected": -2.0118331909179688, + "logps/chosen": -1.047828197479248, + "logps/rejected": -1.193880319595337, + "loss": 1.1762, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.095656394958496, + "rewards/margins": 0.2921043336391449, + "rewards/rejected": -2.387760639190674, + "step": 1580 + }, + { + "epoch": 1.1455331412103746, + "grad_norm": 16.865277551940466, + "learning_rate": 3.885328941124014e-08, + "logits/chosen": -1.9888120889663696, + "logits/rejected": -1.9842865467071533, + "logps/chosen": -0.9665737152099609, + "logps/rejected": -1.1005498170852661, + "loss": 1.1706, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.9331474304199219, + "rewards/margins": 0.2679522633552551, + "rewards/rejected": -2.2010996341705322, + "step": 1590 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 20.89972191635711, + "learning_rate": 3.867832141277539e-08, + "logits/chosen": -2.0299296379089355, + "logits/rejected": -2.020932912826538, + "logps/chosen": -1.0687669515609741, + "logps/rejected": -1.1805663108825684, + "loss": 1.2101, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.1375339031219482, + "rewards/margins": 0.2235983908176422, + "rewards/rejected": -2.3611326217651367, + "step": 1600 + }, + { + "epoch": 1.159942363112392, + "grad_norm": 20.78071767638211, + "learning_rate": 3.850239188479606e-08, + "logits/chosen": -1.9834659099578857, + "logits/rejected": -1.9868577718734741, + "logps/chosen": -1.0097862482070923, + "logps/rejected": -1.1004573106765747, + "loss": 1.237, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.0195724964141846, + "rewards/margins": 0.18134194612503052, + "rewards/rejected": -2.2009146213531494, + "step": 1610 + }, + { + "epoch": 1.1671469740634006, + "grad_norm": 22.02097078416428, + "learning_rate": 3.832551319442151e-08, + "logits/chosen": -2.057338237762451, + "logits/rejected": -2.0585570335388184, + "logps/chosen": -1.057908296585083, + "logps/rejected": -1.1848082542419434, + "loss": 1.1897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.115816593170166, + "rewards/margins": 0.25380033254623413, + "rewards/rejected": -2.3696165084838867, + "step": 1620 + }, + { + "epoch": 1.1743515850144093, + "grad_norm": 17.325160620804777, + "learning_rate": 3.81476977754933e-08, + "logits/chosen": -1.9559204578399658, + "logits/rejected": -1.952262282371521, + "logps/chosen": -1.0270769596099854, + "logps/rejected": -1.0972059965133667, + "loss": 1.2578, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0541539192199707, + "rewards/margins": 0.14025799930095673, + "rewards/rejected": -2.1944119930267334, + "step": 1630 + }, + { + "epoch": 1.181556195965418, + "grad_norm": 16.860559579230735, + "learning_rate": 3.796895812770114e-08, + "logits/chosen": -1.9805179834365845, + "logits/rejected": -1.981414794921875, + "logps/chosen": -1.0173685550689697, + "logps/rejected": -1.1094672679901123, + "loss": 1.2405, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0347371101379395, + "rewards/margins": 0.18419703841209412, + "rewards/rejected": -2.2189345359802246, + "step": 1640 + }, + { + "epoch": 1.1887608069164266, + "grad_norm": 22.22569351927079, + "learning_rate": 3.7789306815704216e-08, + "logits/chosen": -2.010031223297119, + "logits/rejected": -2.0077967643737793, + "logps/chosen": -1.0069730281829834, + "logps/rejected": -1.0787549018859863, + "loss": 1.261, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.013946056365967, + "rewards/margins": 0.14356335997581482, + "rewards/rejected": -2.1575098037719727, + "step": 1650 + }, + { + "epoch": 1.195965417867435, + "grad_norm": 18.941008996813895, + "learning_rate": 3.760875646824795e-08, + "logits/chosen": -1.9386460781097412, + "logits/rejected": -1.942348837852478, + "logps/chosen": -0.9752788543701172, + "logps/rejected": -1.0792890787124634, + "loss": 1.2239, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9505577087402344, + "rewards/margins": 0.20802041888237, + "rewards/rejected": -2.1585781574249268, + "step": 1660 + }, + { + "epoch": 1.2031700288184437, + "grad_norm": 22.29470132845054, + "learning_rate": 3.742731977727623e-08, + "logits/chosen": -2.031289577484131, + "logits/rejected": -2.028223991394043, + "logps/chosen": -1.0405927896499634, + "logps/rejected": -1.1778171062469482, + "loss": 1.1781, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0811855792999268, + "rewards/margins": 0.27444881200790405, + "rewards/rejected": -2.3556342124938965, + "step": 1670 + }, + { + "epoch": 1.2103746397694524, + "grad_norm": 19.668800943191464, + "learning_rate": 3.7245009497039244e-08, + "logits/chosen": -1.9710372686386108, + "logits/rejected": -1.9631189107894897, + "logps/chosen": -1.0121662616729736, + "logps/rejected": -1.1485233306884766, + "loss": 1.1722, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0243325233459473, + "rewards/margins": 0.2727140784263611, + "rewards/rejected": -2.297046661376953, + "step": 1680 + }, + { + "epoch": 1.217579250720461, + "grad_norm": 18.855322537148837, + "learning_rate": 3.7061838443196886e-08, + "logits/chosen": -2.0141379833221436, + "logits/rejected": -2.0157604217529297, + "logps/chosen": -1.0264530181884766, + "logps/rejected": -1.149954080581665, + "loss": 1.1888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.052906036376953, + "rewards/margins": 0.24700184166431427, + "rewards/rejected": -2.29990816116333, + "step": 1690 + }, + { + "epoch": 1.2247838616714697, + "grad_norm": 22.698689716068593, + "learning_rate": 3.68778194919179e-08, + "logits/chosen": -1.984043836593628, + "logits/rejected": -1.9850364923477173, + "logps/chosen": -1.0795161724090576, + "logps/rejected": -1.2015224695205688, + "loss": 1.195, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.1590323448181152, + "rewards/margins": 0.24401259422302246, + "rewards/rejected": -2.4030449390411377, + "step": 1700 + }, + { + "epoch": 1.2319884726224783, + "grad_norm": 20.214389140478467, + "learning_rate": 3.66929655789747e-08, + "logits/chosen": -2.0348191261291504, + "logits/rejected": -2.023660659790039, + "logps/chosen": -0.9398587346076965, + "logps/rejected": -1.0924385786056519, + "loss": 1.1626, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.879717469215393, + "rewards/margins": 0.3051597476005554, + "rewards/rejected": -2.1848771572113037, + "step": 1710 + }, + { + "epoch": 1.239193083573487, + "grad_norm": 16.465610751100254, + "learning_rate": 3.6507289698834064e-08, + "logits/chosen": -1.9764940738677979, + "logits/rejected": -1.9729808568954468, + "logps/chosen": -0.9838182330131531, + "logps/rejected": -1.1163631677627563, + "loss": 1.1955, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.9676364660263062, + "rewards/margins": 0.2650895118713379, + "rewards/rejected": -2.2327263355255127, + "step": 1720 + }, + { + "epoch": 1.2463976945244957, + "grad_norm": 25.365894303851952, + "learning_rate": 3.6320804903743684e-08, + "logits/chosen": -2.0223116874694824, + "logits/rejected": -2.0218966007232666, + "logps/chosen": -1.0339914560317993, + "logps/rejected": -1.159183144569397, + "loss": 1.1983, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0679829120635986, + "rewards/margins": 0.250383585691452, + "rewards/rejected": -2.318366289138794, + "step": 1730 + }, + { + "epoch": 1.2536023054755043, + "grad_norm": 17.275189136976564, + "learning_rate": 3.61335243028146e-08, + "logits/chosen": -2.011654853820801, + "logits/rejected": -2.01637601852417, + "logps/chosen": -1.0918588638305664, + "logps/rejected": -1.2234910726547241, + "loss": 1.1905, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.183717727661133, + "rewards/margins": 0.26326465606689453, + "rewards/rejected": -2.4469821453094482, + "step": 1740 + }, + { + "epoch": 1.260806916426513, + "grad_norm": 18.5007118428664, + "learning_rate": 3.5945461061099736e-08, + "logits/chosen": -1.9712812900543213, + "logits/rejected": -1.9578218460083008, + "logps/chosen": -1.0444309711456299, + "logps/rejected": -1.1218526363372803, + "loss": 1.2707, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.0888619422912598, + "rewards/margins": 0.1548432558774948, + "rewards/rejected": -2.2437052726745605, + "step": 1750 + }, + { + "epoch": 1.2680115273775217, + "grad_norm": 19.826773581846037, + "learning_rate": 3.5756628398668446e-08, + "logits/chosen": -2.0560269355773926, + "logits/rejected": -2.061149835586548, + "logps/chosen": -1.1327307224273682, + "logps/rejected": -1.2321850061416626, + "loss": 1.2409, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.2654614448547363, + "rewards/margins": 0.1989085078239441, + "rewards/rejected": -2.464370012283325, + "step": 1760 + }, + { + "epoch": 1.2752161383285303, + "grad_norm": 17.659111449492986, + "learning_rate": 3.556703958967716e-08, + "logits/chosen": -2.042252779006958, + "logits/rejected": -2.0375871658325195, + "logps/chosen": -1.051990270614624, + "logps/rejected": -1.1855313777923584, + "loss": 1.1891, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.103980541229248, + "rewards/margins": 0.2670823037624359, + "rewards/rejected": -2.371062755584717, + "step": 1770 + }, + { + "epoch": 1.282420749279539, + "grad_norm": 24.065221888255977, + "learning_rate": 3.5376707961436297e-08, + "logits/chosen": -2.0278120040893555, + "logits/rejected": -2.022207498550415, + "logps/chosen": -1.1405036449432373, + "logps/rejected": -1.2026771306991577, + "loss": 1.2719, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.2810072898864746, + "rewards/margins": 0.12434691190719604, + "rewards/rejected": -2.4053542613983154, + "step": 1780 + }, + { + "epoch": 1.2896253602305476, + "grad_norm": 12.853822919695737, + "learning_rate": 3.51856468934734e-08, + "logits/chosen": -1.9812190532684326, + "logits/rejected": -1.9826923608779907, + "logps/chosen": -0.9758992195129395, + "logps/rejected": -1.0700486898422241, + "loss": 1.2226, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.951798439025879, + "rewards/margins": 0.18829897046089172, + "rewards/rejected": -2.1400973796844482, + "step": 1790 + }, + { + "epoch": 1.2968299711815563, + "grad_norm": 20.14401512444606, + "learning_rate": 3.499386981659262e-08, + "logits/chosen": -2.0630898475646973, + "logits/rejected": -2.0576171875, + "logps/chosen": -1.018842101097107, + "logps/rejected": -1.209272027015686, + "loss": 1.1238, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.037684202194214, + "rewards/margins": 0.3808597922325134, + "rewards/rejected": -2.418544054031372, + "step": 1800 + }, + { + "epoch": 1.304034582132565, + "grad_norm": 20.90969448735332, + "learning_rate": 3.480139021193057e-08, + "logits/chosen": -1.9834129810333252, + "logits/rejected": -1.985131859779358, + "logps/chosen": -0.9966486692428589, + "logps/rejected": -1.1168452501296997, + "loss": 1.2127, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.9932973384857178, + "rewards/margins": 0.24039287865161896, + "rewards/rejected": -2.2336905002593994, + "step": 1810 + }, + { + "epoch": 1.3112391930835736, + "grad_norm": 28.592045906928604, + "learning_rate": 3.4608221610008666e-08, + "logits/chosen": -2.018594264984131, + "logits/rejected": -2.0142102241516113, + "logps/chosen": -0.9736968278884888, + "logps/rejected": -1.1203354597091675, + "loss": 1.1713, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9473936557769775, + "rewards/margins": 0.2932773232460022, + "rewards/rejected": -2.240670919418335, + "step": 1820 + }, + { + "epoch": 1.318443804034582, + "grad_norm": 15.191719500704991, + "learning_rate": 3.4414377589782e-08, + "logits/chosen": -1.9855458736419678, + "logits/rejected": -1.9946119785308838, + "logps/chosen": -1.0181246995925903, + "logps/rejected": -1.1509206295013428, + "loss": 1.1963, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.0362493991851807, + "rewards/margins": 0.2655918300151825, + "rewards/rejected": -2.3018412590026855, + "step": 1830 + }, + { + "epoch": 1.3256484149855907, + "grad_norm": 18.19541860204369, + "learning_rate": 3.4219871777684745e-08, + "logits/chosen": -1.9971675872802734, + "logits/rejected": -1.984905481338501, + "logps/chosen": -0.9933854937553406, + "logps/rejected": -1.1145892143249512, + "loss": 1.2078, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9867709875106812, + "rewards/margins": 0.24240756034851074, + "rewards/rejected": -2.2291784286499023, + "step": 1840 + }, + { + "epoch": 1.3328530259365994, + "grad_norm": 17.72178124025082, + "learning_rate": 3.4024717846672364e-08, + "logits/chosen": -2.0331177711486816, + "logits/rejected": -2.026477336883545, + "logps/chosen": -0.9942334890365601, + "logps/rejected": -1.1216745376586914, + "loss": 1.1967, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9884669780731201, + "rewards/margins": 0.25488215684890747, + "rewards/rejected": -2.243349075317383, + "step": 1850 + }, + { + "epoch": 1.340057636887608, + "grad_norm": 17.801651438890392, + "learning_rate": 3.382892951526036e-08, + "logits/chosen": -2.018220901489258, + "logits/rejected": -2.0154216289520264, + "logps/chosen": -1.0521572828292847, + "logps/rejected": -1.2008370161056519, + "loss": 1.163, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1043145656585693, + "rewards/margins": 0.2973593771457672, + "rewards/rejected": -2.4016740322113037, + "step": 1860 + }, + { + "epoch": 1.3472622478386167, + "grad_norm": 20.318968041064025, + "learning_rate": 3.3632520546559974e-08, + "logits/chosen": -1.9867897033691406, + "logits/rejected": -1.9751968383789062, + "logps/chosen": -0.926361083984375, + "logps/rejected": -1.0954601764678955, + "loss": 1.1271, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.85272216796875, + "rewards/margins": 0.3381980061531067, + "rewards/rejected": -2.190920352935791, + "step": 1870 + }, + { + "epoch": 1.3544668587896254, + "grad_norm": 19.630592165862417, + "learning_rate": 3.34355047473107e-08, + "logits/chosen": -2.0014548301696777, + "logits/rejected": -1.997385025024414, + "logps/chosen": -1.0289537906646729, + "logps/rejected": -1.118239164352417, + "loss": 1.245, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0579075813293457, + "rewards/margins": 0.1785707026720047, + "rewards/rejected": -2.236478328704834, + "step": 1880 + }, + { + "epoch": 1.361671469740634, + "grad_norm": 22.9649823694943, + "learning_rate": 3.323789596690971e-08, + "logits/chosen": -1.9707273244857788, + "logits/rejected": -1.9716689586639404, + "logps/chosen": -1.0208336114883423, + "logps/rejected": -1.1546828746795654, + "loss": 1.1792, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0416672229766846, + "rewards/margins": 0.26769858598709106, + "rewards/rejected": -2.309365749359131, + "step": 1890 + }, + { + "epoch": 1.3688760806916427, + "grad_norm": 15.719014755563348, + "learning_rate": 3.303970809643828e-08, + "logits/chosen": -2.000805139541626, + "logits/rejected": -2.0052528381347656, + "logps/chosen": -1.0358805656433105, + "logps/rejected": -1.164954423904419, + "loss": 1.1925, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.071761131286621, + "rewards/margins": 0.2581479847431183, + "rewards/rejected": -2.329908847808838, + "step": 1900 + }, + { + "epoch": 1.3760806916426513, + "grad_norm": 20.784381707823112, + "learning_rate": 3.2840955067685356e-08, + "logits/chosen": -2.0275561809539795, + "logits/rejected": -2.031751871109009, + "logps/chosen": -1.0541309118270874, + "logps/rejected": -1.2037460803985596, + "loss": 1.1612, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.108261823654175, + "rewards/margins": 0.29923057556152344, + "rewards/rejected": -2.407492160797119, + "step": 1910 + }, + { + "epoch": 1.38328530259366, + "grad_norm": 16.91937122091795, + "learning_rate": 3.264165085216817e-08, + "logits/chosen": -2.0380663871765137, + "logits/rejected": -2.0380921363830566, + "logps/chosen": -0.9351627230644226, + "logps/rejected": -1.1040947437286377, + "loss": 1.1393, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.8703254461288452, + "rewards/margins": 0.3378642499446869, + "rewards/rejected": -2.2081894874572754, + "step": 1920 + }, + { + "epoch": 1.3904899135446687, + "grad_norm": 18.773223856961657, + "learning_rate": 3.244180946015008e-08, + "logits/chosen": -1.9662561416625977, + "logits/rejected": -1.96682608127594, + "logps/chosen": -1.0346488952636719, + "logps/rejected": -1.0978872776031494, + "loss": 1.2739, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.0692977905273438, + "rewards/margins": 0.12647677958011627, + "rewards/rejected": -2.195774555206299, + "step": 1930 + }, + { + "epoch": 1.397694524495677, + "grad_norm": 15.42949606476335, + "learning_rate": 3.224144493965578e-08, + "logits/chosen": -2.0522544384002686, + "logits/rejected": -2.0557808876037598, + "logps/chosen": -0.9907134175300598, + "logps/rejected": -1.095879316329956, + "loss": 1.217, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9814268350601196, + "rewards/margins": 0.2103317677974701, + "rewards/rejected": -2.191758632659912, + "step": 1940 + }, + { + "epoch": 1.4048991354466858, + "grad_norm": 17.867258553909902, + "learning_rate": 3.204057137548371e-08, + "logits/chosen": -2.0167171955108643, + "logits/rejected": -2.011385202407837, + "logps/chosen": -0.9775940179824829, + "logps/rejected": -1.0835435390472412, + "loss": 1.2152, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9551880359649658, + "rewards/margins": 0.21189892292022705, + "rewards/rejected": -2.1670870780944824, + "step": 1950 + }, + { + "epoch": 1.4121037463976944, + "grad_norm": 19.490021831335156, + "learning_rate": 3.183920288821597e-08, + "logits/chosen": -1.9968830347061157, + "logits/rejected": -1.993549108505249, + "logps/chosen": -1.0021111965179443, + "logps/rejected": -1.163733959197998, + "loss": 1.1461, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0042223930358887, + "rewards/margins": 0.3232455551624298, + "rewards/rejected": -2.327467918395996, + "step": 1960 + }, + { + "epoch": 1.419308357348703, + "grad_norm": 23.55301780513215, + "learning_rate": 3.1637353633225735e-08, + "logits/chosen": -2.042677879333496, + "logits/rejected": -2.0366151332855225, + "logps/chosen": -1.0290377140045166, + "logps/rejected": -1.1746978759765625, + "loss": 1.1717, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.058075428009033, + "rewards/margins": 0.2913200259208679, + "rewards/rejected": -2.349395751953125, + "step": 1970 + }, + { + "epoch": 1.4265129682997117, + "grad_norm": 19.629394462964214, + "learning_rate": 3.143503779968213e-08, + "logits/chosen": -2.011504650115967, + "logits/rejected": -2.011737823486328, + "logps/chosen": -1.015564203262329, + "logps/rejected": -1.1511462926864624, + "loss": 1.196, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.031128406524658, + "rewards/margins": 0.27116426825523376, + "rewards/rejected": -2.302292585372925, + "step": 1980 + }, + { + "epoch": 1.4337175792507204, + "grad_norm": 18.105081870748133, + "learning_rate": 3.1232269609552875e-08, + "logits/chosen": -1.9979522228240967, + "logits/rejected": -1.995548963546753, + "logps/chosen": -0.9974485635757446, + "logps/rejected": -1.1187690496444702, + "loss": 1.2004, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9948971271514893, + "rewards/margins": 0.24264100193977356, + "rewards/rejected": -2.2375380992889404, + "step": 1990 + }, + { + "epoch": 1.440922190201729, + "grad_norm": 16.05671541036881, + "learning_rate": 3.102906331660444e-08, + "logits/chosen": -2.0580544471740723, + "logits/rejected": -2.0497653484344482, + "logps/chosen": -0.9930634498596191, + "logps/rejected": -1.1649951934814453, + "loss": 1.1345, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.9861268997192383, + "rewards/margins": 0.3438633382320404, + "rewards/rejected": -2.3299903869628906, + "step": 2000 + }, + { + "epoch": 1.4481268011527377, + "grad_norm": 16.003021454074023, + "learning_rate": 3.082543320540015e-08, + "logits/chosen": -1.9997708797454834, + "logits/rejected": -1.992846131324768, + "logps/chosen": -1.0060840845108032, + "logps/rejected": -1.1501317024230957, + "loss": 1.1672, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0121681690216064, + "rewards/margins": 0.28809523582458496, + "rewards/rejected": -2.3002634048461914, + "step": 2010 + }, + { + "epoch": 1.4553314121037464, + "grad_norm": 17.99920026821121, + "learning_rate": 3.062139359029599e-08, + "logits/chosen": -2.029757022857666, + "logits/rejected": -2.029585361480713, + "logps/chosen": -1.0290124416351318, + "logps/rejected": -1.1135772466659546, + "loss": 1.2472, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.0580248832702637, + "rewards/margins": 0.16912977397441864, + "rewards/rejected": -2.227154493331909, + "step": 2020 + }, + { + "epoch": 1.462536023054755, + "grad_norm": 18.76605541973094, + "learning_rate": 3.041695881443437e-08, + "logits/chosen": -2.051182270050049, + "logits/rejected": -2.04660964012146, + "logps/chosen": -0.9734565019607544, + "logps/rejected": -1.1085374355316162, + "loss": 1.1781, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.9469130039215088, + "rewards/margins": 0.27016210556030273, + "rewards/rejected": -2.2170748710632324, + "step": 2030 + }, + { + "epoch": 1.4697406340057637, + "grad_norm": 22.16241778931301, + "learning_rate": 3.0212143248735886e-08, + "logits/chosen": -2.0314226150512695, + "logits/rejected": -2.0317978858947754, + "logps/chosen": -0.9990888833999634, + "logps/rejected": -1.1364792585372925, + "loss": 1.1757, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9981777667999268, + "rewards/margins": 0.2747807502746582, + "rewards/rejected": -2.272958517074585, + "step": 2040 + }, + { + "epoch": 1.4769452449567724, + "grad_norm": 19.71057724214497, + "learning_rate": 3.0006961290889077e-08, + "logits/chosen": -2.0183329582214355, + "logits/rejected": -2.009127378463745, + "logps/chosen": -1.1189202070236206, + "logps/rejected": -1.286902904510498, + "loss": 1.1652, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.237840414047241, + "rewards/margins": 0.33596524596214294, + "rewards/rejected": -2.573805809020996, + "step": 2050 + }, + { + "epoch": 1.484149855907781, + "grad_norm": 21.95290350074783, + "learning_rate": 2.980142736433833e-08, + "logits/chosen": -2.008192777633667, + "logits/rejected": -2.001173496246338, + "logps/chosen": -1.0314289331436157, + "logps/rejected": -1.0944207906723022, + "loss": 1.2767, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.0628578662872314, + "rewards/margins": 0.12598386406898499, + "rewards/rejected": -2.1888415813446045, + "step": 2060 + }, + { + "epoch": 1.4913544668587897, + "grad_norm": 24.378073368269256, + "learning_rate": 2.9595555917269997e-08, + "logits/chosen": -2.039536952972412, + "logits/rejected": -2.0248141288757324, + "logps/chosen": -1.140825867652893, + "logps/rejected": -1.2375624179840088, + "loss": 1.2147, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.281651735305786, + "rewards/margins": 0.19347305595874786, + "rewards/rejected": -2.4751248359680176, + "step": 2070 + }, + { + "epoch": 1.4985590778097984, + "grad_norm": 18.478093665467945, + "learning_rate": 2.9389361421596725e-08, + "logits/chosen": -1.9539821147918701, + "logits/rejected": -1.9563089609146118, + "logps/chosen": -1.0598758459091187, + "logps/rejected": -1.1930882930755615, + "loss": 1.1842, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.1197516918182373, + "rewards/margins": 0.26642483472824097, + "rewards/rejected": -2.386176586151123, + "step": 2080 + }, + { + "epoch": 1.505763688760807, + "grad_norm": 20.36561616479061, + "learning_rate": 2.9182858371940126e-08, + "logits/chosen": -2.0380711555480957, + "logits/rejected": -2.032642364501953, + "logps/chosen": -1.046942949295044, + "logps/rejected": -1.1760826110839844, + "loss": 1.1872, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.093885898590088, + "rewards/margins": 0.2582792639732361, + "rewards/rejected": -2.3521652221679688, + "step": 2090 + }, + { + "epoch": 1.5129682997118157, + "grad_norm": 18.97930654154894, + "learning_rate": 2.8976061284611908e-08, + "logits/chosen": -1.9913969039916992, + "logits/rejected": -2.0002284049987793, + "logps/chosen": -0.9360917806625366, + "logps/rejected": -1.0654878616333008, + "loss": 1.194, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.8721835613250732, + "rewards/margins": 0.2587924301624298, + "rewards/rejected": -2.1309757232666016, + "step": 2100 + }, + { + "epoch": 1.5201729106628243, + "grad_norm": 21.429455246868766, + "learning_rate": 2.8768984696593384e-08, + "logits/chosen": -1.978727102279663, + "logits/rejected": -1.9692051410675049, + "logps/chosen": -1.0171012878417969, + "logps/rejected": -1.1342874765396118, + "loss": 1.2159, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0342025756835938, + "rewards/margins": 0.23437246680259705, + "rewards/rejected": -2.2685749530792236, + "step": 2110 + }, + { + "epoch": 1.527377521613833, + "grad_norm": 17.997474835260586, + "learning_rate": 2.8561643164513637e-08, + "logits/chosen": -1.9045627117156982, + "logits/rejected": -1.900713324546814, + "logps/chosen": -1.0493916273117065, + "logps/rejected": -1.1679325103759766, + "loss": 1.2012, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.098783254623413, + "rewards/margins": 0.2370818853378296, + "rewards/rejected": -2.335865020751953, + "step": 2120 + }, + { + "epoch": 1.5345821325648417, + "grad_norm": 18.947013598861446, + "learning_rate": 2.8354051263626227e-08, + "logits/chosen": -1.9892809391021729, + "logits/rejected": -1.9950027465820312, + "logps/chosen": -1.0602303743362427, + "logps/rejected": -1.1739084720611572, + "loss": 1.2059, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1204607486724854, + "rewards/margins": 0.22735624015331268, + "rewards/rejected": -2.3478169441223145, + "step": 2130 + }, + { + "epoch": 1.54178674351585, + "grad_norm": 19.64777274554225, + "learning_rate": 2.8146223586784573e-08, + "logits/chosen": -1.980348825454712, + "logits/rejected": -1.9723879098892212, + "logps/chosen": -1.065375566482544, + "logps/rejected": -1.1992082595825195, + "loss": 1.1877, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.130751132965088, + "rewards/margins": 0.2676653265953064, + "rewards/rejected": -2.398416519165039, + "step": 2140 + }, + { + "epoch": 1.5489913544668588, + "grad_norm": 25.24267359813586, + "learning_rate": 2.7938174743416205e-08, + "logits/chosen": -1.9437439441680908, + "logits/rejected": -1.9405949115753174, + "logps/chosen": -1.0510722398757935, + "logps/rejected": -1.1613985300064087, + "loss": 1.2136, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.102144479751587, + "rewards/margins": 0.22065265476703644, + "rewards/rejected": -2.3227970600128174, + "step": 2150 + }, + { + "epoch": 1.5561959654178674, + "grad_norm": 19.677128525439006, + "learning_rate": 2.7729919358495728e-08, + "logits/chosen": -2.002791404724121, + "logits/rejected": -2.0038230419158936, + "logps/chosen": -1.112272024154663, + "logps/rejected": -1.1909116506576538, + "loss": 1.2588, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.224544048309326, + "rewards/margins": 0.15727964043617249, + "rewards/rejected": -2.3818233013153076, + "step": 2160 + }, + { + "epoch": 1.563400576368876, + "grad_norm": 19.250624519916794, + "learning_rate": 2.7521472071516772e-08, + "logits/chosen": -1.997267484664917, + "logits/rejected": -1.9961631298065186, + "logps/chosen": -0.9451554417610168, + "logps/rejected": -1.0597981214523315, + "loss": 1.2086, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.8903108835220337, + "rewards/margins": 0.22928544878959656, + "rewards/rejected": -2.119596242904663, + "step": 2170 + }, + { + "epoch": 1.5706051873198847, + "grad_norm": 21.883332512382378, + "learning_rate": 2.731284753546289e-08, + "logits/chosen": -1.987908959388733, + "logits/rejected": -1.9858767986297607, + "logps/chosen": -1.081416130065918, + "logps/rejected": -1.222598671913147, + "loss": 1.1737, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.162832260131836, + "rewards/margins": 0.28236496448516846, + "rewards/rejected": -2.445197343826294, + "step": 2180 + }, + { + "epoch": 1.5778097982708934, + "grad_norm": 21.934933640290577, + "learning_rate": 2.710406041577751e-08, + "logits/chosen": -2.0529561042785645, + "logits/rejected": -2.049743175506592, + "logps/chosen": -1.0325100421905518, + "logps/rejected": -1.1861141920089722, + "loss": 1.1625, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0650200843811035, + "rewards/margins": 0.3072081506252289, + "rewards/rejected": -2.3722283840179443, + "step": 2190 + }, + { + "epoch": 1.585014409221902, + "grad_norm": 18.04514565686392, + "learning_rate": 2.6895125389333017e-08, + "logits/chosen": -2.01184344291687, + "logits/rejected": -2.0075669288635254, + "logps/chosen": -1.026865839958191, + "logps/rejected": -1.1786205768585205, + "loss": 1.1613, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.053731679916382, + "rewards/margins": 0.30350956320762634, + "rewards/rejected": -2.357241153717041, + "step": 2200 + }, + { + "epoch": 1.5922190201729105, + "grad_norm": 17.097050510727, + "learning_rate": 2.6686057143399028e-08, + "logits/chosen": -2.0109105110168457, + "logits/rejected": -2.0125486850738525, + "logps/chosen": -1.0616767406463623, + "logps/rejected": -1.1599900722503662, + "loss": 1.2429, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.1233534812927246, + "rewards/margins": 0.1966264694929123, + "rewards/rejected": -2.3199801445007324, + "step": 2210 + }, + { + "epoch": 1.5994236311239192, + "grad_norm": 19.369226164400942, + "learning_rate": 2.647687037460996e-08, + "logits/chosen": -2.0144858360290527, + "logits/rejected": -2.0138607025146484, + "logps/chosen": -1.0874732732772827, + "logps/rejected": -1.2833433151245117, + "loss": 1.1241, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1749465465545654, + "rewards/margins": 0.39173993468284607, + "rewards/rejected": -2.5666866302490234, + "step": 2220 + }, + { + "epoch": 1.6066282420749278, + "grad_norm": 20.354220577910063, + "learning_rate": 2.626757978793187e-08, + "logits/chosen": -2.025035858154297, + "logits/rejected": -2.018566846847534, + "logps/chosen": -1.0852900743484497, + "logps/rejected": -1.2093784809112549, + "loss": 1.2036, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1705801486968994, + "rewards/margins": 0.24817702174186707, + "rewards/rejected": -2.4187569618225098, + "step": 2230 + }, + { + "epoch": 1.6138328530259365, + "grad_norm": 23.587038296125034, + "learning_rate": 2.6058200095628797e-08, + "logits/chosen": -1.9932403564453125, + "logits/rejected": -1.9965318441390991, + "logps/chosen": -0.917451024055481, + "logps/rejected": -1.0861783027648926, + "loss": 1.1444, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.834902048110962, + "rewards/margins": 0.33745482563972473, + "rewards/rejected": -2.172356605529785, + "step": 2240 + }, + { + "epoch": 1.6210374639769451, + "grad_norm": 18.85455738747805, + "learning_rate": 2.584874601622854e-08, + "logits/chosen": -2.0577220916748047, + "logits/rejected": -2.048609972000122, + "logps/chosen": -1.0844237804412842, + "logps/rejected": -1.2160663604736328, + "loss": 1.2066, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.1688475608825684, + "rewards/margins": 0.26328495144844055, + "rewards/rejected": -2.4321327209472656, + "step": 2250 + }, + { + "epoch": 1.6282420749279538, + "grad_norm": 21.402470423847422, + "learning_rate": 2.5639232273487993e-08, + "logits/chosen": -1.9839709997177124, + "logits/rejected": -1.9742904901504517, + "logps/chosen": -0.9783967137336731, + "logps/rejected": -1.1004703044891357, + "loss": 1.2014, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9567934274673462, + "rewards/margins": 0.24414721131324768, + "rewards/rejected": -2.2009406089782715, + "step": 2260 + }, + { + "epoch": 1.6354466858789625, + "grad_norm": 20.835507437389413, + "learning_rate": 2.5429673595358142e-08, + "logits/chosen": -2.0180060863494873, + "logits/rejected": -2.0165977478027344, + "logps/chosen": -1.044081687927246, + "logps/rejected": -1.1655093431472778, + "loss": 1.198, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.088163375854492, + "rewards/margins": 0.24285531044006348, + "rewards/rejected": -2.3310186862945557, + "step": 2270 + }, + { + "epoch": 1.6426512968299711, + "grad_norm": 23.508207659870397, + "learning_rate": 2.5220084712948764e-08, + "logits/chosen": -1.9833030700683594, + "logits/rejected": -1.9724918603897095, + "logps/chosen": -1.1200716495513916, + "logps/rejected": -1.2384498119354248, + "loss": 1.1901, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.240143299102783, + "rewards/margins": 0.2367565631866455, + "rewards/rejected": -2.4768996238708496, + "step": 2280 + }, + { + "epoch": 1.6498559077809798, + "grad_norm": 19.732301868207852, + "learning_rate": 2.5010480359492838e-08, + "logits/chosen": -1.9684820175170898, + "logits/rejected": -1.9655864238739014, + "logps/chosen": -1.0514031648635864, + "logps/rejected": -1.1109318733215332, + "loss": 1.287, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.102806329727173, + "rewards/margins": 0.11905747652053833, + "rewards/rejected": -2.2218637466430664, + "step": 2290 + }, + { + "epoch": 1.6570605187319885, + "grad_norm": 21.18423154150406, + "learning_rate": 2.480087526931091e-08, + "logits/chosen": -2.0103702545166016, + "logits/rejected": -1.99822998046875, + "logps/chosen": -1.0027062892913818, + "logps/rejected": -1.1175159215927124, + "loss": 1.2155, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.0054125785827637, + "rewards/margins": 0.22961954772472382, + "rewards/rejected": -2.235031843185425, + "step": 2300 + }, + { + "epoch": 1.6642651296829971, + "grad_norm": 19.472737759749933, + "learning_rate": 2.4591284176775326e-08, + "logits/chosen": -1.969321608543396, + "logits/rejected": -1.9655838012695312, + "logps/chosen": -1.0758287906646729, + "logps/rejected": -1.1590468883514404, + "loss": 1.2565, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.1516575813293457, + "rewards/margins": 0.16643603146076202, + "rewards/rejected": -2.318093776702881, + "step": 2310 + }, + { + "epoch": 1.6714697406340058, + "grad_norm": 21.89430802713767, + "learning_rate": 2.4381721815274443e-08, + "logits/chosen": -2.043560028076172, + "logits/rejected": -2.0437939167022705, + "logps/chosen": -1.0199779272079468, + "logps/rejected": -1.1517260074615479, + "loss": 1.1926, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0399558544158936, + "rewards/margins": 0.26349639892578125, + "rewards/rejected": -2.3034520149230957, + "step": 2320 + }, + { + "epoch": 1.6786743515850144, + "grad_norm": 19.55460769848809, + "learning_rate": 2.4172202916176936e-08, + "logits/chosen": -2.046525478363037, + "logits/rejected": -2.048698663711548, + "logps/chosen": -0.9680402874946594, + "logps/rejected": -1.1348791122436523, + "loss": 1.1576, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9360805749893188, + "rewards/margins": 0.33367738127708435, + "rewards/rejected": -2.2697582244873047, + "step": 2330 + }, + { + "epoch": 1.685878962536023, + "grad_norm": 19.2299692018981, + "learning_rate": 2.3962742207796268e-08, + "logits/chosen": -1.9817912578582764, + "logits/rejected": -1.9795825481414795, + "logps/chosen": -0.9565426707267761, + "logps/rejected": -1.1191436052322388, + "loss": 1.1591, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9130853414535522, + "rewards/margins": 0.325202077627182, + "rewards/rejected": -2.2382872104644775, + "step": 2340 + }, + { + "epoch": 1.6930835734870318, + "grad_norm": 22.728076530066133, + "learning_rate": 2.3753354414355334e-08, + "logits/chosen": -1.950277328491211, + "logits/rejected": -1.9395864009857178, + "logps/chosen": -1.0648443698883057, + "logps/rejected": -1.1816734075546265, + "loss": 1.2133, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.1296887397766113, + "rewards/margins": 0.23365814983844757, + "rewards/rejected": -2.363346815109253, + "step": 2350 + }, + { + "epoch": 1.7002881844380404, + "grad_norm": 18.628936490431073, + "learning_rate": 2.3544054254951408e-08, + "logits/chosen": -1.9891974925994873, + "logits/rejected": -1.9805711507797241, + "logps/chosen": -0.9382593035697937, + "logps/rejected": -1.1349287033081055, + "loss": 1.1143, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.8765186071395874, + "rewards/margins": 0.3933386504650116, + "rewards/rejected": -2.269857406616211, + "step": 2360 + }, + { + "epoch": 1.707492795389049, + "grad_norm": 18.210045869943635, + "learning_rate": 2.3334856442521435e-08, + "logits/chosen": -2.037346124649048, + "logits/rejected": -2.0299322605133057, + "logps/chosen": -1.0963926315307617, + "logps/rejected": -1.1663601398468018, + "loss": 1.2702, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.1927852630615234, + "rewards/margins": 0.1399351954460144, + "rewards/rejected": -2.3327202796936035, + "step": 2370 + }, + { + "epoch": 1.7146974063400577, + "grad_norm": 19.872042875254735, + "learning_rate": 2.3125775682807826e-08, + "logits/chosen": -2.0520217418670654, + "logits/rejected": -2.051881790161133, + "logps/chosen": -1.166526198387146, + "logps/rejected": -1.2666442394256592, + "loss": 1.232, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.333052396774292, + "rewards/margins": 0.20023572444915771, + "rewards/rejected": -2.5332884788513184, + "step": 2380 + }, + { + "epoch": 1.7219020172910664, + "grad_norm": 20.61950838692453, + "learning_rate": 2.291682667332464e-08, + "logits/chosen": -2.0643744468688965, + "logits/rejected": -2.059324264526367, + "logps/chosen": -1.0485190153121948, + "logps/rejected": -1.1792809963226318, + "loss": 1.1918, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.0970380306243896, + "rewards/margins": 0.26152390241622925, + "rewards/rejected": -2.3585619926452637, + "step": 2390 + }, + { + "epoch": 1.729106628242075, + "grad_norm": 15.170314603632118, + "learning_rate": 2.2708024102324454e-08, + "logits/chosen": -2.0271968841552734, + "logits/rejected": -2.0215301513671875, + "logps/chosen": -1.0329598188400269, + "logps/rejected": -1.2096232175827026, + "loss": 1.1496, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0659196376800537, + "rewards/margins": 0.3533265292644501, + "rewards/rejected": -2.4192464351654053, + "step": 2400 + }, + { + "epoch": 1.7363112391930837, + "grad_norm": 22.430906564092297, + "learning_rate": 2.2499382647765797e-08, + "logits/chosen": -2.0221495628356934, + "logits/rejected": -2.018479347229004, + "logps/chosen": -1.0721267461776733, + "logps/rejected": -1.1613738536834717, + "loss": 1.2462, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1442534923553467, + "rewards/margins": 0.17849409580230713, + "rewards/rejected": -2.3227477073669434, + "step": 2410 + }, + { + "epoch": 1.7435158501440924, + "grad_norm": 21.08508798237925, + "learning_rate": 2.2290916976281427e-08, + "logits/chosen": -2.003178119659424, + "logits/rejected": -1.996995210647583, + "logps/chosen": -0.9998496174812317, + "logps/rejected": -1.1318352222442627, + "loss": 1.2149, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9996992349624634, + "rewards/margins": 0.2639711797237396, + "rewards/rejected": -2.2636704444885254, + "step": 2420 + }, + { + "epoch": 1.7507204610951008, + "grad_norm": 18.07451059922042, + "learning_rate": 2.2082641742147238e-08, + "logits/chosen": -1.9808975458145142, + "logits/rejected": -1.9742380380630493, + "logps/chosen": -1.016533613204956, + "logps/rejected": -1.2077422142028809, + "loss": 1.1153, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.033067226409912, + "rewards/margins": 0.3824174702167511, + "rewards/rejected": -2.4154844284057617, + "step": 2430 + }, + { + "epoch": 1.7579250720461095, + "grad_norm": 20.71956116817728, + "learning_rate": 2.1874571586252177e-08, + "logits/chosen": -2.029297351837158, + "logits/rejected": -2.02240252494812, + "logps/chosen": -1.0277835130691528, + "logps/rejected": -1.1069574356079102, + "loss": 1.2557, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.0555670261383057, + "rewards/margins": 0.1583479940891266, + "rewards/rejected": -2.2139148712158203, + "step": 2440 + }, + { + "epoch": 1.7651296829971181, + "grad_norm": 20.82700891746505, + "learning_rate": 2.1666721135069037e-08, + "logits/chosen": -2.014781951904297, + "logits/rejected": -2.0112876892089844, + "logps/chosen": -1.1093708276748657, + "logps/rejected": -1.2040464878082275, + "loss": 1.2422, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.2187416553497314, + "rewards/margins": 0.18935146927833557, + "rewards/rejected": -2.408092975616455, + "step": 2450 + }, + { + "epoch": 1.7723342939481268, + "grad_norm": 15.571889556211572, + "learning_rate": 2.145910499962628e-08, + "logits/chosen": -2.065460681915283, + "logits/rejected": -2.0574898719787598, + "logps/chosen": -0.9591764211654663, + "logps/rejected": -1.1014626026153564, + "loss": 1.1832, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9183528423309326, + "rewards/margins": 0.2845722734928131, + "rewards/rejected": -2.202925205230713, + "step": 2460 + }, + { + "epoch": 1.7795389048991355, + "grad_norm": 23.742748347267703, + "learning_rate": 2.1251737774480915e-08, + "logits/chosen": -2.0418546199798584, + "logits/rejected": -2.032393455505371, + "logps/chosen": -1.169003963470459, + "logps/rejected": -1.2600603103637695, + "loss": 1.2691, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.338007926940918, + "rewards/margins": 0.18211248517036438, + "rewards/rejected": -2.520120620727539, + "step": 2470 + }, + { + "epoch": 1.7867435158501441, + "grad_norm": 17.445595980137064, + "learning_rate": 2.104463403669264e-08, + "logits/chosen": -2.0002996921539307, + "logits/rejected": -1.9975649118423462, + "logps/chosen": -1.0450685024261475, + "logps/rejected": -1.189254879951477, + "loss": 1.1814, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.090137004852295, + "rewards/margins": 0.288372665643692, + "rewards/rejected": -2.378509759902954, + "step": 2480 + }, + { + "epoch": 1.7939481268011528, + "grad_norm": 17.073648259711092, + "learning_rate": 2.0837808344799028e-08, + "logits/chosen": -1.982496976852417, + "logits/rejected": -1.9782216548919678, + "logps/chosen": -0.9407302141189575, + "logps/rejected": -1.0727360248565674, + "loss": 1.1832, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.881460428237915, + "rewards/margins": 0.26401159167289734, + "rewards/rejected": -2.1454720497131348, + "step": 2490 + }, + { + "epoch": 1.8011527377521612, + "grad_norm": 18.275713028107056, + "learning_rate": 2.063127523779219e-08, + "logits/chosen": -1.9809592962265015, + "logits/rejected": -1.9768626689910889, + "logps/chosen": -1.0079666376113892, + "logps/rejected": -1.1944630146026611, + "loss": 1.1139, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.0159332752227783, + "rewards/margins": 0.372992604970932, + "rewards/rejected": -2.3889260292053223, + "step": 2500 + }, + { + "epoch": 1.8083573487031699, + "grad_norm": 19.734595343704587, + "learning_rate": 2.0425049234096737e-08, + "logits/chosen": -1.9899470806121826, + "logits/rejected": -1.9840829372406006, + "logps/chosen": -1.0095808506011963, + "logps/rejected": -1.1258000135421753, + "loss": 1.2167, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0191617012023926, + "rewards/margins": 0.23243825137615204, + "rewards/rejected": -2.2516000270843506, + "step": 2510 + }, + { + "epoch": 1.8155619596541785, + "grad_norm": 19.367202320168705, + "learning_rate": 2.0219144830549163e-08, + "logits/chosen": -1.9627516269683838, + "logits/rejected": -1.9618648290634155, + "logps/chosen": -1.0147254467010498, + "logps/rejected": -1.1612242460250854, + "loss": 1.1826, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0294508934020996, + "rewards/margins": 0.29299798607826233, + "rewards/rejected": -2.322448492050171, + "step": 2520 + }, + { + "epoch": 1.8227665706051872, + "grad_norm": 19.099230695899127, + "learning_rate": 2.0013576501378823e-08, + "logits/chosen": -1.9792697429656982, + "logits/rejected": -1.9728553295135498, + "logps/chosen": -1.0101633071899414, + "logps/rejected": -1.144830584526062, + "loss": 1.1941, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.020326614379883, + "rewards/margins": 0.26933470368385315, + "rewards/rejected": -2.289661169052124, + "step": 2530 + }, + { + "epoch": 1.8299711815561959, + "grad_norm": 20.166585492166963, + "learning_rate": 1.9808358697190426e-08, + "logits/chosen": -1.972886085510254, + "logits/rejected": -1.969310998916626, + "logps/chosen": -0.9306098222732544, + "logps/rejected": -1.065213918685913, + "loss": 1.1987, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.8612196445465088, + "rewards/margins": 0.26920828223228455, + "rewards/rejected": -2.130427837371826, + "step": 2540 + }, + { + "epoch": 1.8371757925072045, + "grad_norm": 21.09479586621852, + "learning_rate": 1.9603505843948214e-08, + "logits/chosen": -2.017627239227295, + "logits/rejected": -2.0076773166656494, + "logps/chosen": -0.9474620819091797, + "logps/rejected": -1.1191534996032715, + "loss": 1.1394, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.8949241638183594, + "rewards/margins": 0.3433830738067627, + "rewards/rejected": -2.238306999206543, + "step": 2550 + }, + { + "epoch": 1.8443804034582132, + "grad_norm": 20.423562577336806, + "learning_rate": 1.9399032341961886e-08, + "logits/chosen": -1.9809995889663696, + "logits/rejected": -1.965026617050171, + "logps/chosen": -0.9898856282234192, + "logps/rejected": -1.0629730224609375, + "loss": 1.2699, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.9797712564468384, + "rewards/margins": 0.14617487788200378, + "rewards/rejected": -2.125946044921875, + "step": 2560 + }, + { + "epoch": 1.8515850144092219, + "grad_norm": 26.09646394087967, + "learning_rate": 1.9194952564874323e-08, + "logits/chosen": -2.0236237049102783, + "logits/rejected": -2.0175366401672363, + "logps/chosen": -1.0653067827224731, + "logps/rejected": -1.2080482244491577, + "loss": 1.1687, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1306135654449463, + "rewards/margins": 0.2854826748371124, + "rewards/rejected": -2.4160964488983154, + "step": 2570 + }, + { + "epoch": 1.8587896253602305, + "grad_norm": 20.54864054770584, + "learning_rate": 1.8991280858651157e-08, + "logits/chosen": -1.9798238277435303, + "logits/rejected": -1.9740060567855835, + "logps/chosen": -1.0638706684112549, + "logps/rejected": -1.1493985652923584, + "loss": 1.2503, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.1277413368225098, + "rewards/margins": 0.17105570435523987, + "rewards/rejected": -2.298797130584717, + "step": 2580 + }, + { + "epoch": 1.8659942363112392, + "grad_norm": 16.856180614276347, + "learning_rate": 1.8788031540572327e-08, + "logits/chosen": -1.9806411266326904, + "logits/rejected": -1.97232985496521, + "logps/chosen": -0.9993877410888672, + "logps/rejected": -1.1446069478988647, + "loss": 1.1723, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9987754821777344, + "rewards/margins": 0.2904384136199951, + "rewards/rejected": -2.2892138957977295, + "step": 2590 + }, + { + "epoch": 1.8731988472622478, + "grad_norm": 17.03483106236852, + "learning_rate": 1.858521889822565e-08, + "logits/chosen": -1.99444580078125, + "logits/rejected": -1.9968360662460327, + "logps/chosen": -0.9734609723091125, + "logps/rejected": -1.0829355716705322, + "loss": 1.2235, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.946921944618225, + "rewards/margins": 0.21894919872283936, + "rewards/rejected": -2.1658711433410645, + "step": 2600 + }, + { + "epoch": 1.8804034582132565, + "grad_norm": 16.460944487260154, + "learning_rate": 1.8382857188502422e-08, + "logits/chosen": -1.987308144569397, + "logits/rejected": -1.9824047088623047, + "logps/chosen": -0.9851255416870117, + "logps/rejected": -1.1126196384429932, + "loss": 1.1823, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.9702510833740234, + "rewards/margins": 0.2549881935119629, + "rewards/rejected": -2.2252392768859863, + "step": 2610 + }, + { + "epoch": 1.8876080691642652, + "grad_norm": 22.305541216871692, + "learning_rate": 1.8180960636595234e-08, + "logits/chosen": -1.9680871963500977, + "logits/rejected": -1.9659591913223267, + "logps/chosen": -1.0361220836639404, + "logps/rejected": -1.1791760921478271, + "loss": 1.1801, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.072244167327881, + "rewards/margins": 0.2861078977584839, + "rewards/rejected": -2.3583521842956543, + "step": 2620 + }, + { + "epoch": 1.8948126801152738, + "grad_norm": 20.388566247239506, + "learning_rate": 1.7979543434998015e-08, + "logits/chosen": -2.036707639694214, + "logits/rejected": -2.041584014892578, + "logps/chosen": -1.1235167980194092, + "logps/rejected": -1.2117664813995361, + "loss": 1.238, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.2470335960388184, + "rewards/margins": 0.17649903893470764, + "rewards/rejected": -2.4235329627990723, + "step": 2630 + }, + { + "epoch": 1.9020172910662825, + "grad_norm": 26.051372198929382, + "learning_rate": 1.7778619742508345e-08, + "logits/chosen": -2.0007290840148926, + "logits/rejected": -1.9938418865203857, + "logps/chosen": -1.0931371450424194, + "logps/rejected": -1.1865342855453491, + "loss": 1.2551, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.186274290084839, + "rewards/margins": 0.186794251203537, + "rewards/rejected": -2.3730685710906982, + "step": 2640 + }, + { + "epoch": 1.9092219020172911, + "grad_norm": 23.4369600871899, + "learning_rate": 1.757820368323213e-08, + "logits/chosen": -1.9935007095336914, + "logits/rejected": -1.9837433099746704, + "logps/chosen": -1.1060357093811035, + "logps/rejected": -1.265039324760437, + "loss": 1.1608, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.212071418762207, + "rewards/margins": 0.31800705194473267, + "rewards/rejected": -2.530078649520874, + "step": 2650 + }, + { + "epoch": 1.9164265129682998, + "grad_norm": 22.386136161896175, + "learning_rate": 1.7378309345590803e-08, + "logits/chosen": -2.011643409729004, + "logits/rejected": -2.0210494995117188, + "logps/chosen": -1.0864661931991577, + "logps/rejected": -1.2283092737197876, + "loss": 1.1822, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1729323863983154, + "rewards/margins": 0.28368598222732544, + "rewards/rejected": -2.456618547439575, + "step": 2660 + }, + { + "epoch": 1.9236311239193085, + "grad_norm": 20.06093474919507, + "learning_rate": 1.717895078133088e-08, + "logits/chosen": -2.058295726776123, + "logits/rejected": -2.054426670074463, + "logps/chosen": -1.0596580505371094, + "logps/rejected": -1.2001304626464844, + "loss": 1.1836, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1193161010742188, + "rewards/margins": 0.2809443771839142, + "rewards/rejected": -2.4002609252929688, + "step": 2670 + }, + { + "epoch": 1.9308357348703171, + "grad_norm": 21.13851274050523, + "learning_rate": 1.698014200453624e-08, + "logits/chosen": -2.0123298168182373, + "logits/rejected": -2.0198843479156494, + "logps/chosen": -1.0314449071884155, + "logps/rejected": -1.162095308303833, + "loss": 1.1778, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.062889814376831, + "rewards/margins": 0.26130083203315735, + "rewards/rejected": -2.324190616607666, + "step": 2680 + }, + { + "epoch": 1.9380403458213258, + "grad_norm": 24.618506838094923, + "learning_rate": 1.6781896990642964e-08, + "logits/chosen": -1.9457242488861084, + "logits/rejected": -1.9430221319198608, + "logps/chosen": -1.1473594903945923, + "logps/rejected": -1.237646460533142, + "loss": 1.2441, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.2947189807891846, + "rewards/margins": 0.18057429790496826, + "rewards/rejected": -2.475292921066284, + "step": 2690 + }, + { + "epoch": 1.9452449567723344, + "grad_norm": 24.327220053141147, + "learning_rate": 1.658422967545693e-08, + "logits/chosen": -2.047414541244507, + "logits/rejected": -2.0341315269470215, + "logps/chosen": -1.0049512386322021, + "logps/rejected": -1.1183750629425049, + "loss": 1.2203, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -2.0099024772644043, + "rewards/margins": 0.22684772312641144, + "rewards/rejected": -2.2367501258850098, + "step": 2700 + }, + { + "epoch": 1.952449567723343, + "grad_norm": 20.72145176972204, + "learning_rate": 1.638715395417418e-08, + "logits/chosen": -2.023325204849243, + "logits/rejected": -2.0211892127990723, + "logps/chosen": -1.068524956703186, + "logps/rejected": -1.2049812078475952, + "loss": 1.1841, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.137049913406372, + "rewards/margins": 0.2729126214981079, + "rewards/rejected": -2.4099624156951904, + "step": 2710 + }, + { + "epoch": 1.9596541786743515, + "grad_norm": 22.430027131821607, + "learning_rate": 1.619068368040416e-08, + "logits/chosen": -2.0218818187713623, + "logits/rejected": -2.0176615715026855, + "logps/chosen": -1.0006544589996338, + "logps/rejected": -1.178510069847107, + "loss": 1.1299, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0013089179992676, + "rewards/margins": 0.3557109236717224, + "rewards/rejected": -2.357020139694214, + "step": 2720 + }, + { + "epoch": 1.9668587896253602, + "grad_norm": 17.44373356290383, + "learning_rate": 1.5994832665195853e-08, + "logits/chosen": -1.9683250188827515, + "logits/rejected": -1.9688348770141602, + "logps/chosen": -1.0345633029937744, + "logps/rejected": -1.1477251052856445, + "loss": 1.2115, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.069126605987549, + "rewards/margins": 0.22632364928722382, + "rewards/rejected": -2.295450210571289, + "step": 2730 + }, + { + "epoch": 1.9740634005763689, + "grad_norm": 20.229969474249543, + "learning_rate": 1.5799614676066906e-08, + "logits/chosen": -2.069178819656372, + "logits/rejected": -2.0663199424743652, + "logps/chosen": -0.9492548108100891, + "logps/rejected": -1.086529016494751, + "loss": 1.1756, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8985096216201782, + "rewards/margins": 0.2745482623577118, + "rewards/rejected": -2.173058032989502, + "step": 2740 + }, + { + "epoch": 1.9812680115273775, + "grad_norm": 15.993803030514755, + "learning_rate": 1.560504343603587e-08, + "logits/chosen": -1.9762630462646484, + "logits/rejected": -1.9768295288085938, + "logps/chosen": -1.0684893131256104, + "logps/rejected": -1.2240431308746338, + "loss": 1.1605, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1369786262512207, + "rewards/margins": 0.3111076056957245, + "rewards/rejected": -2.4480862617492676, + "step": 2750 + }, + { + "epoch": 1.9884726224783862, + "grad_norm": 18.752561218885347, + "learning_rate": 1.541113262265748e-08, + "logits/chosen": -2.069488286972046, + "logits/rejected": -2.0672833919525146, + "logps/chosen": -1.0279682874679565, + "logps/rejected": -1.1458810567855835, + "loss": 1.2067, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.055936574935913, + "rewards/margins": 0.23582550883293152, + "rewards/rejected": -2.291762113571167, + "step": 2760 + }, + { + "epoch": 1.9956772334293948, + "grad_norm": 25.86621954412604, + "learning_rate": 1.5217895867061227e-08, + "logits/chosen": -2.0054798126220703, + "logits/rejected": -1.9995949268341064, + "logps/chosen": -1.083843469619751, + "logps/rejected": -1.1836421489715576, + "loss": 1.2456, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.167686939239502, + "rewards/margins": 0.1995975375175476, + "rewards/rejected": -2.3672842979431152, + "step": 2770 + }, + { + "epoch": 2.0028818443804033, + "grad_norm": 22.661974621135478, + "learning_rate": 1.5025346752993098e-08, + "logits/chosen": -1.9982630014419556, + "logits/rejected": -1.9999568462371826, + "logps/chosen": -1.072928547859192, + "logps/rejected": -1.1990954875946045, + "loss": 1.2011, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.145857095718384, + "rewards/margins": 0.2523340880870819, + "rewards/rejected": -2.398190975189209, + "step": 2780 + }, + { + "epoch": 2.010086455331412, + "grad_norm": 23.303370032175714, + "learning_rate": 1.4833498815860756e-08, + "logits/chosen": -2.052605390548706, + "logits/rejected": -2.0548830032348633, + "logps/chosen": -0.9998857378959656, + "logps/rejected": -1.1840670108795166, + "loss": 1.1499, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.9997714757919312, + "rewards/margins": 0.3683624267578125, + "rewards/rejected": -2.368134021759033, + "step": 2790 + }, + { + "epoch": 2.0172910662824206, + "grad_norm": 18.168458664217237, + "learning_rate": 1.4642365541781993e-08, + "logits/chosen": -1.9642353057861328, + "logits/rejected": -1.9557300806045532, + "logps/chosen": -1.0272754430770874, + "logps/rejected": -1.1926220655441284, + "loss": 1.1518, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.054550886154175, + "rewards/margins": 0.33069342374801636, + "rewards/rejected": -2.385244131088257, + "step": 2800 + }, + { + "epoch": 2.0244956772334293, + "grad_norm": 17.943557689678926, + "learning_rate": 1.4451960366636745e-08, + "logits/chosen": -2.021503448486328, + "logits/rejected": -2.032627582550049, + "logps/chosen": -1.0408049821853638, + "logps/rejected": -1.1747071743011475, + "loss": 1.1831, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0816099643707275, + "rewards/margins": 0.2678046226501465, + "rewards/rejected": -2.349414348602295, + "step": 2810 + }, + { + "epoch": 2.031700288184438, + "grad_norm": 19.245950456686252, + "learning_rate": 1.4262296675122592e-08, + "logits/chosen": -2.014530658721924, + "logits/rejected": -2.0107593536376953, + "logps/chosen": -1.0312591791152954, + "logps/rejected": -1.1913511753082275, + "loss": 1.1526, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.062518358230591, + "rewards/margins": 0.32018426060676575, + "rewards/rejected": -2.382702350616455, + "step": 2820 + }, + { + "epoch": 2.0389048991354466, + "grad_norm": 17.81271779194038, + "learning_rate": 1.407338779981389e-08, + "logits/chosen": -1.9946855306625366, + "logits/rejected": -1.9926750659942627, + "logps/chosen": -0.9136768579483032, + "logps/rejected": -1.0949757099151611, + "loss": 1.1116, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8273537158966064, + "rewards/margins": 0.3625979423522949, + "rewards/rejected": -2.1899514198303223, + "step": 2830 + }, + { + "epoch": 2.0461095100864553, + "grad_norm": 21.372732915554494, + "learning_rate": 1.3885247020224534e-08, + "logits/chosen": -2.0047404766082764, + "logits/rejected": -1.9999935626983643, + "logps/chosen": -1.0015006065368652, + "logps/rejected": -1.1312209367752075, + "loss": 1.1909, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0030012130737305, + "rewards/margins": 0.2594410181045532, + "rewards/rejected": -2.262441873550415, + "step": 2840 + }, + { + "epoch": 2.053314121037464, + "grad_norm": 17.00779028228168, + "learning_rate": 1.369788756187445e-08, + "logits/chosen": -2.0100817680358887, + "logits/rejected": -2.006643056869507, + "logps/chosen": -1.0269567966461182, + "logps/rejected": -1.1220487356185913, + "loss": 1.2348, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -2.0539135932922363, + "rewards/margins": 0.1901838779449463, + "rewards/rejected": -2.2440974712371826, + "step": 2850 + }, + { + "epoch": 2.0605187319884726, + "grad_norm": 18.64801286449761, + "learning_rate": 1.3511322595359925e-08, + "logits/chosen": -2.035876750946045, + "logits/rejected": -2.0276687145233154, + "logps/chosen": -0.9376466870307922, + "logps/rejected": -1.1057026386260986, + "loss": 1.1393, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.8752933740615845, + "rewards/margins": 0.3361119329929352, + "rewards/rejected": -2.2114052772521973, + "step": 2860 + }, + { + "epoch": 2.0677233429394812, + "grad_norm": 17.383929031131085, + "learning_rate": 1.3325565235427716e-08, + "logits/chosen": -2.0277578830718994, + "logits/rejected": -2.026214122772217, + "logps/chosen": -0.982566237449646, + "logps/rejected": -1.1270108222961426, + "loss": 1.1768, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.965132474899292, + "rewards/margins": 0.2888889014720917, + "rewards/rejected": -2.254021644592285, + "step": 2870 + }, + { + "epoch": 2.07492795389049, + "grad_norm": 17.047214261003482, + "learning_rate": 1.3140628540053218e-08, + "logits/chosen": -1.9946644306182861, + "logits/rejected": -1.9967546463012695, + "logps/chosen": -0.9750404357910156, + "logps/rejected": -1.1103841066360474, + "loss": 1.1831, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9500808715820312, + "rewards/margins": 0.2706873416900635, + "rewards/rejected": -2.2207682132720947, + "step": 2880 + }, + { + "epoch": 2.0821325648414986, + "grad_norm": 19.26958654488348, + "learning_rate": 1.2956525509522451e-08, + "logits/chosen": -1.9811160564422607, + "logits/rejected": -1.9807733297348022, + "logps/chosen": -1.111905574798584, + "logps/rejected": -1.2153931856155396, + "loss": 1.2339, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.223811149597168, + "rewards/margins": 0.2069750279188156, + "rewards/rejected": -2.430786371231079, + "step": 2890 + }, + { + "epoch": 2.089337175792507, + "grad_norm": 19.78025222762576, + "learning_rate": 1.2773269085518267e-08, + "logits/chosen": -2.0117239952087402, + "logits/rejected": -2.0130703449249268, + "logps/chosen": -1.0760631561279297, + "logps/rejected": -1.206182599067688, + "loss": 1.1842, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.1521263122558594, + "rewards/margins": 0.2602389454841614, + "rewards/rejected": -2.412365198135376, + "step": 2900 + }, + { + "epoch": 2.096541786743516, + "grad_norm": 20.115808813369917, + "learning_rate": 1.2590872150210574e-08, + "logits/chosen": -2.06766414642334, + "logits/rejected": -2.0607457160949707, + "logps/chosen": -1.0578199625015259, + "logps/rejected": -1.1676528453826904, + "loss": 1.226, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1156399250030518, + "rewards/margins": 0.21966581046581268, + "rewards/rejected": -2.335305690765381, + "step": 2910 + }, + { + "epoch": 2.1037463976945245, + "grad_norm": 20.584492670465366, + "learning_rate": 1.2409347525350775e-08, + "logits/chosen": -2.0331482887268066, + "logits/rejected": -2.02323842048645, + "logps/chosen": -1.1097338199615479, + "logps/rejected": -1.2564305067062378, + "loss": 1.1674, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.2194676399230957, + "rewards/margins": 0.2933935225009918, + "rewards/rejected": -2.5128610134124756, + "step": 2920 + }, + { + "epoch": 2.110951008645533, + "grad_norm": 22.815499062996494, + "learning_rate": 1.2228707971370421e-08, + "logits/chosen": -2.0209853649139404, + "logits/rejected": -2.0140042304992676, + "logps/chosen": -0.9935145378112793, + "logps/rejected": -1.105963945388794, + "loss": 1.2266, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9870290756225586, + "rewards/margins": 0.22489885985851288, + "rewards/rejected": -2.211927890777588, + "step": 2930 + }, + { + "epoch": 2.118155619596542, + "grad_norm": 21.166376348681847, + "learning_rate": 1.2048966186484282e-08, + "logits/chosen": -2.017411708831787, + "logits/rejected": -2.0006022453308105, + "logps/chosen": -1.1170910596847534, + "logps/rejected": -1.2319118976593018, + "loss": 1.2115, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.234182119369507, + "rewards/margins": 0.2296416014432907, + "rewards/rejected": -2.4638237953186035, + "step": 2940 + }, + { + "epoch": 2.1253602305475505, + "grad_norm": 28.013077396296502, + "learning_rate": 1.187013480579762e-08, + "logits/chosen": -2.0150485038757324, + "logits/rejected": -2.0178308486938477, + "logps/chosen": -1.0425622463226318, + "logps/rejected": -1.1764047145843506, + "loss": 1.2005, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0851244926452637, + "rewards/margins": 0.2676849961280823, + "rewards/rejected": -2.352809429168701, + "step": 2950 + }, + { + "epoch": 2.132564841498559, + "grad_norm": 39.69497918705247, + "learning_rate": 1.1692226400418073e-08, + "logits/chosen": -1.9483245611190796, + "logits/rejected": -1.9468435049057007, + "logps/chosen": -1.0812904834747314, + "logps/rejected": -1.211723804473877, + "loss": 1.2174, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.162580966949463, + "rewards/margins": 0.2608664035797119, + "rewards/rejected": -2.423447608947754, + "step": 2960 + }, + { + "epoch": 2.139769452449568, + "grad_norm": 16.186051185633424, + "learning_rate": 1.1515253476571923e-08, + "logits/chosen": -1.9795656204223633, + "logits/rejected": -1.9738327264785767, + "logps/chosen": -1.0102039575576782, + "logps/rejected": -1.191947102546692, + "loss": 1.1203, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.0204079151153564, + "rewards/margins": 0.36348623037338257, + "rewards/rejected": -2.383894205093384, + "step": 2970 + }, + { + "epoch": 2.1469740634005765, + "grad_norm": 19.900013908570074, + "learning_rate": 1.133922847472496e-08, + "logits/chosen": -2.0021681785583496, + "logits/rejected": -2.002943515777588, + "logps/chosen": -1.1099964380264282, + "logps/rejected": -1.2084180116653442, + "loss": 1.2521, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.2199928760528564, + "rewards/margins": 0.1968432366847992, + "rewards/rejected": -2.4168360233306885, + "step": 2980 + }, + { + "epoch": 2.154178674351585, + "grad_norm": 22.72131090745606, + "learning_rate": 1.1164163768707952e-08, + "logits/chosen": -2.0033812522888184, + "logits/rejected": -1.9982162714004517, + "logps/chosen": -1.0048575401306152, + "logps/rejected": -1.1429532766342163, + "loss": 1.1846, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.0097150802612305, + "rewards/margins": 0.2761916518211365, + "rewards/rejected": -2.2859065532684326, + "step": 2990 + }, + { + "epoch": 2.161383285302594, + "grad_norm": 17.87155121185044, + "learning_rate": 1.0990071664846861e-08, + "logits/chosen": -1.9833234548568726, + "logits/rejected": -1.9822227954864502, + "logps/chosen": -1.0197571516036987, + "logps/rejected": -1.198885202407837, + "loss": 1.1594, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0395143032073975, + "rewards/margins": 0.35825610160827637, + "rewards/rejected": -2.397770404815674, + "step": 3000 + }, + { + "epoch": 2.1685878962536025, + "grad_norm": 18.894758055694265, + "learning_rate": 1.0816964401097739e-08, + "logits/chosen": -1.9618886709213257, + "logits/rejected": -1.9587116241455078, + "logps/chosen": -0.9558318853378296, + "logps/rejected": -1.0796352624893188, + "loss": 1.205, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.9116637706756592, + "rewards/margins": 0.24760663509368896, + "rewards/rejected": -2.1592705249786377, + "step": 3010 + }, + { + "epoch": 2.175792507204611, + "grad_norm": 19.906085929832862, + "learning_rate": 1.0644854146186406e-08, + "logits/chosen": -2.025203227996826, + "logits/rejected": -2.0189619064331055, + "logps/chosen": -1.024359107017517, + "logps/rejected": -1.1828162670135498, + "loss": 1.1636, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.048718214035034, + "rewards/margins": 0.31691429018974304, + "rewards/rejected": -2.3656325340270996, + "step": 3020 + }, + { + "epoch": 2.18299711815562, + "grad_norm": 19.050052296968172, + "learning_rate": 1.0473752998753114e-08, + "logits/chosen": -2.0075266361236572, + "logits/rejected": -1.9991521835327148, + "logps/chosen": -1.0193443298339844, + "logps/rejected": -1.1795679330825806, + "loss": 1.1534, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0386886596679688, + "rewards/margins": 0.3204469382762909, + "rewards/rejected": -2.359135866165161, + "step": 3030 + }, + { + "epoch": 2.1902017291066285, + "grad_norm": 19.33190356248751, + "learning_rate": 1.030367298650201e-08, + "logits/chosen": -2.0206215381622314, + "logits/rejected": -2.0204710960388184, + "logps/chosen": -1.039102554321289, + "logps/rejected": -1.191624402999878, + "loss": 1.1575, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.078205108642578, + "rewards/margins": 0.30504345893859863, + "rewards/rejected": -2.383248805999756, + "step": 3040 + }, + { + "epoch": 2.1974063400576367, + "grad_norm": 21.9617981131132, + "learning_rate": 1.0134626065355675e-08, + "logits/chosen": -2.074868679046631, + "logits/rejected": -2.071895122528076, + "logps/chosen": -1.0231993198394775, + "logps/rejected": -1.1662580966949463, + "loss": 1.1883, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.046398639678955, + "rewards/margins": 0.2861180305480957, + "rewards/rejected": -2.3325161933898926, + "step": 3050 + }, + { + "epoch": 2.2046109510086453, + "grad_norm": 19.55623294249282, + "learning_rate": 9.966624118614611e-09, + "logits/chosen": -2.013423442840576, + "logits/rejected": -2.0084660053253174, + "logps/chosen": -1.0631778240203857, + "logps/rejected": -1.2087138891220093, + "loss": 1.1876, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.1263556480407715, + "rewards/margins": 0.2910720705986023, + "rewards/rejected": -2.4174277782440186, + "step": 3060 + }, + { + "epoch": 2.211815561959654, + "grad_norm": 14.867171520984334, + "learning_rate": 9.799678956121976e-09, + "logits/chosen": -1.970645546913147, + "logits/rejected": -1.9662506580352783, + "logps/chosen": -1.030286431312561, + "logps/rejected": -1.1386739015579224, + "loss": 1.2008, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.060572862625122, + "rewards/margins": 0.21677501499652863, + "rewards/rejected": -2.2773478031158447, + "step": 3070 + }, + { + "epoch": 2.2190201729106627, + "grad_norm": 23.6722369037589, + "learning_rate": 9.633802313433314e-09, + "logits/chosen": -1.942859411239624, + "logits/rejected": -1.9486020803451538, + "logps/chosen": -1.0193486213684082, + "logps/rejected": -1.1251273155212402, + "loss": 1.2058, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0386972427368164, + "rewards/margins": 0.21155771613121033, + "rewards/rejected": -2.2502546310424805, + "step": 3080 + }, + { + "epoch": 2.2262247838616713, + "grad_norm": 20.75677230247985, + "learning_rate": 9.469005850991705e-09, + "logits/chosen": -2.0128586292266846, + "logits/rejected": -2.0072181224823, + "logps/chosen": -1.0143338441848755, + "logps/rejected": -1.1310632228851318, + "loss": 1.2348, + "rewards/accuracies": 0.53125, + "rewards/chosen": -2.028667688369751, + "rewards/margins": 0.23345866799354553, + "rewards/rejected": -2.2621264457702637, + "step": 3090 + }, + { + "epoch": 2.23342939481268, + "grad_norm": 18.746796539063972, + "learning_rate": 9.305301153307949e-09, + "logits/chosen": -2.0094423294067383, + "logits/rejected": -2.017090320587158, + "logps/chosen": -0.9450882077217102, + "logps/rejected": -1.1097722053527832, + "loss": 1.157, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.8901764154434204, + "rewards/margins": 0.32936811447143555, + "rewards/rejected": -2.2195444107055664, + "step": 3100 + }, + { + "epoch": 2.2406340057636887, + "grad_norm": 18.096061545612795, + "learning_rate": 9.142699728146336e-09, + "logits/chosen": -1.9791491031646729, + "logits/rejected": -1.9722025394439697, + "logps/chosen": -1.0312968492507935, + "logps/rejected": -1.163287878036499, + "loss": 1.2018, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.062593698501587, + "rewards/margins": 0.26398202776908875, + "rewards/rejected": -2.326575756072998, + "step": 3110 + }, + { + "epoch": 2.2478386167146973, + "grad_norm": 16.703535569291137, + "learning_rate": 8.981213005715627e-09, + "logits/chosen": -2.0036609172821045, + "logits/rejected": -2.006706714630127, + "logps/chosen": -0.9921053647994995, + "logps/rejected": -1.1647652387619019, + "loss": 1.1486, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.984210729598999, + "rewards/margins": 0.34531962871551514, + "rewards/rejected": -2.3295304775238037, + "step": 3120 + }, + { + "epoch": 2.255043227665706, + "grad_norm": 21.958272583090416, + "learning_rate": 8.820852337865611e-09, + "logits/chosen": -2.0320816040039062, + "logits/rejected": -2.028298854827881, + "logps/chosen": -0.9958044290542603, + "logps/rejected": -1.1435072422027588, + "loss": 1.1727, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9916088581085205, + "rewards/margins": 0.29540568590164185, + "rewards/rejected": -2.2870144844055176, + "step": 3130 + }, + { + "epoch": 2.2622478386167146, + "grad_norm": 17.08140631103118, + "learning_rate": 8.661628997289044e-09, + "logits/chosen": -1.974020004272461, + "logits/rejected": -1.9698307514190674, + "logps/chosen": -1.0156090259552002, + "logps/rejected": -1.1709401607513428, + "loss": 1.169, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.0312180519104004, + "rewards/margins": 0.31066226959228516, + "rewards/rejected": -2.3418803215026855, + "step": 3140 + }, + { + "epoch": 2.2694524495677233, + "grad_norm": 16.417561627034264, + "learning_rate": 8.503554176729341e-09, + "logits/chosen": -1.9732913970947266, + "logits/rejected": -1.9715712070465088, + "logps/chosen": -1.0267970561981201, + "logps/rejected": -1.1859861612319946, + "loss": 1.1702, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0535941123962402, + "rewards/margins": 0.3183782696723938, + "rewards/rejected": -2.3719723224639893, + "step": 3150 + }, + { + "epoch": 2.276657060518732, + "grad_norm": 24.932055477370433, + "learning_rate": 8.346638988193636e-09, + "logits/chosen": -1.9996095895767212, + "logits/rejected": -1.9946562051773071, + "logps/chosen": -0.9252532720565796, + "logps/rejected": -1.0754306316375732, + "loss": 1.1774, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.8505065441131592, + "rewards/margins": 0.30035486817359924, + "rewards/rejected": -2.1508612632751465, + "step": 3160 + }, + { + "epoch": 2.2838616714697406, + "grad_norm": 23.217906478731962, + "learning_rate": 8.19089446217176e-09, + "logits/chosen": -1.9767364263534546, + "logits/rejected": -1.9664764404296875, + "logps/chosen": -1.002582311630249, + "logps/rejected": -1.1916791200637817, + "loss": 1.121, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.005164623260498, + "rewards/margins": 0.378193199634552, + "rewards/rejected": -2.3833582401275635, + "step": 3170 + }, + { + "epoch": 2.2910662824207493, + "grad_norm": 17.141991975793108, + "learning_rate": 8.036331546860777e-09, + "logits/chosen": -1.9834896326065063, + "logits/rejected": -1.9829978942871094, + "logps/chosen": -0.950110912322998, + "logps/rejected": -1.0394455194473267, + "loss": 1.2456, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.900221824645996, + "rewards/margins": 0.17866934835910797, + "rewards/rejected": -2.0788910388946533, + "step": 3180 + }, + { + "epoch": 2.298270893371758, + "grad_norm": 23.516146286428995, + "learning_rate": 7.882961107395416e-09, + "logits/chosen": -1.9969555139541626, + "logits/rejected": -1.9910898208618164, + "logps/chosen": -1.130173683166504, + "logps/rejected": -1.1781437397003174, + "loss": 1.3138, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.260347366333008, + "rewards/margins": 0.09594009816646576, + "rewards/rejected": -2.3562874794006348, + "step": 3190 + }, + { + "epoch": 2.3054755043227666, + "grad_norm": 25.884981502834442, + "learning_rate": 7.73079392508428e-09, + "logits/chosen": -1.966968297958374, + "logits/rejected": -1.9663887023925781, + "logps/chosen": -1.0911871194839478, + "logps/rejected": -1.2788745164871216, + "loss": 1.1537, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1823742389678955, + "rewards/margins": 0.3753744959831238, + "rewards/rejected": -2.557749032974243, + "step": 3200 + }, + { + "epoch": 2.3126801152737753, + "grad_norm": 21.445267273811133, + "learning_rate": 7.579840696651938e-09, + "logits/chosen": -1.9995644092559814, + "logits/rejected": -1.9965426921844482, + "logps/chosen": -1.0473155975341797, + "logps/rejected": -1.171209692955017, + "loss": 1.2093, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0946311950683594, + "rewards/margins": 0.2477881908416748, + "rewards/rejected": -2.342419385910034, + "step": 3210 + }, + { + "epoch": 2.319884726224784, + "grad_norm": 21.167143894724592, + "learning_rate": 7.43011203348704e-09, + "logits/chosen": -1.9143447875976562, + "logits/rejected": -1.911238431930542, + "logps/chosen": -1.0506112575531006, + "logps/rejected": -1.1263036727905273, + "loss": 1.2681, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.101222515106201, + "rewards/margins": 0.15138480067253113, + "rewards/rejected": -2.2526073455810547, + "step": 3220 + }, + { + "epoch": 2.3270893371757926, + "grad_norm": 18.7122492381894, + "learning_rate": 7.281618460896344e-09, + "logits/chosen": -1.994127869606018, + "logits/rejected": -1.9916470050811768, + "logps/chosen": -0.9652446508407593, + "logps/rejected": -1.1070702075958252, + "loss": 1.1731, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9304893016815186, + "rewards/margins": 0.2836512625217438, + "rewards/rejected": -2.2141404151916504, + "step": 3230 + }, + { + "epoch": 2.3342939481268012, + "grad_norm": 20.398678891005122, + "learning_rate": 7.134370417364849e-09, + "logits/chosen": -1.9642471075057983, + "logits/rejected": -1.9637800455093384, + "logps/chosen": -1.0007370710372925, + "logps/rejected": -1.1398149728775024, + "loss": 1.2023, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.001474142074585, + "rewards/margins": 0.27815574407577515, + "rewards/rejected": -2.279629945755005, + "step": 3240 + }, + { + "epoch": 2.34149855907781, + "grad_norm": 23.240945165202064, + "learning_rate": 6.988378253821981e-09, + "logits/chosen": -1.9668891429901123, + "logits/rejected": -1.9658010005950928, + "logps/chosen": -1.0259102582931519, + "logps/rejected": -1.1435902118682861, + "loss": 1.2094, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.0518205165863037, + "rewards/margins": 0.2353595793247223, + "rewards/rejected": -2.2871804237365723, + "step": 3250 + }, + { + "epoch": 2.3487031700288186, + "grad_norm": 20.274147530632312, + "learning_rate": 6.8436522329140186e-09, + "logits/chosen": -1.9758933782577515, + "logits/rejected": -1.9824403524398804, + "logps/chosen": -1.0337318181991577, + "logps/rejected": -1.1588385105133057, + "loss": 1.2104, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.0674636363983154, + "rewards/margins": 0.25021329522132874, + "rewards/rejected": -2.3176770210266113, + "step": 3260 + }, + { + "epoch": 2.3559077809798272, + "grad_norm": 21.874255456099494, + "learning_rate": 6.700202528282603e-09, + "logits/chosen": -1.977266550064087, + "logits/rejected": -1.9675136804580688, + "logps/chosen": -1.0283175706863403, + "logps/rejected": -1.1439546346664429, + "loss": 1.2152, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0566351413726807, + "rewards/margins": 0.2312743216753006, + "rewards/rejected": -2.2879092693328857, + "step": 3270 + }, + { + "epoch": 2.363112391930836, + "grad_norm": 21.384973629502426, + "learning_rate": 6.558039223849668e-09, + "logits/chosen": -2.0306007862091064, + "logits/rejected": -2.0210115909576416, + "logps/chosen": -1.036071538925171, + "logps/rejected": -1.2443821430206299, + "loss": 1.1148, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.072143077850342, + "rewards/margins": 0.41662105917930603, + "rewards/rejected": -2.4887642860412598, + "step": 3280 + }, + { + "epoch": 2.3703170028818445, + "grad_norm": 22.082670294571745, + "learning_rate": 6.417172313108471e-09, + "logits/chosen": -1.9587681293487549, + "logits/rejected": -1.9533681869506836, + "logps/chosen": -0.9850085973739624, + "logps/rejected": -1.1136410236358643, + "loss": 1.1988, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9700171947479248, + "rewards/margins": 0.2572648227214813, + "rewards/rejected": -2.2272820472717285, + "step": 3290 + }, + { + "epoch": 2.377521613832853, + "grad_norm": 21.566072441690903, + "learning_rate": 6.277611698421179e-09, + "logits/chosen": -2.0187790393829346, + "logits/rejected": -2.010676383972168, + "logps/chosen": -0.9041236042976379, + "logps/rejected": -1.0975037813186646, + "loss": 1.1246, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.8082472085952759, + "rewards/margins": 0.3867604732513428, + "rewards/rejected": -2.195007562637329, + "step": 3300 + }, + { + "epoch": 2.3847262247838614, + "grad_norm": 22.686014286093737, + "learning_rate": 6.139367190322714e-09, + "logits/chosen": -2.0076019763946533, + "logits/rejected": -2.0073728561401367, + "logps/chosen": -1.0593101978302002, + "logps/rejected": -1.2181084156036377, + "loss": 1.1614, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.1186203956604004, + "rewards/margins": 0.3175966143608093, + "rewards/rejected": -2.4362168312072754, + "step": 3310 + }, + { + "epoch": 2.39193083573487, + "grad_norm": 17.411733324996547, + "learning_rate": 6.002448506831171e-09, + "logits/chosen": -2.00325083732605, + "logits/rejected": -1.9984287023544312, + "logps/chosen": -0.9814065098762512, + "logps/rejected": -1.124678134918213, + "loss": 1.1735, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.9628130197525024, + "rewards/margins": 0.2865433394908905, + "rewards/rejected": -2.249356269836426, + "step": 3320 + }, + { + "epoch": 2.3991354466858787, + "grad_norm": 18.206116423091125, + "learning_rate": 5.866865272764607e-09, + "logits/chosen": -2.0245230197906494, + "logits/rejected": -2.02435040473938, + "logps/chosen": -1.016035795211792, + "logps/rejected": -1.1609398126602173, + "loss": 1.1763, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.032071590423584, + "rewards/margins": 0.2898081839084625, + "rewards/rejected": -2.3218796253204346, + "step": 3330 + }, + { + "epoch": 2.4063400576368874, + "grad_norm": 23.272555194691343, + "learning_rate": 5.7326270190645595e-09, + "logits/chosen": -1.900092363357544, + "logits/rejected": -1.901489496231079, + "logps/chosen": -1.0590754747390747, + "logps/rejected": -1.1687304973602295, + "loss": 1.2178, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.1181509494781494, + "rewards/margins": 0.2193101942539215, + "rewards/rejected": -2.337460994720459, + "step": 3340 + }, + { + "epoch": 2.413544668587896, + "grad_norm": 18.379780103310328, + "learning_rate": 5.599743182125938e-09, + "logits/chosen": -2.0489888191223145, + "logits/rejected": -2.048907518386841, + "logps/chosen": -1.04793381690979, + "logps/rejected": -1.1847532987594604, + "loss": 1.1791, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.09586763381958, + "rewards/margins": 0.2736392021179199, + "rewards/rejected": -2.369506597518921, + "step": 3350 + }, + { + "epoch": 2.4207492795389047, + "grad_norm": 20.327364402710476, + "learning_rate": 5.46822310313379e-09, + "logits/chosen": -2.0488951206207275, + "logits/rejected": -2.05851411819458, + "logps/chosen": -1.0903593301773071, + "logps/rejected": -1.1954041719436646, + "loss": 1.2347, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.1807186603546143, + "rewards/margins": 0.21008984744548798, + "rewards/rejected": -2.390808343887329, + "step": 3360 + }, + { + "epoch": 2.4279538904899134, + "grad_norm": 20.74526034057685, + "learning_rate": 5.33807602740658e-09, + "logits/chosen": -2.0205600261688232, + "logits/rejected": -2.0137457847595215, + "logps/chosen": -0.9561742544174194, + "logps/rejected": -1.1597832441329956, + "loss": 1.1117, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9123485088348389, + "rewards/margins": 0.4072180390357971, + "rewards/rejected": -2.319566488265991, + "step": 3370 + }, + { + "epoch": 2.435158501440922, + "grad_norm": 21.143223128559057, + "learning_rate": 5.209311103746334e-09, + "logits/chosen": -2.000640869140625, + "logits/rejected": -2.0010857582092285, + "logps/chosen": -1.0521432161331177, + "logps/rejected": -1.224646806716919, + "loss": 1.1583, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.1042864322662354, + "rewards/margins": 0.34500715136528015, + "rewards/rejected": -2.449293613433838, + "step": 3380 + }, + { + "epoch": 2.4423631123919307, + "grad_norm": 24.24368097300932, + "learning_rate": 5.081937383795484e-09, + "logits/chosen": -1.9737918376922607, + "logits/rejected": -1.9732027053833008, + "logps/chosen": -0.9712947010993958, + "logps/rejected": -1.1367398500442505, + "loss": 1.1475, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.9425894021987915, + "rewards/margins": 0.3308902680873871, + "rewards/rejected": -2.273479700088501, + "step": 3390 + }, + { + "epoch": 2.4495677233429394, + "grad_norm": 18.631728815748932, + "learning_rate": 4.955963821400599e-09, + "logits/chosen": -2.028813123703003, + "logits/rejected": -2.0230822563171387, + "logps/chosen": -1.0294839143753052, + "logps/rejected": -1.166856288909912, + "loss": 1.1931, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.0589678287506104, + "rewards/margins": 0.2747448980808258, + "rewards/rejected": -2.333712577819824, + "step": 3400 + }, + { + "epoch": 2.456772334293948, + "grad_norm": 15.405415975081304, + "learning_rate": 4.831399271982928e-09, + "logits/chosen": -1.9567426443099976, + "logits/rejected": -1.9487731456756592, + "logps/chosen": -1.0431255102157593, + "logps/rejected": -1.1727097034454346, + "loss": 1.211, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.0862510204315186, + "rewards/margins": 0.2591683268547058, + "rewards/rejected": -2.345419406890869, + "step": 3410 + }, + { + "epoch": 2.4639769452449567, + "grad_norm": 24.975880962434086, + "learning_rate": 4.708252491915951e-09, + "logits/chosen": -2.0264954566955566, + "logits/rejected": -2.0203347206115723, + "logps/chosen": -1.0453736782073975, + "logps/rejected": -1.1925294399261475, + "loss": 1.1967, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.090747356414795, + "rewards/margins": 0.29431161284446716, + "rewards/rejected": -2.385058879852295, + "step": 3420 + }, + { + "epoch": 2.4711815561959654, + "grad_norm": 25.25311706276655, + "learning_rate": 4.58653213790981e-09, + "logits/chosen": -2.009765863418579, + "logits/rejected": -2.0017480850219727, + "logps/chosen": -1.0253998041152954, + "logps/rejected": -1.1738344430923462, + "loss": 1.1794, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.050799608230591, + "rewards/margins": 0.2968693673610687, + "rewards/rejected": -2.3476688861846924, + "step": 3430 + }, + { + "epoch": 2.478386167146974, + "grad_norm": 18.143302706074053, + "learning_rate": 4.466246766402773e-09, + "logits/chosen": -1.989457130432129, + "logits/rejected": -1.9831485748291016, + "logps/chosen": -1.039151906967163, + "logps/rejected": -1.1928648948669434, + "loss": 1.1832, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.078303813934326, + "rewards/margins": 0.307425856590271, + "rewards/rejected": -2.3857297897338867, + "step": 3440 + }, + { + "epoch": 2.4855907780979827, + "grad_norm": 22.18491810055598, + "learning_rate": 4.347404832959775e-09, + "logits/chosen": -2.036830425262451, + "logits/rejected": -2.0370731353759766, + "logps/chosen": -1.0329066514968872, + "logps/rejected": -1.192126989364624, + "loss": 1.1623, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0658133029937744, + "rewards/margins": 0.31844058632850647, + "rewards/rejected": -2.384253978729248, + "step": 3450 + }, + { + "epoch": 2.4927953890489913, + "grad_norm": 33.05603885847282, + "learning_rate": 4.230014691678016e-09, + "logits/chosen": -1.9939508438110352, + "logits/rejected": -1.9945671558380127, + "logps/chosen": -1.0595440864562988, + "logps/rejected": -1.1260967254638672, + "loss": 1.2725, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.1190881729125977, + "rewards/margins": 0.1331052929162979, + "rewards/rejected": -2.2521934509277344, + "step": 3460 + }, + { + "epoch": 2.5, + "grad_norm": 17.95056296756793, + "learning_rate": 4.114084594599707e-09, + "logits/chosen": -1.9955661296844482, + "logits/rejected": -1.9954122304916382, + "logps/chosen": -1.0110971927642822, + "logps/rejected": -1.2275745868682861, + "loss": 1.1027, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.0221943855285645, + "rewards/margins": 0.4329546391963959, + "rewards/rejected": -2.4551491737365723, + "step": 3470 + }, + { + "epoch": 2.5072046109510087, + "grad_norm": 22.46057751431382, + "learning_rate": 3.9996226911319546e-09, + "logits/chosen": -1.9920648336410522, + "logits/rejected": -1.97979736328125, + "logps/chosen": -1.015860915184021, + "logps/rejected": -1.1448417901992798, + "loss": 1.1908, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.031721830368042, + "rewards/margins": 0.25796186923980713, + "rewards/rejected": -2.2896835803985596, + "step": 3480 + }, + { + "epoch": 2.5144092219020173, + "grad_norm": 17.84685371740395, + "learning_rate": 3.886637027473949e-09, + "logits/chosen": -2.003864049911499, + "logits/rejected": -2.0060501098632812, + "logps/chosen": -1.0762816667556763, + "logps/rejected": -1.239137887954712, + "loss": 1.1561, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1525633335113525, + "rewards/margins": 0.32571229338645935, + "rewards/rejected": -2.478275775909424, + "step": 3490 + }, + { + "epoch": 2.521613832853026, + "grad_norm": 19.29855967109395, + "learning_rate": 3.775135546051295e-09, + "logits/chosen": -1.9411065578460693, + "logits/rejected": -1.9420562982559204, + "logps/chosen": -1.0252829790115356, + "logps/rejected": -1.1507127285003662, + "loss": 1.1987, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0505659580230713, + "rewards/margins": 0.250859797000885, + "rewards/rejected": -2.3014254570007324, + "step": 3500 + }, + { + "epoch": 2.5288184438040346, + "grad_norm": 23.2883981469754, + "learning_rate": 3.665126084957723e-09, + "logits/chosen": -1.9881235361099243, + "logits/rejected": -1.9923149347305298, + "logps/chosen": -1.1336690187454224, + "logps/rejected": -1.2319626808166504, + "loss": 1.2595, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.2673380374908447, + "rewards/margins": 0.19658716022968292, + "rewards/rejected": -2.463925361633301, + "step": 3510 + }, + { + "epoch": 2.5360230547550433, + "grad_norm": 19.49687492971959, + "learning_rate": 3.556616377404101e-09, + "logits/chosen": -2.005202531814575, + "logits/rejected": -2.002882719039917, + "logps/chosen": -1.078424096107483, + "logps/rejected": -1.2361047267913818, + "loss": 1.1542, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.156848192214966, + "rewards/margins": 0.3153611719608307, + "rewards/rejected": -2.4722094535827637, + "step": 3520 + }, + { + "epoch": 2.543227665706052, + "grad_norm": 19.79099303892538, + "learning_rate": 3.4496140511748125e-09, + "logits/chosen": -1.9998537302017212, + "logits/rejected": -1.9945876598358154, + "logps/chosen": -1.054868459701538, + "logps/rejected": -1.198381781578064, + "loss": 1.176, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.109736919403076, + "rewards/margins": 0.2870263457298279, + "rewards/rejected": -2.396763563156128, + "step": 3530 + }, + { + "epoch": 2.5504322766570606, + "grad_norm": 31.21393810269857, + "learning_rate": 3.3441266280915427e-09, + "logits/chosen": -1.9868720769882202, + "logits/rejected": -1.9875354766845703, + "logps/chosen": -1.0944864749908447, + "logps/rejected": -1.2088205814361572, + "loss": 1.2139, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.1889729499816895, + "rewards/margins": 0.2286679744720459, + "rewards/rejected": -2.4176411628723145, + "step": 3540 + }, + { + "epoch": 2.5576368876080693, + "grad_norm": 23.58461815941298, + "learning_rate": 3.2401615234845693e-09, + "logits/chosen": -2.00813627243042, + "logits/rejected": -2.0022683143615723, + "logps/chosen": -1.092397928237915, + "logps/rejected": -1.235439658164978, + "loss": 1.1899, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.18479585647583, + "rewards/margins": 0.28608375787734985, + "rewards/rejected": -2.470879316329956, + "step": 3550 + }, + { + "epoch": 2.564841498559078, + "grad_norm": 16.081046688546003, + "learning_rate": 3.1377260456714375e-09, + "logits/chosen": -1.8945989608764648, + "logits/rejected": -1.8861172199249268, + "logps/chosen": -1.060025930404663, + "logps/rejected": -1.2014561891555786, + "loss": 1.1701, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.120051860809326, + "rewards/margins": 0.2828606069087982, + "rewards/rejected": -2.4029123783111572, + "step": 3560 + }, + { + "epoch": 2.5720461095100866, + "grad_norm": 17.90788569435218, + "learning_rate": 3.0368273954432698e-09, + "logits/chosen": -2.0311574935913086, + "logits/rejected": -2.022732973098755, + "logps/chosen": -1.0490517616271973, + "logps/rejected": -1.1529314517974854, + "loss": 1.2253, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.0981035232543945, + "rewards/margins": 0.20775911211967468, + "rewards/rejected": -2.3058629035949707, + "step": 3570 + }, + { + "epoch": 2.5792507204610953, + "grad_norm": 17.350027926948087, + "learning_rate": 2.937472665558541e-09, + "logits/chosen": -2.0183825492858887, + "logits/rejected": -2.019484043121338, + "logps/chosen": -1.0362021923065186, + "logps/rejected": -1.1471149921417236, + "loss": 1.2278, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.072404384613037, + "rewards/margins": 0.22182568907737732, + "rewards/rejected": -2.2942299842834473, + "step": 3580 + }, + { + "epoch": 2.586455331412104, + "grad_norm": 21.819876934855408, + "learning_rate": 2.8396688402445053e-09, + "logits/chosen": -2.0643913745880127, + "logits/rejected": -2.0568366050720215, + "logps/chosen": -1.0093636512756348, + "logps/rejected": -1.2179429531097412, + "loss": 1.1053, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.0187273025512695, + "rewards/margins": 0.41715869307518005, + "rewards/rejected": -2.4358859062194824, + "step": 3590 + }, + { + "epoch": 2.5936599423631126, + "grad_norm": 23.978636486056804, + "learning_rate": 2.7434227947062324e-09, + "logits/chosen": -2.0086283683776855, + "logits/rejected": -2.002335548400879, + "logps/chosen": -1.1310678720474243, + "logps/rejected": -1.239127516746521, + "loss": 1.2324, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.2621357440948486, + "rewards/margins": 0.2161194384098053, + "rewards/rejected": -2.478255033493042, + "step": 3600 + }, + { + "epoch": 2.6008645533141213, + "grad_norm": 18.242440029630476, + "learning_rate": 2.6487412946432976e-09, + "logits/chosen": -1.9712591171264648, + "logits/rejected": -1.966138482093811, + "logps/chosen": -1.0688244104385376, + "logps/rejected": -1.2041301727294922, + "loss": 1.1924, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.137648820877075, + "rewards/margins": 0.27061182260513306, + "rewards/rejected": -2.4082603454589844, + "step": 3610 + }, + { + "epoch": 2.60806916426513, + "grad_norm": 22.81379926161724, + "learning_rate": 2.5556309957742024e-09, + "logits/chosen": -1.9811557531356812, + "logits/rejected": -1.9759891033172607, + "logps/chosen": -1.0247745513916016, + "logps/rejected": -1.220990538597107, + "loss": 1.1157, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.049549102783203, + "rewards/margins": 0.39243215322494507, + "rewards/rejected": -2.441981077194214, + "step": 3620 + }, + { + "epoch": 2.6152737752161386, + "grad_norm": 22.699037535274126, + "learning_rate": 2.4640984433684758e-09, + "logits/chosen": -2.03954815864563, + "logits/rejected": -2.040444850921631, + "logps/chosen": -1.1181697845458984, + "logps/rejected": -1.234116554260254, + "loss": 1.2351, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.236339569091797, + "rewards/margins": 0.231893390417099, + "rewards/rejected": -2.468233108520508, + "step": 3630 + }, + { + "epoch": 2.6224783861671472, + "grad_norm": 16.94171934712139, + "learning_rate": 2.3741500717865987e-09, + "logits/chosen": -1.995910882949829, + "logits/rejected": -2.00685715675354, + "logps/chosen": -1.0068349838256836, + "logps/rejected": -1.1508421897888184, + "loss": 1.1796, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.013669967651367, + "rewards/margins": 0.2880145311355591, + "rewards/rejected": -2.3016843795776367, + "step": 3640 + }, + { + "epoch": 2.629682997118156, + "grad_norm": 17.359736896500603, + "learning_rate": 2.285792204027678e-09, + "logits/chosen": -1.9837639331817627, + "logits/rejected": -1.9810622930526733, + "logps/chosen": -1.0128896236419678, + "logps/rejected": -1.2106821537017822, + "loss": 1.1023, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.0257792472839355, + "rewards/margins": 0.3955853283405304, + "rewards/rejected": -2.4213643074035645, + "step": 3650 + }, + { + "epoch": 2.636887608069164, + "grad_norm": 20.82292935112201, + "learning_rate": 2.199031051284972e-09, + "logits/chosen": -2.008237838745117, + "logits/rejected": -2.003821611404419, + "logps/chosen": -1.0695806741714478, + "logps/rejected": -1.1950201988220215, + "loss": 1.2201, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.1391613483428955, + "rewards/margins": 0.25087904930114746, + "rewards/rejected": -2.390040397644043, + "step": 3660 + }, + { + "epoch": 2.6440922190201728, + "grad_norm": 16.889253562953712, + "learning_rate": 2.113872712509254e-09, + "logits/chosen": -1.993787407875061, + "logits/rejected": -1.9862686395645142, + "logps/chosen": -1.1294848918914795, + "logps/rejected": -1.2411291599273682, + "loss": 1.2278, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.258969783782959, + "rewards/margins": 0.22328904271125793, + "rewards/rejected": -2.4822583198547363, + "step": 3670 + }, + { + "epoch": 2.6512968299711814, + "grad_norm": 14.064327131140953, + "learning_rate": 2.0303231739801143e-09, + "logits/chosen": -1.9686027765274048, + "logits/rejected": -1.957910180091858, + "logps/chosen": -1.0182335376739502, + "logps/rejected": -1.1589828729629517, + "loss": 1.182, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.0364670753479004, + "rewards/margins": 0.2814987301826477, + "rewards/rejected": -2.3179657459259033, + "step": 3680 + }, + { + "epoch": 2.65850144092219, + "grad_norm": 23.602606022846373, + "learning_rate": 1.948388308885102e-09, + "logits/chosen": -2.0372962951660156, + "logits/rejected": -2.0287270545959473, + "logps/chosen": -1.063071846961975, + "logps/rejected": -1.1734139919281006, + "loss": 1.2172, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.12614369392395, + "rewards/margins": 0.22068460285663605, + "rewards/rejected": -2.346827983856201, + "step": 3690 + }, + { + "epoch": 2.6657060518731988, + "grad_norm": 25.359683354788714, + "learning_rate": 1.86807387690692e-09, + "logits/chosen": -2.0645687580108643, + "logits/rejected": -2.061300754547119, + "logps/chosen": -1.0886929035186768, + "logps/rejected": -1.2760602235794067, + "loss": 1.1167, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1773858070373535, + "rewards/margins": 0.37473443150520325, + "rewards/rejected": -2.5521204471588135, + "step": 3700 + }, + { + "epoch": 2.6729106628242074, + "grad_norm": 19.291599291445024, + "learning_rate": 1.789385523818493e-09, + "logits/chosen": -2.024766206741333, + "logits/rejected": -2.0262362957000732, + "logps/chosen": -1.0400424003601074, + "logps/rejected": -1.2077550888061523, + "loss": 1.1498, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.080084800720215, + "rewards/margins": 0.33542555570602417, + "rewards/rejected": -2.4155101776123047, + "step": 3710 + }, + { + "epoch": 2.680115273775216, + "grad_norm": 25.661768967793073, + "learning_rate": 1.712328781086131e-09, + "logits/chosen": -2.051741123199463, + "logits/rejected": -2.046497344970703, + "logps/chosen": -1.1223233938217163, + "logps/rejected": -1.2178256511688232, + "loss": 1.2398, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.2446467876434326, + "rewards/margins": 0.19100406765937805, + "rewards/rejected": -2.4356513023376465, + "step": 3720 + }, + { + "epoch": 2.6873198847262247, + "grad_norm": 21.329919320711415, + "learning_rate": 1.6369090654806543e-09, + "logits/chosen": -2.0555293560028076, + "logits/rejected": -2.0489516258239746, + "logps/chosen": -1.0201804637908936, + "logps/rejected": -1.163674235343933, + "loss": 1.1691, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.040360927581787, + "rewards/margins": 0.28698763251304626, + "rewards/rejected": -2.327348470687866, + "step": 3730 + }, + { + "epoch": 2.6945244956772334, + "grad_norm": 19.141258585549746, + "learning_rate": 1.5631316786966498e-09, + "logits/chosen": -1.9855105876922607, + "logits/rejected": -1.978864312171936, + "logps/chosen": -1.0213624238967896, + "logps/rejected": -1.1611801385879517, + "loss": 1.1975, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.042724847793579, + "rewards/margins": 0.27963531017303467, + "rewards/rejected": -2.3223602771759033, + "step": 3740 + }, + { + "epoch": 2.701729106628242, + "grad_norm": 18.560557441799045, + "learning_rate": 1.491001806979772e-09, + "logits/chosen": -2.0349512100219727, + "logits/rejected": -2.028040647506714, + "logps/chosen": -1.0765492916107178, + "logps/rejected": -1.225642442703247, + "loss": 1.1737, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.1530985832214355, + "rewards/margins": 0.2981860637664795, + "rewards/rejected": -2.451284885406494, + "step": 3750 + }, + { + "epoch": 2.7089337175792507, + "grad_norm": 29.217027349904072, + "learning_rate": 1.4205245207621508e-09, + "logits/chosen": -1.9789804220199585, + "logits/rejected": -1.9764864444732666, + "logps/chosen": -1.1173272132873535, + "logps/rejected": -1.2862274646759033, + "loss": 1.1533, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.234654426574707, + "rewards/margins": 0.337800532579422, + "rewards/rejected": -2.5724549293518066, + "step": 3760 + }, + { + "epoch": 2.7161383285302594, + "grad_norm": 17.685903552737507, + "learning_rate": 1.3517047743059978e-09, + "logits/chosen": -2.0163912773132324, + "logits/rejected": -2.0196452140808105, + "logps/chosen": -1.073099136352539, + "logps/rejected": -1.2338473796844482, + "loss": 1.1562, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.146198272705078, + "rewards/margins": 0.3214961588382721, + "rewards/rejected": -2.4676947593688965, + "step": 3770 + }, + { + "epoch": 2.723342939481268, + "grad_norm": 17.086412835946355, + "learning_rate": 1.2845474053553156e-09, + "logits/chosen": -2.011781692504883, + "logits/rejected": -2.007628917694092, + "logps/chosen": -1.0312448740005493, + "logps/rejected": -1.1682651042938232, + "loss": 1.2025, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -2.0624897480010986, + "rewards/margins": 0.27404046058654785, + "rewards/rejected": -2.3365302085876465, + "step": 3780 + }, + { + "epoch": 2.7305475504322767, + "grad_norm": 22.568118558934447, + "learning_rate": 1.2190571347958422e-09, + "logits/chosen": -2.0422775745391846, + "logits/rejected": -2.0436275005340576, + "logps/chosen": -0.9664519429206848, + "logps/rejected": -1.167764663696289, + "loss": 1.1102, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.9329038858413696, + "rewards/margins": 0.40262526273727417, + "rewards/rejected": -2.335529327392578, + "step": 3790 + }, + { + "epoch": 2.7377521613832854, + "grad_norm": 18.280458594650423, + "learning_rate": 1.1552385663231634e-09, + "logits/chosen": -1.9983785152435303, + "logits/rejected": -1.9888427257537842, + "logps/chosen": -1.0933729410171509, + "logps/rejected": -1.1888386011123657, + "loss": 1.2396, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -2.1867458820343018, + "rewards/margins": 0.19093120098114014, + "rewards/rejected": -2.3776772022247314, + "step": 3800 + }, + { + "epoch": 2.744956772334294, + "grad_norm": 19.071247762838464, + "learning_rate": 1.0930961861191302e-09, + "logits/chosen": -1.9584522247314453, + "logits/rejected": -1.9630699157714844, + "logps/chosen": -1.0375924110412598, + "logps/rejected": -1.1800395250320435, + "loss": 1.2, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.0751848220825195, + "rewards/margins": 0.28489404916763306, + "rewards/rejected": -2.360079050064087, + "step": 3810 + }, + { + "epoch": 2.7521613832853027, + "grad_norm": 16.698045888323442, + "learning_rate": 1.0326343625364608e-09, + "logits/chosen": -1.9668670892715454, + "logits/rejected": -1.9615176916122437, + "logps/chosen": -1.040906310081482, + "logps/rejected": -1.2131140232086182, + "loss": 1.1383, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.081812620162964, + "rewards/margins": 0.3444153070449829, + "rewards/rejected": -2.4262280464172363, + "step": 3820 + }, + { + "epoch": 2.7593659942363113, + "grad_norm": 18.443004885087902, + "learning_rate": 9.738573457917066e-10, + "logits/chosen": -2.0455007553100586, + "logits/rejected": -2.0438742637634277, + "logps/chosen": -1.0494908094406128, + "logps/rejected": -1.2397874593734741, + "loss": 1.1107, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0989816188812256, + "rewards/margins": 0.3805932402610779, + "rewards/rejected": -2.4795749187469482, + "step": 3830 + }, + { + "epoch": 2.76657060518732, + "grad_norm": 18.699877901633396, + "learning_rate": 9.16769267666434e-10, + "logits/chosen": -2.012563705444336, + "logits/rejected": -2.0103211402893066, + "logps/chosen": -1.0743488073349, + "logps/rejected": -1.1493780612945557, + "loss": 1.2646, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.1486976146698, + "rewards/margins": 0.1500580608844757, + "rewards/rejected": -2.2987561225891113, + "step": 3840 + }, + { + "epoch": 2.7737752161383287, + "grad_norm": 19.950649690108477, + "learning_rate": 8.613741412168113e-10, + "logits/chosen": -2.024034261703491, + "logits/rejected": -2.023303270339966, + "logps/chosen": -1.0807751417160034, + "logps/rejected": -1.209153413772583, + "loss": 1.1806, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.161550283432007, + "rewards/margins": 0.2567565441131592, + "rewards/rejected": -2.418306827545166, + "step": 3850 + }, + { + "epoch": 2.7809798270893373, + "grad_norm": 19.864173044191876, + "learning_rate": 8.076758604914802e-10, + "logits/chosen": -1.9579622745513916, + "logits/rejected": -1.9533469676971436, + "logps/chosen": -0.9816028475761414, + "logps/rejected": -1.1145174503326416, + "loss": 1.1989, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9632056951522827, + "rewards/margins": 0.2658289074897766, + "rewards/rejected": -2.229034900665283, + "step": 3860 + }, + { + "epoch": 2.7881844380403455, + "grad_norm": 22.85284631990654, + "learning_rate": 7.55678200257856e-10, + "logits/chosen": -1.9850330352783203, + "logits/rejected": -1.9783369302749634, + "logps/chosen": -1.032204031944275, + "logps/rejected": -1.1751976013183594, + "loss": 1.1758, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.06440806388855, + "rewards/margins": 0.28598710894584656, + "rewards/rejected": -2.3503952026367188, + "step": 3870 + }, + { + "epoch": 2.795389048991354, + "grad_norm": 17.089433378243903, + "learning_rate": 7.053848157367315e-10, + "logits/chosen": -2.0007100105285645, + "logits/rejected": -1.9952503442764282, + "logps/chosen": -1.0419762134552002, + "logps/rejected": -1.1898977756500244, + "loss": 1.1845, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0839524269104004, + "rewards/margins": 0.2958431541919708, + "rewards/rejected": -2.379795551300049, + "step": 3880 + }, + { + "epoch": 2.802593659942363, + "grad_norm": 15.839145055220296, + "learning_rate": 6.567992423453794e-10, + "logits/chosen": -2.015761375427246, + "logits/rejected": -2.0144906044006348, + "logps/chosen": -0.9628134965896606, + "logps/rejected": -1.0785901546478271, + "loss": 1.2027, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.9256269931793213, + "rewards/margins": 0.23155340552330017, + "rewards/rejected": -2.1571803092956543, + "step": 3890 + }, + { + "epoch": 2.8097982708933715, + "grad_norm": 19.561769738169332, + "learning_rate": 6.099248954489794e-10, + "logits/chosen": -1.9572585821151733, + "logits/rejected": -1.9550421237945557, + "logps/chosen": -1.0681164264678955, + "logps/rejected": -1.228930115699768, + "loss": 1.157, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.136232852935791, + "rewards/margins": 0.321627140045166, + "rewards/rejected": -2.457860231399536, + "step": 3900 + }, + { + "epoch": 2.81700288184438, + "grad_norm": 22.777244684824645, + "learning_rate": 5.647650701205653e-10, + "logits/chosen": -2.024953842163086, + "logits/rejected": -2.016838550567627, + "logps/chosen": -1.1103287935256958, + "logps/rejected": -1.2667860984802246, + "loss": 1.1763, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.2206575870513916, + "rewards/margins": 0.31291496753692627, + "rewards/rejected": -2.533572196960449, + "step": 3910 + }, + { + "epoch": 2.824207492795389, + "grad_norm": 16.303046104621277, + "learning_rate": 5.213229409093856e-10, + "logits/chosen": -2.0344924926757812, + "logits/rejected": -2.0291943550109863, + "logps/chosen": -1.0526678562164307, + "logps/rejected": -1.1854225397109985, + "loss": 1.1995, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.1053357124328613, + "rewards/margins": 0.2655092179775238, + "rewards/rejected": -2.370845079421997, + "step": 3920 + }, + { + "epoch": 2.8314121037463975, + "grad_norm": 20.917034597157283, + "learning_rate": 4.796015616177401e-10, + "logits/chosen": -1.9968492984771729, + "logits/rejected": -1.9910831451416016, + "logps/chosen": -1.0663249492645264, + "logps/rejected": -1.1775870323181152, + "loss": 1.2153, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.1326498985290527, + "rewards/margins": 0.22252389788627625, + "rewards/rejected": -2.3551740646362305, + "step": 3930 + }, + { + "epoch": 2.838616714697406, + "grad_norm": 16.746348704994205, + "learning_rate": 4.3960386508631595e-10, + "logits/chosen": -1.9347660541534424, + "logits/rejected": -1.9273744821548462, + "logps/chosen": -0.9666959643363953, + "logps/rejected": -1.0863577127456665, + "loss": 1.2257, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.9333919286727905, + "rewards/margins": 0.23932373523712158, + "rewards/rejected": -2.172715425491333, + "step": 3940 + }, + { + "epoch": 2.845821325648415, + "grad_norm": 35.59868847583538, + "learning_rate": 4.013326629880243e-10, + "logits/chosen": -1.9773681163787842, + "logits/rejected": -1.9677212238311768, + "logps/chosen": -1.1061880588531494, + "logps/rejected": -1.2337268590927124, + "loss": 1.2042, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.212376117706299, + "rewards/margins": 0.2550778090953827, + "rewards/rejected": -2.467453718185425, + "step": 3950 + }, + { + "epoch": 2.8530259365994235, + "grad_norm": 19.68089141096437, + "learning_rate": 3.64790645630339e-10, + "logits/chosen": -1.9358896017074585, + "logits/rejected": -1.9352163076400757, + "logps/chosen": -1.0549217462539673, + "logps/rejected": -1.1247824430465698, + "loss": 1.2634, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.1098434925079346, + "rewards/margins": 0.13972175121307373, + "rewards/rejected": -2.2495648860931396, + "step": 3960 + }, + { + "epoch": 2.860230547550432, + "grad_norm": 21.305782765629946, + "learning_rate": 3.2998038176619e-10, + "logits/chosen": -1.9779675006866455, + "logits/rejected": -1.9694792032241821, + "logps/chosen": -1.0569480657577515, + "logps/rejected": -1.1792137622833252, + "loss": 1.2079, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.113896131515503, + "rewards/margins": 0.24453163146972656, + "rewards/rejected": -2.3584275245666504, + "step": 3970 + }, + { + "epoch": 2.867435158501441, + "grad_norm": 20.77606365899596, + "learning_rate": 2.969043184133907e-10, + "logits/chosen": -2.0462427139282227, + "logits/rejected": -2.0448436737060547, + "logps/chosen": -0.9707571268081665, + "logps/rejected": -1.1868783235549927, + "loss": 1.0771, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.941514253616333, + "rewards/margins": 0.4322422444820404, + "rewards/rejected": -2.3737566471099854, + "step": 3980 + }, + { + "epoch": 2.8746397694524495, + "grad_norm": 17.888841507835842, + "learning_rate": 2.6556478068261447e-10, + "logits/chosen": -1.9706792831420898, + "logits/rejected": -1.968033790588379, + "logps/chosen": -0.9736353158950806, + "logps/rejected": -1.1013442277908325, + "loss": 1.2103, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.9472706317901611, + "rewards/margins": 0.25541773438453674, + "rewards/rejected": -2.202688455581665, + "step": 3990 + }, + { + "epoch": 2.881844380403458, + "grad_norm": 20.827982084787728, + "learning_rate": 2.3596397161395607e-10, + "logits/chosen": -2.0459811687469482, + "logits/rejected": -2.0342445373535156, + "logps/chosen": -1.0675297975540161, + "logps/rejected": -1.2322208881378174, + "loss": 1.1585, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.1350595951080322, + "rewards/margins": 0.32938265800476074, + "rewards/rejected": -2.4644417762756348, + "step": 4000 + }, + { + "epoch": 2.889048991354467, + "grad_norm": 26.363176469996105, + "learning_rate": 2.0810397202206399e-10, + "logits/chosen": -1.9519503116607666, + "logits/rejected": -1.9571945667266846, + "logps/chosen": -1.0638792514801025, + "logps/rejected": -1.1936867237091064, + "loss": 1.1899, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.127758502960205, + "rewards/margins": 0.25961530208587646, + "rewards/rejected": -2.387373447418213, + "step": 4010 + }, + { + "epoch": 2.8962536023054755, + "grad_norm": 22.435915119282647, + "learning_rate": 1.819867403498737e-10, + "logits/chosen": -2.0360004901885986, + "logits/rejected": -2.0333070755004883, + "logps/chosen": -1.0682737827301025, + "logps/rejected": -1.1997547149658203, + "loss": 1.2021, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.136547565460205, + "rewards/margins": 0.262962281703949, + "rewards/rejected": -2.3995094299316406, + "step": 4020 + }, + { + "epoch": 2.903458213256484, + "grad_norm": 21.73593175736094, + "learning_rate": 1.5761411253092382e-10, + "logits/chosen": -1.9650490283966064, + "logits/rejected": -1.9548593759536743, + "logps/chosen": -0.9868356585502625, + "logps/rejected": -1.1082721948623657, + "loss": 1.2004, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.973671317100525, + "rewards/margins": 0.24287304282188416, + "rewards/rejected": -2.2165443897247314, + "step": 4030 + }, + { + "epoch": 2.910662824207493, + "grad_norm": 20.195616773045355, + "learning_rate": 1.3498780186031455e-10, + "logits/chosen": -2.0080840587615967, + "logits/rejected": -2.0045909881591797, + "logps/chosen": -1.161108136177063, + "logps/rejected": -1.2804962396621704, + "loss": 1.2261, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -2.322216272354126, + "rewards/margins": 0.23877570033073425, + "rewards/rejected": -2.560992479324341, + "step": 4040 + }, + { + "epoch": 2.9178674351585014, + "grad_norm": 15.481345158642924, + "learning_rate": 1.1410939887425141e-10, + "logits/chosen": -2.0009922981262207, + "logits/rejected": -2.002737522125244, + "logps/chosen": -1.044710397720337, + "logps/rejected": -1.1732409000396729, + "loss": 1.2109, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.089420795440674, + "rewards/margins": 0.2570609450340271, + "rewards/rejected": -2.3464818000793457, + "step": 4050 + }, + { + "epoch": 2.92507204610951, + "grad_norm": 18.467501751315456, + "learning_rate": 9.498037123825686e-11, + "logits/chosen": -2.008939743041992, + "logits/rejected": -2.0055575370788574, + "logps/chosen": -1.0212924480438232, + "logps/rejected": -1.1463903188705444, + "loss": 1.1994, + "rewards/accuracies": 0.59375, + "rewards/chosen": -2.0425848960876465, + "rewards/margins": 0.2501956522464752, + "rewards/rejected": -2.292780637741089, + "step": 4060 + }, + { + "epoch": 2.9322766570605188, + "grad_norm": 21.297612250322334, + "learning_rate": 7.760206364398614e-11, + "logits/chosen": -2.0672459602355957, + "logits/rejected": -2.0643718242645264, + "logps/chosen": -1.0762133598327637, + "logps/rejected": -1.2181751728057861, + "loss": 1.1851, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.1524267196655273, + "rewards/margins": 0.283923476934433, + "rewards/rejected": -2.4363503456115723, + "step": 4070 + }, + { + "epoch": 2.9394812680115274, + "grad_norm": 21.284898715739565, + "learning_rate": 6.19756977147029e-11, + "logits/chosen": -1.9935872554779053, + "logits/rejected": -1.990228295326233, + "logps/chosen": -1.0278215408325195, + "logps/rejected": -1.2337336540222168, + "loss": 1.1105, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.055643081665039, + "rewards/margins": 0.4118243157863617, + "rewards/rejected": -2.4674673080444336, + "step": 4080 + }, + { + "epoch": 2.946685878962536, + "grad_norm": 20.87599043089686, + "learning_rate": 4.810237191940625e-11, + "logits/chosen": -1.976012945175171, + "logits/rejected": -1.97482168674469, + "logps/chosen": -1.038079857826233, + "logps/rejected": -1.1688039302825928, + "loss": 1.2176, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -2.076159715652466, + "rewards/margins": 0.2614482045173645, + "rewards/rejected": -2.3376078605651855, + "step": 4090 + }, + { + "epoch": 2.9538904899135447, + "grad_norm": 20.210725111790943, + "learning_rate": 3.5983061495617476e-11, + "logits/chosen": -2.032045841217041, + "logits/rejected": -2.0321781635284424, + "logps/chosen": -1.1231155395507812, + "logps/rejected": -1.270812749862671, + "loss": 1.1826, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2462310791015625, + "rewards/margins": 0.295394629240036, + "rewards/rejected": -2.541625499725342, + "step": 4100 + }, + { + "epoch": 2.9610951008645534, + "grad_norm": 21.355044904847375, + "learning_rate": 2.5618618380812694e-11, + "logits/chosen": -2.018833637237549, + "logits/rejected": -2.008314371109009, + "logps/chosen": -1.0017220973968506, + "logps/rejected": -1.1643407344818115, + "loss": 1.1725, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.003444194793701, + "rewards/margins": 0.32523733377456665, + "rewards/rejected": -2.328681468963623, + "step": 4110 + }, + { + "epoch": 2.968299711815562, + "grad_norm": 22.742418454002138, + "learning_rate": 1.700977115254576e-11, + "logits/chosen": -1.9938160181045532, + "logits/rejected": -1.9904816150665283, + "logps/chosen": -0.9965242147445679, + "logps/rejected": -1.1439779996871948, + "loss": 1.1688, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.9930484294891357, + "rewards/margins": 0.29490748047828674, + "rewards/rejected": -2.2879559993743896, + "step": 4120 + }, + { + "epoch": 2.9755043227665707, + "grad_norm": 20.73702179826127, + "learning_rate": 1.0157124977230868e-11, + "logits/chosen": -1.9752233028411865, + "logits/rejected": -1.9736427068710327, + "logps/chosen": -0.9688889384269714, + "logps/rejected": -1.117297649383545, + "loss": 1.1686, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.9377778768539429, + "rewards/margins": 0.2968176603317261, + "rewards/rejected": -2.23459529876709, + "step": 4130 + }, + { + "epoch": 2.9827089337175794, + "grad_norm": 21.969537635189976, + "learning_rate": 5.061161567596061e-12, + "logits/chosen": -1.9961363077163696, + "logits/rejected": -1.9917857646942139, + "logps/chosen": -1.0561162233352661, + "logps/rejected": -1.1413224935531616, + "loss": 1.261, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -2.1122324466705322, + "rewards/margins": 0.17041274905204773, + "rewards/rejected": -2.2826449871063232, + "step": 4140 + }, + { + "epoch": 2.989913544668588, + "grad_norm": 20.739431552772885, + "learning_rate": 1.7222391488297406e-12, + "logits/chosen": -2.016247510910034, + "logits/rejected": -2.0124571323394775, + "logps/chosen": -1.1069406270980835, + "logps/rejected": -1.2540051937103271, + "loss": 1.1761, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.213881254196167, + "rewards/margins": 0.2941294014453888, + "rewards/rejected": -2.5080103874206543, + "step": 4150 + }, + { + "epoch": 2.9971181556195967, + "grad_norm": 19.721112564343052, + "learning_rate": 1.4059243338693238e-13, + "logits/chosen": -1.9906642436981201, + "logits/rejected": -1.9835201501846313, + "logps/chosen": -1.0591213703155518, + "logps/rejected": -1.182531714439392, + "loss": 1.1942, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -2.1182427406311035, + "rewards/margins": 0.24682076275348663, + "rewards/rejected": -2.365063428878784, + "step": 4160 + }, + { + "epoch": 3.0, + "step": 4164, + "total_flos": 0.0, + "train_loss": 1.2025406490027275, + "train_runtime": 5488.6376, + "train_samples_per_second": 12.135, + "train_steps_per_second": 0.759 + } + ], + "logging_steps": 10, + "max_steps": 4164, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}