{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 3873, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.5773195876288657e-09, "logits/chosen": -3.9100074768066406, "logits/rejected": -4.447928428649902, "logps/chosen": -252.016845703125, "logps/rejected": -298.87518310546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 2.5773195876288656e-08, "logits/chosen": -5.264719486236572, "logits/rejected": -4.7501540184021, "logps/chosen": -704.29541015625, "logps/rejected": -532.2731323242188, "loss": 0.6952, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": 0.004859171807765961, "rewards/margins": 0.00023437623167410493, "rewards/rejected": 0.004624796565622091, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.154639175257731e-08, "logits/chosen": -5.434407711029053, "logits/rejected": -4.95996618270874, "logps/chosen": -699.14013671875, "logps/rejected": -476.2240295410156, "loss": 0.6952, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.002142244251444936, "rewards/margins": -0.0066454135812819, "rewards/rejected": 0.00878765620291233, "step": 20 }, { "epoch": 0.01, "learning_rate": 7.731958762886598e-08, "logits/chosen": -5.243380546569824, "logits/rejected": -5.211713790893555, "logps/chosen": -525.1171875, "logps/rejected": -423.39312744140625, "loss": 0.6951, "rewards/accuracies": 0.5, "rewards/chosen": 0.003249790519475937, "rewards/margins": 0.000919342041015625, "rewards/rejected": 0.002330448944121599, "step": 30 }, { "epoch": 0.01, "learning_rate": 1.0309278350515462e-07, "logits/chosen": -5.131182670593262, "logits/rejected": -4.265445709228516, "logps/chosen": -661.4071655273438, "logps/rejected": -430.1532287597656, "loss": 0.6947, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.011420822702348232, "rewards/margins": -0.008613145910203457, "rewards/rejected": -0.0028076765593141317, "step": 40 }, { "epoch": 0.01, "learning_rate": 1.2886597938144328e-07, "logits/chosen": -5.016252040863037, "logits/rejected": -5.079930782318115, "logps/chosen": -700.6941528320312, "logps/rejected": -517.4772338867188, "loss": 0.6981, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.0022976198233664036, "rewards/margins": -0.007870988920331001, "rewards/rejected": 0.00557337049394846, "step": 50 }, { "epoch": 0.02, "learning_rate": 1.5463917525773197e-07, "logits/chosen": -4.962490558624268, "logits/rejected": -5.010842323303223, "logps/chosen": -555.6851196289062, "logps/rejected": -501.57110595703125, "loss": 0.6967, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0058050318621098995, "rewards/margins": -0.0007518678903579712, "rewards/rejected": -0.005053164903074503, "step": 60 }, { "epoch": 0.02, "learning_rate": 1.804123711340206e-07, "logits/chosen": -5.370819091796875, "logits/rejected": -5.034182071685791, "logps/chosen": -683.794921875, "logps/rejected": -468.4527893066406, "loss": 0.6901, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0010283368173986673, "rewards/margins": 0.014723362401127815, "rewards/rejected": -0.013695026747882366, "step": 70 }, { "epoch": 0.02, "learning_rate": 2.0618556701030925e-07, "logits/chosen": -4.814556121826172, "logits/rejected": -4.836775779724121, "logps/chosen": -626.3643798828125, "logps/rejected": -469.01177978515625, "loss": 0.6922, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.007337172515690327, "rewards/margins": 0.010705096647143364, "rewards/rejected": -0.0033679225016385317, "step": 80 }, { "epoch": 0.02, "learning_rate": 2.3195876288659794e-07, "logits/chosen": -5.1350202560424805, "logits/rejected": -5.1212358474731445, "logps/chosen": -515.248779296875, "logps/rejected": -433.7506408691406, "loss": 0.6926, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00244722468778491, "rewards/margins": 0.010435061529278755, "rewards/rejected": -0.007987835444509983, "step": 90 }, { "epoch": 0.03, "learning_rate": 2.5773195876288655e-07, "logits/chosen": -5.177114009857178, "logits/rejected": -4.349142551422119, "logps/chosen": -593.7941284179688, "logps/rejected": -424.19696044921875, "loss": 0.6868, "rewards/accuracies": 0.625, "rewards/chosen": -0.0027124141342937946, "rewards/margins": 0.011196794919669628, "rewards/rejected": -0.013909208588302135, "step": 100 }, { "epoch": 0.03, "learning_rate": 2.835051546391752e-07, "logits/chosen": -4.992671012878418, "logits/rejected": -4.795473098754883, "logps/chosen": -564.6749877929688, "logps/rejected": -494.9117126464844, "loss": 0.6847, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.008160188794136047, "rewards/margins": 0.023002145811915398, "rewards/rejected": -0.0148419588804245, "step": 110 }, { "epoch": 0.03, "learning_rate": 3.0927835051546394e-07, "logits/chosen": -5.057103157043457, "logits/rejected": -4.665154457092285, "logps/chosen": -580.5682373046875, "logps/rejected": -467.72369384765625, "loss": 0.6873, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008997360244393349, "rewards/margins": 0.010930529795587063, "rewards/rejected": -0.019927887246012688, "step": 120 }, { "epoch": 0.03, "learning_rate": 3.3505154639175255e-07, "logits/chosen": -5.1313934326171875, "logits/rejected": -4.634444236755371, "logps/chosen": -599.831787109375, "logps/rejected": -448.5379943847656, "loss": 0.6848, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.007190053351223469, "rewards/margins": 0.01588342897593975, "rewards/rejected": -0.008693376556038857, "step": 130 }, { "epoch": 0.04, "learning_rate": 3.608247422680412e-07, "logits/chosen": -5.257201194763184, "logits/rejected": -4.326685905456543, "logps/chosen": -568.3807373046875, "logps/rejected": -408.2413024902344, "loss": 0.6705, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.01816733181476593, "rewards/margins": 0.05539187043905258, "rewards/rejected": -0.03722454234957695, "step": 140 }, { "epoch": 0.04, "learning_rate": 3.865979381443299e-07, "logits/chosen": -5.004805088043213, "logits/rejected": -4.720073699951172, "logps/chosen": -584.5308227539062, "logps/rejected": -476.8841857910156, "loss": 0.6701, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.020336730405688286, "rewards/margins": 0.04606650024652481, "rewards/rejected": -0.025729769840836525, "step": 150 }, { "epoch": 0.04, "learning_rate": 4.123711340206185e-07, "logits/chosen": -5.245352745056152, "logits/rejected": -4.824395656585693, "logps/chosen": -647.1730346679688, "logps/rejected": -521.8081665039062, "loss": 0.6615, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.025765161961317062, "rewards/margins": 0.060902394354343414, "rewards/rejected": -0.03513722866773605, "step": 160 }, { "epoch": 0.04, "learning_rate": 4.381443298969072e-07, "logits/chosen": -4.918195724487305, "logits/rejected": -5.041461944580078, "logps/chosen": -649.2320556640625, "logps/rejected": -453.7999572753906, "loss": 0.6497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.030253374949097633, "rewards/margins": 0.10368291288614273, "rewards/rejected": -0.07342952489852905, "step": 170 }, { "epoch": 0.05, "learning_rate": 4.639175257731959e-07, "logits/chosen": -4.9767351150512695, "logits/rejected": -4.5638556480407715, "logps/chosen": -633.88623046875, "logps/rejected": -496.74664306640625, "loss": 0.6699, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008293787948787212, "rewards/margins": 0.06771639734506607, "rewards/rejected": -0.07601018249988556, "step": 180 }, { "epoch": 0.05, "learning_rate": 4.896907216494845e-07, "logits/chosen": -5.383603096008301, "logits/rejected": -4.877391815185547, "logps/chosen": -546.728271484375, "logps/rejected": -344.8063659667969, "loss": 0.654, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007833002135157585, "rewards/margins": 0.09368343651294708, "rewards/rejected": -0.10151644051074982, "step": 190 }, { "epoch": 0.05, "learning_rate": 4.991845610220168e-07, "logits/chosen": -4.733527183532715, "logits/rejected": -4.868575096130371, "logps/chosen": -481.6207580566406, "logps/rejected": -476.0492248535156, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": -0.03832811862230301, "rewards/margins": 0.055693674832582474, "rewards/rejected": -0.09402180463075638, "step": 200 }, { "epoch": 0.05, "learning_rate": 4.978254960587116e-07, "logits/chosen": -4.98598051071167, "logits/rejected": -4.212894916534424, "logps/chosen": -637.4251098632812, "logps/rejected": -506.30230712890625, "loss": 0.6702, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0035689384676516056, "rewards/margins": 0.10787747800350189, "rewards/rejected": -0.10430854558944702, "step": 210 }, { "epoch": 0.06, "learning_rate": 4.964664310954063e-07, "logits/chosen": -5.308133125305176, "logits/rejected": -5.002093315124512, "logps/chosen": -582.2630615234375, "logps/rejected": -459.553466796875, "loss": 0.6532, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03686295077204704, "rewards/margins": 0.10949943214654922, "rewards/rejected": -0.14636239409446716, "step": 220 }, { "epoch": 0.06, "learning_rate": 4.951073661321011e-07, "logits/chosen": -5.136179447174072, "logits/rejected": -4.420478820800781, "logps/chosen": -637.677734375, "logps/rejected": -455.35528564453125, "loss": 0.6246, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.016099706292152405, "rewards/margins": 0.15525248646736145, "rewards/rejected": -0.17135220766067505, "step": 230 }, { "epoch": 0.06, "learning_rate": 4.937483011687959e-07, "logits/chosen": -5.426546573638916, "logits/rejected": -4.910277366638184, "logps/chosen": -543.5972900390625, "logps/rejected": -451.20599365234375, "loss": 0.6324, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05474834516644478, "rewards/margins": 0.12798623740673065, "rewards/rejected": -0.18273457884788513, "step": 240 }, { "epoch": 0.06, "learning_rate": 4.923892362054906e-07, "logits/chosen": -5.028792858123779, "logits/rejected": -4.966330051422119, "logps/chosen": -639.6892700195312, "logps/rejected": -501.51019287109375, "loss": 0.6341, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.013248622417449951, "rewards/margins": 0.18953406810760498, "rewards/rejected": -0.17628543078899384, "step": 250 }, { "epoch": 0.07, "learning_rate": 4.910301712421854e-07, "logits/chosen": -4.824421405792236, "logits/rejected": -5.0228495597839355, "logps/chosen": -572.6714477539062, "logps/rejected": -469.21148681640625, "loss": 0.6363, "rewards/accuracies": 0.625, "rewards/chosen": -0.042329370975494385, "rewards/margins": 0.18195411562919617, "rewards/rejected": -0.22428350150585175, "step": 260 }, { "epoch": 0.07, "learning_rate": 4.8967110627888e-07, "logits/chosen": -4.958866119384766, "logits/rejected": -4.671696662902832, "logps/chosen": -680.0038452148438, "logps/rejected": -487.1553649902344, "loss": 0.6307, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0647299587726593, "rewards/margins": 0.19074389338493347, "rewards/rejected": -0.2554738223552704, "step": 270 }, { "epoch": 0.07, "learning_rate": 4.883120413155748e-07, "logits/chosen": -5.179836750030518, "logits/rejected": -5.224351406097412, "logps/chosen": -659.30712890625, "logps/rejected": -563.93505859375, "loss": 0.6215, "rewards/accuracies": 0.6875, "rewards/chosen": 0.005299837794154882, "rewards/margins": 0.23988893628120422, "rewards/rejected": -0.23458907008171082, "step": 280 }, { "epoch": 0.07, "learning_rate": 4.869529763522696e-07, "logits/chosen": -5.175195217132568, "logits/rejected": -4.513309478759766, "logps/chosen": -554.3179321289062, "logps/rejected": -408.87469482421875, "loss": 0.6048, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03833787888288498, "rewards/margins": 0.20562824606895447, "rewards/rejected": -0.24396613240242004, "step": 290 }, { "epoch": 0.08, "learning_rate": 4.855939113889644e-07, "logits/chosen": -5.465035915374756, "logits/rejected": -5.007296562194824, "logps/chosen": -614.6101684570312, "logps/rejected": -448.00396728515625, "loss": 0.6177, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.04801105335354805, "rewards/margins": 0.2677549719810486, "rewards/rejected": -0.3157660663127899, "step": 300 }, { "epoch": 0.08, "learning_rate": 4.842348464256592e-07, "logits/chosen": -5.135851860046387, "logits/rejected": -4.458438873291016, "logps/chosen": -693.4744873046875, "logps/rejected": -477.97735595703125, "loss": 0.5953, "rewards/accuracies": 0.6875, "rewards/chosen": 0.026815488934516907, "rewards/margins": 0.36201024055480957, "rewards/rejected": -0.33519476652145386, "step": 310 }, { "epoch": 0.08, "learning_rate": 4.828757814623539e-07, "logits/chosen": -4.8842668533325195, "logits/rejected": -4.530327796936035, "logps/chosen": -618.3095703125, "logps/rejected": -440.5267028808594, "loss": 0.6121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04385804012417793, "rewards/margins": 0.31809335947036743, "rewards/rejected": -0.36195147037506104, "step": 320 }, { "epoch": 0.09, "learning_rate": 4.815167164990487e-07, "logits/chosen": -5.210285186767578, "logits/rejected": -4.398937225341797, "logps/chosen": -613.9888916015625, "logps/rejected": -423.78973388671875, "loss": 0.5855, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.045169491320848465, "rewards/margins": 0.3982721269130707, "rewards/rejected": -0.44344156980514526, "step": 330 }, { "epoch": 0.09, "learning_rate": 4.801576515357433e-07, "logits/chosen": -5.246552467346191, "logits/rejected": -4.851002216339111, "logps/chosen": -602.2916259765625, "logps/rejected": -460.1776428222656, "loss": 0.6032, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.023538529872894287, "rewards/margins": 0.2947237491607666, "rewards/rejected": -0.3182622492313385, "step": 340 }, { "epoch": 0.09, "learning_rate": 4.787985865724381e-07, "logits/chosen": -5.230737209320068, "logits/rejected": -4.5234880447387695, "logps/chosen": -476.8888244628906, "logps/rejected": -386.95208740234375, "loss": 0.5973, "rewards/accuracies": 0.625, "rewards/chosen": -0.20046333968639374, "rewards/margins": 0.22257764637470245, "rewards/rejected": -0.4230410158634186, "step": 350 }, { "epoch": 0.09, "learning_rate": 4.774395216091329e-07, "logits/chosen": -4.9621710777282715, "logits/rejected": -4.4291791915893555, "logps/chosen": -624.4473266601562, "logps/rejected": -469.51025390625, "loss": 0.5782, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0565962977707386, "rewards/margins": 0.4176466464996338, "rewards/rejected": -0.4742429256439209, "step": 360 }, { "epoch": 0.1, "learning_rate": 4.7608045664582765e-07, "logits/chosen": -5.13443660736084, "logits/rejected": -4.768548488616943, "logps/chosen": -580.9453125, "logps/rejected": -469.3211364746094, "loss": 0.6232, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.11847039312124252, "rewards/margins": 0.1889243870973587, "rewards/rejected": -0.307394802570343, "step": 370 }, { "epoch": 0.1, "learning_rate": 4.747213916825224e-07, "logits/chosen": -5.269211292266846, "logits/rejected": -4.984899997711182, "logps/chosen": -625.7012329101562, "logps/rejected": -498.78436279296875, "loss": 0.6085, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08433916419744492, "rewards/margins": 0.2882917821407318, "rewards/rejected": -0.3726309835910797, "step": 380 }, { "epoch": 0.1, "learning_rate": 4.733623267192172e-07, "logits/chosen": -4.900550365447998, "logits/rejected": -4.842984199523926, "logps/chosen": -595.7847290039062, "logps/rejected": -421.272705078125, "loss": 0.5801, "rewards/accuracies": 0.75, "rewards/chosen": -0.045443516224622726, "rewards/margins": 0.3703029155731201, "rewards/rejected": -0.41574639081954956, "step": 390 }, { "epoch": 0.1, "learning_rate": 4.720032617559119e-07, "logits/chosen": -4.971917152404785, "logits/rejected": -4.413316249847412, "logps/chosen": -603.6484375, "logps/rejected": -390.181884765625, "loss": 0.6157, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08832856267690659, "rewards/margins": 0.370069295167923, "rewards/rejected": -0.4583978056907654, "step": 400 }, { "epoch": 0.11, "learning_rate": 4.7064419679260665e-07, "logits/chosen": -4.816274642944336, "logits/rejected": -4.781431198120117, "logps/chosen": -532.91650390625, "logps/rejected": -487.6172790527344, "loss": 0.6183, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2775554656982422, "rewards/margins": 0.12774913012981415, "rewards/rejected": -0.40530458092689514, "step": 410 }, { "epoch": 0.11, "learning_rate": 4.692851318293014e-07, "logits/chosen": -4.550357818603516, "logits/rejected": -4.743869781494141, "logps/chosen": -503.3206481933594, "logps/rejected": -449.945068359375, "loss": 0.6434, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2160659283399582, "rewards/margins": 0.1927742063999176, "rewards/rejected": -0.4088401198387146, "step": 420 }, { "epoch": 0.11, "learning_rate": 4.6792606686599617e-07, "logits/chosen": -4.8620710372924805, "logits/rejected": -4.680614471435547, "logps/chosen": -517.7664184570312, "logps/rejected": -415.820556640625, "loss": 0.5822, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18706198036670685, "rewards/margins": 0.3389657735824585, "rewards/rejected": -0.5260277986526489, "step": 430 }, { "epoch": 0.11, "learning_rate": 4.6656700190269095e-07, "logits/chosen": -4.892120838165283, "logits/rejected": -4.399613380432129, "logps/chosen": -616.3198852539062, "logps/rejected": -455.2386169433594, "loss": 0.6139, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2131435126066208, "rewards/margins": 0.2211327850818634, "rewards/rejected": -0.434276282787323, "step": 440 }, { "epoch": 0.12, "learning_rate": 4.652079369393857e-07, "logits/chosen": -4.742907524108887, "logits/rejected": -4.0981621742248535, "logps/chosen": -684.6009521484375, "logps/rejected": -568.4190673828125, "loss": 0.6183, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.024788271635770798, "rewards/margins": 0.2745409309864044, "rewards/rejected": -0.29932913184165955, "step": 450 }, { "epoch": 0.12, "learning_rate": 4.638488719760804e-07, "logits/chosen": -4.990183353424072, "logits/rejected": -4.841261386871338, "logps/chosen": -501.45587158203125, "logps/rejected": -421.10516357421875, "loss": 0.6133, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1533351093530655, "rewards/margins": 0.2984987199306488, "rewards/rejected": -0.4518338739871979, "step": 460 }, { "epoch": 0.12, "learning_rate": 4.6248980701277516e-07, "logits/chosen": -4.963714599609375, "logits/rejected": -4.383978366851807, "logps/chosen": -651.7528076171875, "logps/rejected": -455.96051025390625, "loss": 0.5702, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.045393429696559906, "rewards/margins": 0.4336971640586853, "rewards/rejected": -0.4790906012058258, "step": 470 }, { "epoch": 0.12, "learning_rate": 4.6113074204946995e-07, "logits/chosen": -5.326716899871826, "logits/rejected": -4.635982036590576, "logps/chosen": -634.7332763671875, "logps/rejected": -447.55010986328125, "loss": 0.6572, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.18605412542819977, "rewards/margins": 0.21766987442970276, "rewards/rejected": -0.4037240147590637, "step": 480 }, { "epoch": 0.13, "learning_rate": 4.5977167708616473e-07, "logits/chosen": -5.067509651184082, "logits/rejected": -4.860345840454102, "logps/chosen": -541.9595947265625, "logps/rejected": -476.3515625, "loss": 0.5982, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13814474642276764, "rewards/margins": 0.31477227807044983, "rewards/rejected": -0.4529170095920563, "step": 490 }, { "epoch": 0.13, "learning_rate": 4.5841261212285947e-07, "logits/chosen": -4.927236080169678, "logits/rejected": -4.7580389976501465, "logps/chosen": -611.2810668945312, "logps/rejected": -467.39501953125, "loss": 0.6436, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1771600991487503, "rewards/margins": 0.2784258723258972, "rewards/rejected": -0.45558589696884155, "step": 500 }, { "epoch": 0.13, "learning_rate": 4.570535471595542e-07, "logits/chosen": -5.053236484527588, "logits/rejected": -4.242236137390137, "logps/chosen": -641.723876953125, "logps/rejected": -466.0104064941406, "loss": 0.6012, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07538946717977524, "rewards/margins": 0.42548665404319763, "rewards/rejected": -0.5008760690689087, "step": 510 }, { "epoch": 0.13, "learning_rate": 4.5569448219624894e-07, "logits/chosen": -5.104859352111816, "logits/rejected": -4.797629356384277, "logps/chosen": -636.1038818359375, "logps/rejected": -465.8121032714844, "loss": 0.5827, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03768538683652878, "rewards/margins": 0.325167715549469, "rewards/rejected": -0.36285310983657837, "step": 520 }, { "epoch": 0.14, "learning_rate": 4.543354172329437e-07, "logits/chosen": -4.918176174163818, "logits/rejected": -4.973766326904297, "logps/chosen": -557.7041015625, "logps/rejected": -446.3299865722656, "loss": 0.6046, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.08515395224094391, "rewards/margins": 0.29556483030319214, "rewards/rejected": -0.38071876764297485, "step": 530 }, { "epoch": 0.14, "learning_rate": 4.5297635226963846e-07, "logits/chosen": -5.113317012786865, "logits/rejected": -4.7637176513671875, "logps/chosen": -674.4705810546875, "logps/rejected": -501.1588439941406, "loss": 0.571, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0015305932611227036, "rewards/margins": 0.47964540123939514, "rewards/rejected": -0.4811759889125824, "step": 540 }, { "epoch": 0.14, "learning_rate": 4.5161728730633325e-07, "logits/chosen": -5.13610315322876, "logits/rejected": -4.954715251922607, "logps/chosen": -534.7664794921875, "logps/rejected": -424.2696228027344, "loss": 0.607, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.15319520235061646, "rewards/margins": 0.3650510609149933, "rewards/rejected": -0.5182462930679321, "step": 550 }, { "epoch": 0.14, "learning_rate": 4.50258222343028e-07, "logits/chosen": -5.178530693054199, "logits/rejected": -4.55206298828125, "logps/chosen": -707.4481811523438, "logps/rejected": -422.05035400390625, "loss": 0.5826, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006741873919963837, "rewards/margins": 0.4363733232021332, "rewards/rejected": -0.42963147163391113, "step": 560 }, { "epoch": 0.15, "learning_rate": 4.488991573797227e-07, "logits/chosen": -5.193602085113525, "logits/rejected": -4.635763645172119, "logps/chosen": -541.9158935546875, "logps/rejected": -412.94415283203125, "loss": 0.5933, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12414856255054474, "rewards/margins": 0.29824933409690857, "rewards/rejected": -0.4223979115486145, "step": 570 }, { "epoch": 0.15, "learning_rate": 4.475400924164175e-07, "logits/chosen": -5.302609443664551, "logits/rejected": -4.3719706535339355, "logps/chosen": -621.8121337890625, "logps/rejected": -496.27423095703125, "loss": 0.5573, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0030913546215742826, "rewards/margins": 0.5280329585075378, "rewards/rejected": -0.5249415636062622, "step": 580 }, { "epoch": 0.15, "learning_rate": 4.4618102745311224e-07, "logits/chosen": -4.785793781280518, "logits/rejected": -4.529145240783691, "logps/chosen": -636.421142578125, "logps/rejected": -513.9705810546875, "loss": 0.5954, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.017498258501291275, "rewards/margins": 0.42999735474586487, "rewards/rejected": -0.44749563932418823, "step": 590 }, { "epoch": 0.15, "learning_rate": 4.4482196248980697e-07, "logits/chosen": -5.281703948974609, "logits/rejected": -4.8774003982543945, "logps/chosen": -571.9153442382812, "logps/rejected": -460.17022705078125, "loss": 0.6264, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.2108837068080902, "rewards/margins": 0.2501353919506073, "rewards/rejected": -0.4610190987586975, "step": 600 }, { "epoch": 0.16, "learning_rate": 4.4346289752650176e-07, "logits/chosen": -5.042937278747559, "logits/rejected": -4.787137031555176, "logps/chosen": -647.1984252929688, "logps/rejected": -505.620361328125, "loss": 0.6232, "rewards/accuracies": 0.625, "rewards/chosen": -0.06373582035303116, "rewards/margins": 0.34771889448165894, "rewards/rejected": -0.4114547371864319, "step": 610 }, { "epoch": 0.16, "learning_rate": 4.421038325631965e-07, "logits/chosen": -4.816788673400879, "logits/rejected": -4.839449405670166, "logps/chosen": -594.6710815429688, "logps/rejected": -410.1011657714844, "loss": 0.5337, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.058069534599781036, "rewards/margins": 0.4649524688720703, "rewards/rejected": -0.4068829417228699, "step": 620 }, { "epoch": 0.16, "learning_rate": 4.407447675998913e-07, "logits/chosen": -5.119754314422607, "logits/rejected": -4.477426052093506, "logps/chosen": -607.0371704101562, "logps/rejected": -424.75164794921875, "loss": 0.5504, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.08852221071720123, "rewards/margins": 0.56838458776474, "rewards/rejected": -0.65690678358078, "step": 630 }, { "epoch": 0.17, "learning_rate": 4.39385702636586e-07, "logits/chosen": -4.7978363037109375, "logits/rejected": -4.382967948913574, "logps/chosen": -581.29248046875, "logps/rejected": -446.77886962890625, "loss": 0.5803, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2044920176267624, "rewards/margins": 0.3723471164703369, "rewards/rejected": -0.5768391489982605, "step": 640 }, { "epoch": 0.17, "learning_rate": 4.3802663767328075e-07, "logits/chosen": -5.002087593078613, "logits/rejected": -4.465549945831299, "logps/chosen": -565.1025390625, "logps/rejected": -443.71881103515625, "loss": 0.5596, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0946217030286789, "rewards/margins": 0.4325682520866394, "rewards/rejected": -0.5271899700164795, "step": 650 }, { "epoch": 0.17, "learning_rate": 4.366675727099755e-07, "logits/chosen": -5.237046718597412, "logits/rejected": -4.730754375457764, "logps/chosen": -572.2055053710938, "logps/rejected": -445.2671813964844, "loss": 0.5831, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.00032152156927622855, "rewards/margins": 0.5232617855072021, "rewards/rejected": -0.5235832929611206, "step": 660 }, { "epoch": 0.17, "learning_rate": 4.3530850774667027e-07, "logits/chosen": -4.660307884216309, "logits/rejected": -4.82083797454834, "logps/chosen": -571.3373413085938, "logps/rejected": -476.90240478515625, "loss": 0.569, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.10375179350376129, "rewards/margins": 0.4479514956474304, "rewards/rejected": -0.5517033338546753, "step": 670 }, { "epoch": 0.18, "learning_rate": 4.3394944278336506e-07, "logits/chosen": -5.485714912414551, "logits/rejected": -4.80424690246582, "logps/chosen": -647.6866455078125, "logps/rejected": -514.752197265625, "loss": 0.5722, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.026715148240327835, "rewards/margins": 0.4548015594482422, "rewards/rejected": -0.48151668906211853, "step": 680 }, { "epoch": 0.18, "learning_rate": 4.325903778200598e-07, "logits/chosen": -4.8239054679870605, "logits/rejected": -4.809833526611328, "logps/chosen": -474.703857421875, "logps/rejected": -464.36767578125, "loss": 0.5746, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19570307433605194, "rewards/margins": 0.46426302194595337, "rewards/rejected": -0.6599661111831665, "step": 690 }, { "epoch": 0.18, "learning_rate": 4.3123131285675453e-07, "logits/chosen": -4.975176811218262, "logits/rejected": -4.977343559265137, "logps/chosen": -562.6448974609375, "logps/rejected": -458.48065185546875, "loss": 0.6136, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13624700903892517, "rewards/margins": 0.30273160338401794, "rewards/rejected": -0.4389786124229431, "step": 700 }, { "epoch": 0.18, "learning_rate": 4.2987224789344926e-07, "logits/chosen": -5.58438777923584, "logits/rejected": -5.220755577087402, "logps/chosen": -562.8697509765625, "logps/rejected": -392.17755126953125, "loss": 0.599, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1068628579378128, "rewards/margins": 0.5429280996322632, "rewards/rejected": -0.6497910022735596, "step": 710 }, { "epoch": 0.19, "learning_rate": 4.2851318293014405e-07, "logits/chosen": -4.708364963531494, "logits/rejected": -4.156419277191162, "logps/chosen": -605.2653198242188, "logps/rejected": -456.384033203125, "loss": 0.5552, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013813665136694908, "rewards/margins": 0.5981405973434448, "rewards/rejected": -0.5843268632888794, "step": 720 }, { "epoch": 0.19, "learning_rate": 4.2715411796683884e-07, "logits/chosen": -5.233548164367676, "logits/rejected": -4.437680244445801, "logps/chosen": -627.4539794921875, "logps/rejected": -421.85565185546875, "loss": 0.576, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.025078674778342247, "rewards/margins": 0.5734429359436035, "rewards/rejected": -0.5985215902328491, "step": 730 }, { "epoch": 0.19, "learning_rate": 4.257950530035335e-07, "logits/chosen": -5.098059177398682, "logits/rejected": -4.827805995941162, "logps/chosen": -589.9744873046875, "logps/rejected": -431.1255798339844, "loss": 0.5876, "rewards/accuracies": 0.6875, "rewards/chosen": -0.036339785903692245, "rewards/margins": 0.3825877606868744, "rewards/rejected": -0.41892752051353455, "step": 740 }, { "epoch": 0.19, "learning_rate": 4.244359880402283e-07, "logits/chosen": -4.666082859039307, "logits/rejected": -4.680274963378906, "logps/chosen": -591.2884521484375, "logps/rejected": -430.6166076660156, "loss": 0.615, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03429872542619705, "rewards/margins": 0.46906599402427673, "rewards/rejected": -0.4347672462463379, "step": 750 }, { "epoch": 0.2, "learning_rate": 4.2307692307692304e-07, "logits/chosen": -5.038529396057129, "logits/rejected": -5.100456237792969, "logps/chosen": -573.0460815429688, "logps/rejected": -454.48968505859375, "loss": 0.5981, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.15040521323680878, "rewards/margins": 0.32461416721343994, "rewards/rejected": -0.4750193655490875, "step": 760 }, { "epoch": 0.2, "learning_rate": 4.2171785811361783e-07, "logits/chosen": -5.336598873138428, "logits/rejected": -4.637759208679199, "logps/chosen": -672.5980224609375, "logps/rejected": -416.6004333496094, "loss": 0.6224, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.05844946578145027, "rewards/margins": 0.6305155158042908, "rewards/rejected": -0.6889649629592896, "step": 770 }, { "epoch": 0.2, "learning_rate": 4.2035879315031256e-07, "logits/chosen": -4.696690559387207, "logits/rejected": -4.829110145568848, "logps/chosen": -568.7249755859375, "logps/rejected": -448.14190673828125, "loss": 0.5622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09914873540401459, "rewards/margins": 0.4367143511772156, "rewards/rejected": -0.5358631014823914, "step": 780 }, { "epoch": 0.2, "learning_rate": 4.189997281870073e-07, "logits/chosen": -5.3184123039245605, "logits/rejected": -4.514608860015869, "logps/chosen": -576.1703491210938, "logps/rejected": -419.2657775878906, "loss": 0.6095, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.049643244594335556, "rewards/margins": 0.4539434313774109, "rewards/rejected": -0.5035867094993591, "step": 790 }, { "epoch": 0.21, "learning_rate": 4.176406632237021e-07, "logits/chosen": -5.1099748611450195, "logits/rejected": -5.108311653137207, "logps/chosen": -621.3192138671875, "logps/rejected": -517.3109130859375, "loss": 0.6198, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.017213309183716774, "rewards/margins": 0.4413982033729553, "rewards/rejected": -0.4586115777492523, "step": 800 }, { "epoch": 0.21, "learning_rate": 4.162815982603968e-07, "logits/chosen": -4.819340705871582, "logits/rejected": -4.71310567855835, "logps/chosen": -657.822509765625, "logps/rejected": -436.1148376464844, "loss": 0.5477, "rewards/accuracies": 0.6875, "rewards/chosen": -0.050711147487163544, "rewards/margins": 0.4844001233577728, "rewards/rejected": -0.5351113080978394, "step": 810 }, { "epoch": 0.21, "learning_rate": 4.149225332970916e-07, "logits/chosen": -5.180826187133789, "logits/rejected": -4.553278923034668, "logps/chosen": -616.3976440429688, "logps/rejected": -428.14501953125, "loss": 0.5513, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.09903495013713837, "rewards/margins": 0.43739914894104004, "rewards/rejected": -0.5364341139793396, "step": 820 }, { "epoch": 0.21, "learning_rate": 4.1356346833378634e-07, "logits/chosen": -5.0159783363342285, "logits/rejected": -4.414090633392334, "logps/chosen": -532.2962646484375, "logps/rejected": -394.74188232421875, "loss": 0.5446, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10811267048120499, "rewards/margins": 0.3885238468647003, "rewards/rejected": -0.4966364800930023, "step": 830 }, { "epoch": 0.22, "learning_rate": 4.122044033704811e-07, "logits/chosen": -4.964326858520508, "logits/rejected": -4.606993675231934, "logps/chosen": -636.4017944335938, "logps/rejected": -483.2483825683594, "loss": 0.5947, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03904043883085251, "rewards/margins": 0.44281521439552307, "rewards/rejected": -0.48185569047927856, "step": 840 }, { "epoch": 0.22, "learning_rate": 4.108453384071758e-07, "logits/chosen": -4.908474445343018, "logits/rejected": -4.149386882781982, "logps/chosen": -553.094482421875, "logps/rejected": -415.75115966796875, "loss": 0.561, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0005749926203861833, "rewards/margins": 0.5852338075637817, "rewards/rejected": -0.5858088135719299, "step": 850 }, { "epoch": 0.22, "learning_rate": 4.094862734438706e-07, "logits/chosen": -4.999358177185059, "logits/rejected": -4.587876319885254, "logps/chosen": -549.2117919921875, "logps/rejected": -410.4534606933594, "loss": 0.5268, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.09028232097625732, "rewards/margins": 0.588280439376831, "rewards/rejected": -0.6785627603530884, "step": 860 }, { "epoch": 0.22, "learning_rate": 4.081272084805654e-07, "logits/chosen": -4.444740295410156, "logits/rejected": -4.594709873199463, "logps/chosen": -635.481201171875, "logps/rejected": -535.2267456054688, "loss": 0.5636, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.06267094612121582, "rewards/margins": 0.4273379445075989, "rewards/rejected": -0.4900088906288147, "step": 870 }, { "epoch": 0.23, "learning_rate": 4.067681435172601e-07, "logits/chosen": -4.936141014099121, "logits/rejected": -5.240883827209473, "logps/chosen": -575.3516845703125, "logps/rejected": -480.3345642089844, "loss": 0.5654, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02528349682688713, "rewards/margins": 0.6120938062667847, "rewards/rejected": -0.5868103504180908, "step": 880 }, { "epoch": 0.23, "learning_rate": 4.0540907855395485e-07, "logits/chosen": -5.011609077453613, "logits/rejected": -4.755931377410889, "logps/chosen": -560.1868896484375, "logps/rejected": -472.1454162597656, "loss": 0.5761, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.17354989051818848, "rewards/margins": 0.3528314232826233, "rewards/rejected": -0.5263813138008118, "step": 890 }, { "epoch": 0.23, "learning_rate": 4.040500135906496e-07, "logits/chosen": -5.133852005004883, "logits/rejected": -5.064622402191162, "logps/chosen": -621.9901123046875, "logps/rejected": -571.9163818359375, "loss": 0.6164, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05665457993745804, "rewards/margins": 0.3919576108455658, "rewards/rejected": -0.44861215353012085, "step": 900 }, { "epoch": 0.23, "learning_rate": 4.026909486273444e-07, "logits/chosen": -4.732084274291992, "logits/rejected": -4.557053565979004, "logps/chosen": -545.7122802734375, "logps/rejected": -464.310302734375, "loss": 0.6249, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.19330564141273499, "rewards/margins": 0.32232436537742615, "rewards/rejected": -0.5156300067901611, "step": 910 }, { "epoch": 0.24, "learning_rate": 4.0133188366403916e-07, "logits/chosen": -4.645724773406982, "logits/rejected": -4.4923095703125, "logps/chosen": -622.0107421875, "logps/rejected": -484.76800537109375, "loss": 0.6126, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.07238127291202545, "rewards/margins": 0.4415339529514313, "rewards/rejected": -0.5139152407646179, "step": 920 }, { "epoch": 0.24, "learning_rate": 3.9997281870073385e-07, "logits/chosen": -5.044188976287842, "logits/rejected": -4.4092912673950195, "logps/chosen": -694.5960083007812, "logps/rejected": -435.5107421875, "loss": 0.5151, "rewards/accuracies": 0.75, "rewards/chosen": 0.042339447885751724, "rewards/margins": 0.5750529170036316, "rewards/rejected": -0.5327135324478149, "step": 930 }, { "epoch": 0.24, "learning_rate": 3.9861375373742863e-07, "logits/chosen": -4.965481758117676, "logits/rejected": -5.099541664123535, "logps/chosen": -572.3638305664062, "logps/rejected": -371.61663818359375, "loss": 0.6066, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2305530607700348, "rewards/margins": 0.3573130667209625, "rewards/rejected": -0.5878661274909973, "step": 940 }, { "epoch": 0.25, "learning_rate": 3.9725468877412337e-07, "logits/chosen": -5.048774719238281, "logits/rejected": -4.542634010314941, "logps/chosen": -552.0947875976562, "logps/rejected": -455.07025146484375, "loss": 0.5745, "rewards/accuracies": 0.625, "rewards/chosen": -0.1393623650074005, "rewards/margins": 0.44824647903442383, "rewards/rejected": -0.587608814239502, "step": 950 }, { "epoch": 0.25, "learning_rate": 3.9589562381081816e-07, "logits/chosen": -5.307036876678467, "logits/rejected": -5.147672176361084, "logps/chosen": -506.1968688964844, "logps/rejected": -428.904296875, "loss": 0.6057, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2355213463306427, "rewards/margins": 0.3196939527988434, "rewards/rejected": -0.5552152991294861, "step": 960 }, { "epoch": 0.25, "learning_rate": 3.945365588475129e-07, "logits/chosen": -4.7375311851501465, "logits/rejected": -4.652678489685059, "logps/chosen": -612.0572509765625, "logps/rejected": -466.7972717285156, "loss": 0.5814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0007326111081056297, "rewards/margins": 0.4344883859157562, "rewards/rejected": -0.43522095680236816, "step": 970 }, { "epoch": 0.25, "learning_rate": 3.931774938842076e-07, "logits/chosen": -4.782444953918457, "logits/rejected": -4.471555233001709, "logps/chosen": -639.8296508789062, "logps/rejected": -462.0065002441406, "loss": 0.5844, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.056622106581926346, "rewards/margins": 0.39103665947914124, "rewards/rejected": -0.4476587772369385, "step": 980 }, { "epoch": 0.26, "learning_rate": 3.918184289209024e-07, "logits/chosen": -4.838658809661865, "logits/rejected": -4.814244747161865, "logps/chosen": -602.7030029296875, "logps/rejected": -435.02081298828125, "loss": 0.5792, "rewards/accuracies": 0.6875, "rewards/chosen": -0.030191833153367043, "rewards/margins": 0.6228463053703308, "rewards/rejected": -0.6530382037162781, "step": 990 }, { "epoch": 0.26, "learning_rate": 3.9045936395759715e-07, "logits/chosen": -5.35817289352417, "logits/rejected": -4.868961334228516, "logps/chosen": -609.6304931640625, "logps/rejected": -530.9454956054688, "loss": 0.6597, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1846841722726822, "rewards/margins": 0.1106841117143631, "rewards/rejected": -0.2953682541847229, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": -5.090588569641113, "eval_logits/rejected": -4.794471740722656, "eval_logps/chosen": -589.0576782226562, "eval_logps/rejected": -448.0440673828125, "eval_loss": 0.588729202747345, "eval_rewards/accuracies": 0.6700000166893005, "eval_rewards/chosen": -0.07880854606628418, "eval_rewards/margins": 0.4715493321418762, "eval_rewards/rejected": -0.5503579378128052, "eval_runtime": 106.4173, "eval_samples_per_second": 18.794, "eval_steps_per_second": 1.175, "step": 1000 }, { "epoch": 0.26, "learning_rate": 3.8910029899429193e-07, "logits/chosen": -5.132681846618652, "logits/rejected": -4.960105895996094, "logps/chosen": -588.9244995117188, "logps/rejected": -428.7511291503906, "loss": 0.6207, "rewards/accuracies": 0.625, "rewards/chosen": -0.21401521563529968, "rewards/margins": 0.3573240637779236, "rewards/rejected": -0.5713392496109009, "step": 1010 }, { "epoch": 0.26, "learning_rate": 3.8774123403098667e-07, "logits/chosen": -5.2607316970825195, "logits/rejected": -4.853170394897461, "logps/chosen": -510.7887268066406, "logps/rejected": -449.23388671875, "loss": 0.5792, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.1676936149597168, "rewards/margins": 0.4615322947502136, "rewards/rejected": -0.6292259097099304, "step": 1020 }, { "epoch": 0.27, "learning_rate": 3.863821690676814e-07, "logits/chosen": -4.826608180999756, "logits/rejected": -4.439766883850098, "logps/chosen": -584.8558349609375, "logps/rejected": -467.75042724609375, "loss": 0.571, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.15133224427700043, "rewards/margins": 0.34150105714797974, "rewards/rejected": -0.49283328652381897, "step": 1030 }, { "epoch": 0.27, "learning_rate": 3.8502310410437614e-07, "logits/chosen": -4.53078556060791, "logits/rejected": -4.410236358642578, "logps/chosen": -554.697509765625, "logps/rejected": -405.0028076171875, "loss": 0.6074, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10526251792907715, "rewards/margins": 0.43528643250465393, "rewards/rejected": -0.5405489206314087, "step": 1040 }, { "epoch": 0.27, "learning_rate": 3.836640391410709e-07, "logits/chosen": -5.252594470977783, "logits/rejected": -4.650245189666748, "logps/chosen": -598.9107055664062, "logps/rejected": -387.4427795410156, "loss": 0.5583, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08027671277523041, "rewards/margins": 0.5926394462585449, "rewards/rejected": -0.6729162931442261, "step": 1050 }, { "epoch": 0.27, "learning_rate": 3.823049741777657e-07, "logits/chosen": -4.956867694854736, "logits/rejected": -4.863291263580322, "logps/chosen": -556.4575805664062, "logps/rejected": -470.60906982421875, "loss": 0.6279, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.17006051540374756, "rewards/margins": 0.27765610814094543, "rewards/rejected": -0.4477166533470154, "step": 1060 }, { "epoch": 0.28, "learning_rate": 3.8094590921446045e-07, "logits/chosen": -5.26237678527832, "logits/rejected": -4.90781831741333, "logps/chosen": -585.8365478515625, "logps/rejected": -457.3870544433594, "loss": 0.6522, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08809679746627808, "rewards/margins": 0.2951076626777649, "rewards/rejected": -0.38320446014404297, "step": 1070 }, { "epoch": 0.28, "learning_rate": 3.795868442511552e-07, "logits/chosen": -5.097253322601318, "logits/rejected": -4.8938751220703125, "logps/chosen": -553.80029296875, "logps/rejected": -450.6329040527344, "loss": 0.5777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.13567152619361877, "rewards/margins": 0.40010422468185425, "rewards/rejected": -0.5357757806777954, "step": 1080 }, { "epoch": 0.28, "learning_rate": 3.782277792878499e-07, "logits/chosen": -5.1099653244018555, "logits/rejected": -4.922682762145996, "logps/chosen": -599.4254150390625, "logps/rejected": -531.6323852539062, "loss": 0.5713, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05410848930478096, "rewards/margins": 0.5033014416694641, "rewards/rejected": -0.44919291138648987, "step": 1090 }, { "epoch": 0.28, "learning_rate": 3.768687143245447e-07, "logits/chosen": -4.754248142242432, "logits/rejected": -4.799590110778809, "logps/chosen": -675.2301025390625, "logps/rejected": -487.0110778808594, "loss": 0.5704, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.06325565278530121, "rewards/margins": 0.5369106531143188, "rewards/rejected": -0.47365492582321167, "step": 1100 }, { "epoch": 0.29, "learning_rate": 3.755096493612395e-07, "logits/chosen": -4.940129280090332, "logits/rejected": -4.485627174377441, "logps/chosen": -634.7139892578125, "logps/rejected": -507.569580078125, "loss": 0.6026, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.032657913863658905, "rewards/margins": 0.32800525426864624, "rewards/rejected": -0.36066314578056335, "step": 1110 }, { "epoch": 0.29, "learning_rate": 3.7415058439793417e-07, "logits/chosen": -5.1175336837768555, "logits/rejected": -4.806564807891846, "logps/chosen": -612.5916748046875, "logps/rejected": -467.4063415527344, "loss": 0.5517, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1026410236954689, "rewards/margins": 0.5207311511039734, "rewards/rejected": -0.41809016466140747, "step": 1120 }, { "epoch": 0.29, "learning_rate": 3.7279151943462896e-07, "logits/chosen": -5.094006538391113, "logits/rejected": -4.8595170974731445, "logps/chosen": -559.3692626953125, "logps/rejected": -455.74493408203125, "loss": 0.6017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08821893483400345, "rewards/margins": 0.31291455030441284, "rewards/rejected": -0.40113353729248047, "step": 1130 }, { "epoch": 0.29, "learning_rate": 3.714324544713237e-07, "logits/chosen": -5.375515460968018, "logits/rejected": -4.94734001159668, "logps/chosen": -541.21826171875, "logps/rejected": -456.67144775390625, "loss": 0.5765, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.10544973611831665, "rewards/margins": 0.43963623046875, "rewards/rejected": -0.5450860261917114, "step": 1140 }, { "epoch": 0.3, "learning_rate": 3.700733895080185e-07, "logits/chosen": -4.819343566894531, "logits/rejected": -4.394247055053711, "logps/chosen": -607.6382446289062, "logps/rejected": -410.609619140625, "loss": 0.5681, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15860453248023987, "rewards/margins": 0.6440173983573914, "rewards/rejected": -0.48541292548179626, "step": 1150 }, { "epoch": 0.3, "learning_rate": 3.687143245447132e-07, "logits/chosen": -5.074875354766846, "logits/rejected": -4.492025375366211, "logps/chosen": -566.6473388671875, "logps/rejected": -414.416015625, "loss": 0.5601, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01561739295721054, "rewards/margins": 0.39994779229164124, "rewards/rejected": -0.4155651926994324, "step": 1160 }, { "epoch": 0.3, "learning_rate": 3.6735525958140795e-07, "logits/chosen": -4.827418327331543, "logits/rejected": -4.503185749053955, "logps/chosen": -538.5560913085938, "logps/rejected": -497.1845703125, "loss": 0.6146, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04222496598958969, "rewards/margins": 0.3792383074760437, "rewards/rejected": -0.4214633107185364, "step": 1170 }, { "epoch": 0.3, "learning_rate": 3.6599619461810274e-07, "logits/chosen": -4.778945446014404, "logits/rejected": -5.135014533996582, "logps/chosen": -618.1221923828125, "logps/rejected": -528.7874755859375, "loss": 0.5879, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.09977498650550842, "rewards/margins": 0.4105672836303711, "rewards/rejected": -0.3107922375202179, "step": 1180 }, { "epoch": 0.31, "learning_rate": 3.6463712965479747e-07, "logits/chosen": -4.888424396514893, "logits/rejected": -4.577418327331543, "logps/chosen": -577.0140991210938, "logps/rejected": -447.93212890625, "loss": 0.6051, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.053088851273059845, "rewards/margins": 0.49817705154418945, "rewards/rejected": -0.4450882077217102, "step": 1190 }, { "epoch": 0.31, "learning_rate": 3.6327806469149226e-07, "logits/chosen": -4.80429744720459, "logits/rejected": -4.898682594299316, "logps/chosen": -498.6844787597656, "logps/rejected": -423.1040954589844, "loss": 0.6116, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.09284614026546478, "rewards/margins": 0.33597826957702637, "rewards/rejected": -0.4288244843482971, "step": 1200 }, { "epoch": 0.31, "learning_rate": 3.61918999728187e-07, "logits/chosen": -4.932044506072998, "logits/rejected": -4.503706455230713, "logps/chosen": -721.5752563476562, "logps/rejected": -512.78369140625, "loss": 0.5663, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21977362036705017, "rewards/margins": 0.5384609699249268, "rewards/rejected": -0.3186873197555542, "step": 1210 }, { "epoch": 0.32, "learning_rate": 3.6055993476488173e-07, "logits/chosen": -4.647530555725098, "logits/rejected": -4.263665199279785, "logps/chosen": -597.6514892578125, "logps/rejected": -423.85260009765625, "loss": 0.6171, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.012402093037962914, "rewards/margins": 0.41996484994888306, "rewards/rejected": -0.4323669970035553, "step": 1220 }, { "epoch": 0.32, "learning_rate": 3.5920086980157646e-07, "logits/chosen": -5.303116321563721, "logits/rejected": -4.703645706176758, "logps/chosen": -552.6138916015625, "logps/rejected": -469.75811767578125, "loss": 0.5929, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.12344861030578613, "rewards/margins": 0.4382708966732025, "rewards/rejected": -0.314822256565094, "step": 1230 }, { "epoch": 0.32, "learning_rate": 3.5784180483827125e-07, "logits/chosen": -5.118283748626709, "logits/rejected": -4.543520927429199, "logps/chosen": -555.630126953125, "logps/rejected": -487.35498046875, "loss": 0.6097, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06331979483366013, "rewards/margins": 0.3617437481880188, "rewards/rejected": -0.2984239459037781, "step": 1240 }, { "epoch": 0.32, "learning_rate": 3.5648273987496604e-07, "logits/chosen": -5.459466457366943, "logits/rejected": -4.4333906173706055, "logps/chosen": -588.8739624023438, "logps/rejected": -399.8832702636719, "loss": 0.5699, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09981145709753036, "rewards/margins": 0.4524534344673157, "rewards/rejected": -0.35264191031455994, "step": 1250 }, { "epoch": 0.33, "learning_rate": 3.5512367491166077e-07, "logits/chosen": -4.973242282867432, "logits/rejected": -4.562270164489746, "logps/chosen": -551.7763671875, "logps/rejected": -402.52655029296875, "loss": 0.5732, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.16520874202251434, "rewards/margins": 0.5079749822616577, "rewards/rejected": -0.3427662253379822, "step": 1260 }, { "epoch": 0.33, "learning_rate": 3.537646099483555e-07, "logits/chosen": -4.993309020996094, "logits/rejected": -5.029820442199707, "logps/chosen": -520.9195556640625, "logps/rejected": -385.7127380371094, "loss": 0.573, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0681810975074768, "rewards/margins": 0.5013130903244019, "rewards/rejected": -0.4331319332122803, "step": 1270 }, { "epoch": 0.33, "learning_rate": 3.5240554498505024e-07, "logits/chosen": -4.839926242828369, "logits/rejected": -4.778874397277832, "logps/chosen": -558.1290283203125, "logps/rejected": -384.0828552246094, "loss": 0.5617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.092664934694767, "rewards/margins": 0.5813394784927368, "rewards/rejected": -0.48867446184158325, "step": 1280 }, { "epoch": 0.33, "learning_rate": 3.5104648002174503e-07, "logits/chosen": -5.055529594421387, "logits/rejected": -4.888724327087402, "logps/chosen": -568.4841918945312, "logps/rejected": -433.59930419921875, "loss": 0.6144, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.021698763594031334, "rewards/margins": 0.37912648916244507, "rewards/rejected": -0.40082526206970215, "step": 1290 }, { "epoch": 0.34, "learning_rate": 3.496874150584398e-07, "logits/chosen": -5.203982353210449, "logits/rejected": -4.94318151473999, "logps/chosen": -483.5262756347656, "logps/rejected": -391.9451904296875, "loss": 0.5664, "rewards/accuracies": 0.6875, "rewards/chosen": -0.048337481915950775, "rewards/margins": 0.40044230222702026, "rewards/rejected": -0.44877976179122925, "step": 1300 }, { "epoch": 0.34, "learning_rate": 3.483283500951345e-07, "logits/chosen": -4.813787460327148, "logits/rejected": -4.484375953674316, "logps/chosen": -717.718017578125, "logps/rejected": -509.1785583496094, "loss": 0.5485, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19200703501701355, "rewards/margins": 0.5683521032333374, "rewards/rejected": -0.37634506821632385, "step": 1310 }, { "epoch": 0.34, "learning_rate": 3.469692851318293e-07, "logits/chosen": -5.500855445861816, "logits/rejected": -4.6635870933532715, "logps/chosen": -628.2833251953125, "logps/rejected": -458.0882873535156, "loss": 0.5908, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.07916983217000961, "rewards/margins": 0.5226668119430542, "rewards/rejected": -0.4434970021247864, "step": 1320 }, { "epoch": 0.34, "learning_rate": 3.45610220168524e-07, "logits/chosen": -4.949522495269775, "logits/rejected": -4.809584140777588, "logps/chosen": -522.9071044921875, "logps/rejected": -479.11700439453125, "loss": 0.6266, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0525810606777668, "rewards/margins": 0.24236159026622772, "rewards/rejected": -0.2949426770210266, "step": 1330 }, { "epoch": 0.35, "learning_rate": 3.442511552052188e-07, "logits/chosen": -4.881261825561523, "logits/rejected": -5.100898265838623, "logps/chosen": -484.083984375, "logps/rejected": -514.7659912109375, "loss": 0.5984, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06116398051381111, "rewards/margins": 0.3284783959388733, "rewards/rejected": -0.2673143744468689, "step": 1340 }, { "epoch": 0.35, "learning_rate": 3.4289209024191354e-07, "logits/chosen": -5.381341457366943, "logits/rejected": -5.32161808013916, "logps/chosen": -575.6818237304688, "logps/rejected": -414.44921875, "loss": 0.5427, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06358791887760162, "rewards/margins": 0.472216933965683, "rewards/rejected": -0.5358048677444458, "step": 1350 }, { "epoch": 0.35, "learning_rate": 3.415330252786083e-07, "logits/chosen": -5.047477722167969, "logits/rejected": -4.687131404876709, "logps/chosen": -623.3069458007812, "logps/rejected": -538.3536376953125, "loss": 0.6101, "rewards/accuracies": 0.625, "rewards/chosen": 0.029577601701021194, "rewards/margins": 0.3733888864517212, "rewards/rejected": -0.3438113033771515, "step": 1360 }, { "epoch": 0.35, "learning_rate": 3.4017396031530306e-07, "logits/chosen": -5.100833415985107, "logits/rejected": -5.073991298675537, "logps/chosen": -711.748046875, "logps/rejected": -509.7862243652344, "loss": 0.5531, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.15278328955173492, "rewards/margins": 0.5438005924224854, "rewards/rejected": -0.3910173773765564, "step": 1370 }, { "epoch": 0.36, "learning_rate": 3.388148953519978e-07, "logits/chosen": -4.926814079284668, "logits/rejected": -4.619027614593506, "logps/chosen": -612.6145629882812, "logps/rejected": -448.18621826171875, "loss": 0.5861, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.059671830385923386, "rewards/margins": 0.3946450352668762, "rewards/rejected": -0.3349732458591461, "step": 1380 }, { "epoch": 0.36, "learning_rate": 3.374558303886926e-07, "logits/chosen": -5.215226173400879, "logits/rejected": -4.522857189178467, "logps/chosen": -645.0291748046875, "logps/rejected": -458.01458740234375, "loss": 0.5513, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13360649347305298, "rewards/margins": 0.6589222550392151, "rewards/rejected": -0.5253156423568726, "step": 1390 }, { "epoch": 0.36, "learning_rate": 3.360967654253873e-07, "logits/chosen": -5.343368053436279, "logits/rejected": -4.6446380615234375, "logps/chosen": -642.7989501953125, "logps/rejected": -497.9615173339844, "loss": 0.6191, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11626561731100082, "rewards/margins": 0.4873886704444885, "rewards/rejected": -0.3711230456829071, "step": 1400 }, { "epoch": 0.36, "learning_rate": 3.3473770046208206e-07, "logits/chosen": -5.116816520690918, "logits/rejected": -4.710324287414551, "logps/chosen": -493.3133850097656, "logps/rejected": -378.7680969238281, "loss": 0.5325, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07064076513051987, "rewards/margins": 0.5043250322341919, "rewards/rejected": -0.5749658346176147, "step": 1410 }, { "epoch": 0.37, "learning_rate": 3.3337863549877684e-07, "logits/chosen": -5.086273193359375, "logits/rejected": -4.876503944396973, "logps/chosen": -597.3706665039062, "logps/rejected": -440.29522705078125, "loss": 0.5873, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.021745752543210983, "rewards/margins": 0.42672309279441833, "rewards/rejected": -0.4484688639640808, "step": 1420 }, { "epoch": 0.37, "learning_rate": 3.320195705354716e-07, "logits/chosen": -5.1945905685424805, "logits/rejected": -4.673881530761719, "logps/chosen": -500.9388732910156, "logps/rejected": -457.3580017089844, "loss": 0.556, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20997877418994904, "rewards/margins": 0.44773179292678833, "rewards/rejected": -0.6577105522155762, "step": 1430 }, { "epoch": 0.37, "learning_rate": 3.3066050557216636e-07, "logits/chosen": -5.183767795562744, "logits/rejected": -4.868539333343506, "logps/chosen": -676.839111328125, "logps/rejected": -534.795166015625, "loss": 0.5611, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12110471725463867, "rewards/margins": 0.6017159819602966, "rewards/rejected": -0.48061123490333557, "step": 1440 }, { "epoch": 0.37, "learning_rate": 3.293014406088611e-07, "logits/chosen": -4.892951965332031, "logits/rejected": -4.403754711151123, "logps/chosen": -601.4327392578125, "logps/rejected": -494.6481018066406, "loss": 0.5855, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09234030544757843, "rewards/margins": 0.4433468282222748, "rewards/rejected": -0.5356870889663696, "step": 1450 }, { "epoch": 0.38, "learning_rate": 3.2794237564555583e-07, "logits/chosen": -5.250612258911133, "logits/rejected": -5.025344371795654, "logps/chosen": -717.2879638671875, "logps/rejected": -576.2543334960938, "loss": 0.583, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.09430352598428726, "rewards/margins": 0.5691030025482178, "rewards/rejected": -0.4747994542121887, "step": 1460 }, { "epoch": 0.38, "learning_rate": 3.2658331068225057e-07, "logits/chosen": -4.8504767417907715, "logits/rejected": -5.307365417480469, "logps/chosen": -681.6648559570312, "logps/rejected": -590.7325439453125, "loss": 0.6042, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08806881308555603, "rewards/margins": 0.5291630029678345, "rewards/rejected": -0.44109421968460083, "step": 1470 }, { "epoch": 0.38, "learning_rate": 3.2522424571894536e-07, "logits/chosen": -5.252363204956055, "logits/rejected": -4.908774375915527, "logps/chosen": -534.4985961914062, "logps/rejected": -413.527099609375, "loss": 0.5693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09185522794723511, "rewards/margins": 0.6131922602653503, "rewards/rejected": -0.52133709192276, "step": 1480 }, { "epoch": 0.38, "learning_rate": 3.2386518075564014e-07, "logits/chosen": -4.975606441497803, "logits/rejected": -5.050392150878906, "logps/chosen": -532.093017578125, "logps/rejected": -474.6080017089844, "loss": 0.5641, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04556658864021301, "rewards/margins": 0.5064549446105957, "rewards/rejected": -0.4608883261680603, "step": 1490 }, { "epoch": 0.39, "learning_rate": 3.225061157923348e-07, "logits/chosen": -4.692256450653076, "logits/rejected": -4.6152777671813965, "logps/chosen": -618.080322265625, "logps/rejected": -493.81591796875, "loss": 0.5814, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06875093281269073, "rewards/margins": 0.5014289021492004, "rewards/rejected": -0.4326779246330261, "step": 1500 }, { "epoch": 0.39, "learning_rate": 3.211470508290296e-07, "logits/chosen": -5.330389499664307, "logits/rejected": -4.813447952270508, "logps/chosen": -640.2445068359375, "logps/rejected": -463.66876220703125, "loss": 0.6027, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.01199892908334732, "rewards/margins": 0.4153195917606354, "rewards/rejected": -0.4273185133934021, "step": 1510 }, { "epoch": 0.39, "learning_rate": 3.1978798586572435e-07, "logits/chosen": -5.160652160644531, "logits/rejected": -5.030102252960205, "logps/chosen": -672.4517822265625, "logps/rejected": -530.814697265625, "loss": 0.567, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.07595814764499664, "rewards/margins": 0.6065148115158081, "rewards/rejected": -0.5305566787719727, "step": 1520 }, { "epoch": 0.4, "learning_rate": 3.1842892090241913e-07, "logits/chosen": -4.820624828338623, "logits/rejected": -4.647487640380859, "logps/chosen": -591.4603271484375, "logps/rejected": -454.6068420410156, "loss": 0.6181, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.07786037027835846, "rewards/margins": 0.39148765802383423, "rewards/rejected": -0.4693480134010315, "step": 1530 }, { "epoch": 0.4, "learning_rate": 3.1706985593911387e-07, "logits/chosen": -5.017200469970703, "logits/rejected": -4.60324239730835, "logps/chosen": -649.9214477539062, "logps/rejected": -520.6214599609375, "loss": 0.6101, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0727464109659195, "rewards/margins": 0.34763583540916443, "rewards/rejected": -0.27488940954208374, "step": 1540 }, { "epoch": 0.4, "learning_rate": 3.157107909758086e-07, "logits/chosen": -4.8267669677734375, "logits/rejected": -4.830237865447998, "logps/chosen": -747.1302490234375, "logps/rejected": -542.8424072265625, "loss": 0.5405, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.07756751030683517, "rewards/margins": 0.639870822429657, "rewards/rejected": -0.5623033046722412, "step": 1550 }, { "epoch": 0.4, "learning_rate": 3.143517260125034e-07, "logits/chosen": -4.978399276733398, "logits/rejected": -5.0023088455200195, "logps/chosen": -533.4491577148438, "logps/rejected": -406.94329833984375, "loss": 0.6764, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08706123381853104, "rewards/margins": 0.35440611839294434, "rewards/rejected": -0.44146737456321716, "step": 1560 }, { "epoch": 0.41, "learning_rate": 3.129926610491981e-07, "logits/chosen": -5.211610794067383, "logits/rejected": -4.5253987312316895, "logps/chosen": -648.8171997070312, "logps/rejected": -461.8702697753906, "loss": 0.6241, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07190613448619843, "rewards/margins": 0.4674451947212219, "rewards/rejected": -0.3955390751361847, "step": 1570 }, { "epoch": 0.41, "learning_rate": 3.116335960858929e-07, "logits/chosen": -5.574213981628418, "logits/rejected": -4.852818012237549, "logps/chosen": -693.5277099609375, "logps/rejected": -520.2194213867188, "loss": 0.5759, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1971389502286911, "rewards/margins": 0.5434707403182983, "rewards/rejected": -0.34633177518844604, "step": 1580 }, { "epoch": 0.41, "learning_rate": 3.1027453112258765e-07, "logits/chosen": -5.164004325866699, "logits/rejected": -4.828025817871094, "logps/chosen": -518.7059326171875, "logps/rejected": -396.88983154296875, "loss": 0.5758, "rewards/accuracies": 0.75, "rewards/chosen": 0.023333771154284477, "rewards/margins": 0.512725830078125, "rewards/rejected": -0.48939210176467896, "step": 1590 }, { "epoch": 0.41, "learning_rate": 3.089154661592824e-07, "logits/chosen": -4.937495231628418, "logits/rejected": -4.612677574157715, "logps/chosen": -582.4169311523438, "logps/rejected": -454.89202880859375, "loss": 0.5465, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04642937704920769, "rewards/margins": 0.36292964220046997, "rewards/rejected": -0.3165002465248108, "step": 1600 }, { "epoch": 0.42, "learning_rate": 3.0755640119597717e-07, "logits/chosen": -4.931238174438477, "logits/rejected": -4.779752254486084, "logps/chosen": -601.8740234375, "logps/rejected": -576.3049926757812, "loss": 0.5925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.06317490339279175, "rewards/margins": 0.41317835450172424, "rewards/rejected": -0.3500033915042877, "step": 1610 }, { "epoch": 0.42, "learning_rate": 3.061973362326719e-07, "logits/chosen": -5.196651458740234, "logits/rejected": -4.927478313446045, "logps/chosen": -576.1328125, "logps/rejected": -465.5516662597656, "loss": 0.5701, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0703674778342247, "rewards/margins": 0.5130087733268738, "rewards/rejected": -0.44264134764671326, "step": 1620 }, { "epoch": 0.42, "learning_rate": 3.048382712693667e-07, "logits/chosen": -4.793614864349365, "logits/rejected": -4.842940807342529, "logps/chosen": -599.1259765625, "logps/rejected": -518.8648071289062, "loss": 0.5838, "rewards/accuracies": 0.625, "rewards/chosen": 0.038860831409692764, "rewards/margins": 0.4439505934715271, "rewards/rejected": -0.40508976578712463, "step": 1630 }, { "epoch": 0.42, "learning_rate": 3.034792063060614e-07, "logits/chosen": -5.246109962463379, "logits/rejected": -4.256648063659668, "logps/chosen": -623.5345458984375, "logps/rejected": -469.36260986328125, "loss": 0.5893, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0005898177623748779, "rewards/margins": 0.333055317401886, "rewards/rejected": -0.3324654698371887, "step": 1640 }, { "epoch": 0.43, "learning_rate": 3.0212014134275616e-07, "logits/chosen": -4.698599815368652, "logits/rejected": -4.9277753829956055, "logps/chosen": -537.7254028320312, "logps/rejected": -432.79254150390625, "loss": 0.5716, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03875481337308884, "rewards/margins": 0.4677085876464844, "rewards/rejected": -0.4289538264274597, "step": 1650 }, { "epoch": 0.43, "learning_rate": 3.007610763794509e-07, "logits/chosen": -5.468576431274414, "logits/rejected": -5.0482497215271, "logps/chosen": -577.8678588867188, "logps/rejected": -448.755859375, "loss": 0.6101, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.03977590426802635, "rewards/margins": 0.40795421600341797, "rewards/rejected": -0.3681783080101013, "step": 1660 }, { "epoch": 0.43, "learning_rate": 2.994020114161457e-07, "logits/chosen": -5.248682975769043, "logits/rejected": -4.796721935272217, "logps/chosen": -526.3560791015625, "logps/rejected": -412.30401611328125, "loss": 0.5786, "rewards/accuracies": 0.625, "rewards/chosen": -0.009044056758284569, "rewards/margins": 0.3918320834636688, "rewards/rejected": -0.4008761942386627, "step": 1670 }, { "epoch": 0.43, "learning_rate": 2.9804294645284047e-07, "logits/chosen": -5.161639213562012, "logits/rejected": -5.1383490562438965, "logps/chosen": -549.3775634765625, "logps/rejected": -435.27947998046875, "loss": 0.5723, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16379335522651672, "rewards/margins": 0.5625983476638794, "rewards/rejected": -0.39880499243736267, "step": 1680 }, { "epoch": 0.44, "learning_rate": 2.9668388148953515e-07, "logits/chosen": -4.992415428161621, "logits/rejected": -4.46937894821167, "logps/chosen": -583.8298950195312, "logps/rejected": -446.05267333984375, "loss": 0.5645, "rewards/accuracies": 0.75, "rewards/chosen": 0.13408790528774261, "rewards/margins": 0.4382530152797699, "rewards/rejected": -0.3041651248931885, "step": 1690 }, { "epoch": 0.44, "learning_rate": 2.9532481652622994e-07, "logits/chosen": -5.193965911865234, "logits/rejected": -5.035892963409424, "logps/chosen": -526.388427734375, "logps/rejected": -451.8531188964844, "loss": 0.6048, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.008205227553844452, "rewards/margins": 0.4235307276248932, "rewards/rejected": -0.4317359924316406, "step": 1700 }, { "epoch": 0.44, "learning_rate": 2.9396575156292467e-07, "logits/chosen": -4.930581092834473, "logits/rejected": -4.750922203063965, "logps/chosen": -611.9539794921875, "logps/rejected": -419.3814392089844, "loss": 0.6115, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06710588932037354, "rewards/margins": 0.4377163350582123, "rewards/rejected": -0.37061044573783875, "step": 1710 }, { "epoch": 0.44, "learning_rate": 2.9260668659961946e-07, "logits/chosen": -5.439746856689453, "logits/rejected": -5.209362983703613, "logps/chosen": -657.6182861328125, "logps/rejected": -513.0152587890625, "loss": 0.5919, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.03497297689318657, "rewards/margins": 0.4590676724910736, "rewards/rejected": -0.42409467697143555, "step": 1720 }, { "epoch": 0.45, "learning_rate": 2.9124762163631425e-07, "logits/chosen": -4.818249702453613, "logits/rejected": -4.768403053283691, "logps/chosen": -653.0007934570312, "logps/rejected": -423.8858947753906, "loss": 0.523, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.21503737568855286, "rewards/margins": 0.66923987865448, "rewards/rejected": -0.4542025029659271, "step": 1730 }, { "epoch": 0.45, "learning_rate": 2.8988855667300893e-07, "logits/chosen": -5.165972709655762, "logits/rejected": -4.13810920715332, "logps/chosen": -563.0274047851562, "logps/rejected": -381.11297607421875, "loss": 0.6047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02564326301217079, "rewards/margins": 0.4730495810508728, "rewards/rejected": -0.49869289994239807, "step": 1740 }, { "epoch": 0.45, "learning_rate": 2.885294917097037e-07, "logits/chosen": -5.053162574768066, "logits/rejected": -4.775099277496338, "logps/chosen": -672.1310424804688, "logps/rejected": -519.7974243164062, "loss": 0.5661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14415240287780762, "rewards/margins": 0.4557925760746002, "rewards/rejected": -0.3116401433944702, "step": 1750 }, { "epoch": 0.45, "learning_rate": 2.8717042674639845e-07, "logits/chosen": -5.262818336486816, "logits/rejected": -5.100151062011719, "logps/chosen": -615.7294921875, "logps/rejected": -467.1444396972656, "loss": 0.5634, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.004898411221802235, "rewards/margins": 0.5748053789138794, "rewards/rejected": -0.569907009601593, "step": 1760 }, { "epoch": 0.46, "learning_rate": 2.8581136178309324e-07, "logits/chosen": -5.110268592834473, "logits/rejected": -4.89237117767334, "logps/chosen": -627.622802734375, "logps/rejected": -493.29803466796875, "loss": 0.5815, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.06722798198461533, "rewards/margins": 0.4168556332588196, "rewards/rejected": -0.3496275842189789, "step": 1770 }, { "epoch": 0.46, "learning_rate": 2.84452296819788e-07, "logits/chosen": -5.134549617767334, "logits/rejected": -5.063877582550049, "logps/chosen": -519.733642578125, "logps/rejected": -483.128662109375, "loss": 0.6415, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.04302790015935898, "rewards/margins": 0.2304501086473465, "rewards/rejected": -0.27347803115844727, "step": 1780 }, { "epoch": 0.46, "learning_rate": 2.830932318564827e-07, "logits/chosen": -5.221256256103516, "logits/rejected": -4.87565803527832, "logps/chosen": -588.2337646484375, "logps/rejected": -459.95770263671875, "loss": 0.6025, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.08087799698114395, "rewards/margins": 0.4586857259273529, "rewards/rejected": -0.37780776619911194, "step": 1790 }, { "epoch": 0.46, "learning_rate": 2.817341668931775e-07, "logits/chosen": -5.366186618804932, "logits/rejected": -4.619801044464111, "logps/chosen": -584.1773681640625, "logps/rejected": -424.52178955078125, "loss": 0.5836, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.015188613906502724, "rewards/margins": 0.5031098127365112, "rewards/rejected": -0.48792123794555664, "step": 1800 }, { "epoch": 0.47, "learning_rate": 2.8037510192987223e-07, "logits/chosen": -5.331225395202637, "logits/rejected": -4.693115711212158, "logps/chosen": -568.12939453125, "logps/rejected": -415.4103088378906, "loss": 0.579, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09138574451208115, "rewards/margins": 0.46304386854171753, "rewards/rejected": -0.3716581463813782, "step": 1810 }, { "epoch": 0.47, "learning_rate": 2.79016036966567e-07, "logits/chosen": -5.22902774810791, "logits/rejected": -4.565291881561279, "logps/chosen": -663.4849243164062, "logps/rejected": -516.2536010742188, "loss": 0.5382, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14414827525615692, "rewards/margins": 0.7415136098861694, "rewards/rejected": -0.5973652601242065, "step": 1820 }, { "epoch": 0.47, "learning_rate": 2.7765697200326175e-07, "logits/chosen": -5.32174825668335, "logits/rejected": -5.004895210266113, "logps/chosen": -652.4281005859375, "logps/rejected": -464.1463928222656, "loss": 0.5295, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08608388900756836, "rewards/margins": 0.5602224469184875, "rewards/rejected": -0.47413843870162964, "step": 1830 }, { "epoch": 0.48, "learning_rate": 2.762979070399565e-07, "logits/chosen": -4.805876731872559, "logits/rejected": -4.962039947509766, "logps/chosen": -514.503662109375, "logps/rejected": -440.2759704589844, "loss": 0.5884, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10992634296417236, "rewards/margins": 0.4109458327293396, "rewards/rejected": -0.5208722352981567, "step": 1840 }, { "epoch": 0.48, "learning_rate": 2.749388420766512e-07, "logits/chosen": -5.237261772155762, "logits/rejected": -4.973998546600342, "logps/chosen": -624.6343383789062, "logps/rejected": -472.2704162597656, "loss": 0.5279, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.25611644983291626, "rewards/margins": 0.7832332849502563, "rewards/rejected": -0.5271168947219849, "step": 1850 }, { "epoch": 0.48, "learning_rate": 2.73579777113346e-07, "logits/chosen": -5.345922946929932, "logits/rejected": -5.159511089324951, "logps/chosen": -544.3359375, "logps/rejected": -404.75738525390625, "loss": 0.5395, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.018667107447981834, "rewards/margins": 0.5646753907203674, "rewards/rejected": -0.546008288860321, "step": 1860 }, { "epoch": 0.48, "learning_rate": 2.722207121500408e-07, "logits/chosen": -5.2390313148498535, "logits/rejected": -5.117612838745117, "logps/chosen": -517.3175048828125, "logps/rejected": -434.38482666015625, "loss": 0.5567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.030095791444182396, "rewards/margins": 0.4572904706001282, "rewards/rejected": -0.42719465494155884, "step": 1870 }, { "epoch": 0.49, "learning_rate": 2.708616471867355e-07, "logits/chosen": -4.929625988006592, "logits/rejected": -4.4647040367126465, "logps/chosen": -522.1158447265625, "logps/rejected": -373.1676330566406, "loss": 0.594, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08331742882728577, "rewards/margins": 0.4487723410129547, "rewards/rejected": -0.5320898294448853, "step": 1880 }, { "epoch": 0.49, "learning_rate": 2.6950258222343027e-07, "logits/chosen": -5.052488327026367, "logits/rejected": -4.925723552703857, "logps/chosen": -523.0936889648438, "logps/rejected": -422.98455810546875, "loss": 0.5874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.06005431339144707, "rewards/margins": 0.5165280103683472, "rewards/rejected": -0.5765823125839233, "step": 1890 }, { "epoch": 0.49, "learning_rate": 2.68143517260125e-07, "logits/chosen": -5.030156135559082, "logits/rejected": -5.466166973114014, "logps/chosen": -566.7584228515625, "logps/rejected": -479.91705322265625, "loss": 0.5815, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009148592129349709, "rewards/margins": 0.3955579996109009, "rewards/rejected": -0.4047066271305084, "step": 1900 }, { "epoch": 0.49, "learning_rate": 2.667844522968198e-07, "logits/chosen": -5.068789482116699, "logits/rejected": -4.769230842590332, "logps/chosen": -615.9959716796875, "logps/rejected": -434.10491943359375, "loss": 0.5802, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1012858897447586, "rewards/margins": 0.6581977605819702, "rewards/rejected": -0.5569119453430176, "step": 1910 }, { "epoch": 0.5, "learning_rate": 2.654253873335146e-07, "logits/chosen": -5.509184837341309, "logits/rejected": -4.679028511047363, "logps/chosen": -683.6079711914062, "logps/rejected": -507.5233459472656, "loss": 0.5501, "rewards/accuracies": 0.8125, "rewards/chosen": 0.13926604390144348, "rewards/margins": 0.7854963541030884, "rewards/rejected": -0.6462303400039673, "step": 1920 }, { "epoch": 0.5, "learning_rate": 2.6406632237020926e-07, "logits/chosen": -4.794188022613525, "logits/rejected": -5.179261684417725, "logps/chosen": -580.4613647460938, "logps/rejected": -535.4896240234375, "loss": 0.6086, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0010952949523925781, "rewards/margins": 0.41253209114074707, "rewards/rejected": -0.4114367365837097, "step": 1930 }, { "epoch": 0.5, "learning_rate": 2.6270725740690404e-07, "logits/chosen": -4.888454437255859, "logits/rejected": -4.942262172698975, "logps/chosen": -547.9248046875, "logps/rejected": -410.5484924316406, "loss": 0.5508, "rewards/accuracies": 0.75, "rewards/chosen": 0.097802072763443, "rewards/margins": 0.5982667207717896, "rewards/rejected": -0.500464677810669, "step": 1940 }, { "epoch": 0.5, "learning_rate": 2.613481924435988e-07, "logits/chosen": -5.1013078689575195, "logits/rejected": -4.519289493560791, "logps/chosen": -571.65966796875, "logps/rejected": -389.85186767578125, "loss": 0.5745, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03590717911720276, "rewards/margins": 0.5961285829544067, "rewards/rejected": -0.5602214336395264, "step": 1950 }, { "epoch": 0.51, "learning_rate": 2.5998912748029357e-07, "logits/chosen": -5.285208225250244, "logits/rejected": -4.495665550231934, "logps/chosen": -663.1292114257812, "logps/rejected": -532.8426513671875, "loss": 0.5878, "rewards/accuracies": 0.6875, "rewards/chosen": 0.021974461153149605, "rewards/margins": 0.43202877044677734, "rewards/rejected": -0.41005435585975647, "step": 1960 }, { "epoch": 0.51, "learning_rate": 2.586300625169883e-07, "logits/chosen": -5.222224235534668, "logits/rejected": -4.825186729431152, "logps/chosen": -573.8010864257812, "logps/rejected": -416.4983825683594, "loss": 0.5527, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.11975344270467758, "rewards/margins": 0.6545025110244751, "rewards/rejected": -0.5347490906715393, "step": 1970 }, { "epoch": 0.51, "learning_rate": 2.5727099755368303e-07, "logits/chosen": -5.289813041687012, "logits/rejected": -4.617688179016113, "logps/chosen": -560.8759155273438, "logps/rejected": -427.9178771972656, "loss": 0.5826, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.022764435037970543, "rewards/margins": 0.5342342257499695, "rewards/rejected": -0.5569986701011658, "step": 1980 }, { "epoch": 0.51, "learning_rate": 2.559119325903778e-07, "logits/chosen": -4.576651096343994, "logits/rejected": -4.732336521148682, "logps/chosen": -530.8504638671875, "logps/rejected": -465.09637451171875, "loss": 0.5855, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.021763667464256287, "rewards/margins": 0.4931577146053314, "rewards/rejected": -0.47139400243759155, "step": 1990 }, { "epoch": 0.52, "learning_rate": 2.5455286762707256e-07, "logits/chosen": -5.359576225280762, "logits/rejected": -4.76473331451416, "logps/chosen": -673.8704833984375, "logps/rejected": -438.1480407714844, "loss": 0.5306, "rewards/accuracies": 0.75, "rewards/chosen": 0.14796891808509827, "rewards/margins": 0.6954831480979919, "rewards/rejected": -0.5475142598152161, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": -5.148571968078613, "eval_logits/rejected": -4.8584794998168945, "eval_logps/chosen": -588.2166137695312, "eval_logps/rejected": -447.5611572265625, "eval_loss": 0.5739557147026062, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.005295886192470789, "eval_rewards/margins": 0.5073610544204712, "eval_rewards/rejected": -0.5020651817321777, "eval_runtime": 108.0014, "eval_samples_per_second": 18.518, "eval_steps_per_second": 1.157, "step": 2000 }, { "epoch": 0.52, "learning_rate": 2.5319380266376734e-07, "logits/chosen": -4.752730369567871, "logits/rejected": -4.137943267822266, "logps/chosen": -574.3436279296875, "logps/rejected": -465.2110290527344, "loss": 0.6064, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.06843885034322739, "rewards/margins": 0.35540246963500977, "rewards/rejected": -0.42384132742881775, "step": 2010 }, { "epoch": 0.52, "learning_rate": 2.518347377004621e-07, "logits/chosen": -4.942543029785156, "logits/rejected": -4.652976036071777, "logps/chosen": -639.5670166015625, "logps/rejected": -511.5428161621094, "loss": 0.612, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008806949481368065, "rewards/margins": 0.37993431091308594, "rewards/rejected": -0.38874128460884094, "step": 2020 }, { "epoch": 0.52, "learning_rate": 2.504756727371568e-07, "logits/chosen": -5.181532859802246, "logits/rejected": -5.063170433044434, "logps/chosen": -585.638671875, "logps/rejected": -500.2425231933594, "loss": 0.5583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03146061673760414, "rewards/margins": 0.488603413105011, "rewards/rejected": -0.45714277029037476, "step": 2030 }, { "epoch": 0.53, "learning_rate": 2.4911660777385155e-07, "logits/chosen": -5.165438652038574, "logits/rejected": -4.691048622131348, "logps/chosen": -624.2686767578125, "logps/rejected": -427.2496032714844, "loss": 0.5458, "rewards/accuracies": 0.6875, "rewards/chosen": 0.08551664650440216, "rewards/margins": 0.6588252186775208, "rewards/rejected": -0.5733085870742798, "step": 2040 }, { "epoch": 0.53, "learning_rate": 2.4775754281054634e-07, "logits/chosen": -5.296277046203613, "logits/rejected": -5.355370044708252, "logps/chosen": -531.8514404296875, "logps/rejected": -429.61737060546875, "loss": 0.5616, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.005439861677587032, "rewards/margins": 0.6196398735046387, "rewards/rejected": -0.6141999959945679, "step": 2050 }, { "epoch": 0.53, "learning_rate": 2.4639847784724107e-07, "logits/chosen": -4.849526405334473, "logits/rejected": -4.464692115783691, "logps/chosen": -627.7634887695312, "logps/rejected": -506.77215576171875, "loss": 0.5663, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.09366675466299057, "rewards/margins": 0.5064713358879089, "rewards/rejected": -0.4128045439720154, "step": 2060 }, { "epoch": 0.53, "learning_rate": 2.4503941288393586e-07, "logits/chosen": -4.814553260803223, "logits/rejected": -4.442749500274658, "logps/chosen": -575.5558471679688, "logps/rejected": -414.0263671875, "loss": 0.6023, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.10579456388950348, "rewards/margins": 0.7066904306411743, "rewards/rejected": -0.6008957624435425, "step": 2070 }, { "epoch": 0.54, "learning_rate": 2.436803479206306e-07, "logits/chosen": -5.186957359313965, "logits/rejected": -4.751301288604736, "logps/chosen": -653.7766723632812, "logps/rejected": -476.66162109375, "loss": 0.6414, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07173504680395126, "rewards/margins": 0.36066287755966187, "rewards/rejected": -0.43239790201187134, "step": 2080 }, { "epoch": 0.54, "learning_rate": 2.423212829573253e-07, "logits/chosen": -5.053447246551514, "logits/rejected": -4.918185234069824, "logps/chosen": -549.3553466796875, "logps/rejected": -429.42919921875, "loss": 0.5791, "rewards/accuracies": 0.625, "rewards/chosen": -0.15459008514881134, "rewards/margins": 0.37957876920700073, "rewards/rejected": -0.5341688394546509, "step": 2090 }, { "epoch": 0.54, "learning_rate": 2.409622179940201e-07, "logits/chosen": -4.912445545196533, "logits/rejected": -4.6925859451293945, "logps/chosen": -432.07452392578125, "logps/rejected": -340.0139465332031, "loss": 0.5751, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029613960534334183, "rewards/margins": 0.48295989632606506, "rewards/rejected": -0.512573778629303, "step": 2100 }, { "epoch": 0.54, "learning_rate": 2.3960315303071485e-07, "logits/chosen": -5.279432773590088, "logits/rejected": -4.552325248718262, "logps/chosen": -553.3790283203125, "logps/rejected": -460.07427978515625, "loss": 0.6008, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.06708192080259323, "rewards/margins": 0.4622860550880432, "rewards/rejected": -0.3952041566371918, "step": 2110 }, { "epoch": 0.55, "learning_rate": 2.382440880674096e-07, "logits/chosen": -5.419906139373779, "logits/rejected": -5.2849860191345215, "logps/chosen": -593.1560668945312, "logps/rejected": -437.38720703125, "loss": 0.5862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0501575842499733, "rewards/margins": 0.41556042432785034, "rewards/rejected": -0.36540284752845764, "step": 2120 }, { "epoch": 0.55, "learning_rate": 2.3688502310410434e-07, "logits/chosen": -5.2504706382751465, "logits/rejected": -4.6304168701171875, "logps/chosen": -570.8204956054688, "logps/rejected": -433.6552734375, "loss": 0.6481, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12808158993721008, "rewards/margins": 0.49509382247924805, "rewards/rejected": -0.3670122027397156, "step": 2130 }, { "epoch": 0.55, "learning_rate": 2.3552595814079913e-07, "logits/chosen": -5.319705009460449, "logits/rejected": -4.581608772277832, "logps/chosen": -649.8846435546875, "logps/rejected": -481.82550048828125, "loss": 0.5648, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.2135562151670456, "rewards/margins": 0.6773136854171753, "rewards/rejected": -0.4637575149536133, "step": 2140 }, { "epoch": 0.56, "learning_rate": 2.341668931774939e-07, "logits/chosen": -5.048118591308594, "logits/rejected": -5.331495761871338, "logps/chosen": -596.2010498046875, "logps/rejected": -466.883544921875, "loss": 0.5177, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.11167912185192108, "rewards/margins": 0.675338089466095, "rewards/rejected": -0.5636589527130127, "step": 2150 }, { "epoch": 0.56, "learning_rate": 2.3280782821418863e-07, "logits/chosen": -5.044338703155518, "logits/rejected": -4.720290660858154, "logps/chosen": -599.9769287109375, "logps/rejected": -418.9681091308594, "loss": 0.5675, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.07666479051113129, "rewards/margins": 0.48825913667678833, "rewards/rejected": -0.41159430146217346, "step": 2160 }, { "epoch": 0.56, "learning_rate": 2.314487632508834e-07, "logits/chosen": -5.259824752807617, "logits/rejected": -4.929041385650635, "logps/chosen": -552.7124633789062, "logps/rejected": -404.2115478515625, "loss": 0.5555, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.050760865211486816, "rewards/margins": 0.4294905662536621, "rewards/rejected": -0.3787297308444977, "step": 2170 }, { "epoch": 0.56, "learning_rate": 2.3008969828757812e-07, "logits/chosen": -4.695530414581299, "logits/rejected": -4.929333686828613, "logps/chosen": -511.2328186035156, "logps/rejected": -422.66424560546875, "loss": 0.6039, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.011992475017905235, "rewards/margins": 0.38791900873184204, "rewards/rejected": -0.37592652440071106, "step": 2180 }, { "epoch": 0.57, "learning_rate": 2.2873063332427288e-07, "logits/chosen": -5.204197883605957, "logits/rejected": -4.672845363616943, "logps/chosen": -676.1151123046875, "logps/rejected": -544.1356811523438, "loss": 0.5929, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03945215791463852, "rewards/margins": 0.4905180037021637, "rewards/rejected": -0.4510658383369446, "step": 2190 }, { "epoch": 0.57, "learning_rate": 2.2737156836096762e-07, "logits/chosen": -4.961050987243652, "logits/rejected": -5.060594081878662, "logps/chosen": -606.8833618164062, "logps/rejected": -515.282470703125, "loss": 0.5516, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.14587000012397766, "rewards/margins": 0.57590252161026, "rewards/rejected": -0.43003249168395996, "step": 2200 }, { "epoch": 0.57, "learning_rate": 2.260125033976624e-07, "logits/chosen": -5.351556301116943, "logits/rejected": -4.766120910644531, "logps/chosen": -614.4051513671875, "logps/rejected": -479.23614501953125, "loss": 0.56, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12179956585168839, "rewards/margins": 0.5328482985496521, "rewards/rejected": -0.4110487401485443, "step": 2210 }, { "epoch": 0.57, "learning_rate": 2.2465343843435717e-07, "logits/chosen": -5.310830116271973, "logits/rejected": -4.890834808349609, "logps/chosen": -605.814453125, "logps/rejected": -474.8689880371094, "loss": 0.58, "rewards/accuracies": 0.75, "rewards/chosen": 0.025691330432891846, "rewards/margins": 0.5121801495552063, "rewards/rejected": -0.4864887595176697, "step": 2220 }, { "epoch": 0.58, "learning_rate": 2.232943734710519e-07, "logits/chosen": -5.086139678955078, "logits/rejected": -4.888861656188965, "logps/chosen": -586.828857421875, "logps/rejected": -441.39117431640625, "loss": 0.5311, "rewards/accuracies": 0.75, "rewards/chosen": 0.05895073339343071, "rewards/margins": 0.6442952156066895, "rewards/rejected": -0.5853445529937744, "step": 2230 }, { "epoch": 0.58, "learning_rate": 2.2193530850774666e-07, "logits/chosen": -5.551673412322998, "logits/rejected": -4.797183036804199, "logps/chosen": -568.6909790039062, "logps/rejected": -442.3128967285156, "loss": 0.587, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08748052269220352, "rewards/margins": 0.5034887194633484, "rewards/rejected": -0.4160081744194031, "step": 2240 }, { "epoch": 0.58, "learning_rate": 2.205762435444414e-07, "logits/chosen": -4.75910758972168, "logits/rejected": -4.565986156463623, "logps/chosen": -667.8187866210938, "logps/rejected": -495.55218505859375, "loss": 0.5571, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.13216033577919006, "rewards/margins": 0.6989060640335083, "rewards/rejected": -0.5667458176612854, "step": 2250 }, { "epoch": 0.58, "learning_rate": 2.1921717858113616e-07, "logits/chosen": -4.912562370300293, "logits/rejected": -4.5314154624938965, "logps/chosen": -604.3954467773438, "logps/rejected": -406.6845703125, "loss": 0.5741, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11295922845602036, "rewards/margins": 0.5721455812454224, "rewards/rejected": -0.459186315536499, "step": 2260 }, { "epoch": 0.59, "learning_rate": 2.1785811361783094e-07, "logits/chosen": -5.262351989746094, "logits/rejected": -4.794454574584961, "logps/chosen": -634.1242065429688, "logps/rejected": -520.752685546875, "loss": 0.5552, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02443080209195614, "rewards/margins": 0.573106586933136, "rewards/rejected": -0.5486757159233093, "step": 2270 }, { "epoch": 0.59, "learning_rate": 2.1649904865452568e-07, "logits/chosen": -5.01981782913208, "logits/rejected": -4.642092704772949, "logps/chosen": -641.73681640625, "logps/rejected": -522.1800537109375, "loss": 0.6323, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.008827829733490944, "rewards/margins": 0.31641584634780884, "rewards/rejected": -0.32524368166923523, "step": 2280 }, { "epoch": 0.59, "learning_rate": 2.1513998369122044e-07, "logits/chosen": -5.0282673835754395, "logits/rejected": -4.697837829589844, "logps/chosen": -707.3143310546875, "logps/rejected": -492.3343200683594, "loss": 0.5052, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.1378926932811737, "rewards/margins": 0.8396459817886353, "rewards/rejected": -0.7017532587051392, "step": 2290 }, { "epoch": 0.59, "learning_rate": 2.1378091872791517e-07, "logits/chosen": -5.313849925994873, "logits/rejected": -4.932788848876953, "logps/chosen": -605.4910888671875, "logps/rejected": -440.771728515625, "loss": 0.6109, "rewards/accuracies": 0.625, "rewards/chosen": -0.0297638438642025, "rewards/margins": 0.40913066267967224, "rewards/rejected": -0.43889445066452026, "step": 2300 }, { "epoch": 0.6, "learning_rate": 2.1242185376460994e-07, "logits/chosen": -5.261561393737793, "logits/rejected": -5.213662624359131, "logps/chosen": -527.4952392578125, "logps/rejected": -468.1676330566406, "loss": 0.5384, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.10785374790430069, "rewards/margins": 0.5355297923088074, "rewards/rejected": -0.4276760518550873, "step": 2310 }, { "epoch": 0.6, "learning_rate": 2.1106278880130467e-07, "logits/chosen": -5.175314903259277, "logits/rejected": -4.972376346588135, "logps/chosen": -579.1510009765625, "logps/rejected": -558.7515869140625, "loss": 0.5673, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.01355088222771883, "rewards/margins": 0.4957160949707031, "rewards/rejected": -0.48216524720191956, "step": 2320 }, { "epoch": 0.6, "learning_rate": 2.0970372383799946e-07, "logits/chosen": -5.064365386962891, "logits/rejected": -4.731338024139404, "logps/chosen": -484.03662109375, "logps/rejected": -456.25750732421875, "loss": 0.5479, "rewards/accuracies": 0.6875, "rewards/chosen": -0.035017192363739014, "rewards/margins": 0.5290455222129822, "rewards/rejected": -0.5640627145767212, "step": 2330 }, { "epoch": 0.6, "learning_rate": 2.0834465887469422e-07, "logits/chosen": -5.324121475219727, "logits/rejected": -4.988900184631348, "logps/chosen": -694.1812133789062, "logps/rejected": -493.7262268066406, "loss": 0.5393, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16435925662517548, "rewards/margins": 0.5418882966041565, "rewards/rejected": -0.3775290548801422, "step": 2340 }, { "epoch": 0.61, "learning_rate": 2.0698559391138895e-07, "logits/chosen": -5.1242194175720215, "logits/rejected": -5.189187049865723, "logps/chosen": -547.3629150390625, "logps/rejected": -401.1067199707031, "loss": 0.5793, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.016556579619646072, "rewards/margins": 0.5332701802253723, "rewards/rejected": -0.5498267412185669, "step": 2350 }, { "epoch": 0.61, "learning_rate": 2.0562652894808371e-07, "logits/chosen": -5.232357501983643, "logits/rejected": -4.684348106384277, "logps/chosen": -600.7597045898438, "logps/rejected": -423.6444396972656, "loss": 0.5716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.05882176756858826, "rewards/margins": 0.5666114091873169, "rewards/rejected": -0.5077896118164062, "step": 2360 }, { "epoch": 0.61, "learning_rate": 2.0426746398477845e-07, "logits/chosen": -4.743066310882568, "logits/rejected": -5.0457682609558105, "logps/chosen": -621.16943359375, "logps/rejected": -562.2957763671875, "loss": 0.5733, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.030701154842972755, "rewards/margins": 0.49867868423461914, "rewards/rejected": -0.4679775834083557, "step": 2370 }, { "epoch": 0.61, "learning_rate": 2.029083990214732e-07, "logits/chosen": -5.360964298248291, "logits/rejected": -4.910816192626953, "logps/chosen": -541.6188354492188, "logps/rejected": -409.1690368652344, "loss": 0.5836, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.00623717624694109, "rewards/margins": 0.5276843309402466, "rewards/rejected": -0.5214471817016602, "step": 2380 }, { "epoch": 0.62, "learning_rate": 2.0154933405816797e-07, "logits/chosen": -5.229551792144775, "logits/rejected": -5.032118797302246, "logps/chosen": -488.1885681152344, "logps/rejected": -373.1156311035156, "loss": 0.6244, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.14348170161247253, "rewards/margins": 0.24628177285194397, "rewards/rejected": -0.3897634446620941, "step": 2390 }, { "epoch": 0.62, "learning_rate": 2.0019026909486273e-07, "logits/chosen": -4.7342987060546875, "logits/rejected": -5.318948268890381, "logps/chosen": -520.8893432617188, "logps/rejected": -414.48193359375, "loss": 0.5789, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06965414434671402, "rewards/margins": 0.44729694724082947, "rewards/rejected": -0.5169510245323181, "step": 2400 }, { "epoch": 0.62, "learning_rate": 1.988312041315575e-07, "logits/chosen": -5.036027431488037, "logits/rejected": -5.126777172088623, "logps/chosen": -580.8709716796875, "logps/rejected": -451.442138671875, "loss": 0.5281, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22733895480632782, "rewards/margins": 0.6606284379959106, "rewards/rejected": -0.43328937888145447, "step": 2410 }, { "epoch": 0.62, "learning_rate": 1.9747213916825223e-07, "logits/chosen": -4.949835300445557, "logits/rejected": -4.850296974182129, "logps/chosen": -540.108154296875, "logps/rejected": -456.7802734375, "loss": 0.6362, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01635250262916088, "rewards/margins": 0.41821298003196716, "rewards/rejected": -0.4018605351448059, "step": 2420 }, { "epoch": 0.63, "learning_rate": 1.96113074204947e-07, "logits/chosen": -5.230714797973633, "logits/rejected": -4.870827674865723, "logps/chosen": -629.0115966796875, "logps/rejected": -472.3067321777344, "loss": 0.5569, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3579154908657074, "rewards/margins": 0.7859879732131958, "rewards/rejected": -0.42807239294052124, "step": 2430 }, { "epoch": 0.63, "learning_rate": 1.9475400924164172e-07, "logits/chosen": -5.072863578796387, "logits/rejected": -5.132185935974121, "logps/chosen": -481.25732421875, "logps/rejected": -406.0838317871094, "loss": 0.5747, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09596830606460571, "rewards/margins": 0.4968641698360443, "rewards/rejected": -0.400895893573761, "step": 2440 }, { "epoch": 0.63, "learning_rate": 1.933949442783365e-07, "logits/chosen": -5.207085609436035, "logits/rejected": -5.209758281707764, "logps/chosen": -663.7227172851562, "logps/rejected": -574.890869140625, "loss": 0.5902, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01712667942047119, "rewards/margins": 0.5995233654975891, "rewards/rejected": -0.5823966264724731, "step": 2450 }, { "epoch": 0.64, "learning_rate": 1.9203587931503127e-07, "logits/chosen": -5.274188995361328, "logits/rejected": -4.79066276550293, "logps/chosen": -641.5878295898438, "logps/rejected": -467.88970947265625, "loss": 0.5335, "rewards/accuracies": 0.75, "rewards/chosen": 0.10666131973266602, "rewards/margins": 0.626215934753418, "rewards/rejected": -0.5195545554161072, "step": 2460 }, { "epoch": 0.64, "learning_rate": 1.90676814351726e-07, "logits/chosen": -4.854439735412598, "logits/rejected": -4.647269248962402, "logps/chosen": -645.0352783203125, "logps/rejected": -485.135009765625, "loss": 0.5604, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11006246507167816, "rewards/margins": 0.5373518466949463, "rewards/rejected": -0.42728933691978455, "step": 2470 }, { "epoch": 0.64, "learning_rate": 1.8931774938842077e-07, "logits/chosen": -5.332976341247559, "logits/rejected": -5.005293369293213, "logps/chosen": -735.2457275390625, "logps/rejected": -513.5392456054688, "loss": 0.5612, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.019729632884263992, "rewards/margins": 0.49942612648010254, "rewards/rejected": -0.47969645261764526, "step": 2480 }, { "epoch": 0.64, "learning_rate": 1.879586844251155e-07, "logits/chosen": -5.085322856903076, "logits/rejected": -4.836029052734375, "logps/chosen": -508.7854919433594, "logps/rejected": -408.246337890625, "loss": 0.5654, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07745769619941711, "rewards/margins": 0.4990989565849304, "rewards/rejected": -0.5765566229820251, "step": 2490 }, { "epoch": 0.65, "learning_rate": 1.8659961946181026e-07, "logits/chosen": -5.203367233276367, "logits/rejected": -4.747181415557861, "logps/chosen": -633.173583984375, "logps/rejected": -483.2410583496094, "loss": 0.5208, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.09929683059453964, "rewards/margins": 0.6734989881515503, "rewards/rejected": -0.5742021799087524, "step": 2500 }, { "epoch": 0.65, "learning_rate": 1.85240554498505e-07, "logits/chosen": -5.12204647064209, "logits/rejected": -5.049520015716553, "logps/chosen": -613.7943115234375, "logps/rejected": -492.426513671875, "loss": 0.5893, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.09632813185453415, "rewards/margins": 0.46805983781814575, "rewards/rejected": -0.371731698513031, "step": 2510 }, { "epoch": 0.65, "learning_rate": 1.8388148953519978e-07, "logits/chosen": -5.151005744934082, "logits/rejected": -4.872684478759766, "logps/chosen": -588.597900390625, "logps/rejected": -445.72552490234375, "loss": 0.5419, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.16517074406147003, "rewards/margins": 0.5802798867225647, "rewards/rejected": -0.4151090979576111, "step": 2520 }, { "epoch": 0.65, "learning_rate": 1.8252242457189454e-07, "logits/chosen": -5.086743354797363, "logits/rejected": -5.152198791503906, "logps/chosen": -546.5130615234375, "logps/rejected": -409.61962890625, "loss": 0.5723, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08358633518218994, "rewards/margins": 0.44257277250289917, "rewards/rejected": -0.5261590480804443, "step": 2530 }, { "epoch": 0.66, "learning_rate": 1.8116335960858928e-07, "logits/chosen": -5.062109470367432, "logits/rejected": -4.863866329193115, "logps/chosen": -546.6769409179688, "logps/rejected": -462.1380310058594, "loss": 0.5698, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.009900592267513275, "rewards/margins": 0.36029139161109924, "rewards/rejected": -0.35039082169532776, "step": 2540 }, { "epoch": 0.66, "learning_rate": 1.7980429464528404e-07, "logits/chosen": -5.212429046630859, "logits/rejected": -5.2788190841674805, "logps/chosen": -579.0036010742188, "logps/rejected": -476.0892028808594, "loss": 0.5802, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.043891895562410355, "rewards/margins": 0.44694948196411133, "rewards/rejected": -0.40305763483047485, "step": 2550 }, { "epoch": 0.66, "learning_rate": 1.7844522968197877e-07, "logits/chosen": -5.256224632263184, "logits/rejected": -4.5983476638793945, "logps/chosen": -610.208251953125, "logps/rejected": -496.9752502441406, "loss": 0.5869, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.029510384425520897, "rewards/margins": 0.4173160195350647, "rewards/rejected": -0.44682639837265015, "step": 2560 }, { "epoch": 0.66, "learning_rate": 1.7708616471867354e-07, "logits/chosen": -5.25832462310791, "logits/rejected": -4.58776330947876, "logps/chosen": -606.7193603515625, "logps/rejected": -458.6946716308594, "loss": 0.5831, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.15029188990592957, "rewards/margins": 0.5948890447616577, "rewards/rejected": -0.4445970952510834, "step": 2570 }, { "epoch": 0.67, "learning_rate": 1.757270997553683e-07, "logits/chosen": -5.031914710998535, "logits/rejected": -4.41817569732666, "logps/chosen": -631.19873046875, "logps/rejected": -482.4442443847656, "loss": 0.5915, "rewards/accuracies": 0.75, "rewards/chosen": 0.216563418507576, "rewards/margins": 0.6215362548828125, "rewards/rejected": -0.40497273206710815, "step": 2580 }, { "epoch": 0.67, "learning_rate": 1.7436803479206306e-07, "logits/chosen": -5.128687381744385, "logits/rejected": -5.052613735198975, "logps/chosen": -527.7045288085938, "logps/rejected": -391.2369384765625, "loss": 0.5519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0025056705344468355, "rewards/margins": 0.5808178782463074, "rewards/rejected": -0.5833235383033752, "step": 2590 }, { "epoch": 0.67, "learning_rate": 1.7300896982875782e-07, "logits/chosen": -4.958249092102051, "logits/rejected": -4.750493049621582, "logps/chosen": -653.8189697265625, "logps/rejected": -429.29864501953125, "loss": 0.5353, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16785474121570587, "rewards/margins": 0.6959394216537476, "rewards/rejected": -0.5280846357345581, "step": 2600 }, { "epoch": 0.67, "learning_rate": 1.7164990486545255e-07, "logits/chosen": -5.113970756530762, "logits/rejected": -5.008670806884766, "logps/chosen": -636.5609741210938, "logps/rejected": -528.3571166992188, "loss": 0.5244, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.10236026346683502, "rewards/margins": 0.6292668581008911, "rewards/rejected": -0.5269066095352173, "step": 2610 }, { "epoch": 0.68, "learning_rate": 1.7029083990214731e-07, "logits/chosen": -5.374190807342529, "logits/rejected": -5.041703224182129, "logps/chosen": -589.2327270507812, "logps/rejected": -419.8196716308594, "loss": 0.5488, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04434085637331009, "rewards/margins": 0.5901876091957092, "rewards/rejected": -0.545846700668335, "step": 2620 }, { "epoch": 0.68, "learning_rate": 1.6893177493884205e-07, "logits/chosen": -4.970804691314697, "logits/rejected": -5.064300060272217, "logps/chosen": -590.119140625, "logps/rejected": -529.2374267578125, "loss": 0.5954, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.036118488758802414, "rewards/margins": 0.34446951746940613, "rewards/rejected": -0.3083510398864746, "step": 2630 }, { "epoch": 0.68, "learning_rate": 1.6757270997553684e-07, "logits/chosen": -5.2425689697265625, "logits/rejected": -4.580435276031494, "logps/chosen": -681.4793090820312, "logps/rejected": -452.87677001953125, "loss": 0.5824, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.19392366707324982, "rewards/margins": 0.6403516530990601, "rewards/rejected": -0.44642800092697144, "step": 2640 }, { "epoch": 0.68, "learning_rate": 1.662136450122316e-07, "logits/chosen": -5.0710859298706055, "logits/rejected": -4.760018348693848, "logps/chosen": -674.2054443359375, "logps/rejected": -565.3984985351562, "loss": 0.5999, "rewards/accuracies": 0.625, "rewards/chosen": 0.06712154299020767, "rewards/margins": 0.37311816215515137, "rewards/rejected": -0.3059965968132019, "step": 2650 }, { "epoch": 0.69, "learning_rate": 1.6485458004892633e-07, "logits/chosen": -5.095987796783447, "logits/rejected": -5.052186489105225, "logps/chosen": -544.0107421875, "logps/rejected": -435.6206970214844, "loss": 0.5912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01182224415242672, "rewards/margins": 0.4927147924900055, "rewards/rejected": -0.4808925986289978, "step": 2660 }, { "epoch": 0.69, "learning_rate": 1.634955150856211e-07, "logits/chosen": -5.013184547424316, "logits/rejected": -4.417869567871094, "logps/chosen": -619.71240234375, "logps/rejected": -465.03167724609375, "loss": 0.5593, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.10584282875061035, "rewards/margins": 0.553497850894928, "rewards/rejected": -0.44765496253967285, "step": 2670 }, { "epoch": 0.69, "learning_rate": 1.6213645012231583e-07, "logits/chosen": -5.066674709320068, "logits/rejected": -4.569228172302246, "logps/chosen": -624.6241455078125, "logps/rejected": -439.173583984375, "loss": 0.5335, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.07207348197698593, "rewards/margins": 0.6219204664230347, "rewards/rejected": -0.5498470067977905, "step": 2680 }, { "epoch": 0.69, "learning_rate": 1.607773851590106e-07, "logits/chosen": -4.710171222686768, "logits/rejected": -4.891881465911865, "logps/chosen": -572.812744140625, "logps/rejected": -463.7294006347656, "loss": 0.6075, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02544694021344185, "rewards/margins": 0.48878225684165955, "rewards/rejected": -0.4633353352546692, "step": 2690 }, { "epoch": 0.7, "learning_rate": 1.5941832019570535e-07, "logits/chosen": -5.484849452972412, "logits/rejected": -4.930639266967773, "logps/chosen": -625.0872802734375, "logps/rejected": -407.0397033691406, "loss": 0.6312, "rewards/accuracies": 0.625, "rewards/chosen": 0.016370752826333046, "rewards/margins": 0.48144835233688354, "rewards/rejected": -0.46507757902145386, "step": 2700 }, { "epoch": 0.7, "learning_rate": 1.580592552324001e-07, "logits/chosen": -4.830108642578125, "logits/rejected": -4.818240642547607, "logps/chosen": -765.7501831054688, "logps/rejected": -527.1919555664062, "loss": 0.5712, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.25509804487228394, "rewards/margins": 0.5200859308242798, "rewards/rejected": -0.2649877965450287, "step": 2710 }, { "epoch": 0.7, "learning_rate": 1.5670019026909487e-07, "logits/chosen": -5.286885738372803, "logits/rejected": -5.104211807250977, "logps/chosen": -604.0806884765625, "logps/rejected": -539.4612426757812, "loss": 0.5953, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06256647408008575, "rewards/margins": 0.41809743642807007, "rewards/rejected": -0.3555310070514679, "step": 2720 }, { "epoch": 0.7, "learning_rate": 1.553411253057896e-07, "logits/chosen": -5.280417442321777, "logits/rejected": -5.031546592712402, "logps/chosen": -564.3517456054688, "logps/rejected": -442.993408203125, "loss": 0.5995, "rewards/accuracies": 0.625, "rewards/chosen": 0.004562814719974995, "rewards/margins": 0.47682324051856995, "rewards/rejected": -0.4722604751586914, "step": 2730 }, { "epoch": 0.71, "learning_rate": 1.5398206034248437e-07, "logits/chosen": -5.209362983703613, "logits/rejected": -4.777202129364014, "logps/chosen": -545.34228515625, "logps/rejected": -498.712890625, "loss": 0.5853, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10138173401355743, "rewards/margins": 0.5011622309684753, "rewards/rejected": -0.3997804522514343, "step": 2740 }, { "epoch": 0.71, "learning_rate": 1.526229953791791e-07, "logits/chosen": -5.201174736022949, "logits/rejected": -4.462300777435303, "logps/chosen": -507.9359436035156, "logps/rejected": -392.84295654296875, "loss": 0.5746, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.07436896860599518, "rewards/margins": 0.6053969264030457, "rewards/rejected": -0.5310279726982117, "step": 2750 }, { "epoch": 0.71, "learning_rate": 1.5126393041587386e-07, "logits/chosen": -5.354833602905273, "logits/rejected": -5.260016441345215, "logps/chosen": -614.5958862304688, "logps/rejected": -445.5887145996094, "loss": 0.548, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12793684005737305, "rewards/margins": 0.6688116192817688, "rewards/rejected": -0.540874719619751, "step": 2760 }, { "epoch": 0.72, "learning_rate": 1.4990486545256862e-07, "logits/chosen": -5.430781364440918, "logits/rejected": -4.701030254364014, "logps/chosen": -675.9953002929688, "logps/rejected": -461.70526123046875, "loss": 0.5407, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17530107498168945, "rewards/margins": 0.6282079815864563, "rewards/rejected": -0.45290690660476685, "step": 2770 }, { "epoch": 0.72, "learning_rate": 1.4854580048926338e-07, "logits/chosen": -5.285678386688232, "logits/rejected": -5.013982772827148, "logps/chosen": -570.0582275390625, "logps/rejected": -415.4815979003906, "loss": 0.6081, "rewards/accuracies": 0.6875, "rewards/chosen": 0.014036163687705994, "rewards/margins": 0.5525720119476318, "rewards/rejected": -0.5385358929634094, "step": 2780 }, { "epoch": 0.72, "learning_rate": 1.4718673552595815e-07, "logits/chosen": -5.222121238708496, "logits/rejected": -4.650824546813965, "logps/chosen": -658.1751708984375, "logps/rejected": -433.31317138671875, "loss": 0.5892, "rewards/accuracies": 0.625, "rewards/chosen": 0.11640272289514542, "rewards/margins": 0.46268996596336365, "rewards/rejected": -0.34628722071647644, "step": 2790 }, { "epoch": 0.72, "learning_rate": 1.4582767056265288e-07, "logits/chosen": -5.346107006072998, "logits/rejected": -4.362074375152588, "logps/chosen": -606.3163452148438, "logps/rejected": -428.46954345703125, "loss": 0.591, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07154564559459686, "rewards/margins": 0.4669966697692871, "rewards/rejected": -0.5385423302650452, "step": 2800 }, { "epoch": 0.73, "learning_rate": 1.4446860559934764e-07, "logits/chosen": -5.480148792266846, "logits/rejected": -5.2202863693237305, "logps/chosen": -478.8058166503906, "logps/rejected": -412.5455627441406, "loss": 0.5717, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.02568097785115242, "rewards/margins": 0.44622331857681274, "rewards/rejected": -0.42054232954978943, "step": 2810 }, { "epoch": 0.73, "learning_rate": 1.4310954063604238e-07, "logits/chosen": -5.337462902069092, "logits/rejected": -4.77118444442749, "logps/chosen": -631.7322998046875, "logps/rejected": -428.6024475097656, "loss": 0.5114, "rewards/accuracies": 0.75, "rewards/chosen": 0.08751243352890015, "rewards/margins": 0.6282658576965332, "rewards/rejected": -0.5407534241676331, "step": 2820 }, { "epoch": 0.73, "learning_rate": 1.4175047567273716e-07, "logits/chosen": -4.882279396057129, "logits/rejected": -4.7415571212768555, "logps/chosen": -554.1456298828125, "logps/rejected": -440.29583740234375, "loss": 0.5925, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.027923833578824997, "rewards/margins": 0.5361341238021851, "rewards/rejected": -0.5640579462051392, "step": 2830 }, { "epoch": 0.73, "learning_rate": 1.4039141070943192e-07, "logits/chosen": -4.445671558380127, "logits/rejected": -4.818338394165039, "logps/chosen": -609.9766845703125, "logps/rejected": -476.7052307128906, "loss": 0.6746, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0833519846200943, "rewards/margins": 0.5305315852165222, "rewards/rejected": -0.4471796154975891, "step": 2840 }, { "epoch": 0.74, "learning_rate": 1.3903234574612666e-07, "logits/chosen": -4.992600917816162, "logits/rejected": -4.623105525970459, "logps/chosen": -629.71875, "logps/rejected": -438.7481994628906, "loss": 0.6014, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.04544510692358017, "rewards/margins": 0.5514675378799438, "rewards/rejected": -0.5060223937034607, "step": 2850 }, { "epoch": 0.74, "learning_rate": 1.3767328078282142e-07, "logits/chosen": -5.5516252517700195, "logits/rejected": -5.1864333152771, "logps/chosen": -638.9982299804688, "logps/rejected": -552.6722412109375, "loss": 0.594, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11496025323867798, "rewards/margins": 0.4150461256504059, "rewards/rejected": -0.3000858426094055, "step": 2860 }, { "epoch": 0.74, "learning_rate": 1.3631421581951615e-07, "logits/chosen": -5.008028507232666, "logits/rejected": -4.577816009521484, "logps/chosen": -548.2963256835938, "logps/rejected": -500.7344665527344, "loss": 0.6102, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.026208871975541115, "rewards/margins": 0.3533262610435486, "rewards/rejected": -0.37953513860702515, "step": 2870 }, { "epoch": 0.74, "learning_rate": 1.3495515085621091e-07, "logits/chosen": -5.472646713256836, "logits/rejected": -4.973770618438721, "logps/chosen": -724.769287109375, "logps/rejected": -563.2872924804688, "loss": 0.623, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.1146833673119545, "rewards/margins": 0.4099810719490051, "rewards/rejected": -0.29529768228530884, "step": 2880 }, { "epoch": 0.75, "learning_rate": 1.3359608589290568e-07, "logits/chosen": -4.954279899597168, "logits/rejected": -5.228877067565918, "logps/chosen": -581.0999145507812, "logps/rejected": -530.7953491210938, "loss": 0.5642, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11613529920578003, "rewards/margins": 0.453176885843277, "rewards/rejected": -0.33704158663749695, "step": 2890 }, { "epoch": 0.75, "learning_rate": 1.3223702092960044e-07, "logits/chosen": -5.242658615112305, "logits/rejected": -4.768167972564697, "logps/chosen": -533.3716430664062, "logps/rejected": -390.54547119140625, "loss": 0.5637, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.043281190097332, "rewards/margins": 0.43897300958633423, "rewards/rejected": -0.48225417733192444, "step": 2900 }, { "epoch": 0.75, "learning_rate": 1.308779559662952e-07, "logits/chosen": -5.201248645782471, "logits/rejected": -4.676947593688965, "logps/chosen": -643.8597412109375, "logps/rejected": -583.13232421875, "loss": 0.5888, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.14752855896949768, "rewards/margins": 0.2833397686481476, "rewards/rejected": -0.13581117987632751, "step": 2910 }, { "epoch": 0.75, "learning_rate": 1.2951889100298993e-07, "logits/chosen": -5.122186183929443, "logits/rejected": -5.041023254394531, "logps/chosen": -577.3592529296875, "logps/rejected": -440.177734375, "loss": 0.5703, "rewards/accuracies": 0.6875, "rewards/chosen": 0.05891375616192818, "rewards/margins": 0.4279320240020752, "rewards/rejected": -0.36901822686195374, "step": 2920 }, { "epoch": 0.76, "learning_rate": 1.281598260396847e-07, "logits/chosen": -5.211073875427246, "logits/rejected": -4.9518232345581055, "logps/chosen": -614.7965698242188, "logps/rejected": -439.0995178222656, "loss": 0.6065, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13381192088127136, "rewards/margins": 0.6412351131439209, "rewards/rejected": -0.5074232220649719, "step": 2930 }, { "epoch": 0.76, "learning_rate": 1.2680076107637943e-07, "logits/chosen": -5.193760871887207, "logits/rejected": -4.404151439666748, "logps/chosen": -622.8799438476562, "logps/rejected": -431.04461669921875, "loss": 0.5656, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1521753966808319, "rewards/margins": 0.6701608896255493, "rewards/rejected": -0.5179855227470398, "step": 2940 }, { "epoch": 0.76, "learning_rate": 1.2544169611307421e-07, "logits/chosen": -5.015482425689697, "logits/rejected": -4.603137016296387, "logps/chosen": -594.6808471679688, "logps/rejected": -462.0057678222656, "loss": 0.4977, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.07626765221357346, "rewards/margins": 0.6575822830200195, "rewards/rejected": -0.5813146829605103, "step": 2950 }, { "epoch": 0.76, "learning_rate": 1.2408263114976895e-07, "logits/chosen": -5.004543781280518, "logits/rejected": -4.734368801116943, "logps/chosen": -567.2260131835938, "logps/rejected": -404.7546081542969, "loss": 0.5255, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.07884837687015533, "rewards/margins": 0.6020263433456421, "rewards/rejected": -0.523177981376648, "step": 2960 }, { "epoch": 0.77, "learning_rate": 1.227235661864637e-07, "logits/chosen": -5.414193153381348, "logits/rejected": -5.046207904815674, "logps/chosen": -502.909912109375, "logps/rejected": -361.7054138183594, "loss": 0.5857, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06829454004764557, "rewards/margins": 0.5591669082641602, "rewards/rejected": -0.6274614930152893, "step": 2970 }, { "epoch": 0.77, "learning_rate": 1.2136450122315844e-07, "logits/chosen": -5.198843955993652, "logits/rejected": -4.930603981018066, "logps/chosen": -520.4276733398438, "logps/rejected": -431.8805236816406, "loss": 0.6266, "rewards/accuracies": 0.625, "rewards/chosen": 0.028726745396852493, "rewards/margins": 0.35937756299972534, "rewards/rejected": -0.33065086603164673, "step": 2980 }, { "epoch": 0.77, "learning_rate": 1.200054362598532e-07, "logits/chosen": -5.31064510345459, "logits/rejected": -4.599266529083252, "logps/chosen": -619.8414306640625, "logps/rejected": -396.1475524902344, "loss": 0.5163, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.18492308259010315, "rewards/margins": 0.7202444672584534, "rewards/rejected": -0.5353213548660278, "step": 2990 }, { "epoch": 0.77, "learning_rate": 1.1864637129654798e-07, "logits/chosen": -5.111018180847168, "logits/rejected": -4.847773551940918, "logps/chosen": -574.0250854492188, "logps/rejected": -485.5152282714844, "loss": 0.6036, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.050168585032224655, "rewards/margins": 0.470418781042099, "rewards/rejected": -0.5205873847007751, "step": 3000 }, { "epoch": 0.77, "eval_logits/chosen": -5.234292984008789, "eval_logits/rejected": -4.938849449157715, "eval_logps/chosen": -587.7192993164062, "eval_logps/rejected": -447.32525634765625, "eval_loss": 0.5676125288009644, "eval_rewards/accuracies": 0.6890000104904175, "eval_rewards/chosen": 0.05502856895327568, "eval_rewards/margins": 0.533500075340271, "eval_rewards/rejected": -0.4784714877605438, "eval_runtime": 107.5703, "eval_samples_per_second": 18.592, "eval_steps_per_second": 1.162, "step": 3000 }, { "epoch": 0.78, "learning_rate": 1.1728730633324273e-07, "logits/chosen": -5.1651787757873535, "logits/rejected": -5.090694427490234, "logps/chosen": -506.1864318847656, "logps/rejected": -488.7061462402344, "loss": 0.5813, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.18548956513404846, "rewards/margins": 0.39953848719596863, "rewards/rejected": -0.5850280523300171, "step": 3010 }, { "epoch": 0.78, "learning_rate": 1.1592824136993748e-07, "logits/chosen": -5.586142539978027, "logits/rejected": -5.10451602935791, "logps/chosen": -577.9089965820312, "logps/rejected": -501.72509765625, "loss": 0.5691, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.15978381037712097, "rewards/margins": 0.689250111579895, "rewards/rejected": -0.5294662714004517, "step": 3020 }, { "epoch": 0.78, "learning_rate": 1.1456917640663224e-07, "logits/chosen": -5.37423849105835, "logits/rejected": -5.371174335479736, "logps/chosen": -580.6159057617188, "logps/rejected": -504.47174072265625, "loss": 0.5753, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.08985177427530289, "rewards/margins": 0.42342209815979004, "rewards/rejected": -0.5132738351821899, "step": 3030 }, { "epoch": 0.78, "learning_rate": 1.1321011144332698e-07, "logits/chosen": -4.83787202835083, "logits/rejected": -4.70668363571167, "logps/chosen": -620.3161010742188, "logps/rejected": -518.7396850585938, "loss": 0.5764, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06751275807619095, "rewards/margins": 0.5135071277618408, "rewards/rejected": -0.44599437713623047, "step": 3040 }, { "epoch": 0.79, "learning_rate": 1.1185104648002173e-07, "logits/chosen": -5.008633136749268, "logits/rejected": -5.046548843383789, "logps/chosen": -627.8788452148438, "logps/rejected": -494.4703063964844, "loss": 0.5846, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.057392753660678864, "rewards/margins": 0.4596997797489166, "rewards/rejected": -0.40230703353881836, "step": 3050 }, { "epoch": 0.79, "learning_rate": 1.104919815167165e-07, "logits/chosen": -4.719240665435791, "logits/rejected": -4.67408561706543, "logps/chosen": -576.9584350585938, "logps/rejected": -475.76263427734375, "loss": 0.4916, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.07536741346120834, "rewards/margins": 0.6072790622711182, "rewards/rejected": -0.5319116115570068, "step": 3060 }, { "epoch": 0.79, "learning_rate": 1.0913291655341125e-07, "logits/chosen": -5.283504962921143, "logits/rejected": -5.348637580871582, "logps/chosen": -595.0512084960938, "logps/rejected": -455.53411865234375, "loss": 0.6054, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09285713732242584, "rewards/margins": 0.38092684745788574, "rewards/rejected": -0.2880697250366211, "step": 3070 }, { "epoch": 0.8, "learning_rate": 1.07773851590106e-07, "logits/chosen": -5.273705005645752, "logits/rejected": -5.187539577484131, "logps/chosen": -588.6239013671875, "logps/rejected": -506.1387634277344, "loss": 0.5378, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.06552986800670624, "rewards/margins": 0.5921922326087952, "rewards/rejected": -0.5266624093055725, "step": 3080 }, { "epoch": 0.8, "learning_rate": 1.0641478662680076e-07, "logits/chosen": -5.3834028244018555, "logits/rejected": -4.774810791015625, "logps/chosen": -650.7406616210938, "logps/rejected": -485.8941345214844, "loss": 0.5863, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07584445178508759, "rewards/margins": 0.4963054656982422, "rewards/rejected": -0.4204609990119934, "step": 3090 }, { "epoch": 0.8, "learning_rate": 1.0505572166349551e-07, "logits/chosen": -5.180682182312012, "logits/rejected": -4.717430591583252, "logps/chosen": -678.7384643554688, "logps/rejected": -479.06378173828125, "loss": 0.5281, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20926418900489807, "rewards/margins": 0.6056047081947327, "rewards/rejected": -0.39634042978286743, "step": 3100 }, { "epoch": 0.8, "learning_rate": 1.0369665670019026e-07, "logits/chosen": -5.130129814147949, "logits/rejected": -4.669539928436279, "logps/chosen": -597.5671997070312, "logps/rejected": -453.7167053222656, "loss": 0.5428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09353788197040558, "rewards/margins": 0.5858071446418762, "rewards/rejected": -0.49226921796798706, "step": 3110 }, { "epoch": 0.81, "learning_rate": 1.02337591736885e-07, "logits/chosen": -5.12376594543457, "logits/rejected": -5.563569068908691, "logps/chosen": -499.6468811035156, "logps/rejected": -436.5850524902344, "loss": 0.6206, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.010913821868598461, "rewards/margins": 0.49812451004981995, "rewards/rejected": -0.4872106909751892, "step": 3120 }, { "epoch": 0.81, "learning_rate": 1.0097852677357978e-07, "logits/chosen": -5.269471168518066, "logits/rejected": -5.030025005340576, "logps/chosen": -533.9810180664062, "logps/rejected": -399.9656066894531, "loss": 0.5404, "rewards/accuracies": 0.75, "rewards/chosen": 0.11613695323467255, "rewards/margins": 0.5177056193351746, "rewards/rejected": -0.4015687108039856, "step": 3130 }, { "epoch": 0.81, "learning_rate": 9.961946181027453e-08, "logits/chosen": -5.219923973083496, "logits/rejected": -5.189248085021973, "logps/chosen": -583.900634765625, "logps/rejected": -441.4908142089844, "loss": 0.5211, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.12448207288980484, "rewards/margins": 0.7087076306343079, "rewards/rejected": -0.5842255353927612, "step": 3140 }, { "epoch": 0.81, "learning_rate": 9.826039684696928e-08, "logits/chosen": -5.200379371643066, "logits/rejected": -4.829669952392578, "logps/chosen": -659.9589233398438, "logps/rejected": -435.9754943847656, "loss": 0.5538, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.06884004175662994, "rewards/margins": 0.5907555818557739, "rewards/rejected": -0.5219155550003052, "step": 3150 }, { "epoch": 0.82, "learning_rate": 9.690133188366404e-08, "logits/chosen": -5.091736793518066, "logits/rejected": -5.361365795135498, "logps/chosen": -556.8284912109375, "logps/rejected": -478.77838134765625, "loss": 0.5516, "rewards/accuracies": 0.6875, "rewards/chosen": 0.059255052357912064, "rewards/margins": 0.5096714496612549, "rewards/rejected": -0.4504164159297943, "step": 3160 }, { "epoch": 0.82, "learning_rate": 9.554226692035878e-08, "logits/chosen": -5.221390247344971, "logits/rejected": -4.96212100982666, "logps/chosen": -653.00341796875, "logps/rejected": -540.3280639648438, "loss": 0.5672, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1990213841199875, "rewards/margins": 0.5155045390129089, "rewards/rejected": -0.31648311018943787, "step": 3170 }, { "epoch": 0.82, "learning_rate": 9.418320195705353e-08, "logits/chosen": -5.2392683029174805, "logits/rejected": -4.992609024047852, "logps/chosen": -588.1959228515625, "logps/rejected": -433.735595703125, "loss": 0.5209, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.21357548236846924, "rewards/margins": 0.7488683462142944, "rewards/rejected": -0.5352928042411804, "step": 3180 }, { "epoch": 0.82, "learning_rate": 9.28241369937483e-08, "logits/chosen": -5.30230188369751, "logits/rejected": -5.239448547363281, "logps/chosen": -568.1078491210938, "logps/rejected": -474.8052673339844, "loss": 0.5752, "rewards/accuracies": 0.625, "rewards/chosen": 0.11839760839939117, "rewards/margins": 0.46041935682296753, "rewards/rejected": -0.34202176332473755, "step": 3190 }, { "epoch": 0.83, "learning_rate": 9.146507203044305e-08, "logits/chosen": -5.218874931335449, "logits/rejected": -4.873734474182129, "logps/chosen": -609.4281005859375, "logps/rejected": -457.5322265625, "loss": 0.5636, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1006079763174057, "rewards/margins": 0.5693811178207397, "rewards/rejected": -0.4687730669975281, "step": 3200 }, { "epoch": 0.83, "learning_rate": 9.01060070671378e-08, "logits/chosen": -5.503744125366211, "logits/rejected": -5.236392498016357, "logps/chosen": -586.1392822265625, "logps/rejected": -444.5675354003906, "loss": 0.5376, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.03827662393450737, "rewards/margins": 0.5034765005111694, "rewards/rejected": -0.46519985795021057, "step": 3210 }, { "epoch": 0.83, "learning_rate": 8.874694210383256e-08, "logits/chosen": -5.066947937011719, "logits/rejected": -4.900802135467529, "logps/chosen": -595.0233154296875, "logps/rejected": -496.80877685546875, "loss": 0.5562, "rewards/accuracies": 0.6875, "rewards/chosen": 0.029369166120886803, "rewards/margins": 0.544265866279602, "rewards/rejected": -0.5148966908454895, "step": 3220 }, { "epoch": 0.83, "learning_rate": 8.738787714052731e-08, "logits/chosen": -5.082206726074219, "logits/rejected": -5.058773040771484, "logps/chosen": -645.4029541015625, "logps/rejected": -446.66766357421875, "loss": 0.5378, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19253723323345184, "rewards/margins": 0.5689257979393005, "rewards/rejected": -0.3763886094093323, "step": 3230 }, { "epoch": 0.84, "learning_rate": 8.602881217722206e-08, "logits/chosen": -5.70505428314209, "logits/rejected": -4.710862636566162, "logps/chosen": -569.6564331054688, "logps/rejected": -432.09222412109375, "loss": 0.5233, "rewards/accuracies": 0.75, "rewards/chosen": 0.011319099925458431, "rewards/margins": 0.6908237338066101, "rewards/rejected": -0.6795046329498291, "step": 3240 }, { "epoch": 0.84, "learning_rate": 8.466974721391682e-08, "logits/chosen": -5.539618015289307, "logits/rejected": -5.331404209136963, "logps/chosen": -576.7149658203125, "logps/rejected": -434.1064453125, "loss": 0.5512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.059493519365787506, "rewards/margins": 0.5593506693840027, "rewards/rejected": -0.4998571276664734, "step": 3250 }, { "epoch": 0.84, "learning_rate": 8.331068225061158e-08, "logits/chosen": -5.128886699676514, "logits/rejected": -5.121006011962891, "logps/chosen": -577.34130859375, "logps/rejected": -516.7703857421875, "loss": 0.6106, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04062025994062424, "rewards/margins": 0.4897204339504242, "rewards/rejected": -0.44910019636154175, "step": 3260 }, { "epoch": 0.84, "learning_rate": 8.195161728730633e-08, "logits/chosen": -5.2884345054626465, "logits/rejected": -4.900949478149414, "logps/chosen": -692.1761474609375, "logps/rejected": -483.1785583496094, "loss": 0.5496, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1615811139345169, "rewards/margins": 0.6961567401885986, "rewards/rejected": -0.5345755815505981, "step": 3270 }, { "epoch": 0.85, "learning_rate": 8.059255232400109e-08, "logits/chosen": -5.02389669418335, "logits/rejected": -4.964724540710449, "logps/chosen": -695.4935302734375, "logps/rejected": -528.0411376953125, "loss": 0.5498, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15914632380008698, "rewards/margins": 0.5235811471939087, "rewards/rejected": -0.36443477869033813, "step": 3280 }, { "epoch": 0.85, "learning_rate": 7.923348736069584e-08, "logits/chosen": -5.2003889083862305, "logits/rejected": -5.228209018707275, "logps/chosen": -537.4274291992188, "logps/rejected": -456.5669860839844, "loss": 0.5781, "rewards/accuracies": 0.75, "rewards/chosen": 0.11098973453044891, "rewards/margins": 0.608254611492157, "rewards/rejected": -0.49726492166519165, "step": 3290 }, { "epoch": 0.85, "learning_rate": 7.787442239739058e-08, "logits/chosen": -5.259240627288818, "logits/rejected": -4.512152671813965, "logps/chosen": -690.8038330078125, "logps/rejected": -444.99761962890625, "loss": 0.5395, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23449468612670898, "rewards/margins": 0.8231126070022583, "rewards/rejected": -0.5886179208755493, "step": 3300 }, { "epoch": 0.85, "learning_rate": 7.651535743408535e-08, "logits/chosen": -5.13831090927124, "logits/rejected": -4.875172138214111, "logps/chosen": -591.1875610351562, "logps/rejected": -428.94610595703125, "loss": 0.5582, "rewards/accuracies": 0.75, "rewards/chosen": 0.1050385981798172, "rewards/margins": 0.5936521291732788, "rewards/rejected": -0.4886136054992676, "step": 3310 }, { "epoch": 0.86, "learning_rate": 7.515629247078011e-08, "logits/chosen": -5.013466835021973, "logits/rejected": -5.039238929748535, "logps/chosen": -693.2198486328125, "logps/rejected": -527.318359375, "loss": 0.5254, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19377581775188446, "rewards/margins": 0.7708918452262878, "rewards/rejected": -0.5771160125732422, "step": 3320 }, { "epoch": 0.86, "learning_rate": 7.379722750747485e-08, "logits/chosen": -4.996349811553955, "logits/rejected": -4.909236431121826, "logps/chosen": -618.8716430664062, "logps/rejected": -456.98492431640625, "loss": 0.6004, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08843658864498138, "rewards/margins": 0.5962534546852112, "rewards/rejected": -0.5078169107437134, "step": 3330 }, { "epoch": 0.86, "learning_rate": 7.243816254416962e-08, "logits/chosen": -4.896212100982666, "logits/rejected": -4.641386985778809, "logps/chosen": -608.5612182617188, "logps/rejected": -474.12158203125, "loss": 0.6051, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.12576648592948914, "rewards/margins": 0.5437217950820923, "rewards/rejected": -0.41795530915260315, "step": 3340 }, { "epoch": 0.86, "learning_rate": 7.107909758086436e-08, "logits/chosen": -5.272801399230957, "logits/rejected": -4.8430891036987305, "logps/chosen": -544.3641357421875, "logps/rejected": -426.4873046875, "loss": 0.5574, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.015302670188248158, "rewards/margins": 0.5599262714385986, "rewards/rejected": -0.5446235537528992, "step": 3350 }, { "epoch": 0.87, "learning_rate": 6.972003261755911e-08, "logits/chosen": -4.518991947174072, "logits/rejected": -4.936086177825928, "logps/chosen": -601.0823364257812, "logps/rejected": -546.61376953125, "loss": 0.633, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0004491690488066524, "rewards/margins": 0.4196210503578186, "rewards/rejected": -0.42007017135620117, "step": 3360 }, { "epoch": 0.87, "learning_rate": 6.836096765425386e-08, "logits/chosen": -5.375284194946289, "logits/rejected": -5.087422847747803, "logps/chosen": -652.1438598632812, "logps/rejected": -542.370849609375, "loss": 0.5568, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1580512672662735, "rewards/margins": 0.5438879132270813, "rewards/rejected": -0.3858366310596466, "step": 3370 }, { "epoch": 0.87, "learning_rate": 6.700190269094863e-08, "logits/chosen": -5.468593120574951, "logits/rejected": -4.970207214355469, "logps/chosen": -604.3890380859375, "logps/rejected": -451.9310607910156, "loss": 0.5345, "rewards/accuracies": 0.6875, "rewards/chosen": 0.19964425265789032, "rewards/margins": 0.6419768929481506, "rewards/rejected": -0.44233259558677673, "step": 3380 }, { "epoch": 0.88, "learning_rate": 6.564283772764338e-08, "logits/chosen": -5.108706474304199, "logits/rejected": -4.714381217956543, "logps/chosen": -603.6357421875, "logps/rejected": -479.19287109375, "loss": 0.6102, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11323221772909164, "rewards/margins": 0.3441081643104553, "rewards/rejected": -0.4573403298854828, "step": 3390 }, { "epoch": 0.88, "learning_rate": 6.428377276433813e-08, "logits/chosen": -5.2289934158325195, "logits/rejected": -4.821419715881348, "logps/chosen": -629.8280029296875, "logps/rejected": -439.3097229003906, "loss": 0.5733, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.13448044657707214, "rewards/margins": 0.5138882994651794, "rewards/rejected": -0.3794078826904297, "step": 3400 }, { "epoch": 0.88, "learning_rate": 6.292470780103289e-08, "logits/chosen": -5.200686931610107, "logits/rejected": -4.433293342590332, "logps/chosen": -628.9244995117188, "logps/rejected": -463.2845153808594, "loss": 0.5512, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22049829363822937, "rewards/margins": 0.5850414633750916, "rewards/rejected": -0.36454319953918457, "step": 3410 }, { "epoch": 0.88, "learning_rate": 6.156564283772764e-08, "logits/chosen": -5.299391269683838, "logits/rejected": -4.578976631164551, "logps/chosen": -617.935302734375, "logps/rejected": -455.72149658203125, "loss": 0.5461, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.08323828130960464, "rewards/margins": 0.7242950201034546, "rewards/rejected": -0.6410566568374634, "step": 3420 }, { "epoch": 0.89, "learning_rate": 6.02065778744224e-08, "logits/chosen": -5.269377708435059, "logits/rejected": -5.062786102294922, "logps/chosen": -640.0068359375, "logps/rejected": -480.91180419921875, "loss": 0.546, "rewards/accuracies": 0.75, "rewards/chosen": 0.1582534909248352, "rewards/margins": 0.6524442434310913, "rewards/rejected": -0.4941907823085785, "step": 3430 }, { "epoch": 0.89, "learning_rate": 5.8847512911117146e-08, "logits/chosen": -4.990743160247803, "logits/rejected": -5.302047252655029, "logps/chosen": -524.5138549804688, "logps/rejected": -541.4100341796875, "loss": 0.6432, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.05930762737989426, "rewards/margins": 0.3244546949863434, "rewards/rejected": -0.2651470899581909, "step": 3440 }, { "epoch": 0.89, "learning_rate": 5.74884479478119e-08, "logits/chosen": -5.07647705078125, "logits/rejected": -4.627741813659668, "logps/chosen": -591.9494018554688, "logps/rejected": -472.5774841308594, "loss": 0.5645, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.055071644484996796, "rewards/margins": 0.49125391244888306, "rewards/rejected": -0.43618226051330566, "step": 3450 }, { "epoch": 0.89, "learning_rate": 5.612938298450666e-08, "logits/chosen": -4.96254301071167, "logits/rejected": -4.850637912750244, "logps/chosen": -689.391357421875, "logps/rejected": -507.0537109375, "loss": 0.5609, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.1781730204820633, "rewards/margins": 0.5519202947616577, "rewards/rejected": -0.3737472593784332, "step": 3460 }, { "epoch": 0.9, "learning_rate": 5.477031802120141e-08, "logits/chosen": -5.282811164855957, "logits/rejected": -4.793059825897217, "logps/chosen": -681.777099609375, "logps/rejected": -499.6626892089844, "loss": 0.5871, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20708510279655457, "rewards/margins": 0.5851645469665527, "rewards/rejected": -0.378079354763031, "step": 3470 }, { "epoch": 0.9, "learning_rate": 5.341125305789616e-08, "logits/chosen": -5.204301834106445, "logits/rejected": -5.192577838897705, "logps/chosen": -614.7431030273438, "logps/rejected": -475.15447998046875, "loss": 0.5711, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.06923681497573853, "rewards/margins": 0.6609135866165161, "rewards/rejected": -0.5916768312454224, "step": 3480 }, { "epoch": 0.9, "learning_rate": 5.2052188094590924e-08, "logits/chosen": -5.410575866699219, "logits/rejected": -4.7271928787231445, "logps/chosen": -562.2875366210938, "logps/rejected": -470.6141662597656, "loss": 0.574, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.04732062295079231, "rewards/margins": 0.45917314291000366, "rewards/rejected": -0.41185253858566284, "step": 3490 }, { "epoch": 0.9, "learning_rate": 5.069312313128567e-08, "logits/chosen": -4.909255027770996, "logits/rejected": -5.011034965515137, "logps/chosen": -578.5535888671875, "logps/rejected": -488.4493713378906, "loss": 0.5549, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09110667556524277, "rewards/margins": 0.5210335850715637, "rewards/rejected": -0.4299268126487732, "step": 3500 }, { "epoch": 0.91, "learning_rate": 4.9334058167980426e-08, "logits/chosen": -5.587698936462402, "logits/rejected": -5.1530914306640625, "logps/chosen": -583.8742065429688, "logps/rejected": -433.79083251953125, "loss": 0.5733, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.08238669484853745, "rewards/margins": 0.5551365613937378, "rewards/rejected": -0.4727499485015869, "step": 3510 }, { "epoch": 0.91, "learning_rate": 4.797499320467518e-08, "logits/chosen": -5.150055885314941, "logits/rejected": -4.939459800720215, "logps/chosen": -628.3910522460938, "logps/rejected": -505.53125, "loss": 0.5523, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.09437895566225052, "rewards/margins": 0.5040851831436157, "rewards/rejected": -0.409706175327301, "step": 3520 }, { "epoch": 0.91, "learning_rate": 4.6615928241369935e-08, "logits/chosen": -5.329426288604736, "logits/rejected": -5.03844690322876, "logps/chosen": -593.9847412109375, "logps/rejected": -454.750244140625, "loss": 0.5544, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11871786415576935, "rewards/margins": 0.545038640499115, "rewards/rejected": -0.426320880651474, "step": 3530 }, { "epoch": 0.91, "learning_rate": 4.525686327806469e-08, "logits/chosen": -5.202639579772949, "logits/rejected": -4.666885852813721, "logps/chosen": -557.8953857421875, "logps/rejected": -449.791015625, "loss": 0.5633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08986867219209671, "rewards/margins": 0.5092368125915527, "rewards/rejected": -0.41936811804771423, "step": 3540 }, { "epoch": 0.92, "learning_rate": 4.389779831475944e-08, "logits/chosen": -5.34531307220459, "logits/rejected": -4.846031188964844, "logps/chosen": -595.12841796875, "logps/rejected": -509.6744079589844, "loss": 0.5969, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.08132925629615784, "rewards/margins": 0.3766716420650482, "rewards/rejected": -0.2953423857688904, "step": 3550 }, { "epoch": 0.92, "learning_rate": 4.25387333514542e-08, "logits/chosen": -5.138333320617676, "logits/rejected": -4.746251106262207, "logps/chosen": -515.9407958984375, "logps/rejected": -437.38018798828125, "loss": 0.5986, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0017356962198391557, "rewards/margins": 0.5237280130386353, "rewards/rejected": -0.5254637002944946, "step": 3560 }, { "epoch": 0.92, "learning_rate": 4.117966838814895e-08, "logits/chosen": -5.134562969207764, "logits/rejected": -4.96474552154541, "logps/chosen": -606.036376953125, "logps/rejected": -466.07867431640625, "loss": 0.5611, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08818952739238739, "rewards/margins": 0.5076228380203247, "rewards/rejected": -0.4194332957267761, "step": 3570 }, { "epoch": 0.92, "learning_rate": 3.98206034248437e-08, "logits/chosen": -5.315756320953369, "logits/rejected": -4.949517250061035, "logps/chosen": -636.086181640625, "logps/rejected": -445.82904052734375, "loss": 0.569, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.20392675697803497, "rewards/margins": 0.6199758052825928, "rewards/rejected": -0.416049063205719, "step": 3580 }, { "epoch": 0.93, "learning_rate": 3.846153846153846e-08, "logits/chosen": -5.101180076599121, "logits/rejected": -4.905373573303223, "logps/chosen": -536.1978759765625, "logps/rejected": -417.8277282714844, "loss": 0.5405, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.018841056153178215, "rewards/margins": 0.4977056086063385, "rewards/rejected": -0.5165466070175171, "step": 3590 }, { "epoch": 0.93, "learning_rate": 3.7102473498233216e-08, "logits/chosen": -5.034720420837402, "logits/rejected": -4.62627649307251, "logps/chosen": -499.31463623046875, "logps/rejected": -425.21112060546875, "loss": 0.5939, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.08160565793514252, "rewards/margins": 0.32524237036705017, "rewards/rejected": -0.4068480134010315, "step": 3600 }, { "epoch": 0.93, "learning_rate": 3.5743408534927963e-08, "logits/chosen": -5.122437000274658, "logits/rejected": -5.20498514175415, "logps/chosen": -485.17962646484375, "logps/rejected": -416.1468811035156, "loss": 0.5746, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.07851817458868027, "rewards/margins": 0.5143343210220337, "rewards/rejected": -0.4358161389827728, "step": 3610 }, { "epoch": 0.93, "learning_rate": 3.4384343571622724e-08, "logits/chosen": -5.532504081726074, "logits/rejected": -5.148187160491943, "logps/chosen": -519.8585815429688, "logps/rejected": -442.80352783203125, "loss": 0.6044, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.018877673894166946, "rewards/margins": 0.41989025473594666, "rewards/rejected": -0.4010125696659088, "step": 3620 }, { "epoch": 0.94, "learning_rate": 3.302527860831748e-08, "logits/chosen": -5.246006488800049, "logits/rejected": -4.628513336181641, "logps/chosen": -557.8768310546875, "logps/rejected": -396.80230712890625, "loss": 0.5759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0534333661198616, "rewards/margins": 0.49642905592918396, "rewards/rejected": -0.44299569725990295, "step": 3630 }, { "epoch": 0.94, "learning_rate": 3.1666213645012227e-08, "logits/chosen": -5.454456806182861, "logits/rejected": -4.73259973526001, "logps/chosen": -597.5782470703125, "logps/rejected": -388.52789306640625, "loss": 0.5745, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.09610459953546524, "rewards/margins": 0.6527958512306213, "rewards/rejected": -0.5566911697387695, "step": 3640 }, { "epoch": 0.94, "learning_rate": 3.030714868170698e-08, "logits/chosen": -5.269309043884277, "logits/rejected": -4.4948835372924805, "logps/chosen": -625.8494873046875, "logps/rejected": -416.91815185546875, "loss": 0.5644, "rewards/accuracies": 0.75, "rewards/chosen": 0.13563071191310883, "rewards/margins": 0.5831824541091919, "rewards/rejected": -0.44755178689956665, "step": 3650 }, { "epoch": 0.95, "learning_rate": 2.894808371840174e-08, "logits/chosen": -4.872903347015381, "logits/rejected": -5.047135353088379, "logps/chosen": -538.2395629882812, "logps/rejected": -488.65966796875, "loss": 0.5254, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1966354250907898, "rewards/margins": 0.6410555839538574, "rewards/rejected": -0.4444200396537781, "step": 3660 }, { "epoch": 0.95, "learning_rate": 2.758901875509649e-08, "logits/chosen": -5.2469682693481445, "logits/rejected": -4.6465582847595215, "logps/chosen": -665.9546508789062, "logps/rejected": -488.3631286621094, "loss": 0.6068, "rewards/accuracies": 0.6875, "rewards/chosen": 0.06325142085552216, "rewards/margins": 0.4522647261619568, "rewards/rejected": -0.3890133202075958, "step": 3670 }, { "epoch": 0.95, "learning_rate": 2.6229953791791247e-08, "logits/chosen": -5.30975341796875, "logits/rejected": -5.214649677276611, "logps/chosen": -625.4133911132812, "logps/rejected": -527.6849365234375, "loss": 0.5544, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.031594168394804, "rewards/margins": 0.4863681197166443, "rewards/rejected": -0.4547739028930664, "step": 3680 }, { "epoch": 0.95, "learning_rate": 2.4870888828486002e-08, "logits/chosen": -5.154933929443359, "logits/rejected": -4.615642547607422, "logps/chosen": -566.6359252929688, "logps/rejected": -367.35272216796875, "loss": 0.5399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.25163334608078003, "rewards/margins": 0.7026754021644592, "rewards/rejected": -0.4510420262813568, "step": 3690 }, { "epoch": 0.96, "learning_rate": 2.3511823865180753e-08, "logits/chosen": -4.935416221618652, "logits/rejected": -4.931743621826172, "logps/chosen": -531.6405029296875, "logps/rejected": -465.2660217285156, "loss": 0.5801, "rewards/accuracies": 0.75, "rewards/chosen": 0.0852300152182579, "rewards/margins": 0.6420767903327942, "rewards/rejected": -0.5568467378616333, "step": 3700 }, { "epoch": 0.96, "learning_rate": 2.215275890187551e-08, "logits/chosen": -5.1003828048706055, "logits/rejected": -4.942370891571045, "logps/chosen": -572.6801147460938, "logps/rejected": -499.9962463378906, "loss": 0.5402, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14501488208770752, "rewards/margins": 0.6089879274368286, "rewards/rejected": -0.46397310495376587, "step": 3710 }, { "epoch": 0.96, "learning_rate": 2.0793693938570265e-08, "logits/chosen": -5.139072895050049, "logits/rejected": -4.563143253326416, "logps/chosen": -524.9754028320312, "logps/rejected": -419.37274169921875, "loss": 0.5539, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.030956823378801346, "rewards/margins": 0.5671139359474182, "rewards/rejected": -0.5980707406997681, "step": 3720 }, { "epoch": 0.96, "learning_rate": 1.9434628975265016e-08, "logits/chosen": -5.675312042236328, "logits/rejected": -4.652360439300537, "logps/chosen": -580.7689208984375, "logps/rejected": -404.6139221191406, "loss": 0.5862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.012955671176314354, "rewards/margins": 0.391316294670105, "rewards/rejected": -0.4042719900608063, "step": 3730 }, { "epoch": 0.97, "learning_rate": 1.807556401195977e-08, "logits/chosen": -5.271977424621582, "logits/rejected": -4.812222480773926, "logps/chosen": -672.85986328125, "logps/rejected": -450.5704040527344, "loss": 0.5576, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11036734282970428, "rewards/margins": 0.5794622898101807, "rewards/rejected": -0.4690949320793152, "step": 3740 }, { "epoch": 0.97, "learning_rate": 1.6716499048654525e-08, "logits/chosen": -5.003756523132324, "logits/rejected": -4.732114315032959, "logps/chosen": -636.2298583984375, "logps/rejected": -450.95172119140625, "loss": 0.5784, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.03545001894235611, "rewards/margins": 0.4769270420074463, "rewards/rejected": -0.44147706031799316, "step": 3750 }, { "epoch": 0.97, "learning_rate": 1.535743408534928e-08, "logits/chosen": -4.955419063568115, "logits/rejected": -5.042864799499512, "logps/chosen": -597.6549072265625, "logps/rejected": -529.520263671875, "loss": 0.584, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.06296499073505402, "rewards/margins": 0.43164101243019104, "rewards/rejected": -0.36867600679397583, "step": 3760 }, { "epoch": 0.97, "learning_rate": 1.3998369122044032e-08, "logits/chosen": -4.982466697692871, "logits/rejected": -5.189105987548828, "logps/chosen": -566.3472290039062, "logps/rejected": -493.54974365234375, "loss": 0.5668, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.027576932683587074, "rewards/margins": 0.46042051911354065, "rewards/rejected": -0.4328436255455017, "step": 3770 }, { "epoch": 0.98, "learning_rate": 1.2639304158738788e-08, "logits/chosen": -5.176326274871826, "logits/rejected": -4.8516364097595215, "logps/chosen": -625.2818603515625, "logps/rejected": -434.9867248535156, "loss": 0.5787, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22398793697357178, "rewards/margins": 0.6402627229690552, "rewards/rejected": -0.41627463698387146, "step": 3780 }, { "epoch": 0.98, "learning_rate": 1.1280239195433542e-08, "logits/chosen": -4.825104236602783, "logits/rejected": -4.390969276428223, "logps/chosen": -642.5945434570312, "logps/rejected": -465.6443786621094, "loss": 0.5685, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14697226881980896, "rewards/margins": 0.5980256199836731, "rewards/rejected": -0.4510533809661865, "step": 3790 }, { "epoch": 0.98, "learning_rate": 9.921174232128295e-09, "logits/chosen": -5.257870674133301, "logits/rejected": -4.686526298522949, "logps/chosen": -617.8099365234375, "logps/rejected": -431.343017578125, "loss": 0.53, "rewards/accuracies": 0.75, "rewards/chosen": 0.1597820371389389, "rewards/margins": 0.6843008995056152, "rewards/rejected": -0.5245188474655151, "step": 3800 }, { "epoch": 0.98, "learning_rate": 8.562109268823049e-09, "logits/chosen": -5.258317470550537, "logits/rejected": -5.140551567077637, "logps/chosen": -546.1046752929688, "logps/rejected": -436.4046936035156, "loss": 0.5762, "rewards/accuracies": 0.625, "rewards/chosen": -0.011624747887253761, "rewards/margins": 0.36088308691978455, "rewards/rejected": -0.37250787019729614, "step": 3810 }, { "epoch": 0.99, "learning_rate": 7.203044305517803e-09, "logits/chosen": -4.875320911407471, "logits/rejected": -4.910046577453613, "logps/chosen": -613.1431884765625, "logps/rejected": -506.95098876953125, "loss": 0.5196, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1426512897014618, "rewards/margins": 0.5562176704406738, "rewards/rejected": -0.4135662913322449, "step": 3820 }, { "epoch": 0.99, "learning_rate": 5.843979342212558e-09, "logits/chosen": -4.8537468910217285, "logits/rejected": -4.501283645629883, "logps/chosen": -642.3465576171875, "logps/rejected": -467.0890197753906, "loss": 0.5845, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1156611293554306, "rewards/margins": 0.49272075295448303, "rewards/rejected": -0.37705960869789124, "step": 3830 }, { "epoch": 0.99, "learning_rate": 4.4849143789073114e-09, "logits/chosen": -5.545628070831299, "logits/rejected": -5.529524803161621, "logps/chosen": -538.4314575195312, "logps/rejected": -463.7015686035156, "loss": 0.553, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.012657539919018745, "rewards/margins": 0.49136465787887573, "rewards/rejected": -0.4787071645259857, "step": 3840 }, { "epoch": 0.99, "learning_rate": 3.1258494156020658e-09, "logits/chosen": -5.1411237716674805, "logits/rejected": -5.270503520965576, "logps/chosen": -614.623046875, "logps/rejected": -475.93975830078125, "loss": 0.5296, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.20060591399669647, "rewards/margins": 0.6371750831604004, "rewards/rejected": -0.4365692734718323, "step": 3850 }, { "epoch": 1.0, "learning_rate": 1.7667844522968197e-09, "logits/chosen": -5.237704753875732, "logits/rejected": -4.802920818328857, "logps/chosen": -578.4732666015625, "logps/rejected": -398.55804443359375, "loss": 0.5499, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.045068494975566864, "rewards/margins": 0.4510710835456848, "rewards/rejected": -0.4961395263671875, "step": 3860 }, { "epoch": 1.0, "learning_rate": 4.077194889915738e-10, "logits/chosen": -5.071371555328369, "logits/rejected": -4.778379917144775, "logps/chosen": -552.7120361328125, "logps/rejected": -491.9295349121094, "loss": 0.5576, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.027319246903061867, "rewards/margins": 0.4523466229438782, "rewards/rejected": -0.4796658456325531, "step": 3870 }, { "epoch": 1.0, "step": 3873, "total_flos": 0.0, "train_loss": 0.5843773977780509, "train_runtime": 4895.0541, "train_samples_per_second": 12.659, "train_steps_per_second": 0.791 } ], "logging_steps": 10, "max_steps": 3873, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }