diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4830 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3356, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.488095238095238e-09, + "logits/chosen": -2.6795692443847656, + "logits/rejected": -2.624149799346924, + "logps/chosen": -54.570396423339844, + "logps/rejected": -74.21392822265625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.4880952380952379e-08, + "logits/chosen": -2.7060725688934326, + "logits/rejected": -2.6765432357788086, + "logps/chosen": -95.24983978271484, + "logps/rejected": -91.18234252929688, + "loss": 0.6933, + "rewards/accuracies": 0.4166666567325592, + "rewards/chosen": 0.0005662046023644507, + "rewards/margins": -0.006994906347244978, + "rewards/rejected": 0.007561111822724342, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 2.9761904761904758e-08, + "logits/chosen": -2.5795836448669434, + "logits/rejected": -2.592409133911133, + "logps/chosen": -124.33586120605469, + "logps/rejected": -103.54573822021484, + "loss": 0.6947, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0065773227252066135, + "rewards/margins": -0.0029559016693383455, + "rewards/rejected": 0.009533221833407879, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 4.4642857142857145e-08, + "logits/chosen": -2.579939126968384, + "logits/rejected": -2.5497870445251465, + "logps/chosen": -68.13322448730469, + "logps/rejected": -66.37541961669922, + "loss": 0.6921, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.01673651486635208, + "rewards/margins": 0.00222357758320868, + "rewards/rejected": 0.01451293658465147, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 5.9523809523809515e-08, + "logits/chosen": -2.6564245223999023, + "logits/rejected": -2.608503818511963, + "logps/chosen": -83.7612533569336, + "logps/rejected": -79.3699951171875, + "loss": 0.6886, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.050556618720293045, + "rewards/margins": 0.005645673722028732, + "rewards/rejected": 0.04491094499826431, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 7.44047619047619e-08, + "logits/chosen": -2.752234935760498, + "logits/rejected": -2.6355555057525635, + "logps/chosen": -127.2625503540039, + "logps/rejected": -114.26876068115234, + "loss": 0.6892, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.09844812005758286, + "rewards/margins": 0.0011480912799015641, + "rewards/rejected": 0.09730003774166107, + "step": 50 + }, + { + "epoch": 0.02, + "learning_rate": 8.928571428571429e-08, + "logits/chosen": -2.669374704360962, + "logits/rejected": -2.652597188949585, + "logps/chosen": -103.32049560546875, + "logps/rejected": -105.29325103759766, + "loss": 0.6817, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.19618754088878632, + "rewards/margins": 0.012378268875181675, + "rewards/rejected": 0.18380926549434662, + "step": 60 + }, + { + "epoch": 0.02, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -2.7530548572540283, + "logits/rejected": -2.680541753768921, + "logps/chosen": -84.53085327148438, + "logps/rejected": -84.82635498046875, + "loss": 0.6819, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33352726697921753, + "rewards/margins": 0.020224859938025475, + "rewards/rejected": 0.3133023679256439, + "step": 70 + }, + { + "epoch": 0.02, + "learning_rate": 1.1904761904761903e-07, + "logits/chosen": -2.572601079940796, + "logits/rejected": -2.5415000915527344, + "logps/chosen": -96.4114761352539, + "logps/rejected": -84.30821228027344, + "loss": 0.6708, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4342936873435974, + "rewards/margins": 0.0613841637969017, + "rewards/rejected": 0.3729095458984375, + "step": 80 + }, + { + "epoch": 0.03, + "learning_rate": 1.3392857142857142e-07, + "logits/chosen": -2.7009196281433105, + "logits/rejected": -2.698122262954712, + "logps/chosen": -78.68132781982422, + "logps/rejected": -81.79669189453125, + "loss": 0.6546, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5974748730659485, + "rewards/margins": 0.08051940053701401, + "rewards/rejected": 0.5169554948806763, + "step": 90 + }, + { + "epoch": 0.03, + "learning_rate": 1.488095238095238e-07, + "logits/chosen": -2.5833797454833984, + "logits/rejected": -2.624276876449585, + "logps/chosen": -77.67559814453125, + "logps/rejected": -90.95040130615234, + "loss": 0.6601, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.5890167355537415, + "rewards/margins": 0.06459061056375504, + "rewards/rejected": 0.5244261026382446, + "step": 100 + }, + { + "epoch": 0.03, + "learning_rate": 1.6369047619047617e-07, + "logits/chosen": -2.5440800189971924, + "logits/rejected": -2.536761522293091, + "logps/chosen": -79.65280151367188, + "logps/rejected": -77.1148681640625, + "loss": 0.6643, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.48702484369277954, + "rewards/margins": 0.02558879181742668, + "rewards/rejected": 0.46143603324890137, + "step": 110 + }, + { + "epoch": 0.04, + "learning_rate": 1.7857142857142858e-07, + "logits/chosen": -2.59000301361084, + "logits/rejected": -2.6294052600860596, + "logps/chosen": -98.95535278320312, + "logps/rejected": -93.15876770019531, + "loss": 0.6528, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.44851523637771606, + "rewards/margins": 0.04791822284460068, + "rewards/rejected": 0.4005970060825348, + "step": 120 + }, + { + "epoch": 0.04, + "learning_rate": 1.9345238095238096e-07, + "logits/chosen": -2.5660836696624756, + "logits/rejected": -2.532435894012451, + "logps/chosen": -81.32213592529297, + "logps/rejected": -86.37200927734375, + "loss": 0.6286, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.6584704518318176, + "rewards/margins": 0.1672821044921875, + "rewards/rejected": 0.4911883771419525, + "step": 130 + }, + { + "epoch": 0.04, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -2.657209873199463, + "logits/rejected": -2.620845079421997, + "logps/chosen": -98.81898498535156, + "logps/rejected": -91.02754974365234, + "loss": 0.6596, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.8377985954284668, + "rewards/margins": 0.049154218286275864, + "rewards/rejected": 0.7886443138122559, + "step": 140 + }, + { + "epoch": 0.04, + "learning_rate": 2.232142857142857e-07, + "logits/chosen": -2.594756603240967, + "logits/rejected": -2.5098514556884766, + "logps/chosen": -108.9326171875, + "logps/rejected": -124.50955963134766, + "loss": 0.6063, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.6377179622650146, + "rewards/margins": 0.7904380559921265, + "rewards/rejected": -0.15272006392478943, + "step": 150 + }, + { + "epoch": 0.05, + "learning_rate": 2.3809523809523806e-07, + "logits/chosen": -2.5515310764312744, + "logits/rejected": -2.4522361755371094, + "logps/chosen": -90.93934631347656, + "logps/rejected": -106.53071594238281, + "loss": 0.6199, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.8264306783676147, + "rewards/margins": 0.5799387097358704, + "rewards/rejected": 0.246491938829422, + "step": 160 + }, + { + "epoch": 0.05, + "learning_rate": 2.5297619047619046e-07, + "logits/chosen": -2.511021137237549, + "logits/rejected": -2.5456349849700928, + "logps/chosen": -91.14982604980469, + "logps/rejected": -99.70429992675781, + "loss": 0.6079, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7046107649803162, + "rewards/margins": 0.36221450567245483, + "rewards/rejected": 0.34239625930786133, + "step": 170 + }, + { + "epoch": 0.05, + "learning_rate": 2.6785714285714284e-07, + "logits/chosen": -2.520282030105591, + "logits/rejected": -2.503950595855713, + "logps/chosen": -79.16224670410156, + "logps/rejected": -89.08283233642578, + "loss": 0.6324, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.8210590481758118, + "rewards/margins": 0.16991613805294037, + "rewards/rejected": 0.651142954826355, + "step": 180 + }, + { + "epoch": 0.06, + "learning_rate": 2.827380952380952e-07, + "logits/chosen": -2.6823697090148926, + "logits/rejected": -2.633678674697876, + "logps/chosen": -104.0126724243164, + "logps/rejected": -103.51971435546875, + "loss": 0.5904, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 1.1408202648162842, + "rewards/margins": 0.2863886058330536, + "rewards/rejected": 0.8544318079948425, + "step": 190 + }, + { + "epoch": 0.06, + "learning_rate": 2.976190476190476e-07, + "logits/chosen": -2.530428409576416, + "logits/rejected": -2.50227689743042, + "logps/chosen": -100.63572692871094, + "logps/rejected": -94.46806335449219, + "loss": 0.6018, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.7896903157234192, + "rewards/margins": 0.45959681272506714, + "rewards/rejected": 0.33009350299835205, + "step": 200 + }, + { + "epoch": 0.06, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -2.4940271377563477, + "logits/rejected": -2.5085806846618652, + "logps/chosen": -92.1917724609375, + "logps/rejected": -107.3184585571289, + "loss": 0.5868, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.7482628226280212, + "rewards/margins": 0.49973025918006897, + "rewards/rejected": 0.24853253364562988, + "step": 210 + }, + { + "epoch": 0.07, + "learning_rate": 3.2738095238095235e-07, + "logits/chosen": -2.5470972061157227, + "logits/rejected": -2.5241191387176514, + "logps/chosen": -113.54488372802734, + "logps/rejected": -129.91867065429688, + "loss": 0.5871, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.4822530746459961, + "rewards/margins": 0.5863619446754456, + "rewards/rejected": -0.10410883277654648, + "step": 220 + }, + { + "epoch": 0.07, + "learning_rate": 3.4226190476190473e-07, + "logits/chosen": -2.5854454040527344, + "logits/rejected": -2.427126169204712, + "logps/chosen": -95.35980987548828, + "logps/rejected": -81.82037353515625, + "loss": 0.6183, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8959482908248901, + "rewards/margins": 0.8998041152954102, + "rewards/rejected": -0.0038558482192456722, + "step": 230 + }, + { + "epoch": 0.07, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": -2.5749735832214355, + "logits/rejected": -2.58799409866333, + "logps/chosen": -76.01658630371094, + "logps/rejected": -77.50577545166016, + "loss": 0.6595, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28254395723342896, + "rewards/margins": 0.4179397523403168, + "rewards/rejected": -0.13539579510688782, + "step": 240 + }, + { + "epoch": 0.07, + "learning_rate": 3.7202380952380953e-07, + "logits/chosen": -2.655733823776245, + "logits/rejected": -2.6001226902008057, + "logps/chosen": -112.2961654663086, + "logps/rejected": -124.30081939697266, + "loss": 0.5967, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 1.081606388092041, + "rewards/margins": 0.3873857855796814, + "rewards/rejected": 0.6942206025123596, + "step": 250 + }, + { + "epoch": 0.08, + "learning_rate": 3.869047619047619e-07, + "logits/chosen": -2.3797781467437744, + "logits/rejected": -2.3257176876068115, + "logps/chosen": -100.49422454833984, + "logps/rejected": -116.31571197509766, + "loss": 0.5687, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.26938995718955994, + "rewards/margins": 0.5422745943069458, + "rewards/rejected": -0.27288463711738586, + "step": 260 + }, + { + "epoch": 0.08, + "learning_rate": 4.017857142857143e-07, + "logits/chosen": -2.506838321685791, + "logits/rejected": -2.5618858337402344, + "logps/chosen": -103.68598937988281, + "logps/rejected": -116.80242919921875, + "loss": 0.6466, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.9570896029472351, + "rewards/margins": 0.3445149064064026, + "rewards/rejected": 0.6125746965408325, + "step": 270 + }, + { + "epoch": 0.08, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -2.5634925365448, + "logits/rejected": -2.520244836807251, + "logps/chosen": -102.6960678100586, + "logps/rejected": -90.80632019042969, + "loss": 0.5996, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.0112148523330688, + "rewards/margins": 0.3889988362789154, + "rewards/rejected": 0.6222161054611206, + "step": 280 + }, + { + "epoch": 0.09, + "learning_rate": 4.3154761904761904e-07, + "logits/chosen": -2.569206714630127, + "logits/rejected": -2.5652623176574707, + "logps/chosen": -85.24828338623047, + "logps/rejected": -93.45872497558594, + "loss": 0.5347, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.3659771978855133, + "rewards/margins": 0.7742798924446106, + "rewards/rejected": -0.4083026945590973, + "step": 290 + }, + { + "epoch": 0.09, + "learning_rate": 4.464285714285714e-07, + "logits/chosen": -2.363185167312622, + "logits/rejected": -2.371516227722168, + "logps/chosen": -99.2336654663086, + "logps/rejected": -92.32693481445312, + "loss": 0.5878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7069708704948425, + "rewards/margins": 0.8502944111824036, + "rewards/rejected": -0.1433234065771103, + "step": 300 + }, + { + "epoch": 0.09, + "learning_rate": 4.613095238095238e-07, + "logits/chosen": -2.412259578704834, + "logits/rejected": -2.4086456298828125, + "logps/chosen": -96.43733978271484, + "logps/rejected": -120.0870590209961, + "loss": 0.5642, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.19554999470710754, + "rewards/margins": 1.321014404296875, + "rewards/rejected": -1.1254642009735107, + "step": 310 + }, + { + "epoch": 0.1, + "learning_rate": 4.761904761904761e-07, + "logits/chosen": -2.6165080070495605, + "logits/rejected": -2.6191306114196777, + "logps/chosen": -117.46064758300781, + "logps/rejected": -122.75732421875, + "loss": 0.5508, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7428444623947144, + "rewards/margins": 0.3669503331184387, + "rewards/rejected": -1.1097948551177979, + "step": 320 + }, + { + "epoch": 0.1, + "learning_rate": 4.910714285714285e-07, + "logits/chosen": -2.493110179901123, + "logits/rejected": -2.4452643394470215, + "logps/chosen": -91.34004211425781, + "logps/rejected": -103.17684173583984, + "loss": 0.5986, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6157582402229309, + "rewards/margins": 0.7595478892326355, + "rewards/rejected": -1.375306248664856, + "step": 330 + }, + { + "epoch": 0.1, + "learning_rate": 4.993377483443708e-07, + "logits/chosen": -2.547645092010498, + "logits/rejected": -2.4399895668029785, + "logps/chosen": -106.4365005493164, + "logps/rejected": -109.07222747802734, + "loss": 0.5639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.36611366271972656, + "rewards/margins": 0.6897183060646057, + "rewards/rejected": -1.0558319091796875, + "step": 340 + }, + { + "epoch": 0.1, + "learning_rate": 4.97682119205298e-07, + "logits/chosen": -2.5453834533691406, + "logits/rejected": -2.5119881629943848, + "logps/chosen": -108.45722961425781, + "logps/rejected": -105.61241149902344, + "loss": 0.5994, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21297264099121094, + "rewards/margins": 0.35165560245513916, + "rewards/rejected": -0.13868291676044464, + "step": 350 + }, + { + "epoch": 0.11, + "learning_rate": 4.960264900662251e-07, + "logits/chosen": -2.568861484527588, + "logits/rejected": -2.552140712738037, + "logps/chosen": -99.7040786743164, + "logps/rejected": -109.383544921875, + "loss": 0.5401, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2864856421947479, + "rewards/margins": 0.8699267506599426, + "rewards/rejected": -1.1564123630523682, + "step": 360 + }, + { + "epoch": 0.11, + "learning_rate": 4.943708609271523e-07, + "logits/chosen": -2.584989070892334, + "logits/rejected": -2.524940013885498, + "logps/chosen": -116.22591400146484, + "logps/rejected": -132.27352905273438, + "loss": 0.5816, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3508208990097046, + "rewards/margins": 0.757738471031189, + "rewards/rejected": -1.1085593700408936, + "step": 370 + }, + { + "epoch": 0.11, + "learning_rate": 4.927152317880794e-07, + "logits/chosen": -2.5064499378204346, + "logits/rejected": -2.520719528198242, + "logps/chosen": -105.9725570678711, + "logps/rejected": -106.05126953125, + "loss": 0.6476, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.491422414779663, + "rewards/margins": 0.45032089948654175, + "rewards/rejected": -1.9417431354522705, + "step": 380 + }, + { + "epoch": 0.12, + "learning_rate": 4.910596026490066e-07, + "logits/chosen": -2.4913430213928223, + "logits/rejected": -2.5125203132629395, + "logps/chosen": -124.0137710571289, + "logps/rejected": -119.0078353881836, + "loss": 0.6202, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6193113327026367, + "rewards/margins": 0.656644880771637, + "rewards/rejected": -2.275956392288208, + "step": 390 + }, + { + "epoch": 0.12, + "learning_rate": 4.894039735099338e-07, + "logits/chosen": -2.5196266174316406, + "logits/rejected": -2.492640256881714, + "logps/chosen": -108.40077209472656, + "logps/rejected": -106.96036529541016, + "loss": 0.5793, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8650729060173035, + "rewards/margins": 0.4100722372531891, + "rewards/rejected": -1.275145173072815, + "step": 400 + }, + { + "epoch": 0.12, + "learning_rate": 4.877483443708609e-07, + "logits/chosen": -2.3113367557525635, + "logits/rejected": -2.363025426864624, + "logps/chosen": -108.32320404052734, + "logps/rejected": -96.14768981933594, + "loss": 1.0008, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -2.168811559677124, + "rewards/margins": -1.5380103588104248, + "rewards/rejected": -0.6308012008666992, + "step": 410 + }, + { + "epoch": 0.13, + "learning_rate": 4.860927152317881e-07, + "logits/chosen": -2.2521350383758545, + "logits/rejected": -2.2686538696289062, + "logps/chosen": -78.05595397949219, + "logps/rejected": -93.2776107788086, + "loss": 0.5595, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5212607979774475, + "rewards/margins": 0.686114490032196, + "rewards/rejected": -1.207375407218933, + "step": 420 + }, + { + "epoch": 0.13, + "learning_rate": 4.844370860927152e-07, + "logits/chosen": -2.2812628746032715, + "logits/rejected": -2.29258394241333, + "logps/chosen": -128.2143096923828, + "logps/rejected": -135.92117309570312, + "loss": 0.5525, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -2.4639101028442383, + "rewards/margins": 1.0309460163116455, + "rewards/rejected": -3.494856595993042, + "step": 430 + }, + { + "epoch": 0.13, + "learning_rate": 4.827814569536423e-07, + "logits/chosen": -2.3497612476348877, + "logits/rejected": -2.259904384613037, + "logps/chosen": -126.2747802734375, + "logps/rejected": -132.0948944091797, + "loss": 0.5087, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5596282482147217, + "rewards/margins": 1.1223429441452026, + "rewards/rejected": -2.681971549987793, + "step": 440 + }, + { + "epoch": 0.13, + "learning_rate": 4.811258278145695e-07, + "logits/chosen": -2.3696093559265137, + "logits/rejected": -2.355694055557251, + "logps/chosen": -113.27628326416016, + "logps/rejected": -120.5525131225586, + "loss": 0.5239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5899262428283691, + "rewards/margins": 0.7907289266586304, + "rewards/rejected": -1.380655288696289, + "step": 450 + }, + { + "epoch": 0.14, + "learning_rate": 4.794701986754966e-07, + "logits/chosen": -2.4090988636016846, + "logits/rejected": -2.4314303398132324, + "logps/chosen": -119.7711410522461, + "logps/rejected": -138.52122497558594, + "loss": 0.6907, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2273411750793457, + "rewards/margins": 1.0350992679595947, + "rewards/rejected": -2.2624402046203613, + "step": 460 + }, + { + "epoch": 0.14, + "learning_rate": 4.778145695364238e-07, + "logits/chosen": -2.414658546447754, + "logits/rejected": -2.4013447761535645, + "logps/chosen": -101.0434799194336, + "logps/rejected": -102.90351867675781, + "loss": 0.5651, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9459658861160278, + "rewards/margins": 0.6103629469871521, + "rewards/rejected": -1.5563287734985352, + "step": 470 + }, + { + "epoch": 0.14, + "learning_rate": 4.76158940397351e-07, + "logits/chosen": -2.42374849319458, + "logits/rejected": -2.4381699562072754, + "logps/chosen": -113.9575424194336, + "logps/rejected": -121.03520202636719, + "loss": 0.5268, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7829158902168274, + "rewards/margins": 1.2523859739303589, + "rewards/rejected": -2.035301923751831, + "step": 480 + }, + { + "epoch": 0.15, + "learning_rate": 4.7450331125827815e-07, + "logits/chosen": -2.4486849308013916, + "logits/rejected": -2.4538803100585938, + "logps/chosen": -97.44860076904297, + "logps/rejected": -100.29484558105469, + "loss": 0.5659, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2252352237701416, + "rewards/margins": 0.4965124726295471, + "rewards/rejected": -0.7217476963996887, + "step": 490 + }, + { + "epoch": 0.15, + "learning_rate": 4.728476821192053e-07, + "logits/chosen": -2.4106860160827637, + "logits/rejected": -2.477334499359131, + "logps/chosen": -87.63328552246094, + "logps/rejected": -96.80977630615234, + "loss": 0.5598, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022742483764886856, + "rewards/margins": 0.6748077273368835, + "rewards/rejected": -0.6520652174949646, + "step": 500 + }, + { + "epoch": 0.15, + "eval_logits/chosen": -2.339754343032837, + "eval_logits/rejected": -2.2989299297332764, + "eval_logps/chosen": -104.51243591308594, + "eval_logps/rejected": -112.7801513671875, + "eval_loss": 0.6348409652709961, + "eval_rewards/accuracies": 0.7120535969734192, + "eval_rewards/chosen": -0.46458080410957336, + "eval_rewards/margins": 1.1086541414260864, + "eval_rewards/rejected": -1.5732349157333374, + "eval_runtime": 528.3305, + "eval_samples_per_second": 3.38, + "eval_steps_per_second": 0.106, + "step": 500 + }, + { + "epoch": 0.15, + "learning_rate": 4.7119205298013243e-07, + "logits/chosen": -2.3279285430908203, + "logits/rejected": -2.2698190212249756, + "logps/chosen": -91.65203094482422, + "logps/rejected": -111.75373840332031, + "loss": 0.5584, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5981258153915405, + "rewards/margins": 1.0291321277618408, + "rewards/rejected": -1.6272579431533813, + "step": 510 + }, + { + "epoch": 0.15, + "learning_rate": 4.6953642384105957e-07, + "logits/chosen": -2.432509183883667, + "logits/rejected": -2.4649786949157715, + "logps/chosen": -113.98470306396484, + "logps/rejected": -131.01609802246094, + "loss": 0.5481, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3087894320487976, + "rewards/margins": 1.156360387802124, + "rewards/rejected": -1.4651498794555664, + "step": 520 + }, + { + "epoch": 0.16, + "learning_rate": 4.678807947019867e-07, + "logits/chosen": -2.4623587131500244, + "logits/rejected": -2.406970500946045, + "logps/chosen": -111.83284759521484, + "logps/rejected": -117.27791595458984, + "loss": 0.5231, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3450089991092682, + "rewards/margins": 1.3268024921417236, + "rewards/rejected": -1.6718114614486694, + "step": 530 + }, + { + "epoch": 0.16, + "learning_rate": 4.662251655629139e-07, + "logits/chosen": -2.434732675552368, + "logits/rejected": -2.482849597930908, + "logps/chosen": -82.09310150146484, + "logps/rejected": -113.52314758300781, + "loss": 0.5046, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.10573047399520874, + "rewards/margins": 1.0775012969970703, + "rewards/rejected": -1.1832319498062134, + "step": 540 + }, + { + "epoch": 0.16, + "learning_rate": 4.6456953642384104e-07, + "logits/chosen": -2.495922565460205, + "logits/rejected": -2.432220935821533, + "logps/chosen": -123.1400375366211, + "logps/rejected": -111.23506164550781, + "loss": 1.2805, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4564870297908783, + "rewards/margins": 0.5551273226737976, + "rewards/rejected": -1.0116143226623535, + "step": 550 + }, + { + "epoch": 0.17, + "learning_rate": 4.629139072847682e-07, + "logits/chosen": -2.3615145683288574, + "logits/rejected": -2.3742241859436035, + "logps/chosen": -128.84971618652344, + "logps/rejected": -140.29312133789062, + "loss": 1.2493, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.166237235069275, + "rewards/margins": 1.4496667385101318, + "rewards/rejected": -2.615903854370117, + "step": 560 + }, + { + "epoch": 0.17, + "learning_rate": 4.612582781456953e-07, + "logits/chosen": -2.471628189086914, + "logits/rejected": -2.407003164291382, + "logps/chosen": -106.4498291015625, + "logps/rejected": -119.580078125, + "loss": 0.4833, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.251348614692688, + "rewards/margins": 1.0506912469863892, + "rewards/rejected": -2.302039623260498, + "step": 570 + }, + { + "epoch": 0.17, + "learning_rate": 4.596026490066225e-07, + "logits/chosen": -2.3577880859375, + "logits/rejected": -2.3710594177246094, + "logps/chosen": -109.6436996459961, + "logps/rejected": -111.36781311035156, + "loss": 0.6501, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1875368356704712, + "rewards/margins": 0.8005573153495789, + "rewards/rejected": -1.9880939722061157, + "step": 580 + }, + { + "epoch": 0.18, + "learning_rate": 4.5794701986754965e-07, + "logits/chosen": -2.3025927543640137, + "logits/rejected": -2.412416934967041, + "logps/chosen": -92.57754516601562, + "logps/rejected": -125.9276123046875, + "loss": 0.6228, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1066559553146362, + "rewards/margins": 0.5373567938804626, + "rewards/rejected": -1.6440128087997437, + "step": 590 + }, + { + "epoch": 0.18, + "learning_rate": 4.562913907284768e-07, + "logits/chosen": -2.315936326980591, + "logits/rejected": -2.264455556869507, + "logps/chosen": -111.17767333984375, + "logps/rejected": -124.8282699584961, + "loss": 0.5129, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9577595591545105, + "rewards/margins": 1.1823832988739014, + "rewards/rejected": -2.1401429176330566, + "step": 600 + }, + { + "epoch": 0.18, + "learning_rate": 4.54635761589404e-07, + "logits/chosen": -2.4507012367248535, + "logits/rejected": -2.402617931365967, + "logps/chosen": -112.44432067871094, + "logps/rejected": -117.4054946899414, + "loss": 0.7353, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0404579639434814, + "rewards/margins": 0.9835718870162964, + "rewards/rejected": -2.0240299701690674, + "step": 610 + }, + { + "epoch": 0.18, + "learning_rate": 4.5298013245033113e-07, + "logits/chosen": -2.410632610321045, + "logits/rejected": -2.4103057384490967, + "logps/chosen": -102.71327209472656, + "logps/rejected": -118.40677642822266, + "loss": 0.496, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.740358829498291, + "rewards/margins": 1.1352561712265015, + "rewards/rejected": -1.875615119934082, + "step": 620 + }, + { + "epoch": 0.19, + "learning_rate": 4.5132450331125827e-07, + "logits/chosen": -2.2251460552215576, + "logits/rejected": -2.2362751960754395, + "logps/chosen": -106.41926574707031, + "logps/rejected": -108.68377685546875, + "loss": 0.9923, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0975613594055176, + "rewards/margins": 1.4764889478683472, + "rewards/rejected": -2.5740504264831543, + "step": 630 + }, + { + "epoch": 0.19, + "learning_rate": 4.496688741721854e-07, + "logits/chosen": -2.2884135246276855, + "logits/rejected": -2.3148610591888428, + "logps/chosen": -95.29539489746094, + "logps/rejected": -102.16908264160156, + "loss": 0.6218, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0151549577713013, + "rewards/margins": 0.7811011075973511, + "rewards/rejected": -1.7962560653686523, + "step": 640 + }, + { + "epoch": 0.19, + "learning_rate": 4.4801324503311255e-07, + "logits/chosen": -2.266324520111084, + "logits/rejected": -2.1928133964538574, + "logps/chosen": -94.09014892578125, + "logps/rejected": -102.40970611572266, + "loss": 0.525, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.572268009185791, + "rewards/margins": 0.8905662298202515, + "rewards/rejected": -1.462834358215332, + "step": 650 + }, + { + "epoch": 0.2, + "learning_rate": 4.463576158940397e-07, + "logits/chosen": -2.2293038368225098, + "logits/rejected": -2.1520204544067383, + "logps/chosen": -119.3239974975586, + "logps/rejected": -125.38603210449219, + "loss": 0.7517, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5861669182777405, + "rewards/margins": 1.747180700302124, + "rewards/rejected": -2.333347797393799, + "step": 660 + }, + { + "epoch": 0.2, + "learning_rate": 4.4470198675496683e-07, + "logits/chosen": -2.3797130584716797, + "logits/rejected": -2.3224523067474365, + "logps/chosen": -103.2835922241211, + "logps/rejected": -110.06852722167969, + "loss": 0.6595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6205762028694153, + "rewards/margins": 0.7353760600090027, + "rewards/rejected": -1.355952262878418, + "step": 670 + }, + { + "epoch": 0.2, + "learning_rate": 4.43046357615894e-07, + "logits/chosen": -2.286005973815918, + "logits/rejected": -2.243605375289917, + "logps/chosen": -122.601806640625, + "logps/rejected": -152.8876190185547, + "loss": 0.4932, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.38620439171791077, + "rewards/margins": 1.5684607028961182, + "rewards/rejected": -1.9546654224395752, + "step": 680 + }, + { + "epoch": 0.21, + "learning_rate": 4.4139072847682116e-07, + "logits/chosen": -2.2788243293762207, + "logits/rejected": -2.3132455348968506, + "logps/chosen": -109.7626724243164, + "logps/rejected": -121.210693359375, + "loss": 0.5107, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1420578509569168, + "rewards/margins": 1.0154675245285034, + "rewards/rejected": -1.1575253009796143, + "step": 690 + }, + { + "epoch": 0.21, + "learning_rate": 4.397350993377483e-07, + "logits/chosen": -2.1815805435180664, + "logits/rejected": -2.2088842391967773, + "logps/chosen": -97.82100677490234, + "logps/rejected": -110.3985595703125, + "loss": 0.5536, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.8147605657577515, + "rewards/margins": 1.0990091562271118, + "rewards/rejected": -1.9137697219848633, + "step": 700 + }, + { + "epoch": 0.21, + "learning_rate": 4.380794701986755e-07, + "logits/chosen": -2.1567564010620117, + "logits/rejected": -2.213163375854492, + "logps/chosen": -88.54952239990234, + "logps/rejected": -115.46138000488281, + "loss": 0.5308, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8950099945068359, + "rewards/margins": 0.986484169960022, + "rewards/rejected": -1.881494164466858, + "step": 710 + }, + { + "epoch": 0.21, + "learning_rate": 4.3642384105960263e-07, + "logits/chosen": -2.195145845413208, + "logits/rejected": -2.1583914756774902, + "logps/chosen": -89.95973205566406, + "logps/rejected": -90.32757568359375, + "loss": 0.5371, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1613867282867432, + "rewards/margins": 1.2194865942001343, + "rewards/rejected": -2.380873203277588, + "step": 720 + }, + { + "epoch": 0.22, + "learning_rate": 4.347682119205298e-07, + "logits/chosen": -2.157541513442993, + "logits/rejected": -2.0622384548187256, + "logps/chosen": -122.03218078613281, + "logps/rejected": -133.05084228515625, + "loss": 0.4753, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2253652811050415, + "rewards/margins": 1.0726807117462158, + "rewards/rejected": -2.2980456352233887, + "step": 730 + }, + { + "epoch": 0.22, + "learning_rate": 4.3311258278145697e-07, + "logits/chosen": -2.2362678050994873, + "logits/rejected": -2.2174267768859863, + "logps/chosen": -104.6390151977539, + "logps/rejected": -108.64559173583984, + "loss": 0.5309, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9815710186958313, + "rewards/margins": 0.686564564704895, + "rewards/rejected": -1.6681352853775024, + "step": 740 + }, + { + "epoch": 0.22, + "learning_rate": 4.314569536423841e-07, + "logits/chosen": -2.3051602840423584, + "logits/rejected": -2.205004930496216, + "logps/chosen": -112.1572494506836, + "logps/rejected": -115.80439758300781, + "loss": 3.3956, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1993415355682373, + "rewards/margins": 0.9256394505500793, + "rewards/rejected": -2.124980926513672, + "step": 750 + }, + { + "epoch": 0.23, + "learning_rate": 4.2980132450331125e-07, + "logits/chosen": -2.1093502044677734, + "logits/rejected": -2.1304099559783936, + "logps/chosen": -101.59135437011719, + "logps/rejected": -121.8282241821289, + "loss": 0.6244, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4122694730758667, + "rewards/margins": 1.0248304605484009, + "rewards/rejected": -2.4371001720428467, + "step": 760 + }, + { + "epoch": 0.23, + "learning_rate": 4.281456953642384e-07, + "logits/chosen": -2.1925549507141113, + "logits/rejected": -2.2341551780700684, + "logps/chosen": -125.73774719238281, + "logps/rejected": -137.68995666503906, + "loss": 0.5342, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3582961559295654, + "rewards/margins": 1.1268060207366943, + "rewards/rejected": -2.4851021766662598, + "step": 770 + }, + { + "epoch": 0.23, + "learning_rate": 4.2649006622516553e-07, + "logits/chosen": -2.1826648712158203, + "logits/rejected": -2.0866520404815674, + "logps/chosen": -112.77205657958984, + "logps/rejected": -135.73634338378906, + "loss": 0.6883, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9496241807937622, + "rewards/margins": 2.8248558044433594, + "rewards/rejected": -3.774479627609253, + "step": 780 + }, + { + "epoch": 0.24, + "learning_rate": 4.2483443708609267e-07, + "logits/chosen": -2.2106716632843018, + "logits/rejected": -2.2418696880340576, + "logps/chosen": -100.58189392089844, + "logps/rejected": -122.01805114746094, + "loss": 0.4801, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5530904531478882, + "rewards/margins": 1.249182939529419, + "rewards/rejected": -2.802273750305176, + "step": 790 + }, + { + "epoch": 0.24, + "learning_rate": 4.231788079470198e-07, + "logits/chosen": -2.1691789627075195, + "logits/rejected": -2.082367181777954, + "logps/chosen": -100.97856903076172, + "logps/rejected": -102.23161315917969, + "loss": 0.5207, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1863991022109985, + "rewards/margins": 1.5210940837860107, + "rewards/rejected": -2.707493305206299, + "step": 800 + }, + { + "epoch": 0.24, + "learning_rate": 4.21523178807947e-07, + "logits/chosen": -2.321969985961914, + "logits/rejected": -2.2945773601531982, + "logps/chosen": -95.80015563964844, + "logps/rejected": -103.98514556884766, + "loss": 0.5769, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6277263760566711, + "rewards/margins": 1.0614575147628784, + "rewards/rejected": -1.6891838312149048, + "step": 810 + }, + { + "epoch": 0.24, + "learning_rate": 4.1986754966887414e-07, + "logits/chosen": -2.16201114654541, + "logits/rejected": -2.100698471069336, + "logps/chosen": -107.64762878417969, + "logps/rejected": -114.20783996582031, + "loss": 0.5842, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0765411853790283, + "rewards/margins": 0.9841095209121704, + "rewards/rejected": -2.060650587081909, + "step": 820 + }, + { + "epoch": 0.25, + "learning_rate": 4.1821192052980133e-07, + "logits/chosen": -2.15731143951416, + "logits/rejected": -2.1200685501098633, + "logps/chosen": -94.93736267089844, + "logps/rejected": -108.20992279052734, + "loss": 0.502, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5728567838668823, + "rewards/margins": 1.2192682027816772, + "rewards/rejected": -1.7921253442764282, + "step": 830 + }, + { + "epoch": 0.25, + "learning_rate": 4.165562913907285e-07, + "logits/chosen": -2.228494644165039, + "logits/rejected": -2.199162006378174, + "logps/chosen": -119.44285583496094, + "logps/rejected": -124.89945220947266, + "loss": 0.5335, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5043415427207947, + "rewards/margins": 1.4671887159347534, + "rewards/rejected": -1.9715303182601929, + "step": 840 + }, + { + "epoch": 0.25, + "learning_rate": 4.149006622516556e-07, + "logits/chosen": -2.242833137512207, + "logits/rejected": -2.193368673324585, + "logps/chosen": -106.42388916015625, + "logps/rejected": -115.7519302368164, + "loss": 0.5458, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2504803538322449, + "rewards/margins": 1.4816687107086182, + "rewards/rejected": -1.7321488857269287, + "step": 850 + }, + { + "epoch": 0.26, + "learning_rate": 4.1324503311258276e-07, + "logits/chosen": -2.296274185180664, + "logits/rejected": -2.233081340789795, + "logps/chosen": -97.89036560058594, + "logps/rejected": -118.38981628417969, + "loss": 0.6251, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.856580376625061, + "rewards/margins": 1.5226026773452759, + "rewards/rejected": -2.379183053970337, + "step": 860 + }, + { + "epoch": 0.26, + "learning_rate": 4.1158940397350995e-07, + "logits/chosen": -2.2974660396575928, + "logits/rejected": -2.1640889644622803, + "logps/chosen": -111.53731536865234, + "logps/rejected": -109.1888656616211, + "loss": 0.4891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9736050367355347, + "rewards/margins": 1.2244486808776855, + "rewards/rejected": -2.1980538368225098, + "step": 870 + }, + { + "epoch": 0.26, + "learning_rate": 4.099337748344371e-07, + "logits/chosen": -2.130094289779663, + "logits/rejected": -2.0237298011779785, + "logps/chosen": -116.61064147949219, + "logps/rejected": -123.98744201660156, + "loss": 0.9585, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.352774143218994, + "rewards/margins": 1.3028422594070435, + "rewards/rejected": -3.6556167602539062, + "step": 880 + }, + { + "epoch": 0.27, + "learning_rate": 4.0827814569536423e-07, + "logits/chosen": -2.1122946739196777, + "logits/rejected": -2.1758933067321777, + "logps/chosen": -92.36100769042969, + "logps/rejected": -117.257080078125, + "loss": 0.5243, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1455414295196533, + "rewards/margins": 1.1456917524337769, + "rewards/rejected": -2.291233539581299, + "step": 890 + }, + { + "epoch": 0.27, + "learning_rate": 4.0662251655629137e-07, + "logits/chosen": -2.1967172622680664, + "logits/rejected": -2.163334369659424, + "logps/chosen": -94.69267272949219, + "logps/rejected": -106.30582427978516, + "loss": 0.5839, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9426735043525696, + "rewards/margins": 0.8261833190917969, + "rewards/rejected": -1.7688567638397217, + "step": 900 + }, + { + "epoch": 0.27, + "learning_rate": 4.049668874172185e-07, + "logits/chosen": -2.2559750080108643, + "logits/rejected": -2.2480287551879883, + "logps/chosen": -114.98677825927734, + "logps/rejected": -118.00787353515625, + "loss": 0.6499, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.6318261623382568, + "rewards/margins": 0.813581645488739, + "rewards/rejected": -2.4454076290130615, + "step": 910 + }, + { + "epoch": 0.27, + "learning_rate": 4.0331125827814565e-07, + "logits/chosen": -2.277569055557251, + "logits/rejected": -2.2428252696990967, + "logps/chosen": -106.87760162353516, + "logps/rejected": -107.15045166015625, + "loss": 0.7337, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2901384830474854, + "rewards/margins": 0.8376191854476929, + "rewards/rejected": -2.1277577877044678, + "step": 920 + }, + { + "epoch": 0.28, + "learning_rate": 4.016556291390728e-07, + "logits/chosen": -2.2305266857147217, + "logits/rejected": -2.2446939945220947, + "logps/chosen": -115.1706314086914, + "logps/rejected": -132.69129943847656, + "loss": 0.5205, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4605052471160889, + "rewards/margins": 1.3340156078338623, + "rewards/rejected": -2.794520854949951, + "step": 930 + }, + { + "epoch": 0.28, + "learning_rate": 4e-07, + "logits/chosen": -2.3378500938415527, + "logits/rejected": -2.1980865001678467, + "logps/chosen": -124.11688232421875, + "logps/rejected": -121.2197494506836, + "loss": 0.5762, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1835664510726929, + "rewards/margins": 1.0963947772979736, + "rewards/rejected": -2.279961109161377, + "step": 940 + }, + { + "epoch": 0.28, + "learning_rate": 3.983443708609271e-07, + "logits/chosen": -2.2236156463623047, + "logits/rejected": -2.2054903507232666, + "logps/chosen": -122.0257568359375, + "logps/rejected": -125.471923828125, + "loss": 0.4677, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3710222244262695, + "rewards/margins": 1.238471269607544, + "rewards/rejected": -2.6094937324523926, + "step": 950 + }, + { + "epoch": 0.29, + "learning_rate": 3.966887417218543e-07, + "logits/chosen": -2.2760846614837646, + "logits/rejected": -2.2383294105529785, + "logps/chosen": -104.09146881103516, + "logps/rejected": -120.87336730957031, + "loss": 0.5848, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.4301923513412476, + "rewards/margins": 1.626868486404419, + "rewards/rejected": -3.0570602416992188, + "step": 960 + }, + { + "epoch": 0.29, + "learning_rate": 3.9503311258278146e-07, + "logits/chosen": -2.304551839828491, + "logits/rejected": -2.3333609104156494, + "logps/chosen": -119.12831115722656, + "logps/rejected": -128.80160522460938, + "loss": 0.555, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5498030185699463, + "rewards/margins": 0.8728634715080261, + "rewards/rejected": -3.422666072845459, + "step": 970 + }, + { + "epoch": 0.29, + "learning_rate": 3.933774834437086e-07, + "logits/chosen": -2.2905325889587402, + "logits/rejected": -2.175750255584717, + "logps/chosen": -111.89952087402344, + "logps/rejected": -112.72969055175781, + "loss": 0.5745, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.818068742752075, + "rewards/margins": 0.5174419283866882, + "rewards/rejected": -3.335510730743408, + "step": 980 + }, + { + "epoch": 0.29, + "learning_rate": 3.9172185430463574e-07, + "logits/chosen": -2.3529715538024902, + "logits/rejected": -2.2983202934265137, + "logps/chosen": -136.7278594970703, + "logps/rejected": -129.16085815429688, + "loss": 0.5891, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4263627529144287, + "rewards/margins": 0.8253445625305176, + "rewards/rejected": -3.2517075538635254, + "step": 990 + }, + { + "epoch": 0.3, + "learning_rate": 3.9006622516556293e-07, + "logits/chosen": -2.2374019622802734, + "logits/rejected": -2.2284903526306152, + "logps/chosen": -114.3366470336914, + "logps/rejected": -110.65074157714844, + "loss": 0.6708, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3343544006347656, + "rewards/margins": 1.0057871341705322, + "rewards/rejected": -3.340141773223877, + "step": 1000 + }, + { + "epoch": 0.3, + "eval_logits/chosen": -2.2622616291046143, + "eval_logits/rejected": -2.215507745742798, + "eval_logps/chosen": -119.37470245361328, + "eval_logps/rejected": -125.0894546508789, + "eval_loss": 0.5807133316993713, + "eval_rewards/accuracies": 0.6830357313156128, + "eval_rewards/chosen": -1.950809121131897, + "eval_rewards/margins": 0.8533560633659363, + "eval_rewards/rejected": -2.8041651248931885, + "eval_runtime": 520.9457, + "eval_samples_per_second": 3.428, + "eval_steps_per_second": 0.107, + "step": 1000 + }, + { + "epoch": 0.3, + "learning_rate": 3.8841059602649007e-07, + "logits/chosen": -2.4846906661987305, + "logits/rejected": -2.38966703414917, + "logps/chosen": -122.82658386230469, + "logps/rejected": -122.37986755371094, + "loss": 0.5429, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5582962036132812, + "rewards/margins": 0.7036358118057251, + "rewards/rejected": -2.261931896209717, + "step": 1010 + }, + { + "epoch": 0.3, + "learning_rate": 3.867549668874172e-07, + "logits/chosen": -2.413020133972168, + "logits/rejected": -2.348389148712158, + "logps/chosen": -146.7459716796875, + "logps/rejected": -152.81591796875, + "loss": 0.5503, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.9320647716522217, + "rewards/margins": 1.321993112564087, + "rewards/rejected": -3.2540581226348877, + "step": 1020 + }, + { + "epoch": 0.31, + "learning_rate": 3.8509933774834435e-07, + "logits/chosen": -2.335376262664795, + "logits/rejected": -2.3727335929870605, + "logps/chosen": -96.5339584350586, + "logps/rejected": -102.97718811035156, + "loss": 0.4738, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.347673773765564, + "rewards/margins": 0.7935667037963867, + "rewards/rejected": -2.141240358352661, + "step": 1030 + }, + { + "epoch": 0.31, + "learning_rate": 3.834437086092715e-07, + "logits/chosen": -2.474375009536743, + "logits/rejected": -2.457411527633667, + "logps/chosen": -100.12342071533203, + "logps/rejected": -98.28324890136719, + "loss": 0.5072, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6612989902496338, + "rewards/margins": 1.0141090154647827, + "rewards/rejected": -1.675408124923706, + "step": 1040 + }, + { + "epoch": 0.31, + "learning_rate": 3.8178807947019863e-07, + "logits/chosen": -2.4102184772491455, + "logits/rejected": -2.366565704345703, + "logps/chosen": -94.41053771972656, + "logps/rejected": -106.40338134765625, + "loss": 0.4768, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1945335865020752, + "rewards/margins": 1.7779722213745117, + "rewards/rejected": -2.972505807876587, + "step": 1050 + }, + { + "epoch": 0.32, + "learning_rate": 3.8013245033112577e-07, + "logits/chosen": -2.324781656265259, + "logits/rejected": -2.265265703201294, + "logps/chosen": -113.0925064086914, + "logps/rejected": -116.36458587646484, + "loss": 0.5291, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6215614080429077, + "rewards/margins": 0.970362663269043, + "rewards/rejected": -1.5919239521026611, + "step": 1060 + }, + { + "epoch": 0.32, + "learning_rate": 3.7847682119205296e-07, + "logits/chosen": -2.4248404502868652, + "logits/rejected": -2.3727540969848633, + "logps/chosen": -112.99056243896484, + "logps/rejected": -124.29933166503906, + "loss": 1.2347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.332558035850525, + "rewards/margins": 1.7028182744979858, + "rewards/rejected": -3.0353763103485107, + "step": 1070 + }, + { + "epoch": 0.32, + "learning_rate": 3.7682119205298016e-07, + "logits/chosen": -2.510585308074951, + "logits/rejected": -2.4303643703460693, + "logps/chosen": -122.59515380859375, + "logps/rejected": -119.64692687988281, + "loss": 0.5815, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.8100630044937134, + "rewards/margins": 0.8500891923904419, + "rewards/rejected": -2.660151958465576, + "step": 1080 + }, + { + "epoch": 0.32, + "learning_rate": 3.751655629139073e-07, + "logits/chosen": -2.48645281791687, + "logits/rejected": -2.433279037475586, + "logps/chosen": -131.58583068847656, + "logps/rejected": -139.4903106689453, + "loss": 0.4595, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.738173484802246, + "rewards/margins": 0.9940687417984009, + "rewards/rejected": -2.7322418689727783, + "step": 1090 + }, + { + "epoch": 0.33, + "learning_rate": 3.7350993377483444e-07, + "logits/chosen": -2.2750325202941895, + "logits/rejected": -2.214141845703125, + "logps/chosen": -92.43232727050781, + "logps/rejected": -118.48176574707031, + "loss": 0.4501, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.566083312034607, + "rewards/margins": 1.913888931274414, + "rewards/rejected": -3.4799721240997314, + "step": 1100 + }, + { + "epoch": 0.33, + "learning_rate": 3.718543046357616e-07, + "logits/chosen": -2.3589186668395996, + "logits/rejected": -2.289020538330078, + "logps/chosen": -116.14213562011719, + "logps/rejected": -115.25, + "loss": 0.5489, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.8134400844573975, + "rewards/margins": 1.0903173685073853, + "rewards/rejected": -2.9037575721740723, + "step": 1110 + }, + { + "epoch": 0.33, + "learning_rate": 3.701986754966887e-07, + "logits/chosen": -2.4153926372528076, + "logits/rejected": -2.38564133644104, + "logps/chosen": -198.99185180664062, + "logps/rejected": -211.7269744873047, + "loss": 0.4915, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.85645580291748, + "rewards/margins": 1.3634490966796875, + "rewards/rejected": -10.219904899597168, + "step": 1120 + }, + { + "epoch": 0.34, + "learning_rate": 3.6854304635761586e-07, + "logits/chosen": -2.3718574047088623, + "logits/rejected": -2.323935031890869, + "logps/chosen": -114.41487121582031, + "logps/rejected": -115.03157806396484, + "loss": 0.5742, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.747230887413025, + "rewards/margins": 0.9782400131225586, + "rewards/rejected": -2.725471019744873, + "step": 1130 + }, + { + "epoch": 0.34, + "learning_rate": 3.6688741721854305e-07, + "logits/chosen": -2.266796350479126, + "logits/rejected": -2.279444456100464, + "logps/chosen": -125.43962097167969, + "logps/rejected": -138.60568237304688, + "loss": 0.5559, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7848097085952759, + "rewards/margins": 0.5874557495117188, + "rewards/rejected": -2.372265577316284, + "step": 1140 + }, + { + "epoch": 0.34, + "learning_rate": 3.652317880794702e-07, + "logits/chosen": -2.3460640907287598, + "logits/rejected": -2.2017135620117188, + "logps/chosen": -173.7471923828125, + "logps/rejected": -175.39913940429688, + "loss": 0.6409, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.320539474487305, + "rewards/margins": 0.5577089190483093, + "rewards/rejected": -8.87824821472168, + "step": 1150 + }, + { + "epoch": 0.35, + "learning_rate": 3.6357615894039733e-07, + "logits/chosen": -2.3931944370269775, + "logits/rejected": -2.295135498046875, + "logps/chosen": -117.7610855102539, + "logps/rejected": -131.86878967285156, + "loss": 0.5557, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.364652395248413, + "rewards/margins": 0.9879738092422485, + "rewards/rejected": -2.352626323699951, + "step": 1160 + }, + { + "epoch": 0.35, + "learning_rate": 3.6192052980132447e-07, + "logits/chosen": -2.3470005989074707, + "logits/rejected": -2.314392328262329, + "logps/chosen": -116.54869079589844, + "logps/rejected": -121.33402252197266, + "loss": 0.474, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6547797918319702, + "rewards/margins": 1.0188862085342407, + "rewards/rejected": -2.6736655235290527, + "step": 1170 + }, + { + "epoch": 0.35, + "learning_rate": 3.602649006622516e-07, + "logits/chosen": -2.3665614128112793, + "logits/rejected": -2.2760112285614014, + "logps/chosen": -116.99346923828125, + "logps/rejected": -194.17459106445312, + "loss": 0.4616, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1822216510772705, + "rewards/margins": 7.195115089416504, + "rewards/rejected": -8.377335548400879, + "step": 1180 + }, + { + "epoch": 0.35, + "learning_rate": 3.5860927152317875e-07, + "logits/chosen": -2.378209114074707, + "logits/rejected": -2.3278615474700928, + "logps/chosen": -119.82206726074219, + "logps/rejected": -126.67927551269531, + "loss": 0.5238, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.758419394493103, + "rewards/margins": 1.255906343460083, + "rewards/rejected": -3.0143258571624756, + "step": 1190 + }, + { + "epoch": 0.36, + "learning_rate": 3.5695364238410594e-07, + "logits/chosen": -2.376044750213623, + "logits/rejected": -2.308166265487671, + "logps/chosen": -113.3560562133789, + "logps/rejected": -114.37657165527344, + "loss": 0.5637, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4774434566497803, + "rewards/margins": 0.5793313384056091, + "rewards/rejected": -2.056774616241455, + "step": 1200 + }, + { + "epoch": 0.36, + "learning_rate": 3.5529801324503314e-07, + "logits/chosen": -2.374824285507202, + "logits/rejected": -2.3935980796813965, + "logps/chosen": -108.83685302734375, + "logps/rejected": -115.5961685180664, + "loss": 0.5626, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5476689338684082, + "rewards/margins": 0.7745328545570374, + "rewards/rejected": -2.322201728820801, + "step": 1210 + }, + { + "epoch": 0.36, + "learning_rate": 3.536423841059603e-07, + "logits/chosen": -2.339582681655884, + "logits/rejected": -2.361855983734131, + "logps/chosen": -121.9773941040039, + "logps/rejected": -133.71356201171875, + "loss": 0.5682, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.1847705841064453, + "rewards/margins": 0.9190909266471863, + "rewards/rejected": -3.1038613319396973, + "step": 1220 + }, + { + "epoch": 0.37, + "learning_rate": 3.519867549668874e-07, + "logits/chosen": -2.2750911712646484, + "logits/rejected": -2.235349416732788, + "logps/chosen": -83.35279846191406, + "logps/rejected": -102.2212905883789, + "loss": 0.5579, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9985214471817017, + "rewards/margins": 1.0258718729019165, + "rewards/rejected": -3.024393081665039, + "step": 1230 + }, + { + "epoch": 0.37, + "learning_rate": 3.5033112582781456e-07, + "logits/chosen": -2.3950631618499756, + "logits/rejected": -2.286043643951416, + "logps/chosen": -112.0318603515625, + "logps/rejected": -116.00035095214844, + "loss": 0.9739, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5054577589035034, + "rewards/margins": 0.963812530040741, + "rewards/rejected": -2.4692704677581787, + "step": 1240 + }, + { + "epoch": 0.37, + "learning_rate": 3.486754966887417e-07, + "logits/chosen": -2.2539525032043457, + "logits/rejected": -2.280163526535034, + "logps/chosen": -90.12135314941406, + "logps/rejected": -103.78663635253906, + "loss": 0.5341, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0227272510528564, + "rewards/margins": 0.8646720051765442, + "rewards/rejected": -1.8873993158340454, + "step": 1250 + }, + { + "epoch": 0.38, + "learning_rate": 3.4701986754966884e-07, + "logits/chosen": -2.3163156509399414, + "logits/rejected": -2.324432849884033, + "logps/chosen": -91.22362518310547, + "logps/rejected": -103.0667953491211, + "loss": 0.5978, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4652012288570404, + "rewards/margins": 1.1732677221298218, + "rewards/rejected": -1.6384689807891846, + "step": 1260 + }, + { + "epoch": 0.38, + "learning_rate": 3.4536423841059603e-07, + "logits/chosen": -2.3076674938201904, + "logits/rejected": -2.3175816535949707, + "logps/chosen": -100.41036224365234, + "logps/rejected": -117.35456848144531, + "loss": 0.648, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2293812036514282, + "rewards/margins": 0.8102920651435852, + "rewards/rejected": -2.039673089981079, + "step": 1270 + }, + { + "epoch": 0.38, + "learning_rate": 3.4370860927152317e-07, + "logits/chosen": -2.3699378967285156, + "logits/rejected": -2.3562657833099365, + "logps/chosen": -119.7326889038086, + "logps/rejected": -131.7585906982422, + "loss": 0.4977, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6823489665985107, + "rewards/margins": 1.3262075185775757, + "rewards/rejected": -2.008556604385376, + "step": 1280 + }, + { + "epoch": 0.38, + "learning_rate": 3.420529801324503e-07, + "logits/chosen": -2.1104941368103027, + "logits/rejected": -2.0905330181121826, + "logps/chosen": -171.0330810546875, + "logps/rejected": -175.89886474609375, + "loss": 0.9012, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -7.170141696929932, + "rewards/margins": -0.5844208002090454, + "rewards/rejected": -6.585721015930176, + "step": 1290 + }, + { + "epoch": 0.39, + "learning_rate": 3.4039735099337745e-07, + "logits/chosen": -2.3445873260498047, + "logits/rejected": -2.2650883197784424, + "logps/chosen": -130.222900390625, + "logps/rejected": -132.6783447265625, + "loss": 0.6484, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4724981784820557, + "rewards/margins": 0.9613865613937378, + "rewards/rejected": -2.433884859085083, + "step": 1300 + }, + { + "epoch": 0.39, + "learning_rate": 3.387417218543046e-07, + "logits/chosen": -2.457414150238037, + "logits/rejected": -2.5287423133850098, + "logps/chosen": -135.46902465820312, + "logps/rejected": -159.89739990234375, + "loss": 0.4958, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7360942363739014, + "rewards/margins": 1.1658858060836792, + "rewards/rejected": -2.901979923248291, + "step": 1310 + }, + { + "epoch": 0.39, + "learning_rate": 3.3708609271523173e-07, + "logits/chosen": -2.221667766571045, + "logits/rejected": -2.208982467651367, + "logps/chosen": -106.48121643066406, + "logps/rejected": -104.82066345214844, + "loss": 0.5892, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4603312015533447, + "rewards/margins": 0.8812816739082336, + "rewards/rejected": -2.3416128158569336, + "step": 1320 + }, + { + "epoch": 0.4, + "learning_rate": 3.35430463576159e-07, + "logits/chosen": -2.2299439907073975, + "logits/rejected": -2.225663423538208, + "logps/chosen": -94.22245025634766, + "logps/rejected": -100.85789489746094, + "loss": 0.5178, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.32827529311180115, + "rewards/margins": 1.4746736288070679, + "rewards/rejected": -1.8029489517211914, + "step": 1330 + }, + { + "epoch": 0.4, + "learning_rate": 3.337748344370861e-07, + "logits/chosen": -2.2380738258361816, + "logits/rejected": -2.2997546195983887, + "logps/chosen": -98.22574615478516, + "logps/rejected": -112.7544937133789, + "loss": 0.5318, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.4563646912574768, + "rewards/margins": 0.6618258953094482, + "rewards/rejected": -1.1181905269622803, + "step": 1340 + }, + { + "epoch": 0.4, + "learning_rate": 3.3211920529801326e-07, + "logits/chosen": -2.405059814453125, + "logits/rejected": -2.429863452911377, + "logps/chosen": -107.7689437866211, + "logps/rejected": -119.28629302978516, + "loss": 0.5111, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2023572474718094, + "rewards/margins": 0.8103917241096497, + "rewards/rejected": -1.0127489566802979, + "step": 1350 + }, + { + "epoch": 0.41, + "learning_rate": 3.304635761589404e-07, + "logits/chosen": -2.3326334953308105, + "logits/rejected": -2.256371259689331, + "logps/chosen": -111.0186767578125, + "logps/rejected": -101.33964538574219, + "loss": 0.5624, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1809541881084442, + "rewards/margins": 0.9960860013961792, + "rewards/rejected": -1.177040457725525, + "step": 1360 + }, + { + "epoch": 0.41, + "learning_rate": 3.2880794701986754e-07, + "logits/chosen": -2.228715181350708, + "logits/rejected": -2.2780203819274902, + "logps/chosen": -82.96192932128906, + "logps/rejected": -106.3135757446289, + "loss": 0.5477, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.4178234040737152, + "rewards/margins": 1.3403428792953491, + "rewards/rejected": -1.7581663131713867, + "step": 1370 + }, + { + "epoch": 0.41, + "learning_rate": 3.271523178807947e-07, + "logits/chosen": -2.3673007488250732, + "logits/rejected": -2.361161947250366, + "logps/chosen": -110.33650970458984, + "logps/rejected": -118.94313049316406, + "loss": 0.6233, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7891864776611328, + "rewards/margins": 0.4793139100074768, + "rewards/rejected": -1.2685004472732544, + "step": 1380 + }, + { + "epoch": 0.41, + "learning_rate": 3.254966887417218e-07, + "logits/chosen": -2.270993947982788, + "logits/rejected": -2.3422646522521973, + "logps/chosen": -110.1440200805664, + "logps/rejected": -123.39158630371094, + "loss": 0.5202, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0290793180465698, + "rewards/margins": 0.8790245056152344, + "rewards/rejected": -1.9081039428710938, + "step": 1390 + }, + { + "epoch": 0.42, + "learning_rate": 3.23841059602649e-07, + "logits/chosen": -2.3637521266937256, + "logits/rejected": -2.3246121406555176, + "logps/chosen": -123.53662109375, + "logps/rejected": -130.99119567871094, + "loss": 0.4855, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9286755323410034, + "rewards/margins": 1.2442817687988281, + "rewards/rejected": -2.172957181930542, + "step": 1400 + }, + { + "epoch": 0.42, + "learning_rate": 3.2218543046357615e-07, + "logits/chosen": -2.1791653633117676, + "logits/rejected": -2.18937349319458, + "logps/chosen": -166.4168243408203, + "logps/rejected": -188.746826171875, + "loss": 0.633, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -5.239639759063721, + "rewards/margins": 1.5334604978561401, + "rewards/rejected": -6.77310037612915, + "step": 1410 + }, + { + "epoch": 0.42, + "learning_rate": 3.205298013245033e-07, + "logits/chosen": -2.234860897064209, + "logits/rejected": -2.235252857208252, + "logps/chosen": -131.88731384277344, + "logps/rejected": -147.35777282714844, + "loss": 0.6159, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8779850006103516, + "rewards/margins": 1.1691521406173706, + "rewards/rejected": -4.0471367835998535, + "step": 1420 + }, + { + "epoch": 0.43, + "learning_rate": 3.1887417218543043e-07, + "logits/chosen": -2.3836143016815186, + "logits/rejected": -2.35886287689209, + "logps/chosen": -122.30987548828125, + "logps/rejected": -125.3285903930664, + "loss": 0.536, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5127007961273193, + "rewards/margins": 1.0224969387054443, + "rewards/rejected": -3.5351977348327637, + "step": 1430 + }, + { + "epoch": 0.43, + "learning_rate": 3.1721854304635757e-07, + "logits/chosen": -2.280726194381714, + "logits/rejected": -2.2161917686462402, + "logps/chosen": -118.85569763183594, + "logps/rejected": -122.44981384277344, + "loss": 0.5343, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.2102978229522705, + "rewards/margins": 1.2170623540878296, + "rewards/rejected": -3.4273605346679688, + "step": 1440 + }, + { + "epoch": 0.43, + "learning_rate": 3.155629139072847e-07, + "logits/chosen": -2.445349931716919, + "logits/rejected": -2.4110920429229736, + "logps/chosen": -110.351806640625, + "logps/rejected": -118.65000915527344, + "loss": 0.5877, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.997372031211853, + "rewards/margins": 0.7109335660934448, + "rewards/rejected": -2.708305835723877, + "step": 1450 + }, + { + "epoch": 0.44, + "learning_rate": 3.1390728476821196e-07, + "logits/chosen": -2.3366105556488037, + "logits/rejected": -2.3466391563415527, + "logps/chosen": -127.3154067993164, + "logps/rejected": -124.52156829833984, + "loss": 0.5383, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.18947172164917, + "rewards/margins": 0.6396933794021606, + "rewards/rejected": -2.829165458679199, + "step": 1460 + }, + { + "epoch": 0.44, + "learning_rate": 3.122516556291391e-07, + "logits/chosen": -2.260577917098999, + "logits/rejected": -2.2590389251708984, + "logps/chosen": -113.4861831665039, + "logps/rejected": -108.08863830566406, + "loss": 0.6422, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.5557265281677246, + "rewards/margins": 0.4942797124385834, + "rewards/rejected": -3.050006151199341, + "step": 1470 + }, + { + "epoch": 0.44, + "learning_rate": 3.1059602649006624e-07, + "logits/chosen": -2.3765158653259277, + "logits/rejected": -2.3215491771698, + "logps/chosen": -123.18157958984375, + "logps/rejected": -119.86373138427734, + "loss": 0.4739, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5957121849060059, + "rewards/margins": 0.9963384866714478, + "rewards/rejected": -2.592050552368164, + "step": 1480 + }, + { + "epoch": 0.44, + "learning_rate": 3.089403973509934e-07, + "logits/chosen": -2.3044986724853516, + "logits/rejected": -2.3184516429901123, + "logps/chosen": -120.02888488769531, + "logps/rejected": -125.82350158691406, + "loss": 0.5382, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5062299966812134, + "rewards/margins": 1.3234798908233643, + "rewards/rejected": -2.829709529876709, + "step": 1490 + }, + { + "epoch": 0.45, + "learning_rate": 3.072847682119205e-07, + "logits/chosen": -2.315985918045044, + "logits/rejected": -2.262968063354492, + "logps/chosen": -114.96397399902344, + "logps/rejected": -129.48318481445312, + "loss": 0.5984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5051807165145874, + "rewards/margins": 1.5209523439407349, + "rewards/rejected": -3.0261335372924805, + "step": 1500 + }, + { + "epoch": 0.45, + "eval_logits/chosen": -2.1823971271514893, + "eval_logits/rejected": -2.1383469104766846, + "eval_logps/chosen": -114.31800079345703, + "eval_logps/rejected": -123.8126220703125, + "eval_loss": 0.5244069695472717, + "eval_rewards/accuracies": 0.71875, + "eval_rewards/chosen": -1.4451391696929932, + "eval_rewards/margins": 1.2313430309295654, + "eval_rewards/rejected": -2.6764819622039795, + "eval_runtime": 522.8803, + "eval_samples_per_second": 3.416, + "eval_steps_per_second": 0.107, + "step": 1500 + }, + { + "epoch": 0.45, + "learning_rate": 3.0562913907284766e-07, + "logits/chosen": -2.313927173614502, + "logits/rejected": -2.33535099029541, + "logps/chosen": -118.8395767211914, + "logps/rejected": -128.5199737548828, + "loss": 0.5877, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.8590469360351562, + "rewards/margins": 0.7332299947738647, + "rewards/rejected": -2.5922768115997314, + "step": 1510 + }, + { + "epoch": 0.45, + "learning_rate": 3.039735099337748e-07, + "logits/chosen": -2.3866069316864014, + "logits/rejected": -2.3465638160705566, + "logps/chosen": -120.46064758300781, + "logps/rejected": -116.50871276855469, + "loss": 0.8307, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8921232223510742, + "rewards/margins": 0.763845682144165, + "rewards/rejected": -2.6559691429138184, + "step": 1520 + }, + { + "epoch": 0.46, + "learning_rate": 3.02317880794702e-07, + "logits/chosen": -2.421853542327881, + "logits/rejected": -2.318270206451416, + "logps/chosen": -137.11500549316406, + "logps/rejected": -134.6293487548828, + "loss": 0.5303, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.621045708656311, + "rewards/margins": 0.8223884701728821, + "rewards/rejected": -2.443434238433838, + "step": 1530 + }, + { + "epoch": 0.46, + "learning_rate": 3.0066225165562913e-07, + "logits/chosen": -2.35496187210083, + "logits/rejected": -2.2371764183044434, + "logps/chosen": -111.7890853881836, + "logps/rejected": -106.77983093261719, + "loss": 0.5696, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0801293849945068, + "rewards/margins": 1.046350121498108, + "rewards/rejected": -2.1264796257019043, + "step": 1540 + }, + { + "epoch": 0.46, + "learning_rate": 2.9900662251655627e-07, + "logits/chosen": -2.329745054244995, + "logits/rejected": -2.2365243434906006, + "logps/chosen": -112.671875, + "logps/rejected": -102.41822814941406, + "loss": 0.6113, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2923063039779663, + "rewards/margins": 1.320711612701416, + "rewards/rejected": -2.613018035888672, + "step": 1550 + }, + { + "epoch": 0.46, + "learning_rate": 2.973509933774834e-07, + "logits/chosen": -2.2645044326782227, + "logits/rejected": -2.1956381797790527, + "logps/chosen": -104.55106353759766, + "logps/rejected": -106.2977294921875, + "loss": 0.7244, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3651814460754395, + "rewards/margins": 0.206166073679924, + "rewards/rejected": -1.5713475942611694, + "step": 1560 + }, + { + "epoch": 0.47, + "learning_rate": 2.9569536423841055e-07, + "logits/chosen": -2.1943066120147705, + "logits/rejected": -2.24649977684021, + "logps/chosen": -89.5943374633789, + "logps/rejected": -106.08259582519531, + "loss": 0.5679, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.288975477218628, + "rewards/margins": 0.5116127133369446, + "rewards/rejected": -1.8005882501602173, + "step": 1570 + }, + { + "epoch": 0.47, + "learning_rate": 2.940397350993377e-07, + "logits/chosen": -2.607445240020752, + "logits/rejected": -2.4970269203186035, + "logps/chosen": -146.52468872070312, + "logps/rejected": -140.1497344970703, + "loss": 0.5597, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5630671977996826, + "rewards/margins": 0.6673210263252258, + "rewards/rejected": -2.2303881645202637, + "step": 1580 + }, + { + "epoch": 0.47, + "learning_rate": 2.9238410596026494e-07, + "logits/chosen": -2.238361358642578, + "logits/rejected": -2.1506645679473877, + "logps/chosen": -99.36707305908203, + "logps/rejected": -105.86885070800781, + "loss": 0.5912, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7212435007095337, + "rewards/margins": 1.0883468389511108, + "rewards/rejected": -2.8095905780792236, + "step": 1590 + }, + { + "epoch": 0.48, + "learning_rate": 2.907284768211921e-07, + "logits/chosen": -2.3251490592956543, + "logits/rejected": -2.307288408279419, + "logps/chosen": -115.70127868652344, + "logps/rejected": -124.7564926147461, + "loss": 0.6455, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.711801290512085, + "rewards/margins": 1.0508122444152832, + "rewards/rejected": -2.7626137733459473, + "step": 1600 + }, + { + "epoch": 0.48, + "learning_rate": 2.890728476821192e-07, + "logits/chosen": -2.2478513717651367, + "logits/rejected": -2.2882168292999268, + "logps/chosen": -104.28340911865234, + "logps/rejected": -121.9701156616211, + "loss": 0.5165, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.887414574623108, + "rewards/margins": 0.925916850566864, + "rewards/rejected": -2.8133316040039062, + "step": 1610 + }, + { + "epoch": 0.48, + "learning_rate": 2.8741721854304636e-07, + "logits/chosen": -2.296663999557495, + "logits/rejected": -2.3100745677948, + "logps/chosen": -158.80392456054688, + "logps/rejected": -124.94380950927734, + "loss": 1.3257, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -4.360524654388428, + "rewards/margins": -2.369443655014038, + "rewards/rejected": -1.9910815954208374, + "step": 1620 + }, + { + "epoch": 0.49, + "learning_rate": 2.857615894039735e-07, + "logits/chosen": -2.2816338539123535, + "logits/rejected": -2.2055506706237793, + "logps/chosen": -116.35295104980469, + "logps/rejected": -125.78230285644531, + "loss": 0.4623, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.8872613906860352, + "rewards/margins": 1.402567982673645, + "rewards/rejected": -3.2898292541503906, + "step": 1630 + }, + { + "epoch": 0.49, + "learning_rate": 2.8410596026490064e-07, + "logits/chosen": -2.3136837482452393, + "logits/rejected": -2.327634334564209, + "logps/chosen": -118.3811264038086, + "logps/rejected": -131.34634399414062, + "loss": 0.4969, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.959405243396759, + "rewards/margins": 1.3146027326583862, + "rewards/rejected": -2.27400803565979, + "step": 1640 + }, + { + "epoch": 0.49, + "learning_rate": 2.824503311258278e-07, + "logits/chosen": -2.2207038402557373, + "logits/rejected": -2.2655978202819824, + "logps/chosen": -122.19026184082031, + "logps/rejected": -130.85289001464844, + "loss": 0.5344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.631255865097046, + "rewards/margins": 1.0431182384490967, + "rewards/rejected": -2.6743741035461426, + "step": 1650 + }, + { + "epoch": 0.49, + "learning_rate": 2.8079470198675497e-07, + "logits/chosen": -2.3095381259918213, + "logits/rejected": -2.2248189449310303, + "logps/chosen": -103.5934066772461, + "logps/rejected": -116.8990249633789, + "loss": 0.5347, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.2814357280731201, + "rewards/margins": 1.4714066982269287, + "rewards/rejected": -2.752842426300049, + "step": 1660 + }, + { + "epoch": 0.5, + "learning_rate": 2.791390728476821e-07, + "logits/chosen": -2.2992262840270996, + "logits/rejected": -2.3474018573760986, + "logps/chosen": -140.76292419433594, + "logps/rejected": -168.6060333251953, + "loss": 0.5155, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.161952018737793, + "rewards/margins": 1.1705152988433838, + "rewards/rejected": -3.3324673175811768, + "step": 1670 + }, + { + "epoch": 0.5, + "learning_rate": 2.7748344370860925e-07, + "logits/chosen": -2.1538851261138916, + "logits/rejected": -2.1492209434509277, + "logps/chosen": -85.61529541015625, + "logps/rejected": -105.35960388183594, + "loss": 0.4547, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2468538284301758, + "rewards/margins": 1.524840235710144, + "rewards/rejected": -2.7716941833496094, + "step": 1680 + }, + { + "epoch": 0.5, + "learning_rate": 2.758278145695364e-07, + "logits/chosen": -2.220313549041748, + "logits/rejected": -2.270676612854004, + "logps/chosen": -116.72190856933594, + "logps/rejected": -132.87937927246094, + "loss": 0.4907, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.662656545639038, + "rewards/margins": 1.4583295583724976, + "rewards/rejected": -3.1209864616394043, + "step": 1690 + }, + { + "epoch": 0.51, + "learning_rate": 2.7417218543046353e-07, + "logits/chosen": -2.3201870918273926, + "logits/rejected": -2.287921667098999, + "logps/chosen": -119.7146987915039, + "logps/rejected": -135.27894592285156, + "loss": 0.4492, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.569067358970642, + "rewards/margins": 1.5324440002441406, + "rewards/rejected": -3.1015114784240723, + "step": 1700 + }, + { + "epoch": 0.51, + "learning_rate": 2.725165562913907e-07, + "logits/chosen": -2.2404065132141113, + "logits/rejected": -2.2358851432800293, + "logps/chosen": -134.36831665039062, + "logps/rejected": -137.21890258789062, + "loss": 0.5334, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.740677833557129, + "rewards/margins": 1.5694725513458252, + "rewards/rejected": -3.310150623321533, + "step": 1710 + }, + { + "epoch": 0.51, + "learning_rate": 2.7086092715231786e-07, + "logits/chosen": -2.3186452388763428, + "logits/rejected": -2.2739059925079346, + "logps/chosen": -118.93257141113281, + "logps/rejected": -117.28021240234375, + "loss": 0.5242, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.951216459274292, + "rewards/margins": 1.0923653841018677, + "rewards/rejected": -3.04358172416687, + "step": 1720 + }, + { + "epoch": 0.52, + "learning_rate": 2.6920529801324506e-07, + "logits/chosen": -2.230313777923584, + "logits/rejected": -2.269009828567505, + "logps/chosen": -152.952880859375, + "logps/rejected": -145.59732055664062, + "loss": 0.8317, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.0952677726745605, + "rewards/margins": -0.3405976891517639, + "rewards/rejected": -3.7546706199645996, + "step": 1730 + }, + { + "epoch": 0.52, + "learning_rate": 2.675496688741722e-07, + "logits/chosen": -2.276404857635498, + "logits/rejected": -2.223013162612915, + "logps/chosen": -126.49522399902344, + "logps/rejected": -123.188720703125, + "loss": 0.503, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3149405717849731, + "rewards/margins": 0.8885973691940308, + "rewards/rejected": -2.203538179397583, + "step": 1740 + }, + { + "epoch": 0.52, + "learning_rate": 2.6589403973509934e-07, + "logits/chosen": -2.1905181407928467, + "logits/rejected": -2.179508924484253, + "logps/chosen": -105.01606750488281, + "logps/rejected": -133.09783935546875, + "loss": 0.5413, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.6002556085586548, + "rewards/margins": 2.0444486141204834, + "rewards/rejected": -2.6447041034698486, + "step": 1750 + }, + { + "epoch": 0.52, + "learning_rate": 2.642384105960265e-07, + "logits/chosen": -2.2174530029296875, + "logits/rejected": -2.250398635864258, + "logps/chosen": -101.74955749511719, + "logps/rejected": -133.28713989257812, + "loss": 0.5109, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5314977765083313, + "rewards/margins": 1.0308630466461182, + "rewards/rejected": -1.5623606443405151, + "step": 1760 + }, + { + "epoch": 0.53, + "learning_rate": 2.625827814569536e-07, + "logits/chosen": -2.2077157497406006, + "logits/rejected": -2.1879453659057617, + "logps/chosen": -106.83088684082031, + "logps/rejected": -117.6878662109375, + "loss": 0.5311, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.922218918800354, + "rewards/margins": 1.4254719018936157, + "rewards/rejected": -3.3476905822753906, + "step": 1770 + }, + { + "epoch": 0.53, + "learning_rate": 2.6092715231788076e-07, + "logits/chosen": -2.236419677734375, + "logits/rejected": -2.1945042610168457, + "logps/chosen": -112.48036193847656, + "logps/rejected": -125.71522521972656, + "loss": 0.8311, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.3717401027679443, + "rewards/margins": 1.444071888923645, + "rewards/rejected": -2.8158118724823, + "step": 1780 + }, + { + "epoch": 0.53, + "learning_rate": 2.5927152317880795e-07, + "logits/chosen": -2.3207204341888428, + "logits/rejected": -2.2542405128479004, + "logps/chosen": -112.21119689941406, + "logps/rejected": -117.85597229003906, + "loss": 0.5143, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3228784799575806, + "rewards/margins": 0.836434543132782, + "rewards/rejected": -2.159313201904297, + "step": 1790 + }, + { + "epoch": 0.54, + "learning_rate": 2.576158940397351e-07, + "logits/chosen": -2.1415367126464844, + "logits/rejected": -2.173337936401367, + "logps/chosen": -111.6198501586914, + "logps/rejected": -131.82977294921875, + "loss": 0.5845, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.362797498703003, + "rewards/margins": 0.8809803128242493, + "rewards/rejected": -2.2437777519226074, + "step": 1800 + }, + { + "epoch": 0.54, + "learning_rate": 2.5596026490066223e-07, + "logits/chosen": -2.169029951095581, + "logits/rejected": -2.145346164703369, + "logps/chosen": -102.84260559082031, + "logps/rejected": -115.5724105834961, + "loss": 0.778, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.428421974182129, + "rewards/margins": 0.6596145629882812, + "rewards/rejected": -2.08803653717041, + "step": 1810 + }, + { + "epoch": 0.54, + "learning_rate": 2.5430463576158937e-07, + "logits/chosen": -2.3364787101745605, + "logits/rejected": -2.2011687755584717, + "logps/chosen": -109.7977066040039, + "logps/rejected": -112.79130554199219, + "loss": 0.4779, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1568695306777954, + "rewards/margins": 1.5236353874206543, + "rewards/rejected": -2.6805050373077393, + "step": 1820 + }, + { + "epoch": 0.55, + "learning_rate": 2.526490066225165e-07, + "logits/chosen": -2.270590305328369, + "logits/rejected": -2.317115306854248, + "logps/chosen": -131.15716552734375, + "logps/rejected": -127.9232406616211, + "loss": 0.5718, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.5472099781036377, + "rewards/margins": 0.4068627953529358, + "rewards/rejected": -1.9540729522705078, + "step": 1830 + }, + { + "epoch": 0.55, + "learning_rate": 2.509933774834437e-07, + "logits/chosen": -2.258516550064087, + "logits/rejected": -2.215508222579956, + "logps/chosen": -116.99686431884766, + "logps/rejected": -119.55728912353516, + "loss": 0.5443, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1407909393310547, + "rewards/margins": 1.8011624813079834, + "rewards/rejected": -2.941953659057617, + "step": 1840 + }, + { + "epoch": 0.55, + "learning_rate": 2.4933774834437084e-07, + "logits/chosen": -2.301156997680664, + "logits/rejected": -2.282895565032959, + "logps/chosen": -134.67526245117188, + "logps/rejected": -115.00638580322266, + "loss": 0.9478, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1763339042663574, + "rewards/margins": -0.813465416431427, + "rewards/rejected": -2.362868309020996, + "step": 1850 + }, + { + "epoch": 0.55, + "learning_rate": 2.47682119205298e-07, + "logits/chosen": -2.1597695350646973, + "logits/rejected": -2.2584662437438965, + "logps/chosen": -90.55691528320312, + "logps/rejected": -123.76590728759766, + "loss": 0.408, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2247810363769531, + "rewards/margins": 1.5576727390289307, + "rewards/rejected": -2.782454013824463, + "step": 1860 + }, + { + "epoch": 0.56, + "learning_rate": 2.460264900662252e-07, + "logits/chosen": -2.3151791095733643, + "logits/rejected": -2.3073203563690186, + "logps/chosen": -110.79292297363281, + "logps/rejected": -124.72319030761719, + "loss": 0.4559, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4885177612304688, + "rewards/margins": 0.9933377504348755, + "rewards/rejected": -2.481855630874634, + "step": 1870 + }, + { + "epoch": 0.56, + "learning_rate": 2.443708609271523e-07, + "logits/chosen": -2.3785767555236816, + "logits/rejected": -2.32625150680542, + "logps/chosen": -135.62266540527344, + "logps/rejected": -147.58059692382812, + "loss": 0.5509, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6126123666763306, + "rewards/margins": 1.325919508934021, + "rewards/rejected": -2.9385318756103516, + "step": 1880 + }, + { + "epoch": 0.56, + "learning_rate": 2.4271523178807946e-07, + "logits/chosen": -2.067755937576294, + "logits/rejected": -2.157957077026367, + "logps/chosen": -100.20467376708984, + "logps/rejected": -139.4010009765625, + "loss": 0.4928, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.7971267700195312, + "rewards/margins": 2.0710580348968506, + "rewards/rejected": -3.8681845664978027, + "step": 1890 + }, + { + "epoch": 0.57, + "learning_rate": 2.410596026490066e-07, + "logits/chosen": -2.26438307762146, + "logits/rejected": -2.1824748516082764, + "logps/chosen": -107.44325256347656, + "logps/rejected": -120.7808609008789, + "loss": 0.5088, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0230767726898193, + "rewards/margins": 1.2027556896209717, + "rewards/rejected": -2.22583270072937, + "step": 1900 + }, + { + "epoch": 0.57, + "learning_rate": 2.394039735099338e-07, + "logits/chosen": -2.202819585800171, + "logits/rejected": -2.1675939559936523, + "logps/chosen": -118.07059478759766, + "logps/rejected": -127.72599792480469, + "loss": 0.6344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6664397716522217, + "rewards/margins": 0.9765909314155579, + "rewards/rejected": -2.6430306434631348, + "step": 1910 + }, + { + "epoch": 0.57, + "learning_rate": 2.377483443708609e-07, + "logits/chosen": -2.2761118412017822, + "logits/rejected": -2.2829902172088623, + "logps/chosen": -125.0960464477539, + "logps/rejected": -151.57437133789062, + "loss": 0.4561, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.6365854740142822, + "rewards/margins": 1.4987179040908813, + "rewards/rejected": -3.135303497314453, + "step": 1920 + }, + { + "epoch": 0.58, + "learning_rate": 2.3609271523178807e-07, + "logits/chosen": -2.317108154296875, + "logits/rejected": -2.367867946624756, + "logps/chosen": -110.93936920166016, + "logps/rejected": -124.30006408691406, + "loss": 0.4998, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5488382577896118, + "rewards/margins": 1.2520344257354736, + "rewards/rejected": -2.800872564315796, + "step": 1930 + }, + { + "epoch": 0.58, + "learning_rate": 2.3443708609271524e-07, + "logits/chosen": -2.283686399459839, + "logits/rejected": -2.2000420093536377, + "logps/chosen": -99.95622253417969, + "logps/rejected": -107.75992584228516, + "loss": 0.5787, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1963920593261719, + "rewards/margins": 1.1246505975723267, + "rewards/rejected": -2.321042537689209, + "step": 1940 + }, + { + "epoch": 0.58, + "learning_rate": 2.3278145695364238e-07, + "logits/chosen": -2.3099443912506104, + "logits/rejected": -2.293797016143799, + "logps/chosen": -143.53994750976562, + "logps/rejected": -156.98973083496094, + "loss": 0.5597, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4224615097045898, + "rewards/margins": 2.5084261894226074, + "rewards/rejected": -3.9308879375457764, + "step": 1950 + }, + { + "epoch": 0.58, + "learning_rate": 2.3112582781456952e-07, + "logits/chosen": -2.2922520637512207, + "logits/rejected": -2.3070366382598877, + "logps/chosen": -105.562744140625, + "logps/rejected": -115.12934875488281, + "loss": 0.4832, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.130995512008667, + "rewards/margins": 1.4186890125274658, + "rewards/rejected": -2.549685001373291, + "step": 1960 + }, + { + "epoch": 0.59, + "learning_rate": 2.2947019867549669e-07, + "logits/chosen": -2.32692289352417, + "logits/rejected": -2.284797191619873, + "logps/chosen": -110.09574127197266, + "logps/rejected": -141.3235321044922, + "loss": 0.5419, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.62274169921875, + "rewards/margins": 2.5739684104919434, + "rewards/rejected": -4.196709632873535, + "step": 1970 + }, + { + "epoch": 0.59, + "learning_rate": 2.2781456953642383e-07, + "logits/chosen": -2.2572226524353027, + "logits/rejected": -2.3021931648254395, + "logps/chosen": -103.7448501586914, + "logps/rejected": -124.720947265625, + "loss": 0.5929, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1002720594406128, + "rewards/margins": 1.6038618087768555, + "rewards/rejected": -2.704134225845337, + "step": 1980 + }, + { + "epoch": 0.59, + "learning_rate": 2.2615894039735097e-07, + "logits/chosen": -2.1988630294799805, + "logits/rejected": -2.1840052604675293, + "logps/chosen": -97.76619720458984, + "logps/rejected": -114.77903747558594, + "loss": 0.536, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.6038196086883545, + "rewards/margins": 1.3313450813293457, + "rewards/rejected": -2.9351646900177, + "step": 1990 + }, + { + "epoch": 0.6, + "learning_rate": 2.2450331125827813e-07, + "logits/chosen": -2.2937569618225098, + "logits/rejected": -2.1716551780700684, + "logps/chosen": -122.71870422363281, + "logps/rejected": -126.25750732421875, + "loss": 0.5508, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7580636739730835, + "rewards/margins": 1.4850653409957886, + "rewards/rejected": -3.243128538131714, + "step": 2000 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.1208481788635254, + "eval_logits/rejected": -2.0760180950164795, + "eval_logps/chosen": -117.771728515625, + "eval_logps/rejected": -125.91642761230469, + "eval_loss": 0.5643959641456604, + "eval_rewards/accuracies": 0.6785714030265808, + "eval_rewards/chosen": -1.7905113697052002, + "eval_rewards/margins": 1.0963507890701294, + "eval_rewards/rejected": -2.886862277984619, + "eval_runtime": 519.2917, + "eval_samples_per_second": 3.439, + "eval_steps_per_second": 0.108, + "step": 2000 + }, + { + "epoch": 0.6, + "learning_rate": 2.228476821192053e-07, + "logits/chosen": -2.2932658195495605, + "logits/rejected": -2.2196624279022217, + "logps/chosen": -121.21055603027344, + "logps/rejected": -114.63111877441406, + "loss": 0.7063, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.775307059288025, + "rewards/margins": 0.6449312567710876, + "rewards/rejected": -2.420238494873047, + "step": 2010 + }, + { + "epoch": 0.6, + "learning_rate": 2.2119205298013244e-07, + "logits/chosen": -2.137760877609253, + "logits/rejected": -2.1844496726989746, + "logps/chosen": -138.58255004882812, + "logps/rejected": -127.23612213134766, + "loss": 0.5234, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.9617927074432373, + "rewards/margins": 0.6043619513511658, + "rewards/rejected": -2.5661544799804688, + "step": 2020 + }, + { + "epoch": 0.6, + "learning_rate": 2.1953642384105958e-07, + "logits/chosen": -2.2938625812530518, + "logits/rejected": -2.268752336502075, + "logps/chosen": -113.85087585449219, + "logps/rejected": -149.66539001464844, + "loss": 0.5987, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5664665699005127, + "rewards/margins": 0.8736650347709656, + "rewards/rejected": -2.440131664276123, + "step": 2030 + }, + { + "epoch": 0.61, + "learning_rate": 2.1788079470198675e-07, + "logits/chosen": -2.3219776153564453, + "logits/rejected": -2.350645065307617, + "logps/chosen": -94.68901824951172, + "logps/rejected": -106.27494812011719, + "loss": 0.6865, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4543777704238892, + "rewards/margins": 0.8551589846611023, + "rewards/rejected": -2.309536933898926, + "step": 2040 + }, + { + "epoch": 0.61, + "learning_rate": 2.1622516556291389e-07, + "logits/chosen": -2.2941012382507324, + "logits/rejected": -2.2624030113220215, + "logps/chosen": -125.8183822631836, + "logps/rejected": -133.0880584716797, + "loss": 0.5175, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4116047620773315, + "rewards/margins": 1.2253597974777222, + "rewards/rejected": -2.6369645595550537, + "step": 2050 + }, + { + "epoch": 0.61, + "learning_rate": 2.1456953642384105e-07, + "logits/chosen": -2.176222562789917, + "logits/rejected": -2.0717849731445312, + "logps/chosen": -104.74139404296875, + "logps/rejected": -124.4080581665039, + "loss": 0.4433, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.621763825416565, + "rewards/margins": 1.3861135244369507, + "rewards/rejected": -3.0078773498535156, + "step": 2060 + }, + { + "epoch": 0.62, + "learning_rate": 2.1291390728476822e-07, + "logits/chosen": -2.244816541671753, + "logits/rejected": -2.217611074447632, + "logps/chosen": -126.18927001953125, + "logps/rejected": -129.03915405273438, + "loss": 0.559, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9465411901474, + "rewards/margins": 0.7713083028793335, + "rewards/rejected": -2.7178492546081543, + "step": 2070 + }, + { + "epoch": 0.62, + "learning_rate": 2.1125827814569536e-07, + "logits/chosen": -2.331676721572876, + "logits/rejected": -2.249488353729248, + "logps/chosen": -115.93875885009766, + "logps/rejected": -122.89765930175781, + "loss": 0.5416, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3803373575210571, + "rewards/margins": 0.6847006678581238, + "rewards/rejected": -2.065037965774536, + "step": 2080 + }, + { + "epoch": 0.62, + "learning_rate": 2.096026490066225e-07, + "logits/chosen": -2.2945587635040283, + "logits/rejected": -2.2709438800811768, + "logps/chosen": -102.5355453491211, + "logps/rejected": -105.96309661865234, + "loss": 0.6918, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.1539552211761475, + "rewards/margins": 1.233269453048706, + "rewards/rejected": -2.3872246742248535, + "step": 2090 + }, + { + "epoch": 0.63, + "learning_rate": 2.0794701986754967e-07, + "logits/chosen": -2.260633945465088, + "logits/rejected": -2.257582187652588, + "logps/chosen": -123.7904052734375, + "logps/rejected": -139.2223663330078, + "loss": 0.489, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.6961309909820557, + "rewards/margins": 1.7095321416854858, + "rewards/rejected": -3.405663251876831, + "step": 2100 + }, + { + "epoch": 0.63, + "learning_rate": 2.062913907284768e-07, + "logits/chosen": -2.4550869464874268, + "logits/rejected": -2.369741439819336, + "logps/chosen": -110.0873031616211, + "logps/rejected": -110.52005767822266, + "loss": 0.5515, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.5832288265228271, + "rewards/margins": 0.5315386652946472, + "rewards/rejected": -2.114767551422119, + "step": 2110 + }, + { + "epoch": 0.63, + "learning_rate": 2.0463576158940397e-07, + "logits/chosen": -2.1035804748535156, + "logits/rejected": -2.0870535373687744, + "logps/chosen": -108.03116607666016, + "logps/rejected": -193.72422790527344, + "loss": 0.4535, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8296096324920654, + "rewards/margins": 8.629142761230469, + "rewards/rejected": -10.45875072479248, + "step": 2120 + }, + { + "epoch": 0.63, + "learning_rate": 2.029801324503311e-07, + "logits/chosen": -2.3085687160491943, + "logits/rejected": -2.3340985774993896, + "logps/chosen": -117.62290954589844, + "logps/rejected": -126.44264221191406, + "loss": 2.8703, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7038648128509521, + "rewards/margins": 1.1059377193450928, + "rewards/rejected": -2.809802770614624, + "step": 2130 + }, + { + "epoch": 0.64, + "learning_rate": 2.0132450331125828e-07, + "logits/chosen": -2.4316937923431396, + "logits/rejected": -2.3887412548065186, + "logps/chosen": -138.1640625, + "logps/rejected": -139.81886291503906, + "loss": 0.5099, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.448891520500183, + "rewards/margins": 0.9665302038192749, + "rewards/rejected": -2.415421962738037, + "step": 2140 + }, + { + "epoch": 0.64, + "learning_rate": 1.9966887417218542e-07, + "logits/chosen": -2.5827786922454834, + "logits/rejected": -2.53794002532959, + "logps/chosen": -126.8006362915039, + "logps/rejected": -126.6136474609375, + "loss": 0.515, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2876603603363037, + "rewards/margins": 0.7639477849006653, + "rewards/rejected": -2.051608085632324, + "step": 2150 + }, + { + "epoch": 0.64, + "learning_rate": 1.9801324503311256e-07, + "logits/chosen": -2.3453097343444824, + "logits/rejected": -2.4177701473236084, + "logps/chosen": -111.456787109375, + "logps/rejected": -121.5312728881836, + "loss": 0.5125, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4392328262329102, + "rewards/margins": 1.1764917373657227, + "rewards/rejected": -2.615724563598633, + "step": 2160 + }, + { + "epoch": 0.65, + "learning_rate": 1.9635761589403973e-07, + "logits/chosen": -2.2641375064849854, + "logits/rejected": -2.3049521446228027, + "logps/chosen": -95.91242980957031, + "logps/rejected": -111.0653305053711, + "loss": 0.5322, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.325178861618042, + "rewards/margins": 1.0794451236724854, + "rewards/rejected": -2.4046239852905273, + "step": 2170 + }, + { + "epoch": 0.65, + "learning_rate": 1.947019867549669e-07, + "logits/chosen": -2.3387389183044434, + "logits/rejected": -2.2360782623291016, + "logps/chosen": -100.87395477294922, + "logps/rejected": -111.1401138305664, + "loss": 0.49, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.7546203136444092, + "rewards/margins": 1.1646721363067627, + "rewards/rejected": -2.919292449951172, + "step": 2180 + }, + { + "epoch": 0.65, + "learning_rate": 1.9304635761589403e-07, + "logits/chosen": -2.3461403846740723, + "logits/rejected": -2.3300156593322754, + "logps/chosen": -103.99101257324219, + "logps/rejected": -122.7502212524414, + "loss": 0.5215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1400959491729736, + "rewards/margins": 1.375199556350708, + "rewards/rejected": -2.5152957439422607, + "step": 2190 + }, + { + "epoch": 0.66, + "learning_rate": 1.913907284768212e-07, + "logits/chosen": -2.353959560394287, + "logits/rejected": -2.2813894748687744, + "logps/chosen": -110.76973724365234, + "logps/rejected": -133.5187225341797, + "loss": 0.5406, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2461332082748413, + "rewards/margins": 2.200319766998291, + "rewards/rejected": -3.446453094482422, + "step": 2200 + }, + { + "epoch": 0.66, + "learning_rate": 1.8973509933774834e-07, + "logits/chosen": -2.483916759490967, + "logits/rejected": -2.3655548095703125, + "logps/chosen": -108.8669662475586, + "logps/rejected": -116.64085388183594, + "loss": 0.5907, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9422400593757629, + "rewards/margins": 0.9399551153182983, + "rewards/rejected": -1.8821951150894165, + "step": 2210 + }, + { + "epoch": 0.66, + "learning_rate": 1.8807947019867548e-07, + "logits/chosen": -2.3532841205596924, + "logits/rejected": -2.3442747592926025, + "logps/chosen": -99.94935607910156, + "logps/rejected": -115.6842269897461, + "loss": 0.6063, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1935023069381714, + "rewards/margins": 1.022131323814392, + "rewards/rejected": -2.2156338691711426, + "step": 2220 + }, + { + "epoch": 0.66, + "learning_rate": 1.8642384105960262e-07, + "logits/chosen": -2.473654270172119, + "logits/rejected": -2.456444263458252, + "logps/chosen": -128.30955505371094, + "logps/rejected": -135.67520141601562, + "loss": 0.5602, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0379096269607544, + "rewards/margins": 0.6246587038040161, + "rewards/rejected": -1.6625683307647705, + "step": 2230 + }, + { + "epoch": 0.67, + "learning_rate": 1.8476821192052979e-07, + "logits/chosen": -2.4869556427001953, + "logits/rejected": -2.449312686920166, + "logps/chosen": -116.06538391113281, + "logps/rejected": -120.8796615600586, + "loss": 0.5219, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9571272730827332, + "rewards/margins": 1.021393060684204, + "rewards/rejected": -1.978520154953003, + "step": 2240 + }, + { + "epoch": 0.67, + "learning_rate": 1.8311258278145695e-07, + "logits/chosen": -2.4670963287353516, + "logits/rejected": -2.382390260696411, + "logps/chosen": -123.17083740234375, + "logps/rejected": -109.2813949584961, + "loss": 0.5479, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2968004941940308, + "rewards/margins": 0.7737834453582764, + "rewards/rejected": -2.0705838203430176, + "step": 2250 + }, + { + "epoch": 0.67, + "learning_rate": 1.814569536423841e-07, + "logits/chosen": -2.345423460006714, + "logits/rejected": -2.3300931453704834, + "logps/chosen": -92.75725555419922, + "logps/rejected": -107.68856048583984, + "loss": 0.4778, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8894661068916321, + "rewards/margins": 1.360033392906189, + "rewards/rejected": -2.249499559402466, + "step": 2260 + }, + { + "epoch": 0.68, + "learning_rate": 1.7980132450331126e-07, + "logits/chosen": -2.2602181434631348, + "logits/rejected": -2.1599280834198, + "logps/chosen": -105.10438537597656, + "logps/rejected": -131.7586669921875, + "loss": 0.5319, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.113909125328064, + "rewards/margins": 1.5821037292480469, + "rewards/rejected": -2.6960129737854004, + "step": 2270 + }, + { + "epoch": 0.68, + "learning_rate": 1.781456953642384e-07, + "logits/chosen": -2.3455591201782227, + "logits/rejected": -2.364595413208008, + "logps/chosen": -130.79019165039062, + "logps/rejected": -147.9118194580078, + "loss": 0.4775, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7606639862060547, + "rewards/margins": 1.2344610691070557, + "rewards/rejected": -2.9951250553131104, + "step": 2280 + }, + { + "epoch": 0.68, + "learning_rate": 1.7649006622516554e-07, + "logits/chosen": -2.167285442352295, + "logits/rejected": -2.247238874435425, + "logps/chosen": -139.6284942626953, + "logps/rejected": -158.96109008789062, + "loss": 0.7002, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.767961263656616, + "rewards/margins": 1.04625403881073, + "rewards/rejected": -3.8142154216766357, + "step": 2290 + }, + { + "epoch": 0.69, + "learning_rate": 1.748344370860927e-07, + "logits/chosen": -2.3983192443847656, + "logits/rejected": -2.4019296169281006, + "logps/chosen": -115.08357238769531, + "logps/rejected": -129.3126220703125, + "loss": 0.4762, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.6475099325180054, + "rewards/margins": 1.2640306949615479, + "rewards/rejected": -2.9115407466888428, + "step": 2300 + }, + { + "epoch": 0.69, + "learning_rate": 1.7317880794701987e-07, + "logits/chosen": -2.3797879219055176, + "logits/rejected": -2.324965715408325, + "logps/chosen": -116.2055435180664, + "logps/rejected": -131.01705932617188, + "loss": 0.5819, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.186680555343628, + "rewards/margins": 1.4000349044799805, + "rewards/rejected": -2.5867154598236084, + "step": 2310 + }, + { + "epoch": 0.69, + "learning_rate": 1.71523178807947e-07, + "logits/chosen": -2.227961778640747, + "logits/rejected": -2.197960615158081, + "logps/chosen": -106.0052490234375, + "logps/rejected": -126.49810791015625, + "loss": 0.5598, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.7543351650238037, + "rewards/margins": 1.1439127922058105, + "rewards/rejected": -2.8982481956481934, + "step": 2320 + }, + { + "epoch": 0.69, + "learning_rate": 1.6986754966887418e-07, + "logits/chosen": -2.34653902053833, + "logits/rejected": -2.361428737640381, + "logps/chosen": -93.7066879272461, + "logps/rejected": -115.24361419677734, + "loss": 0.5737, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.030475378036499, + "rewards/margins": 1.2473831176757812, + "rewards/rejected": -2.2778584957122803, + "step": 2330 + }, + { + "epoch": 0.7, + "learning_rate": 1.6821192052980132e-07, + "logits/chosen": -2.3806099891662598, + "logits/rejected": -2.404531955718994, + "logps/chosen": -108.77107238769531, + "logps/rejected": -124.2442398071289, + "loss": 0.5444, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4443012475967407, + "rewards/margins": 0.9878839254379272, + "rewards/rejected": -2.432185411453247, + "step": 2340 + }, + { + "epoch": 0.7, + "learning_rate": 1.6655629139072846e-07, + "logits/chosen": -2.447935104370117, + "logits/rejected": -2.4319026470184326, + "logps/chosen": -116.2603988647461, + "logps/rejected": -123.78592681884766, + "loss": 0.4883, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3556338548660278, + "rewards/margins": 1.9813722372055054, + "rewards/rejected": -3.337006092071533, + "step": 2350 + }, + { + "epoch": 0.7, + "learning_rate": 1.649006622516556e-07, + "logits/chosen": -2.3291103839874268, + "logits/rejected": -2.3166141510009766, + "logps/chosen": -111.0199203491211, + "logps/rejected": -117.00514221191406, + "loss": 0.4997, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5363482236862183, + "rewards/margins": 0.870397686958313, + "rewards/rejected": -2.4067459106445312, + "step": 2360 + }, + { + "epoch": 0.71, + "learning_rate": 1.632450331125828e-07, + "logits/chosen": -2.2856314182281494, + "logits/rejected": -2.264632225036621, + "logps/chosen": -102.96492004394531, + "logps/rejected": -126.04007720947266, + "loss": 0.456, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1887879371643066, + "rewards/margins": 1.522206425666809, + "rewards/rejected": -2.7109944820404053, + "step": 2370 + }, + { + "epoch": 0.71, + "learning_rate": 1.6158940397350993e-07, + "logits/chosen": -2.2702925205230713, + "logits/rejected": -2.237971782684326, + "logps/chosen": -99.81072998046875, + "logps/rejected": -124.4561996459961, + "loss": 0.5386, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.322842001914978, + "rewards/margins": 1.6832103729248047, + "rewards/rejected": -3.0060524940490723, + "step": 2380 + }, + { + "epoch": 0.71, + "learning_rate": 1.5993377483443707e-07, + "logits/chosen": -2.265434741973877, + "logits/rejected": -2.2928626537323, + "logps/chosen": -83.43587493896484, + "logps/rejected": -114.23832702636719, + "loss": 0.5913, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8695703744888306, + "rewards/margins": 2.3257930278778076, + "rewards/rejected": -3.1953632831573486, + "step": 2390 + }, + { + "epoch": 0.72, + "learning_rate": 1.5827814569536424e-07, + "logits/chosen": -2.575456142425537, + "logits/rejected": -2.395871162414551, + "logps/chosen": -105.5760498046875, + "logps/rejected": -97.21208190917969, + "loss": 0.5091, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1773121356964111, + "rewards/margins": 0.5559796690940857, + "rewards/rejected": -1.7332916259765625, + "step": 2400 + }, + { + "epoch": 0.72, + "learning_rate": 1.5662251655629138e-07, + "logits/chosen": -2.4753453731536865, + "logits/rejected": -2.414577007293701, + "logps/chosen": -143.94302368164062, + "logps/rejected": -137.99838256835938, + "loss": 0.5287, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1706478595733643, + "rewards/margins": 1.126072883605957, + "rewards/rejected": -2.2967207431793213, + "step": 2410 + }, + { + "epoch": 0.72, + "learning_rate": 1.5496688741721852e-07, + "logits/chosen": -2.412086009979248, + "logits/rejected": -2.3731260299682617, + "logps/chosen": -106.2443618774414, + "logps/rejected": -112.93099212646484, + "loss": 0.4915, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2554187774658203, + "rewards/margins": 1.3224613666534424, + "rewards/rejected": -2.5778801441192627, + "step": 2420 + }, + { + "epoch": 0.72, + "learning_rate": 1.533112582781457e-07, + "logits/chosen": -2.2778310775756836, + "logits/rejected": -2.256308078765869, + "logps/chosen": -120.01104736328125, + "logps/rejected": -123.55589294433594, + "loss": 0.465, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.085076093673706, + "rewards/margins": 1.26901376247406, + "rewards/rejected": -2.3540899753570557, + "step": 2430 + }, + { + "epoch": 0.73, + "learning_rate": 1.5165562913907285e-07, + "logits/chosen": -2.393749475479126, + "logits/rejected": -2.3263931274414062, + "logps/chosen": -117.22212219238281, + "logps/rejected": -138.7658233642578, + "loss": 0.5859, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8770434856414795, + "rewards/margins": 1.216582179069519, + "rewards/rejected": -3.0936264991760254, + "step": 2440 + }, + { + "epoch": 0.73, + "learning_rate": 1.5e-07, + "logits/chosen": -2.4931600093841553, + "logits/rejected": -2.4225075244903564, + "logps/chosen": -125.9542236328125, + "logps/rejected": -141.84219360351562, + "loss": 0.5621, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3538328409194946, + "rewards/margins": 1.1864392757415771, + "rewards/rejected": -2.5402722358703613, + "step": 2450 + }, + { + "epoch": 0.73, + "learning_rate": 1.4834437086092716e-07, + "logits/chosen": -2.3211989402770996, + "logits/rejected": -2.3927392959594727, + "logps/chosen": -94.21218872070312, + "logps/rejected": -111.05567932128906, + "loss": 0.5479, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.555248737335205, + "rewards/margins": 0.7776703834533691, + "rewards/rejected": -2.332918882369995, + "step": 2460 + }, + { + "epoch": 0.74, + "learning_rate": 1.466887417218543e-07, + "logits/chosen": -2.4537739753723145, + "logits/rejected": -2.3887171745300293, + "logps/chosen": -104.2787857055664, + "logps/rejected": -113.91764831542969, + "loss": 0.6143, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.352063775062561, + "rewards/margins": 0.9023284912109375, + "rewards/rejected": -2.254392147064209, + "step": 2470 + }, + { + "epoch": 0.74, + "learning_rate": 1.4503311258278144e-07, + "logits/chosen": -2.4015471935272217, + "logits/rejected": -2.42402720451355, + "logps/chosen": -114.39097595214844, + "logps/rejected": -132.64340209960938, + "loss": 0.5667, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2820857763290405, + "rewards/margins": 1.0415351390838623, + "rewards/rejected": -2.323620557785034, + "step": 2480 + }, + { + "epoch": 0.74, + "learning_rate": 1.4337748344370858e-07, + "logits/chosen": -2.355255126953125, + "logits/rejected": -2.277355909347534, + "logps/chosen": -102.35648345947266, + "logps/rejected": -109.78977966308594, + "loss": 0.5125, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.6282755136489868, + "rewards/margins": 1.255506992340088, + "rewards/rejected": -2.8837826251983643, + "step": 2490 + }, + { + "epoch": 0.74, + "learning_rate": 1.4172185430463577e-07, + "logits/chosen": -2.5044684410095215, + "logits/rejected": -2.3574650287628174, + "logps/chosen": -130.39955139160156, + "logps/rejected": -128.08071899414062, + "loss": 0.5218, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2382522821426392, + "rewards/margins": 1.3744902610778809, + "rewards/rejected": -2.6127424240112305, + "step": 2500 + }, + { + "epoch": 0.74, + "eval_logits/chosen": -2.261568784713745, + "eval_logits/rejected": -2.2171857357025146, + "eval_logps/chosen": -113.09461975097656, + "eval_logps/rejected": -122.51795959472656, + "eval_loss": 0.5183302164077759, + "eval_rewards/accuracies": 0.703125, + "eval_rewards/chosen": -1.3228007555007935, + "eval_rewards/margins": 1.2242145538330078, + "eval_rewards/rejected": -2.54701566696167, + "eval_runtime": 523.6034, + "eval_samples_per_second": 3.411, + "eval_steps_per_second": 0.107, + "step": 2500 + }, + { + "epoch": 0.75, + "learning_rate": 1.4006622516556291e-07, + "logits/chosen": -2.4229695796966553, + "logits/rejected": -2.3659071922302246, + "logps/chosen": -97.41563415527344, + "logps/rejected": -107.28167724609375, + "loss": 0.5285, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0089452266693115, + "rewards/margins": 1.1100685596466064, + "rewards/rejected": -2.119013786315918, + "step": 2510 + }, + { + "epoch": 0.75, + "learning_rate": 1.3841059602649005e-07, + "logits/chosen": -2.368020534515381, + "logits/rejected": -2.266580820083618, + "logps/chosen": -107.78886413574219, + "logps/rejected": -124.20140075683594, + "loss": 0.4815, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1261156797409058, + "rewards/margins": 1.54592764377594, + "rewards/rejected": -2.6720430850982666, + "step": 2520 + }, + { + "epoch": 0.75, + "learning_rate": 1.3675496688741722e-07, + "logits/chosen": -2.3915557861328125, + "logits/rejected": -2.3538260459899902, + "logps/chosen": -96.66950988769531, + "logps/rejected": -107.39601135253906, + "loss": 0.5181, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2402719259262085, + "rewards/margins": 1.5004689693450928, + "rewards/rejected": -2.7407407760620117, + "step": 2530 + }, + { + "epoch": 0.76, + "learning_rate": 1.3509933774834436e-07, + "logits/chosen": -2.3380367755889893, + "logits/rejected": -2.2895896434783936, + "logps/chosen": -122.12117767333984, + "logps/rejected": -122.6964111328125, + "loss": 0.5068, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7105709314346313, + "rewards/margins": 1.117949366569519, + "rewards/rejected": -2.8285202980041504, + "step": 2540 + }, + { + "epoch": 0.76, + "learning_rate": 1.334437086092715e-07, + "logits/chosen": -2.650242567062378, + "logits/rejected": -2.575338840484619, + "logps/chosen": -116.67132568359375, + "logps/rejected": -121.65495300292969, + "loss": 0.4781, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0724276304244995, + "rewards/margins": 0.7939808964729309, + "rewards/rejected": -1.8664085865020752, + "step": 2550 + }, + { + "epoch": 0.76, + "learning_rate": 1.317880794701987e-07, + "logits/chosen": -2.4393889904022217, + "logits/rejected": -2.356849431991577, + "logps/chosen": -108.2217788696289, + "logps/rejected": -105.802734375, + "loss": 0.4569, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9382781982421875, + "rewards/margins": 0.9701493978500366, + "rewards/rejected": -1.9084275960922241, + "step": 2560 + }, + { + "epoch": 0.77, + "learning_rate": 1.3013245033112583e-07, + "logits/chosen": -2.27262544631958, + "logits/rejected": -2.2650160789489746, + "logps/chosen": -82.49347686767578, + "logps/rejected": -105.01361083984375, + "loss": 0.4757, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5115716457366943, + "rewards/margins": 1.9998562335968018, + "rewards/rejected": -2.511428117752075, + "step": 2570 + }, + { + "epoch": 0.77, + "learning_rate": 1.2847682119205297e-07, + "logits/chosen": -2.3641304969787598, + "logits/rejected": -2.400428533554077, + "logps/chosen": -95.62802124023438, + "logps/rejected": -105.62736511230469, + "loss": 0.5091, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2022227048873901, + "rewards/margins": 1.1635633707046509, + "rewards/rejected": -2.365786075592041, + "step": 2580 + }, + { + "epoch": 0.77, + "learning_rate": 1.2682119205298011e-07, + "logits/chosen": -2.2362232208251953, + "logits/rejected": -2.294517993927002, + "logps/chosen": -111.7828140258789, + "logps/rejected": -107.35648345947266, + "loss": 0.5689, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3542122840881348, + "rewards/margins": 0.9707919359207153, + "rewards/rejected": -2.3250043392181396, + "step": 2590 + }, + { + "epoch": 0.77, + "learning_rate": 1.2516556291390728e-07, + "logits/chosen": -2.4351532459259033, + "logits/rejected": -2.3938307762145996, + "logps/chosen": -116.37557220458984, + "logps/rejected": -142.02877807617188, + "loss": 0.4966, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.059444546699524, + "rewards/margins": 1.6750872135162354, + "rewards/rejected": -2.734531879425049, + "step": 2600 + }, + { + "epoch": 0.78, + "learning_rate": 1.2350993377483442e-07, + "logits/chosen": -2.130566358566284, + "logits/rejected": -2.1427571773529053, + "logps/chosen": -98.26994323730469, + "logps/rejected": -125.13362121582031, + "loss": 0.5217, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.1679749488830566, + "rewards/margins": 2.0151760578155518, + "rewards/rejected": -4.183150768280029, + "step": 2610 + }, + { + "epoch": 0.78, + "learning_rate": 1.218543046357616e-07, + "logits/chosen": -2.3847343921661377, + "logits/rejected": -2.3289005756378174, + "logps/chosen": -103.18563079833984, + "logps/rejected": -106.60140228271484, + "loss": 0.526, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5415022373199463, + "rewards/margins": 1.246010184288025, + "rewards/rejected": -2.7875125408172607, + "step": 2620 + }, + { + "epoch": 0.78, + "learning_rate": 1.2019867549668873e-07, + "logits/chosen": -2.344989776611328, + "logits/rejected": -2.2486376762390137, + "logps/chosen": -111.1012954711914, + "logps/rejected": -114.76014709472656, + "loss": 0.4662, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4294898509979248, + "rewards/margins": 1.2511804103851318, + "rewards/rejected": -2.6806702613830566, + "step": 2630 + }, + { + "epoch": 0.79, + "learning_rate": 1.185430463576159e-07, + "logits/chosen": -2.342101573944092, + "logits/rejected": -2.3254072666168213, + "logps/chosen": -114.9495620727539, + "logps/rejected": -122.08809661865234, + "loss": 0.4812, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4265601634979248, + "rewards/margins": 1.6178033351898193, + "rewards/rejected": -3.0443637371063232, + "step": 2640 + }, + { + "epoch": 0.79, + "learning_rate": 1.1688741721854305e-07, + "logits/chosen": -2.329153537750244, + "logits/rejected": -2.2368149757385254, + "logps/chosen": -123.8796157836914, + "logps/rejected": -119.62074279785156, + "loss": 0.4744, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5358905792236328, + "rewards/margins": 1.2361419200897217, + "rewards/rejected": -2.7720324993133545, + "step": 2650 + }, + { + "epoch": 0.79, + "learning_rate": 1.1523178807947019e-07, + "logits/chosen": -2.4591078758239746, + "logits/rejected": -2.454157829284668, + "logps/chosen": -116.4410629272461, + "logps/rejected": -129.07809448242188, + "loss": 0.5417, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5216033458709717, + "rewards/margins": 1.3263527154922485, + "rewards/rejected": -2.8479561805725098, + "step": 2660 + }, + { + "epoch": 0.8, + "learning_rate": 1.1357615894039735e-07, + "logits/chosen": -2.287152051925659, + "logits/rejected": -2.2752058506011963, + "logps/chosen": -128.70211791992188, + "logps/rejected": -141.4760284423828, + "loss": 0.5571, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.9110784530639648, + "rewards/margins": 1.6628735065460205, + "rewards/rejected": -3.5739517211914062, + "step": 2670 + }, + { + "epoch": 0.8, + "learning_rate": 1.119205298013245e-07, + "logits/chosen": -2.3498637676239014, + "logits/rejected": -2.3876309394836426, + "logps/chosen": -121.30989074707031, + "logps/rejected": -129.5779571533203, + "loss": 0.4927, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.256763219833374, + "rewards/margins": 1.8756353855133057, + "rewards/rejected": -3.1323981285095215, + "step": 2680 + }, + { + "epoch": 0.8, + "learning_rate": 1.1026490066225165e-07, + "logits/chosen": -2.2559609413146973, + "logits/rejected": -2.2900869846343994, + "logps/chosen": -89.24148559570312, + "logps/rejected": -104.9818115234375, + "loss": 0.5639, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5417016744613647, + "rewards/margins": 1.3289363384246826, + "rewards/rejected": -2.870638132095337, + "step": 2690 + }, + { + "epoch": 0.8, + "learning_rate": 1.0860927152317881e-07, + "logits/chosen": -2.3323373794555664, + "logits/rejected": -2.4132628440856934, + "logps/chosen": -112.27374267578125, + "logps/rejected": -132.1646728515625, + "loss": 0.5692, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.039150595664978, + "rewards/margins": 1.5584369897842407, + "rewards/rejected": -2.597587823867798, + "step": 2700 + }, + { + "epoch": 0.81, + "learning_rate": 1.0695364238410595e-07, + "logits/chosen": -2.2826695442199707, + "logits/rejected": -2.232888698577881, + "logps/chosen": -107.91890716552734, + "logps/rejected": -114.87126159667969, + "loss": 0.5245, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3879740238189697, + "rewards/margins": 1.4284284114837646, + "rewards/rejected": -2.8164026737213135, + "step": 2710 + }, + { + "epoch": 0.81, + "learning_rate": 1.0529801324503311e-07, + "logits/chosen": -2.433330535888672, + "logits/rejected": -2.3720269203186035, + "logps/chosen": -122.9089584350586, + "logps/rejected": -130.165771484375, + "loss": 0.5503, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7988684177398682, + "rewards/margins": 1.0453321933746338, + "rewards/rejected": -2.844200611114502, + "step": 2720 + }, + { + "epoch": 0.81, + "learning_rate": 1.0364238410596025e-07, + "logits/chosen": -2.432610511779785, + "logits/rejected": -2.3609492778778076, + "logps/chosen": -126.18135070800781, + "logps/rejected": -137.95114135742188, + "loss": 0.5377, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1943086385726929, + "rewards/margins": 1.0217430591583252, + "rewards/rejected": -2.2160518169403076, + "step": 2730 + }, + { + "epoch": 0.82, + "learning_rate": 1.0198675496688741e-07, + "logits/chosen": -2.40020489692688, + "logits/rejected": -2.333512783050537, + "logps/chosen": -120.67720794677734, + "logps/rejected": -123.46641540527344, + "loss": 0.4558, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2497332096099854, + "rewards/margins": 1.2155460119247437, + "rewards/rejected": -2.4652791023254395, + "step": 2740 + }, + { + "epoch": 0.82, + "learning_rate": 1.0033112582781457e-07, + "logits/chosen": -2.4432952404022217, + "logits/rejected": -2.3959970474243164, + "logps/chosen": -131.6014862060547, + "logps/rejected": -145.7483673095703, + "loss": 0.4373, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9097809791564941, + "rewards/margins": 1.4488131999969482, + "rewards/rejected": -2.3585941791534424, + "step": 2750 + }, + { + "epoch": 0.82, + "learning_rate": 9.867549668874171e-08, + "logits/chosen": -2.2430427074432373, + "logits/rejected": -2.2248117923736572, + "logps/chosen": -99.05213928222656, + "logps/rejected": -118.5693130493164, + "loss": 0.5283, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.898768424987793, + "rewards/margins": 1.2684627771377563, + "rewards/rejected": -2.1672310829162598, + "step": 2760 + }, + { + "epoch": 0.83, + "learning_rate": 9.701986754966887e-08, + "logits/chosen": -2.4450364112854004, + "logits/rejected": -2.3705830574035645, + "logps/chosen": -107.76090240478516, + "logps/rejected": -112.4260482788086, + "loss": 0.4824, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4488307237625122, + "rewards/margins": 0.7578933835029602, + "rewards/rejected": -2.206723928451538, + "step": 2770 + }, + { + "epoch": 0.83, + "learning_rate": 9.536423841059603e-08, + "logits/chosen": -2.4003443717956543, + "logits/rejected": -2.348435878753662, + "logps/chosen": -98.28638458251953, + "logps/rejected": -100.79689025878906, + "loss": 0.5439, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.41732919216156, + "rewards/margins": 1.0273702144622803, + "rewards/rejected": -2.444699764251709, + "step": 2780 + }, + { + "epoch": 0.83, + "learning_rate": 9.370860927152317e-08, + "logits/chosen": -2.3565890789031982, + "logits/rejected": -2.3134591579437256, + "logps/chosen": -122.64701080322266, + "logps/rejected": -140.7588348388672, + "loss": 0.54, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.8713737726211548, + "rewards/margins": 0.9109483957290649, + "rewards/rejected": -2.7823221683502197, + "step": 2790 + }, + { + "epoch": 0.83, + "learning_rate": 9.205298013245033e-08, + "logits/chosen": -2.4065792560577393, + "logits/rejected": -2.343113422393799, + "logps/chosen": -113.3506088256836, + "logps/rejected": -118.6836166381836, + "loss": 0.6089, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.640981912612915, + "rewards/margins": 1.4465951919555664, + "rewards/rejected": -3.0875768661499023, + "step": 2800 + }, + { + "epoch": 0.84, + "learning_rate": 9.039735099337747e-08, + "logits/chosen": -2.280989170074463, + "logits/rejected": -2.2906501293182373, + "logps/chosen": -108.36322021484375, + "logps/rejected": -118.99311828613281, + "loss": 0.4821, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3466839790344238, + "rewards/margins": 1.234621524810791, + "rewards/rejected": -2.581305503845215, + "step": 2810 + }, + { + "epoch": 0.84, + "learning_rate": 8.874172185430463e-08, + "logits/chosen": -2.3098435401916504, + "logits/rejected": -2.365722179412842, + "logps/chosen": -142.2515411376953, + "logps/rejected": -136.40847778320312, + "loss": 0.6105, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.9295507669448853, + "rewards/margins": 0.7886122465133667, + "rewards/rejected": -2.718163013458252, + "step": 2820 + }, + { + "epoch": 0.84, + "learning_rate": 8.70860927152318e-08, + "logits/chosen": -2.4758474826812744, + "logits/rejected": -2.4529106616973877, + "logps/chosen": -102.67512512207031, + "logps/rejected": -108.22530364990234, + "loss": 0.4814, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4004428386688232, + "rewards/margins": 0.7858734130859375, + "rewards/rejected": -2.1863162517547607, + "step": 2830 + }, + { + "epoch": 0.85, + "learning_rate": 8.543046357615893e-08, + "logits/chosen": -2.4003779888153076, + "logits/rejected": -2.3763396739959717, + "logps/chosen": -104.71977233886719, + "logps/rejected": -117.16717529296875, + "loss": 0.4928, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -1.1805320978164673, + "rewards/margins": 1.6279674768447876, + "rewards/rejected": -2.808499336242676, + "step": 2840 + }, + { + "epoch": 0.85, + "learning_rate": 8.377483443708609e-08, + "logits/chosen": -2.4015908241271973, + "logits/rejected": -2.3405182361602783, + "logps/chosen": -117.36273193359375, + "logps/rejected": -124.22319030761719, + "loss": 0.5652, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3869013786315918, + "rewards/margins": 0.9574357867240906, + "rewards/rejected": -2.344337224960327, + "step": 2850 + }, + { + "epoch": 0.85, + "learning_rate": 8.211920529801324e-08, + "logits/chosen": -2.4349982738494873, + "logits/rejected": -2.4097352027893066, + "logps/chosen": -125.55684661865234, + "logps/rejected": -132.021484375, + "loss": 0.5082, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4107874631881714, + "rewards/margins": 0.8303612470626831, + "rewards/rejected": -2.2411487102508545, + "step": 2860 + }, + { + "epoch": 0.86, + "learning_rate": 8.04635761589404e-08, + "logits/chosen": -2.265141248703003, + "logits/rejected": -2.169220447540283, + "logps/chosen": -102.09349060058594, + "logps/rejected": -119.7810287475586, + "loss": 0.553, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4435564279556274, + "rewards/margins": 1.294883131980896, + "rewards/rejected": -2.7384393215179443, + "step": 2870 + }, + { + "epoch": 0.86, + "learning_rate": 7.880794701986755e-08, + "logits/chosen": -2.4385974407196045, + "logits/rejected": -2.3579273223876953, + "logps/chosen": -93.9774169921875, + "logps/rejected": -96.58930969238281, + "loss": 0.5111, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5919442772865295, + "rewards/margins": 0.9432849884033203, + "rewards/rejected": -1.5352293252944946, + "step": 2880 + }, + { + "epoch": 0.86, + "learning_rate": 7.71523178807947e-08, + "logits/chosen": -2.4252941608428955, + "logits/rejected": -2.308663845062256, + "logps/chosen": -139.50485229492188, + "logps/rejected": -134.99417114257812, + "loss": 0.4841, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1931045055389404, + "rewards/margins": 1.3365012407302856, + "rewards/rejected": -2.5296058654785156, + "step": 2890 + }, + { + "epoch": 0.86, + "learning_rate": 7.549668874172185e-08, + "logits/chosen": -2.3252806663513184, + "logits/rejected": -2.2149767875671387, + "logps/chosen": -119.28135681152344, + "logps/rejected": -126.89034271240234, + "loss": 0.4699, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3043967485427856, + "rewards/margins": 1.297178030014038, + "rewards/rejected": -2.601574420928955, + "step": 2900 + }, + { + "epoch": 0.87, + "learning_rate": 7.3841059602649e-08, + "logits/chosen": -2.4337799549102783, + "logits/rejected": -2.408616065979004, + "logps/chosen": -105.0708236694336, + "logps/rejected": -112.90872955322266, + "loss": 0.5492, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.739833652973175, + "rewards/margins": 1.0932036638259888, + "rewards/rejected": -1.8330373764038086, + "step": 2910 + }, + { + "epoch": 0.87, + "learning_rate": 7.218543046357616e-08, + "logits/chosen": -2.474499225616455, + "logits/rejected": -2.3793933391571045, + "logps/chosen": -115.8188247680664, + "logps/rejected": -119.8792953491211, + "loss": 0.5534, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7591004371643066, + "rewards/margins": 1.390928864479065, + "rewards/rejected": -2.150029182434082, + "step": 2920 + }, + { + "epoch": 0.87, + "learning_rate": 7.052980132450331e-08, + "logits/chosen": -2.342878580093384, + "logits/rejected": -2.2635059356689453, + "logps/chosen": -112.3121566772461, + "logps/rejected": -118.00971984863281, + "loss": 0.4827, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1263339519500732, + "rewards/margins": 0.7218903303146362, + "rewards/rejected": -1.848224401473999, + "step": 2930 + }, + { + "epoch": 0.88, + "learning_rate": 6.887417218543045e-08, + "logits/chosen": -2.4378771781921387, + "logits/rejected": -2.493478775024414, + "logps/chosen": -101.32011413574219, + "logps/rejected": -126.55435943603516, + "loss": 0.4912, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.0950850248336792, + "rewards/margins": 1.0815422534942627, + "rewards/rejected": -2.1766273975372314, + "step": 2940 + }, + { + "epoch": 0.88, + "learning_rate": 6.721854304635762e-08, + "logits/chosen": -2.395272731781006, + "logits/rejected": -2.352908134460449, + "logps/chosen": -115.22686767578125, + "logps/rejected": -114.85673522949219, + "loss": 0.5139, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.910413384437561, + "rewards/margins": 0.9333620071411133, + "rewards/rejected": -1.8437751531600952, + "step": 2950 + }, + { + "epoch": 0.88, + "learning_rate": 6.556291390728476e-08, + "logits/chosen": -2.4603307247161865, + "logits/rejected": -2.4367270469665527, + "logps/chosen": -111.51399993896484, + "logps/rejected": -120.80682373046875, + "loss": 0.5692, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1758167743682861, + "rewards/margins": 0.8748563528060913, + "rewards/rejected": -2.050673007965088, + "step": 2960 + }, + { + "epoch": 0.88, + "learning_rate": 6.390728476821191e-08, + "logits/chosen": -2.3244917392730713, + "logits/rejected": -2.253732919692993, + "logps/chosen": -108.8800277709961, + "logps/rejected": -125.33662414550781, + "loss": 0.4513, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7894026637077332, + "rewards/margins": 1.8209375143051147, + "rewards/rejected": -2.610340118408203, + "step": 2970 + }, + { + "epoch": 0.89, + "learning_rate": 6.225165562913907e-08, + "logits/chosen": -2.387305974960327, + "logits/rejected": -2.387345552444458, + "logps/chosen": -107.43021392822266, + "logps/rejected": -118.97044372558594, + "loss": 0.6606, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9636829495429993, + "rewards/margins": 0.8673983812332153, + "rewards/rejected": -1.8310810327529907, + "step": 2980 + }, + { + "epoch": 0.89, + "learning_rate": 6.059602649006622e-08, + "logits/chosen": -2.3770089149475098, + "logits/rejected": -2.371371269226074, + "logps/chosen": -123.25062561035156, + "logps/rejected": -140.9857635498047, + "loss": 0.5031, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.802879810333252, + "rewards/margins": 1.1967840194702148, + "rewards/rejected": -1.9996639490127563, + "step": 2990 + }, + { + "epoch": 0.89, + "learning_rate": 5.8940397350993375e-08, + "logits/chosen": -2.3844501972198486, + "logits/rejected": -2.415923595428467, + "logps/chosen": -96.17528533935547, + "logps/rejected": -111.2402114868164, + "loss": 0.4914, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7955904006958008, + "rewards/margins": 1.3297996520996094, + "rewards/rejected": -2.125389814376831, + "step": 3000 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.2567203044891357, + "eval_logits/rejected": -2.214937925338745, + "eval_logps/chosen": -110.69182586669922, + "eval_logps/rejected": -120.59849548339844, + "eval_loss": 0.5078982710838318, + "eval_rewards/accuracies": 0.7120535969734192, + "eval_rewards/chosen": -1.0825201272964478, + "eval_rewards/margins": 1.2725489139556885, + "eval_rewards/rejected": -2.3550689220428467, + "eval_runtime": 502.7018, + "eval_samples_per_second": 3.553, + "eval_steps_per_second": 0.111, + "step": 3000 + }, + { + "epoch": 0.9, + "learning_rate": 5.728476821192053e-08, + "logits/chosen": -2.398317575454712, + "logits/rejected": -2.4122400283813477, + "logps/chosen": -93.20875549316406, + "logps/rejected": -113.88653564453125, + "loss": 0.549, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8295547366142273, + "rewards/margins": 1.022578239440918, + "rewards/rejected": -1.852132797241211, + "step": 3010 + }, + { + "epoch": 0.9, + "learning_rate": 5.5629139072847675e-08, + "logits/chosen": -2.414301633834839, + "logits/rejected": -2.3872337341308594, + "logps/chosen": -129.2257080078125, + "logps/rejected": -136.29031372070312, + "loss": 0.4718, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3618860244750977, + "rewards/margins": 1.9172807931900024, + "rewards/rejected": -3.2791664600372314, + "step": 3020 + }, + { + "epoch": 0.9, + "learning_rate": 5.397350993377483e-08, + "logits/chosen": -2.446453809738159, + "logits/rejected": -2.384152889251709, + "logps/chosen": -120.69456481933594, + "logps/rejected": -128.5080108642578, + "loss": 0.4889, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.820598304271698, + "rewards/margins": 1.557586908340454, + "rewards/rejected": -2.3781850337982178, + "step": 3030 + }, + { + "epoch": 0.91, + "learning_rate": 5.231788079470199e-08, + "logits/chosen": -2.416982889175415, + "logits/rejected": -2.296403646469116, + "logps/chosen": -110.80255126953125, + "logps/rejected": -113.04368591308594, + "loss": 0.4946, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1277140378952026, + "rewards/margins": 1.060675859451294, + "rewards/rejected": -2.188389778137207, + "step": 3040 + }, + { + "epoch": 0.91, + "learning_rate": 5.0662251655629135e-08, + "logits/chosen": -2.355494976043701, + "logits/rejected": -2.2958462238311768, + "logps/chosen": -113.16410064697266, + "logps/rejected": -119.9725112915039, + "loss": 0.4515, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6746724247932434, + "rewards/margins": 1.8009824752807617, + "rewards/rejected": -2.4756548404693604, + "step": 3050 + }, + { + "epoch": 0.91, + "learning_rate": 4.900662251655629e-08, + "logits/chosen": -2.4485743045806885, + "logits/rejected": -2.426466703414917, + "logps/chosen": -110.64210510253906, + "logps/rejected": -122.92867279052734, + "loss": 0.4162, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9479681849479675, + "rewards/margins": 1.6344906091690063, + "rewards/rejected": -2.582458972930908, + "step": 3060 + }, + { + "epoch": 0.91, + "learning_rate": 4.735099337748344e-08, + "logits/chosen": -2.279062509536743, + "logits/rejected": -2.2378296852111816, + "logps/chosen": -117.4856185913086, + "logps/rejected": -126.33473205566406, + "loss": 0.5187, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9676671028137207, + "rewards/margins": 1.4139858484268188, + "rewards/rejected": -2.381652593612671, + "step": 3070 + }, + { + "epoch": 0.92, + "learning_rate": 4.5695364238410595e-08, + "logits/chosen": -2.27183198928833, + "logits/rejected": -2.2195851802825928, + "logps/chosen": -99.91886138916016, + "logps/rejected": -139.50657653808594, + "loss": 0.5204, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.060943603515625, + "rewards/margins": 2.9951958656311035, + "rewards/rejected": -4.056139945983887, + "step": 3080 + }, + { + "epoch": 0.92, + "learning_rate": 4.403973509933775e-08, + "logits/chosen": -2.413677215576172, + "logits/rejected": -2.440647602081299, + "logps/chosen": -118.7281723022461, + "logps/rejected": -134.04771423339844, + "loss": 0.5028, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.119231939315796, + "rewards/margins": 1.4490314722061157, + "rewards/rejected": -2.568263530731201, + "step": 3090 + }, + { + "epoch": 0.92, + "learning_rate": 4.23841059602649e-08, + "logits/chosen": -2.3565783500671387, + "logits/rejected": -2.4461493492126465, + "logps/chosen": -108.08616638183594, + "logps/rejected": -132.34011840820312, + "loss": 0.485, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1898248195648193, + "rewards/margins": 1.3966195583343506, + "rewards/rejected": -2.58644437789917, + "step": 3100 + }, + { + "epoch": 0.93, + "learning_rate": 4.072847682119205e-08, + "logits/chosen": -2.396179437637329, + "logits/rejected": -2.4256176948547363, + "logps/chosen": -96.67437744140625, + "logps/rejected": -101.86246490478516, + "loss": 0.4582, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4870302081108093, + "rewards/margins": 1.127990484237671, + "rewards/rejected": -1.615020751953125, + "step": 3110 + }, + { + "epoch": 0.93, + "learning_rate": 3.90728476821192e-08, + "logits/chosen": -2.3725028038024902, + "logits/rejected": -2.322782039642334, + "logps/chosen": -128.52896118164062, + "logps/rejected": -129.73118591308594, + "loss": 0.5572, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3587214946746826, + "rewards/margins": 1.5958476066589355, + "rewards/rejected": -2.954568862915039, + "step": 3120 + }, + { + "epoch": 0.93, + "learning_rate": 3.7417218543046355e-08, + "logits/chosen": -2.378821611404419, + "logits/rejected": -2.277832269668579, + "logps/chosen": -87.0296630859375, + "logps/rejected": -106.12138366699219, + "loss": 0.5238, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8327251672744751, + "rewards/margins": 1.2228658199310303, + "rewards/rejected": -2.055591106414795, + "step": 3130 + }, + { + "epoch": 0.94, + "learning_rate": 3.576158940397351e-08, + "logits/chosen": -2.549872398376465, + "logits/rejected": -2.4757115840911865, + "logps/chosen": -114.14369201660156, + "logps/rejected": -116.66259765625, + "loss": 0.5169, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9791749119758606, + "rewards/margins": 0.8427003026008606, + "rewards/rejected": -1.821874976158142, + "step": 3140 + }, + { + "epoch": 0.94, + "learning_rate": 3.410596026490066e-08, + "logits/chosen": -2.433527708053589, + "logits/rejected": -2.371525764465332, + "logps/chosen": -103.0054931640625, + "logps/rejected": -103.31925964355469, + "loss": 0.5538, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8271923065185547, + "rewards/margins": 1.119652509689331, + "rewards/rejected": -1.9468450546264648, + "step": 3150 + }, + { + "epoch": 0.94, + "learning_rate": 3.245033112582781e-08, + "logits/chosen": -2.337153434753418, + "logits/rejected": -2.2308475971221924, + "logps/chosen": -129.55728149414062, + "logps/rejected": -122.7024917602539, + "loss": 0.4763, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9213937520980835, + "rewards/margins": 1.0446635484695435, + "rewards/rejected": -1.9660571813583374, + "step": 3160 + }, + { + "epoch": 0.94, + "learning_rate": 3.079470198675496e-08, + "logits/chosen": -2.2858211994171143, + "logits/rejected": -2.313380002975464, + "logps/chosen": -107.20402526855469, + "logps/rejected": -136.98562622070312, + "loss": 0.5288, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9993183016777039, + "rewards/margins": 1.509690284729004, + "rewards/rejected": -2.5090086460113525, + "step": 3170 + }, + { + "epoch": 0.95, + "learning_rate": 2.913907284768212e-08, + "logits/chosen": -2.3693079948425293, + "logits/rejected": -2.284874677658081, + "logps/chosen": -106.6112289428711, + "logps/rejected": -126.05074310302734, + "loss": 0.4491, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7449665665626526, + "rewards/margins": 1.826768159866333, + "rewards/rejected": -2.571734666824341, + "step": 3180 + }, + { + "epoch": 0.95, + "learning_rate": 2.748344370860927e-08, + "logits/chosen": -2.2911553382873535, + "logits/rejected": -2.380384922027588, + "logps/chosen": -102.5718765258789, + "logps/rejected": -124.40003967285156, + "loss": 0.4937, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7534275650978088, + "rewards/margins": 1.1661580801010132, + "rewards/rejected": -1.9195858240127563, + "step": 3190 + }, + { + "epoch": 0.95, + "learning_rate": 2.5827814569536422e-08, + "logits/chosen": -2.4230473041534424, + "logits/rejected": -2.4315543174743652, + "logps/chosen": -117.46553802490234, + "logps/rejected": -130.05776977539062, + "loss": 0.4991, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.8374800682067871, + "rewards/margins": 1.2378642559051514, + "rewards/rejected": -2.0753445625305176, + "step": 3200 + }, + { + "epoch": 0.96, + "learning_rate": 2.4172185430463576e-08, + "logits/chosen": -2.417757034301758, + "logits/rejected": -2.2985901832580566, + "logps/chosen": -132.27774047851562, + "logps/rejected": -133.81459045410156, + "loss": 0.5058, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.378666877746582, + "rewards/margins": 0.97132807970047, + "rewards/rejected": -2.3499951362609863, + "step": 3210 + }, + { + "epoch": 0.96, + "learning_rate": 2.2516556291390726e-08, + "logits/chosen": -2.327725887298584, + "logits/rejected": -2.290168046951294, + "logps/chosen": -118.74835205078125, + "logps/rejected": -132.76882934570312, + "loss": 0.6159, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3064758777618408, + "rewards/margins": 1.0742686986923218, + "rewards/rejected": -2.380744457244873, + "step": 3220 + }, + { + "epoch": 0.96, + "learning_rate": 2.0860927152317882e-08, + "logits/chosen": -2.3731508255004883, + "logits/rejected": -2.367323398590088, + "logps/chosen": -126.88232421875, + "logps/rejected": -135.72384643554688, + "loss": 0.5072, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.890730082988739, + "rewards/margins": 1.7571513652801514, + "rewards/rejected": -2.6478817462921143, + "step": 3230 + }, + { + "epoch": 0.97, + "learning_rate": 1.9205298013245032e-08, + "logits/chosen": -2.4219555854797363, + "logits/rejected": -2.4555513858795166, + "logps/chosen": -96.6889419555664, + "logps/rejected": -114.50843811035156, + "loss": 0.514, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1261937618255615, + "rewards/margins": 1.0691124200820923, + "rewards/rejected": -2.1953060626983643, + "step": 3240 + }, + { + "epoch": 0.97, + "learning_rate": 1.7549668874172186e-08, + "logits/chosen": -2.3101606369018555, + "logits/rejected": -2.3013217449188232, + "logps/chosen": -95.89967346191406, + "logps/rejected": -99.94120025634766, + "loss": 0.4685, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2933688163757324, + "rewards/margins": 0.8693240880966187, + "rewards/rejected": -2.1626930236816406, + "step": 3250 + }, + { + "epoch": 0.97, + "learning_rate": 1.5894039735099336e-08, + "logits/chosen": -2.22920823097229, + "logits/rejected": -2.2497153282165527, + "logps/chosen": -83.50569152832031, + "logps/rejected": -98.3634033203125, + "loss": 0.514, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0309844017028809, + "rewards/margins": 1.4945679903030396, + "rewards/rejected": -2.525552272796631, + "step": 3260 + }, + { + "epoch": 0.97, + "learning_rate": 1.4238410596026489e-08, + "logits/chosen": -2.220327854156494, + "logits/rejected": -2.2442502975463867, + "logps/chosen": -105.8703842163086, + "logps/rejected": -126.78196716308594, + "loss": 0.4796, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.513962984085083, + "rewards/margins": 1.4240639209747314, + "rewards/rejected": -2.9380269050598145, + "step": 3270 + }, + { + "epoch": 0.98, + "learning_rate": 1.2582781456953642e-08, + "logits/chosen": -2.417300224304199, + "logits/rejected": -2.3726484775543213, + "logps/chosen": -126.7840576171875, + "logps/rejected": -133.47689819335938, + "loss": 0.4275, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0579255819320679, + "rewards/margins": 1.5306205749511719, + "rewards/rejected": -2.58854603767395, + "step": 3280 + }, + { + "epoch": 0.98, + "learning_rate": 1.0927152317880794e-08, + "logits/chosen": -2.4346401691436768, + "logits/rejected": -2.4542853832244873, + "logps/chosen": -119.21122741699219, + "logps/rejected": -128.86886596679688, + "loss": 0.4999, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2019822597503662, + "rewards/margins": 1.384701132774353, + "rewards/rejected": -2.5866830348968506, + "step": 3290 + }, + { + "epoch": 0.98, + "learning_rate": 9.271523178807947e-09, + "logits/chosen": -2.4030935764312744, + "logits/rejected": -2.3885276317596436, + "logps/chosen": -111.55142974853516, + "logps/rejected": -113.03800964355469, + "loss": 0.6577, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.222048282623291, + "rewards/margins": 0.9595780372619629, + "rewards/rejected": -2.181626558303833, + "step": 3300 + }, + { + "epoch": 0.99, + "learning_rate": 7.6158940397351e-09, + "logits/chosen": -2.2258238792419434, + "logits/rejected": -2.1862361431121826, + "logps/chosen": -92.22099304199219, + "logps/rejected": -98.86279296875, + "loss": 0.5937, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.713568925857544, + "rewards/margins": 0.8357810974121094, + "rewards/rejected": -2.5493500232696533, + "step": 3310 + }, + { + "epoch": 0.99, + "learning_rate": 5.960264900662252e-09, + "logits/chosen": -2.317258358001709, + "logits/rejected": -2.3031933307647705, + "logps/chosen": -109.45621490478516, + "logps/rejected": -111.22418212890625, + "loss": 0.8281, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2969977855682373, + "rewards/margins": 1.3577762842178345, + "rewards/rejected": -2.6547741889953613, + "step": 3320 + }, + { + "epoch": 0.99, + "learning_rate": 4.3046357615894034e-09, + "logits/chosen": -2.2622385025024414, + "logits/rejected": -2.2199172973632812, + "logps/chosen": -98.4054946899414, + "logps/rejected": -112.76808166503906, + "loss": 0.4438, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9782658815383911, + "rewards/margins": 1.7501733303070068, + "rewards/rejected": -2.7284390926361084, + "step": 3330 + }, + { + "epoch": 1.0, + "learning_rate": 2.6490066225165564e-09, + "logits/chosen": -2.3729500770568848, + "logits/rejected": -2.432080030441284, + "logps/chosen": -101.60713195800781, + "logps/rejected": -131.0595703125, + "loss": 0.5899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8371769189834595, + "rewards/margins": 0.8700039982795715, + "rewards/rejected": -1.7071807384490967, + "step": 3340 + }, + { + "epoch": 1.0, + "learning_rate": 9.933774834437085e-10, + "logits/chosen": -2.2028284072875977, + "logits/rejected": -2.2098453044891357, + "logps/chosen": -109.49913024902344, + "logps/rejected": -121.43013763427734, + "loss": 0.4436, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7948501110076904, + "rewards/margins": 2.1088409423828125, + "rewards/rejected": -2.903691291809082, + "step": 3350 + }, + { + "epoch": 1.0, + "step": 3356, + "total_flos": 0.0, + "train_loss": 0.58384587518933, + "train_runtime": 30698.0699, + "train_samples_per_second": 1.749, + "train_steps_per_second": 0.109 + } + ], + "logging_steps": 10, + "max_steps": 3356, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}