{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1563, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.1847133757961784e-09, "logits/generated": -2.8295888900756836, "logits/real": -2.849569320678711, "logps/generated": -609.6478881835938, "logps/real": -115.27798461914062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/generated": 0.0, "rewards/margins": 0.0, "rewards/real": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.184713375796178e-08, "logits/generated": -2.6501715183258057, "logits/real": -2.821021318435669, "logps/generated": -688.5430297851562, "logps/real": -131.44122314453125, "loss": 0.5833, "rewards/accuracies": 0.6388888955116272, "rewards/generated": -0.34704259037971497, "rewards/margins": 0.3495745360851288, "rewards/real": 0.0025319471023976803, "step": 10 }, { "epoch": 0.01, "learning_rate": 6.369426751592356e-08, "logits/generated": -2.714224338531494, "logits/real": -2.780827522277832, "logps/generated": -677.5689086914062, "logps/real": -129.50588989257812, "loss": 0.3907, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -1.4221687316894531, "rewards/margins": 1.4617396593093872, "rewards/real": 0.039571087807416916, "step": 20 }, { "epoch": 0.02, "learning_rate": 9.554140127388536e-08, "logits/generated": -2.7694613933563232, "logits/real": -2.860546588897705, "logps/generated": -698.780517578125, "logps/real": -139.16226196289062, "loss": 0.2306, "rewards/accuracies": 1.0, "rewards/generated": -2.204209089279175, "rewards/margins": 2.362700939178467, "rewards/real": 0.15849189460277557, "step": 30 }, { "epoch": 0.03, "learning_rate": 1.2738853503184713e-07, "logits/generated": -2.678589344024658, "logits/real": -2.7937369346618652, "logps/generated": -756.0526123046875, "logps/real": -132.27268981933594, "loss": 0.1129, "rewards/accuracies": 1.0, "rewards/generated": -4.11299467086792, "rewards/margins": 4.405646800994873, "rewards/real": 0.29265230894088745, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.592356687898089e-07, "logits/generated": -2.719285726547241, "logits/real": -2.8095831871032715, "logps/generated": -749.7476806640625, "logps/real": -123.66046142578125, "loss": 0.0573, "rewards/accuracies": 1.0, "rewards/generated": -4.803214073181152, "rewards/margins": 5.2087225914001465, "rewards/real": 0.4055088460445404, "step": 50 }, { "epoch": 0.04, "learning_rate": 1.9108280254777072e-07, "logits/generated": -2.685657024383545, "logits/real": -2.817525625228882, "logps/generated": -743.1283569335938, "logps/real": -132.15084838867188, "loss": 0.0325, "rewards/accuracies": 1.0, "rewards/generated": -4.882467746734619, "rewards/margins": 5.401439189910889, "rewards/real": 0.5189720392227173, "step": 60 }, { "epoch": 0.04, "learning_rate": 2.2292993630573247e-07, "logits/generated": -2.6736221313476562, "logits/real": -2.782536029815674, "logps/generated": -779.1280517578125, "logps/real": -136.8399200439453, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/generated": -5.401805400848389, "rewards/margins": 5.999195098876953, "rewards/real": 0.597389817237854, "step": 70 }, { "epoch": 0.05, "learning_rate": 2.5477707006369425e-07, "logits/generated": -2.716283082962036, "logits/real": -2.7836098670959473, "logps/generated": -763.261962890625, "logps/real": -121.11332702636719, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/generated": -5.937032222747803, "rewards/margins": 6.565484046936035, "rewards/real": 0.6284510493278503, "step": 80 }, { "epoch": 0.06, "learning_rate": 2.86624203821656e-07, "logits/generated": -2.752387285232544, "logits/real": -2.8120365142822266, "logps/generated": -760.432373046875, "logps/real": -132.00917053222656, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/generated": -6.373486518859863, "rewards/margins": 7.068659782409668, "rewards/real": 0.6951735615730286, "step": 90 }, { "epoch": 0.06, "learning_rate": 3.184713375796178e-07, "logits/generated": -2.702080488204956, "logits/real": -2.792630910873413, "logps/generated": -744.688232421875, "logps/real": -120.1148910522461, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/generated": -5.977693557739258, "rewards/margins": 6.694817543029785, "rewards/real": 0.7171245217323303, "step": 100 }, { "epoch": 0.07, "learning_rate": 3.5031847133757957e-07, "logits/generated": -2.6754021644592285, "logits/real": -2.786886215209961, "logps/generated": -726.2047119140625, "logps/real": -138.05221557617188, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/generated": -6.919286251068115, "rewards/margins": 7.560235500335693, "rewards/real": 0.6409494280815125, "step": 110 }, { "epoch": 0.08, "learning_rate": 3.8216560509554143e-07, "logits/generated": -2.7799072265625, "logits/real": -2.834855556488037, "logps/generated": -731.4354248046875, "logps/real": -123.99066162109375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/generated": -6.255575656890869, "rewards/margins": 7.035998344421387, "rewards/real": 0.7804235219955444, "step": 120 }, { "epoch": 0.08, "learning_rate": 4.140127388535032e-07, "logits/generated": -2.695237398147583, "logits/real": -2.808246612548828, "logps/generated": -781.197265625, "logps/real": -134.5619354248047, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/generated": -7.42046594619751, "rewards/margins": 8.036266326904297, "rewards/real": 0.6158010363578796, "step": 130 }, { "epoch": 0.09, "learning_rate": 4.4585987261146494e-07, "logits/generated": -2.690974712371826, "logits/real": -2.7843871116638184, "logps/generated": -769.6416625976562, "logps/real": -118.09663391113281, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/generated": -8.030922889709473, "rewards/margins": 8.821279525756836, "rewards/real": 0.7903567552566528, "step": 140 }, { "epoch": 0.1, "learning_rate": 4.777070063694267e-07, "logits/generated": -2.6999502182006836, "logits/real": -2.845780372619629, "logps/generated": -756.4619750976562, "logps/real": -128.97555541992188, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/generated": -7.640128135681152, "rewards/margins": 8.349076271057129, "rewards/real": 0.7089481353759766, "step": 150 }, { "epoch": 0.1, "learning_rate": 4.989331436699858e-07, "logits/generated": -2.770327568054199, "logits/real": -2.7911763191223145, "logps/generated": -772.9364624023438, "logps/real": -125.63675689697266, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/generated": -8.297707557678223, "rewards/margins": 9.060877799987793, "rewards/real": 0.7631710767745972, "step": 160 }, { "epoch": 0.11, "learning_rate": 4.953769559032717e-07, "logits/generated": -2.700230121612549, "logits/real": -2.770098924636841, "logps/generated": -789.2719116210938, "logps/real": -118.48409271240234, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/generated": -8.936752319335938, "rewards/margins": 9.651647567749023, "rewards/real": 0.7148973345756531, "step": 170 }, { "epoch": 0.12, "learning_rate": 4.918207681365576e-07, "logits/generated": -2.715512752532959, "logits/real": -2.8020758628845215, "logps/generated": -785.4468383789062, "logps/real": -141.07501220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/generated": -9.002473831176758, "rewards/margins": 9.73188591003418, "rewards/real": 0.7294121980667114, "step": 180 }, { "epoch": 0.12, "learning_rate": 4.882645803698435e-07, "logits/generated": -2.7685036659240723, "logits/real": -2.789585590362549, "logps/generated": -773.3211669921875, "logps/real": -132.91026306152344, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/generated": -9.269811630249023, "rewards/margins": 10.108678817749023, "rewards/real": 0.8388668298721313, "step": 190 }, { "epoch": 0.13, "learning_rate": 4.847083926031294e-07, "logits/generated": -2.7428407669067383, "logits/real": -2.7933878898620605, "logps/generated": -822.3059692382812, "logps/real": -119.96354675292969, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/generated": -9.959307670593262, "rewards/margins": 10.764963150024414, "rewards/real": 0.8056550025939941, "step": 200 }, { "epoch": 0.13, "learning_rate": 4.811522048364154e-07, "logits/generated": -2.6588082313537598, "logits/real": -2.753288984298706, "logps/generated": -809.4946899414062, "logps/real": -112.04827880859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/generated": -9.868528366088867, "rewards/margins": 10.617731094360352, "rewards/real": 0.7492026090621948, "step": 210 }, { "epoch": 0.14, "learning_rate": 4.775960170697012e-07, "logits/generated": -2.6897635459899902, "logits/real": -2.770383596420288, "logps/generated": -840.3517456054688, "logps/real": -111.0347900390625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/generated": -10.592704772949219, "rewards/margins": 11.492315292358398, "rewards/real": 0.8996096849441528, "step": 220 }, { "epoch": 0.15, "learning_rate": 4.7403982930298717e-07, "logits/generated": -2.7826247215270996, "logits/real": -2.786170482635498, "logps/generated": -804.2281494140625, "logps/real": -119.8484115600586, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/generated": -9.724154472351074, "rewards/margins": 10.552526473999023, "rewards/real": 0.8283706903457642, "step": 230 }, { "epoch": 0.15, "learning_rate": 4.7048364153627306e-07, "logits/generated": -2.7883107662200928, "logits/real": -2.7733795642852783, "logps/generated": -787.4295654296875, "logps/real": -148.79747009277344, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/generated": -9.756797790527344, "rewards/margins": 10.487835884094238, "rewards/real": 0.7310384511947632, "step": 240 }, { "epoch": 0.16, "learning_rate": 4.66927453769559e-07, "logits/generated": -2.7394678592681885, "logits/real": -2.790409803390503, "logps/generated": -819.8644409179688, "logps/real": -116.63028717041016, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/generated": -10.328279495239258, "rewards/margins": 11.112763404846191, "rewards/real": 0.7844842672348022, "step": 250 }, { "epoch": 0.17, "learning_rate": 4.633712660028449e-07, "logits/generated": -2.727818012237549, "logits/real": -2.7909488677978516, "logps/generated": -797.3067016601562, "logps/real": -117.8537826538086, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/generated": -10.560891151428223, "rewards/margins": 11.291653633117676, "rewards/real": 0.7307616472244263, "step": 260 }, { "epoch": 0.17, "learning_rate": 4.5981507823613085e-07, "logits/generated": -2.767531633377075, "logits/real": -2.7978005409240723, "logps/generated": -810.1624755859375, "logps/real": -121.3199691772461, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/generated": -10.349993705749512, "rewards/margins": 11.145318984985352, "rewards/real": 0.7953254580497742, "step": 270 }, { "epoch": 0.18, "learning_rate": 4.562588904694168e-07, "logits/generated": -2.778958559036255, "logits/real": -2.767667293548584, "logps/generated": -821.5338134765625, "logps/real": -130.97152709960938, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/generated": -11.092814445495605, "rewards/margins": 11.828493118286133, "rewards/real": 0.7356794476509094, "step": 280 }, { "epoch": 0.19, "learning_rate": 4.5270270270270264e-07, "logits/generated": -2.732811450958252, "logits/real": -2.771510601043701, "logps/generated": -757.4833374023438, "logps/real": -126.4487075805664, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/generated": -10.417075157165527, "rewards/margins": 11.195660591125488, "rewards/real": 0.7785850167274475, "step": 290 }, { "epoch": 0.19, "learning_rate": 4.491465149359886e-07, "logits/generated": -2.713351249694824, "logits/real": -2.756260395050049, "logps/generated": -806.474609375, "logps/real": -118.60646057128906, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/generated": -11.823250770568848, "rewards/margins": 12.591789245605469, "rewards/real": 0.7685383558273315, "step": 300 }, { "epoch": 0.2, "learning_rate": 4.4559032716927454e-07, "logits/generated": -2.8016388416290283, "logits/real": -2.796672821044922, "logps/generated": -763.6119995117188, "logps/real": -142.29685974121094, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/generated": -10.680870056152344, "rewards/margins": 11.322160720825195, "rewards/real": 0.6412909030914307, "step": 310 }, { "epoch": 0.2, "learning_rate": 4.420341394025605e-07, "logits/generated": -2.7270781993865967, "logits/real": -2.7888123989105225, "logps/generated": -799.5844116210938, "logps/real": -130.8887481689453, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/generated": -11.834752082824707, "rewards/margins": 12.512211799621582, "rewards/real": 0.6774585247039795, "step": 320 }, { "epoch": 0.21, "learning_rate": 4.384779516358463e-07, "logits/generated": -2.79298734664917, "logits/real": -2.784541606903076, "logps/generated": -788.7904052734375, "logps/real": -134.79293823242188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/generated": -11.506429672241211, "rewards/margins": 12.11829948425293, "rewards/real": 0.6118704080581665, "step": 330 }, { "epoch": 0.22, "learning_rate": 4.3492176386913227e-07, "logits/generated": -2.8363544940948486, "logits/real": -2.8054802417755127, "logps/generated": -768.1728515625, "logps/real": -139.9955291748047, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -11.797532081604004, "rewards/margins": 12.470538139343262, "rewards/real": 0.6730067133903503, "step": 340 }, { "epoch": 0.22, "learning_rate": 4.313655761024182e-07, "logits/generated": -2.7400827407836914, "logits/real": -2.7338576316833496, "logps/generated": -811.0081787109375, "logps/real": -131.45535278320312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/generated": -12.710081100463867, "rewards/margins": 13.333477973937988, "rewards/real": 0.6233970522880554, "step": 350 }, { "epoch": 0.23, "learning_rate": 4.278093883357041e-07, "logits/generated": -2.776153087615967, "logits/real": -2.750797748565674, "logps/generated": -819.32666015625, "logps/real": -126.93217468261719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -12.181897163391113, "rewards/margins": 12.906854629516602, "rewards/real": 0.7249582409858704, "step": 360 }, { "epoch": 0.24, "learning_rate": 4.2425320056899e-07, "logits/generated": -2.7904210090637207, "logits/real": -2.7961840629577637, "logps/generated": -789.5787963867188, "logps/real": -147.4117431640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/generated": -11.834062576293945, "rewards/margins": 12.519264221191406, "rewards/real": 0.6852015256881714, "step": 370 }, { "epoch": 0.24, "learning_rate": 4.2069701280227595e-07, "logits/generated": -2.750471830368042, "logits/real": -2.772777795791626, "logps/generated": -819.3651123046875, "logps/real": -135.7245330810547, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -12.894061088562012, "rewards/margins": 13.609522819519043, "rewards/real": 0.7154618501663208, "step": 380 }, { "epoch": 0.25, "learning_rate": 4.1714082503556185e-07, "logits/generated": -2.7443814277648926, "logits/real": -2.745856285095215, "logps/generated": -827.7913208007812, "logps/real": -126.5484390258789, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -13.79316234588623, "rewards/margins": 14.394973754882812, "rewards/real": 0.601812481880188, "step": 390 }, { "epoch": 0.26, "learning_rate": 4.135846372688478e-07, "logits/generated": -2.812390089035034, "logits/real": -2.73819637298584, "logps/generated": -808.9031982421875, "logps/real": -131.18746948242188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -12.755112648010254, "rewards/margins": 13.416218757629395, "rewards/real": 0.6611047983169556, "step": 400 }, { "epoch": 0.26, "learning_rate": 4.100284495021337e-07, "logits/generated": -2.820923089981079, "logits/real": -2.7527689933776855, "logps/generated": -823.16796875, "logps/real": -127.68003845214844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -12.786894798278809, "rewards/margins": 13.430368423461914, "rewards/real": 0.6434718370437622, "step": 410 }, { "epoch": 0.27, "learning_rate": 4.064722617354196e-07, "logits/generated": -2.678879737854004, "logits/real": -2.7136893272399902, "logps/generated": -872.8970947265625, "logps/real": -131.42593383789062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -13.947868347167969, "rewards/margins": 14.58533000946045, "rewards/real": 0.637461245059967, "step": 420 }, { "epoch": 0.28, "learning_rate": 4.0291607396870553e-07, "logits/generated": -2.7224462032318115, "logits/real": -2.7085747718811035, "logps/generated": -834.8814697265625, "logps/real": -122.5090560913086, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -13.601274490356445, "rewards/margins": 14.206278800964355, "rewards/real": 0.6050056219100952, "step": 430 }, { "epoch": 0.28, "learning_rate": 3.993598862019915e-07, "logits/generated": -2.783932685852051, "logits/real": -2.7148799896240234, "logps/generated": -828.7579956054688, "logps/real": -125.92060852050781, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -13.426411628723145, "rewards/margins": 14.101513862609863, "rewards/real": 0.6751025915145874, "step": 440 }, { "epoch": 0.29, "learning_rate": 3.9580369843527737e-07, "logits/generated": -2.774967670440674, "logits/real": -2.701488971710205, "logps/generated": -873.3244018554688, "logps/real": -123.95247650146484, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/generated": -14.389918327331543, "rewards/margins": 14.915544509887695, "rewards/real": 0.525626540184021, "step": 450 }, { "epoch": 0.29, "learning_rate": 3.9224751066856327e-07, "logits/generated": -2.735586643218994, "logits/real": -2.7377548217773438, "logps/generated": -805.3878173828125, "logps/real": -155.18560791015625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/generated": -14.252492904663086, "rewards/margins": 14.655688285827637, "rewards/real": 0.4031934142112732, "step": 460 }, { "epoch": 0.3, "learning_rate": 3.886913229018492e-07, "logits/generated": -2.7526440620422363, "logits/real": -2.7074286937713623, "logps/generated": -821.6927490234375, "logps/real": -128.5563507080078, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -14.784818649291992, "rewards/margins": 15.3468599319458, "rewards/real": 0.562040388584137, "step": 470 }, { "epoch": 0.31, "learning_rate": 3.851351351351351e-07, "logits/generated": -2.7571702003479004, "logits/real": -2.737308979034424, "logps/generated": -852.9948120117188, "logps/real": -132.78759765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -15.543283462524414, "rewards/margins": 16.045238494873047, "rewards/real": 0.5019546747207642, "step": 480 }, { "epoch": 0.31, "learning_rate": 3.8157894736842105e-07, "logits/generated": -2.706204891204834, "logits/real": -2.6772992610931396, "logps/generated": -849.1552734375, "logps/real": -120.03173828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -15.504430770874023, "rewards/margins": 15.993593215942383, "rewards/real": 0.4891592860221863, "step": 490 }, { "epoch": 0.32, "learning_rate": 3.7802275960170695e-07, "logits/generated": -2.7938857078552246, "logits/real": -2.740180253982544, "logps/generated": -782.1716918945312, "logps/real": -129.24673461914062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -14.547981262207031, "rewards/margins": 15.090237617492676, "rewards/real": 0.5422547459602356, "step": 500 }, { "epoch": 0.32, "eval_logits/generated": -2.761601686477661, "eval_logits/real": -2.705458402633667, "eval_logps/generated": -827.1112670898438, "eval_logps/real": -130.07472229003906, "eval_loss": 5.503268039319664e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -14.911882400512695, "eval_rewards/margins": 15.304994583129883, "eval_rewards/real": 0.3931117355823517, "eval_runtime": 66.014, "eval_samples_per_second": 7.574, "eval_steps_per_second": 0.242, "step": 500 }, { "epoch": 0.33, "learning_rate": 3.7446657183499284e-07, "logits/generated": -2.8136253356933594, "logits/real": -2.669490337371826, "logps/generated": -792.15380859375, "logps/real": -127.56230163574219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -14.737091064453125, "rewards/margins": 15.237916946411133, "rewards/real": 0.5008259415626526, "step": 510 }, { "epoch": 0.33, "learning_rate": 3.709103840682788e-07, "logits/generated": -2.787266731262207, "logits/real": -2.670997142791748, "logps/generated": -824.7960205078125, "logps/real": -124.60465240478516, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -15.136065483093262, "rewards/margins": 15.58125114440918, "rewards/real": 0.4451850950717926, "step": 520 }, { "epoch": 0.34, "learning_rate": 3.6735419630156474e-07, "logits/generated": -2.704446315765381, "logits/real": -2.6110129356384277, "logps/generated": -878.9093017578125, "logps/real": -118.95606994628906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -16.33749008178711, "rewards/margins": 16.747507095336914, "rewards/real": 0.41001471877098083, "step": 530 }, { "epoch": 0.35, "learning_rate": 3.637980085348506e-07, "logits/generated": -2.794490098953247, "logits/real": -2.6710212230682373, "logps/generated": -834.6387939453125, "logps/real": -130.42050170898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -15.052263259887695, "rewards/margins": 15.429656982421875, "rewards/real": 0.37739241123199463, "step": 540 }, { "epoch": 0.35, "learning_rate": 3.602418207681365e-07, "logits/generated": -2.7407026290893555, "logits/real": -2.6537132263183594, "logps/generated": -880.5545043945312, "logps/real": -135.30288696289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -16.217912673950195, "rewards/margins": 16.690380096435547, "rewards/real": 0.47246813774108887, "step": 550 }, { "epoch": 0.36, "learning_rate": 3.5668563300142247e-07, "logits/generated": -2.81174898147583, "logits/real": -2.6751866340637207, "logps/generated": -854.27490234375, "logps/real": -126.11138916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -16.003740310668945, "rewards/margins": 16.378582000732422, "rewards/real": 0.3748398423194885, "step": 560 }, { "epoch": 0.36, "learning_rate": 3.5312944523470837e-07, "logits/generated": -2.741673469543457, "logits/real": -2.5907273292541504, "logps/generated": -888.6085815429688, "logps/real": -128.69569396972656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -17.06721305847168, "rewards/margins": 17.444976806640625, "rewards/real": 0.37776434421539307, "step": 570 }, { "epoch": 0.37, "learning_rate": 3.495732574679943e-07, "logits/generated": -2.7112770080566406, "logits/real": -2.647355794906616, "logps/generated": -842.6921997070312, "logps/real": -126.5383529663086, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -16.2426815032959, "rewards/margins": 16.684232711791992, "rewards/real": 0.4415510296821594, "step": 580 }, { "epoch": 0.38, "learning_rate": 3.460170697012802e-07, "logits/generated": -2.7611031532287598, "logits/real": -2.587040901184082, "logps/generated": -856.2335815429688, "logps/real": -131.83444213867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -16.540231704711914, "rewards/margins": 16.8332576751709, "rewards/real": 0.29302695393562317, "step": 590 }, { "epoch": 0.38, "learning_rate": 3.424608819345661e-07, "logits/generated": -2.749112844467163, "logits/real": -2.5918571949005127, "logps/generated": -806.6871948242188, "logps/real": -124.9672622680664, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -15.760574340820312, "rewards/margins": 16.10599136352539, "rewards/real": 0.3454182744026184, "step": 600 }, { "epoch": 0.39, "learning_rate": 3.3890469416785205e-07, "logits/generated": -2.7701778411865234, "logits/real": -2.6194324493408203, "logps/generated": -830.5565185546875, "logps/real": -129.43264770507812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -16.803186416625977, "rewards/margins": 17.231233596801758, "rewards/real": 0.42804789543151855, "step": 610 }, { "epoch": 0.4, "learning_rate": 3.35348506401138e-07, "logits/generated": -2.814532995223999, "logits/real": -2.5871658325195312, "logps/generated": -834.9091796875, "logps/real": -127.82197570800781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -17.46357536315918, "rewards/margins": 17.733022689819336, "rewards/real": 0.2694476246833801, "step": 620 }, { "epoch": 0.4, "learning_rate": 3.3179231863442384e-07, "logits/generated": -2.748018741607666, "logits/real": -2.5878210067749023, "logps/generated": -869.302734375, "logps/real": -141.6997833251953, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.122358322143555, "rewards/margins": 18.34942626953125, "rewards/real": 0.22706761956214905, "step": 630 }, { "epoch": 0.41, "learning_rate": 3.282361308677098e-07, "logits/generated": -2.792604684829712, "logits/real": -2.5345077514648438, "logps/generated": -875.5255737304688, "logps/real": -133.2880096435547, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.701038360595703, "rewards/margins": 19.063264846801758, "rewards/real": 0.3622281849384308, "step": 640 }, { "epoch": 0.42, "learning_rate": 3.2467994310099573e-07, "logits/generated": -2.775300979614258, "logits/real": -2.560939311981201, "logps/generated": -871.24853515625, "logps/real": -142.5642852783203, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.207752227783203, "rewards/margins": 18.60503387451172, "rewards/real": 0.3972865343093872, "step": 650 }, { "epoch": 0.42, "learning_rate": 3.211237553342817e-07, "logits/generated": -2.835697650909424, "logits/real": -2.570935010910034, "logps/generated": -844.2117309570312, "logps/real": -129.60166931152344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -17.626232147216797, "rewards/margins": 17.888334274291992, "rewards/real": 0.26210257411003113, "step": 660 }, { "epoch": 0.43, "learning_rate": 3.175675675675675e-07, "logits/generated": -2.7552971839904785, "logits/real": -2.5506412982940674, "logps/generated": -894.5694580078125, "logps/real": -119.1685791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.028125762939453, "rewards/margins": 19.389427185058594, "rewards/real": 0.36130291223526, "step": 670 }, { "epoch": 0.44, "learning_rate": 3.1401137980085347e-07, "logits/generated": -2.7464940547943115, "logits/real": -2.5819218158721924, "logps/generated": -898.6015625, "logps/real": -131.2238311767578, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.07114601135254, "rewards/margins": 19.419193267822266, "rewards/real": 0.34804823994636536, "step": 680 }, { "epoch": 0.44, "learning_rate": 3.104551920341394e-07, "logits/generated": -2.7470998764038086, "logits/real": -2.5765349864959717, "logps/generated": -885.7230224609375, "logps/real": -134.91915893554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.37563705444336, "rewards/margins": 18.524120330810547, "rewards/real": 0.14848431944847107, "step": 690 }, { "epoch": 0.45, "learning_rate": 3.068990042674253e-07, "logits/generated": -2.734856128692627, "logits/real": -2.512298107147217, "logps/generated": -853.0060424804688, "logps/real": -120.59394836425781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.26091194152832, "rewards/margins": 18.471248626708984, "rewards/real": 0.21033525466918945, "step": 700 }, { "epoch": 0.45, "learning_rate": 3.033428165007112e-07, "logits/generated": -2.7379255294799805, "logits/real": -2.524719715118408, "logps/generated": -912.4431762695312, "logps/real": -123.26702880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.675243377685547, "rewards/margins": 19.047819137573242, "rewards/real": 0.37257617712020874, "step": 710 }, { "epoch": 0.46, "learning_rate": 2.9978662873399715e-07, "logits/generated": -2.7684216499328613, "logits/real": -2.531463146209717, "logps/generated": -881.7340698242188, "logps/real": -134.31008911132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.38132667541504, "rewards/margins": 19.59175682067871, "rewards/real": 0.21043212711811066, "step": 720 }, { "epoch": 0.47, "learning_rate": 2.9623044096728305e-07, "logits/generated": -2.7709155082702637, "logits/real": -2.562648057937622, "logps/generated": -887.0978393554688, "logps/real": -145.66043090820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.221771240234375, "rewards/margins": 19.359745025634766, "rewards/real": 0.13797567784786224, "step": 730 }, { "epoch": 0.47, "learning_rate": 2.92674253200569e-07, "logits/generated": -2.8770792484283447, "logits/real": -2.5876846313476562, "logps/generated": -835.0736083984375, "logps/real": -131.42913818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.772159576416016, "rewards/margins": 19.107730865478516, "rewards/real": 0.33557194471359253, "step": 740 }, { "epoch": 0.48, "learning_rate": 2.8911806543385494e-07, "logits/generated": -2.734930992126465, "logits/real": -2.5578300952911377, "logps/generated": -874.3038330078125, "logps/real": -145.36695861816406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.031780242919922, "rewards/margins": 20.371191024780273, "rewards/real": 0.33940908312797546, "step": 750 }, { "epoch": 0.49, "learning_rate": 2.855618776671408e-07, "logits/generated": -2.6914491653442383, "logits/real": -2.576624631881714, "logps/generated": -893.9830322265625, "logps/real": -153.35513305664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.32299041748047, "rewards/margins": 19.533132553100586, "rewards/real": 0.21014323830604553, "step": 760 }, { "epoch": 0.49, "learning_rate": 2.8200568990042673e-07, "logits/generated": -2.7508046627044678, "logits/real": -2.489609956741333, "logps/generated": -886.3199462890625, "logps/real": -132.56236267089844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.033309936523438, "rewards/margins": 20.369976043701172, "rewards/real": 0.3366653025150299, "step": 770 }, { "epoch": 0.5, "learning_rate": 2.784495021337127e-07, "logits/generated": -2.7089645862579346, "logits/real": -2.4881272315979004, "logps/generated": -903.6559448242188, "logps/real": -122.16401672363281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.558847427368164, "rewards/margins": 19.884899139404297, "rewards/real": 0.32605427503585815, "step": 780 }, { "epoch": 0.51, "learning_rate": 2.7489331436699857e-07, "logits/generated": -2.7490928173065186, "logits/real": -2.536649703979492, "logps/generated": -893.4107666015625, "logps/real": -141.30215454101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.618242263793945, "rewards/margins": 19.87813949584961, "rewards/real": 0.25990021228790283, "step": 790 }, { "epoch": 0.51, "learning_rate": 2.7133712660028446e-07, "logits/generated": -2.754713535308838, "logits/real": -2.4825220108032227, "logps/generated": -877.2716064453125, "logps/real": -122.8569107055664, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.619098663330078, "rewards/margins": 19.970170974731445, "rewards/real": 0.35107091069221497, "step": 800 }, { "epoch": 0.52, "learning_rate": 2.677809388335704e-07, "logits/generated": -2.7757420539855957, "logits/real": -2.5053367614746094, "logps/generated": -907.7996826171875, "logps/real": -124.29368591308594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.24595069885254, "rewards/margins": 20.557300567626953, "rewards/real": 0.3113483488559723, "step": 810 }, { "epoch": 0.52, "learning_rate": 2.642247510668563e-07, "logits/generated": -2.7591538429260254, "logits/real": -2.488976001739502, "logps/generated": -878.0640869140625, "logps/real": -129.9168701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.200183868408203, "rewards/margins": 20.459766387939453, "rewards/real": 0.25958216190338135, "step": 820 }, { "epoch": 0.53, "learning_rate": 2.6066856330014225e-07, "logits/generated": -2.717893600463867, "logits/real": -2.475963830947876, "logps/generated": -886.1345825195312, "logps/real": -136.38031005859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.064132690429688, "rewards/margins": 21.22158432006836, "rewards/real": 0.1574556827545166, "step": 830 }, { "epoch": 0.54, "learning_rate": 2.5711237553342815e-07, "logits/generated": -2.7576115131378174, "logits/real": -2.520620107650757, "logps/generated": -919.7344970703125, "logps/real": -146.73341369628906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.55472755432129, "rewards/margins": 21.70217514038086, "rewards/real": 0.14745107293128967, "step": 840 }, { "epoch": 0.54, "learning_rate": 2.5355618776671404e-07, "logits/generated": -2.758734941482544, "logits/real": -2.4575321674346924, "logps/generated": -924.5079956054688, "logps/real": -137.92759704589844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.882726669311523, "rewards/margins": 21.117658615112305, "rewards/real": 0.23493008315563202, "step": 850 }, { "epoch": 0.55, "learning_rate": 2.5e-07, "logits/generated": -2.76533579826355, "logits/real": -2.473336696624756, "logps/generated": -872.0582885742188, "logps/real": -128.65528869628906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.465499877929688, "rewards/margins": 20.760725021362305, "rewards/real": 0.29522615671157837, "step": 860 }, { "epoch": 0.56, "learning_rate": 2.4644381223328594e-07, "logits/generated": -2.8043251037597656, "logits/real": -2.4596962928771973, "logps/generated": -831.9053955078125, "logps/real": -136.89483642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.220478057861328, "rewards/margins": 20.537092208862305, "rewards/real": 0.31661272048950195, "step": 870 }, { "epoch": 0.56, "learning_rate": 2.4288762446657183e-07, "logits/generated": -2.7661118507385254, "logits/real": -2.463319778442383, "logps/generated": -896.66796875, "logps/real": -140.58807373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.672061920166016, "rewards/margins": 21.854246139526367, "rewards/real": 0.18218322098255157, "step": 880 }, { "epoch": 0.57, "learning_rate": 2.393314366998578e-07, "logits/generated": -2.8111281394958496, "logits/real": -2.45881986618042, "logps/generated": -883.6256713867188, "logps/real": -133.40512084960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.710323333740234, "rewards/margins": 22.020999908447266, "rewards/real": 0.3106769621372223, "step": 890 }, { "epoch": 0.58, "learning_rate": 2.3577524893314365e-07, "logits/generated": -2.7077133655548096, "logits/real": -2.4282584190368652, "logps/generated": -929.71923828125, "logps/real": -125.48017883300781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.90422821044922, "rewards/margins": 22.404098510742188, "rewards/real": 0.49986690282821655, "step": 900 }, { "epoch": 0.58, "learning_rate": 2.322190611664296e-07, "logits/generated": -2.8797926902770996, "logits/real": -2.449512004852295, "logps/generated": -898.6605224609375, "logps/real": -145.88131713867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.765823364257812, "rewards/margins": 21.118236541748047, "rewards/real": 0.35241395235061646, "step": 910 }, { "epoch": 0.59, "learning_rate": 2.2866287339971549e-07, "logits/generated": -2.7840793132781982, "logits/real": -2.467308282852173, "logps/generated": -892.7574462890625, "logps/real": -133.6017303466797, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.309818267822266, "rewards/margins": 21.834148406982422, "rewards/real": 0.524328351020813, "step": 920 }, { "epoch": 0.6, "learning_rate": 2.251066856330014e-07, "logits/generated": -2.7835748195648193, "logits/real": -2.430983781814575, "logps/generated": -926.97900390625, "logps/real": -121.66536712646484, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.335927963256836, "rewards/margins": 22.66562271118164, "rewards/real": 0.32969528436660767, "step": 930 }, { "epoch": 0.6, "learning_rate": 2.2155049786628733e-07, "logits/generated": -2.7988877296447754, "logits/real": -2.470797061920166, "logps/generated": -906.01904296875, "logps/real": -139.35302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.41358757019043, "rewards/margins": 21.88725471496582, "rewards/real": 0.4736654758453369, "step": 940 }, { "epoch": 0.61, "learning_rate": 2.1799431009957325e-07, "logits/generated": -2.7693393230438232, "logits/real": -2.449216842651367, "logps/generated": -971.3763427734375, "logps/real": -120.10380554199219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.712810516357422, "rewards/margins": 23.097557067871094, "rewards/real": 0.38474756479263306, "step": 950 }, { "epoch": 0.61, "learning_rate": 2.1443812233285914e-07, "logits/generated": -2.8496899604797363, "logits/real": -2.6208953857421875, "logps/generated": -853.5813598632812, "logps/real": -140.28988647460938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/generated": -18.387523651123047, "rewards/margins": 18.704341888427734, "rewards/real": 0.31681886315345764, "step": 960 }, { "epoch": 0.62, "learning_rate": 2.108819345661451e-07, "logits/generated": -2.81449556350708, "logits/real": -2.7593271732330322, "logps/generated": -824.1788330078125, "logps/real": -129.34524536132812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -15.003524780273438, "rewards/margins": 15.52760124206543, "rewards/real": 0.5240752100944519, "step": 970 }, { "epoch": 0.63, "learning_rate": 2.0732574679943098e-07, "logits/generated": -2.8338940143585205, "logits/real": -2.6659107208251953, "logps/generated": -828.3829956054688, "logps/real": -113.12556457519531, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -15.830032348632812, "rewards/margins": 16.479970932006836, "rewards/real": 0.6499394178390503, "step": 980 }, { "epoch": 0.63, "learning_rate": 2.0376955903271693e-07, "logits/generated": -2.8800129890441895, "logits/real": -2.7851107120513916, "logps/generated": -825.91015625, "logps/real": -145.60104370117188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -15.22101879119873, "rewards/margins": 15.671483993530273, "rewards/real": 0.45046553015708923, "step": 990 }, { "epoch": 0.64, "learning_rate": 2.0021337126600283e-07, "logits/generated": -2.8783576488494873, "logits/real": -2.719095468521118, "logps/generated": -829.0347900390625, "logps/real": -117.2816162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -16.659259796142578, "rewards/margins": 17.303306579589844, "rewards/real": 0.6440474390983582, "step": 1000 }, { "epoch": 0.64, "eval_logits/generated": -2.8324971199035645, "eval_logits/real": -2.7463560104370117, "eval_logps/generated": -843.258544921875, "eval_logps/real": -129.80291748046875, "eval_loss": 2.3505108401877806e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -16.52660369873047, "eval_rewards/margins": 16.946895599365234, "eval_rewards/real": 0.42029163241386414, "eval_runtime": 65.6308, "eval_samples_per_second": 7.618, "eval_steps_per_second": 0.244, "step": 1000 }, { "epoch": 0.65, "learning_rate": 1.9665718349928875e-07, "logits/generated": -2.7597239017486572, "logits/real": -2.70881986618042, "logps/generated": -878.791015625, "logps/real": -121.81756591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -17.580312728881836, "rewards/margins": 18.086267471313477, "rewards/real": 0.5059542655944824, "step": 1010 }, { "epoch": 0.65, "learning_rate": 1.931009957325747e-07, "logits/generated": -2.8248562812805176, "logits/real": -2.6810784339904785, "logps/generated": -852.2976684570312, "logps/real": -110.82057189941406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -17.118757247924805, "rewards/margins": 17.73995590209961, "rewards/real": 0.6211975812911987, "step": 1020 }, { "epoch": 0.66, "learning_rate": 1.895448079658606e-07, "logits/generated": -2.8053200244903564, "logits/real": -2.70365309715271, "logps/generated": -900.5974731445312, "logps/real": -135.4402313232422, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -17.847537994384766, "rewards/margins": 18.353679656982422, "rewards/real": 0.5061434507369995, "step": 1030 }, { "epoch": 0.67, "learning_rate": 1.859886201991465e-07, "logits/generated": -2.898444414138794, "logits/real": -2.7453322410583496, "logps/generated": -820.4622802734375, "logps/real": -140.00946044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -17.400270462036133, "rewards/margins": 17.893169403076172, "rewards/real": 0.4929002821445465, "step": 1040 }, { "epoch": 0.67, "learning_rate": 1.8243243243243243e-07, "logits/generated": -2.839688777923584, "logits/real": -2.6923739910125732, "logps/generated": -838.73291015625, "logps/real": -132.70223999023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.139698028564453, "rewards/margins": 18.552087783813477, "rewards/real": 0.41239088773727417, "step": 1050 }, { "epoch": 0.68, "learning_rate": 1.7887624466571835e-07, "logits/generated": -2.833216667175293, "logits/real": -2.7041759490966797, "logps/generated": -861.8936767578125, "logps/real": -126.6530990600586, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.731300354003906, "rewards/margins": 19.197338104248047, "rewards/real": 0.4660395085811615, "step": 1060 }, { "epoch": 0.68, "learning_rate": 1.7532005689900424e-07, "logits/generated": -2.8046717643737793, "logits/real": -2.69667387008667, "logps/generated": -875.9267578125, "logps/real": -128.2639617919922, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.01252555847168, "rewards/margins": 18.45262908935547, "rewards/real": 0.44010037183761597, "step": 1070 }, { "epoch": 0.69, "learning_rate": 1.717638691322902e-07, "logits/generated": -2.83022141456604, "logits/real": -2.692930221557617, "logps/generated": -854.4952392578125, "logps/real": -121.74955749511719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -17.961904525756836, "rewards/margins": 18.5115909576416, "rewards/real": 0.5496853590011597, "step": 1080 }, { "epoch": 0.7, "learning_rate": 1.6820768136557609e-07, "logits/generated": -2.8350539207458496, "logits/real": -2.6970601081848145, "logps/generated": -863.2819213867188, "logps/real": -123.15059661865234, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.864200592041016, "rewards/margins": 19.36715316772461, "rewards/real": 0.5029550194740295, "step": 1090 }, { "epoch": 0.7, "learning_rate": 1.64651493598862e-07, "logits/generated": -2.8632559776306152, "logits/real": -2.677931308746338, "logps/generated": -879.8753662109375, "logps/real": -141.77952575683594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.380382537841797, "rewards/margins": 18.56157684326172, "rewards/real": 0.18119129538536072, "step": 1100 }, { "epoch": 0.71, "learning_rate": 1.6109530583214793e-07, "logits/generated": -2.803745746612549, "logits/real": -2.6646134853363037, "logps/generated": -901.3361206054688, "logps/real": -126.1727523803711, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.827922821044922, "rewards/margins": 20.339210510253906, "rewards/real": 0.5112860202789307, "step": 1110 }, { "epoch": 0.72, "learning_rate": 1.5753911806543385e-07, "logits/generated": -2.8503870964050293, "logits/real": -2.6645989418029785, "logps/generated": -888.7047119140625, "logps/real": -129.1335906982422, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.367877960205078, "rewards/margins": 19.824716567993164, "rewards/real": 0.4568362236022949, "step": 1120 }, { "epoch": 0.72, "learning_rate": 1.5398293029871974e-07, "logits/generated": -2.7480721473693848, "logits/real": -2.6507885456085205, "logps/generated": -934.6629028320312, "logps/real": -135.6553497314453, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.353116989135742, "rewards/margins": 20.787641525268555, "rewards/real": 0.4345230162143707, "step": 1130 }, { "epoch": 0.73, "learning_rate": 1.504267425320057e-07, "logits/generated": -2.797375202178955, "logits/real": -2.6887059211730957, "logps/generated": -881.1536254882812, "logps/real": -125.61927795410156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.214933395385742, "rewards/margins": 19.779462814331055, "rewards/real": 0.5645291209220886, "step": 1140 }, { "epoch": 0.74, "learning_rate": 1.4687055476529158e-07, "logits/generated": -2.870006799697876, "logits/real": -2.654900550842285, "logps/generated": -834.4885864257812, "logps/real": -136.41250610351562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -18.738508224487305, "rewards/margins": 19.11133575439453, "rewards/real": 0.37282687425613403, "step": 1150 }, { "epoch": 0.74, "learning_rate": 1.4331436699857753e-07, "logits/generated": -2.8572959899902344, "logits/real": -2.7063486576080322, "logps/generated": -870.64404296875, "logps/real": -134.53292846679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.98287582397461, "rewards/margins": 20.453279495239258, "rewards/real": 0.4704047739505768, "step": 1160 }, { "epoch": 0.75, "learning_rate": 1.3975817923186345e-07, "logits/generated": -2.871398448944092, "logits/real": -2.707024335861206, "logps/generated": -850.9390869140625, "logps/real": -148.42337036132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.16036605834961, "rewards/margins": 20.592147827148438, "rewards/real": 0.43178051710128784, "step": 1170 }, { "epoch": 0.75, "learning_rate": 1.3620199146514935e-07, "logits/generated": -2.807039976119995, "logits/real": -2.723776340484619, "logps/generated": -929.8095703125, "logps/real": -115.76216125488281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.871318817138672, "rewards/margins": 21.316526412963867, "rewards/real": 0.44520822167396545, "step": 1180 }, { "epoch": 0.76, "learning_rate": 1.326458036984353e-07, "logits/generated": -2.8286869525909424, "logits/real": -2.6350340843200684, "logps/generated": -862.5518798828125, "logps/real": -126.08731842041016, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.82559585571289, "rewards/margins": 20.303485870361328, "rewards/real": 0.4778921604156494, "step": 1190 }, { "epoch": 0.77, "learning_rate": 1.290896159317212e-07, "logits/generated": -2.8613972663879395, "logits/real": -2.638763904571533, "logps/generated": -921.1492919921875, "logps/real": -123.17964172363281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.54897689819336, "rewards/margins": 20.937849044799805, "rewards/real": 0.3888731598854065, "step": 1200 }, { "epoch": 0.77, "learning_rate": 1.255334281650071e-07, "logits/generated": -2.876451253890991, "logits/real": -2.6815617084503174, "logps/generated": -831.7360229492188, "logps/real": -139.11557006835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -19.613943099975586, "rewards/margins": 19.776029586791992, "rewards/real": 0.16208769381046295, "step": 1210 }, { "epoch": 0.78, "learning_rate": 1.2197724039829303e-07, "logits/generated": -2.8330130577087402, "logits/real": -2.566429615020752, "logps/generated": -911.4781494140625, "logps/real": -122.40309143066406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.14625358581543, "rewards/margins": 22.701326370239258, "rewards/real": 0.5550734400749207, "step": 1220 }, { "epoch": 0.79, "learning_rate": 1.1842105263157894e-07, "logits/generated": -2.808868885040283, "logits/real": -2.6208791732788086, "logps/generated": -873.4892578125, "logps/real": -114.96858215332031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.708131790161133, "rewards/margins": 21.098825454711914, "rewards/real": 0.3906935155391693, "step": 1230 }, { "epoch": 0.79, "learning_rate": 1.1486486486486487e-07, "logits/generated": -2.8322298526763916, "logits/real": -2.6367688179016113, "logps/generated": -940.2215576171875, "logps/real": -132.8704071044922, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.91935920715332, "rewards/margins": 23.39228630065918, "rewards/real": 0.4729260802268982, "step": 1240 }, { "epoch": 0.8, "learning_rate": 1.1130867709815078e-07, "logits/generated": -2.8651223182678223, "logits/real": -2.6476199626922607, "logps/generated": -869.115234375, "logps/real": -129.05712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.786222457885742, "rewards/margins": 21.287538528442383, "rewards/real": 0.5013141632080078, "step": 1250 }, { "epoch": 0.81, "learning_rate": 1.077524893314367e-07, "logits/generated": -2.7978148460388184, "logits/real": -2.5825142860412598, "logps/generated": -897.38330078125, "logps/real": -118.57264709472656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.1629638671875, "rewards/margins": 21.7834415435791, "rewards/real": 0.6204766631126404, "step": 1260 }, { "epoch": 0.81, "learning_rate": 1.0419630156472262e-07, "logits/generated": -2.866764783859253, "logits/real": -2.685533046722412, "logps/generated": -919.8984375, "logps/real": -146.31027221679688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/generated": -21.15250587463379, "rewards/margins": 21.48689079284668, "rewards/real": 0.3343891501426697, "step": 1270 }, { "epoch": 0.82, "learning_rate": 1.0064011379800854e-07, "logits/generated": -2.8685061931610107, "logits/real": -2.649932384490967, "logps/generated": -902.7081909179688, "logps/real": -141.6209716796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.167659759521484, "rewards/margins": 22.551496505737305, "rewards/real": 0.38383588194847107, "step": 1280 }, { "epoch": 0.83, "learning_rate": 9.708392603129445e-08, "logits/generated": -2.8944077491760254, "logits/real": -2.6382641792297363, "logps/generated": -916.2838745117188, "logps/real": -130.5482177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.300434112548828, "rewards/margins": 21.502178192138672, "rewards/real": 0.20173999667167664, "step": 1290 }, { "epoch": 0.83, "learning_rate": 9.352773826458037e-08, "logits/generated": -2.8912646770477295, "logits/real": -2.6097311973571777, "logps/generated": -894.2330322265625, "logps/real": -132.25819396972656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.95718765258789, "rewards/margins": 22.406177520751953, "rewards/real": 0.44899100065231323, "step": 1300 }, { "epoch": 0.84, "learning_rate": 8.997155049786629e-08, "logits/generated": -2.8901562690734863, "logits/real": -2.551877021789551, "logps/generated": -932.6094970703125, "logps/real": -119.0418930053711, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.05977439880371, "rewards/margins": 22.581966400146484, "rewards/real": 0.5221914052963257, "step": 1310 }, { "epoch": 0.84, "learning_rate": 8.64153627311522e-08, "logits/generated": -2.8296782970428467, "logits/real": -2.5993740558624268, "logps/generated": -896.4122314453125, "logps/real": -128.8374481201172, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -21.528562545776367, "rewards/margins": 22.051563262939453, "rewards/real": 0.5230005979537964, "step": 1320 }, { "epoch": 0.85, "learning_rate": 8.285917496443812e-08, "logits/generated": -2.860327959060669, "logits/real": -2.5901761054992676, "logps/generated": -898.0784912109375, "logps/real": -132.93478393554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.043405532836914, "rewards/margins": 22.58662223815918, "rewards/real": 0.543217658996582, "step": 1330 }, { "epoch": 0.86, "learning_rate": 7.930298719772404e-08, "logits/generated": -2.8273463249206543, "logits/real": -2.5543017387390137, "logps/generated": -961.3826904296875, "logps/real": -140.56031799316406, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -23.900615692138672, "rewards/margins": 24.335386276245117, "rewards/real": 0.43477168679237366, "step": 1340 }, { "epoch": 0.86, "learning_rate": 7.574679943100994e-08, "logits/generated": -2.8639349937438965, "logits/real": -2.6260411739349365, "logps/generated": -885.5602416992188, "logps/real": -112.6806869506836, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -20.910247802734375, "rewards/margins": 21.491928100585938, "rewards/real": 0.5816811919212341, "step": 1350 }, { "epoch": 0.87, "learning_rate": 7.219061166429587e-08, "logits/generated": -2.8940956592559814, "logits/real": -2.5862889289855957, "logps/generated": -917.0275268554688, "logps/real": -125.59222412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -23.038272857666016, "rewards/margins": 23.486886978149414, "rewards/real": 0.44861316680908203, "step": 1360 }, { "epoch": 0.88, "learning_rate": 6.863442389758179e-08, "logits/generated": -2.8932290077209473, "logits/real": -2.5974230766296387, "logps/generated": -886.7428588867188, "logps/real": -143.0746307373047, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.268463134765625, "rewards/margins": 22.662071228027344, "rewards/real": 0.3936085104942322, "step": 1370 }, { "epoch": 0.88, "learning_rate": 6.507823613086771e-08, "logits/generated": -2.922111749649048, "logits/real": -2.594691276550293, "logps/generated": -906.36767578125, "logps/real": -140.33889770507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.82131004333496, "rewards/margins": 23.025859832763672, "rewards/real": 0.20455090701580048, "step": 1380 }, { "epoch": 0.89, "learning_rate": 6.152204836415363e-08, "logits/generated": -2.914600133895874, "logits/real": -2.6084065437316895, "logps/generated": -941.8387451171875, "logps/real": -133.2527313232422, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -23.14767837524414, "rewards/margins": 23.540363311767578, "rewards/real": 0.39268168807029724, "step": 1390 }, { "epoch": 0.9, "learning_rate": 5.796586059743954e-08, "logits/generated": -2.8385584354400635, "logits/real": -2.6445257663726807, "logps/generated": -907.6339721679688, "logps/real": -132.3414764404297, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -24.239797592163086, "rewards/margins": 24.71152114868164, "rewards/real": 0.47172126173973083, "step": 1400 }, { "epoch": 0.9, "learning_rate": 5.4409672830725456e-08, "logits/generated": -2.8101682662963867, "logits/real": -2.6199960708618164, "logps/generated": -945.5979614257812, "logps/real": -132.10206604003906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -24.88088607788086, "rewards/margins": 25.367450714111328, "rewards/real": 0.4865630567073822, "step": 1410 }, { "epoch": 0.91, "learning_rate": 5.0853485064011376e-08, "logits/generated": -2.8484458923339844, "logits/real": -2.563117504119873, "logps/generated": -928.4225463867188, "logps/real": -119.01268005371094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -23.193946838378906, "rewards/margins": 23.61981201171875, "rewards/real": 0.4258663058280945, "step": 1420 }, { "epoch": 0.91, "learning_rate": 4.72972972972973e-08, "logits/generated": -2.8349921703338623, "logits/real": -2.5548834800720215, "logps/generated": -979.7244873046875, "logps/real": -130.8174285888672, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -25.183971405029297, "rewards/margins": 25.632709503173828, "rewards/real": 0.44873887300491333, "step": 1430 }, { "epoch": 0.92, "learning_rate": 4.374110953058322e-08, "logits/generated": -2.837606906890869, "logits/real": -2.537325620651245, "logps/generated": -992.2394409179688, "logps/real": -128.49644470214844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -25.086801528930664, "rewards/margins": 25.410724639892578, "rewards/real": 0.32392334938049316, "step": 1440 }, { "epoch": 0.93, "learning_rate": 4.018492176386913e-08, "logits/generated": -2.91603422164917, "logits/real": -2.5341243743896484, "logps/generated": -898.00146484375, "logps/real": -129.14276123046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -22.930097579956055, "rewards/margins": 23.440217971801758, "rewards/real": 0.5101193189620972, "step": 1450 }, { "epoch": 0.93, "learning_rate": 3.6628733997155046e-08, "logits/generated": -2.8616137504577637, "logits/real": -2.5504488945007324, "logps/generated": -912.74560546875, "logps/real": -108.04595947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -23.54279899597168, "rewards/margins": 23.969791412353516, "rewards/real": 0.4269927442073822, "step": 1460 }, { "epoch": 0.94, "learning_rate": 3.3072546230440967e-08, "logits/generated": -2.85672926902771, "logits/real": -2.5588877201080322, "logps/generated": -940.3358154296875, "logps/real": -125.9631118774414, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -24.594558715820312, "rewards/margins": 24.99027442932129, "rewards/real": 0.39571598172187805, "step": 1470 }, { "epoch": 0.95, "learning_rate": 2.9516358463726884e-08, "logits/generated": -2.89031720161438, "logits/real": -2.5603203773498535, "logps/generated": -972.2039184570312, "logps/real": -137.25588989257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -24.38454818725586, "rewards/margins": 24.839740753173828, "rewards/real": 0.45519551634788513, "step": 1480 }, { "epoch": 0.95, "learning_rate": 2.59601706970128e-08, "logits/generated": -2.890516996383667, "logits/real": -2.5641016960144043, "logps/generated": -936.7041015625, "logps/real": -140.62559509277344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -25.00693702697754, "rewards/margins": 25.536272048950195, "rewards/real": 0.5293352603912354, "step": 1490 }, { "epoch": 0.96, "learning_rate": 2.240398293029872e-08, "logits/generated": -2.8512871265411377, "logits/real": -2.5838348865509033, "logps/generated": -910.3527221679688, "logps/real": -126.60355377197266, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -23.73545265197754, "rewards/margins": 24.106916427612305, "rewards/real": 0.3714631199836731, "step": 1500 }, { "epoch": 0.96, "eval_logits/generated": -2.8901188373565674, "eval_logits/real": -2.609180450439453, "eval_logps/generated": -916.0912475585938, "eval_logps/real": -130.5047149658203, "eval_loss": 2.595016326267796e-07, "eval_rewards/accuracies": 1.0, "eval_rewards/generated": -23.809871673583984, "eval_rewards/margins": 24.159982681274414, "eval_rewards/real": 0.35011160373687744, "eval_runtime": 65.5536, "eval_samples_per_second": 7.627, "eval_steps_per_second": 0.244, "step": 1500 }, { "epoch": 0.97, "learning_rate": 1.8847795163584636e-08, "logits/generated": -2.900836229324341, "logits/real": -2.5513949394226074, "logps/generated": -931.4461059570312, "logps/real": -129.80133056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -24.57596206665039, "rewards/margins": 24.972017288208008, "rewards/real": 0.3960537910461426, "step": 1510 }, { "epoch": 0.97, "learning_rate": 1.5291607396870554e-08, "logits/generated": -2.862175941467285, "logits/real": -2.5267205238342285, "logps/generated": -934.8350830078125, "logps/real": -126.36529541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -24.785350799560547, "rewards/margins": 25.167552947998047, "rewards/real": 0.38220247626304626, "step": 1520 }, { "epoch": 0.98, "learning_rate": 1.1735419630156473e-08, "logits/generated": -2.877037525177002, "logits/real": -2.5817883014678955, "logps/generated": -916.3255004882812, "logps/real": -128.97787475585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -24.590730667114258, "rewards/margins": 24.93622589111328, "rewards/real": 0.3454935848712921, "step": 1530 }, { "epoch": 0.99, "learning_rate": 8.179231863442388e-09, "logits/generated": -2.9152793884277344, "logits/real": -2.5522732734680176, "logps/generated": -968.6594848632812, "logps/real": -132.7301025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -24.529855728149414, "rewards/margins": 25.114501953125, "rewards/real": 0.5846462249755859, "step": 1540 }, { "epoch": 0.99, "learning_rate": 4.623044096728307e-09, "logits/generated": -2.8607215881347656, "logits/real": -2.5737948417663574, "logps/generated": -897.6732177734375, "logps/real": -137.73817443847656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -23.409955978393555, "rewards/margins": 23.72184181213379, "rewards/real": 0.31188473105430603, "step": 1550 }, { "epoch": 1.0, "learning_rate": 1.0668563300142248e-09, "logits/generated": -2.8556289672851562, "logits/real": -2.54154896736145, "logps/generated": -959.1483154296875, "logps/real": -127.1308822631836, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/generated": -25.255569458007812, "rewards/margins": 25.72182273864746, "rewards/real": 0.46625250577926636, "step": 1560 }, { "epoch": 1.0, "step": 1563, "total_flos": 0.0, "train_loss": 0.009621814649877188, "train_runtime": 14787.3165, "train_samples_per_second": 3.381, "train_steps_per_second": 0.106 } ], "logging_steps": 10, "max_steps": 1563, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }