{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997172745264349, "eval_steps": 500, "global_step": 442, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022618037885213456, "grad_norm": 115.54104008253113, "learning_rate": 1.7777777777777777e-08, "logits/chosen": -0.8356236219406128, "logits/rejected": -0.8113616108894348, "logps/chosen": -1.5973824262619019, "logps/rejected": -1.7205333709716797, "loss": 5.7448, "rewards/accuracies": 0.5625, "rewards/chosen": -15.973824501037598, "rewards/margins": 1.2315096855163574, "rewards/rejected": -17.205333709716797, "step": 1 }, { "epoch": 0.004523607577042691, "grad_norm": 74.3673874458241, "learning_rate": 3.5555555555555554e-08, "logits/chosen": -0.880168616771698, "logits/rejected": -0.8785539269447327, "logps/chosen": -1.676809310913086, "logps/rejected": -1.6232023239135742, "loss": 6.1494, "rewards/accuracies": 0.4921875, "rewards/chosen": -16.76809310913086, "rewards/margins": -0.5360711812973022, "rewards/rejected": -16.23202133178711, "step": 2 }, { "epoch": 0.006785411365564037, "grad_norm": 103.24134323529078, "learning_rate": 5.333333333333333e-08, "logits/chosen": -0.8413803577423096, "logits/rejected": -0.8578035831451416, "logps/chosen": -1.679062843322754, "logps/rejected": -1.8749037981033325, "loss": 6.2783, "rewards/accuracies": 0.578125, "rewards/chosen": -16.79062843322754, "rewards/margins": 1.9584112167358398, "rewards/rejected": -18.749040603637695, "step": 3 }, { "epoch": 0.009047215154085382, "grad_norm": 118.3827028625394, "learning_rate": 7.111111111111111e-08, "logits/chosen": -0.8771331906318665, "logits/rejected": -0.8562486171722412, "logps/chosen": -1.6714611053466797, "logps/rejected": -1.6346337795257568, "loss": 6.182, "rewards/accuracies": 0.5546875, "rewards/chosen": -16.71461296081543, "rewards/margins": -0.368274062871933, "rewards/rejected": -16.346338272094727, "step": 4 }, { "epoch": 0.01130901894260673, "grad_norm": 80.57413510413119, "learning_rate": 8.888888888888888e-08, "logits/chosen": -0.8716552257537842, "logits/rejected": -0.8481063842773438, "logps/chosen": -1.5923399925231934, "logps/rejected": -1.6487252712249756, "loss": 5.4404, "rewards/accuracies": 0.546875, "rewards/chosen": -15.92340087890625, "rewards/margins": 0.5638511776924133, "rewards/rejected": -16.48725128173828, "step": 5 }, { "epoch": 0.013570822731128074, "grad_norm": 87.54382641921318, "learning_rate": 1.0666666666666666e-07, "logits/chosen": -0.9280990958213806, "logits/rejected": -0.9075251221656799, "logps/chosen": -1.659511685371399, "logps/rejected": -1.6108341217041016, "loss": 6.2758, "rewards/accuracies": 0.5078125, "rewards/chosen": -16.595117568969727, "rewards/margins": -0.48677870631217957, "rewards/rejected": -16.108339309692383, "step": 6 }, { "epoch": 0.01583262651964942, "grad_norm": 39.60022994902477, "learning_rate": 1.2444444444444443e-07, "logits/chosen": -0.8669524192810059, "logits/rejected": -0.8501181602478027, "logps/chosen": -1.4954458475112915, "logps/rejected": -1.8645169734954834, "loss": 4.1135, "rewards/accuracies": 0.6171875, "rewards/chosen": -14.954458236694336, "rewards/margins": 3.6907100677490234, "rewards/rejected": -18.64516830444336, "step": 7 }, { "epoch": 0.018094430308170765, "grad_norm": 97.49420569209839, "learning_rate": 1.4222222222222222e-07, "logits/chosen": -0.8323963284492493, "logits/rejected": -0.8386867046356201, "logps/chosen": -1.5902166366577148, "logps/rejected": -1.6685606241226196, "loss": 5.7368, "rewards/accuracies": 0.5234375, "rewards/chosen": -15.902165412902832, "rewards/margins": 0.7834409475326538, "rewards/rejected": -16.685604095458984, "step": 8 }, { "epoch": 0.020356234096692113, "grad_norm": 65.1210073833328, "learning_rate": 1.6e-07, "logits/chosen": -0.8797612190246582, "logits/rejected": -0.8696941137313843, "logps/chosen": -1.5322370529174805, "logps/rejected": -1.739979863166809, "loss": 5.4506, "rewards/accuracies": 0.5078125, "rewards/chosen": -15.322370529174805, "rewards/margins": 2.0774283409118652, "rewards/rejected": -17.399799346923828, "step": 9 }, { "epoch": 0.02261803788521346, "grad_norm": 87.75880998151953, "learning_rate": 1.7777777777777776e-07, "logits/chosen": -0.8726012706756592, "logits/rejected": -0.8817980885505676, "logps/chosen": -1.692103385925293, "logps/rejected": -1.6219866275787354, "loss": 6.0529, "rewards/accuracies": 0.59375, "rewards/chosen": -16.92103385925293, "rewards/margins": -0.7011662721633911, "rewards/rejected": -16.219867706298828, "step": 10 }, { "epoch": 0.024879841673734804, "grad_norm": 94.08875549981737, "learning_rate": 1.9555555555555555e-07, "logits/chosen": -0.9308934211730957, "logits/rejected": -0.9283267259597778, "logps/chosen": -1.6734390258789062, "logps/rejected": -1.9049830436706543, "loss": 5.3677, "rewards/accuracies": 0.578125, "rewards/chosen": -16.734390258789062, "rewards/margins": 2.315438747406006, "rewards/rejected": -19.049829483032227, "step": 11 }, { "epoch": 0.02714164546225615, "grad_norm": 114.82744103438529, "learning_rate": 2.133333333333333e-07, "logits/chosen": -0.8485775589942932, "logits/rejected": -0.8313932418823242, "logps/chosen": -1.7942991256713867, "logps/rejected": -1.8555328845977783, "loss": 6.4051, "rewards/accuracies": 0.609375, "rewards/chosen": -17.942991256713867, "rewards/margins": 0.6123358607292175, "rewards/rejected": -18.555328369140625, "step": 12 }, { "epoch": 0.029403449250777494, "grad_norm": 97.6559617021603, "learning_rate": 2.3111111111111107e-07, "logits/chosen": -0.8471003770828247, "logits/rejected": -0.8123136162757874, "logps/chosen": -1.7098432779312134, "logps/rejected": -1.626631736755371, "loss": 6.1671, "rewards/accuracies": 0.578125, "rewards/chosen": -17.098432540893555, "rewards/margins": -0.8321163654327393, "rewards/rejected": -16.26631736755371, "step": 13 }, { "epoch": 0.03166525303929884, "grad_norm": 48.43299087579507, "learning_rate": 2.4888888888888886e-07, "logits/chosen": -0.8456010222434998, "logits/rejected": -0.843168318271637, "logps/chosen": -1.3908941745758057, "logps/rejected": -1.59244704246521, "loss": 4.5976, "rewards/accuracies": 0.5859375, "rewards/chosen": -13.908943176269531, "rewards/margins": 2.0155270099639893, "rewards/rejected": -15.924469947814941, "step": 14 }, { "epoch": 0.033927056827820185, "grad_norm": 94.32433506251559, "learning_rate": 2.666666666666666e-07, "logits/chosen": -0.8408608436584473, "logits/rejected": -0.8317903280258179, "logps/chosen": -1.5308924913406372, "logps/rejected": -1.621803879737854, "loss": 5.1224, "rewards/accuracies": 0.6171875, "rewards/chosen": -15.308926582336426, "rewards/margins": 0.9091131091117859, "rewards/rejected": -16.218036651611328, "step": 15 }, { "epoch": 0.03618886061634153, "grad_norm": 80.0199360911188, "learning_rate": 2.8444444444444443e-07, "logits/chosen": -0.896875262260437, "logits/rejected": -0.8800469636917114, "logps/chosen": -1.6712013483047485, "logps/rejected": -1.6556079387664795, "loss": 6.2495, "rewards/accuracies": 0.5, "rewards/chosen": -16.71201515197754, "rewards/margins": -0.15593338012695312, "rewards/rejected": -16.556079864501953, "step": 16 }, { "epoch": 0.038450664404862875, "grad_norm": 95.62781163862564, "learning_rate": 3.022222222222222e-07, "logits/chosen": -0.9052500128746033, "logits/rejected": -0.8847813010215759, "logps/chosen": -1.4807989597320557, "logps/rejected": -1.447709321975708, "loss": 6.2111, "rewards/accuracies": 0.53125, "rewards/chosen": -14.807989120483398, "rewards/margins": -0.33089762926101685, "rewards/rejected": -14.477092742919922, "step": 17 }, { "epoch": 0.04071246819338423, "grad_norm": 99.53047146451797, "learning_rate": 3.2e-07, "logits/chosen": -0.9046046733856201, "logits/rejected": -0.8962881565093994, "logps/chosen": -1.9553179740905762, "logps/rejected": -1.9541630744934082, "loss": 6.2661, "rewards/accuracies": 0.4921875, "rewards/chosen": -19.553178787231445, "rewards/margins": -0.011548399925231934, "rewards/rejected": -19.5416316986084, "step": 18 }, { "epoch": 0.04297427198190557, "grad_norm": 94.64334054203071, "learning_rate": 3.3777777777777777e-07, "logits/chosen": -0.9112716913223267, "logits/rejected": -0.8977913856506348, "logps/chosen": -1.6549549102783203, "logps/rejected": -1.672560214996338, "loss": 5.3987, "rewards/accuracies": 0.5703125, "rewards/chosen": -16.549549102783203, "rewards/margins": 0.1760539710521698, "rewards/rejected": -16.725605010986328, "step": 19 }, { "epoch": 0.04523607577042692, "grad_norm": 41.05939278803522, "learning_rate": 3.5555555555555553e-07, "logits/chosen": -0.9180342555046082, "logits/rejected": -0.9136630892753601, "logps/chosen": -1.5036756992340088, "logps/rejected": -1.7418506145477295, "loss": 4.306, "rewards/accuracies": 0.65625, "rewards/chosen": -15.03675651550293, "rewards/margins": 2.3817477226257324, "rewards/rejected": -17.41850471496582, "step": 20 }, { "epoch": 0.04749787955894826, "grad_norm": 65.62889809973544, "learning_rate": 3.7333333333333334e-07, "logits/chosen": -0.8245253562927246, "logits/rejected": -0.8135088086128235, "logps/chosen": -1.5167511701583862, "logps/rejected": -1.5217550992965698, "loss": 5.4951, "rewards/accuracies": 0.609375, "rewards/chosen": -15.167511940002441, "rewards/margins": 0.05004033446311951, "rewards/rejected": -15.217550277709961, "step": 21 }, { "epoch": 0.04975968334746961, "grad_norm": 53.92197856426591, "learning_rate": 3.911111111111111e-07, "logits/chosen": -0.8624619245529175, "logits/rejected": -0.8261862397193909, "logps/chosen": -1.559888243675232, "logps/rejected": -1.6315239667892456, "loss": 4.6857, "rewards/accuracies": 0.6015625, "rewards/chosen": -15.598882675170898, "rewards/margins": 0.716356098651886, "rewards/rejected": -16.31523895263672, "step": 22 }, { "epoch": 0.05202148713599095, "grad_norm": 71.94913453042106, "learning_rate": 4.0888888888888886e-07, "logits/chosen": -0.8572225570678711, "logits/rejected": -0.8356618881225586, "logps/chosen": -1.5296409130096436, "logps/rejected": -1.5351814031600952, "loss": 5.7403, "rewards/accuracies": 0.5234375, "rewards/chosen": -15.296407699584961, "rewards/margins": 0.055405229330062866, "rewards/rejected": -15.351814270019531, "step": 23 }, { "epoch": 0.0542832909245123, "grad_norm": 53.61180470189225, "learning_rate": 4.266666666666666e-07, "logits/chosen": -0.8729988932609558, "logits/rejected": -0.8457622528076172, "logps/chosen": -1.371631383895874, "logps/rejected": -1.4574888944625854, "loss": 5.0383, "rewards/accuracies": 0.578125, "rewards/chosen": -13.716312408447266, "rewards/margins": 0.8585769534111023, "rewards/rejected": -14.574889183044434, "step": 24 }, { "epoch": 0.05654509471303364, "grad_norm": 98.16261763681565, "learning_rate": 4.4444444444444444e-07, "logits/chosen": -0.862544596195221, "logits/rejected": -0.8518227934837341, "logps/chosen": -1.6941993236541748, "logps/rejected": -1.677493691444397, "loss": 5.7118, "rewards/accuracies": 0.5234375, "rewards/chosen": -16.941993713378906, "rewards/margins": -0.16705602407455444, "rewards/rejected": -16.77493667602539, "step": 25 }, { "epoch": 0.05880689850155499, "grad_norm": 94.80833058904163, "learning_rate": 4.6222222222222214e-07, "logits/chosen": -0.8756837844848633, "logits/rejected": -0.8487232327461243, "logps/chosen": -1.6833150386810303, "logps/rejected": -1.6622823476791382, "loss": 5.6915, "rewards/accuracies": 0.546875, "rewards/chosen": -16.83315086364746, "rewards/margins": -0.21032753586769104, "rewards/rejected": -16.622821807861328, "step": 26 }, { "epoch": 0.061068702290076333, "grad_norm": 81.08001128654294, "learning_rate": 4.8e-07, "logits/chosen": -0.8789094686508179, "logits/rejected": -0.8827879428863525, "logps/chosen": -1.6698274612426758, "logps/rejected": -1.5913212299346924, "loss": 6.1429, "rewards/accuracies": 0.515625, "rewards/chosen": -16.698274612426758, "rewards/margins": -0.7850615978240967, "rewards/rejected": -15.913213729858398, "step": 27 }, { "epoch": 0.06333050607859768, "grad_norm": 56.644003389915696, "learning_rate": 4.977777777777777e-07, "logits/chosen": -0.8799877166748047, "logits/rejected": -0.870951235294342, "logps/chosen": -1.5632425546646118, "logps/rejected": -1.624694585800171, "loss": 5.6969, "rewards/accuracies": 0.5234375, "rewards/chosen": -15.632424354553223, "rewards/margins": 0.6145212054252625, "rewards/rejected": -16.246946334838867, "step": 28 }, { "epoch": 0.06559230986711903, "grad_norm": 74.85771393756472, "learning_rate": 5.155555555555556e-07, "logits/chosen": -0.8934893608093262, "logits/rejected": -0.8896267414093018, "logps/chosen": -1.5920103788375854, "logps/rejected": -1.6025701761245728, "loss": 5.5342, "rewards/accuracies": 0.484375, "rewards/chosen": -15.920103073120117, "rewards/margins": 0.10559805482625961, "rewards/rejected": -16.02570152282715, "step": 29 }, { "epoch": 0.06785411365564037, "grad_norm": 76.84594367688287, "learning_rate": 5.333333333333332e-07, "logits/chosen": -0.8594059944152832, "logits/rejected": -0.8437649607658386, "logps/chosen": -1.5912779569625854, "logps/rejected": -1.6219682693481445, "loss": 5.9684, "rewards/accuracies": 0.515625, "rewards/chosen": -15.912779808044434, "rewards/margins": 0.3069048821926117, "rewards/rejected": -16.219684600830078, "step": 30 }, { "epoch": 0.07011591744416172, "grad_norm": 78.87037036995574, "learning_rate": 5.511111111111111e-07, "logits/chosen": -0.8989169001579285, "logits/rejected": -0.88699871301651, "logps/chosen": -1.577941656112671, "logps/rejected": -1.548736572265625, "loss": 5.6791, "rewards/accuracies": 0.59375, "rewards/chosen": -15.779415130615234, "rewards/margins": -0.2920517921447754, "rewards/rejected": -15.487363815307617, "step": 31 }, { "epoch": 0.07237772123268306, "grad_norm": 70.30306886857991, "learning_rate": 5.688888888888889e-07, "logits/chosen": -0.9056158661842346, "logits/rejected": -0.9041393399238586, "logps/chosen": -1.5190542936325073, "logps/rejected": -1.482797622680664, "loss": 5.7225, "rewards/accuracies": 0.5234375, "rewards/chosen": -15.190543174743652, "rewards/margins": -0.36256617307662964, "rewards/rejected": -14.827978134155273, "step": 32 }, { "epoch": 0.07463952502120441, "grad_norm": 52.019953616790154, "learning_rate": 5.866666666666666e-07, "logits/chosen": -0.8431529998779297, "logits/rejected": -0.8326103687286377, "logps/chosen": -1.4754631519317627, "logps/rejected": -1.6542396545410156, "loss": 4.6444, "rewards/accuracies": 0.609375, "rewards/chosen": -14.754631042480469, "rewards/margins": 1.787764072418213, "rewards/rejected": -16.542396545410156, "step": 33 }, { "epoch": 0.07690132880972575, "grad_norm": 51.76600092399858, "learning_rate": 6.044444444444444e-07, "logits/chosen": -0.8958278298377991, "logits/rejected": -0.8506935834884644, "logps/chosen": -1.4213745594024658, "logps/rejected": -1.5557016134262085, "loss": 4.5416, "rewards/accuracies": 0.53125, "rewards/chosen": -14.213743209838867, "rewards/margins": 1.343271017074585, "rewards/rejected": -15.557015419006348, "step": 34 }, { "epoch": 0.0791631325982471, "grad_norm": 88.73341217553781, "learning_rate": 6.222222222222223e-07, "logits/chosen": -0.9224306344985962, "logits/rejected": -0.8935542106628418, "logps/chosen": -1.5873973369598389, "logps/rejected": -1.7230992317199707, "loss": 5.1296, "rewards/accuracies": 0.546875, "rewards/chosen": -15.87397289276123, "rewards/margins": 1.3570194244384766, "rewards/rejected": -17.230993270874023, "step": 35 }, { "epoch": 0.08142493638676845, "grad_norm": 81.48522456937111, "learning_rate": 6.4e-07, "logits/chosen": -0.8501981496810913, "logits/rejected": -0.8491517305374146, "logps/chosen": -1.5095704793930054, "logps/rejected": -1.6728523969650269, "loss": 4.9032, "rewards/accuracies": 0.6171875, "rewards/chosen": -15.095704078674316, "rewards/margins": 1.632819652557373, "rewards/rejected": -16.72852325439453, "step": 36 }, { "epoch": 0.08368674017528979, "grad_norm": 63.09197425067475, "learning_rate": 6.577777777777777e-07, "logits/chosen": -0.8523389101028442, "logits/rejected": -0.8278622627258301, "logps/chosen": -1.3732750415802002, "logps/rejected": -1.3724522590637207, "loss": 5.2905, "rewards/accuracies": 0.609375, "rewards/chosen": -13.732749938964844, "rewards/margins": -0.008226484060287476, "rewards/rejected": -13.724522590637207, "step": 37 }, { "epoch": 0.08594854396381114, "grad_norm": 73.86459203067565, "learning_rate": 6.755555555555555e-07, "logits/chosen": -0.9427972435951233, "logits/rejected": -0.9414781332015991, "logps/chosen": -1.5264731645584106, "logps/rejected": -1.5371237993240356, "loss": 5.2678, "rewards/accuracies": 0.5625, "rewards/chosen": -15.264732360839844, "rewards/margins": 0.1065058782696724, "rewards/rejected": -15.371236801147461, "step": 38 }, { "epoch": 0.08821034775233248, "grad_norm": 73.25463666536884, "learning_rate": 6.933333333333333e-07, "logits/chosen": -0.926520586013794, "logits/rejected": -0.9318759441375732, "logps/chosen": -1.5116084814071655, "logps/rejected": -1.524423360824585, "loss": 5.1166, "rewards/accuracies": 0.5234375, "rewards/chosen": -15.116085052490234, "rewards/margins": 0.12814898788928986, "rewards/rejected": -15.244234085083008, "step": 39 }, { "epoch": 0.09047215154085383, "grad_norm": 47.01597449801661, "learning_rate": 7.111111111111111e-07, "logits/chosen": -0.8796355128288269, "logits/rejected": -0.8566000461578369, "logps/chosen": -1.3858391046524048, "logps/rejected": -1.5868655443191528, "loss": 4.0668, "rewards/accuracies": 0.6640625, "rewards/chosen": -13.858390808105469, "rewards/margins": 2.0102648735046387, "rewards/rejected": -15.868656158447266, "step": 40 }, { "epoch": 0.09273395532937517, "grad_norm": 38.92083571265448, "learning_rate": 7.288888888888888e-07, "logits/chosen": -0.9404792189598083, "logits/rejected": -0.9076958894729614, "logps/chosen": -1.3758432865142822, "logps/rejected": -1.5328952074050903, "loss": 4.4094, "rewards/accuracies": 0.609375, "rewards/chosen": -13.75843334197998, "rewards/margins": 1.5705193281173706, "rewards/rejected": -15.32895278930664, "step": 41 }, { "epoch": 0.09499575911789652, "grad_norm": 68.82170323755115, "learning_rate": 7.466666666666667e-07, "logits/chosen": -0.8351485729217529, "logits/rejected": -0.7955107092857361, "logps/chosen": -1.487162709236145, "logps/rejected": -1.5933465957641602, "loss": 5.0699, "rewards/accuracies": 0.546875, "rewards/chosen": -14.871627807617188, "rewards/margins": 1.0618385076522827, "rewards/rejected": -15.933464050292969, "step": 42 }, { "epoch": 0.09725756290641786, "grad_norm": 63.18032695061353, "learning_rate": 7.644444444444444e-07, "logits/chosen": -0.9111210703849792, "logits/rejected": -0.8793379664421082, "logps/chosen": -1.4616880416870117, "logps/rejected": -1.5058850049972534, "loss": 5.005, "rewards/accuracies": 0.5546875, "rewards/chosen": -14.616881370544434, "rewards/margins": 0.44196972250938416, "rewards/rejected": -15.058850288391113, "step": 43 }, { "epoch": 0.09951936669493922, "grad_norm": 48.381985529172866, "learning_rate": 7.822222222222222e-07, "logits/chosen": -0.8437673449516296, "logits/rejected": -0.8208142518997192, "logps/chosen": -1.3148137331008911, "logps/rejected": -1.4531042575836182, "loss": 4.179, "rewards/accuracies": 0.6328125, "rewards/chosen": -13.148136138916016, "rewards/margins": 1.38290536403656, "rewards/rejected": -14.53104305267334, "step": 44 }, { "epoch": 0.10178117048346055, "grad_norm": 38.81813502976088, "learning_rate": 8e-07, "logits/chosen": -0.9005692005157471, "logits/rejected": -0.8871059417724609, "logps/chosen": -1.3741270303726196, "logps/rejected": -1.4992985725402832, "loss": 4.5246, "rewards/accuracies": 0.6171875, "rewards/chosen": -13.741270065307617, "rewards/margins": 1.2517166137695312, "rewards/rejected": -14.992988586425781, "step": 45 }, { "epoch": 0.1040429742719819, "grad_norm": 89.05318759018981, "learning_rate": 7.999874759018868e-07, "logits/chosen": -0.9439775943756104, "logits/rejected": -0.9177378416061401, "logps/chosen": -1.6424872875213623, "logps/rejected": -1.7683295011520386, "loss": 4.7224, "rewards/accuracies": 0.609375, "rewards/chosen": -16.42487144470215, "rewards/margins": 1.2584227323532104, "rewards/rejected": -17.68329429626465, "step": 46 }, { "epoch": 0.10630477806050326, "grad_norm": 43.17715042286116, "learning_rate": 7.999499043918123e-07, "logits/chosen": -0.934738278388977, "logits/rejected": -0.9424084424972534, "logps/chosen": -1.4421114921569824, "logps/rejected": -1.5015398263931274, "loss": 5.0113, "rewards/accuracies": 0.5234375, "rewards/chosen": -14.421113967895508, "rewards/margins": 0.5942831635475159, "rewards/rejected": -15.015397071838379, "step": 47 }, { "epoch": 0.1085665818490246, "grad_norm": 43.641796737833445, "learning_rate": 7.998872878225228e-07, "logits/chosen": -0.8617913722991943, "logits/rejected": -0.8524473905563354, "logps/chosen": -1.48220694065094, "logps/rejected": -1.628198504447937, "loss": 4.4078, "rewards/accuracies": 0.578125, "rewards/chosen": -14.822070121765137, "rewards/margins": 1.4599149227142334, "rewards/rejected": -16.281984329223633, "step": 48 }, { "epoch": 0.11082838563754595, "grad_norm": 44.9928271242027, "learning_rate": 7.997996301150987e-07, "logits/chosen": -0.8672093152999878, "logits/rejected": -0.8628696203231812, "logps/chosen": -1.4041790962219238, "logps/rejected": -1.5184260606765747, "loss": 4.6114, "rewards/accuracies": 0.6171875, "rewards/chosen": -14.041790962219238, "rewards/margins": 1.142470121383667, "rewards/rejected": -15.184259414672852, "step": 49 }, { "epoch": 0.11309018942606729, "grad_norm": 48.332832486571874, "learning_rate": 7.996869367587088e-07, "logits/chosen": -0.836407482624054, "logits/rejected": -0.8215224146842957, "logps/chosen": -1.4828814268112183, "logps/rejected": -1.5967737436294556, "loss": 4.6077, "rewards/accuracies": 0.625, "rewards/chosen": -14.828814506530762, "rewards/margins": 1.1389241218566895, "rewards/rejected": -15.967738151550293, "step": 50 }, { "epoch": 0.11535199321458864, "grad_norm": 42.46620935554636, "learning_rate": 7.99549214810266e-07, "logits/chosen": -0.8490492105484009, "logits/rejected": -0.8362867832183838, "logps/chosen": -1.4633221626281738, "logps/rejected": -1.5538841485977173, "loss": 4.5936, "rewards/accuracies": 0.609375, "rewards/chosen": -14.633221626281738, "rewards/margins": 0.9056205153465271, "rewards/rejected": -15.538841247558594, "step": 51 }, { "epoch": 0.11761379700310998, "grad_norm": 46.23878461965418, "learning_rate": 7.993864728939867e-07, "logits/chosen": -0.8653365969657898, "logits/rejected": -0.8207730650901794, "logps/chosen": -1.4526644945144653, "logps/rejected": -1.5614793300628662, "loss": 4.8368, "rewards/accuracies": 0.609375, "rewards/chosen": -14.526643753051758, "rewards/margins": 1.0881470441818237, "rewards/rejected": -15.614792823791504, "step": 52 }, { "epoch": 0.11987560079163133, "grad_norm": 50.746711219977314, "learning_rate": 7.991987212008491e-07, "logits/chosen": -0.8787316083908081, "logits/rejected": -0.8544822931289673, "logps/chosen": -1.524681568145752, "logps/rejected": -1.7203454971313477, "loss": 4.3884, "rewards/accuracies": 0.5703125, "rewards/chosen": -15.246816635131836, "rewards/margins": 1.9566391706466675, "rewards/rejected": -17.203454971313477, "step": 53 }, { "epoch": 0.12213740458015267, "grad_norm": 51.98936145640891, "learning_rate": 7.989859714879565e-07, "logits/chosen": -0.9071463346481323, "logits/rejected": -0.8824944496154785, "logps/chosen": -1.4744333028793335, "logps/rejected": -1.5566731691360474, "loss": 4.8618, "rewards/accuracies": 0.5546875, "rewards/chosen": -14.744333267211914, "rewards/margins": 0.8223981261253357, "rewards/rejected": -15.566731452941895, "step": 54 }, { "epoch": 0.12439920836867402, "grad_norm": 44.93136877143668, "learning_rate": 7.987482370778005e-07, "logits/chosen": -0.8825117349624634, "logits/rejected": -0.8596429824829102, "logps/chosen": -1.500649094581604, "logps/rejected": -1.6202951669692993, "loss": 4.696, "rewards/accuracies": 0.546875, "rewards/chosen": -15.006490707397461, "rewards/margins": 1.1964606046676636, "rewards/rejected": -16.20294952392578, "step": 55 }, { "epoch": 0.12666101215719536, "grad_norm": 57.11711572983479, "learning_rate": 7.984855328574262e-07, "logits/chosen": -0.748485803604126, "logits/rejected": -0.7519769668579102, "logps/chosen": -1.4509243965148926, "logps/rejected": -1.5625280141830444, "loss": 4.4574, "rewards/accuracies": 0.5859375, "rewards/chosen": -14.50924301147461, "rewards/margins": 1.1160372495651245, "rewards/rejected": -15.625280380249023, "step": 56 }, { "epoch": 0.1289228159457167, "grad_norm": 70.17018909190087, "learning_rate": 7.981978752775009e-07, "logits/chosen": -0.8194972276687622, "logits/rejected": -0.8117552399635315, "logps/chosen": -1.5257998704910278, "logps/rejected": -1.6556179523468018, "loss": 4.665, "rewards/accuracies": 0.5703125, "rewards/chosen": -15.257999420166016, "rewards/margins": 1.298180341720581, "rewards/rejected": -16.55617904663086, "step": 57 }, { "epoch": 0.13118461973423806, "grad_norm": 79.53233397371731, "learning_rate": 7.978852823512833e-07, "logits/chosen": -0.8595327138900757, "logits/rejected": -0.8340020179748535, "logps/chosen": -1.641236424446106, "logps/rejected": -1.7583504915237427, "loss": 4.9327, "rewards/accuracies": 0.5859375, "rewards/chosen": -16.412364959716797, "rewards/margins": 1.1711419820785522, "rewards/rejected": -17.583507537841797, "step": 58 }, { "epoch": 0.1334464235227594, "grad_norm": 69.60803219031307, "learning_rate": 7.975477736534957e-07, "logits/chosen": -0.8586044907569885, "logits/rejected": -0.8539649844169617, "logps/chosen": -1.5942871570587158, "logps/rejected": -1.808882474899292, "loss": 4.3286, "rewards/accuracies": 0.609375, "rewards/chosen": -15.942872047424316, "rewards/margins": 2.1459531784057617, "rewards/rejected": -18.088825225830078, "step": 59 }, { "epoch": 0.13570822731128074, "grad_norm": 62.64898369105346, "learning_rate": 7.971853703190986e-07, "logits/chosen": -0.8574209213256836, "logits/rejected": -0.8404501080513, "logps/chosen": -1.5743780136108398, "logps/rejected": -1.7517762184143066, "loss": 4.4022, "rewards/accuracies": 0.6875, "rewards/chosen": -15.743781089782715, "rewards/margins": 1.773982048034668, "rewards/rejected": -17.517763137817383, "step": 60 }, { "epoch": 0.1379700310998021, "grad_norm": 59.54476826787708, "learning_rate": 7.967980950419664e-07, "logits/chosen": -0.8027121424674988, "logits/rejected": -0.7864540815353394, "logps/chosen": -1.5260121822357178, "logps/rejected": -1.6937767267227173, "loss": 4.4368, "rewards/accuracies": 0.609375, "rewards/chosen": -15.260122299194336, "rewards/margins": 1.6776450872421265, "rewards/rejected": -16.937767028808594, "step": 61 }, { "epoch": 0.14023183488832344, "grad_norm": 54.120200448145056, "learning_rate": 7.963859720734669e-07, "logits/chosen": -0.8626559376716614, "logits/rejected": -0.8548423051834106, "logps/chosen": -1.4451175928115845, "logps/rejected": -1.646138310432434, "loss": 4.3155, "rewards/accuracies": 0.59375, "rewards/chosen": -14.451175689697266, "rewards/margins": 2.0102078914642334, "rewards/rejected": -16.461383819580078, "step": 62 }, { "epoch": 0.14249363867684478, "grad_norm": 48.27367689289302, "learning_rate": 7.959490272209427e-07, "logits/chosen": -0.8522219061851501, "logits/rejected": -0.8149221539497375, "logps/chosen": -1.4842830896377563, "logps/rejected": -1.741237998008728, "loss": 3.8194, "rewards/accuracies": 0.6796875, "rewards/chosen": -14.842830657958984, "rewards/margins": 2.5695488452911377, "rewards/rejected": -17.41238021850586, "step": 63 }, { "epoch": 0.14475544246536612, "grad_norm": 47.53970538406451, "learning_rate": 7.954872878460946e-07, "logits/chosen": -0.8807967901229858, "logits/rejected": -0.8453354835510254, "logps/chosen": -1.5572218894958496, "logps/rejected": -1.7726668119430542, "loss": 4.0647, "rewards/accuracies": 0.671875, "rewards/chosen": -15.572219848632812, "rewards/margins": 2.154447555541992, "rewards/rejected": -17.726669311523438, "step": 64 }, { "epoch": 0.14701724625388748, "grad_norm": 61.35510962861716, "learning_rate": 7.950007828632691e-07, "logits/chosen": -0.8250374794006348, "logits/rejected": -0.820457935333252, "logps/chosen": -1.607496738433838, "logps/rejected": -1.8878852128982544, "loss": 4.0136, "rewards/accuracies": 0.640625, "rewards/chosen": -16.074966430664062, "rewards/margins": 2.8038859367370605, "rewards/rejected": -18.87885284423828, "step": 65 }, { "epoch": 0.14927905004240882, "grad_norm": 54.05120684424973, "learning_rate": 7.944895427376465e-07, "logits/chosen": -0.8387467861175537, "logits/rejected": -0.8197423219680786, "logps/chosen": -1.613673210144043, "logps/rejected": -1.8641467094421387, "loss": 4.1501, "rewards/accuracies": 0.5703125, "rewards/chosen": -16.13673210144043, "rewards/margins": 2.5047359466552734, "rewards/rejected": -18.641468048095703, "step": 66 }, { "epoch": 0.15154085383093016, "grad_norm": 43.4329226476985, "learning_rate": 7.939535994833345e-07, "logits/chosen": -0.80382239818573, "logits/rejected": -0.7954918742179871, "logps/chosen": -1.4918571710586548, "logps/rejected": -1.7582557201385498, "loss": 4.0382, "rewards/accuracies": 0.6328125, "rewards/chosen": -14.918570518493652, "rewards/margins": 2.663985013961792, "rewards/rejected": -17.582555770874023, "step": 67 }, { "epoch": 0.1538026576194515, "grad_norm": 58.50135003917604, "learning_rate": 7.933929866613628e-07, "logits/chosen": -0.8193422555923462, "logits/rejected": -0.8229498863220215, "logps/chosen": -1.5523847341537476, "logps/rejected": -1.6850162744522095, "loss": 4.511, "rewards/accuracies": 0.6171875, "rewards/chosen": -15.523847579956055, "rewards/margins": 1.32631516456604, "rewards/rejected": -16.850162506103516, "step": 68 }, { "epoch": 0.15606446140797287, "grad_norm": 52.388333685361864, "learning_rate": 7.928077393775808e-07, "logits/chosen": -0.8074467778205872, "logits/rejected": -0.8173753619194031, "logps/chosen": -1.5872104167938232, "logps/rejected": -1.9215919971466064, "loss": 3.605, "rewards/accuracies": 0.6640625, "rewards/chosen": -15.87210464477539, "rewards/margins": 3.3438150882720947, "rewards/rejected": -19.215919494628906, "step": 69 }, { "epoch": 0.1583262651964942, "grad_norm": 70.46015135129937, "learning_rate": 7.921978942804609e-07, "logits/chosen": -0.7921926975250244, "logits/rejected": -0.7895167469978333, "logps/chosen": -1.5997159481048584, "logps/rejected": -1.855806827545166, "loss": 3.9852, "rewards/accuracies": 0.640625, "rewards/chosen": -15.997159004211426, "rewards/margins": 2.5609097480773926, "rewards/rejected": -18.558067321777344, "step": 70 }, { "epoch": 0.16058806898501554, "grad_norm": 66.5782725292864, "learning_rate": 7.915634895588021e-07, "logits/chosen": -0.8188354969024658, "logits/rejected": -0.803663969039917, "logps/chosen": -1.694320797920227, "logps/rejected": -1.8535633087158203, "loss": 4.5753, "rewards/accuracies": 0.6015625, "rewards/chosen": -16.943206787109375, "rewards/margins": 1.5924267768859863, "rewards/rejected": -18.535634994506836, "step": 71 }, { "epoch": 0.1628498727735369, "grad_norm": 63.98536775928476, "learning_rate": 7.909045649393394e-07, "logits/chosen": -0.8593119382858276, "logits/rejected": -0.8650994896888733, "logps/chosen": -1.585839033126831, "logps/rejected": -1.7022672891616821, "loss": 4.5482, "rewards/accuracies": 0.6171875, "rewards/chosen": -15.858390808105469, "rewards/margins": 1.1642816066741943, "rewards/rejected": -17.02267074584961, "step": 72 }, { "epoch": 0.16511167656205825, "grad_norm": 47.71315634678256, "learning_rate": 7.902211616842556e-07, "logits/chosen": -0.8264446258544922, "logits/rejected": -0.8236741423606873, "logps/chosen": -1.623077154159546, "logps/rejected": -1.879746437072754, "loss": 4.1393, "rewards/accuracies": 0.640625, "rewards/chosen": -16.23077392578125, "rewards/margins": 2.566693067550659, "rewards/rejected": -18.79746437072754, "step": 73 }, { "epoch": 0.16737348035057958, "grad_norm": 70.53123827391246, "learning_rate": 7.89513322588598e-07, "logits/chosen": -0.808039665222168, "logits/rejected": -0.7966674566268921, "logps/chosen": -1.592429757118225, "logps/rejected": -1.8032734394073486, "loss": 3.9256, "rewards/accuracies": 0.6015625, "rewards/chosen": -15.924297332763672, "rewards/margins": 2.108438014984131, "rewards/rejected": -18.03273582458496, "step": 74 }, { "epoch": 0.16963528413910092, "grad_norm": 56.68159659271728, "learning_rate": 7.887810919775976e-07, "logits/chosen": -0.7461099028587341, "logits/rejected": -0.7355799674987793, "logps/chosen": -1.6924803256988525, "logps/rejected": -1.9031829833984375, "loss": 4.0589, "rewards/accuracies": 0.65625, "rewards/chosen": -16.9248046875, "rewards/margins": 2.107023239135742, "rewards/rejected": -19.031827926635742, "step": 75 }, { "epoch": 0.1718970879276223, "grad_norm": 42.71145723908974, "learning_rate": 7.880245157038949e-07, "logits/chosen": -0.8165091276168823, "logits/rejected": -0.793809175491333, "logps/chosen": -1.688427448272705, "logps/rejected": -1.9064791202545166, "loss": 4.0899, "rewards/accuracies": 0.609375, "rewards/chosen": -16.884273529052734, "rewards/margins": 2.180520534515381, "rewards/rejected": -19.064794540405273, "step": 76 }, { "epoch": 0.17415889171614363, "grad_norm": 80.58036409049882, "learning_rate": 7.872436411446671e-07, "logits/chosen": -0.836346447467804, "logits/rejected": -0.8506262302398682, "logps/chosen": -1.7576085329055786, "logps/rejected": -1.920924186706543, "loss": 4.5954, "rewards/accuracies": 0.6171875, "rewards/chosen": -17.57608413696289, "rewards/margins": 1.6331558227539062, "rewards/rejected": -19.209239959716797, "step": 77 }, { "epoch": 0.17642069550466496, "grad_norm": 46.381702188392666, "learning_rate": 7.86438517198662e-07, "logits/chosen": -0.780924379825592, "logits/rejected": -0.767948567867279, "logps/chosen": -1.650989294052124, "logps/rejected": -1.8504787683486938, "loss": 4.2658, "rewards/accuracies": 0.625, "rewards/chosen": -16.5098934173584, "rewards/margins": 1.9948934316635132, "rewards/rejected": -18.50478744506836, "step": 78 }, { "epoch": 0.1786824992931863, "grad_norm": 56.20092121621141, "learning_rate": 7.856091942831366e-07, "logits/chosen": -0.7430872321128845, "logits/rejected": -0.7503747940063477, "logps/chosen": -1.644688367843628, "logps/rejected": -1.8490663766860962, "loss": 4.517, "rewards/accuracies": 0.625, "rewards/chosen": -16.446882247924805, "rewards/margins": 2.0437800884246826, "rewards/rejected": -18.49066162109375, "step": 79 }, { "epoch": 0.18094430308170767, "grad_norm": 52.81332760090688, "learning_rate": 7.847557243306982e-07, "logits/chosen": -0.8418252468109131, "logits/rejected": -0.8341580629348755, "logps/chosen": -1.6995246410369873, "logps/rejected": -1.9185855388641357, "loss": 4.0086, "rewards/accuracies": 0.671875, "rewards/chosen": -16.9952449798584, "rewards/margins": 2.190608024597168, "rewards/rejected": -19.18585205078125, "step": 80 }, { "epoch": 0.183206106870229, "grad_norm": 56.02862880172386, "learning_rate": 7.838781607860541e-07, "logits/chosen": -0.8196614980697632, "logits/rejected": -0.8126786947250366, "logps/chosen": -1.7471215724945068, "logps/rejected": -1.9539873600006104, "loss": 3.7371, "rewards/accuracies": 0.65625, "rewards/chosen": -17.471214294433594, "rewards/margins": 2.068657398223877, "rewards/rejected": -19.539873123168945, "step": 81 }, { "epoch": 0.18546791065875035, "grad_norm": 54.48859910947903, "learning_rate": 7.82976558602664e-07, "logits/chosen": -0.8580424785614014, "logits/rejected": -0.8641104102134705, "logps/chosen": -1.7102807760238647, "logps/rejected": -1.8986783027648926, "loss": 4.2118, "rewards/accuracies": 0.6484375, "rewards/chosen": -17.102807998657227, "rewards/margins": 1.8839763402938843, "rewards/rejected": -18.986783981323242, "step": 82 }, { "epoch": 0.1877297144472717, "grad_norm": 78.04437890690556, "learning_rate": 7.820509742392988e-07, "logits/chosen": -0.8468527793884277, "logits/rejected": -0.8453028202056885, "logps/chosen": -1.8543328046798706, "logps/rejected": -2.0150225162506104, "loss": 4.3218, "rewards/accuracies": 0.6328125, "rewards/chosen": -18.54332733154297, "rewards/margins": 1.6068973541259766, "rewards/rejected": -20.150224685668945, "step": 83 }, { "epoch": 0.18999151823579305, "grad_norm": 88.82175847045283, "learning_rate": 7.811014656565054e-07, "logits/chosen": -0.8449732661247253, "logits/rejected": -0.815599799156189, "logps/chosen": -1.738198161125183, "logps/rejected": -2.115600347518921, "loss": 3.5074, "rewards/accuracies": 0.703125, "rewards/chosen": -17.381982803344727, "rewards/margins": 3.7740225791931152, "rewards/rejected": -21.156005859375, "step": 84 }, { "epoch": 0.1922533220243144, "grad_norm": 63.74399903736426, "learning_rate": 7.801280923129773e-07, "logits/chosen": -0.8337980508804321, "logits/rejected": -0.8294973969459534, "logps/chosen": -1.8116644620895386, "logps/rejected": -1.987363338470459, "loss": 4.549, "rewards/accuracies": 0.6015625, "rewards/chosen": -18.11664581298828, "rewards/margins": 1.756988763809204, "rewards/rejected": -19.873632431030273, "step": 85 }, { "epoch": 0.19451512581283573, "grad_norm": 77.0826088582988, "learning_rate": 7.791309151618305e-07, "logits/chosen": -0.8380694389343262, "logits/rejected": -0.8311120271682739, "logps/chosen": -1.9478144645690918, "logps/rejected": -2.143031597137451, "loss": 4.2291, "rewards/accuracies": 0.5703125, "rewards/chosen": -19.478145599365234, "rewards/margins": 1.9521695375442505, "rewards/rejected": -21.430315017700195, "step": 86 }, { "epoch": 0.1967769296013571, "grad_norm": 53.07834075055144, "learning_rate": 7.781099966467874e-07, "logits/chosen": -0.8639700412750244, "logits/rejected": -0.8545355200767517, "logps/chosen": -1.727626919746399, "logps/rejected": -1.8995643854141235, "loss": 3.8778, "rewards/accuracies": 0.6875, "rewards/chosen": -17.276269912719727, "rewards/margins": 1.7193742990493774, "rewards/rejected": -18.995643615722656, "step": 87 }, { "epoch": 0.19903873338987843, "grad_norm": 82.55613961122098, "learning_rate": 7.770654006982664e-07, "logits/chosen": -0.8509809374809265, "logits/rejected": -0.8106420040130615, "logps/chosen": -2.0078237056732178, "logps/rejected": -2.231494426727295, "loss": 4.4582, "rewards/accuracies": 0.671875, "rewards/chosen": -20.078235626220703, "rewards/margins": 2.2367055416107178, "rewards/rejected": -22.31494140625, "step": 88 }, { "epoch": 0.20130053717839977, "grad_norm": 72.58396338245271, "learning_rate": 7.759971927293781e-07, "logits/chosen": -0.8639533519744873, "logits/rejected": -0.8477087616920471, "logps/chosen": -1.8459900617599487, "logps/rejected": -2.0477120876312256, "loss": 4.1424, "rewards/accuracies": 0.6953125, "rewards/chosen": -18.459901809692383, "rewards/margins": 2.0172195434570312, "rewards/rejected": -20.477121353149414, "step": 89 }, { "epoch": 0.2035623409669211, "grad_norm": 63.790702792270665, "learning_rate": 7.749054396318297e-07, "logits/chosen": -0.839960515499115, "logits/rejected": -0.8227687478065491, "logps/chosen": -1.9486068487167358, "logps/rejected": -2.1220030784606934, "loss": 4.2943, "rewards/accuracies": 0.625, "rewards/chosen": -19.486068725585938, "rewards/margins": 1.733961582183838, "rewards/rejected": -21.220029830932617, "step": 90 }, { "epoch": 0.20582414475544247, "grad_norm": 77.99250593212055, "learning_rate": 7.737902097717356e-07, "logits/chosen": -0.8212487101554871, "logits/rejected": -0.8338538408279419, "logps/chosen": -1.863584280014038, "logps/rejected": -2.1549441814422607, "loss": 4.1322, "rewards/accuracies": 0.609375, "rewards/chosen": -18.635841369628906, "rewards/margins": 2.913600444793701, "rewards/rejected": -21.549442291259766, "step": 91 }, { "epoch": 0.2080859485439638, "grad_norm": 72.8959596074702, "learning_rate": 7.726515729853367e-07, "logits/chosen": -0.8232444524765015, "logits/rejected": -0.819841742515564, "logps/chosen": -1.8698346614837646, "logps/rejected": -2.029289484024048, "loss": 4.5166, "rewards/accuracies": 0.6171875, "rewards/chosen": -18.698345184326172, "rewards/margins": 1.5945475101470947, "rewards/rejected": -20.29289436340332, "step": 92 }, { "epoch": 0.21034775233248515, "grad_norm": 87.04210388064935, "learning_rate": 7.714896005746272e-07, "logits/chosen": -0.8586671948432922, "logits/rejected": -0.8418205976486206, "logps/chosen": -1.9241650104522705, "logps/rejected": -2.2153499126434326, "loss": 3.6986, "rewards/accuracies": 0.6875, "rewards/chosen": -19.24165153503418, "rewards/margins": 2.9118492603302, "rewards/rejected": -22.153499603271484, "step": 93 }, { "epoch": 0.21260955612100652, "grad_norm": 111.68532775800007, "learning_rate": 7.703043653028896e-07, "logits/chosen": -0.8883798122406006, "logits/rejected": -0.8768050670623779, "logps/chosen": -2.1463255882263184, "logps/rejected": -2.3561160564422607, "loss": 4.2491, "rewards/accuracies": 0.6796875, "rewards/chosen": -21.463254928588867, "rewards/margins": 2.097905158996582, "rewards/rejected": -23.561161041259766, "step": 94 }, { "epoch": 0.21487135990952785, "grad_norm": 73.84778954625673, "learning_rate": 7.690959413901379e-07, "logits/chosen": -0.8396280407905579, "logits/rejected": -0.8113132119178772, "logps/chosen": -2.0055348873138428, "logps/rejected": -2.264613628387451, "loss": 4.0505, "rewards/accuracies": 0.671875, "rewards/chosen": -20.055349349975586, "rewards/margins": 2.590789318084717, "rewards/rejected": -22.646137237548828, "step": 95 }, { "epoch": 0.2171331636980492, "grad_norm": 76.75066956482912, "learning_rate": 7.678644045084704e-07, "logits/chosen": -0.7979358434677124, "logits/rejected": -0.8126614093780518, "logps/chosen": -1.901458978652954, "logps/rejected": -2.1271097660064697, "loss": 4.3121, "rewards/accuracies": 0.59375, "rewards/chosen": -19.014589309692383, "rewards/margins": 2.2565088272094727, "rewards/rejected": -21.271099090576172, "step": 96 }, { "epoch": 0.21939496748657053, "grad_norm": 69.10947756524247, "learning_rate": 7.666098317773308e-07, "logits/chosen": -0.8482003211975098, "logits/rejected": -0.8479557037353516, "logps/chosen": -2.065002918243408, "logps/rejected": -2.2882683277130127, "loss": 3.7783, "rewards/accuracies": 0.671875, "rewards/chosen": -20.650028228759766, "rewards/margins": 2.2326550483703613, "rewards/rejected": -22.8826847076416, "step": 97 }, { "epoch": 0.2216567712750919, "grad_norm": 71.38081973867735, "learning_rate": 7.653323017586789e-07, "logits/chosen": -0.8626989722251892, "logits/rejected": -0.8502533435821533, "logps/chosen": -1.871101975440979, "logps/rejected": -2.0896551609039307, "loss": 3.8377, "rewards/accuracies": 0.7109375, "rewards/chosen": -18.711015701293945, "rewards/margins": 2.1855337619781494, "rewards/rejected": -20.89655303955078, "step": 98 }, { "epoch": 0.22391857506361323, "grad_norm": 80.27831480952293, "learning_rate": 7.640318944520711e-07, "logits/chosen": -0.8602339029312134, "logits/rejected": -0.8497695922851562, "logps/chosen": -2.0520148277282715, "logps/rejected": -2.2807064056396484, "loss": 3.7873, "rewards/accuracies": 0.7109375, "rewards/chosen": -20.52014923095703, "rewards/margins": 2.286914825439453, "rewards/rejected": -22.80706214904785, "step": 99 }, { "epoch": 0.22618037885213457, "grad_norm": 82.83162144065287, "learning_rate": 7.627086912896511e-07, "logits/chosen": -0.755748987197876, "logits/rejected": -0.7821561098098755, "logps/chosen": -1.9375782012939453, "logps/rejected": -2.1892411708831787, "loss": 3.7342, "rewards/accuracies": 0.7265625, "rewards/chosen": -19.37578010559082, "rewards/margins": 2.516632318496704, "rewards/rejected": -21.892412185668945, "step": 100 }, { "epoch": 0.2284421826406559, "grad_norm": 61.630906980874535, "learning_rate": 7.613627751310499e-07, "logits/chosen": -0.8867424726486206, "logits/rejected": -0.8885044455528259, "logps/chosen": -2.1041259765625, "logps/rejected": -2.3255014419555664, "loss": 3.658, "rewards/accuracies": 0.65625, "rewards/chosen": -21.041257858276367, "rewards/margins": 2.2137553691864014, "rewards/rejected": -23.255016326904297, "step": 101 }, { "epoch": 0.23070398642917728, "grad_norm": 89.96142120049903, "learning_rate": 7.599942302581977e-07, "logits/chosen": -0.8578089475631714, "logits/rejected": -0.8603122234344482, "logps/chosen": -2.110222816467285, "logps/rejected": -2.432941198348999, "loss": 3.5255, "rewards/accuracies": 0.7109375, "rewards/chosen": -21.10222816467285, "rewards/margins": 3.227184295654297, "rewards/rejected": -24.329410552978516, "step": 102 }, { "epoch": 0.23296579021769862, "grad_norm": 76.3035369590703, "learning_rate": 7.586031423700457e-07, "logits/chosen": -0.8419609069824219, "logits/rejected": -0.8390515446662903, "logps/chosen": -2.08290958404541, "logps/rejected": -2.321464776992798, "loss": 3.9243, "rewards/accuracies": 0.6484375, "rewards/chosen": -20.8290958404541, "rewards/margins": 2.3855514526367188, "rewards/rejected": -23.214645385742188, "step": 103 }, { "epoch": 0.23522759400621995, "grad_norm": 73.90826312722105, "learning_rate": 7.571895985772e-07, "logits/chosen": -0.8009840846061707, "logits/rejected": -0.8087509870529175, "logps/chosen": -2.1095006465911865, "logps/rejected": -2.4837350845336914, "loss": 3.27, "rewards/accuracies": 0.71875, "rewards/chosen": -21.095006942749023, "rewards/margins": 3.7423441410064697, "rewards/rejected": -24.837350845336914, "step": 104 }, { "epoch": 0.23748939779474132, "grad_norm": 90.91119808796685, "learning_rate": 7.557536873964661e-07, "logits/chosen": -0.8794471025466919, "logits/rejected": -0.8741526007652283, "logps/chosen": -2.432756185531616, "logps/rejected": -2.654453992843628, "loss": 4.2395, "rewards/accuracies": 0.6953125, "rewards/chosen": -24.327564239501953, "rewards/margins": 2.2169761657714844, "rewards/rejected": -26.544538497924805, "step": 105 }, { "epoch": 0.23975120158326266, "grad_norm": 72.09150920561058, "learning_rate": 7.542954987453069e-07, "logits/chosen": -0.8508340716362, "logits/rejected": -0.8528013825416565, "logps/chosen": -2.29594087600708, "logps/rejected": -2.5421571731567383, "loss": 3.7295, "rewards/accuracies": 0.703125, "rewards/chosen": -22.959407806396484, "rewards/margins": 2.462160348892212, "rewards/rejected": -25.421571731567383, "step": 106 }, { "epoch": 0.242013005371784, "grad_norm": 92.05218134624694, "learning_rate": 7.528151239362108e-07, "logits/chosen": -0.8492079377174377, "logits/rejected": -0.8603383898735046, "logps/chosen": -2.424321174621582, "logps/rejected": -2.706427812576294, "loss": 3.7107, "rewards/accuracies": 0.6484375, "rewards/chosen": -24.243213653564453, "rewards/margins": 2.8210651874542236, "rewards/rejected": -27.064279556274414, "step": 107 }, { "epoch": 0.24427480916030533, "grad_norm": 87.963517642823, "learning_rate": 7.513126556709748e-07, "logits/chosen": -0.8548307418823242, "logits/rejected": -0.8365832567214966, "logps/chosen": -2.367842197418213, "logps/rejected": -2.776707172393799, "loss": 3.2256, "rewards/accuracies": 0.703125, "rewards/chosen": -23.678422927856445, "rewards/margins": 4.088651657104492, "rewards/rejected": -27.767070770263672, "step": 108 }, { "epoch": 0.2465366129488267, "grad_norm": 104.59173334061525, "learning_rate": 7.497881880348984e-07, "logits/chosen": -0.8096323013305664, "logits/rejected": -0.7972118854522705, "logps/chosen": -2.4743218421936035, "logps/rejected": -2.755343437194824, "loss": 3.9171, "rewards/accuracies": 0.671875, "rewards/chosen": -24.74321937561035, "rewards/margins": 2.8102121353149414, "rewards/rejected": -27.55343246459961, "step": 109 }, { "epoch": 0.24879841673734804, "grad_norm": 164.02635203828626, "learning_rate": 7.482418164908931e-07, "logits/chosen": -0.8311317563056946, "logits/rejected": -0.8270904421806335, "logps/chosen": -2.616877794265747, "logps/rejected": -2.835850477218628, "loss": 4.1572, "rewards/accuracies": 0.6640625, "rewards/chosen": -26.168777465820312, "rewards/margins": 2.1897284984588623, "rewards/rejected": -28.358505249023438, "step": 110 }, { "epoch": 0.2510602205258694, "grad_norm": 91.04185257653407, "learning_rate": 7.466736378735035e-07, "logits/chosen": -0.8114999532699585, "logits/rejected": -0.813449501991272, "logps/chosen": -2.6687309741973877, "logps/rejected": -3.0298876762390137, "loss": 3.5266, "rewards/accuracies": 0.7109375, "rewards/chosen": -26.68730926513672, "rewards/margins": 3.6115658283233643, "rewards/rejected": -30.298873901367188, "step": 111 }, { "epoch": 0.2533220243143907, "grad_norm": 103.42000228406864, "learning_rate": 7.450837503828439e-07, "logits/chosen": -0.7962609529495239, "logits/rejected": -0.7812699675559998, "logps/chosen": -2.829650640487671, "logps/rejected": -3.2509734630584717, "loss": 3.4804, "rewards/accuracies": 0.75, "rewards/chosen": -28.296504974365234, "rewards/margins": 4.213228702545166, "rewards/rejected": -32.509735107421875, "step": 112 }, { "epoch": 0.2555838281029121, "grad_norm": 79.3801670576343, "learning_rate": 7.43472253578449e-07, "logits/chosen": -0.792640745639801, "logits/rejected": -0.7946760654449463, "logps/chosen": -2.4934792518615723, "logps/rejected": -2.861074447631836, "loss": 3.6492, "rewards/accuracies": 0.6953125, "rewards/chosen": -24.934791564941406, "rewards/margins": 3.675952434539795, "rewards/rejected": -28.61074447631836, "step": 113 }, { "epoch": 0.2578456318914334, "grad_norm": 84.41846931366052, "learning_rate": 7.418392483730389e-07, "logits/chosen": -0.8063937425613403, "logits/rejected": -0.8131504058837891, "logps/chosen": -2.685357093811035, "logps/rejected": -3.0498905181884766, "loss": 3.312, "rewards/accuracies": 0.7265625, "rewards/chosen": -26.85357093811035, "rewards/margins": 3.6453330516815186, "rewards/rejected": -30.4989013671875, "step": 114 }, { "epoch": 0.26010743567995476, "grad_norm": 83.25446388909985, "learning_rate": 7.401848370262012e-07, "logits/chosen": -0.8394590020179749, "logits/rejected": -0.8276815414428711, "logps/chosen": -2.709625244140625, "logps/rejected": -2.9904394149780273, "loss": 3.5602, "rewards/accuracies": 0.7109375, "rewards/chosen": -27.096254348754883, "rewards/margins": 2.808140754699707, "rewards/rejected": -29.904394149780273, "step": 115 }, { "epoch": 0.2623692394684761, "grad_norm": 99.34909208476857, "learning_rate": 7.385091231379856e-07, "logits/chosen": -0.8177067041397095, "logits/rejected": -0.8186391592025757, "logps/chosen": -2.9124531745910645, "logps/rejected": -3.2890923023223877, "loss": 3.7712, "rewards/accuracies": 0.703125, "rewards/chosen": -29.124530792236328, "rewards/margins": 3.7663931846618652, "rewards/rejected": -32.89092254638672, "step": 116 }, { "epoch": 0.26463104325699743, "grad_norm": 196.05646746243912, "learning_rate": 7.368122116424182e-07, "logits/chosen": -0.7795528769493103, "logits/rejected": -0.7889488935470581, "logps/chosen": -2.8638410568237305, "logps/rejected": -3.2152295112609863, "loss": 3.873, "rewards/accuracies": 0.65625, "rewards/chosen": -28.63841438293457, "rewards/margins": 3.5138840675354004, "rewards/rejected": -32.15229797363281, "step": 117 }, { "epoch": 0.2668928470455188, "grad_norm": 123.65267983469268, "learning_rate": 7.350942088009289e-07, "logits/chosen": -0.8420966863632202, "logits/rejected": -0.8411574363708496, "logps/chosen": -2.9619340896606445, "logps/rejected": -3.260565996170044, "loss": 3.5104, "rewards/accuracies": 0.7421875, "rewards/chosen": -29.619338989257812, "rewards/margins": 2.9863200187683105, "rewards/rejected": -32.60565948486328, "step": 118 }, { "epoch": 0.26915465083404017, "grad_norm": 124.41958542086248, "learning_rate": 7.333552221956986e-07, "logits/chosen": -0.9466845393180847, "logits/rejected": -0.9294576644897461, "logps/chosen": -3.071857213973999, "logps/rejected": -3.4685423374176025, "loss": 3.8289, "rewards/accuracies": 0.703125, "rewards/chosen": -30.718576431274414, "rewards/margins": 3.9668469429016113, "rewards/rejected": -34.6854248046875, "step": 119 }, { "epoch": 0.2714164546225615, "grad_norm": 95.20752031494493, "learning_rate": 7.315953607229217e-07, "logits/chosen": -0.8441572785377502, "logits/rejected": -0.8446038961410522, "logps/chosen": -3.104475498199463, "logps/rejected": -3.505401611328125, "loss": 3.32, "rewards/accuracies": 0.703125, "rewards/chosen": -31.04475212097168, "rewards/margins": 4.009262561798096, "rewards/rejected": -35.05401611328125, "step": 120 }, { "epoch": 0.27367825841108284, "grad_norm": 106.09440015221676, "learning_rate": 7.298147345859869e-07, "logits/chosen": -0.8386214375495911, "logits/rejected": -0.8599450588226318, "logps/chosen": -2.900517463684082, "logps/rejected": -3.2684617042541504, "loss": 3.5845, "rewards/accuracies": 0.71875, "rewards/chosen": -29.00517463684082, "rewards/margins": 3.6794400215148926, "rewards/rejected": -32.68461608886719, "step": 121 }, { "epoch": 0.2759400621996042, "grad_norm": 160.41836210136088, "learning_rate": 7.280134552885762e-07, "logits/chosen": -0.8167920112609863, "logits/rejected": -0.8117007613182068, "logps/chosen": -2.9862632751464844, "logps/rejected": -3.363959789276123, "loss": 3.5251, "rewards/accuracies": 0.7421875, "rewards/chosen": -29.86263084411621, "rewards/margins": 3.776963710784912, "rewards/rejected": -33.63959884643555, "step": 122 }, { "epoch": 0.2782018659881255, "grad_norm": 99.5977130216733, "learning_rate": 7.261916356276831e-07, "logits/chosen": -0.8167967796325684, "logits/rejected": -0.8092377185821533, "logps/chosen": -3.0471675395965576, "logps/rejected": -3.5443172454833984, "loss": 2.9443, "rewards/accuracies": 0.765625, "rewards/chosen": -30.47167205810547, "rewards/margins": 4.971498489379883, "rewards/rejected": -35.443172454833984, "step": 123 }, { "epoch": 0.2804636697766469, "grad_norm": 98.66168594344816, "learning_rate": 7.243493896865486e-07, "logits/chosen": -0.8218358755111694, "logits/rejected": -0.8053916096687317, "logps/chosen": -2.7801990509033203, "logps/rejected": -3.0717597007751465, "loss": 3.5017, "rewards/accuracies": 0.6796875, "rewards/chosen": -27.801990509033203, "rewards/margins": 2.9156064987182617, "rewards/rejected": -30.71759796142578, "step": 124 }, { "epoch": 0.2827254735651682, "grad_norm": 114.74683317574737, "learning_rate": 7.224868328275169e-07, "logits/chosen": -0.8093046545982361, "logits/rejected": -0.805727481842041, "logps/chosen": -2.894709348678589, "logps/rejected": -3.254554033279419, "loss": 3.7134, "rewards/accuracies": 0.6875, "rewards/chosen": -28.947093963623047, "rewards/margins": 3.5984435081481934, "rewards/rejected": -32.54553985595703, "step": 125 }, { "epoch": 0.28498727735368956, "grad_norm": 194.3294016359615, "learning_rate": 7.206040816848126e-07, "logits/chosen": -0.8390508890151978, "logits/rejected": -0.8263464570045471, "logps/chosen": -3.065004825592041, "logps/rejected": -3.259084939956665, "loss": 4.415, "rewards/accuracies": 0.671875, "rewards/chosen": -30.650049209594727, "rewards/margins": 1.9408013820648193, "rewards/rejected": -32.590850830078125, "step": 126 }, { "epoch": 0.2872490811422109, "grad_norm": 120.3073948329199, "learning_rate": 7.187012541572356e-07, "logits/chosen": -0.905938446521759, "logits/rejected": -0.8901224136352539, "logps/chosen": -3.1450047492980957, "logps/rejected": -3.485504627227783, "loss": 3.7657, "rewards/accuracies": 0.671875, "rewards/chosen": -31.450042724609375, "rewards/margins": 3.4050049781799316, "rewards/rejected": -34.85504913330078, "step": 127 }, { "epoch": 0.28951088493073224, "grad_norm": 137.9411831080582, "learning_rate": 7.167784694007791e-07, "logits/chosen": -0.8116433620452881, "logits/rejected": -0.8170086741447449, "logps/chosen": -3.0017967224121094, "logps/rejected": -3.353875160217285, "loss": 3.7386, "rewards/accuracies": 0.703125, "rewards/chosen": -30.017967224121094, "rewards/margins": 3.5207817554473877, "rewards/rejected": -33.53874969482422, "step": 128 }, { "epoch": 0.2917726887192536, "grad_norm": 110.61290745803281, "learning_rate": 7.148358478211682e-07, "logits/chosen": -0.8747140169143677, "logits/rejected": -0.8586560487747192, "logps/chosen": -3.0916569232940674, "logps/rejected": -3.517625331878662, "loss": 2.9442, "rewards/accuracies": 0.7578125, "rewards/chosen": -30.91657066345215, "rewards/margins": 4.259681701660156, "rewards/rejected": -35.17625045776367, "step": 129 }, { "epoch": 0.29403449250777497, "grad_norm": 84.56951029170779, "learning_rate": 7.128735110663187e-07, "logits/chosen": -0.8497614860534668, "logits/rejected": -0.8194679617881775, "logps/chosen": -2.7605772018432617, "logps/rejected": -3.1635406017303467, "loss": 3.4017, "rewards/accuracies": 0.6875, "rewards/chosen": -27.605772018432617, "rewards/margins": 4.029631614685059, "rewards/rejected": -31.63540267944336, "step": 130 }, { "epoch": 0.2962962962962963, "grad_norm": 112.17591295821964, "learning_rate": 7.108915820187211e-07, "logits/chosen": -0.8097432255744934, "logits/rejected": -0.8088663816452026, "logps/chosen": -3.1781814098358154, "logps/rejected": -3.545646905899048, "loss": 3.8459, "rewards/accuracies": 0.6875, "rewards/chosen": -31.781814575195312, "rewards/margins": 3.6746530532836914, "rewards/rejected": -35.45646667480469, "step": 131 }, { "epoch": 0.29855810008481765, "grad_norm": 120.89188876376829, "learning_rate": 7.088901847877447e-07, "logits/chosen": -0.7971144914627075, "logits/rejected": -0.7930186986923218, "logps/chosen": -3.015921115875244, "logps/rejected": -3.2458338737487793, "loss": 4.6776, "rewards/accuracies": 0.6953125, "rewards/chosen": -30.159210205078125, "rewards/margins": 2.2991273403167725, "rewards/rejected": -32.458335876464844, "step": 132 }, { "epoch": 0.300819903873339, "grad_norm": 123.91186702122735, "learning_rate": 7.068694447018658e-07, "logits/chosen": -0.8384436964988708, "logits/rejected": -0.846354603767395, "logps/chosen": -3.0088987350463867, "logps/rejected": -3.410034656524658, "loss": 3.3848, "rewards/accuracies": 0.7265625, "rewards/chosen": -30.088991165161133, "rewards/margins": 4.011353492736816, "rewards/rejected": -34.100341796875, "step": 133 }, { "epoch": 0.3030817076618603, "grad_norm": 98.50507786273067, "learning_rate": 7.048294883008199e-07, "logits/chosen": -0.8138392567634583, "logits/rejected": -0.8176507353782654, "logps/chosen": -2.9301810264587402, "logps/rejected": -3.2924013137817383, "loss": 3.3271, "rewards/accuracies": 0.78125, "rewards/chosen": -29.30181121826172, "rewards/margins": 3.6222026348114014, "rewards/rejected": -32.924015045166016, "step": 134 }, { "epoch": 0.3053435114503817, "grad_norm": 80.23471164611576, "learning_rate": 7.027704433276776e-07, "logits/chosen": -0.7829840183258057, "logits/rejected": -0.7787750363349915, "logps/chosen": -2.9372944831848145, "logps/rejected": -3.404730796813965, "loss": 3.1137, "rewards/accuracies": 0.75, "rewards/chosen": -29.37294578552246, "rewards/margins": 4.674362659454346, "rewards/rejected": -34.047306060791016, "step": 135 }, { "epoch": 0.307605315238903, "grad_norm": 118.34531582043013, "learning_rate": 7.006924387208452e-07, "logits/chosen": -0.7873696088790894, "logits/rejected": -0.7685777544975281, "logps/chosen": -2.834895610809326, "logps/rejected": -3.1748013496398926, "loss": 3.3333, "rewards/accuracies": 0.7734375, "rewards/chosen": -28.348957061767578, "rewards/margins": 3.3990557193756104, "rewards/rejected": -31.74801254272461, "step": 136 }, { "epoch": 0.30986711902742436, "grad_norm": 95.16445157429489, "learning_rate": 6.985956046059904e-07, "logits/chosen": -0.7679412961006165, "logits/rejected": -0.7677736878395081, "logps/chosen": -2.7393550872802734, "logps/rejected": -3.1393771171569824, "loss": 3.5669, "rewards/accuracies": 0.703125, "rewards/chosen": -27.393552780151367, "rewards/margins": 4.000221252441406, "rewards/rejected": -31.39377212524414, "step": 137 }, { "epoch": 0.31212892281594573, "grad_norm": 82.62714852037034, "learning_rate": 6.964800722878945e-07, "logits/chosen": -0.724460244178772, "logits/rejected": -0.721074104309082, "logps/chosen": -2.9422450065612793, "logps/rejected": -3.427375555038452, "loss": 3.0358, "rewards/accuracies": 0.765625, "rewards/chosen": -29.42245101928711, "rewards/margins": 4.85130500793457, "rewards/rejected": -34.27375411987305, "step": 138 }, { "epoch": 0.31439072660446704, "grad_norm": 88.6570876412774, "learning_rate": 6.943459742422287e-07, "logits/chosen": -0.75481116771698, "logits/rejected": -0.7281723022460938, "logps/chosen": -2.926560401916504, "logps/rejected": -3.3593132495880127, "loss": 3.6077, "rewards/accuracies": 0.703125, "rewards/chosen": -29.265605926513672, "rewards/margins": 4.327524662017822, "rewards/rejected": -33.59313201904297, "step": 139 }, { "epoch": 0.3166525303929884, "grad_norm": 97.82195429096225, "learning_rate": 6.921934441072597e-07, "logits/chosen": -0.800703227519989, "logits/rejected": -0.8018285036087036, "logps/chosen": -3.131863594055176, "logps/rejected": -3.468980073928833, "loss": 3.8994, "rewards/accuracies": 0.6875, "rewards/chosen": -31.318635940551758, "rewards/margins": 3.3711633682250977, "rewards/rejected": -34.68980026245117, "step": 140 }, { "epoch": 0.3189143341815098, "grad_norm": 130.70910880053893, "learning_rate": 6.900226166754807e-07, "logits/chosen": -0.7732895016670227, "logits/rejected": -0.7903754115104675, "logps/chosen": -3.2161002159118652, "logps/rejected": -3.525928497314453, "loss": 3.9513, "rewards/accuracies": 0.7421875, "rewards/chosen": -32.1609992980957, "rewards/margins": 3.0982844829559326, "rewards/rejected": -35.2592887878418, "step": 141 }, { "epoch": 0.3211761379700311, "grad_norm": 94.8619719166109, "learning_rate": 6.8783362788517e-07, "logits/chosen": -0.7802690267562866, "logits/rejected": -0.781207263469696, "logps/chosen": -3.136306047439575, "logps/rejected": -3.542205333709717, "loss": 4.0634, "rewards/accuracies": 0.6328125, "rewards/chosen": -31.363059997558594, "rewards/margins": 4.058990478515625, "rewards/rejected": -35.422054290771484, "step": 142 }, { "epoch": 0.32343794175855245, "grad_norm": 96.17049011345333, "learning_rate": 6.856266148118796e-07, "logits/chosen": -0.7571829557418823, "logits/rejected": -0.7646656036376953, "logps/chosen": -2.8659865856170654, "logps/rejected": -3.3385119438171387, "loss": 3.2572, "rewards/accuracies": 0.75, "rewards/chosen": -28.659862518310547, "rewards/margins": 4.725252151489258, "rewards/rejected": -33.38511657714844, "step": 143 }, { "epoch": 0.3256997455470738, "grad_norm": 110.19796175719054, "learning_rate": 6.834017156598512e-07, "logits/chosen": -0.7483683228492737, "logits/rejected": -0.7414959073066711, "logps/chosen": -3.109438419342041, "logps/rejected": -3.5427985191345215, "loss": 3.4928, "rewards/accuracies": 0.703125, "rewards/chosen": -31.09438133239746, "rewards/margins": 4.333602428436279, "rewards/rejected": -35.427982330322266, "step": 144 }, { "epoch": 0.3279615493355951, "grad_norm": 79.78726439178003, "learning_rate": 6.811590697533607e-07, "logits/chosen": -0.8195265531539917, "logits/rejected": -0.838479220867157, "logps/chosen": -3.01442813873291, "logps/rejected": -3.3907887935638428, "loss": 3.5141, "rewards/accuracies": 0.765625, "rewards/chosen": -30.144283294677734, "rewards/margins": 3.7636024951934814, "rewards/rejected": -33.90788650512695, "step": 145 }, { "epoch": 0.3302233531241165, "grad_norm": 151.81377404607898, "learning_rate": 6.788988175279951e-07, "logits/chosen": -0.7769032120704651, "logits/rejected": -0.7602939605712891, "logps/chosen": -3.0373010635375977, "logps/rejected": -3.367269515991211, "loss": 4.0091, "rewards/accuracies": 0.6796875, "rewards/chosen": -30.373010635375977, "rewards/margins": 3.299685001373291, "rewards/rejected": -33.672698974609375, "step": 146 }, { "epoch": 0.3324851569126378, "grad_norm": 93.15629599355084, "learning_rate": 6.766211005218577e-07, "logits/chosen": -0.7618966698646545, "logits/rejected": -0.7614046931266785, "logps/chosen": -3.0041072368621826, "logps/rejected": -3.5381112098693848, "loss": 3.0388, "rewards/accuracies": 0.7578125, "rewards/chosen": -30.04107093811035, "rewards/margins": 5.340038299560547, "rewards/rejected": -35.38111114501953, "step": 147 }, { "epoch": 0.33474696070115917, "grad_norm": 93.03406880519788, "learning_rate": 6.743260613667047e-07, "logits/chosen": -0.8518264889717102, "logits/rejected": -0.8462361693382263, "logps/chosen": -2.939993143081665, "logps/rejected": -3.3589367866516113, "loss": 3.4992, "rewards/accuracies": 0.7421875, "rewards/chosen": -29.399932861328125, "rewards/margins": 4.18943452835083, "rewards/rejected": -33.58937072753906, "step": 148 }, { "epoch": 0.33700876448968053, "grad_norm": 73.40361139697784, "learning_rate": 6.720138437790139e-07, "logits/chosen": -0.8052965998649597, "logits/rejected": -0.7937459945678711, "logps/chosen": -2.8842406272888184, "logps/rejected": -3.307342529296875, "loss": 3.1965, "rewards/accuracies": 0.7265625, "rewards/chosen": -28.8424015045166, "rewards/margins": 4.23102331161499, "rewards/rejected": -33.07342529296875, "step": 149 }, { "epoch": 0.33927056827820185, "grad_norm": 106.95490507848946, "learning_rate": 6.696845925509848e-07, "logits/chosen": -0.8310205936431885, "logits/rejected": -0.8272488713264465, "logps/chosen": -2.934943437576294, "logps/rejected": -3.2793498039245605, "loss": 3.6164, "rewards/accuracies": 0.6953125, "rewards/chosen": -29.349435806274414, "rewards/margins": 3.444066047668457, "rewards/rejected": -32.79349899291992, "step": 150 }, { "epoch": 0.3415323720667232, "grad_norm": 100.186884927208, "learning_rate": 6.673384535414718e-07, "logits/chosen": -0.8480186462402344, "logits/rejected": -0.8310289978981018, "logps/chosen": -3.0514492988586426, "logps/rejected": -3.3322248458862305, "loss": 3.9173, "rewards/accuracies": 0.671875, "rewards/chosen": -30.514493942260742, "rewards/margins": 2.8077542781829834, "rewards/rejected": -33.32224655151367, "step": 151 }, { "epoch": 0.3437941758552446, "grad_norm": 100.75576403172839, "learning_rate": 6.649755736668511e-07, "logits/chosen": -0.7694522738456726, "logits/rejected": -0.7615189552307129, "logps/chosen": -2.6866354942321777, "logps/rejected": -3.11234188079834, "loss": 2.8237, "rewards/accuracies": 0.7578125, "rewards/chosen": -26.866352081298828, "rewards/margins": 4.257061958312988, "rewards/rejected": -31.123416900634766, "step": 152 }, { "epoch": 0.3460559796437659, "grad_norm": 117.9327947173104, "learning_rate": 6.625961008918192e-07, "logits/chosen": -0.7936750054359436, "logits/rejected": -0.7835503220558167, "logps/chosen": -2.7540676593780518, "logps/rejected": -3.2012295722961426, "loss": 2.9183, "rewards/accuracies": 0.7890625, "rewards/chosen": -27.540678024291992, "rewards/margins": 4.471617221832275, "rewards/rejected": -32.01229476928711, "step": 153 }, { "epoch": 0.34831778343228725, "grad_norm": 96.1956695260042, "learning_rate": 6.602001842201289e-07, "logits/chosen": -0.7796362042427063, "logits/rejected": -0.7905425429344177, "logps/chosen": -2.7750422954559326, "logps/rejected": -3.049340009689331, "loss": 4.0488, "rewards/accuracies": 0.6953125, "rewards/chosen": -27.75042152404785, "rewards/margins": 2.74297833442688, "rewards/rejected": -30.49340057373047, "step": 154 }, { "epoch": 0.3505795872208086, "grad_norm": 101.24392958865374, "learning_rate": 6.577879736852571e-07, "logits/chosen": -0.8088594079017639, "logits/rejected": -0.8155099749565125, "logps/chosen": -2.821500301361084, "logps/rejected": -3.0726358890533447, "loss": 3.9081, "rewards/accuracies": 0.65625, "rewards/chosen": -28.215003967285156, "rewards/margins": 2.5113601684570312, "rewards/rejected": -30.726364135742188, "step": 155 }, { "epoch": 0.35284139100932993, "grad_norm": 73.2743068340787, "learning_rate": 6.553596203410112e-07, "logits/chosen": -0.8153470754623413, "logits/rejected": -0.8048913478851318, "logps/chosen": -2.7679576873779297, "logps/rejected": -3.2883174419403076, "loss": 2.546, "rewards/accuracies": 0.8203125, "rewards/chosen": -27.679576873779297, "rewards/margins": 5.203596115112305, "rewards/rejected": -32.883174896240234, "step": 156 }, { "epoch": 0.3551031947978513, "grad_norm": 82.3012728094315, "learning_rate": 6.529152762520688e-07, "logits/chosen": -0.8138669729232788, "logits/rejected": -0.8138793110847473, "logps/chosen": -2.864006757736206, "logps/rejected": -3.2064666748046875, "loss": 3.5646, "rewards/accuracies": 0.7265625, "rewards/chosen": -28.64006805419922, "rewards/margins": 3.424598217010498, "rewards/rejected": -32.06466293334961, "step": 157 }, { "epoch": 0.3573649985863726, "grad_norm": 100.03152867926248, "learning_rate": 6.504550944844558e-07, "logits/chosen": -0.7475910782814026, "logits/rejected": -0.7779514789581299, "logps/chosen": -2.7473607063293457, "logps/rejected": -3.1302061080932617, "loss": 3.452, "rewards/accuracies": 0.7109375, "rewards/chosen": -27.47360610961914, "rewards/margins": 3.8284525871276855, "rewards/rejected": -31.302059173583984, "step": 158 }, { "epoch": 0.359626802374894, "grad_norm": 105.22720404522045, "learning_rate": 6.479792290959613e-07, "logits/chosen": -0.7691587209701538, "logits/rejected": -0.7878850698471069, "logps/chosen": -2.8018503189086914, "logps/rejected": -3.312527656555176, "loss": 3.2183, "rewards/accuracies": 0.765625, "rewards/chosen": -28.01850128173828, "rewards/margins": 5.106773853302002, "rewards/rejected": -33.125274658203125, "step": 159 }, { "epoch": 0.36188860616341534, "grad_norm": 89.61616128866521, "learning_rate": 6.454878351264906e-07, "logits/chosen": -0.7589330673217773, "logits/rejected": -0.745934009552002, "logps/chosen": -2.6822848320007324, "logps/rejected": -3.0995330810546875, "loss": 3.4046, "rewards/accuracies": 0.7265625, "rewards/chosen": -26.822847366333008, "rewards/margins": 4.172482490539551, "rewards/rejected": -30.995328903198242, "step": 160 }, { "epoch": 0.36415040995193665, "grad_norm": 96.61923230400093, "learning_rate": 6.429810685883565e-07, "logits/chosen": -0.8186591267585754, "logits/rejected": -0.82514488697052, "logps/chosen": -2.8654110431671143, "logps/rejected": -3.2399096488952637, "loss": 3.238, "rewards/accuracies": 0.765625, "rewards/chosen": -28.65411376953125, "rewards/margins": 3.7449822425842285, "rewards/rejected": -32.39909362792969, "step": 161 }, { "epoch": 0.366412213740458, "grad_norm": 109.68200048681109, "learning_rate": 6.404590864565088e-07, "logits/chosen": -0.7650143504142761, "logits/rejected": -0.7517848014831543, "logps/chosen": -2.817117214202881, "logps/rejected": -3.050743341445923, "loss": 3.9095, "rewards/accuracies": 0.6640625, "rewards/chosen": -28.171171188354492, "rewards/margins": 2.3362598419189453, "rewards/rejected": -30.507431030273438, "step": 162 }, { "epoch": 0.3686740175289794, "grad_norm": 103.61475684738596, "learning_rate": 6.379220466587063e-07, "logits/chosen": -0.7960351705551147, "logits/rejected": -0.7686564922332764, "logps/chosen": -2.810275077819824, "logps/rejected": -3.18802809715271, "loss": 3.246, "rewards/accuracies": 0.8203125, "rewards/chosen": -28.10274887084961, "rewards/margins": 3.777529239654541, "rewards/rejected": -31.880279541015625, "step": 163 }, { "epoch": 0.3709358213175007, "grad_norm": 119.25022849905575, "learning_rate": 6.353701080656254e-07, "logits/chosen": -0.7721018195152283, "logits/rejected": -0.7901967763900757, "logps/chosen": -2.9517931938171387, "logps/rejected": -3.250936985015869, "loss": 3.6435, "rewards/accuracies": 0.6953125, "rewards/chosen": -29.517929077148438, "rewards/margins": 2.9914417266845703, "rewards/rejected": -32.50937271118164, "step": 164 }, { "epoch": 0.37319762510602206, "grad_norm": 106.89469142388397, "learning_rate": 6.32803430480913e-07, "logits/chosen": -0.7933882474899292, "logits/rejected": -0.785070538520813, "logps/chosen": -2.898366689682007, "logps/rejected": -3.3264529705047607, "loss": 3.3983, "rewards/accuracies": 0.75, "rewards/chosen": -28.98366928100586, "rewards/margins": 4.2808613777160645, "rewards/rejected": -33.264530181884766, "step": 165 }, { "epoch": 0.3754594288945434, "grad_norm": 90.5066177326131, "learning_rate": 6.302221746311782e-07, "logits/chosen": -0.8018909096717834, "logits/rejected": -0.7745494246482849, "logps/chosen": -2.7565348148345947, "logps/rejected": -3.1439740657806396, "loss": 3.4065, "rewards/accuracies": 0.703125, "rewards/chosen": -27.565349578857422, "rewards/margins": 3.874392032623291, "rewards/rejected": -31.439741134643555, "step": 166 }, { "epoch": 0.37772123268306473, "grad_norm": 86.55925111562716, "learning_rate": 6.276265021559288e-07, "logits/chosen": -0.8132920861244202, "logits/rejected": -0.8016676306724548, "logps/chosen": -2.9889135360717773, "logps/rejected": -3.338178873062134, "loss": 3.5752, "rewards/accuracies": 0.7109375, "rewards/chosen": -29.88913345336914, "rewards/margins": 3.4926562309265137, "rewards/rejected": -33.38179016113281, "step": 167 }, { "epoch": 0.3799830364715861, "grad_norm": 73.35269687730333, "learning_rate": 6.250165755974487e-07, "logits/chosen": -0.757270336151123, "logits/rejected": -0.7608906626701355, "logps/chosen": -2.9331836700439453, "logps/rejected": -3.337920904159546, "loss": 3.1407, "rewards/accuracies": 0.7109375, "rewards/chosen": -29.33183479309082, "rewards/margins": 4.047374725341797, "rewards/rejected": -33.37921142578125, "step": 168 }, { "epoch": 0.3822448402601074, "grad_norm": 86.91929641048736, "learning_rate": 6.223925583906192e-07, "logits/chosen": -0.8268415331840515, "logits/rejected": -0.8238467574119568, "logps/chosen": -3.028745174407959, "logps/rejected": -3.4829823970794678, "loss": 2.8551, "rewards/accuracies": 0.7421875, "rewards/chosen": -30.287450790405273, "rewards/margins": 4.5423736572265625, "rewards/rejected": -34.8298225402832, "step": 169 }, { "epoch": 0.3845066440486288, "grad_norm": 85.76060468853915, "learning_rate": 6.19754614852685e-07, "logits/chosen": -0.8132871985435486, "logits/rejected": -0.8051560521125793, "logps/chosen": -2.942837953567505, "logps/rejected": -3.3470842838287354, "loss": 3.0876, "rewards/accuracies": 0.7578125, "rewards/chosen": -29.428375244140625, "rewards/margins": 4.04246711730957, "rewards/rejected": -33.470848083496094, "step": 170 }, { "epoch": 0.38676844783715014, "grad_norm": 101.76248326578423, "learning_rate": 6.171029101729644e-07, "logits/chosen": -0.733617901802063, "logits/rejected": -0.7339813709259033, "logps/chosen": -3.1949334144592285, "logps/rejected": -3.661990165710449, "loss": 3.1671, "rewards/accuracies": 0.734375, "rewards/chosen": -31.94933319091797, "rewards/margins": 4.670570373535156, "rewards/rejected": -36.61989974975586, "step": 171 }, { "epoch": 0.38903025162567145, "grad_norm": 113.9018020285417, "learning_rate": 6.144376104025055e-07, "logits/chosen": -0.8161033987998962, "logits/rejected": -0.8007526993751526, "logps/chosen": -3.0398004055023193, "logps/rejected": -3.4840195178985596, "loss": 3.1426, "rewards/accuracies": 0.7578125, "rewards/chosen": -30.398000717163086, "rewards/margins": 4.442192554473877, "rewards/rejected": -34.84019470214844, "step": 172 }, { "epoch": 0.3912920554141928, "grad_norm": 109.66608111755836, "learning_rate": 6.117588824436873e-07, "logits/chosen": -0.8302391171455383, "logits/rejected": -0.8384109735488892, "logps/chosen": -3.1871933937072754, "logps/rejected": -3.547971487045288, "loss": 3.7869, "rewards/accuracies": 0.6875, "rewards/chosen": -31.87193489074707, "rewards/margins": 3.607778549194336, "rewards/rejected": -35.47970962524414, "step": 173 }, { "epoch": 0.3935538592027142, "grad_norm": 97.34693909102697, "learning_rate": 6.090668940397688e-07, "logits/chosen": -0.7868531942367554, "logits/rejected": -0.7912797331809998, "logps/chosen": -3.1741623878479004, "logps/rejected": -3.6076653003692627, "loss": 3.2467, "rewards/accuracies": 0.7421875, "rewards/chosen": -31.741622924804688, "rewards/margins": 4.3350300788879395, "rewards/rejected": -36.07665252685547, "step": 174 }, { "epoch": 0.3958156629912355, "grad_norm": 112.75426033024542, "learning_rate": 6.063618137643844e-07, "logits/chosen": -0.7921246290206909, "logits/rejected": -0.78474360704422, "logps/chosen": -3.200976610183716, "logps/rejected": -3.6109395027160645, "loss": 3.3634, "rewards/accuracies": 0.71875, "rewards/chosen": -32.009769439697266, "rewards/margins": 4.099628925323486, "rewards/rejected": -36.109397888183594, "step": 175 }, { "epoch": 0.39807746677975686, "grad_norm": 113.32746124615062, "learning_rate": 6.03643811010988e-07, "logits/chosen": -0.8276042938232422, "logits/rejected": -0.8417137265205383, "logps/chosen": -3.3886866569519043, "logps/rejected": -3.824484348297119, "loss": 3.0549, "rewards/accuracies": 0.7421875, "rewards/chosen": -33.886863708496094, "rewards/margins": 4.357980728149414, "rewards/rejected": -38.24484634399414, "step": 176 }, { "epoch": 0.4003392705682782, "grad_norm": 115.44790456463876, "learning_rate": 6.009130559822453e-07, "logits/chosen": -0.8511748909950256, "logits/rejected": -0.8455148339271545, "logps/chosen": -3.3521997928619385, "logps/rejected": -3.621072292327881, "loss": 4.1968, "rewards/accuracies": 0.6796875, "rewards/chosen": -33.521995544433594, "rewards/margins": 2.6887285709381104, "rewards/rejected": -36.210723876953125, "step": 177 }, { "epoch": 0.40260107435679954, "grad_norm": 108.10466917985082, "learning_rate": 5.981697196793758e-07, "logits/chosen": -0.8837382793426514, "logits/rejected": -0.8865911364555359, "logps/chosen": -3.501465082168579, "logps/rejected": -3.9723386764526367, "loss": 2.5792, "rewards/accuracies": 0.78125, "rewards/chosen": -35.0146484375, "rewards/margins": 4.708735466003418, "rewards/rejected": -39.723388671875, "step": 178 }, { "epoch": 0.4048628781453209, "grad_norm": 113.75351199007196, "learning_rate": 5.954139738914446e-07, "logits/chosen": -0.8577677607536316, "logits/rejected": -0.869698703289032, "logps/chosen": -3.4370806217193604, "logps/rejected": -3.856444835662842, "loss": 3.4991, "rewards/accuracies": 0.6796875, "rewards/chosen": -34.37080764770508, "rewards/margins": 4.193644046783447, "rewards/rejected": -38.564453125, "step": 179 }, { "epoch": 0.4071246819338422, "grad_norm": 105.83270319758616, "learning_rate": 5.92645991184605e-07, "logits/chosen": -0.8364049792289734, "logits/rejected": -0.8378995656967163, "logps/chosen": -3.5678813457489014, "logps/rejected": -4.040313720703125, "loss": 2.9427, "rewards/accuracies": 0.765625, "rewards/chosen": -35.67881393432617, "rewards/margins": 4.724322319030762, "rewards/rejected": -40.403133392333984, "step": 180 }, { "epoch": 0.4093864857223636, "grad_norm": 97.36394616930042, "learning_rate": 5.898659448912917e-07, "logits/chosen": -0.8220387697219849, "logits/rejected": -0.8456301689147949, "logps/chosen": -3.3772408962249756, "logps/rejected": -3.840843915939331, "loss": 3.2422, "rewards/accuracies": 0.75, "rewards/chosen": -33.77241134643555, "rewards/margins": 4.636030197143555, "rewards/rejected": -38.40843963623047, "step": 181 }, { "epoch": 0.41164828951088495, "grad_norm": 119.36449481585935, "learning_rate": 5.870740090993676e-07, "logits/chosen": -0.8707118034362793, "logits/rejected": -0.8762695789337158, "logps/chosen": -3.7325124740600586, "logps/rejected": -4.200765609741211, "loss": 3.0099, "rewards/accuracies": 0.78125, "rewards/chosen": -37.32512664794922, "rewards/margins": 4.68253755569458, "rewards/rejected": -42.007659912109375, "step": 182 }, { "epoch": 0.41391009329940626, "grad_norm": 112.59179043910055, "learning_rate": 5.842703586412214e-07, "logits/chosen": -0.8712838888168335, "logits/rejected": -0.8757526278495789, "logps/chosen": -3.783506393432617, "logps/rejected": -4.1643571853637695, "loss": 3.9002, "rewards/accuracies": 0.6875, "rewards/chosen": -37.83506774902344, "rewards/margins": 3.8085036277770996, "rewards/rejected": -41.6435661315918, "step": 183 }, { "epoch": 0.4161718970879276, "grad_norm": 129.66104630601248, "learning_rate": 5.814551690828203e-07, "logits/chosen": -0.8484200239181519, "logits/rejected": -0.861181378364563, "logps/chosen": -3.624147653579712, "logps/rejected": -4.078574180603027, "loss": 2.9549, "rewards/accuracies": 0.75, "rewards/chosen": -36.241477966308594, "rewards/margins": 4.544264793395996, "rewards/rejected": -40.785743713378906, "step": 184 }, { "epoch": 0.418433700876449, "grad_norm": 123.33358190542005, "learning_rate": 5.786286167127155e-07, "logits/chosen": -0.8848705291748047, "logits/rejected": -0.8773502111434937, "logps/chosen": -3.5523104667663574, "logps/rejected": -4.035827159881592, "loss": 3.2787, "rewards/accuracies": 0.765625, "rewards/chosen": -35.523101806640625, "rewards/margins": 4.83516788482666, "rewards/rejected": -40.35826873779297, "step": 185 }, { "epoch": 0.4206955046649703, "grad_norm": 105.18151417237235, "learning_rate": 5.757908785310031e-07, "logits/chosen": -0.812483012676239, "logits/rejected": -0.8327686786651611, "logps/chosen": -3.3677561283111572, "logps/rejected": -3.8305165767669678, "loss": 3.089, "rewards/accuracies": 0.78125, "rewards/chosen": -33.67756271362305, "rewards/margins": 4.627603530883789, "rewards/rejected": -38.3051643371582, "step": 186 }, { "epoch": 0.42295730845349166, "grad_norm": 112.39088354266822, "learning_rate": 5.729421322382399e-07, "logits/chosen": -0.8071901202201843, "logits/rejected": -0.8371500372886658, "logps/chosen": -3.244313955307007, "logps/rejected": -3.724259376525879, "loss": 3.1866, "rewards/accuracies": 0.7109375, "rewards/chosen": -32.443138122558594, "rewards/margins": 4.799454212188721, "rewards/rejected": -37.242591857910156, "step": 187 }, { "epoch": 0.42521911224201303, "grad_norm": 99.91697305071902, "learning_rate": 5.700825562243163e-07, "logits/chosen": -0.7996731996536255, "logits/rejected": -0.8074153065681458, "logps/chosen": -3.3295788764953613, "logps/rejected": -3.8264358043670654, "loss": 3.0297, "rewards/accuracies": 0.7421875, "rewards/chosen": -33.29579162597656, "rewards/margins": 4.968564510345459, "rewards/rejected": -38.26435089111328, "step": 188 }, { "epoch": 0.42748091603053434, "grad_norm": 106.43222768263621, "learning_rate": 5.672123295572854e-07, "logits/chosen": -0.8531807661056519, "logits/rejected": -0.8710072636604309, "logps/chosen": -3.4436635971069336, "logps/rejected": -3.7774899005889893, "loss": 3.2074, "rewards/accuracies": 0.734375, "rewards/chosen": -34.43663787841797, "rewards/margins": 3.3382644653320312, "rewards/rejected": -37.774898529052734, "step": 189 }, { "epoch": 0.4297427198190557, "grad_norm": 101.78216988587263, "learning_rate": 5.643316319721487e-07, "logits/chosen": -0.834848940372467, "logits/rejected": -0.8536701798439026, "logps/chosen": -3.5879836082458496, "logps/rejected": -3.99747896194458, "loss": 3.6465, "rewards/accuracies": 0.6953125, "rewards/chosen": -35.87983703613281, "rewards/margins": 4.094951152801514, "rewards/rejected": -39.97479248046875, "step": 190 }, { "epoch": 0.432004523607577, "grad_norm": 102.01291002558317, "learning_rate": 5.614406438596026e-07, "logits/chosen": -0.8791413307189941, "logits/rejected": -0.8761864900588989, "logps/chosen": -3.594583511352539, "logps/rejected": -4.055732727050781, "loss": 3.5126, "rewards/accuracies": 0.7109375, "rewards/chosen": -35.945831298828125, "rewards/margins": 4.611495018005371, "rewards/rejected": -40.55733108520508, "step": 191 }, { "epoch": 0.4342663273960984, "grad_norm": 112.72220193442307, "learning_rate": 5.585395462547406e-07, "logits/chosen": -0.8375272154808044, "logits/rejected": -0.8324666023254395, "logps/chosen": -3.421821117401123, "logps/rejected": -3.717869758605957, "loss": 3.8101, "rewards/accuracies": 0.65625, "rewards/chosen": -34.21821212768555, "rewards/margins": 2.9604828357696533, "rewards/rejected": -37.17869186401367, "step": 192 }, { "epoch": 0.43652813118461975, "grad_norm": 99.3082505357167, "learning_rate": 5.55628520825718e-07, "logits/chosen": -0.908355712890625, "logits/rejected": -0.9252756237983704, "logps/chosen": -3.4431350231170654, "logps/rejected": -3.812532424926758, "loss": 3.5263, "rewards/accuracies": 0.6953125, "rewards/chosen": -34.43135070800781, "rewards/margins": 3.6939783096313477, "rewards/rejected": -38.125328063964844, "step": 193 }, { "epoch": 0.43878993497314106, "grad_norm": 107.13322204576244, "learning_rate": 5.527077498623752e-07, "logits/chosen": -0.8578076958656311, "logits/rejected": -0.8740971088409424, "logps/chosen": -3.3862037658691406, "logps/rejected": -3.792330741882324, "loss": 3.1196, "rewards/accuracies": 0.7734375, "rewards/chosen": -33.862037658691406, "rewards/margins": 4.0612711906433105, "rewards/rejected": -37.92330551147461, "step": 194 }, { "epoch": 0.4410517387616624, "grad_norm": 85.34410402793644, "learning_rate": 5.497774162648228e-07, "logits/chosen": -0.8335473537445068, "logits/rejected": -0.8551528453826904, "logps/chosen": -3.339934825897217, "logps/rejected": -3.8869519233703613, "loss": 3.1318, "rewards/accuracies": 0.75, "rewards/chosen": -33.39934539794922, "rewards/margins": 5.470172882080078, "rewards/rejected": -38.86952209472656, "step": 195 }, { "epoch": 0.4433135425501838, "grad_norm": 112.04047145787284, "learning_rate": 5.468377035319882e-07, "logits/chosen": -0.8870958089828491, "logits/rejected": -0.8841900825500488, "logps/chosen": -3.344312906265259, "logps/rejected": -3.845787763595581, "loss": 3.2742, "rewards/accuracies": 0.6875, "rewards/chosen": -33.44313049316406, "rewards/margins": 5.014750003814697, "rewards/rejected": -38.457881927490234, "step": 196 }, { "epoch": 0.4455753463387051, "grad_norm": 108.93166440182182, "learning_rate": 5.438887957501248e-07, "logits/chosen": -0.7933169603347778, "logits/rejected": -0.7912404537200928, "logps/chosen": -3.3394107818603516, "logps/rejected": -3.764794111251831, "loss": 3.0992, "rewards/accuracies": 0.71875, "rewards/chosen": -33.39411163330078, "rewards/margins": 4.253833293914795, "rewards/rejected": -37.64794158935547, "step": 197 }, { "epoch": 0.44783715012722647, "grad_norm": 132.62161789111477, "learning_rate": 5.409308775812844e-07, "logits/chosen": -0.8376902341842651, "logits/rejected": -0.8406752347946167, "logps/chosen": -3.4705710411071777, "logps/rejected": -3.8878021240234375, "loss": 3.5095, "rewards/accuracies": 0.7265625, "rewards/chosen": -34.705711364746094, "rewards/margins": 4.172308444976807, "rewards/rejected": -38.878021240234375, "step": 198 }, { "epoch": 0.45009895391574783, "grad_norm": 105.66558509816933, "learning_rate": 5.379641342517541e-07, "logits/chosen": -0.8948197960853577, "logits/rejected": -0.8918160200119019, "logps/chosen": -3.276104211807251, "logps/rejected": -3.7821552753448486, "loss": 3.1998, "rewards/accuracies": 0.7578125, "rewards/chosen": -32.76103973388672, "rewards/margins": 5.060507297515869, "rewards/rejected": -37.82155227661133, "step": 199 }, { "epoch": 0.45236075770426915, "grad_norm": 100.99361157251298, "learning_rate": 5.349887515404564e-07, "logits/chosen": -0.8491485714912415, "logits/rejected": -0.8752503991127014, "logps/chosen": -3.4885029792785645, "logps/rejected": -4.05246114730835, "loss": 2.811, "rewards/accuracies": 0.7421875, "rewards/chosen": -34.88502883911133, "rewards/margins": 5.639582633972168, "rewards/rejected": -40.52460479736328, "step": 200 }, { "epoch": 0.4546225614927905, "grad_norm": 113.88837192083922, "learning_rate": 5.320049157673163e-07, "logits/chosen": -0.7907375693321228, "logits/rejected": -0.7869551181793213, "logps/chosen": -3.329808235168457, "logps/rejected": -3.815051794052124, "loss": 3.0112, "rewards/accuracies": 0.75, "rewards/chosen": -33.29808044433594, "rewards/margins": 4.8524346351623535, "rewards/rejected": -38.15052032470703, "step": 201 }, { "epoch": 0.4568843652813118, "grad_norm": 96.00315980556027, "learning_rate": 5.290128137815938e-07, "logits/chosen": -0.8615790009498596, "logits/rejected": -0.8816788792610168, "logps/chosen": -3.5456151962280273, "logps/rejected": -4.082833766937256, "loss": 2.6221, "rewards/accuracies": 0.765625, "rewards/chosen": -35.456146240234375, "rewards/margins": 5.372189044952393, "rewards/rejected": -40.828338623046875, "step": 202 }, { "epoch": 0.4591461690698332, "grad_norm": 88.74384836731605, "learning_rate": 5.260126329501828e-07, "logits/chosen": -0.8821161985397339, "logits/rejected": -0.8808766603469849, "logps/chosen": -3.4488883018493652, "logps/rejected": -4.065739631652832, "loss": 2.526, "rewards/accuracies": 0.796875, "rewards/chosen": -34.48888397216797, "rewards/margins": 6.168512344360352, "rewards/rejected": -40.65739440917969, "step": 203 }, { "epoch": 0.46140797285835455, "grad_norm": 113.38423627891478, "learning_rate": 5.230045611458789e-07, "logits/chosen": -0.8067418932914734, "logits/rejected": -0.8317432403564453, "logps/chosen": -3.4061567783355713, "logps/rejected": -3.852400302886963, "loss": 3.1033, "rewards/accuracies": 0.7265625, "rewards/chosen": -34.06156921386719, "rewards/margins": 4.462434768676758, "rewards/rejected": -38.52400207519531, "step": 204 }, { "epoch": 0.46366977664687586, "grad_norm": 109.078062930805, "learning_rate": 5.199887867356143e-07, "logits/chosen": -0.8303195238113403, "logits/rejected": -0.8461140394210815, "logps/chosen": -3.646005153656006, "logps/rejected": -4.241487979888916, "loss": 2.5447, "rewards/accuracies": 0.765625, "rewards/chosen": -36.46004867553711, "rewards/margins": 5.954832077026367, "rewards/rejected": -42.41488265991211, "step": 205 }, { "epoch": 0.46593158043539723, "grad_norm": 115.30415505519554, "learning_rate": 5.16965498568662e-07, "logits/chosen": -0.8711931705474854, "logits/rejected": -0.8695412278175354, "logps/chosen": -3.7641541957855225, "logps/rejected": -4.470314025878906, "loss": 2.7657, "rewards/accuracies": 0.796875, "rewards/chosen": -37.64154052734375, "rewards/margins": 7.061600685119629, "rewards/rejected": -44.70314025878906, "step": 206 }, { "epoch": 0.4681933842239186, "grad_norm": 112.79356107718269, "learning_rate": 5.139348859648098e-07, "logits/chosen": -0.8668640851974487, "logits/rejected": -0.8753060698509216, "logps/chosen": -3.509500026702881, "logps/rejected": -4.0229034423828125, "loss": 2.9855, "rewards/accuracies": 0.7421875, "rewards/chosen": -35.095001220703125, "rewards/margins": 5.134032249450684, "rewards/rejected": -40.229034423828125, "step": 207 }, { "epoch": 0.4704551880124399, "grad_norm": 117.01804715220312, "learning_rate": 5.10897138702506e-07, "logits/chosen": -0.8137744665145874, "logits/rejected": -0.838422417640686, "logps/chosen": -3.5989084243774414, "logps/rejected": -4.12141227722168, "loss": 3.4055, "rewards/accuracies": 0.71875, "rewards/chosen": -35.98908615112305, "rewards/margins": 5.225040912628174, "rewards/rejected": -41.21411895751953, "step": 208 }, { "epoch": 0.4727169918009613, "grad_norm": 101.49552741213645, "learning_rate": 5.078524470069743e-07, "logits/chosen": -0.9176779985427856, "logits/rejected": -0.9260926246643066, "logps/chosen": -3.756748914718628, "logps/rejected": -4.329287052154541, "loss": 2.4625, "rewards/accuracies": 0.8046875, "rewards/chosen": -37.56748580932617, "rewards/margins": 5.725386142730713, "rewards/rejected": -43.29287338256836, "step": 209 }, { "epoch": 0.47497879558948264, "grad_norm": 103.91381353366985, "learning_rate": 5.048010015383021e-07, "logits/chosen": -0.8263366222381592, "logits/rejected": -0.8194425106048584, "logps/chosen": -3.8313450813293457, "logps/rejected": -4.535330772399902, "loss": 2.4896, "rewards/accuracies": 0.7890625, "rewards/chosen": -38.31344985961914, "rewards/margins": 7.039859294891357, "rewards/rejected": -45.35331344604492, "step": 210 }, { "epoch": 0.47724059937800395, "grad_norm": 93.64016980030927, "learning_rate": 5.01742993379502e-07, "logits/chosen": -0.8458577990531921, "logits/rejected": -0.868080735206604, "logps/chosen": -3.8605237007141113, "logps/rejected": -4.4653801918029785, "loss": 2.6156, "rewards/accuracies": 0.7890625, "rewards/chosen": -38.60523986816406, "rewards/margins": 6.04856538772583, "rewards/rejected": -44.653804779052734, "step": 211 }, { "epoch": 0.4795024031665253, "grad_norm": 127.6076556977002, "learning_rate": 4.986786140245446e-07, "logits/chosen": -0.8188483715057373, "logits/rejected": -0.826935887336731, "logps/chosen": -3.8074846267700195, "logps/rejected": -4.2949419021606445, "loss": 3.2102, "rewards/accuracies": 0.7265625, "rewards/chosen": -38.074851989746094, "rewards/margins": 4.874571323394775, "rewards/rejected": -42.94941711425781, "step": 212 }, { "epoch": 0.4817642069550466, "grad_norm": 147.29228875677396, "learning_rate": 4.956080553663687e-07, "logits/chosen": -0.8854949474334717, "logits/rejected": -0.8917779922485352, "logps/chosen": -3.8558950424194336, "logps/rejected": -4.444454193115234, "loss": 2.994, "rewards/accuracies": 0.796875, "rewards/chosen": -38.55895233154297, "rewards/margins": 5.885589122772217, "rewards/rejected": -44.444541931152344, "step": 213 }, { "epoch": 0.484026010743568, "grad_norm": 136.51813237025374, "learning_rate": 4.925315096848636e-07, "logits/chosen": -0.8785922527313232, "logits/rejected": -0.8924418091773987, "logps/chosen": -4.0408549308776855, "logps/rejected": -4.76306676864624, "loss": 2.8242, "rewards/accuracies": 0.734375, "rewards/chosen": -40.40855407714844, "rewards/margins": 7.2221174240112305, "rewards/rejected": -47.63066864013672, "step": 214 }, { "epoch": 0.48628781453208936, "grad_norm": 110.42259187397326, "learning_rate": 4.894491696348293e-07, "logits/chosen": -0.8856892585754395, "logits/rejected": -0.8893029093742371, "logps/chosen": -3.8507235050201416, "logps/rejected": -4.282730579376221, "loss": 3.3235, "rewards/accuracies": 0.6953125, "rewards/chosen": -38.507232666015625, "rewards/margins": 4.320071220397949, "rewards/rejected": -42.82730484008789, "step": 215 }, { "epoch": 0.48854961832061067, "grad_norm": 112.38972321227853, "learning_rate": 4.863612282339116e-07, "logits/chosen": -0.817990779876709, "logits/rejected": -0.8263007998466492, "logps/chosen": -4.125490188598633, "logps/rejected": -4.6461663246154785, "loss": 3.3821, "rewards/accuracies": 0.6796875, "rewards/chosen": -41.25490188598633, "rewards/margins": 5.206766128540039, "rewards/rejected": -46.461666107177734, "step": 216 }, { "epoch": 0.49081142210913203, "grad_norm": 122.33140314915195, "learning_rate": 4.832678788505161e-07, "logits/chosen": -0.8691527843475342, "logits/rejected": -0.8712851405143738, "logps/chosen": -4.1400322914123535, "logps/rejected": -4.675261497497559, "loss": 3.4518, "rewards/accuracies": 0.6796875, "rewards/chosen": -41.400325775146484, "rewards/margins": 5.352287769317627, "rewards/rejected": -46.75260925292969, "step": 217 }, { "epoch": 0.4930732258976534, "grad_norm": 153.70015543824144, "learning_rate": 4.801693151916985e-07, "logits/chosen": -0.8753068447113037, "logits/rejected": -0.9178647398948669, "logps/chosen": -4.102838516235352, "logps/rejected": -4.627809047698975, "loss": 3.1055, "rewards/accuracies": 0.78125, "rewards/chosen": -41.02838134765625, "rewards/margins": 5.249708652496338, "rewards/rejected": -46.27809143066406, "step": 218 }, { "epoch": 0.4953350296861747, "grad_norm": 110.68812495356474, "learning_rate": 4.770657312910354e-07, "logits/chosen": -0.8981303572654724, "logits/rejected": -0.915514349937439, "logps/chosen": -4.165809631347656, "logps/rejected": -4.6610212326049805, "loss": 3.376, "rewards/accuracies": 0.703125, "rewards/chosen": -41.65810012817383, "rewards/margins": 4.952118396759033, "rewards/rejected": -46.6102180480957, "step": 219 }, { "epoch": 0.4975968334746961, "grad_norm": 116.58607540402677, "learning_rate": 4.739573214964729e-07, "logits/chosen": -0.8669706583023071, "logits/rejected": -0.8741896748542786, "logps/chosen": -3.9599337577819824, "logps/rejected": -4.534191608428955, "loss": 2.7624, "rewards/accuracies": 0.7734375, "rewards/chosen": -39.599342346191406, "rewards/margins": 5.742575645446777, "rewards/rejected": -45.341915130615234, "step": 220 }, { "epoch": 0.49985863726321744, "grad_norm": 138.68860730884543, "learning_rate": 4.7084428045815733e-07, "logits/chosen": -0.8756369948387146, "logits/rejected": -0.8829125165939331, "logps/chosen": -4.227509498596191, "logps/rejected": -4.716983795166016, "loss": 3.2761, "rewards/accuracies": 0.7265625, "rewards/chosen": -42.27509307861328, "rewards/margins": 4.894742012023926, "rewards/rejected": -47.16983413696289, "step": 221 }, { "epoch": 0.5021204410517388, "grad_norm": 184.1037285692299, "learning_rate": 4.677268031162457e-07, "logits/chosen": -0.896783709526062, "logits/rejected": -0.9043738842010498, "logps/chosen": -4.029943943023682, "logps/rejected": -4.505390644073486, "loss": 3.5501, "rewards/accuracies": 0.6953125, "rewards/chosen": -40.299442291259766, "rewards/margins": 4.7544660568237305, "rewards/rejected": -45.05391311645508, "step": 222 }, { "epoch": 0.5043822448402601, "grad_norm": 112.45603200436628, "learning_rate": 4.646050846886985e-07, "logits/chosen": -0.8041797876358032, "logits/rejected": -0.8242363929748535, "logps/chosen": -3.881974220275879, "logps/rejected": -4.4934492111206055, "loss": 2.6607, "rewards/accuracies": 0.796875, "rewards/chosen": -38.81974411010742, "rewards/margins": 6.114748954772949, "rewards/rejected": -44.93449401855469, "step": 223 }, { "epoch": 0.5066440486287814, "grad_norm": 134.00364181034922, "learning_rate": 4.6147932065905494e-07, "logits/chosen": -0.867178738117218, "logits/rejected": -0.8653546571731567, "logps/chosen": -4.144719123840332, "logps/rejected": -4.623917102813721, "loss": 3.5008, "rewards/accuracies": 0.7265625, "rewards/chosen": -41.44718933105469, "rewards/margins": 4.791982650756836, "rewards/rejected": -46.239173889160156, "step": 224 }, { "epoch": 0.5089058524173028, "grad_norm": 114.91237964387022, "learning_rate": 4.5834970676419214e-07, "logits/chosen": -0.8645190000534058, "logits/rejected": -0.8773024082183838, "logps/chosen": -3.982334613800049, "logps/rejected": -4.518105983734131, "loss": 3.0231, "rewards/accuracies": 0.7421875, "rewards/chosen": -39.82334899902344, "rewards/margins": 5.357712268829346, "rewards/rejected": -45.181060791015625, "step": 225 }, { "epoch": 0.5111676562058242, "grad_norm": 154.2435864953568, "learning_rate": 4.552164389820673e-07, "logits/chosen": -0.7863515615463257, "logits/rejected": -0.804935097694397, "logps/chosen": -4.0218400955200195, "logps/rejected": -4.741469383239746, "loss": 2.7939, "rewards/accuracies": 0.8203125, "rewards/chosen": -40.218406677246094, "rewards/margins": 7.1962890625, "rewards/rejected": -47.414695739746094, "step": 226 }, { "epoch": 0.5134294599943455, "grad_norm": 129.512345171623, "learning_rate": 4.5207971351944605e-07, "logits/chosen": -0.904701828956604, "logits/rejected": -0.9039019346237183, "logps/chosen": -3.990769386291504, "logps/rejected": -4.5699143409729, "loss": 3.5426, "rewards/accuracies": 0.71875, "rewards/chosen": -39.90769577026367, "rewards/margins": 5.791450500488281, "rewards/rejected": -45.69914245605469, "step": 227 }, { "epoch": 0.5156912637828668, "grad_norm": 127.45769396653864, "learning_rate": 4.489397267996157e-07, "logits/chosen": -0.8994483351707458, "logits/rejected": -0.8912683725357056, "logps/chosen": -3.946481466293335, "logps/rejected": -4.470279693603516, "loss": 2.9926, "rewards/accuracies": 0.7421875, "rewards/chosen": -39.464813232421875, "rewards/margins": 5.237981796264648, "rewards/rejected": -44.702796936035156, "step": 228 }, { "epoch": 0.5179530675713881, "grad_norm": 111.39180557968587, "learning_rate": 4.45796675450085e-07, "logits/chosen": -0.8582149744033813, "logits/rejected": -0.8690947890281677, "logps/chosen": -3.8885929584503174, "logps/rejected": -4.494987964630127, "loss": 2.8133, "rewards/accuracies": 0.7265625, "rewards/chosen": -38.88593292236328, "rewards/margins": 6.063946723937988, "rewards/rejected": -44.94988250732422, "step": 229 }, { "epoch": 0.5202148713599095, "grad_norm": 135.02910775325827, "learning_rate": 4.4265075629027126e-07, "logits/chosen": -0.8169862031936646, "logits/rejected": -0.8348796963691711, "logps/chosen": -4.078422546386719, "logps/rejected": -4.564748764038086, "loss": 2.8027, "rewards/accuracies": 0.8203125, "rewards/chosen": -40.78423309326172, "rewards/margins": 4.863255977630615, "rewards/rejected": -45.64748764038086, "step": 230 }, { "epoch": 0.5224766751484309, "grad_norm": 110.51829137339959, "learning_rate": 4.3950216631917563e-07, "logits/chosen": -0.888090193271637, "logits/rejected": -0.9090730547904968, "logps/chosen": -3.9436981678009033, "logps/rejected": -4.6157708168029785, "loss": 2.6358, "rewards/accuracies": 0.7734375, "rewards/chosen": -39.43698501586914, "rewards/margins": 6.720722198486328, "rewards/rejected": -46.15770721435547, "step": 231 }, { "epoch": 0.5247384789369522, "grad_norm": 111.61538092823348, "learning_rate": 4.3635110270304676e-07, "logits/chosen": -0.8641917109489441, "logits/rejected": -0.8716497421264648, "logps/chosen": -3.86027193069458, "logps/rejected": -4.502015113830566, "loss": 2.2291, "rewards/accuracies": 0.828125, "rewards/chosen": -38.602718353271484, "rewards/margins": 6.4174323081970215, "rewards/rejected": -45.02014923095703, "step": 232 }, { "epoch": 0.5270002827254736, "grad_norm": 173.35008588152775, "learning_rate": 4.331977627630339e-07, "logits/chosen": -0.8097434043884277, "logits/rejected": -0.8040153980255127, "logps/chosen": -3.757528305053711, "logps/rejected": -4.439907550811768, "loss": 2.6864, "rewards/accuracies": 0.796875, "rewards/chosen": -37.57528305053711, "rewards/margins": 6.823795318603516, "rewards/rejected": -44.399078369140625, "step": 233 }, { "epoch": 0.5292620865139949, "grad_norm": 120.90157841350384, "learning_rate": 4.300423439628313e-07, "logits/chosen": -0.8537578582763672, "logits/rejected": -0.8780308365821838, "logps/chosen": -3.8302276134490967, "logps/rejected": -4.468556880950928, "loss": 2.498, "rewards/accuracies": 0.796875, "rewards/chosen": -38.30228042602539, "rewards/margins": 6.383289813995361, "rewards/rejected": -44.685569763183594, "step": 234 }, { "epoch": 0.5315238903025162, "grad_norm": 118.3536030698132, "learning_rate": 4.268850438963118e-07, "logits/chosen": -0.8823138475418091, "logits/rejected": -0.9043455719947815, "logps/chosen": -4.102262020111084, "logps/rejected": -4.674637794494629, "loss": 2.7324, "rewards/accuracies": 0.7421875, "rewards/chosen": -41.02262496948242, "rewards/margins": 5.723756313323975, "rewards/rejected": -46.746376037597656, "step": 235 }, { "epoch": 0.5337856940910376, "grad_norm": 116.93696869949373, "learning_rate": 4.2372606027515463e-07, "logits/chosen": -0.8339194655418396, "logits/rejected": -0.8445159196853638, "logps/chosen": -3.7381174564361572, "logps/rejected": -4.291147708892822, "loss": 2.9013, "rewards/accuracies": 0.7890625, "rewards/chosen": -37.38117599487305, "rewards/margins": 5.530303478240967, "rewards/rejected": -42.911476135253906, "step": 236 }, { "epoch": 0.536047497879559, "grad_norm": 148.0693344568736, "learning_rate": 4.2056559091646387e-07, "logits/chosen": -0.8702591061592102, "logits/rejected": -0.898255467414856, "logps/chosen": -4.004217624664307, "logps/rejected": -4.487666130065918, "loss": 3.3504, "rewards/accuracies": 0.7109375, "rewards/chosen": -40.042179107666016, "rewards/margins": 4.834486484527588, "rewards/rejected": -44.87666702270508, "step": 237 }, { "epoch": 0.5383093016680803, "grad_norm": 136.83734710023003, "learning_rate": 4.1740383373038116e-07, "logits/chosen": -0.8536058664321899, "logits/rejected": -0.8870092630386353, "logps/chosen": -3.817162036895752, "logps/rejected": -4.381956100463867, "loss": 2.9761, "rewards/accuracies": 0.765625, "rewards/chosen": -38.1716194152832, "rewards/margins": 5.647944450378418, "rewards/rejected": -43.81956481933594, "step": 238 }, { "epoch": 0.5405711054566016, "grad_norm": 116.45504096009955, "learning_rate": 4.1424098670769255e-07, "logits/chosen": -0.9009624719619751, "logits/rejected": -0.9262585639953613, "logps/chosen": -3.884793281555176, "logps/rejected": -4.33213996887207, "loss": 3.0779, "rewards/accuracies": 0.734375, "rewards/chosen": -38.84792709350586, "rewards/margins": 4.473471164703369, "rewards/rejected": -43.3213996887207, "step": 239 }, { "epoch": 0.542832909245123, "grad_norm": 114.67603070092983, "learning_rate": 4.1107724790743007e-07, "logits/chosen": -0.8459216356277466, "logits/rejected": -0.8754346966743469, "logps/chosen": -3.9210426807403564, "logps/rejected": -4.424591064453125, "loss": 2.7487, "rewards/accuracies": 0.8046875, "rewards/chosen": -39.210426330566406, "rewards/margins": 5.035484313964844, "rewards/rejected": -44.245906829833984, "step": 240 }, { "epoch": 0.5450947130336443, "grad_norm": 159.44800208061523, "learning_rate": 4.0791281544446947e-07, "logits/chosen": -0.8924515843391418, "logits/rejected": -0.8807788491249084, "logps/chosen": -3.9503896236419678, "logps/rejected": -4.500914573669434, "loss": 2.7043, "rewards/accuracies": 0.7890625, "rewards/chosen": -39.50389099121094, "rewards/margins": 5.505251884460449, "rewards/rejected": -45.00914764404297, "step": 241 }, { "epoch": 0.5473565168221657, "grad_norm": 128.53086499808066, "learning_rate": 4.0474788747712416e-07, "logits/chosen": -0.8996694684028625, "logits/rejected": -0.9028959274291992, "logps/chosen": -3.932866096496582, "logps/rejected": -4.37875509262085, "loss": 3.589, "rewards/accuracies": 0.703125, "rewards/chosen": -39.32866287231445, "rewards/margins": 4.458887100219727, "rewards/rejected": -43.78754806518555, "step": 242 }, { "epoch": 0.549618320610687, "grad_norm": 104.57144676128537, "learning_rate": 4.0158266219473573e-07, "logits/chosen": -0.8719525933265686, "logits/rejected": -0.8880172371864319, "logps/chosen": -3.7240490913391113, "logps/rejected": -4.320034503936768, "loss": 2.5879, "rewards/accuracies": 0.8046875, "rewards/chosen": -37.24049377441406, "rewards/margins": 5.959850311279297, "rewards/rejected": -43.20034408569336, "step": 243 }, { "epoch": 0.5518801243992084, "grad_norm": 121.72780985174033, "learning_rate": 3.984173378052643e-07, "logits/chosen": -0.8488632440567017, "logits/rejected": -0.8424826264381409, "logps/chosen": -3.662327289581299, "logps/rejected": -4.250753879547119, "loss": 2.558, "rewards/accuracies": 0.7734375, "rewards/chosen": -36.623268127441406, "rewards/margins": 5.884267807006836, "rewards/rejected": -42.507537841796875, "step": 244 }, { "epoch": 0.5541419281877297, "grad_norm": 176.66908023998735, "learning_rate": 3.9525211252287585e-07, "logits/chosen": -0.9206802248954773, "logits/rejected": -0.938546895980835, "logps/chosen": -3.8519883155822754, "logps/rejected": -4.533115386962891, "loss": 3.0243, "rewards/accuracies": 0.734375, "rewards/chosen": -38.51988220214844, "rewards/margins": 6.811273574829102, "rewards/rejected": -45.331153869628906, "step": 245 }, { "epoch": 0.556403731976251, "grad_norm": 108.79709814447062, "learning_rate": 3.920871845555305e-07, "logits/chosen": -0.8708853721618652, "logits/rejected": -0.8730578422546387, "logps/chosen": -3.832918882369995, "logps/rejected": -4.335785865783691, "loss": 2.5306, "rewards/accuracies": 0.78125, "rewards/chosen": -38.32918930053711, "rewards/margins": 5.0286712646484375, "rewards/rejected": -43.35785675048828, "step": 246 }, { "epoch": 0.5586655357647724, "grad_norm": 127.09481932058374, "learning_rate": 3.8892275209256984e-07, "logits/chosen": -0.921829342842102, "logits/rejected": -0.9111767411231995, "logps/chosen": -3.9879932403564453, "logps/rejected": -4.458497047424316, "loss": 2.9165, "rewards/accuracies": 0.765625, "rewards/chosen": -39.87993240356445, "rewards/margins": 4.705035209655762, "rewards/rejected": -44.58496856689453, "step": 247 }, { "epoch": 0.5609273395532938, "grad_norm": 115.9590799116695, "learning_rate": 3.8575901329230747e-07, "logits/chosen": -0.8582264184951782, "logits/rejected": -0.8617616891860962, "logps/chosen": -3.9318342208862305, "logps/rejected": -4.470717430114746, "loss": 3.0595, "rewards/accuracies": 0.7734375, "rewards/chosen": -39.31834411621094, "rewards/margins": 5.388828277587891, "rewards/rejected": -44.707176208496094, "step": 248 }, { "epoch": 0.5631891433418151, "grad_norm": 128.3493722347937, "learning_rate": 3.8259616626961886e-07, "logits/chosen": -0.8590461015701294, "logits/rejected": -0.8726236820220947, "logps/chosen": -3.7069010734558105, "logps/rejected": -4.159891128540039, "loss": 2.6582, "rewards/accuracies": 0.8125, "rewards/chosen": -37.06901550292969, "rewards/margins": 4.529898643493652, "rewards/rejected": -41.59891128540039, "step": 249 }, { "epoch": 0.5654509471303364, "grad_norm": 114.11597318565974, "learning_rate": 3.794344090835362e-07, "logits/chosen": -0.8618912100791931, "logits/rejected": -0.8814125061035156, "logps/chosen": -4.0398969650268555, "logps/rejected": -4.569504737854004, "loss": 2.945, "rewards/accuracies": 0.7734375, "rewards/chosen": -40.39897155761719, "rewards/margins": 5.2960710525512695, "rewards/rejected": -45.695045471191406, "step": 250 }, { "epoch": 0.5677127509188578, "grad_norm": 132.84283803977144, "learning_rate": 3.7627393972484534e-07, "logits/chosen": -0.9538972973823547, "logits/rejected": -0.961841881275177, "logps/chosen": -3.9805994033813477, "logps/rejected": -4.373291969299316, "loss": 3.459, "rewards/accuracies": 0.734375, "rewards/chosen": -39.805992126464844, "rewards/margins": 3.9269251823425293, "rewards/rejected": -43.73291778564453, "step": 251 }, { "epoch": 0.5699745547073791, "grad_norm": 116.59079088621297, "learning_rate": 3.7311495610368823e-07, "logits/chosen": -0.9467366933822632, "logits/rejected": -0.9687215089797974, "logps/chosen": -4.043094158172607, "logps/rejected": -4.563295364379883, "loss": 3.0119, "rewards/accuracies": 0.7578125, "rewards/chosen": -40.430938720703125, "rewards/margins": 5.202012538909912, "rewards/rejected": -45.63295364379883, "step": 252 }, { "epoch": 0.5722363584959005, "grad_norm": 120.59136322495354, "learning_rate": 3.699576560371689e-07, "logits/chosen": -0.8889734745025635, "logits/rejected": -0.9066051244735718, "logps/chosen": -4.205962181091309, "logps/rejected": -4.996251106262207, "loss": 2.315, "rewards/accuracies": 0.7890625, "rewards/chosen": -42.05961990356445, "rewards/margins": 7.902889251708984, "rewards/rejected": -49.96250915527344, "step": 253 }, { "epoch": 0.5744981622844219, "grad_norm": 124.59506533641044, "learning_rate": 3.66802237236966e-07, "logits/chosen": -0.8749493956565857, "logits/rejected": -0.8885746002197266, "logps/chosen": -4.300434112548828, "logps/rejected": -4.866487503051758, "loss": 2.9432, "rewards/accuracies": 0.7734375, "rewards/chosen": -43.00434112548828, "rewards/margins": 5.66053581237793, "rewards/rejected": -48.664878845214844, "step": 254 }, { "epoch": 0.5767599660729432, "grad_norm": 118.16852020588865, "learning_rate": 3.636488972969532e-07, "logits/chosen": -0.9017617702484131, "logits/rejected": -0.9131591320037842, "logps/chosen": -4.101204872131348, "logps/rejected": -4.681705474853516, "loss": 2.6388, "rewards/accuracies": 0.765625, "rewards/chosen": -41.012046813964844, "rewards/margins": 5.80500602722168, "rewards/rejected": -46.817054748535156, "step": 255 }, { "epoch": 0.5790217698614645, "grad_norm": 120.89498541698326, "learning_rate": 3.604978336808244e-07, "logits/chosen": -0.9948743581771851, "logits/rejected": -1.0087530612945557, "logps/chosen": -4.0368547439575195, "logps/rejected": -4.631007194519043, "loss": 2.8043, "rewards/accuracies": 0.7734375, "rewards/chosen": -40.36854934692383, "rewards/margins": 5.941521644592285, "rewards/rejected": -46.31007385253906, "step": 256 }, { "epoch": 0.5812835736499858, "grad_norm": 124.83239622588137, "learning_rate": 3.5734924370972876e-07, "logits/chosen": -0.9356947541236877, "logits/rejected": -0.9560631513595581, "logps/chosen": -4.138307571411133, "logps/rejected": -4.669035911560059, "loss": 3.0658, "rewards/accuracies": 0.75, "rewards/chosen": -41.38307189941406, "rewards/margins": 5.307290554046631, "rewards/rejected": -46.69036102294922, "step": 257 }, { "epoch": 0.5835453774385072, "grad_norm": 138.50523829837766, "learning_rate": 3.5420332454991504e-07, "logits/chosen": -0.8820909261703491, "logits/rejected": -0.8897730112075806, "logps/chosen": -4.207208633422852, "logps/rejected": -4.809183597564697, "loss": 2.7913, "rewards/accuracies": 0.765625, "rewards/chosen": -42.07209014892578, "rewards/margins": 6.019748210906982, "rewards/rejected": -48.091835021972656, "step": 258 }, { "epoch": 0.5858071812270286, "grad_norm": 110.39897739040883, "learning_rate": 3.510602732003843e-07, "logits/chosen": -0.9389081597328186, "logits/rejected": -0.9693293571472168, "logps/chosen": -4.315252780914307, "logps/rejected": -4.980816841125488, "loss": 2.5721, "rewards/accuracies": 0.8203125, "rewards/chosen": -43.152530670166016, "rewards/margins": 6.655643463134766, "rewards/rejected": -49.80816650390625, "step": 259 }, { "epoch": 0.5880689850155499, "grad_norm": 125.09334065686004, "learning_rate": 3.4792028648055396e-07, "logits/chosen": -0.8979041576385498, "logits/rejected": -0.9275961518287659, "logps/chosen": -4.101990699768066, "logps/rejected": -4.697176933288574, "loss": 2.7902, "rewards/accuracies": 0.7890625, "rewards/chosen": -41.0199089050293, "rewards/margins": 5.951866149902344, "rewards/rejected": -46.971771240234375, "step": 260 }, { "epoch": 0.5903307888040712, "grad_norm": 117.68527045074552, "learning_rate": 3.447835610179327e-07, "logits/chosen": -0.8862229585647583, "logits/rejected": -0.899125337600708, "logps/chosen": -4.087460994720459, "logps/rejected": -4.825685501098633, "loss": 2.6479, "rewards/accuracies": 0.8046875, "rewards/chosen": -40.874610900878906, "rewards/margins": 7.382248878479004, "rewards/rejected": -48.25685501098633, "step": 261 }, { "epoch": 0.5925925925925926, "grad_norm": 139.3400014797519, "learning_rate": 3.416502932358079e-07, "logits/chosen": -0.9597766399383545, "logits/rejected": -0.9774207472801208, "logps/chosen": -4.404236316680908, "logps/rejected": -4.83123254776001, "loss": 3.1974, "rewards/accuracies": 0.7265625, "rewards/chosen": -44.04236602783203, "rewards/margins": 4.269958019256592, "rewards/rejected": -48.31232452392578, "step": 262 }, { "epoch": 0.5948543963811139, "grad_norm": 150.23635516979436, "learning_rate": 3.385206793409451e-07, "logits/chosen": -0.8739109039306641, "logits/rejected": -0.8933315873146057, "logps/chosen": -3.9273197650909424, "logps/rejected": -4.468226432800293, "loss": 2.7657, "rewards/accuracies": 0.8046875, "rewards/chosen": -39.273197174072266, "rewards/margins": 5.4090657234191895, "rewards/rejected": -44.68226623535156, "step": 263 }, { "epoch": 0.5971162001696353, "grad_norm": 136.57829357030124, "learning_rate": 3.3539491531130163e-07, "logits/chosen": -0.9004377722740173, "logits/rejected": -0.9124334454536438, "logps/chosen": -4.114037036895752, "logps/rejected": -4.8097944259643555, "loss": 2.6883, "rewards/accuracies": 0.828125, "rewards/chosen": -41.14036560058594, "rewards/margins": 6.95757532119751, "rewards/rejected": -48.09794235229492, "step": 264 }, { "epoch": 0.5993780039581567, "grad_norm": 134.6857649306004, "learning_rate": 3.3227319688375426e-07, "logits/chosen": -0.9540138840675354, "logits/rejected": -0.9450901746749878, "logps/chosen": -4.0871100425720215, "logps/rejected": -4.629415035247803, "loss": 2.8724, "rewards/accuracies": 0.8046875, "rewards/chosen": -40.8711051940918, "rewards/margins": 5.423047065734863, "rewards/rejected": -46.294151306152344, "step": 265 }, { "epoch": 0.601639807746678, "grad_norm": 137.41581633964907, "learning_rate": 3.291557195418427e-07, "logits/chosen": -0.9520595073699951, "logits/rejected": -0.9495226144790649, "logps/chosen": -3.8760974407196045, "logps/rejected": -4.302947044372559, "loss": 3.2724, "rewards/accuracies": 0.7421875, "rewards/chosen": -38.76097869873047, "rewards/margins": 4.268494606018066, "rewards/rejected": -43.02947235107422, "step": 266 }, { "epoch": 0.6039016115351993, "grad_norm": 144.7995816554034, "learning_rate": 3.260426785035272e-07, "logits/chosen": -0.9223634004592896, "logits/rejected": -0.9228293895721436, "logps/chosen": -3.9227046966552734, "logps/rejected": -4.469911098480225, "loss": 3.3138, "rewards/accuracies": 0.765625, "rewards/chosen": -39.227046966552734, "rewards/margins": 5.472067356109619, "rewards/rejected": -44.6991081237793, "step": 267 }, { "epoch": 0.6061634153237206, "grad_norm": 108.29473766754211, "learning_rate": 3.229342687089646e-07, "logits/chosen": -0.9119688272476196, "logits/rejected": -0.915400505065918, "logps/chosen": -3.8088831901550293, "logps/rejected": -4.450516223907471, "loss": 2.4994, "rewards/accuracies": 0.796875, "rewards/chosen": -38.08883285522461, "rewards/margins": 6.416332244873047, "rewards/rejected": -44.505165100097656, "step": 268 }, { "epoch": 0.608425219112242, "grad_norm": 114.65522312241735, "learning_rate": 3.1983068480830143e-07, "logits/chosen": -0.9089800715446472, "logits/rejected": -0.9124254584312439, "logps/chosen": -3.8074193000793457, "logps/rejected": -4.448195934295654, "loss": 2.7265, "rewards/accuracies": 0.7578125, "rewards/chosen": -38.074195861816406, "rewards/margins": 6.407771110534668, "rewards/rejected": -44.481964111328125, "step": 269 }, { "epoch": 0.6106870229007634, "grad_norm": 114.37970020373666, "learning_rate": 3.1673212114948387e-07, "logits/chosen": -0.8962373733520508, "logits/rejected": -0.8947957158088684, "logps/chosen": -3.66055965423584, "logps/rejected": -4.304127216339111, "loss": 2.4988, "rewards/accuracies": 0.796875, "rewards/chosen": -36.60559844970703, "rewards/margins": 6.435674667358398, "rewards/rejected": -43.0412712097168, "step": 270 }, { "epoch": 0.6129488266892847, "grad_norm": 131.22698552322544, "learning_rate": 3.1363877176608845e-07, "logits/chosen": -0.8634111285209656, "logits/rejected": -0.8854045271873474, "logps/chosen": -3.583430290222168, "logps/rejected": -4.167083740234375, "loss": 2.9822, "rewards/accuracies": 0.7421875, "rewards/chosen": -35.83430099487305, "rewards/margins": 5.836535930633545, "rewards/rejected": -41.67083740234375, "step": 271 }, { "epoch": 0.615210630477806, "grad_norm": 118.13378200287389, "learning_rate": 3.1055083036517076e-07, "logits/chosen": -0.8832507133483887, "logits/rejected": -0.8600270748138428, "logps/chosen": -3.605767011642456, "logps/rejected": -4.242787837982178, "loss": 2.5501, "rewards/accuracies": 0.8515625, "rewards/chosen": -36.05766677856445, "rewards/margins": 6.370209217071533, "rewards/rejected": -42.427879333496094, "step": 272 }, { "epoch": 0.6174724342663274, "grad_norm": 150.22493651179883, "learning_rate": 3.074684903151364e-07, "logits/chosen": -0.7782445549964905, "logits/rejected": -0.7658709287643433, "logps/chosen": -3.3246827125549316, "logps/rejected": -3.7869513034820557, "loss": 2.6785, "rewards/accuracies": 0.75, "rewards/chosen": -33.246826171875, "rewards/margins": 4.622686386108398, "rewards/rejected": -37.86951446533203, "step": 273 }, { "epoch": 0.6197342380548487, "grad_norm": 124.8554523177313, "learning_rate": 3.0439194463363136e-07, "logits/chosen": -0.8569799065589905, "logits/rejected": -0.8484780192375183, "logps/chosen": -3.430828094482422, "logps/rejected": -3.913545608520508, "loss": 3.1282, "rewards/accuracies": 0.7265625, "rewards/chosen": -34.30828094482422, "rewards/margins": 4.827174186706543, "rewards/rejected": -39.13545608520508, "step": 274 }, { "epoch": 0.6219960418433701, "grad_norm": 105.37259501282527, "learning_rate": 3.0132138597545537e-07, "logits/chosen": -0.8960826396942139, "logits/rejected": -0.935992956161499, "logps/chosen": -3.6094119548797607, "logps/rejected": -4.329087734222412, "loss": 2.4253, "rewards/accuracies": 0.7734375, "rewards/chosen": -36.094120025634766, "rewards/margins": 7.196761608123779, "rewards/rejected": -43.29087829589844, "step": 275 }, { "epoch": 0.6242578456318915, "grad_norm": 102.48158570587563, "learning_rate": 2.982570066204981e-07, "logits/chosen": -0.8868385553359985, "logits/rejected": -0.8976235389709473, "logps/chosen": -3.52752685546875, "logps/rejected": -4.042696952819824, "loss": 2.8358, "rewards/accuracies": 0.7734375, "rewards/chosen": -35.275264739990234, "rewards/margins": 5.151702880859375, "rewards/rejected": -40.426971435546875, "step": 276 }, { "epoch": 0.6265196494204128, "grad_norm": 152.73891214182655, "learning_rate": 2.951989984616979e-07, "logits/chosen": -0.8241417407989502, "logits/rejected": -0.8495975136756897, "logps/chosen": -3.6896445751190186, "logps/rejected": -4.32491397857666, "loss": 3.0859, "rewards/accuracies": 0.7421875, "rewards/chosen": -36.89644241333008, "rewards/margins": 6.352697849273682, "rewards/rejected": -43.249141693115234, "step": 277 }, { "epoch": 0.6287814532089341, "grad_norm": 112.80204422900222, "learning_rate": 2.9214755299302584e-07, "logits/chosen": -0.8538424968719482, "logits/rejected": -0.8619410395622253, "logps/chosen": -3.8360202312469482, "logps/rejected": -4.50319766998291, "loss": 2.0849, "rewards/accuracies": 0.8671875, "rewards/chosen": -38.36020278930664, "rewards/margins": 6.671772480010986, "rewards/rejected": -45.03197479248047, "step": 278 }, { "epoch": 0.6310432569974554, "grad_norm": 128.85380711403621, "learning_rate": 2.89102861297494e-07, "logits/chosen": -0.8916823863983154, "logits/rejected": -0.9155115485191345, "logps/chosen": -3.7095577716827393, "logps/rejected": -4.224562168121338, "loss": 3.36, "rewards/accuracies": 0.75, "rewards/chosen": -37.095577239990234, "rewards/margins": 5.150045394897461, "rewards/rejected": -42.24562072753906, "step": 279 }, { "epoch": 0.6333050607859768, "grad_norm": 164.97262791305909, "learning_rate": 2.860651140351902e-07, "logits/chosen": -0.887188196182251, "logits/rejected": -0.8843110799789429, "logps/chosen": -3.7880616188049316, "logps/rejected": -4.440821647644043, "loss": 2.8817, "rewards/accuracies": 0.765625, "rewards/chosen": -37.880611419677734, "rewards/margins": 6.5276007652282715, "rewards/rejected": -44.40821838378906, "step": 280 }, { "epoch": 0.6355668645744982, "grad_norm": 120.88646729377493, "learning_rate": 2.830345014313381e-07, "logits/chosen": -0.828898549079895, "logits/rejected": -0.8654926419258118, "logps/chosen": -3.848175525665283, "logps/rejected": -4.498141288757324, "loss": 2.3091, "rewards/accuracies": 0.765625, "rewards/chosen": -38.48175811767578, "rewards/margins": 6.499655246734619, "rewards/rejected": -44.981414794921875, "step": 281 }, { "epoch": 0.6378286683630195, "grad_norm": 138.22518292554588, "learning_rate": 2.800112132643856e-07, "logits/chosen": -0.8729172348976135, "logits/rejected": -0.8878234028816223, "logps/chosen": -3.8589096069335938, "logps/rejected": -4.514438629150391, "loss": 2.7701, "rewards/accuracies": 0.7734375, "rewards/chosen": -38.58909225463867, "rewards/margins": 6.5552897453308105, "rewards/rejected": -45.144378662109375, "step": 282 }, { "epoch": 0.6400904721515408, "grad_norm": 123.6765508241198, "learning_rate": 2.7699543885412105e-07, "logits/chosen": -0.8810731768608093, "logits/rejected": -0.8965428471565247, "logps/chosen": -3.979841947555542, "logps/rejected": -4.676267623901367, "loss": 2.5296, "rewards/accuracies": 0.8046875, "rewards/chosen": -39.79841995239258, "rewards/margins": 6.96425724029541, "rewards/rejected": -46.762672424316406, "step": 283 }, { "epoch": 0.6423522759400622, "grad_norm": 134.34352209400595, "learning_rate": 2.7398736704981725e-07, "logits/chosen": -0.8905003070831299, "logits/rejected": -0.8742426037788391, "logps/chosen": -4.015549182891846, "logps/rejected": -4.602110385894775, "loss": 2.6926, "rewards/accuracies": 0.7734375, "rewards/chosen": -40.155487060546875, "rewards/margins": 5.865612030029297, "rewards/rejected": -46.0211067199707, "step": 284 }, { "epoch": 0.6446140797285835, "grad_norm": 125.55351312091291, "learning_rate": 2.709871862184063e-07, "logits/chosen": -0.8608399629592896, "logits/rejected": -0.8779529929161072, "logps/chosen": -3.936886787414551, "logps/rejected": -4.538551330566406, "loss": 3.1564, "rewards/accuracies": 0.7578125, "rewards/chosen": -39.36886215209961, "rewards/margins": 6.016650199890137, "rewards/rejected": -45.38551712036133, "step": 285 }, { "epoch": 0.6468758835171049, "grad_norm": 116.6462838115517, "learning_rate": 2.679950842326837e-07, "logits/chosen": -0.9049277901649475, "logits/rejected": -0.9126715064048767, "logps/chosen": -4.160530090332031, "logps/rejected": -4.841786861419678, "loss": 2.5103, "rewards/accuracies": 0.75, "rewards/chosen": -41.60530471801758, "rewards/margins": 6.812563896179199, "rewards/rejected": -48.41786193847656, "step": 286 }, { "epoch": 0.6491376873056263, "grad_norm": 125.7388754846288, "learning_rate": 2.6501124845954363e-07, "logits/chosen": -0.8767872452735901, "logits/rejected": -0.8922024965286255, "logps/chosen": -4.103570461273193, "logps/rejected": -4.829570770263672, "loss": 2.3212, "rewards/accuracies": 0.8046875, "rewards/chosen": -41.03570556640625, "rewards/margins": 7.260003089904785, "rewards/rejected": -48.29570388793945, "step": 287 }, { "epoch": 0.6513994910941476, "grad_norm": 123.45145960508628, "learning_rate": 2.62035865748246e-07, "logits/chosen": -0.8526559472084045, "logits/rejected": -0.865902304649353, "logps/chosen": -3.979776382446289, "logps/rejected": -4.576243877410889, "loss": 2.8045, "rewards/accuracies": 0.7734375, "rewards/chosen": -39.797767639160156, "rewards/margins": 5.964676856994629, "rewards/rejected": -45.76243591308594, "step": 288 }, { "epoch": 0.6536612948826689, "grad_norm": 165.09853723797895, "learning_rate": 2.5906912241871554e-07, "logits/chosen": -0.932748019695282, "logits/rejected": -0.9441463947296143, "logps/chosen": -4.250375270843506, "logps/rejected": -4.883494853973389, "loss": 2.7687, "rewards/accuracies": 0.765625, "rewards/chosen": -42.503753662109375, "rewards/margins": 6.331197738647461, "rewards/rejected": -48.83495330810547, "step": 289 }, { "epoch": 0.6559230986711903, "grad_norm": 136.63193628571327, "learning_rate": 2.561112042498753e-07, "logits/chosen": -0.8068567514419556, "logits/rejected": -0.8377366065979004, "logps/chosen": -3.9496514797210693, "logps/rejected": -4.45440673828125, "loss": 3.4096, "rewards/accuracies": 0.71875, "rewards/chosen": -39.49651336669922, "rewards/margins": 5.047552108764648, "rewards/rejected": -44.5440673828125, "step": 290 }, { "epoch": 0.6581849024597116, "grad_norm": 143.97363440746773, "learning_rate": 2.5316229646801195e-07, "logits/chosen": -0.8525142073631287, "logits/rejected": -0.8822568655014038, "logps/chosen": -4.472620010375977, "logps/rejected": -5.035334587097168, "loss": 2.9599, "rewards/accuracies": 0.765625, "rewards/chosen": -44.726200103759766, "rewards/margins": 5.6271467208862305, "rewards/rejected": -50.35334777832031, "step": 291 }, { "epoch": 0.660446706248233, "grad_norm": 120.93371939289545, "learning_rate": 2.5022258373517714e-07, "logits/chosen": -0.9202491044998169, "logits/rejected": -0.9317676424980164, "logps/chosen": -4.282386779785156, "logps/rejected": -4.909029483795166, "loss": 2.3016, "rewards/accuracies": 0.828125, "rewards/chosen": -42.8238639831543, "rewards/margins": 6.266423225402832, "rewards/rejected": -49.09029006958008, "step": 292 }, { "epoch": 0.6627085100367544, "grad_norm": 149.03147323251173, "learning_rate": 2.4729225013762474e-07, "logits/chosen": -0.9682255387306213, "logits/rejected": -0.9804242253303528, "logps/chosen": -4.4975714683532715, "logps/rejected": -5.107451438903809, "loss": 3.3597, "rewards/accuracies": 0.7734375, "rewards/chosen": -44.9757194519043, "rewards/margins": 6.098800182342529, "rewards/rejected": -51.07451629638672, "step": 293 }, { "epoch": 0.6649703138252756, "grad_norm": 147.62664173767308, "learning_rate": 2.4437147917428203e-07, "logits/chosen": -0.8548184633255005, "logits/rejected": -0.8674319386482239, "logps/chosen": -4.390334129333496, "logps/rejected": -5.062026500701904, "loss": 2.7968, "rewards/accuracies": 0.796875, "rewards/chosen": -43.90333938598633, "rewards/margins": 6.716926574707031, "rewards/rejected": -50.620262145996094, "step": 294 }, { "epoch": 0.667232117613797, "grad_norm": 145.09780607922434, "learning_rate": 2.414604537452595e-07, "logits/chosen": -0.8391546607017517, "logits/rejected": -0.8629494309425354, "logps/chosen": -4.255443096160889, "logps/rejected": -4.826220512390137, "loss": 2.7128, "rewards/accuracies": 0.78125, "rewards/chosen": -42.55442810058594, "rewards/margins": 5.707772731781006, "rewards/rejected": -48.26219940185547, "step": 295 }, { "epoch": 0.6694939214023183, "grad_norm": 141.08198535612613, "learning_rate": 2.385593561403974e-07, "logits/chosen": -0.8808133602142334, "logits/rejected": -0.9036346673965454, "logps/chosen": -4.058435440063477, "logps/rejected": -4.680113792419434, "loss": 2.5458, "rewards/accuracies": 0.859375, "rewards/chosen": -40.5843505859375, "rewards/margins": 6.216782569885254, "rewards/rejected": -46.8011360168457, "step": 296 }, { "epoch": 0.6717557251908397, "grad_norm": 108.96502235601564, "learning_rate": 2.3566836802785119e-07, "logits/chosen": -0.8734185099601746, "logits/rejected": -0.910306453704834, "logps/chosen": -4.139810562133789, "logps/rejected": -4.895854949951172, "loss": 2.3129, "rewards/accuracies": 0.8125, "rewards/chosen": -41.398101806640625, "rewards/margins": 7.560453414916992, "rewards/rejected": -48.95855712890625, "step": 297 }, { "epoch": 0.6740175289793611, "grad_norm": 137.86729078107237, "learning_rate": 2.327876704427146e-07, "logits/chosen": -0.8416418433189392, "logits/rejected": -0.8470006585121155, "logps/chosen": -4.119014263153076, "logps/rejected": -4.593555927276611, "loss": 3.1621, "rewards/accuracies": 0.734375, "rewards/chosen": -41.19013977050781, "rewards/margins": 4.745421886444092, "rewards/rejected": -45.93556213378906, "step": 298 }, { "epoch": 0.6762793327678824, "grad_norm": 175.89823569204168, "learning_rate": 2.2991744377568358e-07, "logits/chosen": -0.8492337465286255, "logits/rejected": -0.8457680344581604, "logps/chosen": -4.260539531707764, "logps/rejected": -4.814329147338867, "loss": 2.9577, "rewards/accuracies": 0.7890625, "rewards/chosen": -42.60539627075195, "rewards/margins": 5.537896633148193, "rewards/rejected": -48.14329528808594, "step": 299 }, { "epoch": 0.6785411365564037, "grad_norm": 131.9500051592224, "learning_rate": 2.270578677617601e-07, "logits/chosen": -0.9049394130706787, "logits/rejected": -0.9202775955200195, "logps/chosen": -4.153038501739502, "logps/rejected": -4.798150062561035, "loss": 3.2734, "rewards/accuracies": 0.75, "rewards/chosen": -41.53038787841797, "rewards/margins": 6.451115608215332, "rewards/rejected": -47.981502532958984, "step": 300 }, { "epoch": 0.6808029403449251, "grad_norm": 128.15329813087357, "learning_rate": 2.242091214689971e-07, "logits/chosen": -0.8781294226646423, "logits/rejected": -0.9136564135551453, "logps/chosen": -4.225987911224365, "logps/rejected": -4.948566436767578, "loss": 2.5272, "rewards/accuracies": 0.828125, "rewards/chosen": -42.2598876953125, "rewards/margins": 7.225780487060547, "rewards/rejected": -49.485660552978516, "step": 301 }, { "epoch": 0.6830647441334464, "grad_norm": 149.91529967533276, "learning_rate": 2.2137138328728456e-07, "logits/chosen": -0.9418582916259766, "logits/rejected": -0.9293465614318848, "logps/chosen": -4.346358299255371, "logps/rejected": -4.898774147033691, "loss": 2.7408, "rewards/accuracies": 0.78125, "rewards/chosen": -43.46357727050781, "rewards/margins": 5.524153232574463, "rewards/rejected": -48.987735748291016, "step": 302 }, { "epoch": 0.6853265479219678, "grad_norm": 108.47737776135804, "learning_rate": 2.1854483091717974e-07, "logits/chosen": -0.9234378337860107, "logits/rejected": -0.9519913792610168, "logps/chosen": -4.19830322265625, "logps/rejected": -4.844261646270752, "loss": 2.266, "rewards/accuracies": 0.78125, "rewards/chosen": -41.9830322265625, "rewards/margins": 6.459583759307861, "rewards/rejected": -48.4426155090332, "step": 303 }, { "epoch": 0.6875883517104892, "grad_norm": 143.85162210524913, "learning_rate": 2.1572964135877863e-07, "logits/chosen": -0.9188116192817688, "logits/rejected": -0.9410698413848877, "logps/chosen": -4.3630805015563965, "logps/rejected": -4.898950099945068, "loss": 3.0578, "rewards/accuracies": 0.7890625, "rewards/chosen": -43.63079833984375, "rewards/margins": 5.358698844909668, "rewards/rejected": -48.989498138427734, "step": 304 }, { "epoch": 0.6898501554990104, "grad_norm": 122.12397072006037, "learning_rate": 2.1292599090063245e-07, "logits/chosen": -0.9438715577125549, "logits/rejected": -0.9488154053688049, "logps/chosen": -4.201948165893555, "logps/rejected": -4.896744251251221, "loss": 2.4426, "rewards/accuracies": 0.828125, "rewards/chosen": -42.01948165893555, "rewards/margins": 6.94796085357666, "rewards/rejected": -48.96744155883789, "step": 305 }, { "epoch": 0.6921119592875318, "grad_norm": 144.4237668373174, "learning_rate": 2.1013405510870824e-07, "logits/chosen": -0.8521759510040283, "logits/rejected": -0.8959603905677795, "logps/chosen": -4.268230438232422, "logps/rejected": -4.95402193069458, "loss": 2.2676, "rewards/accuracies": 0.8203125, "rewards/chosen": -42.68229675292969, "rewards/margins": 6.857920169830322, "rewards/rejected": -49.540225982666016, "step": 306 }, { "epoch": 0.6943737630760531, "grad_norm": 121.44280758498674, "learning_rate": 2.0735400881539494e-07, "logits/chosen": -0.8913055658340454, "logits/rejected": -0.9139821529388428, "logps/chosen": -4.439169406890869, "logps/rejected": -5.179335117340088, "loss": 2.3341, "rewards/accuracies": 0.8203125, "rewards/chosen": -44.39169692993164, "rewards/margins": 7.4016547203063965, "rewards/rejected": -51.79335021972656, "step": 307 }, { "epoch": 0.6966355668645745, "grad_norm": 158.3173957292418, "learning_rate": 2.0458602610855536e-07, "logits/chosen": -0.9496070742607117, "logits/rejected": -0.9575868844985962, "logps/chosen": -4.370190620422363, "logps/rejected": -4.980884552001953, "loss": 2.4398, "rewards/accuracies": 0.796875, "rewards/chosen": -43.70191192626953, "rewards/margins": 6.106935501098633, "rewards/rejected": -49.80884552001953, "step": 308 }, { "epoch": 0.6988973706530959, "grad_norm": 148.52467272675696, "learning_rate": 2.0183028032062422e-07, "logits/chosen": -0.9165297746658325, "logits/rejected": -0.9382550120353699, "logps/chosen": -4.426529884338379, "logps/rejected": -5.0528764724731445, "loss": 2.7146, "rewards/accuracies": 0.828125, "rewards/chosen": -44.265296936035156, "rewards/margins": 6.263469219207764, "rewards/rejected": -50.52876663208008, "step": 309 }, { "epoch": 0.7011591744416172, "grad_norm": 132.68776312794716, "learning_rate": 1.9908694401775473e-07, "logits/chosen": -0.9458051323890686, "logits/rejected": -0.9692423939704895, "logps/chosen": -4.464923858642578, "logps/rejected": -5.0837626457214355, "loss": 2.6034, "rewards/accuracies": 0.8046875, "rewards/chosen": -44.64923858642578, "rewards/margins": 6.188381195068359, "rewards/rejected": -50.837623596191406, "step": 310 }, { "epoch": 0.7034209782301385, "grad_norm": 136.8201982483417, "learning_rate": 1.9635618898901196e-07, "logits/chosen": -0.921970784664154, "logits/rejected": -0.939640998840332, "logps/chosen": -4.886068820953369, "logps/rejected": -5.573887825012207, "loss": 2.8076, "rewards/accuracies": 0.7734375, "rewards/chosen": -48.860687255859375, "rewards/margins": 6.878194332122803, "rewards/rejected": -55.73888397216797, "step": 311 }, { "epoch": 0.7056827820186599, "grad_norm": 138.79561100336207, "learning_rate": 1.9363818623561565e-07, "logits/chosen": -0.8815241456031799, "logits/rejected": -0.9167051315307617, "logps/chosen": -4.46604585647583, "logps/rejected": -5.128448009490967, "loss": 2.467, "rewards/accuracies": 0.8046875, "rewards/chosen": -44.66046142578125, "rewards/margins": 6.624024391174316, "rewards/rejected": -51.284481048583984, "step": 312 }, { "epoch": 0.7079445858071812, "grad_norm": 139.63987542313842, "learning_rate": 1.9093310596023108e-07, "logits/chosen": -0.8783115148544312, "logits/rejected": -0.886088490486145, "logps/chosen": -4.325229167938232, "logps/rejected": -5.129339694976807, "loss": 2.4526, "rewards/accuracies": 0.8046875, "rewards/chosen": -43.252296447753906, "rewards/margins": 8.04110336303711, "rewards/rejected": -51.29339599609375, "step": 313 }, { "epoch": 0.7102063895957026, "grad_norm": 158.39745233921516, "learning_rate": 1.8824111755631274e-07, "logits/chosen": -0.9300839900970459, "logits/rejected": -0.9606208801269531, "logps/chosen": -4.352430820465088, "logps/rejected": -4.94814920425415, "loss": 2.7746, "rewards/accuracies": 0.8046875, "rewards/chosen": -43.52430725097656, "rewards/margins": 5.957186222076416, "rewards/rejected": -49.48149490356445, "step": 314 }, { "epoch": 0.712468193384224, "grad_norm": 175.08231727150573, "learning_rate": 1.8556238959749457e-07, "logits/chosen": -0.9153900146484375, "logits/rejected": -0.9284498691558838, "logps/chosen": -4.703487396240234, "logps/rejected": -5.153081893920898, "loss": 3.7552, "rewards/accuracies": 0.703125, "rewards/chosen": -47.034873962402344, "rewards/margins": 4.495938777923584, "rewards/rejected": -51.53081512451172, "step": 315 }, { "epoch": 0.7147299971727452, "grad_norm": 171.25427946077767, "learning_rate": 1.8289708982703562e-07, "logits/chosen": -0.8872180581092834, "logits/rejected": -0.8773900866508484, "logps/chosen": -4.5176239013671875, "logps/rejected": -5.215326309204102, "loss": 3.269, "rewards/accuracies": 0.734375, "rewards/chosen": -45.17624282836914, "rewards/margins": 6.977020740509033, "rewards/rejected": -52.153263092041016, "step": 316 }, { "epoch": 0.7169918009612666, "grad_norm": 167.8139506813274, "learning_rate": 1.802453851473151e-07, "logits/chosen": -0.9402052164077759, "logits/rejected": -0.9389104247093201, "logps/chosen": -4.668498516082764, "logps/rejected": -5.336842060089111, "loss": 2.5691, "rewards/accuracies": 0.8125, "rewards/chosen": -46.68498229980469, "rewards/margins": 6.683432102203369, "rewards/rejected": -53.36841583251953, "step": 317 }, { "epoch": 0.719253604749788, "grad_norm": 145.08828046519986, "learning_rate": 1.7760744160938093e-07, "logits/chosen": -0.8834313154220581, "logits/rejected": -0.899104118347168, "logps/chosen": -4.476520538330078, "logps/rejected": -5.283236980438232, "loss": 2.5195, "rewards/accuracies": 0.8203125, "rewards/chosen": -44.76520538330078, "rewards/margins": 8.06716251373291, "rewards/rejected": -52.832374572753906, "step": 318 }, { "epoch": 0.7215154085383093, "grad_norm": 139.91997046332745, "learning_rate": 1.7498342440255135e-07, "logits/chosen": -0.9341943264007568, "logits/rejected": -0.9333917498588562, "logps/chosen": -4.627048015594482, "logps/rejected": -5.251206874847412, "loss": 2.5712, "rewards/accuracies": 0.78125, "rewards/chosen": -46.27048110961914, "rewards/margins": 6.241583824157715, "rewards/rejected": -52.51206588745117, "step": 319 }, { "epoch": 0.7237772123268307, "grad_norm": 128.75616766030222, "learning_rate": 1.7237349784407115e-07, "logits/chosen": -0.9444934725761414, "logits/rejected": -0.9463576674461365, "logps/chosen": -4.7167863845825195, "logps/rejected": -5.434706687927246, "loss": 2.4956, "rewards/accuracies": 0.78125, "rewards/chosen": -47.16786575317383, "rewards/margins": 7.179207801818848, "rewards/rejected": -54.34707260131836, "step": 320 }, { "epoch": 0.726039016115352, "grad_norm": 152.32529171434254, "learning_rate": 1.6977782536882178e-07, "logits/chosen": -0.8644733428955078, "logits/rejected": -0.878294825553894, "logps/chosen": -4.275421142578125, "logps/rejected": -5.040489196777344, "loss": 2.7444, "rewards/accuracies": 0.78125, "rewards/chosen": -42.75421142578125, "rewards/margins": 7.650677680969238, "rewards/rejected": -50.40489196777344, "step": 321 }, { "epoch": 0.7283008199038733, "grad_norm": 114.93649081424365, "learning_rate": 1.6719656951908708e-07, "logits/chosen": -0.8660048246383667, "logits/rejected": -0.8882208466529846, "logps/chosen": -4.067705154418945, "logps/rejected": -4.7624616622924805, "loss": 2.4388, "rewards/accuracies": 0.8125, "rewards/chosen": -40.67705535888672, "rewards/margins": 6.947561264038086, "rewards/rejected": -47.62461853027344, "step": 322 }, { "epoch": 0.7305626236923947, "grad_norm": 141.6695023872139, "learning_rate": 1.6462989193437453e-07, "logits/chosen": -0.9560823440551758, "logits/rejected": -0.9642462730407715, "logps/chosen": -4.5176310539245605, "logps/rejected": -5.131880760192871, "loss": 2.8127, "rewards/accuracies": 0.7578125, "rewards/chosen": -45.17631149291992, "rewards/margins": 6.142499923706055, "rewards/rejected": -51.318809509277344, "step": 323 }, { "epoch": 0.732824427480916, "grad_norm": 153.0336707141252, "learning_rate": 1.6207795334129365e-07, "logits/chosen": -0.9089516997337341, "logits/rejected": -0.9075677394866943, "logps/chosen": -4.768195152282715, "logps/rejected": -5.365372180938721, "loss": 2.9092, "rewards/accuracies": 0.8046875, "rewards/chosen": -47.68195343017578, "rewards/margins": 5.971770286560059, "rewards/rejected": -53.65372085571289, "step": 324 }, { "epoch": 0.7350862312694374, "grad_norm": 175.02628532285675, "learning_rate": 1.5954091354349121e-07, "logits/chosen": -0.93093341588974, "logits/rejected": -0.9459247589111328, "logps/chosen": -4.557867050170898, "logps/rejected": -5.066596508026123, "loss": 3.4744, "rewards/accuracies": 0.75, "rewards/chosen": -45.57866668701172, "rewards/margins": 5.087299346923828, "rewards/rejected": -50.66596603393555, "step": 325 }, { "epoch": 0.7373480350579588, "grad_norm": 268.3403033769808, "learning_rate": 1.5701893141164364e-07, "logits/chosen": -0.9369128346443176, "logits/rejected": -0.9450178742408752, "logps/chosen": -4.78832483291626, "logps/rejected": -5.510087490081787, "loss": 3.4083, "rewards/accuracies": 0.7265625, "rewards/chosen": -47.88325119018555, "rewards/margins": 7.2176289558410645, "rewards/rejected": -55.10087966918945, "step": 326 }, { "epoch": 0.73960983884648, "grad_norm": 144.09484817777712, "learning_rate": 1.545121648735093e-07, "logits/chosen": -0.9158852100372314, "logits/rejected": -0.924169659614563, "logps/chosen": -4.634927749633789, "logps/rejected": -5.217785358428955, "loss": 3.0842, "rewards/accuracies": 0.7578125, "rewards/chosen": -46.349273681640625, "rewards/margins": 5.828577518463135, "rewards/rejected": -52.17784881591797, "step": 327 }, { "epoch": 0.7418716426350014, "grad_norm": 134.18959418719092, "learning_rate": 1.5202077090403863e-07, "logits/chosen": -0.9410818815231323, "logits/rejected": -0.9246065616607666, "logps/chosen": -4.225207805633545, "logps/rejected": -4.864658355712891, "loss": 2.5935, "rewards/accuracies": 0.8125, "rewards/chosen": -42.2520751953125, "rewards/margins": 6.394504547119141, "rewards/rejected": -48.646583557128906, "step": 328 }, { "epoch": 0.7441334464235227, "grad_norm": 157.68518289039383, "learning_rate": 1.495449055155443e-07, "logits/chosen": -0.9306075572967529, "logits/rejected": -0.942533552646637, "logps/chosen": -4.558164119720459, "logps/rejected": -5.3390398025512695, "loss": 2.4075, "rewards/accuracies": 0.796875, "rewards/chosen": -45.581642150878906, "rewards/margins": 7.808758735656738, "rewards/rejected": -53.390403747558594, "step": 329 }, { "epoch": 0.7463952502120441, "grad_norm": 155.53781883249212, "learning_rate": 1.4708472374793112e-07, "logits/chosen": -0.9006601572036743, "logits/rejected": -0.9074862599372864, "logps/chosen": -4.6048903465271, "logps/rejected": -5.098773002624512, "loss": 3.4763, "rewards/accuracies": 0.7109375, "rewards/chosen": -46.04890441894531, "rewards/margins": 4.938818454742432, "rewards/rejected": -50.98772430419922, "step": 330 }, { "epoch": 0.7486570540005655, "grad_norm": 156.32875339642223, "learning_rate": 1.4464037965898878e-07, "logits/chosen": -0.8546017408370972, "logits/rejected": -0.8647469878196716, "logps/chosen": -4.482880115509033, "logps/rejected": -5.083220481872559, "loss": 2.9646, "rewards/accuracies": 0.765625, "rewards/chosen": -44.82880401611328, "rewards/margins": 6.003401756286621, "rewards/rejected": -50.83220672607422, "step": 331 }, { "epoch": 0.7509188577890868, "grad_norm": 131.67726299773977, "learning_rate": 1.4221202631474282e-07, "logits/chosen": -0.8612452745437622, "logits/rejected": -0.8679234385490417, "logps/chosen": -4.424932479858398, "logps/rejected": -5.060441970825195, "loss": 2.7243, "rewards/accuracies": 0.78125, "rewards/chosen": -44.249324798583984, "rewards/margins": 6.355095863342285, "rewards/rejected": -50.60442352294922, "step": 332 }, { "epoch": 0.7531806615776081, "grad_norm": 143.74537092522394, "learning_rate": 1.3979981577987113e-07, "logits/chosen": -0.9003939628601074, "logits/rejected": -0.8934139013290405, "logps/chosen": -4.204031467437744, "logps/rejected": -4.884526252746582, "loss": 2.7886, "rewards/accuracies": 0.734375, "rewards/chosen": -42.040313720703125, "rewards/margins": 6.804945945739746, "rewards/rejected": -48.84525680541992, "step": 333 }, { "epoch": 0.7554424653661295, "grad_norm": 122.3096159489272, "learning_rate": 1.374038991081807e-07, "logits/chosen": -0.9354572892189026, "logits/rejected": -0.9418012499809265, "logps/chosen": -4.416646957397461, "logps/rejected": -4.966562747955322, "loss": 2.9822, "rewards/accuracies": 0.734375, "rewards/chosen": -44.16646957397461, "rewards/margins": 5.49915885925293, "rewards/rejected": -49.665626525878906, "step": 334 }, { "epoch": 0.7577042691546508, "grad_norm": 125.3467664124325, "learning_rate": 1.3502442633314882e-07, "logits/chosen": -0.8854781985282898, "logits/rejected": -0.8959544897079468, "logps/chosen": -3.8627703189849854, "logps/rejected": -4.476377010345459, "loss": 2.4221, "rewards/accuracies": 0.8203125, "rewards/chosen": -38.62770462036133, "rewards/margins": 6.136070728302002, "rewards/rejected": -44.76377487182617, "step": 335 }, { "epoch": 0.7599660729431722, "grad_norm": 130.27731954527835, "learning_rate": 1.3266154645852815e-07, "logits/chosen": -0.8756478428840637, "logits/rejected": -0.8797450065612793, "logps/chosen": -4.367222785949707, "logps/rejected": -4.935546875, "loss": 2.7571, "rewards/accuracies": 0.765625, "rewards/chosen": -43.6722297668457, "rewards/margins": 5.683239936828613, "rewards/rejected": -49.35546875, "step": 336 }, { "epoch": 0.7622278767316936, "grad_norm": 174.47595303975626, "learning_rate": 1.303154074490152e-07, "logits/chosen": -0.9251211881637573, "logits/rejected": -0.9095232486724854, "logps/chosen": -4.15577507019043, "logps/rejected": -4.788844108581543, "loss": 3.0771, "rewards/accuracies": 0.71875, "rewards/chosen": -41.55774688720703, "rewards/margins": 6.330696105957031, "rewards/rejected": -47.88844680786133, "step": 337 }, { "epoch": 0.7644896805202148, "grad_norm": 139.20935050711142, "learning_rate": 1.2798615622098616e-07, "logits/chosen": -0.9291560649871826, "logits/rejected": -0.9244073033332825, "logps/chosen": -4.082375526428223, "logps/rejected": -4.767295837402344, "loss": 2.935, "rewards/accuracies": 0.75, "rewards/chosen": -40.823753356933594, "rewards/margins": 6.8492045402526855, "rewards/rejected": -47.67295837402344, "step": 338 }, { "epoch": 0.7667514843087362, "grad_norm": 125.85713373053201, "learning_rate": 1.2567393863329523e-07, "logits/chosen": -0.9064013957977295, "logits/rejected": -0.9375932216644287, "logps/chosen": -4.197393417358398, "logps/rejected": -4.932326793670654, "loss": 2.4315, "rewards/accuracies": 0.8125, "rewards/chosen": -41.973934173583984, "rewards/margins": 7.349334239959717, "rewards/rejected": -49.323272705078125, "step": 339 }, { "epoch": 0.7690132880972576, "grad_norm": 113.88331415159809, "learning_rate": 1.233788994781423e-07, "logits/chosen": -0.9322744607925415, "logits/rejected": -0.9696506261825562, "logps/chosen": -3.992047071456909, "logps/rejected": -4.617818355560303, "loss": 2.3658, "rewards/accuracies": 0.7890625, "rewards/chosen": -39.920467376708984, "rewards/margins": 6.257713317871094, "rewards/rejected": -46.178184509277344, "step": 340 }, { "epoch": 0.7712750918857789, "grad_norm": 120.76771779346096, "learning_rate": 1.2110118247200468e-07, "logits/chosen": -0.930842399597168, "logits/rejected": -0.9461864829063416, "logps/chosen": -4.024172782897949, "logps/rejected": -4.681670188903809, "loss": 2.253, "rewards/accuracies": 0.828125, "rewards/chosen": -40.24173355102539, "rewards/margins": 6.574971675872803, "rewards/rejected": -46.81669998168945, "step": 341 }, { "epoch": 0.7735368956743003, "grad_norm": 136.22802399303367, "learning_rate": 1.1884093024663933e-07, "logits/chosen": -0.9333779811859131, "logits/rejected": -0.9390580058097839, "logps/chosen": -3.7476558685302734, "logps/rejected": -4.544672012329102, "loss": 2.7483, "rewards/accuracies": 0.7421875, "rewards/chosen": -37.47655487060547, "rewards/margins": 7.970164775848389, "rewards/rejected": -45.446720123291016, "step": 342 }, { "epoch": 0.7757986994628217, "grad_norm": 142.58791407306794, "learning_rate": 1.1659828434014886e-07, "logits/chosen": -0.9368746280670166, "logits/rejected": -0.9193394780158997, "logps/chosen": -3.7733166217803955, "logps/rejected": -4.509119033813477, "loss": 2.5863, "rewards/accuracies": 0.7890625, "rewards/chosen": -37.73316955566406, "rewards/margins": 7.358019828796387, "rewards/rejected": -45.0911865234375, "step": 343 }, { "epoch": 0.7780605032513429, "grad_norm": 149.16462513003228, "learning_rate": 1.143733851881203e-07, "logits/chosen": -0.969507098197937, "logits/rejected": -0.9765860438346863, "logps/chosen": -4.0373382568359375, "logps/rejected": -4.758094310760498, "loss": 2.5659, "rewards/accuracies": 0.8125, "rewards/chosen": -40.373382568359375, "rewards/margins": 7.207555770874023, "rewards/rejected": -47.5809440612793, "step": 344 }, { "epoch": 0.7803223070398643, "grad_norm": 124.92145191514574, "learning_rate": 1.1216637211483005e-07, "logits/chosen": -0.9140538573265076, "logits/rejected": -0.9331907629966736, "logps/chosen": -3.9364168643951416, "logps/rejected": -4.495926856994629, "loss": 2.7829, "rewards/accuracies": 0.7890625, "rewards/chosen": -39.36417007446289, "rewards/margins": 5.595102310180664, "rewards/rejected": -44.959266662597656, "step": 345 }, { "epoch": 0.7825841108283856, "grad_norm": 139.19218112831186, "learning_rate": 1.0997738332451936e-07, "logits/chosen": -0.9063421487808228, "logits/rejected": -0.9176337718963623, "logps/chosen": -4.21934175491333, "logps/rejected": -4.815896034240723, "loss": 2.6989, "rewards/accuracies": 0.765625, "rewards/chosen": -42.19342041015625, "rewards/margins": 5.965543270111084, "rewards/rejected": -48.15896224975586, "step": 346 }, { "epoch": 0.784845914616907, "grad_norm": 135.08565556440382, "learning_rate": 1.0780655589274031e-07, "logits/chosen": -0.9613451957702637, "logits/rejected": -0.9540661573410034, "logps/chosen": -3.9844415187835693, "logps/rejected": -4.568908214569092, "loss": 2.3105, "rewards/accuracies": 0.8359375, "rewards/chosen": -39.844417572021484, "rewards/margins": 5.844663619995117, "rewards/rejected": -45.68907928466797, "step": 347 }, { "epoch": 0.7871077184054284, "grad_norm": 140.13866385079996, "learning_rate": 1.056540257577712e-07, "logits/chosen": -0.8759354948997498, "logits/rejected": -0.8963236808776855, "logps/chosen": -4.536995887756348, "logps/rejected": -5.269956111907959, "loss": 2.1934, "rewards/accuracies": 0.859375, "rewards/chosen": -45.369956970214844, "rewards/margins": 7.329606056213379, "rewards/rejected": -52.69956970214844, "step": 348 }, { "epoch": 0.7893695221939496, "grad_norm": 137.27660659020637, "learning_rate": 1.0351992771210554e-07, "logits/chosen": -0.9132465720176697, "logits/rejected": -0.9206264615058899, "logps/chosen": -4.071958541870117, "logps/rejected": -4.736346244812012, "loss": 2.6102, "rewards/accuracies": 0.828125, "rewards/chosen": -40.719581604003906, "rewards/margins": 6.643874168395996, "rewards/rejected": -47.36345291137695, "step": 349 }, { "epoch": 0.791631325982471, "grad_norm": 142.1622013900452, "learning_rate": 1.0140439539400953e-07, "logits/chosen": -0.8662968277931213, "logits/rejected": -0.9013144969940186, "logps/chosen": -4.007244110107422, "logps/rejected": -4.624824523925781, "loss": 3.0147, "rewards/accuracies": 0.7734375, "rewards/chosen": -40.07244110107422, "rewards/margins": 6.175803184509277, "rewards/rejected": -46.24824523925781, "step": 350 }, { "epoch": 0.7938931297709924, "grad_norm": 129.3739896140384, "learning_rate": 9.930756127915488e-08, "logits/chosen": -0.9286041259765625, "logits/rejected": -0.9355084896087646, "logps/chosen": -4.044810771942139, "logps/rejected": -4.674356460571289, "loss": 2.6907, "rewards/accuracies": 0.78125, "rewards/chosen": -40.44810485839844, "rewards/margins": 6.295462131500244, "rewards/rejected": -46.743568420410156, "step": 351 }, { "epoch": 0.7961549335595137, "grad_norm": 154.70829465365628, "learning_rate": 9.722955667232242e-08, "logits/chosen": -0.9570465683937073, "logits/rejected": -0.9680700302124023, "logps/chosen": -4.297163963317871, "logps/rejected": -4.780937194824219, "loss": 3.2322, "rewards/accuracies": 0.7109375, "rewards/chosen": -42.971641540527344, "rewards/margins": 4.837734222412109, "rewards/rejected": -47.80936813354492, "step": 352 }, { "epoch": 0.7984167373480351, "grad_norm": 140.78323504174404, "learning_rate": 9.517051169918016e-08, "logits/chosen": -0.9370065331459045, "logits/rejected": -0.9510276317596436, "logps/chosen": -4.016280651092529, "logps/rejected": -4.564591407775879, "loss": 3.2779, "rewards/accuracies": 0.75, "rewards/chosen": -40.162811279296875, "rewards/margins": 5.483105182647705, "rewards/rejected": -45.64591979980469, "step": 353 }, { "epoch": 0.8006785411365565, "grad_norm": 142.37033106886284, "learning_rate": 9.313055529813412e-08, "logits/chosen": -0.8857989311218262, "logits/rejected": -0.9076879620552063, "logps/chosen": -4.098158836364746, "logps/rejected": -4.799932479858398, "loss": 2.3024, "rewards/accuracies": 0.8125, "rewards/chosen": -40.98158645629883, "rewards/margins": 7.017735004425049, "rewards/rejected": -47.99932098388672, "step": 354 }, { "epoch": 0.8029403449250777, "grad_norm": 144.22040244633794, "learning_rate": 9.110981521225532e-08, "logits/chosen": -0.9384421706199646, "logits/rejected": -0.9499157667160034, "logps/chosen": -4.076770782470703, "logps/rejected": -4.640554428100586, "loss": 2.9348, "rewards/accuracies": 0.765625, "rewards/chosen": -40.76770782470703, "rewards/margins": 5.637840747833252, "rewards/rejected": -46.40555191040039, "step": 355 }, { "epoch": 0.8052021487135991, "grad_norm": 139.7935847003589, "learning_rate": 8.910841798127884e-08, "logits/chosen": -0.9020113945007324, "logits/rejected": -0.9301177263259888, "logps/chosen": -4.16035270690918, "logps/rejected": -4.824099063873291, "loss": 2.5111, "rewards/accuracies": 0.796875, "rewards/chosen": -41.60352325439453, "rewards/margins": 6.637460708618164, "rewards/rejected": -48.240989685058594, "step": 356 }, { "epoch": 0.8074639525021204, "grad_norm": 166.03846448720964, "learning_rate": 8.712648893368139e-08, "logits/chosen": -0.9206175208091736, "logits/rejected": -0.9502934217453003, "logps/chosen": -4.101649761199951, "logps/rejected": -4.884461879730225, "loss": 2.5017, "rewards/accuracies": 0.765625, "rewards/chosen": -41.01649475097656, "rewards/margins": 7.828126907348633, "rewards/rejected": -48.844627380371094, "step": 357 }, { "epoch": 0.8097257562906418, "grad_norm": 117.89459898307335, "learning_rate": 8.516415217883186e-08, "logits/chosen": -0.9100026488304138, "logits/rejected": -0.9124536514282227, "logps/chosen": -4.00990629196167, "logps/rejected": -4.74934196472168, "loss": 2.6175, "rewards/accuracies": 0.828125, "rewards/chosen": -40.099063873291016, "rewards/margins": 7.394357681274414, "rewards/rejected": -47.4934196472168, "step": 358 }, { "epoch": 0.8119875600791632, "grad_norm": 151.06740224882444, "learning_rate": 8.32215305992209e-08, "logits/chosen": -0.9616566896438599, "logits/rejected": -0.9742845296859741, "logps/chosen": -3.94974422454834, "logps/rejected": -4.5588812828063965, "loss": 2.8848, "rewards/accuracies": 0.796875, "rewards/chosen": -39.49744415283203, "rewards/margins": 6.091368675231934, "rewards/rejected": -45.58881759643555, "step": 359 }, { "epoch": 0.8142493638676844, "grad_norm": 116.28513294682625, "learning_rate": 8.129874584276448e-08, "logits/chosen": -0.9059348702430725, "logits/rejected": -0.9224525690078735, "logps/chosen": -4.079035758972168, "logps/rejected": -4.834011554718018, "loss": 2.1432, "rewards/accuracies": 0.8046875, "rewards/chosen": -40.79035949707031, "rewards/margins": 7.549759864807129, "rewards/rejected": -48.34011459350586, "step": 360 }, { "epoch": 0.8165111676562058, "grad_norm": 143.4855751446256, "learning_rate": 7.939591831518746e-08, "logits/chosen": -0.943411648273468, "logits/rejected": -0.9577879905700684, "logps/chosen": -4.06253719329834, "logps/rejected": -4.622920513153076, "loss": 2.308, "rewards/accuracies": 0.8515625, "rewards/chosen": -40.625370025634766, "rewards/margins": 5.603834629058838, "rewards/rejected": -46.22920608520508, "step": 361 }, { "epoch": 0.8187729714447272, "grad_norm": 132.12698029254315, "learning_rate": 7.751316717248304e-08, "logits/chosen": -0.9082808494567871, "logits/rejected": -0.9305973052978516, "logps/chosen": -4.398627281188965, "logps/rejected": -5.260112285614014, "loss": 2.4971, "rewards/accuracies": 0.796875, "rewards/chosen": -43.986270904541016, "rewards/margins": 8.614850044250488, "rewards/rejected": -52.60112762451172, "step": 362 }, { "epoch": 0.8210347752332485, "grad_norm": 158.043389904261, "learning_rate": 7.565061031345142e-08, "logits/chosen": -0.9185335040092468, "logits/rejected": -0.9299246072769165, "logps/chosen": -4.59241247177124, "logps/rejected": -5.313713550567627, "loss": 2.3348, "rewards/accuracies": 0.8125, "rewards/chosen": -45.92412185668945, "rewards/margins": 7.213016033172607, "rewards/rejected": -53.137142181396484, "step": 363 }, { "epoch": 0.8232965790217699, "grad_norm": 137.14995559881837, "learning_rate": 7.380836437231686e-08, "logits/chosen": -0.9011315107345581, "logits/rejected": -0.9010403752326965, "logps/chosen": -4.083221435546875, "logps/rejected": -4.828488349914551, "loss": 2.3463, "rewards/accuracies": 0.8125, "rewards/chosen": -40.83221435546875, "rewards/margins": 7.452672481536865, "rewards/rejected": -48.284889221191406, "step": 364 }, { "epoch": 0.8255583828102913, "grad_norm": 133.9434857418435, "learning_rate": 7.198654471142371e-08, "logits/chosen": -0.9325624704360962, "logits/rejected": -0.9269375205039978, "logps/chosen": -4.175022602081299, "logps/rejected": -5.013765335083008, "loss": 2.1937, "rewards/accuracies": 0.8359375, "rewards/chosen": -41.75022506713867, "rewards/margins": 8.387434959411621, "rewards/rejected": -50.13766098022461, "step": 365 }, { "epoch": 0.8278201865988125, "grad_norm": 140.3898107041387, "learning_rate": 7.01852654140132e-08, "logits/chosen": -0.954879105091095, "logits/rejected": -0.9756340980529785, "logps/chosen": -4.5630106925964355, "logps/rejected": -5.309413433074951, "loss": 2.3981, "rewards/accuracies": 0.8046875, "rewards/chosen": -45.63011169433594, "rewards/margins": 7.464024066925049, "rewards/rejected": -53.0941276550293, "step": 366 }, { "epoch": 0.8300819903873339, "grad_norm": 138.4057371685046, "learning_rate": 6.840463927707833e-08, "logits/chosen": -0.9294202923774719, "logits/rejected": -0.9419483542442322, "logps/chosen": -4.580535888671875, "logps/rejected": -5.202334880828857, "loss": 2.7198, "rewards/accuracies": 0.7890625, "rewards/chosen": -45.80535888671875, "rewards/margins": 6.217983245849609, "rewards/rejected": -52.023345947265625, "step": 367 }, { "epoch": 0.8323437941758552, "grad_norm": 128.96927985102926, "learning_rate": 6.664477780430138e-08, "logits/chosen": -0.9347717761993408, "logits/rejected": -0.945314884185791, "logps/chosen": -4.429632663726807, "logps/rejected": -4.995570659637451, "loss": 2.9752, "rewards/accuracies": 0.765625, "rewards/chosen": -44.29632568359375, "rewards/margins": 5.659379959106445, "rewards/rejected": -49.955711364746094, "step": 368 }, { "epoch": 0.8346055979643766, "grad_norm": 152.34642547867944, "learning_rate": 6.49057911990711e-08, "logits/chosen": -0.8949201107025146, "logits/rejected": -0.9076350927352905, "logps/chosen": -4.397095680236816, "logps/rejected": -4.981942176818848, "loss": 2.9721, "rewards/accuracies": 0.75, "rewards/chosen": -43.970951080322266, "rewards/margins": 5.848470687866211, "rewards/rejected": -49.81943130493164, "step": 369 }, { "epoch": 0.836867401752898, "grad_norm": 132.0391102685107, "learning_rate": 6.318778835758189e-08, "logits/chosen": -0.92762690782547, "logits/rejected": -0.9354040026664734, "logps/chosen": -4.483278274536133, "logps/rejected": -5.1504974365234375, "loss": 1.8653, "rewards/accuracies": 0.8828125, "rewards/chosen": -44.83277893066406, "rewards/margins": 6.672185897827148, "rewards/rejected": -51.50497055053711, "step": 370 }, { "epoch": 0.8391292055414192, "grad_norm": 179.951771990511, "learning_rate": 6.149087686201433e-08, "logits/chosen": -0.9428873062133789, "logits/rejected": -0.9634348154067993, "logps/chosen": -4.341042518615723, "logps/rejected": -4.949177265167236, "loss": 3.3993, "rewards/accuracies": 0.75, "rewards/chosen": -43.41042709350586, "rewards/margins": 6.081344127655029, "rewards/rejected": -49.49176788330078, "step": 371 }, { "epoch": 0.8413910093299406, "grad_norm": 136.21427347850883, "learning_rate": 5.98151629737988e-08, "logits/chosen": -0.9433773756027222, "logits/rejected": -0.943168044090271, "logps/chosen": -4.414024829864502, "logps/rejected": -5.185835838317871, "loss": 2.3556, "rewards/accuracies": 0.796875, "rewards/chosen": -44.14024353027344, "rewards/margins": 7.718109607696533, "rewards/rejected": -51.85835647583008, "step": 372 }, { "epoch": 0.843652813118462, "grad_norm": 127.45396317271563, "learning_rate": 5.816075162696097e-08, "logits/chosen": -0.9678685069084167, "logits/rejected": -0.9940780401229858, "logps/chosen": -4.39580774307251, "logps/rejected": -5.012912273406982, "loss": 2.2762, "rewards/accuracies": 0.8125, "rewards/chosen": -43.958072662353516, "rewards/margins": 6.171045303344727, "rewards/rejected": -50.129119873046875, "step": 373 }, { "epoch": 0.8459146169069833, "grad_norm": 123.86661129091185, "learning_rate": 5.6527746421551046e-08, "logits/chosen": -0.9064250588417053, "logits/rejected": -0.9173108339309692, "logps/chosen": -4.327992916107178, "logps/rejected": -5.055395603179932, "loss": 2.4613, "rewards/accuracies": 0.7578125, "rewards/chosen": -43.279930114746094, "rewards/margins": 7.274028778076172, "rewards/rejected": -50.553955078125, "step": 374 }, { "epoch": 0.8481764206955047, "grad_norm": 136.01804435251455, "learning_rate": 5.4916249617156064e-08, "logits/chosen": -0.9181968569755554, "logits/rejected": -0.9360796213150024, "logps/chosen": -4.141705513000488, "logps/rejected": -4.756865978240967, "loss": 2.774, "rewards/accuracies": 0.7421875, "rewards/chosen": -41.41705322265625, "rewards/margins": 6.151602745056152, "rewards/rejected": -47.568660736083984, "step": 375 }, { "epoch": 0.8504382244840261, "grad_norm": 135.3387610667954, "learning_rate": 5.332636212649646e-08, "logits/chosen": -0.8991196155548096, "logits/rejected": -0.915702223777771, "logps/chosen": -4.379838466644287, "logps/rejected": -5.094522953033447, "loss": 2.1719, "rewards/accuracies": 0.828125, "rewards/chosen": -43.79838562011719, "rewards/margins": 7.146846771240234, "rewards/rejected": -50.945228576660156, "step": 376 }, { "epoch": 0.8527000282725473, "grad_norm": 164.19828475720155, "learning_rate": 5.17581835091069e-08, "logits/chosen": -0.9365058541297913, "logits/rejected": -0.9663807153701782, "logps/chosen": -4.514606475830078, "logps/rejected": -5.128344535827637, "loss": 3.0464, "rewards/accuracies": 0.7578125, "rewards/chosen": -45.14606475830078, "rewards/margins": 6.137386322021484, "rewards/rejected": -51.28345489501953, "step": 377 }, { "epoch": 0.8549618320610687, "grad_norm": 138.669954265479, "learning_rate": 5.02118119651016e-08, "logits/chosen": -0.9410414099693298, "logits/rejected": -0.9501762390136719, "logps/chosen": -4.367845058441162, "logps/rejected": -5.0047101974487305, "loss": 3.1673, "rewards/accuracies": 0.7109375, "rewards/chosen": -43.67845153808594, "rewards/margins": 6.368653297424316, "rewards/rejected": -50.04710388183594, "step": 378 }, { "epoch": 0.85722363584959, "grad_norm": 153.057632250685, "learning_rate": 4.868734432902526e-08, "logits/chosen": -1.0021592378616333, "logits/rejected": -0.9952703714370728, "logps/chosen": -4.49019718170166, "logps/rejected": -5.29477071762085, "loss": 3.0216, "rewards/accuracies": 0.78125, "rewards/chosen": -44.90196990966797, "rewards/margins": 8.045737266540527, "rewards/rejected": -52.94770812988281, "step": 379 }, { "epoch": 0.8594854396381114, "grad_norm": 139.1146559906999, "learning_rate": 4.7184876063789134e-08, "logits/chosen": -0.9506573677062988, "logits/rejected": -0.9560145139694214, "logps/chosen": -3.926301956176758, "logps/rejected": -4.576600074768066, "loss": 2.563, "rewards/accuracies": 0.8359375, "rewards/chosen": -39.263023376464844, "rewards/margins": 6.5029826164245605, "rewards/rejected": -45.7660026550293, "step": 380 }, { "epoch": 0.8617472434266328, "grad_norm": 136.57207110844007, "learning_rate": 4.570450125469314e-08, "logits/chosen": -0.9335479140281677, "logits/rejected": -0.9474495649337769, "logps/chosen": -4.52652645111084, "logps/rejected": -5.3495774269104, "loss": 2.4878, "rewards/accuracies": 0.828125, "rewards/chosen": -45.2652587890625, "rewards/margins": 8.230509757995605, "rewards/rejected": -53.49578094482422, "step": 381 }, { "epoch": 0.864009047215154, "grad_norm": 149.81502524090894, "learning_rate": 4.424631260353378e-08, "logits/chosen": -0.9694351553916931, "logits/rejected": -0.9859524369239807, "logps/chosen": -4.307926654815674, "logps/rejected": -4.940521240234375, "loss": 2.7104, "rewards/accuracies": 0.7890625, "rewards/chosen": -43.07926940917969, "rewards/margins": 6.325945854187012, "rewards/rejected": -49.40521240234375, "step": 382 }, { "epoch": 0.8662708510036754, "grad_norm": 121.3426641959867, "learning_rate": 4.281040142280008e-08, "logits/chosen": -0.9893457889556885, "logits/rejected": -0.9991154670715332, "logps/chosen": -4.156393527984619, "logps/rejected": -4.968776226043701, "loss": 1.989, "rewards/accuracies": 0.8203125, "rewards/chosen": -41.56393814086914, "rewards/margins": 8.123825073242188, "rewards/rejected": -49.68776321411133, "step": 383 }, { "epoch": 0.8685326547921968, "grad_norm": 141.4337079496108, "learning_rate": 4.1396857629954286e-08, "logits/chosen": -0.9534589052200317, "logits/rejected": -0.9696213603019714, "logps/chosen": -4.799047470092773, "logps/rejected": -5.520049571990967, "loss": 2.7878, "rewards/accuracies": 0.7421875, "rewards/chosen": -47.99047088623047, "rewards/margins": 7.210024833679199, "rewards/rejected": -55.20050048828125, "step": 384 }, { "epoch": 0.8707944585807181, "grad_norm": 119.73253748242631, "learning_rate": 4.000576974180232e-08, "logits/chosen": -0.9004536271095276, "logits/rejected": -0.9263263940811157, "logps/chosen": -4.2699875831604, "logps/rejected": -4.922300338745117, "loss": 2.9088, "rewards/accuracies": 0.8046875, "rewards/chosen": -42.69987487792969, "rewards/margins": 6.523127555847168, "rewards/rejected": -49.22300720214844, "step": 385 }, { "epoch": 0.8730562623692395, "grad_norm": 132.99002186990265, "learning_rate": 3.8637224868950066e-08, "logits/chosen": -0.9017341136932373, "logits/rejected": -0.9102005958557129, "logps/chosen": -4.248313903808594, "logps/rejected": -4.877220630645752, "loss": 2.8312, "rewards/accuracies": 0.7578125, "rewards/chosen": -42.48313522338867, "rewards/margins": 6.2890706062316895, "rewards/rejected": -48.77220153808594, "step": 386 }, { "epoch": 0.8753180661577609, "grad_norm": 140.2576422089798, "learning_rate": 3.729130871034885e-08, "logits/chosen": -0.9371786713600159, "logits/rejected": -0.9420756101608276, "logps/chosen": -4.37814998626709, "logps/rejected": -5.035106182098389, "loss": 2.8047, "rewards/accuracies": 0.8046875, "rewards/chosen": -43.78150177001953, "rewards/margins": 6.569563865661621, "rewards/rejected": -50.3510627746582, "step": 387 }, { "epoch": 0.8775798699462821, "grad_norm": 175.47044378770292, "learning_rate": 3.596810554792888e-08, "logits/chosen": -0.9239012598991394, "logits/rejected": -0.9475809335708618, "logps/chosen": -4.3749284744262695, "logps/rejected": -5.061702251434326, "loss": 3.1355, "rewards/accuracies": 0.734375, "rewards/chosen": -43.7492790222168, "rewards/margins": 6.867737770080566, "rewards/rejected": -50.61702346801758, "step": 388 }, { "epoch": 0.8798416737348035, "grad_norm": 136.44044081625452, "learning_rate": 3.466769824132116e-08, "logits/chosen": -0.9199025630950928, "logits/rejected": -0.9204123020172119, "logps/chosen": -4.2924580574035645, "logps/rejected": -4.982794761657715, "loss": 2.3706, "rewards/accuracies": 0.828125, "rewards/chosen": -42.924583435058594, "rewards/margins": 6.903364181518555, "rewards/rejected": -49.82794189453125, "step": 389 }, { "epoch": 0.8821034775233249, "grad_norm": 157.13088383312373, "learning_rate": 3.339016822266925e-08, "logits/chosen": -0.8951210975646973, "logits/rejected": -0.9262260794639587, "logps/chosen": -4.462003707885742, "logps/rejected": -5.319886684417725, "loss": 1.8385, "rewards/accuracies": 0.8515625, "rewards/chosen": -44.62003707885742, "rewards/margins": 8.57883071899414, "rewards/rejected": -53.19886779785156, "step": 390 }, { "epoch": 0.8843652813118462, "grad_norm": 145.27269622289882, "learning_rate": 3.213559549152958e-08, "logits/chosen": -0.9537985920906067, "logits/rejected": -0.9690415859222412, "logps/chosen": -4.21071195602417, "logps/rejected": -4.958610534667969, "loss": 2.7635, "rewards/accuracies": 0.765625, "rewards/chosen": -42.10711669921875, "rewards/margins": 7.478985786437988, "rewards/rejected": -49.58610153198242, "step": 391 }, { "epoch": 0.8866270851003676, "grad_norm": 152.2072238338678, "learning_rate": 3.090405860986203e-08, "logits/chosen": -0.9644224643707275, "logits/rejected": -0.9988764524459839, "logps/chosen": -4.434269428253174, "logps/rejected": -5.323245048522949, "loss": 2.3393, "rewards/accuracies": 0.8046875, "rewards/chosen": -44.34269332885742, "rewards/margins": 8.889755249023438, "rewards/rejected": -53.23244857788086, "step": 392 }, { "epoch": 0.8888888888888888, "grad_norm": 133.85280588404478, "learning_rate": 2.9695634697110315e-08, "logits/chosen": -0.9042102694511414, "logits/rejected": -0.9273264408111572, "logps/chosen": -4.202421188354492, "logps/rejected": -5.033628463745117, "loss": 2.664, "rewards/accuracies": 0.8203125, "rewards/chosen": -42.02421569824219, "rewards/margins": 8.31207275390625, "rewards/rejected": -50.33628463745117, "step": 393 }, { "epoch": 0.8911506926774102, "grad_norm": 139.50976820633048, "learning_rate": 2.8510399425372766e-08, "logits/chosen": -0.9206915497779846, "logits/rejected": -0.9092394113540649, "logps/chosen": -4.3293986320495605, "logps/rejected": -4.970660209655762, "loss": 2.7218, "rewards/accuracies": 0.7890625, "rewards/chosen": -43.293983459472656, "rewards/margins": 6.412619113922119, "rewards/rejected": -49.70660400390625, "step": 394 }, { "epoch": 0.8934124964659316, "grad_norm": 142.6192511354523, "learning_rate": 2.734842701466329e-08, "logits/chosen": -0.9256288409233093, "logits/rejected": -0.9244977235794067, "logps/chosen": -4.661899566650391, "logps/rejected": -5.342780113220215, "loss": 2.4201, "rewards/accuracies": 0.8046875, "rewards/chosen": -46.61899948120117, "rewards/margins": 6.808799743652344, "rewards/rejected": -53.42779541015625, "step": 395 }, { "epoch": 0.8956743002544529, "grad_norm": 130.63925825670492, "learning_rate": 2.6209790228264438e-08, "logits/chosen": -0.9332349300384521, "logits/rejected": -0.94581139087677, "logps/chosen": -4.036855220794678, "logps/rejected": -4.773642539978027, "loss": 2.2623, "rewards/accuracies": 0.8359375, "rewards/chosen": -40.368553161621094, "rewards/margins": 7.367873191833496, "rewards/rejected": -47.736427307128906, "step": 396 }, { "epoch": 0.8979361040429743, "grad_norm": 149.5991916133793, "learning_rate": 2.5094560368170305e-08, "logits/chosen": -0.9196925163269043, "logits/rejected": -0.9395575523376465, "logps/chosen": -4.5655694007873535, "logps/rejected": -5.217185974121094, "loss": 2.5713, "rewards/accuracies": 0.796875, "rewards/chosen": -45.65568923950195, "rewards/margins": 6.516168594360352, "rewards/rejected": -52.17185974121094, "step": 397 }, { "epoch": 0.9001979078314957, "grad_norm": 122.6578201059457, "learning_rate": 2.4002807270621893e-08, "logits/chosen": -0.9552274942398071, "logits/rejected": -0.9657354950904846, "logps/chosen": -4.322449207305908, "logps/rejected": -4.962361812591553, "loss": 2.6024, "rewards/accuracies": 0.828125, "rewards/chosen": -43.2244987487793, "rewards/margins": 6.399123191833496, "rewards/rejected": -49.623619079589844, "step": 398 }, { "epoch": 0.9024597116200169, "grad_norm": 135.84460405011131, "learning_rate": 2.293459930173354e-08, "logits/chosen": -0.9458591341972351, "logits/rejected": -0.9692145586013794, "logps/chosen": -4.452592849731445, "logps/rejected": -5.130153179168701, "loss": 2.783, "rewards/accuracies": 0.7578125, "rewards/chosen": -44.52592468261719, "rewards/margins": 6.775611400604248, "rewards/rejected": -51.30153274536133, "step": 399 }, { "epoch": 0.9047215154085383, "grad_norm": 171.33604881928616, "learning_rate": 2.189000335321256e-08, "logits/chosen": -0.9176933765411377, "logits/rejected": -0.9229288101196289, "logps/chosen": -4.287893295288086, "logps/rejected": -4.882048606872559, "loss": 3.0622, "rewards/accuracies": 0.78125, "rewards/chosen": -42.87893295288086, "rewards/margins": 5.941554069519043, "rewards/rejected": -48.82048797607422, "step": 400 }, { "epoch": 0.9069833191970597, "grad_norm": 131.77434980590826, "learning_rate": 2.086908483816954e-08, "logits/chosen": -0.9492596387863159, "logits/rejected": -0.9559190273284912, "logps/chosen": -4.549674987792969, "logps/rejected": -5.201348304748535, "loss": 2.4757, "rewards/accuracies": 0.828125, "rewards/chosen": -45.49674606323242, "rewards/margins": 6.516733169555664, "rewards/rejected": -52.01348114013672, "step": 401 }, { "epoch": 0.909245122985581, "grad_norm": 132.06347666424753, "learning_rate": 1.9871907687022717e-08, "logits/chosen": -0.916560173034668, "logits/rejected": -0.9371925592422485, "logps/chosen": -4.1877241134643555, "logps/rejected": -4.8005266189575195, "loss": 2.6123, "rewards/accuracies": 0.765625, "rewards/chosen": -41.87724685668945, "rewards/margins": 6.128022193908691, "rewards/rejected": -48.005271911621094, "step": 402 }, { "epoch": 0.9115069267741024, "grad_norm": 114.509047735518, "learning_rate": 1.889853434349451e-08, "logits/chosen": -0.9288345575332642, "logits/rejected": -0.9471941590309143, "logps/chosen": -4.192251682281494, "logps/rejected": -4.938650131225586, "loss": 2.434, "rewards/accuracies": 0.78125, "rewards/chosen": -41.922515869140625, "rewards/margins": 7.4639787673950195, "rewards/rejected": -49.386497497558594, "step": 403 }, { "epoch": 0.9137687305626236, "grad_norm": 139.51897990590004, "learning_rate": 1.7949025760701164e-08, "logits/chosen": -0.9225287437438965, "logits/rejected": -0.9274791479110718, "logps/chosen": -4.604381561279297, "logps/rejected": -5.195613861083984, "loss": 2.6384, "rewards/accuracies": 0.828125, "rewards/chosen": -46.043819427490234, "rewards/margins": 5.912320137023926, "rewards/rejected": -51.956138610839844, "step": 404 }, { "epoch": 0.916030534351145, "grad_norm": 128.19324678370154, "learning_rate": 1.7023441397336023e-08, "logits/chosen": -0.9489941596984863, "logits/rejected": -0.9579771757125854, "logps/chosen": -4.172736167907715, "logps/rejected": -4.906558513641357, "loss": 2.4065, "rewards/accuracies": 0.8125, "rewards/chosen": -41.727359771728516, "rewards/margins": 7.338226318359375, "rewards/rejected": -49.065582275390625, "step": 405 }, { "epoch": 0.9182923381396664, "grad_norm": 127.71243008762988, "learning_rate": 1.6121839213945854e-08, "logits/chosen": -0.9154041409492493, "logits/rejected": -0.9540258049964905, "logps/chosen": -4.24996280670166, "logps/rejected": -5.038878440856934, "loss": 2.6564, "rewards/accuracies": 0.7578125, "rewards/chosen": -42.499629974365234, "rewards/margins": 7.889162063598633, "rewards/rejected": -50.3887939453125, "step": 406 }, { "epoch": 0.9205541419281877, "grad_norm": 146.10675693844263, "learning_rate": 1.5244275669301777e-08, "logits/chosen": -0.955981969833374, "logits/rejected": -0.9593254923820496, "logps/chosen": -4.389744758605957, "logps/rejected": -5.067453384399414, "loss": 2.8744, "rewards/accuracies": 0.78125, "rewards/chosen": -43.8974494934082, "rewards/margins": 6.777082920074463, "rewards/rejected": -50.67453384399414, "step": 407 }, { "epoch": 0.9228159457167091, "grad_norm": 132.95610789058466, "learning_rate": 1.4390805716863398e-08, "logits/chosen": -0.9074594378471375, "logits/rejected": -0.9208613634109497, "logps/chosen": -4.289839744567871, "logps/rejected": -4.873122692108154, "loss": 3.0576, "rewards/accuracies": 0.7734375, "rewards/chosen": -42.898399353027344, "rewards/margins": 5.832827091217041, "rewards/rejected": -48.73122787475586, "step": 408 }, { "epoch": 0.9250777495052305, "grad_norm": 141.9755163132043, "learning_rate": 1.3561482801337908e-08, "logits/chosen": -0.9116663336753845, "logits/rejected": -0.9385542273521423, "logps/chosen": -4.232028007507324, "logps/rejected": -4.991069793701172, "loss": 2.9291, "rewards/accuracies": 0.7265625, "rewards/chosen": -42.320281982421875, "rewards/margins": 7.590411186218262, "rewards/rejected": -49.91069412231445, "step": 409 }, { "epoch": 0.9273395532937517, "grad_norm": 133.18959451570092, "learning_rate": 1.2756358855332904e-08, "logits/chosen": -0.9445152282714844, "logits/rejected": -0.9605578184127808, "logps/chosen": -4.202373504638672, "logps/rejected": -4.8074846267700195, "loss": 3.1204, "rewards/accuracies": 0.7421875, "rewards/chosen": -42.02373504638672, "rewards/margins": 6.051117897033691, "rewards/rejected": -48.074851989746094, "step": 410 }, { "epoch": 0.9296013570822731, "grad_norm": 136.12472128004111, "learning_rate": 1.1975484296105154e-08, "logits/chosen": -0.9164653420448303, "logits/rejected": -0.9311988353729248, "logps/chosen": -4.378890037536621, "logps/rejected": -5.060423851013184, "loss": 2.8484, "rewards/accuracies": 0.78125, "rewards/chosen": -43.78889846801758, "rewards/margins": 6.815339088439941, "rewards/rejected": -50.6042366027832, "step": 411 }, { "epoch": 0.9318631608707945, "grad_norm": 137.33443947595077, "learning_rate": 1.1218908022402374e-08, "logits/chosen": -0.9297804832458496, "logits/rejected": -0.9439125061035156, "logps/chosen": -4.097784996032715, "logps/rejected": -4.829689025878906, "loss": 2.4591, "rewards/accuracies": 0.796875, "rewards/chosen": -40.977848052978516, "rewards/margins": 7.319035530090332, "rewards/rejected": -48.2968864440918, "step": 412 }, { "epoch": 0.9341249646593158, "grad_norm": 145.19557689058706, "learning_rate": 1.0486677411402079e-08, "logits/chosen": -0.9909257888793945, "logits/rejected": -0.9965202212333679, "logps/chosen": -4.445742607116699, "logps/rejected": -5.312036037445068, "loss": 2.6238, "rewards/accuracies": 0.796875, "rewards/chosen": -44.45742416381836, "rewards/margins": 8.66294002532959, "rewards/rejected": -53.120365142822266, "step": 413 }, { "epoch": 0.9363867684478372, "grad_norm": 135.59114017765043, "learning_rate": 9.778838315744353e-09, "logits/chosen": -0.9647377133369446, "logits/rejected": -0.9831647872924805, "logps/chosen": -4.492733478546143, "logps/rejected": -5.175955295562744, "loss": 2.6291, "rewards/accuracies": 0.8125, "rewards/chosen": -44.927330017089844, "rewards/margins": 6.832226753234863, "rewards/rejected": -51.75955581665039, "step": 414 }, { "epoch": 0.9386485722363584, "grad_norm": 145.5793767400526, "learning_rate": 9.095435060660595e-09, "logits/chosen": -0.9024043679237366, "logits/rejected": -0.917569100856781, "logps/chosen": -4.358269691467285, "logps/rejected": -5.034271717071533, "loss": 2.8645, "rewards/accuracies": 0.796875, "rewards/chosen": -43.58269500732422, "rewards/margins": 6.760016441345215, "rewards/rejected": -50.34271240234375, "step": 415 }, { "epoch": 0.9409103760248798, "grad_norm": 162.49589370243754, "learning_rate": 8.436510441197864e-09, "logits/chosen": -0.9422574043273926, "logits/rejected": -0.9609728455543518, "logps/chosen": -4.340670585632324, "logps/rejected": -5.023505210876465, "loss": 2.9033, "rewards/accuracies": 0.7734375, "rewards/chosen": -43.406707763671875, "rewards/margins": 6.82834529876709, "rewards/rejected": -50.23505401611328, "step": 416 }, { "epoch": 0.9431721798134012, "grad_norm": 224.81877042117685, "learning_rate": 7.802105719539076e-09, "logits/chosen": -0.9420458078384399, "logits/rejected": -0.9551193118095398, "logps/chosen": -4.549409866333008, "logps/rejected": -5.181853771209717, "loss": 3.3733, "rewards/accuracies": 0.7265625, "rewards/chosen": -45.49409866333008, "rewards/margins": 6.324440002441406, "rewards/rejected": -51.818538665771484, "step": 417 }, { "epoch": 0.9454339836019225, "grad_norm": 131.4283806000118, "learning_rate": 7.1922606224192e-09, "logits/chosen": -0.9589974880218506, "logits/rejected": -0.9696003198623657, "logps/chosen": -4.475660800933838, "logps/rejected": -5.16171407699585, "loss": 2.5232, "rewards/accuracies": 0.7890625, "rewards/chosen": -44.756614685058594, "rewards/margins": 6.860527992248535, "rewards/rejected": -51.61713790893555, "step": 418 }, { "epoch": 0.9476957873904439, "grad_norm": 148.50211503722525, "learning_rate": 6.6070133386372906e-09, "logits/chosen": -0.9348170161247253, "logits/rejected": -0.9493433237075806, "logps/chosen": -4.343303680419922, "logps/rejected": -4.941909313201904, "loss": 3.0315, "rewards/accuracies": 0.78125, "rewards/chosen": -43.43303680419922, "rewards/margins": 5.9860520362854, "rewards/rejected": -49.419090270996094, "step": 419 }, { "epoch": 0.9499575911789653, "grad_norm": 129.14740181782477, "learning_rate": 6.046400516665384e-09, "logits/chosen": -0.957095742225647, "logits/rejected": -0.9551052451133728, "logps/chosen": -4.2565999031066895, "logps/rejected": -4.9523539543151855, "loss": 3.1614, "rewards/accuracies": 0.734375, "rewards/chosen": -42.566001892089844, "rewards/margins": 6.9575371742248535, "rewards/rejected": -49.523536682128906, "step": 420 }, { "epoch": 0.9522193949674865, "grad_norm": 122.48845625064148, "learning_rate": 5.510457262353396e-09, "logits/chosen": -0.9842012524604797, "logits/rejected": -1.0115524530410767, "logps/chosen": -4.26042366027832, "logps/rejected": -4.900167942047119, "loss": 2.4057, "rewards/accuracies": 0.8203125, "rewards/chosen": -42.60423278808594, "rewards/margins": 6.397446155548096, "rewards/rejected": -49.00168228149414, "step": 421 }, { "epoch": 0.9544811987560079, "grad_norm": 130.67374788625062, "learning_rate": 4.9992171367309265e-09, "logits/chosen": -0.9512357711791992, "logits/rejected": -0.9497030377388, "logps/chosen": -4.133967399597168, "logps/rejected": -4.79262638092041, "loss": 2.3627, "rewards/accuracies": 0.8359375, "rewards/chosen": -41.33967208862305, "rewards/margins": 6.586594581604004, "rewards/rejected": -47.92626953125, "step": 422 }, { "epoch": 0.9567430025445293, "grad_norm": 170.07916949000563, "learning_rate": 4.5127121539052955e-09, "logits/chosen": -0.9652352333068848, "logits/rejected": -0.9730237722396851, "logps/chosen": -4.610488414764404, "logps/rejected": -5.30706262588501, "loss": 2.5704, "rewards/accuracies": 0.8046875, "rewards/chosen": -46.10488510131836, "rewards/margins": 6.9657416343688965, "rewards/rejected": -53.07062911987305, "step": 423 }, { "epoch": 0.9590048063330506, "grad_norm": 148.77436953597567, "learning_rate": 4.050972779057327e-09, "logits/chosen": -0.8603891730308533, "logits/rejected": -0.883198618888855, "logps/chosen": -4.0621185302734375, "logps/rejected": -4.761756896972656, "loss": 2.5931, "rewards/accuracies": 0.7734375, "rewards/chosen": -40.62118911743164, "rewards/margins": 6.996386528015137, "rewards/rejected": -47.61757278442383, "step": 424 }, { "epoch": 0.961266610121572, "grad_norm": 150.51477842196144, "learning_rate": 3.6140279265330477e-09, "logits/chosen": -0.9070014357566833, "logits/rejected": -0.9290311336517334, "logps/chosen": -4.51793909072876, "logps/rejected": -5.157177448272705, "loss": 2.8884, "rewards/accuracies": 0.7578125, "rewards/chosen": -45.17938995361328, "rewards/margins": 6.392387390136719, "rewards/rejected": -51.57177734375, "step": 425 }, { "epoch": 0.9635284139100933, "grad_norm": 136.67894069927104, "learning_rate": 3.2019049580335853e-09, "logits/chosen": -0.9470658898353577, "logits/rejected": -0.9471170902252197, "logps/chosen": -4.178645133972168, "logps/rejected": -4.754918098449707, "loss": 3.3311, "rewards/accuracies": 0.7421875, "rewards/chosen": -41.78644943237305, "rewards/margins": 5.762726783752441, "rewards/rejected": -47.54917907714844, "step": 426 }, { "epoch": 0.9657902176986146, "grad_norm": 118.06690739543247, "learning_rate": 2.814629680901337e-09, "logits/chosen": -0.9594217538833618, "logits/rejected": -0.9807270169258118, "logps/chosen": -4.4059553146362305, "logps/rejected": -5.0466628074646, "loss": 2.3717, "rewards/accuracies": 0.8125, "rewards/chosen": -44.05955505371094, "rewards/margins": 6.407071113586426, "rewards/rejected": -50.46662902832031, "step": 427 }, { "epoch": 0.968052021487136, "grad_norm": 150.5152498317493, "learning_rate": 2.4522263465041937e-09, "logits/chosen": -0.9186062812805176, "logits/rejected": -0.9451611042022705, "logps/chosen": -4.349206447601318, "logps/rejected": -5.054396629333496, "loss": 2.2936, "rewards/accuracies": 0.8203125, "rewards/chosen": -43.49205780029297, "rewards/margins": 7.051908016204834, "rewards/rejected": -50.54396438598633, "step": 428 }, { "epoch": 0.9703138252756573, "grad_norm": 135.25076850145902, "learning_rate": 2.114717648716713e-09, "logits/chosen": -0.8935461044311523, "logits/rejected": -0.9095126986503601, "logps/chosen": -4.378687381744385, "logps/rejected": -5.177103042602539, "loss": 2.34, "rewards/accuracies": 0.8359375, "rewards/chosen": -43.786869049072266, "rewards/margins": 7.984163761138916, "rewards/rejected": -51.771034240722656, "step": 429 }, { "epoch": 0.9725756290641787, "grad_norm": 152.01906534523854, "learning_rate": 1.802124722499121e-09, "logits/chosen": -0.9317042231559753, "logits/rejected": -0.9441834688186646, "logps/chosen": -4.452950477600098, "logps/rejected": -5.174241065979004, "loss": 2.7273, "rewards/accuracies": 0.8046875, "rewards/chosen": -44.529502868652344, "rewards/margins": 7.2129082679748535, "rewards/rejected": -51.742408752441406, "step": 430 }, { "epoch": 0.9748374328527001, "grad_norm": 129.07592379619018, "learning_rate": 1.5144671425737499e-09, "logits/chosen": -0.9220924377441406, "logits/rejected": -0.9323858022689819, "logps/chosen": -4.173183441162109, "logps/rejected": -4.836048603057861, "loss": 2.9168, "rewards/accuracies": 0.78125, "rewards/chosen": -41.731834411621094, "rewards/margins": 6.628646373748779, "rewards/rejected": -48.36048126220703, "step": 431 }, { "epoch": 0.9770992366412213, "grad_norm": 122.42308357023245, "learning_rate": 1.251762922199484e-09, "logits/chosen": -0.8762988448143005, "logits/rejected": -0.9018377065658569, "logps/chosen": -4.394649982452393, "logps/rejected": -5.167266845703125, "loss": 2.0131, "rewards/accuracies": 0.859375, "rewards/chosen": -43.94649887084961, "rewards/margins": 7.726165771484375, "rewards/rejected": -51.67266845703125, "step": 432 }, { "epoch": 0.9793610404297427, "grad_norm": 144.42537157796136, "learning_rate": 1.0140285120433744e-09, "logits/chosen": -0.9518988132476807, "logits/rejected": -0.9752581715583801, "logps/chosen": -4.419306755065918, "logps/rejected": -5.068571090698242, "loss": 2.8378, "rewards/accuracies": 0.7265625, "rewards/chosen": -44.19306564331055, "rewards/margins": 6.492642402648926, "rewards/rejected": -50.685707092285156, "step": 433 }, { "epoch": 0.9816228442182641, "grad_norm": 144.35990425477752, "learning_rate": 8.012787991508396e-10, "logits/chosen": -0.9084798693656921, "logits/rejected": -0.9402381181716919, "logps/chosen": -4.297061920166016, "logps/rejected": -5.11636209487915, "loss": 2.7992, "rewards/accuracies": 0.7734375, "rewards/chosen": -42.97062301635742, "rewards/margins": 8.192997932434082, "rewards/rejected": -51.16362380981445, "step": 434 }, { "epoch": 0.9838846480067854, "grad_norm": 134.7783472158385, "learning_rate": 6.135271060133007e-10, "logits/chosen": -0.8788937330245972, "logits/rejected": -0.8922220468521118, "logps/chosen": -4.292323112487793, "logps/rejected": -4.958512783050537, "loss": 2.7245, "rewards/accuracies": 0.765625, "rewards/chosen": -42.9232292175293, "rewards/margins": 6.6618971824646, "rewards/rejected": -49.585121154785156, "step": 435 }, { "epoch": 0.9861464517953068, "grad_norm": 131.05692298332755, "learning_rate": 4.50785189733871e-10, "logits/chosen": -0.8994375467300415, "logits/rejected": -0.9363196492195129, "logps/chosen": -4.137233257293701, "logps/rejected": -4.894649982452393, "loss": 2.0871, "rewards/accuracies": 0.8515625, "rewards/chosen": -41.372337341308594, "rewards/margins": 7.574166297912598, "rewards/rejected": -48.946495056152344, "step": 436 }, { "epoch": 0.988408255583828, "grad_norm": 141.9667964600976, "learning_rate": 3.1306324129118935e-10, "logits/chosen": -0.9034903049468994, "logits/rejected": -0.9206139445304871, "logps/chosen": -4.409141540527344, "logps/rejected": -5.078451156616211, "loss": 2.5653, "rewards/accuracies": 0.8046875, "rewards/chosen": -44.09141540527344, "rewards/margins": 6.693098068237305, "rewards/rejected": -50.784515380859375, "step": 437 }, { "epoch": 0.9906700593723494, "grad_norm": 149.43690667028594, "learning_rate": 2.003698849011748e-10, "logits/chosen": -0.9702510237693787, "logits/rejected": -0.992080807685852, "logps/chosen": -4.582041263580322, "logps/rejected": -5.16063928604126, "loss": 2.9004, "rewards/accuracies": 0.7734375, "rewards/chosen": -45.82041931152344, "rewards/margins": 5.78598165512085, "rewards/rejected": -51.60639953613281, "step": 438 }, { "epoch": 0.9929318631608708, "grad_norm": 151.45409397929544, "learning_rate": 1.1271217747714779e-10, "logits/chosen": -0.9387862086296082, "logits/rejected": -0.9742698669433594, "logps/chosen": -4.416835784912109, "logps/rejected": -5.033830165863037, "loss": 2.8039, "rewards/accuracies": 0.7421875, "rewards/chosen": -44.168357849121094, "rewards/margins": 6.169943809509277, "rewards/rejected": -50.33830642700195, "step": 439 }, { "epoch": 0.9951936669493922, "grad_norm": 117.95433018808065, "learning_rate": 5.0095608187739055e-11, "logits/chosen": -0.9022542238235474, "logits/rejected": -0.921513557434082, "logps/chosen": -4.125314712524414, "logps/rejected": -4.81278133392334, "loss": 2.5334, "rewards/accuracies": 0.765625, "rewards/chosen": -41.25314712524414, "rewards/margins": 6.874664306640625, "rewards/rejected": -48.127811431884766, "step": 440 }, { "epoch": 0.9974554707379135, "grad_norm": 150.85789440344882, "learning_rate": 1.2524098113209092e-11, "logits/chosen": -0.9529531002044678, "logits/rejected": -0.9607404470443726, "logps/chosen": -4.348971366882324, "logps/rejected": -4.960037708282471, "loss": 3.1636, "rewards/accuracies": 0.796875, "rewards/chosen": -43.48971176147461, "rewards/margins": 6.110668182373047, "rewards/rejected": -49.600379943847656, "step": 441 }, { "epoch": 0.9997172745264349, "grad_norm": 133.66353463529444, "learning_rate": 0.0, "logits/chosen": -0.9448285698890686, "logits/rejected": -0.951061487197876, "logps/chosen": -4.208832263946533, "logps/rejected": -4.896453857421875, "loss": 2.7111, "rewards/accuracies": 0.78125, "rewards/chosen": -42.088321685791016, "rewards/margins": 6.876214027404785, "rewards/rejected": -48.96453857421875, "step": 442 }, { "epoch": 0.9997172745264349, "eval_logits/chosen": -0.9250581860542297, "eval_logits/rejected": -0.9405222535133362, "eval_logps/chosen": -4.356727600097656, "eval_logps/rejected": -5.03577995300293, "eval_loss": 2.5820231437683105, "eval_rewards/accuracies": 0.7914438843727112, "eval_rewards/chosen": -43.567283630371094, "eval_rewards/margins": 6.790517330169678, "eval_rewards/rejected": -50.35779571533203, "eval_runtime": 64.9654, "eval_samples_per_second": 45.855, "eval_steps_per_second": 2.878, "step": 442 }, { "epoch": 0.9997172745264349, "step": 442, "total_flos": 134366991482880.0, "train_loss": 3.371998559026157, "train_runtime": 3776.6556, "train_samples_per_second": 14.984, "train_steps_per_second": 0.117 } ], "logging_steps": 1.0, "max_steps": 442, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 134366991482880.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }