simpo-stackexchange_christianity / trainer_state.json
sedrickkeh's picture
End of training
c3ef54f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997172745264349,
"eval_steps": 500,
"global_step": 442,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022618037885213456,
"grad_norm": 115.54104008253113,
"learning_rate": 1.7777777777777777e-08,
"logits/chosen": -0.8356236219406128,
"logits/rejected": -0.8113616108894348,
"logps/chosen": -1.5973824262619019,
"logps/rejected": -1.7205333709716797,
"loss": 5.7448,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.973824501037598,
"rewards/margins": 1.2315096855163574,
"rewards/rejected": -17.205333709716797,
"step": 1
},
{
"epoch": 0.004523607577042691,
"grad_norm": 74.3673874458241,
"learning_rate": 3.5555555555555554e-08,
"logits/chosen": -0.880168616771698,
"logits/rejected": -0.8785539269447327,
"logps/chosen": -1.676809310913086,
"logps/rejected": -1.6232023239135742,
"loss": 6.1494,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -16.76809310913086,
"rewards/margins": -0.5360711812973022,
"rewards/rejected": -16.23202133178711,
"step": 2
},
{
"epoch": 0.006785411365564037,
"grad_norm": 103.24134323529078,
"learning_rate": 5.333333333333333e-08,
"logits/chosen": -0.8413803577423096,
"logits/rejected": -0.8578035831451416,
"logps/chosen": -1.679062843322754,
"logps/rejected": -1.8749037981033325,
"loss": 6.2783,
"rewards/accuracies": 0.578125,
"rewards/chosen": -16.79062843322754,
"rewards/margins": 1.9584112167358398,
"rewards/rejected": -18.749040603637695,
"step": 3
},
{
"epoch": 0.009047215154085382,
"grad_norm": 118.3827028625394,
"learning_rate": 7.111111111111111e-08,
"logits/chosen": -0.8771331906318665,
"logits/rejected": -0.8562486171722412,
"logps/chosen": -1.6714611053466797,
"logps/rejected": -1.6346337795257568,
"loss": 6.182,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -16.71461296081543,
"rewards/margins": -0.368274062871933,
"rewards/rejected": -16.346338272094727,
"step": 4
},
{
"epoch": 0.01130901894260673,
"grad_norm": 80.57413510413119,
"learning_rate": 8.888888888888888e-08,
"logits/chosen": -0.8716552257537842,
"logits/rejected": -0.8481063842773438,
"logps/chosen": -1.5923399925231934,
"logps/rejected": -1.6487252712249756,
"loss": 5.4404,
"rewards/accuracies": 0.546875,
"rewards/chosen": -15.92340087890625,
"rewards/margins": 0.5638511776924133,
"rewards/rejected": -16.48725128173828,
"step": 5
},
{
"epoch": 0.013570822731128074,
"grad_norm": 87.54382641921318,
"learning_rate": 1.0666666666666666e-07,
"logits/chosen": -0.9280990958213806,
"logits/rejected": -0.9075251221656799,
"logps/chosen": -1.659511685371399,
"logps/rejected": -1.6108341217041016,
"loss": 6.2758,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -16.595117568969727,
"rewards/margins": -0.48677870631217957,
"rewards/rejected": -16.108339309692383,
"step": 6
},
{
"epoch": 0.01583262651964942,
"grad_norm": 39.60022994902477,
"learning_rate": 1.2444444444444443e-07,
"logits/chosen": -0.8669524192810059,
"logits/rejected": -0.8501181602478027,
"logps/chosen": -1.4954458475112915,
"logps/rejected": -1.8645169734954834,
"loss": 4.1135,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -14.954458236694336,
"rewards/margins": 3.6907100677490234,
"rewards/rejected": -18.64516830444336,
"step": 7
},
{
"epoch": 0.018094430308170765,
"grad_norm": 97.49420569209839,
"learning_rate": 1.4222222222222222e-07,
"logits/chosen": -0.8323963284492493,
"logits/rejected": -0.8386867046356201,
"logps/chosen": -1.5902166366577148,
"logps/rejected": -1.6685606241226196,
"loss": 5.7368,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -15.902165412902832,
"rewards/margins": 0.7834409475326538,
"rewards/rejected": -16.685604095458984,
"step": 8
},
{
"epoch": 0.020356234096692113,
"grad_norm": 65.1210073833328,
"learning_rate": 1.6e-07,
"logits/chosen": -0.8797612190246582,
"logits/rejected": -0.8696941137313843,
"logps/chosen": -1.5322370529174805,
"logps/rejected": -1.739979863166809,
"loss": 5.4506,
"rewards/accuracies": 0.5078125,
"rewards/chosen": -15.322370529174805,
"rewards/margins": 2.0774283409118652,
"rewards/rejected": -17.399799346923828,
"step": 9
},
{
"epoch": 0.02261803788521346,
"grad_norm": 87.75880998151953,
"learning_rate": 1.7777777777777776e-07,
"logits/chosen": -0.8726012706756592,
"logits/rejected": -0.8817980885505676,
"logps/chosen": -1.692103385925293,
"logps/rejected": -1.6219866275787354,
"loss": 6.0529,
"rewards/accuracies": 0.59375,
"rewards/chosen": -16.92103385925293,
"rewards/margins": -0.7011662721633911,
"rewards/rejected": -16.219867706298828,
"step": 10
},
{
"epoch": 0.024879841673734804,
"grad_norm": 94.08875549981737,
"learning_rate": 1.9555555555555555e-07,
"logits/chosen": -0.9308934211730957,
"logits/rejected": -0.9283267259597778,
"logps/chosen": -1.6734390258789062,
"logps/rejected": -1.9049830436706543,
"loss": 5.3677,
"rewards/accuracies": 0.578125,
"rewards/chosen": -16.734390258789062,
"rewards/margins": 2.315438747406006,
"rewards/rejected": -19.049829483032227,
"step": 11
},
{
"epoch": 0.02714164546225615,
"grad_norm": 114.82744103438529,
"learning_rate": 2.133333333333333e-07,
"logits/chosen": -0.8485775589942932,
"logits/rejected": -0.8313932418823242,
"logps/chosen": -1.7942991256713867,
"logps/rejected": -1.8555328845977783,
"loss": 6.4051,
"rewards/accuracies": 0.609375,
"rewards/chosen": -17.942991256713867,
"rewards/margins": 0.6123358607292175,
"rewards/rejected": -18.555328369140625,
"step": 12
},
{
"epoch": 0.029403449250777494,
"grad_norm": 97.6559617021603,
"learning_rate": 2.3111111111111107e-07,
"logits/chosen": -0.8471003770828247,
"logits/rejected": -0.8123136162757874,
"logps/chosen": -1.7098432779312134,
"logps/rejected": -1.626631736755371,
"loss": 6.1671,
"rewards/accuracies": 0.578125,
"rewards/chosen": -17.098432540893555,
"rewards/margins": -0.8321163654327393,
"rewards/rejected": -16.26631736755371,
"step": 13
},
{
"epoch": 0.03166525303929884,
"grad_norm": 48.43299087579507,
"learning_rate": 2.4888888888888886e-07,
"logits/chosen": -0.8456010222434998,
"logits/rejected": -0.843168318271637,
"logps/chosen": -1.3908941745758057,
"logps/rejected": -1.59244704246521,
"loss": 4.5976,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -13.908943176269531,
"rewards/margins": 2.0155270099639893,
"rewards/rejected": -15.924469947814941,
"step": 14
},
{
"epoch": 0.033927056827820185,
"grad_norm": 94.32433506251559,
"learning_rate": 2.666666666666666e-07,
"logits/chosen": -0.8408608436584473,
"logits/rejected": -0.8317903280258179,
"logps/chosen": -1.5308924913406372,
"logps/rejected": -1.621803879737854,
"loss": 5.1224,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -15.308926582336426,
"rewards/margins": 0.9091131091117859,
"rewards/rejected": -16.218036651611328,
"step": 15
},
{
"epoch": 0.03618886061634153,
"grad_norm": 80.0199360911188,
"learning_rate": 2.8444444444444443e-07,
"logits/chosen": -0.896875262260437,
"logits/rejected": -0.8800469636917114,
"logps/chosen": -1.6712013483047485,
"logps/rejected": -1.6556079387664795,
"loss": 6.2495,
"rewards/accuracies": 0.5,
"rewards/chosen": -16.71201515197754,
"rewards/margins": -0.15593338012695312,
"rewards/rejected": -16.556079864501953,
"step": 16
},
{
"epoch": 0.038450664404862875,
"grad_norm": 95.62781163862564,
"learning_rate": 3.022222222222222e-07,
"logits/chosen": -0.9052500128746033,
"logits/rejected": -0.8847813010215759,
"logps/chosen": -1.4807989597320557,
"logps/rejected": -1.447709321975708,
"loss": 6.2111,
"rewards/accuracies": 0.53125,
"rewards/chosen": -14.807989120483398,
"rewards/margins": -0.33089762926101685,
"rewards/rejected": -14.477092742919922,
"step": 17
},
{
"epoch": 0.04071246819338423,
"grad_norm": 99.53047146451797,
"learning_rate": 3.2e-07,
"logits/chosen": -0.9046046733856201,
"logits/rejected": -0.8962881565093994,
"logps/chosen": -1.9553179740905762,
"logps/rejected": -1.9541630744934082,
"loss": 6.2661,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -19.553178787231445,
"rewards/margins": -0.011548399925231934,
"rewards/rejected": -19.5416316986084,
"step": 18
},
{
"epoch": 0.04297427198190557,
"grad_norm": 94.64334054203071,
"learning_rate": 3.3777777777777777e-07,
"logits/chosen": -0.9112716913223267,
"logits/rejected": -0.8977913856506348,
"logps/chosen": -1.6549549102783203,
"logps/rejected": -1.672560214996338,
"loss": 5.3987,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -16.549549102783203,
"rewards/margins": 0.1760539710521698,
"rewards/rejected": -16.725605010986328,
"step": 19
},
{
"epoch": 0.04523607577042692,
"grad_norm": 41.05939278803522,
"learning_rate": 3.5555555555555553e-07,
"logits/chosen": -0.9180342555046082,
"logits/rejected": -0.9136630892753601,
"logps/chosen": -1.5036756992340088,
"logps/rejected": -1.7418506145477295,
"loss": 4.306,
"rewards/accuracies": 0.65625,
"rewards/chosen": -15.03675651550293,
"rewards/margins": 2.3817477226257324,
"rewards/rejected": -17.41850471496582,
"step": 20
},
{
"epoch": 0.04749787955894826,
"grad_norm": 65.62889809973544,
"learning_rate": 3.7333333333333334e-07,
"logits/chosen": -0.8245253562927246,
"logits/rejected": -0.8135088086128235,
"logps/chosen": -1.5167511701583862,
"logps/rejected": -1.5217550992965698,
"loss": 5.4951,
"rewards/accuracies": 0.609375,
"rewards/chosen": -15.167511940002441,
"rewards/margins": 0.05004033446311951,
"rewards/rejected": -15.217550277709961,
"step": 21
},
{
"epoch": 0.04975968334746961,
"grad_norm": 53.92197856426591,
"learning_rate": 3.911111111111111e-07,
"logits/chosen": -0.8624619245529175,
"logits/rejected": -0.8261862397193909,
"logps/chosen": -1.559888243675232,
"logps/rejected": -1.6315239667892456,
"loss": 4.6857,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -15.598882675170898,
"rewards/margins": 0.716356098651886,
"rewards/rejected": -16.31523895263672,
"step": 22
},
{
"epoch": 0.05202148713599095,
"grad_norm": 71.94913453042106,
"learning_rate": 4.0888888888888886e-07,
"logits/chosen": -0.8572225570678711,
"logits/rejected": -0.8356618881225586,
"logps/chosen": -1.5296409130096436,
"logps/rejected": -1.5351814031600952,
"loss": 5.7403,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -15.296407699584961,
"rewards/margins": 0.055405229330062866,
"rewards/rejected": -15.351814270019531,
"step": 23
},
{
"epoch": 0.0542832909245123,
"grad_norm": 53.61180470189225,
"learning_rate": 4.266666666666666e-07,
"logits/chosen": -0.8729988932609558,
"logits/rejected": -0.8457622528076172,
"logps/chosen": -1.371631383895874,
"logps/rejected": -1.4574888944625854,
"loss": 5.0383,
"rewards/accuracies": 0.578125,
"rewards/chosen": -13.716312408447266,
"rewards/margins": 0.8585769534111023,
"rewards/rejected": -14.574889183044434,
"step": 24
},
{
"epoch": 0.05654509471303364,
"grad_norm": 98.16261763681565,
"learning_rate": 4.4444444444444444e-07,
"logits/chosen": -0.862544596195221,
"logits/rejected": -0.8518227934837341,
"logps/chosen": -1.6941993236541748,
"logps/rejected": -1.677493691444397,
"loss": 5.7118,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -16.941993713378906,
"rewards/margins": -0.16705602407455444,
"rewards/rejected": -16.77493667602539,
"step": 25
},
{
"epoch": 0.05880689850155499,
"grad_norm": 94.80833058904163,
"learning_rate": 4.6222222222222214e-07,
"logits/chosen": -0.8756837844848633,
"logits/rejected": -0.8487232327461243,
"logps/chosen": -1.6833150386810303,
"logps/rejected": -1.6622823476791382,
"loss": 5.6915,
"rewards/accuracies": 0.546875,
"rewards/chosen": -16.83315086364746,
"rewards/margins": -0.21032753586769104,
"rewards/rejected": -16.622821807861328,
"step": 26
},
{
"epoch": 0.061068702290076333,
"grad_norm": 81.08001128654294,
"learning_rate": 4.8e-07,
"logits/chosen": -0.8789094686508179,
"logits/rejected": -0.8827879428863525,
"logps/chosen": -1.6698274612426758,
"logps/rejected": -1.5913212299346924,
"loss": 6.1429,
"rewards/accuracies": 0.515625,
"rewards/chosen": -16.698274612426758,
"rewards/margins": -0.7850615978240967,
"rewards/rejected": -15.913213729858398,
"step": 27
},
{
"epoch": 0.06333050607859768,
"grad_norm": 56.644003389915696,
"learning_rate": 4.977777777777777e-07,
"logits/chosen": -0.8799877166748047,
"logits/rejected": -0.870951235294342,
"logps/chosen": -1.5632425546646118,
"logps/rejected": -1.624694585800171,
"loss": 5.6969,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -15.632424354553223,
"rewards/margins": 0.6145212054252625,
"rewards/rejected": -16.246946334838867,
"step": 28
},
{
"epoch": 0.06559230986711903,
"grad_norm": 74.85771393756472,
"learning_rate": 5.155555555555556e-07,
"logits/chosen": -0.8934893608093262,
"logits/rejected": -0.8896267414093018,
"logps/chosen": -1.5920103788375854,
"logps/rejected": -1.6025701761245728,
"loss": 5.5342,
"rewards/accuracies": 0.484375,
"rewards/chosen": -15.920103073120117,
"rewards/margins": 0.10559805482625961,
"rewards/rejected": -16.02570152282715,
"step": 29
},
{
"epoch": 0.06785411365564037,
"grad_norm": 76.84594367688287,
"learning_rate": 5.333333333333332e-07,
"logits/chosen": -0.8594059944152832,
"logits/rejected": -0.8437649607658386,
"logps/chosen": -1.5912779569625854,
"logps/rejected": -1.6219682693481445,
"loss": 5.9684,
"rewards/accuracies": 0.515625,
"rewards/chosen": -15.912779808044434,
"rewards/margins": 0.3069048821926117,
"rewards/rejected": -16.219684600830078,
"step": 30
},
{
"epoch": 0.07011591744416172,
"grad_norm": 78.87037036995574,
"learning_rate": 5.511111111111111e-07,
"logits/chosen": -0.8989169001579285,
"logits/rejected": -0.88699871301651,
"logps/chosen": -1.577941656112671,
"logps/rejected": -1.548736572265625,
"loss": 5.6791,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.779415130615234,
"rewards/margins": -0.2920517921447754,
"rewards/rejected": -15.487363815307617,
"step": 31
},
{
"epoch": 0.07237772123268306,
"grad_norm": 70.30306886857991,
"learning_rate": 5.688888888888889e-07,
"logits/chosen": -0.9056158661842346,
"logits/rejected": -0.9041393399238586,
"logps/chosen": -1.5190542936325073,
"logps/rejected": -1.482797622680664,
"loss": 5.7225,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -15.190543174743652,
"rewards/margins": -0.36256617307662964,
"rewards/rejected": -14.827978134155273,
"step": 32
},
{
"epoch": 0.07463952502120441,
"grad_norm": 52.019953616790154,
"learning_rate": 5.866666666666666e-07,
"logits/chosen": -0.8431529998779297,
"logits/rejected": -0.8326103687286377,
"logps/chosen": -1.4754631519317627,
"logps/rejected": -1.6542396545410156,
"loss": 4.6444,
"rewards/accuracies": 0.609375,
"rewards/chosen": -14.754631042480469,
"rewards/margins": 1.787764072418213,
"rewards/rejected": -16.542396545410156,
"step": 33
},
{
"epoch": 0.07690132880972575,
"grad_norm": 51.76600092399858,
"learning_rate": 6.044444444444444e-07,
"logits/chosen": -0.8958278298377991,
"logits/rejected": -0.8506935834884644,
"logps/chosen": -1.4213745594024658,
"logps/rejected": -1.5557016134262085,
"loss": 4.5416,
"rewards/accuracies": 0.53125,
"rewards/chosen": -14.213743209838867,
"rewards/margins": 1.343271017074585,
"rewards/rejected": -15.557015419006348,
"step": 34
},
{
"epoch": 0.0791631325982471,
"grad_norm": 88.73341217553781,
"learning_rate": 6.222222222222223e-07,
"logits/chosen": -0.9224306344985962,
"logits/rejected": -0.8935542106628418,
"logps/chosen": -1.5873973369598389,
"logps/rejected": -1.7230992317199707,
"loss": 5.1296,
"rewards/accuracies": 0.546875,
"rewards/chosen": -15.87397289276123,
"rewards/margins": 1.3570194244384766,
"rewards/rejected": -17.230993270874023,
"step": 35
},
{
"epoch": 0.08142493638676845,
"grad_norm": 81.48522456937111,
"learning_rate": 6.4e-07,
"logits/chosen": -0.8501981496810913,
"logits/rejected": -0.8491517305374146,
"logps/chosen": -1.5095704793930054,
"logps/rejected": -1.6728523969650269,
"loss": 4.9032,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -15.095704078674316,
"rewards/margins": 1.632819652557373,
"rewards/rejected": -16.72852325439453,
"step": 36
},
{
"epoch": 0.08368674017528979,
"grad_norm": 63.09197425067475,
"learning_rate": 6.577777777777777e-07,
"logits/chosen": -0.8523389101028442,
"logits/rejected": -0.8278622627258301,
"logps/chosen": -1.3732750415802002,
"logps/rejected": -1.3724522590637207,
"loss": 5.2905,
"rewards/accuracies": 0.609375,
"rewards/chosen": -13.732749938964844,
"rewards/margins": -0.008226484060287476,
"rewards/rejected": -13.724522590637207,
"step": 37
},
{
"epoch": 0.08594854396381114,
"grad_norm": 73.86459203067565,
"learning_rate": 6.755555555555555e-07,
"logits/chosen": -0.9427972435951233,
"logits/rejected": -0.9414781332015991,
"logps/chosen": -1.5264731645584106,
"logps/rejected": -1.5371237993240356,
"loss": 5.2678,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.264732360839844,
"rewards/margins": 0.1065058782696724,
"rewards/rejected": -15.371236801147461,
"step": 38
},
{
"epoch": 0.08821034775233248,
"grad_norm": 73.25463666536884,
"learning_rate": 6.933333333333333e-07,
"logits/chosen": -0.926520586013794,
"logits/rejected": -0.9318759441375732,
"logps/chosen": -1.5116084814071655,
"logps/rejected": -1.524423360824585,
"loss": 5.1166,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -15.116085052490234,
"rewards/margins": 0.12814898788928986,
"rewards/rejected": -15.244234085083008,
"step": 39
},
{
"epoch": 0.09047215154085383,
"grad_norm": 47.01597449801661,
"learning_rate": 7.111111111111111e-07,
"logits/chosen": -0.8796355128288269,
"logits/rejected": -0.8566000461578369,
"logps/chosen": -1.3858391046524048,
"logps/rejected": -1.5868655443191528,
"loss": 4.0668,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -13.858390808105469,
"rewards/margins": 2.0102648735046387,
"rewards/rejected": -15.868656158447266,
"step": 40
},
{
"epoch": 0.09273395532937517,
"grad_norm": 38.92083571265448,
"learning_rate": 7.288888888888888e-07,
"logits/chosen": -0.9404792189598083,
"logits/rejected": -0.9076958894729614,
"logps/chosen": -1.3758432865142822,
"logps/rejected": -1.5328952074050903,
"loss": 4.4094,
"rewards/accuracies": 0.609375,
"rewards/chosen": -13.75843334197998,
"rewards/margins": 1.5705193281173706,
"rewards/rejected": -15.32895278930664,
"step": 41
},
{
"epoch": 0.09499575911789652,
"grad_norm": 68.82170323755115,
"learning_rate": 7.466666666666667e-07,
"logits/chosen": -0.8351485729217529,
"logits/rejected": -0.7955107092857361,
"logps/chosen": -1.487162709236145,
"logps/rejected": -1.5933465957641602,
"loss": 5.0699,
"rewards/accuracies": 0.546875,
"rewards/chosen": -14.871627807617188,
"rewards/margins": 1.0618385076522827,
"rewards/rejected": -15.933464050292969,
"step": 42
},
{
"epoch": 0.09725756290641786,
"grad_norm": 63.18032695061353,
"learning_rate": 7.644444444444444e-07,
"logits/chosen": -0.9111210703849792,
"logits/rejected": -0.8793379664421082,
"logps/chosen": -1.4616880416870117,
"logps/rejected": -1.5058850049972534,
"loss": 5.005,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -14.616881370544434,
"rewards/margins": 0.44196972250938416,
"rewards/rejected": -15.058850288391113,
"step": 43
},
{
"epoch": 0.09951936669493922,
"grad_norm": 48.381985529172866,
"learning_rate": 7.822222222222222e-07,
"logits/chosen": -0.8437673449516296,
"logits/rejected": -0.8208142518997192,
"logps/chosen": -1.3148137331008911,
"logps/rejected": -1.4531042575836182,
"loss": 4.179,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -13.148136138916016,
"rewards/margins": 1.38290536403656,
"rewards/rejected": -14.53104305267334,
"step": 44
},
{
"epoch": 0.10178117048346055,
"grad_norm": 38.81813502976088,
"learning_rate": 8e-07,
"logits/chosen": -0.9005692005157471,
"logits/rejected": -0.8871059417724609,
"logps/chosen": -1.3741270303726196,
"logps/rejected": -1.4992985725402832,
"loss": 4.5246,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -13.741270065307617,
"rewards/margins": 1.2517166137695312,
"rewards/rejected": -14.992988586425781,
"step": 45
},
{
"epoch": 0.1040429742719819,
"grad_norm": 89.05318759018981,
"learning_rate": 7.999874759018868e-07,
"logits/chosen": -0.9439775943756104,
"logits/rejected": -0.9177378416061401,
"logps/chosen": -1.6424872875213623,
"logps/rejected": -1.7683295011520386,
"loss": 4.7224,
"rewards/accuracies": 0.609375,
"rewards/chosen": -16.42487144470215,
"rewards/margins": 1.2584227323532104,
"rewards/rejected": -17.68329429626465,
"step": 46
},
{
"epoch": 0.10630477806050326,
"grad_norm": 43.17715042286116,
"learning_rate": 7.999499043918123e-07,
"logits/chosen": -0.934738278388977,
"logits/rejected": -0.9424084424972534,
"logps/chosen": -1.4421114921569824,
"logps/rejected": -1.5015398263931274,
"loss": 5.0113,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -14.421113967895508,
"rewards/margins": 0.5942831635475159,
"rewards/rejected": -15.015397071838379,
"step": 47
},
{
"epoch": 0.1085665818490246,
"grad_norm": 43.641796737833445,
"learning_rate": 7.998872878225228e-07,
"logits/chosen": -0.8617913722991943,
"logits/rejected": -0.8524473905563354,
"logps/chosen": -1.48220694065094,
"logps/rejected": -1.628198504447937,
"loss": 4.4078,
"rewards/accuracies": 0.578125,
"rewards/chosen": -14.822070121765137,
"rewards/margins": 1.4599149227142334,
"rewards/rejected": -16.281984329223633,
"step": 48
},
{
"epoch": 0.11082838563754595,
"grad_norm": 44.9928271242027,
"learning_rate": 7.997996301150987e-07,
"logits/chosen": -0.8672093152999878,
"logits/rejected": -0.8628696203231812,
"logps/chosen": -1.4041790962219238,
"logps/rejected": -1.5184260606765747,
"loss": 4.6114,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -14.041790962219238,
"rewards/margins": 1.142470121383667,
"rewards/rejected": -15.184259414672852,
"step": 49
},
{
"epoch": 0.11309018942606729,
"grad_norm": 48.332832486571874,
"learning_rate": 7.996869367587088e-07,
"logits/chosen": -0.836407482624054,
"logits/rejected": -0.8215224146842957,
"logps/chosen": -1.4828814268112183,
"logps/rejected": -1.5967737436294556,
"loss": 4.6077,
"rewards/accuracies": 0.625,
"rewards/chosen": -14.828814506530762,
"rewards/margins": 1.1389241218566895,
"rewards/rejected": -15.967738151550293,
"step": 50
},
{
"epoch": 0.11535199321458864,
"grad_norm": 42.46620935554636,
"learning_rate": 7.99549214810266e-07,
"logits/chosen": -0.8490492105484009,
"logits/rejected": -0.8362867832183838,
"logps/chosen": -1.4633221626281738,
"logps/rejected": -1.5538841485977173,
"loss": 4.5936,
"rewards/accuracies": 0.609375,
"rewards/chosen": -14.633221626281738,
"rewards/margins": 0.9056205153465271,
"rewards/rejected": -15.538841247558594,
"step": 51
},
{
"epoch": 0.11761379700310998,
"grad_norm": 46.23878461965418,
"learning_rate": 7.993864728939867e-07,
"logits/chosen": -0.8653365969657898,
"logits/rejected": -0.8207730650901794,
"logps/chosen": -1.4526644945144653,
"logps/rejected": -1.5614793300628662,
"loss": 4.8368,
"rewards/accuracies": 0.609375,
"rewards/chosen": -14.526643753051758,
"rewards/margins": 1.0881470441818237,
"rewards/rejected": -15.614792823791504,
"step": 52
},
{
"epoch": 0.11987560079163133,
"grad_norm": 50.746711219977314,
"learning_rate": 7.991987212008491e-07,
"logits/chosen": -0.8787316083908081,
"logits/rejected": -0.8544822931289673,
"logps/chosen": -1.524681568145752,
"logps/rejected": -1.7203454971313477,
"loss": 4.3884,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -15.246816635131836,
"rewards/margins": 1.9566391706466675,
"rewards/rejected": -17.203454971313477,
"step": 53
},
{
"epoch": 0.12213740458015267,
"grad_norm": 51.98936145640891,
"learning_rate": 7.989859714879565e-07,
"logits/chosen": -0.9071463346481323,
"logits/rejected": -0.8824944496154785,
"logps/chosen": -1.4744333028793335,
"logps/rejected": -1.5566731691360474,
"loss": 4.8618,
"rewards/accuracies": 0.5546875,
"rewards/chosen": -14.744333267211914,
"rewards/margins": 0.8223981261253357,
"rewards/rejected": -15.566731452941895,
"step": 54
},
{
"epoch": 0.12439920836867402,
"grad_norm": 44.93136877143668,
"learning_rate": 7.987482370778005e-07,
"logits/chosen": -0.8825117349624634,
"logits/rejected": -0.8596429824829102,
"logps/chosen": -1.500649094581604,
"logps/rejected": -1.6202951669692993,
"loss": 4.696,
"rewards/accuracies": 0.546875,
"rewards/chosen": -15.006490707397461,
"rewards/margins": 1.1964606046676636,
"rewards/rejected": -16.20294952392578,
"step": 55
},
{
"epoch": 0.12666101215719536,
"grad_norm": 57.11711572983479,
"learning_rate": 7.984855328574262e-07,
"logits/chosen": -0.748485803604126,
"logits/rejected": -0.7519769668579102,
"logps/chosen": -1.4509243965148926,
"logps/rejected": -1.5625280141830444,
"loss": 4.4574,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -14.50924301147461,
"rewards/margins": 1.1160372495651245,
"rewards/rejected": -15.625280380249023,
"step": 56
},
{
"epoch": 0.1289228159457167,
"grad_norm": 70.17018909190087,
"learning_rate": 7.981978752775009e-07,
"logits/chosen": -0.8194972276687622,
"logits/rejected": -0.8117552399635315,
"logps/chosen": -1.5257998704910278,
"logps/rejected": -1.6556179523468018,
"loss": 4.665,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -15.257999420166016,
"rewards/margins": 1.298180341720581,
"rewards/rejected": -16.55617904663086,
"step": 57
},
{
"epoch": 0.13118461973423806,
"grad_norm": 79.53233397371731,
"learning_rate": 7.978852823512833e-07,
"logits/chosen": -0.8595327138900757,
"logits/rejected": -0.8340020179748535,
"logps/chosen": -1.641236424446106,
"logps/rejected": -1.7583504915237427,
"loss": 4.9327,
"rewards/accuracies": 0.5859375,
"rewards/chosen": -16.412364959716797,
"rewards/margins": 1.1711419820785522,
"rewards/rejected": -17.583507537841797,
"step": 58
},
{
"epoch": 0.1334464235227594,
"grad_norm": 69.60803219031307,
"learning_rate": 7.975477736534957e-07,
"logits/chosen": -0.8586044907569885,
"logits/rejected": -0.8539649844169617,
"logps/chosen": -1.5942871570587158,
"logps/rejected": -1.808882474899292,
"loss": 4.3286,
"rewards/accuracies": 0.609375,
"rewards/chosen": -15.942872047424316,
"rewards/margins": 2.1459531784057617,
"rewards/rejected": -18.088825225830078,
"step": 59
},
{
"epoch": 0.13570822731128074,
"grad_norm": 62.64898369105346,
"learning_rate": 7.971853703190986e-07,
"logits/chosen": -0.8574209213256836,
"logits/rejected": -0.8404501080513,
"logps/chosen": -1.5743780136108398,
"logps/rejected": -1.7517762184143066,
"loss": 4.4022,
"rewards/accuracies": 0.6875,
"rewards/chosen": -15.743781089782715,
"rewards/margins": 1.773982048034668,
"rewards/rejected": -17.517763137817383,
"step": 60
},
{
"epoch": 0.1379700310998021,
"grad_norm": 59.54476826787708,
"learning_rate": 7.967980950419664e-07,
"logits/chosen": -0.8027121424674988,
"logits/rejected": -0.7864540815353394,
"logps/chosen": -1.5260121822357178,
"logps/rejected": -1.6937767267227173,
"loss": 4.4368,
"rewards/accuracies": 0.609375,
"rewards/chosen": -15.260122299194336,
"rewards/margins": 1.6776450872421265,
"rewards/rejected": -16.937767028808594,
"step": 61
},
{
"epoch": 0.14023183488832344,
"grad_norm": 54.120200448145056,
"learning_rate": 7.963859720734669e-07,
"logits/chosen": -0.8626559376716614,
"logits/rejected": -0.8548423051834106,
"logps/chosen": -1.4451175928115845,
"logps/rejected": -1.646138310432434,
"loss": 4.3155,
"rewards/accuracies": 0.59375,
"rewards/chosen": -14.451175689697266,
"rewards/margins": 2.0102078914642334,
"rewards/rejected": -16.461383819580078,
"step": 62
},
{
"epoch": 0.14249363867684478,
"grad_norm": 48.27367689289302,
"learning_rate": 7.959490272209427e-07,
"logits/chosen": -0.8522219061851501,
"logits/rejected": -0.8149221539497375,
"logps/chosen": -1.4842830896377563,
"logps/rejected": -1.741237998008728,
"loss": 3.8194,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -14.842830657958984,
"rewards/margins": 2.5695488452911377,
"rewards/rejected": -17.41238021850586,
"step": 63
},
{
"epoch": 0.14475544246536612,
"grad_norm": 47.53970538406451,
"learning_rate": 7.954872878460946e-07,
"logits/chosen": -0.8807967901229858,
"logits/rejected": -0.8453354835510254,
"logps/chosen": -1.5572218894958496,
"logps/rejected": -1.7726668119430542,
"loss": 4.0647,
"rewards/accuracies": 0.671875,
"rewards/chosen": -15.572219848632812,
"rewards/margins": 2.154447555541992,
"rewards/rejected": -17.726669311523438,
"step": 64
},
{
"epoch": 0.14701724625388748,
"grad_norm": 61.35510962861716,
"learning_rate": 7.950007828632691e-07,
"logits/chosen": -0.8250374794006348,
"logits/rejected": -0.820457935333252,
"logps/chosen": -1.607496738433838,
"logps/rejected": -1.8878852128982544,
"loss": 4.0136,
"rewards/accuracies": 0.640625,
"rewards/chosen": -16.074966430664062,
"rewards/margins": 2.8038859367370605,
"rewards/rejected": -18.87885284423828,
"step": 65
},
{
"epoch": 0.14927905004240882,
"grad_norm": 54.05120684424973,
"learning_rate": 7.944895427376465e-07,
"logits/chosen": -0.8387467861175537,
"logits/rejected": -0.8197423219680786,
"logps/chosen": -1.613673210144043,
"logps/rejected": -1.8641467094421387,
"loss": 4.1501,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -16.13673210144043,
"rewards/margins": 2.5047359466552734,
"rewards/rejected": -18.641468048095703,
"step": 66
},
{
"epoch": 0.15154085383093016,
"grad_norm": 43.4329226476985,
"learning_rate": 7.939535994833345e-07,
"logits/chosen": -0.80382239818573,
"logits/rejected": -0.7954918742179871,
"logps/chosen": -1.4918571710586548,
"logps/rejected": -1.7582557201385498,
"loss": 4.0382,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -14.918570518493652,
"rewards/margins": 2.663985013961792,
"rewards/rejected": -17.582555770874023,
"step": 67
},
{
"epoch": 0.1538026576194515,
"grad_norm": 58.50135003917604,
"learning_rate": 7.933929866613628e-07,
"logits/chosen": -0.8193422555923462,
"logits/rejected": -0.8229498863220215,
"logps/chosen": -1.5523847341537476,
"logps/rejected": -1.6850162744522095,
"loss": 4.511,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -15.523847579956055,
"rewards/margins": 1.32631516456604,
"rewards/rejected": -16.850162506103516,
"step": 68
},
{
"epoch": 0.15606446140797287,
"grad_norm": 52.388333685361864,
"learning_rate": 7.928077393775808e-07,
"logits/chosen": -0.8074467778205872,
"logits/rejected": -0.8173753619194031,
"logps/chosen": -1.5872104167938232,
"logps/rejected": -1.9215919971466064,
"loss": 3.605,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -15.87210464477539,
"rewards/margins": 3.3438150882720947,
"rewards/rejected": -19.215919494628906,
"step": 69
},
{
"epoch": 0.1583262651964942,
"grad_norm": 70.46015135129937,
"learning_rate": 7.921978942804609e-07,
"logits/chosen": -0.7921926975250244,
"logits/rejected": -0.7895167469978333,
"logps/chosen": -1.5997159481048584,
"logps/rejected": -1.855806827545166,
"loss": 3.9852,
"rewards/accuracies": 0.640625,
"rewards/chosen": -15.997159004211426,
"rewards/margins": 2.5609097480773926,
"rewards/rejected": -18.558067321777344,
"step": 70
},
{
"epoch": 0.16058806898501554,
"grad_norm": 66.5782725292864,
"learning_rate": 7.915634895588021e-07,
"logits/chosen": -0.8188354969024658,
"logits/rejected": -0.803663969039917,
"logps/chosen": -1.694320797920227,
"logps/rejected": -1.8535633087158203,
"loss": 4.5753,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -16.943206787109375,
"rewards/margins": 1.5924267768859863,
"rewards/rejected": -18.535634994506836,
"step": 71
},
{
"epoch": 0.1628498727735369,
"grad_norm": 63.98536775928476,
"learning_rate": 7.909045649393394e-07,
"logits/chosen": -0.8593119382858276,
"logits/rejected": -0.8650994896888733,
"logps/chosen": -1.585839033126831,
"logps/rejected": -1.7022672891616821,
"loss": 4.5482,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -15.858390808105469,
"rewards/margins": 1.1642816066741943,
"rewards/rejected": -17.02267074584961,
"step": 72
},
{
"epoch": 0.16511167656205825,
"grad_norm": 47.71315634678256,
"learning_rate": 7.902211616842556e-07,
"logits/chosen": -0.8264446258544922,
"logits/rejected": -0.8236741423606873,
"logps/chosen": -1.623077154159546,
"logps/rejected": -1.879746437072754,
"loss": 4.1393,
"rewards/accuracies": 0.640625,
"rewards/chosen": -16.23077392578125,
"rewards/margins": 2.566693067550659,
"rewards/rejected": -18.79746437072754,
"step": 73
},
{
"epoch": 0.16737348035057958,
"grad_norm": 70.53123827391246,
"learning_rate": 7.89513322588598e-07,
"logits/chosen": -0.808039665222168,
"logits/rejected": -0.7966674566268921,
"logps/chosen": -1.592429757118225,
"logps/rejected": -1.8032734394073486,
"loss": 3.9256,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -15.924297332763672,
"rewards/margins": 2.108438014984131,
"rewards/rejected": -18.03273582458496,
"step": 74
},
{
"epoch": 0.16963528413910092,
"grad_norm": 56.68159659271728,
"learning_rate": 7.887810919775976e-07,
"logits/chosen": -0.7461099028587341,
"logits/rejected": -0.7355799674987793,
"logps/chosen": -1.6924803256988525,
"logps/rejected": -1.9031829833984375,
"loss": 4.0589,
"rewards/accuracies": 0.65625,
"rewards/chosen": -16.9248046875,
"rewards/margins": 2.107023239135742,
"rewards/rejected": -19.031827926635742,
"step": 75
},
{
"epoch": 0.1718970879276223,
"grad_norm": 42.71145723908974,
"learning_rate": 7.880245157038949e-07,
"logits/chosen": -0.8165091276168823,
"logits/rejected": -0.793809175491333,
"logps/chosen": -1.688427448272705,
"logps/rejected": -1.9064791202545166,
"loss": 4.0899,
"rewards/accuracies": 0.609375,
"rewards/chosen": -16.884273529052734,
"rewards/margins": 2.180520534515381,
"rewards/rejected": -19.064794540405273,
"step": 76
},
{
"epoch": 0.17415889171614363,
"grad_norm": 80.58036409049882,
"learning_rate": 7.872436411446671e-07,
"logits/chosen": -0.836346447467804,
"logits/rejected": -0.8506262302398682,
"logps/chosen": -1.7576085329055786,
"logps/rejected": -1.920924186706543,
"loss": 4.5954,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -17.57608413696289,
"rewards/margins": 1.6331558227539062,
"rewards/rejected": -19.209239959716797,
"step": 77
},
{
"epoch": 0.17642069550466496,
"grad_norm": 46.381702188392666,
"learning_rate": 7.86438517198662e-07,
"logits/chosen": -0.780924379825592,
"logits/rejected": -0.767948567867279,
"logps/chosen": -1.650989294052124,
"logps/rejected": -1.8504787683486938,
"loss": 4.2658,
"rewards/accuracies": 0.625,
"rewards/chosen": -16.5098934173584,
"rewards/margins": 1.9948934316635132,
"rewards/rejected": -18.50478744506836,
"step": 78
},
{
"epoch": 0.1786824992931863,
"grad_norm": 56.20092121621141,
"learning_rate": 7.856091942831366e-07,
"logits/chosen": -0.7430872321128845,
"logits/rejected": -0.7503747940063477,
"logps/chosen": -1.644688367843628,
"logps/rejected": -1.8490663766860962,
"loss": 4.517,
"rewards/accuracies": 0.625,
"rewards/chosen": -16.446882247924805,
"rewards/margins": 2.0437800884246826,
"rewards/rejected": -18.49066162109375,
"step": 79
},
{
"epoch": 0.18094430308170767,
"grad_norm": 52.81332760090688,
"learning_rate": 7.847557243306982e-07,
"logits/chosen": -0.8418252468109131,
"logits/rejected": -0.8341580629348755,
"logps/chosen": -1.6995246410369873,
"logps/rejected": -1.9185855388641357,
"loss": 4.0086,
"rewards/accuracies": 0.671875,
"rewards/chosen": -16.9952449798584,
"rewards/margins": 2.190608024597168,
"rewards/rejected": -19.18585205078125,
"step": 80
},
{
"epoch": 0.183206106870229,
"grad_norm": 56.02862880172386,
"learning_rate": 7.838781607860541e-07,
"logits/chosen": -0.8196614980697632,
"logits/rejected": -0.8126786947250366,
"logps/chosen": -1.7471215724945068,
"logps/rejected": -1.9539873600006104,
"loss": 3.7371,
"rewards/accuracies": 0.65625,
"rewards/chosen": -17.471214294433594,
"rewards/margins": 2.068657398223877,
"rewards/rejected": -19.539873123168945,
"step": 81
},
{
"epoch": 0.18546791065875035,
"grad_norm": 54.48859910947903,
"learning_rate": 7.82976558602664e-07,
"logits/chosen": -0.8580424785614014,
"logits/rejected": -0.8641104102134705,
"logps/chosen": -1.7102807760238647,
"logps/rejected": -1.8986783027648926,
"loss": 4.2118,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -17.102807998657227,
"rewards/margins": 1.8839763402938843,
"rewards/rejected": -18.986783981323242,
"step": 82
},
{
"epoch": 0.1877297144472717,
"grad_norm": 78.04437890690556,
"learning_rate": 7.820509742392988e-07,
"logits/chosen": -0.8468527793884277,
"logits/rejected": -0.8453028202056885,
"logps/chosen": -1.8543328046798706,
"logps/rejected": -2.0150225162506104,
"loss": 4.3218,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -18.54332733154297,
"rewards/margins": 1.6068973541259766,
"rewards/rejected": -20.150224685668945,
"step": 83
},
{
"epoch": 0.18999151823579305,
"grad_norm": 88.82175847045283,
"learning_rate": 7.811014656565054e-07,
"logits/chosen": -0.8449732661247253,
"logits/rejected": -0.815599799156189,
"logps/chosen": -1.738198161125183,
"logps/rejected": -2.115600347518921,
"loss": 3.5074,
"rewards/accuracies": 0.703125,
"rewards/chosen": -17.381982803344727,
"rewards/margins": 3.7740225791931152,
"rewards/rejected": -21.156005859375,
"step": 84
},
{
"epoch": 0.1922533220243144,
"grad_norm": 63.74399903736426,
"learning_rate": 7.801280923129773e-07,
"logits/chosen": -0.8337980508804321,
"logits/rejected": -0.8294973969459534,
"logps/chosen": -1.8116644620895386,
"logps/rejected": -1.987363338470459,
"loss": 4.549,
"rewards/accuracies": 0.6015625,
"rewards/chosen": -18.11664581298828,
"rewards/margins": 1.756988763809204,
"rewards/rejected": -19.873632431030273,
"step": 85
},
{
"epoch": 0.19451512581283573,
"grad_norm": 77.0826088582988,
"learning_rate": 7.791309151618305e-07,
"logits/chosen": -0.8380694389343262,
"logits/rejected": -0.8311120271682739,
"logps/chosen": -1.9478144645690918,
"logps/rejected": -2.143031597137451,
"loss": 4.2291,
"rewards/accuracies": 0.5703125,
"rewards/chosen": -19.478145599365234,
"rewards/margins": 1.9521695375442505,
"rewards/rejected": -21.430315017700195,
"step": 86
},
{
"epoch": 0.1967769296013571,
"grad_norm": 53.07834075055144,
"learning_rate": 7.781099966467874e-07,
"logits/chosen": -0.8639700412750244,
"logits/rejected": -0.8545355200767517,
"logps/chosen": -1.727626919746399,
"logps/rejected": -1.8995643854141235,
"loss": 3.8778,
"rewards/accuracies": 0.6875,
"rewards/chosen": -17.276269912719727,
"rewards/margins": 1.7193742990493774,
"rewards/rejected": -18.995643615722656,
"step": 87
},
{
"epoch": 0.19903873338987843,
"grad_norm": 82.55613961122098,
"learning_rate": 7.770654006982664e-07,
"logits/chosen": -0.8509809374809265,
"logits/rejected": -0.8106420040130615,
"logps/chosen": -2.0078237056732178,
"logps/rejected": -2.231494426727295,
"loss": 4.4582,
"rewards/accuracies": 0.671875,
"rewards/chosen": -20.078235626220703,
"rewards/margins": 2.2367055416107178,
"rewards/rejected": -22.31494140625,
"step": 88
},
{
"epoch": 0.20130053717839977,
"grad_norm": 72.58396338245271,
"learning_rate": 7.759971927293781e-07,
"logits/chosen": -0.8639533519744873,
"logits/rejected": -0.8477087616920471,
"logps/chosen": -1.8459900617599487,
"logps/rejected": -2.0477120876312256,
"loss": 4.1424,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -18.459901809692383,
"rewards/margins": 2.0172195434570312,
"rewards/rejected": -20.477121353149414,
"step": 89
},
{
"epoch": 0.2035623409669211,
"grad_norm": 63.790702792270665,
"learning_rate": 7.749054396318297e-07,
"logits/chosen": -0.839960515499115,
"logits/rejected": -0.8227687478065491,
"logps/chosen": -1.9486068487167358,
"logps/rejected": -2.1220030784606934,
"loss": 4.2943,
"rewards/accuracies": 0.625,
"rewards/chosen": -19.486068725585938,
"rewards/margins": 1.733961582183838,
"rewards/rejected": -21.220029830932617,
"step": 90
},
{
"epoch": 0.20582414475544247,
"grad_norm": 77.99250593212055,
"learning_rate": 7.737902097717356e-07,
"logits/chosen": -0.8212487101554871,
"logits/rejected": -0.8338538408279419,
"logps/chosen": -1.863584280014038,
"logps/rejected": -2.1549441814422607,
"loss": 4.1322,
"rewards/accuracies": 0.609375,
"rewards/chosen": -18.635841369628906,
"rewards/margins": 2.913600444793701,
"rewards/rejected": -21.549442291259766,
"step": 91
},
{
"epoch": 0.2080859485439638,
"grad_norm": 72.8959596074702,
"learning_rate": 7.726515729853367e-07,
"logits/chosen": -0.8232444524765015,
"logits/rejected": -0.819841742515564,
"logps/chosen": -1.8698346614837646,
"logps/rejected": -2.029289484024048,
"loss": 4.5166,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -18.698345184326172,
"rewards/margins": 1.5945475101470947,
"rewards/rejected": -20.29289436340332,
"step": 92
},
{
"epoch": 0.21034775233248515,
"grad_norm": 87.04210388064935,
"learning_rate": 7.714896005746272e-07,
"logits/chosen": -0.8586671948432922,
"logits/rejected": -0.8418205976486206,
"logps/chosen": -1.9241650104522705,
"logps/rejected": -2.2153499126434326,
"loss": 3.6986,
"rewards/accuracies": 0.6875,
"rewards/chosen": -19.24165153503418,
"rewards/margins": 2.9118492603302,
"rewards/rejected": -22.153499603271484,
"step": 93
},
{
"epoch": 0.21260955612100652,
"grad_norm": 111.68532775800007,
"learning_rate": 7.703043653028896e-07,
"logits/chosen": -0.8883798122406006,
"logits/rejected": -0.8768050670623779,
"logps/chosen": -2.1463255882263184,
"logps/rejected": -2.3561160564422607,
"loss": 4.2491,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -21.463254928588867,
"rewards/margins": 2.097905158996582,
"rewards/rejected": -23.561161041259766,
"step": 94
},
{
"epoch": 0.21487135990952785,
"grad_norm": 73.84778954625673,
"learning_rate": 7.690959413901379e-07,
"logits/chosen": -0.8396280407905579,
"logits/rejected": -0.8113132119178772,
"logps/chosen": -2.0055348873138428,
"logps/rejected": -2.264613628387451,
"loss": 4.0505,
"rewards/accuracies": 0.671875,
"rewards/chosen": -20.055349349975586,
"rewards/margins": 2.590789318084717,
"rewards/rejected": -22.646137237548828,
"step": 95
},
{
"epoch": 0.2171331636980492,
"grad_norm": 76.75066956482912,
"learning_rate": 7.678644045084704e-07,
"logits/chosen": -0.7979358434677124,
"logits/rejected": -0.8126614093780518,
"logps/chosen": -1.901458978652954,
"logps/rejected": -2.1271097660064697,
"loss": 4.3121,
"rewards/accuracies": 0.59375,
"rewards/chosen": -19.014589309692383,
"rewards/margins": 2.2565088272094727,
"rewards/rejected": -21.271099090576172,
"step": 96
},
{
"epoch": 0.21939496748657053,
"grad_norm": 69.10947756524247,
"learning_rate": 7.666098317773308e-07,
"logits/chosen": -0.8482003211975098,
"logits/rejected": -0.8479557037353516,
"logps/chosen": -2.065002918243408,
"logps/rejected": -2.2882683277130127,
"loss": 3.7783,
"rewards/accuracies": 0.671875,
"rewards/chosen": -20.650028228759766,
"rewards/margins": 2.2326550483703613,
"rewards/rejected": -22.8826847076416,
"step": 97
},
{
"epoch": 0.2216567712750919,
"grad_norm": 71.38081973867735,
"learning_rate": 7.653323017586789e-07,
"logits/chosen": -0.8626989722251892,
"logits/rejected": -0.8502533435821533,
"logps/chosen": -1.871101975440979,
"logps/rejected": -2.0896551609039307,
"loss": 3.8377,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -18.711015701293945,
"rewards/margins": 2.1855337619781494,
"rewards/rejected": -20.89655303955078,
"step": 98
},
{
"epoch": 0.22391857506361323,
"grad_norm": 80.27831480952293,
"learning_rate": 7.640318944520711e-07,
"logits/chosen": -0.8602339029312134,
"logits/rejected": -0.8497695922851562,
"logps/chosen": -2.0520148277282715,
"logps/rejected": -2.2807064056396484,
"loss": 3.7873,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -20.52014923095703,
"rewards/margins": 2.286914825439453,
"rewards/rejected": -22.80706214904785,
"step": 99
},
{
"epoch": 0.22618037885213457,
"grad_norm": 82.83162144065287,
"learning_rate": 7.627086912896511e-07,
"logits/chosen": -0.755748987197876,
"logits/rejected": -0.7821561098098755,
"logps/chosen": -1.9375782012939453,
"logps/rejected": -2.1892411708831787,
"loss": 3.7342,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -19.37578010559082,
"rewards/margins": 2.516632318496704,
"rewards/rejected": -21.892412185668945,
"step": 100
},
{
"epoch": 0.2284421826406559,
"grad_norm": 61.630906980874535,
"learning_rate": 7.613627751310499e-07,
"logits/chosen": -0.8867424726486206,
"logits/rejected": -0.8885044455528259,
"logps/chosen": -2.1041259765625,
"logps/rejected": -2.3255014419555664,
"loss": 3.658,
"rewards/accuracies": 0.65625,
"rewards/chosen": -21.041257858276367,
"rewards/margins": 2.2137553691864014,
"rewards/rejected": -23.255016326904297,
"step": 101
},
{
"epoch": 0.23070398642917728,
"grad_norm": 89.96142120049903,
"learning_rate": 7.599942302581977e-07,
"logits/chosen": -0.8578089475631714,
"logits/rejected": -0.8603122234344482,
"logps/chosen": -2.110222816467285,
"logps/rejected": -2.432941198348999,
"loss": 3.5255,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -21.10222816467285,
"rewards/margins": 3.227184295654297,
"rewards/rejected": -24.329410552978516,
"step": 102
},
{
"epoch": 0.23296579021769862,
"grad_norm": 76.3035369590703,
"learning_rate": 7.586031423700457e-07,
"logits/chosen": -0.8419609069824219,
"logits/rejected": -0.8390515446662903,
"logps/chosen": -2.08290958404541,
"logps/rejected": -2.321464776992798,
"loss": 3.9243,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -20.8290958404541,
"rewards/margins": 2.3855514526367188,
"rewards/rejected": -23.214645385742188,
"step": 103
},
{
"epoch": 0.23522759400621995,
"grad_norm": 73.90826312722105,
"learning_rate": 7.571895985772e-07,
"logits/chosen": -0.8009840846061707,
"logits/rejected": -0.8087509870529175,
"logps/chosen": -2.1095006465911865,
"logps/rejected": -2.4837350845336914,
"loss": 3.27,
"rewards/accuracies": 0.71875,
"rewards/chosen": -21.095006942749023,
"rewards/margins": 3.7423441410064697,
"rewards/rejected": -24.837350845336914,
"step": 104
},
{
"epoch": 0.23748939779474132,
"grad_norm": 90.91119808796685,
"learning_rate": 7.557536873964661e-07,
"logits/chosen": -0.8794471025466919,
"logits/rejected": -0.8741526007652283,
"logps/chosen": -2.432756185531616,
"logps/rejected": -2.654453992843628,
"loss": 4.2395,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -24.327564239501953,
"rewards/margins": 2.2169761657714844,
"rewards/rejected": -26.544538497924805,
"step": 105
},
{
"epoch": 0.23975120158326266,
"grad_norm": 72.09150920561058,
"learning_rate": 7.542954987453069e-07,
"logits/chosen": -0.8508340716362,
"logits/rejected": -0.8528013825416565,
"logps/chosen": -2.29594087600708,
"logps/rejected": -2.5421571731567383,
"loss": 3.7295,
"rewards/accuracies": 0.703125,
"rewards/chosen": -22.959407806396484,
"rewards/margins": 2.462160348892212,
"rewards/rejected": -25.421571731567383,
"step": 106
},
{
"epoch": 0.242013005371784,
"grad_norm": 92.05218134624694,
"learning_rate": 7.528151239362108e-07,
"logits/chosen": -0.8492079377174377,
"logits/rejected": -0.8603383898735046,
"logps/chosen": -2.424321174621582,
"logps/rejected": -2.706427812576294,
"loss": 3.7107,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -24.243213653564453,
"rewards/margins": 2.8210651874542236,
"rewards/rejected": -27.064279556274414,
"step": 107
},
{
"epoch": 0.24427480916030533,
"grad_norm": 87.963517642823,
"learning_rate": 7.513126556709748e-07,
"logits/chosen": -0.8548307418823242,
"logits/rejected": -0.8365832567214966,
"logps/chosen": -2.367842197418213,
"logps/rejected": -2.776707172393799,
"loss": 3.2256,
"rewards/accuracies": 0.703125,
"rewards/chosen": -23.678422927856445,
"rewards/margins": 4.088651657104492,
"rewards/rejected": -27.767070770263672,
"step": 108
},
{
"epoch": 0.2465366129488267,
"grad_norm": 104.59173334061525,
"learning_rate": 7.497881880348984e-07,
"logits/chosen": -0.8096323013305664,
"logits/rejected": -0.7972118854522705,
"logps/chosen": -2.4743218421936035,
"logps/rejected": -2.755343437194824,
"loss": 3.9171,
"rewards/accuracies": 0.671875,
"rewards/chosen": -24.74321937561035,
"rewards/margins": 2.8102121353149414,
"rewards/rejected": -27.55343246459961,
"step": 109
},
{
"epoch": 0.24879841673734804,
"grad_norm": 164.02635203828626,
"learning_rate": 7.482418164908931e-07,
"logits/chosen": -0.8311317563056946,
"logits/rejected": -0.8270904421806335,
"logps/chosen": -2.616877794265747,
"logps/rejected": -2.835850477218628,
"loss": 4.1572,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -26.168777465820312,
"rewards/margins": 2.1897284984588623,
"rewards/rejected": -28.358505249023438,
"step": 110
},
{
"epoch": 0.2510602205258694,
"grad_norm": 91.04185257653407,
"learning_rate": 7.466736378735035e-07,
"logits/chosen": -0.8114999532699585,
"logits/rejected": -0.813449501991272,
"logps/chosen": -2.6687309741973877,
"logps/rejected": -3.0298876762390137,
"loss": 3.5266,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -26.68730926513672,
"rewards/margins": 3.6115658283233643,
"rewards/rejected": -30.298873901367188,
"step": 111
},
{
"epoch": 0.2533220243143907,
"grad_norm": 103.42000228406864,
"learning_rate": 7.450837503828439e-07,
"logits/chosen": -0.7962609529495239,
"logits/rejected": -0.7812699675559998,
"logps/chosen": -2.829650640487671,
"logps/rejected": -3.2509734630584717,
"loss": 3.4804,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.296504974365234,
"rewards/margins": 4.213228702545166,
"rewards/rejected": -32.509735107421875,
"step": 112
},
{
"epoch": 0.2555838281029121,
"grad_norm": 79.3801670576343,
"learning_rate": 7.43472253578449e-07,
"logits/chosen": -0.792640745639801,
"logits/rejected": -0.7946760654449463,
"logps/chosen": -2.4934792518615723,
"logps/rejected": -2.861074447631836,
"loss": 3.6492,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -24.934791564941406,
"rewards/margins": 3.675952434539795,
"rewards/rejected": -28.61074447631836,
"step": 113
},
{
"epoch": 0.2578456318914334,
"grad_norm": 84.41846931366052,
"learning_rate": 7.418392483730389e-07,
"logits/chosen": -0.8063937425613403,
"logits/rejected": -0.8131504058837891,
"logps/chosen": -2.685357093811035,
"logps/rejected": -3.0498905181884766,
"loss": 3.312,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -26.85357093811035,
"rewards/margins": 3.6453330516815186,
"rewards/rejected": -30.4989013671875,
"step": 114
},
{
"epoch": 0.26010743567995476,
"grad_norm": 83.25446388909985,
"learning_rate": 7.401848370262012e-07,
"logits/chosen": -0.8394590020179749,
"logits/rejected": -0.8276815414428711,
"logps/chosen": -2.709625244140625,
"logps/rejected": -2.9904394149780273,
"loss": 3.5602,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -27.096254348754883,
"rewards/margins": 2.808140754699707,
"rewards/rejected": -29.904394149780273,
"step": 115
},
{
"epoch": 0.2623692394684761,
"grad_norm": 99.34909208476857,
"learning_rate": 7.385091231379856e-07,
"logits/chosen": -0.8177067041397095,
"logits/rejected": -0.8186391592025757,
"logps/chosen": -2.9124531745910645,
"logps/rejected": -3.2890923023223877,
"loss": 3.7712,
"rewards/accuracies": 0.703125,
"rewards/chosen": -29.124530792236328,
"rewards/margins": 3.7663931846618652,
"rewards/rejected": -32.89092254638672,
"step": 116
},
{
"epoch": 0.26463104325699743,
"grad_norm": 196.05646746243912,
"learning_rate": 7.368122116424182e-07,
"logits/chosen": -0.7795528769493103,
"logits/rejected": -0.7889488935470581,
"logps/chosen": -2.8638410568237305,
"logps/rejected": -3.2152295112609863,
"loss": 3.873,
"rewards/accuracies": 0.65625,
"rewards/chosen": -28.63841438293457,
"rewards/margins": 3.5138840675354004,
"rewards/rejected": -32.15229797363281,
"step": 117
},
{
"epoch": 0.2668928470455188,
"grad_norm": 123.65267983469268,
"learning_rate": 7.350942088009289e-07,
"logits/chosen": -0.8420966863632202,
"logits/rejected": -0.8411574363708496,
"logps/chosen": -2.9619340896606445,
"logps/rejected": -3.260565996170044,
"loss": 3.5104,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -29.619338989257812,
"rewards/margins": 2.9863200187683105,
"rewards/rejected": -32.60565948486328,
"step": 118
},
{
"epoch": 0.26915465083404017,
"grad_norm": 124.41958542086248,
"learning_rate": 7.333552221956986e-07,
"logits/chosen": -0.9466845393180847,
"logits/rejected": -0.9294576644897461,
"logps/chosen": -3.071857213973999,
"logps/rejected": -3.4685423374176025,
"loss": 3.8289,
"rewards/accuracies": 0.703125,
"rewards/chosen": -30.718576431274414,
"rewards/margins": 3.9668469429016113,
"rewards/rejected": -34.6854248046875,
"step": 119
},
{
"epoch": 0.2714164546225615,
"grad_norm": 95.20752031494493,
"learning_rate": 7.315953607229217e-07,
"logits/chosen": -0.8441572785377502,
"logits/rejected": -0.8446038961410522,
"logps/chosen": -3.104475498199463,
"logps/rejected": -3.505401611328125,
"loss": 3.32,
"rewards/accuracies": 0.703125,
"rewards/chosen": -31.04475212097168,
"rewards/margins": 4.009262561798096,
"rewards/rejected": -35.05401611328125,
"step": 120
},
{
"epoch": 0.27367825841108284,
"grad_norm": 106.09440015221676,
"learning_rate": 7.298147345859869e-07,
"logits/chosen": -0.8386214375495911,
"logits/rejected": -0.8599450588226318,
"logps/chosen": -2.900517463684082,
"logps/rejected": -3.2684617042541504,
"loss": 3.5845,
"rewards/accuracies": 0.71875,
"rewards/chosen": -29.00517463684082,
"rewards/margins": 3.6794400215148926,
"rewards/rejected": -32.68461608886719,
"step": 121
},
{
"epoch": 0.2759400621996042,
"grad_norm": 160.41836210136088,
"learning_rate": 7.280134552885762e-07,
"logits/chosen": -0.8167920112609863,
"logits/rejected": -0.8117007613182068,
"logps/chosen": -2.9862632751464844,
"logps/rejected": -3.363959789276123,
"loss": 3.5251,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -29.86263084411621,
"rewards/margins": 3.776963710784912,
"rewards/rejected": -33.63959884643555,
"step": 122
},
{
"epoch": 0.2782018659881255,
"grad_norm": 99.5977130216733,
"learning_rate": 7.261916356276831e-07,
"logits/chosen": -0.8167967796325684,
"logits/rejected": -0.8092377185821533,
"logps/chosen": -3.0471675395965576,
"logps/rejected": -3.5443172454833984,
"loss": 2.9443,
"rewards/accuracies": 0.765625,
"rewards/chosen": -30.47167205810547,
"rewards/margins": 4.971498489379883,
"rewards/rejected": -35.443172454833984,
"step": 123
},
{
"epoch": 0.2804636697766469,
"grad_norm": 98.66168594344816,
"learning_rate": 7.243493896865486e-07,
"logits/chosen": -0.8218358755111694,
"logits/rejected": -0.8053916096687317,
"logps/chosen": -2.7801990509033203,
"logps/rejected": -3.0717597007751465,
"loss": 3.5017,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -27.801990509033203,
"rewards/margins": 2.9156064987182617,
"rewards/rejected": -30.71759796142578,
"step": 124
},
{
"epoch": 0.2827254735651682,
"grad_norm": 114.74683317574737,
"learning_rate": 7.224868328275169e-07,
"logits/chosen": -0.8093046545982361,
"logits/rejected": -0.805727481842041,
"logps/chosen": -2.894709348678589,
"logps/rejected": -3.254554033279419,
"loss": 3.7134,
"rewards/accuracies": 0.6875,
"rewards/chosen": -28.947093963623047,
"rewards/margins": 3.5984435081481934,
"rewards/rejected": -32.54553985595703,
"step": 125
},
{
"epoch": 0.28498727735368956,
"grad_norm": 194.3294016359615,
"learning_rate": 7.206040816848126e-07,
"logits/chosen": -0.8390508890151978,
"logits/rejected": -0.8263464570045471,
"logps/chosen": -3.065004825592041,
"logps/rejected": -3.259084939956665,
"loss": 4.415,
"rewards/accuracies": 0.671875,
"rewards/chosen": -30.650049209594727,
"rewards/margins": 1.9408013820648193,
"rewards/rejected": -32.590850830078125,
"step": 126
},
{
"epoch": 0.2872490811422109,
"grad_norm": 120.3073948329199,
"learning_rate": 7.187012541572356e-07,
"logits/chosen": -0.905938446521759,
"logits/rejected": -0.8901224136352539,
"logps/chosen": -3.1450047492980957,
"logps/rejected": -3.485504627227783,
"loss": 3.7657,
"rewards/accuracies": 0.671875,
"rewards/chosen": -31.450042724609375,
"rewards/margins": 3.4050049781799316,
"rewards/rejected": -34.85504913330078,
"step": 127
},
{
"epoch": 0.28951088493073224,
"grad_norm": 137.9411831080582,
"learning_rate": 7.167784694007791e-07,
"logits/chosen": -0.8116433620452881,
"logits/rejected": -0.8170086741447449,
"logps/chosen": -3.0017967224121094,
"logps/rejected": -3.353875160217285,
"loss": 3.7386,
"rewards/accuracies": 0.703125,
"rewards/chosen": -30.017967224121094,
"rewards/margins": 3.5207817554473877,
"rewards/rejected": -33.53874969482422,
"step": 128
},
{
"epoch": 0.2917726887192536,
"grad_norm": 110.61290745803281,
"learning_rate": 7.148358478211682e-07,
"logits/chosen": -0.8747140169143677,
"logits/rejected": -0.8586560487747192,
"logps/chosen": -3.0916569232940674,
"logps/rejected": -3.517625331878662,
"loss": 2.9442,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -30.91657066345215,
"rewards/margins": 4.259681701660156,
"rewards/rejected": -35.17625045776367,
"step": 129
},
{
"epoch": 0.29403449250777497,
"grad_norm": 84.56951029170779,
"learning_rate": 7.128735110663187e-07,
"logits/chosen": -0.8497614860534668,
"logits/rejected": -0.8194679617881775,
"logps/chosen": -2.7605772018432617,
"logps/rejected": -3.1635406017303467,
"loss": 3.4017,
"rewards/accuracies": 0.6875,
"rewards/chosen": -27.605772018432617,
"rewards/margins": 4.029631614685059,
"rewards/rejected": -31.63540267944336,
"step": 130
},
{
"epoch": 0.2962962962962963,
"grad_norm": 112.17591295821964,
"learning_rate": 7.108915820187211e-07,
"logits/chosen": -0.8097432255744934,
"logits/rejected": -0.8088663816452026,
"logps/chosen": -3.1781814098358154,
"logps/rejected": -3.545646905899048,
"loss": 3.8459,
"rewards/accuracies": 0.6875,
"rewards/chosen": -31.781814575195312,
"rewards/margins": 3.6746530532836914,
"rewards/rejected": -35.45646667480469,
"step": 131
},
{
"epoch": 0.29855810008481765,
"grad_norm": 120.89188876376829,
"learning_rate": 7.088901847877447e-07,
"logits/chosen": -0.7971144914627075,
"logits/rejected": -0.7930186986923218,
"logps/chosen": -3.015921115875244,
"logps/rejected": -3.2458338737487793,
"loss": 4.6776,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -30.159210205078125,
"rewards/margins": 2.2991273403167725,
"rewards/rejected": -32.458335876464844,
"step": 132
},
{
"epoch": 0.300819903873339,
"grad_norm": 123.91186702122735,
"learning_rate": 7.068694447018658e-07,
"logits/chosen": -0.8384436964988708,
"logits/rejected": -0.846354603767395,
"logps/chosen": -3.0088987350463867,
"logps/rejected": -3.410034656524658,
"loss": 3.3848,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -30.088991165161133,
"rewards/margins": 4.011353492736816,
"rewards/rejected": -34.100341796875,
"step": 133
},
{
"epoch": 0.3030817076618603,
"grad_norm": 98.50507786273067,
"learning_rate": 7.048294883008199e-07,
"logits/chosen": -0.8138392567634583,
"logits/rejected": -0.8176507353782654,
"logps/chosen": -2.9301810264587402,
"logps/rejected": -3.2924013137817383,
"loss": 3.3271,
"rewards/accuracies": 0.78125,
"rewards/chosen": -29.30181121826172,
"rewards/margins": 3.6222026348114014,
"rewards/rejected": -32.924015045166016,
"step": 134
},
{
"epoch": 0.3053435114503817,
"grad_norm": 80.23471164611576,
"learning_rate": 7.027704433276776e-07,
"logits/chosen": -0.7829840183258057,
"logits/rejected": -0.7787750363349915,
"logps/chosen": -2.9372944831848145,
"logps/rejected": -3.404730796813965,
"loss": 3.1137,
"rewards/accuracies": 0.75,
"rewards/chosen": -29.37294578552246,
"rewards/margins": 4.674362659454346,
"rewards/rejected": -34.047306060791016,
"step": 135
},
{
"epoch": 0.307605315238903,
"grad_norm": 118.34531582043013,
"learning_rate": 7.006924387208452e-07,
"logits/chosen": -0.7873696088790894,
"logits/rejected": -0.7685777544975281,
"logps/chosen": -2.834895610809326,
"logps/rejected": -3.1748013496398926,
"loss": 3.3333,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -28.348957061767578,
"rewards/margins": 3.3990557193756104,
"rewards/rejected": -31.74801254272461,
"step": 136
},
{
"epoch": 0.30986711902742436,
"grad_norm": 95.16445157429489,
"learning_rate": 6.985956046059904e-07,
"logits/chosen": -0.7679412961006165,
"logits/rejected": -0.7677736878395081,
"logps/chosen": -2.7393550872802734,
"logps/rejected": -3.1393771171569824,
"loss": 3.5669,
"rewards/accuracies": 0.703125,
"rewards/chosen": -27.393552780151367,
"rewards/margins": 4.000221252441406,
"rewards/rejected": -31.39377212524414,
"step": 137
},
{
"epoch": 0.31212892281594573,
"grad_norm": 82.62714852037034,
"learning_rate": 6.964800722878945e-07,
"logits/chosen": -0.724460244178772,
"logits/rejected": -0.721074104309082,
"logps/chosen": -2.9422450065612793,
"logps/rejected": -3.427375555038452,
"loss": 3.0358,
"rewards/accuracies": 0.765625,
"rewards/chosen": -29.42245101928711,
"rewards/margins": 4.85130500793457,
"rewards/rejected": -34.27375411987305,
"step": 138
},
{
"epoch": 0.31439072660446704,
"grad_norm": 88.6570876412774,
"learning_rate": 6.943459742422287e-07,
"logits/chosen": -0.75481116771698,
"logits/rejected": -0.7281723022460938,
"logps/chosen": -2.926560401916504,
"logps/rejected": -3.3593132495880127,
"loss": 3.6077,
"rewards/accuracies": 0.703125,
"rewards/chosen": -29.265605926513672,
"rewards/margins": 4.327524662017822,
"rewards/rejected": -33.59313201904297,
"step": 139
},
{
"epoch": 0.3166525303929884,
"grad_norm": 97.82195429096225,
"learning_rate": 6.921934441072597e-07,
"logits/chosen": -0.800703227519989,
"logits/rejected": -0.8018285036087036,
"logps/chosen": -3.131863594055176,
"logps/rejected": -3.468980073928833,
"loss": 3.8994,
"rewards/accuracies": 0.6875,
"rewards/chosen": -31.318635940551758,
"rewards/margins": 3.3711633682250977,
"rewards/rejected": -34.68980026245117,
"step": 140
},
{
"epoch": 0.3189143341815098,
"grad_norm": 130.70910880053893,
"learning_rate": 6.900226166754807e-07,
"logits/chosen": -0.7732895016670227,
"logits/rejected": -0.7903754115104675,
"logps/chosen": -3.2161002159118652,
"logps/rejected": -3.525928497314453,
"loss": 3.9513,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -32.1609992980957,
"rewards/margins": 3.0982844829559326,
"rewards/rejected": -35.2592887878418,
"step": 141
},
{
"epoch": 0.3211761379700311,
"grad_norm": 94.8619719166109,
"learning_rate": 6.8783362788517e-07,
"logits/chosen": -0.7802690267562866,
"logits/rejected": -0.781207263469696,
"logps/chosen": -3.136306047439575,
"logps/rejected": -3.542205333709717,
"loss": 4.0634,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -31.363059997558594,
"rewards/margins": 4.058990478515625,
"rewards/rejected": -35.422054290771484,
"step": 142
},
{
"epoch": 0.32343794175855245,
"grad_norm": 96.17049011345333,
"learning_rate": 6.856266148118796e-07,
"logits/chosen": -0.7571829557418823,
"logits/rejected": -0.7646656036376953,
"logps/chosen": -2.8659865856170654,
"logps/rejected": -3.3385119438171387,
"loss": 3.2572,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.659862518310547,
"rewards/margins": 4.725252151489258,
"rewards/rejected": -33.38511657714844,
"step": 143
},
{
"epoch": 0.3256997455470738,
"grad_norm": 110.19796175719054,
"learning_rate": 6.834017156598512e-07,
"logits/chosen": -0.7483683228492737,
"logits/rejected": -0.7414959073066711,
"logps/chosen": -3.109438419342041,
"logps/rejected": -3.5427985191345215,
"loss": 3.4928,
"rewards/accuracies": 0.703125,
"rewards/chosen": -31.09438133239746,
"rewards/margins": 4.333602428436279,
"rewards/rejected": -35.427982330322266,
"step": 144
},
{
"epoch": 0.3279615493355951,
"grad_norm": 79.78726439178003,
"learning_rate": 6.811590697533607e-07,
"logits/chosen": -0.8195265531539917,
"logits/rejected": -0.838479220867157,
"logps/chosen": -3.01442813873291,
"logps/rejected": -3.3907887935638428,
"loss": 3.5141,
"rewards/accuracies": 0.765625,
"rewards/chosen": -30.144283294677734,
"rewards/margins": 3.7636024951934814,
"rewards/rejected": -33.90788650512695,
"step": 145
},
{
"epoch": 0.3302233531241165,
"grad_norm": 151.81377404607898,
"learning_rate": 6.788988175279951e-07,
"logits/chosen": -0.7769032120704651,
"logits/rejected": -0.7602939605712891,
"logps/chosen": -3.0373010635375977,
"logps/rejected": -3.367269515991211,
"loss": 4.0091,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -30.373010635375977,
"rewards/margins": 3.299685001373291,
"rewards/rejected": -33.672698974609375,
"step": 146
},
{
"epoch": 0.3324851569126378,
"grad_norm": 93.15629599355084,
"learning_rate": 6.766211005218577e-07,
"logits/chosen": -0.7618966698646545,
"logits/rejected": -0.7614046931266785,
"logps/chosen": -3.0041072368621826,
"logps/rejected": -3.5381112098693848,
"loss": 3.0388,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -30.04107093811035,
"rewards/margins": 5.340038299560547,
"rewards/rejected": -35.38111114501953,
"step": 147
},
{
"epoch": 0.33474696070115917,
"grad_norm": 93.03406880519788,
"learning_rate": 6.743260613667047e-07,
"logits/chosen": -0.8518264889717102,
"logits/rejected": -0.8462361693382263,
"logps/chosen": -2.939993143081665,
"logps/rejected": -3.3589367866516113,
"loss": 3.4992,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -29.399932861328125,
"rewards/margins": 4.18943452835083,
"rewards/rejected": -33.58937072753906,
"step": 148
},
{
"epoch": 0.33700876448968053,
"grad_norm": 73.40361139697784,
"learning_rate": 6.720138437790139e-07,
"logits/chosen": -0.8052965998649597,
"logits/rejected": -0.7937459945678711,
"logps/chosen": -2.8842406272888184,
"logps/rejected": -3.307342529296875,
"loss": 3.1965,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -28.8424015045166,
"rewards/margins": 4.23102331161499,
"rewards/rejected": -33.07342529296875,
"step": 149
},
{
"epoch": 0.33927056827820185,
"grad_norm": 106.95490507848946,
"learning_rate": 6.696845925509848e-07,
"logits/chosen": -0.8310205936431885,
"logits/rejected": -0.8272488713264465,
"logps/chosen": -2.934943437576294,
"logps/rejected": -3.2793498039245605,
"loss": 3.6164,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -29.349435806274414,
"rewards/margins": 3.444066047668457,
"rewards/rejected": -32.79349899291992,
"step": 150
},
{
"epoch": 0.3415323720667232,
"grad_norm": 100.186884927208,
"learning_rate": 6.673384535414718e-07,
"logits/chosen": -0.8480186462402344,
"logits/rejected": -0.8310289978981018,
"logps/chosen": -3.0514492988586426,
"logps/rejected": -3.3322248458862305,
"loss": 3.9173,
"rewards/accuracies": 0.671875,
"rewards/chosen": -30.514493942260742,
"rewards/margins": 2.8077542781829834,
"rewards/rejected": -33.32224655151367,
"step": 151
},
{
"epoch": 0.3437941758552446,
"grad_norm": 100.75576403172839,
"learning_rate": 6.649755736668511e-07,
"logits/chosen": -0.7694522738456726,
"logits/rejected": -0.7615189552307129,
"logps/chosen": -2.6866354942321777,
"logps/rejected": -3.11234188079834,
"loss": 2.8237,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -26.866352081298828,
"rewards/margins": 4.257061958312988,
"rewards/rejected": -31.123416900634766,
"step": 152
},
{
"epoch": 0.3460559796437659,
"grad_norm": 117.9327947173104,
"learning_rate": 6.625961008918192e-07,
"logits/chosen": -0.7936750054359436,
"logits/rejected": -0.7835503220558167,
"logps/chosen": -2.7540676593780518,
"logps/rejected": -3.2012295722961426,
"loss": 2.9183,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -27.540678024291992,
"rewards/margins": 4.471617221832275,
"rewards/rejected": -32.01229476928711,
"step": 153
},
{
"epoch": 0.34831778343228725,
"grad_norm": 96.1956695260042,
"learning_rate": 6.602001842201289e-07,
"logits/chosen": -0.7796362042427063,
"logits/rejected": -0.7905425429344177,
"logps/chosen": -2.7750422954559326,
"logps/rejected": -3.049340009689331,
"loss": 4.0488,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -27.75042152404785,
"rewards/margins": 2.74297833442688,
"rewards/rejected": -30.49340057373047,
"step": 154
},
{
"epoch": 0.3505795872208086,
"grad_norm": 101.24392958865374,
"learning_rate": 6.577879736852571e-07,
"logits/chosen": -0.8088594079017639,
"logits/rejected": -0.8155099749565125,
"logps/chosen": -2.821500301361084,
"logps/rejected": -3.0726358890533447,
"loss": 3.9081,
"rewards/accuracies": 0.65625,
"rewards/chosen": -28.215003967285156,
"rewards/margins": 2.5113601684570312,
"rewards/rejected": -30.726364135742188,
"step": 155
},
{
"epoch": 0.35284139100932993,
"grad_norm": 73.2743068340787,
"learning_rate": 6.553596203410112e-07,
"logits/chosen": -0.8153470754623413,
"logits/rejected": -0.8048913478851318,
"logps/chosen": -2.7679576873779297,
"logps/rejected": -3.2883174419403076,
"loss": 2.546,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -27.679576873779297,
"rewards/margins": 5.203596115112305,
"rewards/rejected": -32.883174896240234,
"step": 156
},
{
"epoch": 0.3551031947978513,
"grad_norm": 82.3012728094315,
"learning_rate": 6.529152762520688e-07,
"logits/chosen": -0.8138669729232788,
"logits/rejected": -0.8138793110847473,
"logps/chosen": -2.864006757736206,
"logps/rejected": -3.2064666748046875,
"loss": 3.5646,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -28.64006805419922,
"rewards/margins": 3.424598217010498,
"rewards/rejected": -32.06466293334961,
"step": 157
},
{
"epoch": 0.3573649985863726,
"grad_norm": 100.03152867926248,
"learning_rate": 6.504550944844558e-07,
"logits/chosen": -0.7475910782814026,
"logits/rejected": -0.7779514789581299,
"logps/chosen": -2.7473607063293457,
"logps/rejected": -3.1302061080932617,
"loss": 3.452,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -27.47360610961914,
"rewards/margins": 3.8284525871276855,
"rewards/rejected": -31.302059173583984,
"step": 158
},
{
"epoch": 0.359626802374894,
"grad_norm": 105.22720404522045,
"learning_rate": 6.479792290959613e-07,
"logits/chosen": -0.7691587209701538,
"logits/rejected": -0.7878850698471069,
"logps/chosen": -2.8018503189086914,
"logps/rejected": -3.312527656555176,
"loss": 3.2183,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.01850128173828,
"rewards/margins": 5.106773853302002,
"rewards/rejected": -33.125274658203125,
"step": 159
},
{
"epoch": 0.36188860616341534,
"grad_norm": 89.61616128866521,
"learning_rate": 6.454878351264906e-07,
"logits/chosen": -0.7589330673217773,
"logits/rejected": -0.745934009552002,
"logps/chosen": -2.6822848320007324,
"logps/rejected": -3.0995330810546875,
"loss": 3.4046,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -26.822847366333008,
"rewards/margins": 4.172482490539551,
"rewards/rejected": -30.995328903198242,
"step": 160
},
{
"epoch": 0.36415040995193665,
"grad_norm": 96.61923230400093,
"learning_rate": 6.429810685883565e-07,
"logits/chosen": -0.8186591267585754,
"logits/rejected": -0.82514488697052,
"logps/chosen": -2.8654110431671143,
"logps/rejected": -3.2399096488952637,
"loss": 3.238,
"rewards/accuracies": 0.765625,
"rewards/chosen": -28.65411376953125,
"rewards/margins": 3.7449822425842285,
"rewards/rejected": -32.39909362792969,
"step": 161
},
{
"epoch": 0.366412213740458,
"grad_norm": 109.68200048681109,
"learning_rate": 6.404590864565088e-07,
"logits/chosen": -0.7650143504142761,
"logits/rejected": -0.7517848014831543,
"logps/chosen": -2.817117214202881,
"logps/rejected": -3.050743341445923,
"loss": 3.9095,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -28.171171188354492,
"rewards/margins": 2.3362598419189453,
"rewards/rejected": -30.507431030273438,
"step": 162
},
{
"epoch": 0.3686740175289794,
"grad_norm": 103.61475684738596,
"learning_rate": 6.379220466587063e-07,
"logits/chosen": -0.7960351705551147,
"logits/rejected": -0.7686564922332764,
"logps/chosen": -2.810275077819824,
"logps/rejected": -3.18802809715271,
"loss": 3.246,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -28.10274887084961,
"rewards/margins": 3.777529239654541,
"rewards/rejected": -31.880279541015625,
"step": 163
},
{
"epoch": 0.3709358213175007,
"grad_norm": 119.25022849905575,
"learning_rate": 6.353701080656254e-07,
"logits/chosen": -0.7721018195152283,
"logits/rejected": -0.7901967763900757,
"logps/chosen": -2.9517931938171387,
"logps/rejected": -3.250936985015869,
"loss": 3.6435,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -29.517929077148438,
"rewards/margins": 2.9914417266845703,
"rewards/rejected": -32.50937271118164,
"step": 164
},
{
"epoch": 0.37319762510602206,
"grad_norm": 106.89469142388397,
"learning_rate": 6.32803430480913e-07,
"logits/chosen": -0.7933882474899292,
"logits/rejected": -0.785070538520813,
"logps/chosen": -2.898366689682007,
"logps/rejected": -3.3264529705047607,
"loss": 3.3983,
"rewards/accuracies": 0.75,
"rewards/chosen": -28.98366928100586,
"rewards/margins": 4.2808613777160645,
"rewards/rejected": -33.264530181884766,
"step": 165
},
{
"epoch": 0.3754594288945434,
"grad_norm": 90.5066177326131,
"learning_rate": 6.302221746311782e-07,
"logits/chosen": -0.8018909096717834,
"logits/rejected": -0.7745494246482849,
"logps/chosen": -2.7565348148345947,
"logps/rejected": -3.1439740657806396,
"loss": 3.4065,
"rewards/accuracies": 0.703125,
"rewards/chosen": -27.565349578857422,
"rewards/margins": 3.874392032623291,
"rewards/rejected": -31.439741134643555,
"step": 166
},
{
"epoch": 0.37772123268306473,
"grad_norm": 86.55925111562716,
"learning_rate": 6.276265021559288e-07,
"logits/chosen": -0.8132920861244202,
"logits/rejected": -0.8016676306724548,
"logps/chosen": -2.9889135360717773,
"logps/rejected": -3.338178873062134,
"loss": 3.5752,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -29.88913345336914,
"rewards/margins": 3.4926562309265137,
"rewards/rejected": -33.38179016113281,
"step": 167
},
{
"epoch": 0.3799830364715861,
"grad_norm": 73.35269687730333,
"learning_rate": 6.250165755974487e-07,
"logits/chosen": -0.757270336151123,
"logits/rejected": -0.7608906626701355,
"logps/chosen": -2.9331836700439453,
"logps/rejected": -3.337920904159546,
"loss": 3.1407,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -29.33183479309082,
"rewards/margins": 4.047374725341797,
"rewards/rejected": -33.37921142578125,
"step": 168
},
{
"epoch": 0.3822448402601074,
"grad_norm": 86.91929641048736,
"learning_rate": 6.223925583906192e-07,
"logits/chosen": -0.8268415331840515,
"logits/rejected": -0.8238467574119568,
"logps/chosen": -3.028745174407959,
"logps/rejected": -3.4829823970794678,
"loss": 2.8551,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -30.287450790405273,
"rewards/margins": 4.5423736572265625,
"rewards/rejected": -34.8298225402832,
"step": 169
},
{
"epoch": 0.3845066440486288,
"grad_norm": 85.76060468853915,
"learning_rate": 6.19754614852685e-07,
"logits/chosen": -0.8132871985435486,
"logits/rejected": -0.8051560521125793,
"logps/chosen": -2.942837953567505,
"logps/rejected": -3.3470842838287354,
"loss": 3.0876,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -29.428375244140625,
"rewards/margins": 4.04246711730957,
"rewards/rejected": -33.470848083496094,
"step": 170
},
{
"epoch": 0.38676844783715014,
"grad_norm": 101.76248326578423,
"learning_rate": 6.171029101729644e-07,
"logits/chosen": -0.733617901802063,
"logits/rejected": -0.7339813709259033,
"logps/chosen": -3.1949334144592285,
"logps/rejected": -3.661990165710449,
"loss": 3.1671,
"rewards/accuracies": 0.734375,
"rewards/chosen": -31.94933319091797,
"rewards/margins": 4.670570373535156,
"rewards/rejected": -36.61989974975586,
"step": 171
},
{
"epoch": 0.38903025162567145,
"grad_norm": 113.9018020285417,
"learning_rate": 6.144376104025055e-07,
"logits/chosen": -0.8161033987998962,
"logits/rejected": -0.8007526993751526,
"logps/chosen": -3.0398004055023193,
"logps/rejected": -3.4840195178985596,
"loss": 3.1426,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -30.398000717163086,
"rewards/margins": 4.442192554473877,
"rewards/rejected": -34.84019470214844,
"step": 172
},
{
"epoch": 0.3912920554141928,
"grad_norm": 109.66608111755836,
"learning_rate": 6.117588824436873e-07,
"logits/chosen": -0.8302391171455383,
"logits/rejected": -0.8384109735488892,
"logps/chosen": -3.1871933937072754,
"logps/rejected": -3.547971487045288,
"loss": 3.7869,
"rewards/accuracies": 0.6875,
"rewards/chosen": -31.87193489074707,
"rewards/margins": 3.607778549194336,
"rewards/rejected": -35.47970962524414,
"step": 173
},
{
"epoch": 0.3935538592027142,
"grad_norm": 97.34693909102697,
"learning_rate": 6.090668940397688e-07,
"logits/chosen": -0.7868531942367554,
"logits/rejected": -0.7912797331809998,
"logps/chosen": -3.1741623878479004,
"logps/rejected": -3.6076653003692627,
"loss": 3.2467,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -31.741622924804688,
"rewards/margins": 4.3350300788879395,
"rewards/rejected": -36.07665252685547,
"step": 174
},
{
"epoch": 0.3958156629912355,
"grad_norm": 112.75426033024542,
"learning_rate": 6.063618137643844e-07,
"logits/chosen": -0.7921246290206909,
"logits/rejected": -0.78474360704422,
"logps/chosen": -3.200976610183716,
"logps/rejected": -3.6109395027160645,
"loss": 3.3634,
"rewards/accuracies": 0.71875,
"rewards/chosen": -32.009769439697266,
"rewards/margins": 4.099628925323486,
"rewards/rejected": -36.109397888183594,
"step": 175
},
{
"epoch": 0.39807746677975686,
"grad_norm": 113.32746124615062,
"learning_rate": 6.03643811010988e-07,
"logits/chosen": -0.8276042938232422,
"logits/rejected": -0.8417137265205383,
"logps/chosen": -3.3886866569519043,
"logps/rejected": -3.824484348297119,
"loss": 3.0549,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -33.886863708496094,
"rewards/margins": 4.357980728149414,
"rewards/rejected": -38.24484634399414,
"step": 176
},
{
"epoch": 0.4003392705682782,
"grad_norm": 115.44790456463876,
"learning_rate": 6.009130559822453e-07,
"logits/chosen": -0.8511748909950256,
"logits/rejected": -0.8455148339271545,
"logps/chosen": -3.3521997928619385,
"logps/rejected": -3.621072292327881,
"loss": 4.1968,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -33.521995544433594,
"rewards/margins": 2.6887285709381104,
"rewards/rejected": -36.210723876953125,
"step": 177
},
{
"epoch": 0.40260107435679954,
"grad_norm": 108.10466917985082,
"learning_rate": 5.981697196793758e-07,
"logits/chosen": -0.8837382793426514,
"logits/rejected": -0.8865911364555359,
"logps/chosen": -3.501465082168579,
"logps/rejected": -3.9723386764526367,
"loss": 2.5792,
"rewards/accuracies": 0.78125,
"rewards/chosen": -35.0146484375,
"rewards/margins": 4.708735466003418,
"rewards/rejected": -39.723388671875,
"step": 178
},
{
"epoch": 0.4048628781453209,
"grad_norm": 113.75351199007196,
"learning_rate": 5.954139738914446e-07,
"logits/chosen": -0.8577677607536316,
"logits/rejected": -0.869698703289032,
"logps/chosen": -3.4370806217193604,
"logps/rejected": -3.856444835662842,
"loss": 3.4991,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -34.37080764770508,
"rewards/margins": 4.193644046783447,
"rewards/rejected": -38.564453125,
"step": 179
},
{
"epoch": 0.4071246819338422,
"grad_norm": 105.83270319758616,
"learning_rate": 5.92645991184605e-07,
"logits/chosen": -0.8364049792289734,
"logits/rejected": -0.8378995656967163,
"logps/chosen": -3.5678813457489014,
"logps/rejected": -4.040313720703125,
"loss": 2.9427,
"rewards/accuracies": 0.765625,
"rewards/chosen": -35.67881393432617,
"rewards/margins": 4.724322319030762,
"rewards/rejected": -40.403133392333984,
"step": 180
},
{
"epoch": 0.4093864857223636,
"grad_norm": 97.36394616930042,
"learning_rate": 5.898659448912917e-07,
"logits/chosen": -0.8220387697219849,
"logits/rejected": -0.8456301689147949,
"logps/chosen": -3.3772408962249756,
"logps/rejected": -3.840843915939331,
"loss": 3.2422,
"rewards/accuracies": 0.75,
"rewards/chosen": -33.77241134643555,
"rewards/margins": 4.636030197143555,
"rewards/rejected": -38.40843963623047,
"step": 181
},
{
"epoch": 0.41164828951088495,
"grad_norm": 119.36449481585935,
"learning_rate": 5.870740090993676e-07,
"logits/chosen": -0.8707118034362793,
"logits/rejected": -0.8762695789337158,
"logps/chosen": -3.7325124740600586,
"logps/rejected": -4.200765609741211,
"loss": 3.0099,
"rewards/accuracies": 0.78125,
"rewards/chosen": -37.32512664794922,
"rewards/margins": 4.68253755569458,
"rewards/rejected": -42.007659912109375,
"step": 182
},
{
"epoch": 0.41391009329940626,
"grad_norm": 112.59179043910055,
"learning_rate": 5.842703586412214e-07,
"logits/chosen": -0.8712838888168335,
"logits/rejected": -0.8757526278495789,
"logps/chosen": -3.783506393432617,
"logps/rejected": -4.1643571853637695,
"loss": 3.9002,
"rewards/accuracies": 0.6875,
"rewards/chosen": -37.83506774902344,
"rewards/margins": 3.8085036277770996,
"rewards/rejected": -41.6435661315918,
"step": 183
},
{
"epoch": 0.4161718970879276,
"grad_norm": 129.66104630601248,
"learning_rate": 5.814551690828203e-07,
"logits/chosen": -0.8484200239181519,
"logits/rejected": -0.861181378364563,
"logps/chosen": -3.624147653579712,
"logps/rejected": -4.078574180603027,
"loss": 2.9549,
"rewards/accuracies": 0.75,
"rewards/chosen": -36.241477966308594,
"rewards/margins": 4.544264793395996,
"rewards/rejected": -40.785743713378906,
"step": 184
},
{
"epoch": 0.418433700876449,
"grad_norm": 123.33358190542005,
"learning_rate": 5.786286167127155e-07,
"logits/chosen": -0.8848705291748047,
"logits/rejected": -0.8773502111434937,
"logps/chosen": -3.5523104667663574,
"logps/rejected": -4.035827159881592,
"loss": 3.2787,
"rewards/accuracies": 0.765625,
"rewards/chosen": -35.523101806640625,
"rewards/margins": 4.83516788482666,
"rewards/rejected": -40.35826873779297,
"step": 185
},
{
"epoch": 0.4206955046649703,
"grad_norm": 105.18151417237235,
"learning_rate": 5.757908785310031e-07,
"logits/chosen": -0.812483012676239,
"logits/rejected": -0.8327686786651611,
"logps/chosen": -3.3677561283111572,
"logps/rejected": -3.8305165767669678,
"loss": 3.089,
"rewards/accuracies": 0.78125,
"rewards/chosen": -33.67756271362305,
"rewards/margins": 4.627603530883789,
"rewards/rejected": -38.3051643371582,
"step": 186
},
{
"epoch": 0.42295730845349166,
"grad_norm": 112.39088354266822,
"learning_rate": 5.729421322382399e-07,
"logits/chosen": -0.8071901202201843,
"logits/rejected": -0.8371500372886658,
"logps/chosen": -3.244313955307007,
"logps/rejected": -3.724259376525879,
"loss": 3.1866,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -32.443138122558594,
"rewards/margins": 4.799454212188721,
"rewards/rejected": -37.242591857910156,
"step": 187
},
{
"epoch": 0.42521911224201303,
"grad_norm": 99.91697305071902,
"learning_rate": 5.700825562243163e-07,
"logits/chosen": -0.7996731996536255,
"logits/rejected": -0.8074153065681458,
"logps/chosen": -3.3295788764953613,
"logps/rejected": -3.8264358043670654,
"loss": 3.0297,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -33.29579162597656,
"rewards/margins": 4.968564510345459,
"rewards/rejected": -38.26435089111328,
"step": 188
},
{
"epoch": 0.42748091603053434,
"grad_norm": 106.43222768263621,
"learning_rate": 5.672123295572854e-07,
"logits/chosen": -0.8531807661056519,
"logits/rejected": -0.8710072636604309,
"logps/chosen": -3.4436635971069336,
"logps/rejected": -3.7774899005889893,
"loss": 3.2074,
"rewards/accuracies": 0.734375,
"rewards/chosen": -34.43663787841797,
"rewards/margins": 3.3382644653320312,
"rewards/rejected": -37.774898529052734,
"step": 189
},
{
"epoch": 0.4297427198190557,
"grad_norm": 101.78216988587263,
"learning_rate": 5.643316319721487e-07,
"logits/chosen": -0.834848940372467,
"logits/rejected": -0.8536701798439026,
"logps/chosen": -3.5879836082458496,
"logps/rejected": -3.99747896194458,
"loss": 3.6465,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -35.87983703613281,
"rewards/margins": 4.094951152801514,
"rewards/rejected": -39.97479248046875,
"step": 190
},
{
"epoch": 0.432004523607577,
"grad_norm": 102.01291002558317,
"learning_rate": 5.614406438596026e-07,
"logits/chosen": -0.8791413307189941,
"logits/rejected": -0.8761864900588989,
"logps/chosen": -3.594583511352539,
"logps/rejected": -4.055732727050781,
"loss": 3.5126,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -35.945831298828125,
"rewards/margins": 4.611495018005371,
"rewards/rejected": -40.55733108520508,
"step": 191
},
{
"epoch": 0.4342663273960984,
"grad_norm": 112.72220193442307,
"learning_rate": 5.585395462547406e-07,
"logits/chosen": -0.8375272154808044,
"logits/rejected": -0.8324666023254395,
"logps/chosen": -3.421821117401123,
"logps/rejected": -3.717869758605957,
"loss": 3.8101,
"rewards/accuracies": 0.65625,
"rewards/chosen": -34.21821212768555,
"rewards/margins": 2.9604828357696533,
"rewards/rejected": -37.17869186401367,
"step": 192
},
{
"epoch": 0.43652813118461975,
"grad_norm": 99.3082505357167,
"learning_rate": 5.55628520825718e-07,
"logits/chosen": -0.908355712890625,
"logits/rejected": -0.9252756237983704,
"logps/chosen": -3.4431350231170654,
"logps/rejected": -3.812532424926758,
"loss": 3.5263,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -34.43135070800781,
"rewards/margins": 3.6939783096313477,
"rewards/rejected": -38.125328063964844,
"step": 193
},
{
"epoch": 0.43878993497314106,
"grad_norm": 107.13322204576244,
"learning_rate": 5.527077498623752e-07,
"logits/chosen": -0.8578076958656311,
"logits/rejected": -0.8740971088409424,
"logps/chosen": -3.3862037658691406,
"logps/rejected": -3.792330741882324,
"loss": 3.1196,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -33.862037658691406,
"rewards/margins": 4.0612711906433105,
"rewards/rejected": -37.92330551147461,
"step": 194
},
{
"epoch": 0.4410517387616624,
"grad_norm": 85.34410402793644,
"learning_rate": 5.497774162648228e-07,
"logits/chosen": -0.8335473537445068,
"logits/rejected": -0.8551528453826904,
"logps/chosen": -3.339934825897217,
"logps/rejected": -3.8869519233703613,
"loss": 3.1318,
"rewards/accuracies": 0.75,
"rewards/chosen": -33.39934539794922,
"rewards/margins": 5.470172882080078,
"rewards/rejected": -38.86952209472656,
"step": 195
},
{
"epoch": 0.4433135425501838,
"grad_norm": 112.04047145787284,
"learning_rate": 5.468377035319882e-07,
"logits/chosen": -0.8870958089828491,
"logits/rejected": -0.8841900825500488,
"logps/chosen": -3.344312906265259,
"logps/rejected": -3.845787763595581,
"loss": 3.2742,
"rewards/accuracies": 0.6875,
"rewards/chosen": -33.44313049316406,
"rewards/margins": 5.014750003814697,
"rewards/rejected": -38.457881927490234,
"step": 196
},
{
"epoch": 0.4455753463387051,
"grad_norm": 108.93166440182182,
"learning_rate": 5.438887957501248e-07,
"logits/chosen": -0.7933169603347778,
"logits/rejected": -0.7912404537200928,
"logps/chosen": -3.3394107818603516,
"logps/rejected": -3.764794111251831,
"loss": 3.0992,
"rewards/accuracies": 0.71875,
"rewards/chosen": -33.39411163330078,
"rewards/margins": 4.253833293914795,
"rewards/rejected": -37.64794158935547,
"step": 197
},
{
"epoch": 0.44783715012722647,
"grad_norm": 132.62161789111477,
"learning_rate": 5.409308775812844e-07,
"logits/chosen": -0.8376902341842651,
"logits/rejected": -0.8406752347946167,
"logps/chosen": -3.4705710411071777,
"logps/rejected": -3.8878021240234375,
"loss": 3.5095,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -34.705711364746094,
"rewards/margins": 4.172308444976807,
"rewards/rejected": -38.878021240234375,
"step": 198
},
{
"epoch": 0.45009895391574783,
"grad_norm": 105.66558509816933,
"learning_rate": 5.379641342517541e-07,
"logits/chosen": -0.8948197960853577,
"logits/rejected": -0.8918160200119019,
"logps/chosen": -3.276104211807251,
"logps/rejected": -3.7821552753448486,
"loss": 3.1998,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -32.76103973388672,
"rewards/margins": 5.060507297515869,
"rewards/rejected": -37.82155227661133,
"step": 199
},
{
"epoch": 0.45236075770426915,
"grad_norm": 100.99361157251298,
"learning_rate": 5.349887515404564e-07,
"logits/chosen": -0.8491485714912415,
"logits/rejected": -0.8752503991127014,
"logps/chosen": -3.4885029792785645,
"logps/rejected": -4.05246114730835,
"loss": 2.811,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -34.88502883911133,
"rewards/margins": 5.639582633972168,
"rewards/rejected": -40.52460479736328,
"step": 200
},
{
"epoch": 0.4546225614927905,
"grad_norm": 113.88837192083922,
"learning_rate": 5.320049157673163e-07,
"logits/chosen": -0.7907375693321228,
"logits/rejected": -0.7869551181793213,
"logps/chosen": -3.329808235168457,
"logps/rejected": -3.815051794052124,
"loss": 3.0112,
"rewards/accuracies": 0.75,
"rewards/chosen": -33.29808044433594,
"rewards/margins": 4.8524346351623535,
"rewards/rejected": -38.15052032470703,
"step": 201
},
{
"epoch": 0.4568843652813118,
"grad_norm": 96.00315980556027,
"learning_rate": 5.290128137815938e-07,
"logits/chosen": -0.8615790009498596,
"logits/rejected": -0.8816788792610168,
"logps/chosen": -3.5456151962280273,
"logps/rejected": -4.082833766937256,
"loss": 2.6221,
"rewards/accuracies": 0.765625,
"rewards/chosen": -35.456146240234375,
"rewards/margins": 5.372189044952393,
"rewards/rejected": -40.828338623046875,
"step": 202
},
{
"epoch": 0.4591461690698332,
"grad_norm": 88.74384836731605,
"learning_rate": 5.260126329501828e-07,
"logits/chosen": -0.8821161985397339,
"logits/rejected": -0.8808766603469849,
"logps/chosen": -3.4488883018493652,
"logps/rejected": -4.065739631652832,
"loss": 2.526,
"rewards/accuracies": 0.796875,
"rewards/chosen": -34.48888397216797,
"rewards/margins": 6.168512344360352,
"rewards/rejected": -40.65739440917969,
"step": 203
},
{
"epoch": 0.46140797285835455,
"grad_norm": 113.38423627891478,
"learning_rate": 5.230045611458789e-07,
"logits/chosen": -0.8067418932914734,
"logits/rejected": -0.8317432403564453,
"logps/chosen": -3.4061567783355713,
"logps/rejected": -3.852400302886963,
"loss": 3.1033,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -34.06156921386719,
"rewards/margins": 4.462434768676758,
"rewards/rejected": -38.52400207519531,
"step": 204
},
{
"epoch": 0.46366977664687586,
"grad_norm": 109.078062930805,
"learning_rate": 5.199887867356143e-07,
"logits/chosen": -0.8303195238113403,
"logits/rejected": -0.8461140394210815,
"logps/chosen": -3.646005153656006,
"logps/rejected": -4.241487979888916,
"loss": 2.5447,
"rewards/accuracies": 0.765625,
"rewards/chosen": -36.46004867553711,
"rewards/margins": 5.954832077026367,
"rewards/rejected": -42.41488265991211,
"step": 205
},
{
"epoch": 0.46593158043539723,
"grad_norm": 115.30415505519554,
"learning_rate": 5.16965498568662e-07,
"logits/chosen": -0.8711931705474854,
"logits/rejected": -0.8695412278175354,
"logps/chosen": -3.7641541957855225,
"logps/rejected": -4.470314025878906,
"loss": 2.7657,
"rewards/accuracies": 0.796875,
"rewards/chosen": -37.64154052734375,
"rewards/margins": 7.061600685119629,
"rewards/rejected": -44.70314025878906,
"step": 206
},
{
"epoch": 0.4681933842239186,
"grad_norm": 112.79356107718269,
"learning_rate": 5.139348859648098e-07,
"logits/chosen": -0.8668640851974487,
"logits/rejected": -0.8753060698509216,
"logps/chosen": -3.509500026702881,
"logps/rejected": -4.0229034423828125,
"loss": 2.9855,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -35.095001220703125,
"rewards/margins": 5.134032249450684,
"rewards/rejected": -40.229034423828125,
"step": 207
},
{
"epoch": 0.4704551880124399,
"grad_norm": 117.01804715220312,
"learning_rate": 5.10897138702506e-07,
"logits/chosen": -0.8137744665145874,
"logits/rejected": -0.838422417640686,
"logps/chosen": -3.5989084243774414,
"logps/rejected": -4.12141227722168,
"loss": 3.4055,
"rewards/accuracies": 0.71875,
"rewards/chosen": -35.98908615112305,
"rewards/margins": 5.225040912628174,
"rewards/rejected": -41.21411895751953,
"step": 208
},
{
"epoch": 0.4727169918009613,
"grad_norm": 101.49552741213645,
"learning_rate": 5.078524470069743e-07,
"logits/chosen": -0.9176779985427856,
"logits/rejected": -0.9260926246643066,
"logps/chosen": -3.756748914718628,
"logps/rejected": -4.329287052154541,
"loss": 2.4625,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -37.56748580932617,
"rewards/margins": 5.725386142730713,
"rewards/rejected": -43.29287338256836,
"step": 209
},
{
"epoch": 0.47497879558948264,
"grad_norm": 103.91381353366985,
"learning_rate": 5.048010015383021e-07,
"logits/chosen": -0.8263366222381592,
"logits/rejected": -0.8194425106048584,
"logps/chosen": -3.8313450813293457,
"logps/rejected": -4.535330772399902,
"loss": 2.4896,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -38.31344985961914,
"rewards/margins": 7.039859294891357,
"rewards/rejected": -45.35331344604492,
"step": 210
},
{
"epoch": 0.47724059937800395,
"grad_norm": 93.64016980030927,
"learning_rate": 5.01742993379502e-07,
"logits/chosen": -0.8458577990531921,
"logits/rejected": -0.868080735206604,
"logps/chosen": -3.8605237007141113,
"logps/rejected": -4.4653801918029785,
"loss": 2.6156,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -38.60523986816406,
"rewards/margins": 6.04856538772583,
"rewards/rejected": -44.653804779052734,
"step": 211
},
{
"epoch": 0.4795024031665253,
"grad_norm": 127.6076556977002,
"learning_rate": 4.986786140245446e-07,
"logits/chosen": -0.8188483715057373,
"logits/rejected": -0.826935887336731,
"logps/chosen": -3.8074846267700195,
"logps/rejected": -4.2949419021606445,
"loss": 3.2102,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -38.074851989746094,
"rewards/margins": 4.874571323394775,
"rewards/rejected": -42.94941711425781,
"step": 212
},
{
"epoch": 0.4817642069550466,
"grad_norm": 147.29228875677396,
"learning_rate": 4.956080553663687e-07,
"logits/chosen": -0.8854949474334717,
"logits/rejected": -0.8917779922485352,
"logps/chosen": -3.8558950424194336,
"logps/rejected": -4.444454193115234,
"loss": 2.994,
"rewards/accuracies": 0.796875,
"rewards/chosen": -38.55895233154297,
"rewards/margins": 5.885589122772217,
"rewards/rejected": -44.444541931152344,
"step": 213
},
{
"epoch": 0.484026010743568,
"grad_norm": 136.51813237025374,
"learning_rate": 4.925315096848636e-07,
"logits/chosen": -0.8785922527313232,
"logits/rejected": -0.8924418091773987,
"logps/chosen": -4.0408549308776855,
"logps/rejected": -4.76306676864624,
"loss": 2.8242,
"rewards/accuracies": 0.734375,
"rewards/chosen": -40.40855407714844,
"rewards/margins": 7.2221174240112305,
"rewards/rejected": -47.63066864013672,
"step": 214
},
{
"epoch": 0.48628781453208936,
"grad_norm": 110.42259187397326,
"learning_rate": 4.894491696348293e-07,
"logits/chosen": -0.8856892585754395,
"logits/rejected": -0.8893029093742371,
"logps/chosen": -3.8507235050201416,
"logps/rejected": -4.282730579376221,
"loss": 3.3235,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -38.507232666015625,
"rewards/margins": 4.320071220397949,
"rewards/rejected": -42.82730484008789,
"step": 215
},
{
"epoch": 0.48854961832061067,
"grad_norm": 112.38972321227853,
"learning_rate": 4.863612282339116e-07,
"logits/chosen": -0.817990779876709,
"logits/rejected": -0.8263007998466492,
"logps/chosen": -4.125490188598633,
"logps/rejected": -4.6461663246154785,
"loss": 3.3821,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -41.25490188598633,
"rewards/margins": 5.206766128540039,
"rewards/rejected": -46.461666107177734,
"step": 216
},
{
"epoch": 0.49081142210913203,
"grad_norm": 122.33140314915195,
"learning_rate": 4.832678788505161e-07,
"logits/chosen": -0.8691527843475342,
"logits/rejected": -0.8712851405143738,
"logps/chosen": -4.1400322914123535,
"logps/rejected": -4.675261497497559,
"loss": 3.4518,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -41.400325775146484,
"rewards/margins": 5.352287769317627,
"rewards/rejected": -46.75260925292969,
"step": 217
},
{
"epoch": 0.4930732258976534,
"grad_norm": 153.70015543824144,
"learning_rate": 4.801693151916985e-07,
"logits/chosen": -0.8753068447113037,
"logits/rejected": -0.9178647398948669,
"logps/chosen": -4.102838516235352,
"logps/rejected": -4.627809047698975,
"loss": 3.1055,
"rewards/accuracies": 0.78125,
"rewards/chosen": -41.02838134765625,
"rewards/margins": 5.249708652496338,
"rewards/rejected": -46.27809143066406,
"step": 218
},
{
"epoch": 0.4953350296861747,
"grad_norm": 110.68812495356474,
"learning_rate": 4.770657312910354e-07,
"logits/chosen": -0.8981303572654724,
"logits/rejected": -0.915514349937439,
"logps/chosen": -4.165809631347656,
"logps/rejected": -4.6610212326049805,
"loss": 3.376,
"rewards/accuracies": 0.703125,
"rewards/chosen": -41.65810012817383,
"rewards/margins": 4.952118396759033,
"rewards/rejected": -46.6102180480957,
"step": 219
},
{
"epoch": 0.4975968334746961,
"grad_norm": 116.58607540402677,
"learning_rate": 4.739573214964729e-07,
"logits/chosen": -0.8669706583023071,
"logits/rejected": -0.8741896748542786,
"logps/chosen": -3.9599337577819824,
"logps/rejected": -4.534191608428955,
"loss": 2.7624,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -39.599342346191406,
"rewards/margins": 5.742575645446777,
"rewards/rejected": -45.341915130615234,
"step": 220
},
{
"epoch": 0.49985863726321744,
"grad_norm": 138.68860730884543,
"learning_rate": 4.7084428045815733e-07,
"logits/chosen": -0.8756369948387146,
"logits/rejected": -0.8829125165939331,
"logps/chosen": -4.227509498596191,
"logps/rejected": -4.716983795166016,
"loss": 3.2761,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -42.27509307861328,
"rewards/margins": 4.894742012023926,
"rewards/rejected": -47.16983413696289,
"step": 221
},
{
"epoch": 0.5021204410517388,
"grad_norm": 184.1037285692299,
"learning_rate": 4.677268031162457e-07,
"logits/chosen": -0.896783709526062,
"logits/rejected": -0.9043738842010498,
"logps/chosen": -4.029943943023682,
"logps/rejected": -4.505390644073486,
"loss": 3.5501,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -40.299442291259766,
"rewards/margins": 4.7544660568237305,
"rewards/rejected": -45.05391311645508,
"step": 222
},
{
"epoch": 0.5043822448402601,
"grad_norm": 112.45603200436628,
"learning_rate": 4.646050846886985e-07,
"logits/chosen": -0.8041797876358032,
"logits/rejected": -0.8242363929748535,
"logps/chosen": -3.881974220275879,
"logps/rejected": -4.4934492111206055,
"loss": 2.6607,
"rewards/accuracies": 0.796875,
"rewards/chosen": -38.81974411010742,
"rewards/margins": 6.114748954772949,
"rewards/rejected": -44.93449401855469,
"step": 223
},
{
"epoch": 0.5066440486287814,
"grad_norm": 134.00364181034922,
"learning_rate": 4.6147932065905494e-07,
"logits/chosen": -0.867178738117218,
"logits/rejected": -0.8653546571731567,
"logps/chosen": -4.144719123840332,
"logps/rejected": -4.623917102813721,
"loss": 3.5008,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -41.44718933105469,
"rewards/margins": 4.791982650756836,
"rewards/rejected": -46.239173889160156,
"step": 224
},
{
"epoch": 0.5089058524173028,
"grad_norm": 114.91237964387022,
"learning_rate": 4.5834970676419214e-07,
"logits/chosen": -0.8645190000534058,
"logits/rejected": -0.8773024082183838,
"logps/chosen": -3.982334613800049,
"logps/rejected": -4.518105983734131,
"loss": 3.0231,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -39.82334899902344,
"rewards/margins": 5.357712268829346,
"rewards/rejected": -45.181060791015625,
"step": 225
},
{
"epoch": 0.5111676562058242,
"grad_norm": 154.2435864953568,
"learning_rate": 4.552164389820673e-07,
"logits/chosen": -0.7863515615463257,
"logits/rejected": -0.804935097694397,
"logps/chosen": -4.0218400955200195,
"logps/rejected": -4.741469383239746,
"loss": 2.7939,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -40.218406677246094,
"rewards/margins": 7.1962890625,
"rewards/rejected": -47.414695739746094,
"step": 226
},
{
"epoch": 0.5134294599943455,
"grad_norm": 129.512345171623,
"learning_rate": 4.5207971351944605e-07,
"logits/chosen": -0.904701828956604,
"logits/rejected": -0.9039019346237183,
"logps/chosen": -3.990769386291504,
"logps/rejected": -4.5699143409729,
"loss": 3.5426,
"rewards/accuracies": 0.71875,
"rewards/chosen": -39.90769577026367,
"rewards/margins": 5.791450500488281,
"rewards/rejected": -45.69914245605469,
"step": 227
},
{
"epoch": 0.5156912637828668,
"grad_norm": 127.45769396653864,
"learning_rate": 4.489397267996157e-07,
"logits/chosen": -0.8994483351707458,
"logits/rejected": -0.8912683725357056,
"logps/chosen": -3.946481466293335,
"logps/rejected": -4.470279693603516,
"loss": 2.9926,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -39.464813232421875,
"rewards/margins": 5.237981796264648,
"rewards/rejected": -44.702796936035156,
"step": 228
},
{
"epoch": 0.5179530675713881,
"grad_norm": 111.39180557968587,
"learning_rate": 4.45796675450085e-07,
"logits/chosen": -0.8582149744033813,
"logits/rejected": -0.8690947890281677,
"logps/chosen": -3.8885929584503174,
"logps/rejected": -4.494987964630127,
"loss": 2.8133,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -38.88593292236328,
"rewards/margins": 6.063946723937988,
"rewards/rejected": -44.94988250732422,
"step": 229
},
{
"epoch": 0.5202148713599095,
"grad_norm": 135.02910775325827,
"learning_rate": 4.4265075629027126e-07,
"logits/chosen": -0.8169862031936646,
"logits/rejected": -0.8348796963691711,
"logps/chosen": -4.078422546386719,
"logps/rejected": -4.564748764038086,
"loss": 2.8027,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -40.78423309326172,
"rewards/margins": 4.863255977630615,
"rewards/rejected": -45.64748764038086,
"step": 230
},
{
"epoch": 0.5224766751484309,
"grad_norm": 110.51829137339959,
"learning_rate": 4.3950216631917563e-07,
"logits/chosen": -0.888090193271637,
"logits/rejected": -0.9090730547904968,
"logps/chosen": -3.9436981678009033,
"logps/rejected": -4.6157708168029785,
"loss": 2.6358,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -39.43698501586914,
"rewards/margins": 6.720722198486328,
"rewards/rejected": -46.15770721435547,
"step": 231
},
{
"epoch": 0.5247384789369522,
"grad_norm": 111.61538092823348,
"learning_rate": 4.3635110270304676e-07,
"logits/chosen": -0.8641917109489441,
"logits/rejected": -0.8716497421264648,
"logps/chosen": -3.86027193069458,
"logps/rejected": -4.502015113830566,
"loss": 2.2291,
"rewards/accuracies": 0.828125,
"rewards/chosen": -38.602718353271484,
"rewards/margins": 6.4174323081970215,
"rewards/rejected": -45.02014923095703,
"step": 232
},
{
"epoch": 0.5270002827254736,
"grad_norm": 173.35008588152775,
"learning_rate": 4.331977627630339e-07,
"logits/chosen": -0.8097434043884277,
"logits/rejected": -0.8040153980255127,
"logps/chosen": -3.757528305053711,
"logps/rejected": -4.439907550811768,
"loss": 2.6864,
"rewards/accuracies": 0.796875,
"rewards/chosen": -37.57528305053711,
"rewards/margins": 6.823795318603516,
"rewards/rejected": -44.399078369140625,
"step": 233
},
{
"epoch": 0.5292620865139949,
"grad_norm": 120.90157841350384,
"learning_rate": 4.300423439628313e-07,
"logits/chosen": -0.8537578582763672,
"logits/rejected": -0.8780308365821838,
"logps/chosen": -3.8302276134490967,
"logps/rejected": -4.468556880950928,
"loss": 2.498,
"rewards/accuracies": 0.796875,
"rewards/chosen": -38.30228042602539,
"rewards/margins": 6.383289813995361,
"rewards/rejected": -44.685569763183594,
"step": 234
},
{
"epoch": 0.5315238903025162,
"grad_norm": 118.3536030698132,
"learning_rate": 4.268850438963118e-07,
"logits/chosen": -0.8823138475418091,
"logits/rejected": -0.9043455719947815,
"logps/chosen": -4.102262020111084,
"logps/rejected": -4.674637794494629,
"loss": 2.7324,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -41.02262496948242,
"rewards/margins": 5.723756313323975,
"rewards/rejected": -46.746376037597656,
"step": 235
},
{
"epoch": 0.5337856940910376,
"grad_norm": 116.93696869949373,
"learning_rate": 4.2372606027515463e-07,
"logits/chosen": -0.8339194655418396,
"logits/rejected": -0.8445159196853638,
"logps/chosen": -3.7381174564361572,
"logps/rejected": -4.291147708892822,
"loss": 2.9013,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -37.38117599487305,
"rewards/margins": 5.530303478240967,
"rewards/rejected": -42.911476135253906,
"step": 236
},
{
"epoch": 0.536047497879559,
"grad_norm": 148.0693344568736,
"learning_rate": 4.2056559091646387e-07,
"logits/chosen": -0.8702591061592102,
"logits/rejected": -0.898255467414856,
"logps/chosen": -4.004217624664307,
"logps/rejected": -4.487666130065918,
"loss": 3.3504,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -40.042179107666016,
"rewards/margins": 4.834486484527588,
"rewards/rejected": -44.87666702270508,
"step": 237
},
{
"epoch": 0.5383093016680803,
"grad_norm": 136.83734710023003,
"learning_rate": 4.1740383373038116e-07,
"logits/chosen": -0.8536058664321899,
"logits/rejected": -0.8870092630386353,
"logps/chosen": -3.817162036895752,
"logps/rejected": -4.381956100463867,
"loss": 2.9761,
"rewards/accuracies": 0.765625,
"rewards/chosen": -38.1716194152832,
"rewards/margins": 5.647944450378418,
"rewards/rejected": -43.81956481933594,
"step": 238
},
{
"epoch": 0.5405711054566016,
"grad_norm": 116.45504096009955,
"learning_rate": 4.1424098670769255e-07,
"logits/chosen": -0.9009624719619751,
"logits/rejected": -0.9262585639953613,
"logps/chosen": -3.884793281555176,
"logps/rejected": -4.33213996887207,
"loss": 3.0779,
"rewards/accuracies": 0.734375,
"rewards/chosen": -38.84792709350586,
"rewards/margins": 4.473471164703369,
"rewards/rejected": -43.3213996887207,
"step": 239
},
{
"epoch": 0.542832909245123,
"grad_norm": 114.67603070092983,
"learning_rate": 4.1107724790743007e-07,
"logits/chosen": -0.8459216356277466,
"logits/rejected": -0.8754346966743469,
"logps/chosen": -3.9210426807403564,
"logps/rejected": -4.424591064453125,
"loss": 2.7487,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -39.210426330566406,
"rewards/margins": 5.035484313964844,
"rewards/rejected": -44.245906829833984,
"step": 240
},
{
"epoch": 0.5450947130336443,
"grad_norm": 159.44800208061523,
"learning_rate": 4.0791281544446947e-07,
"logits/chosen": -0.8924515843391418,
"logits/rejected": -0.8807788491249084,
"logps/chosen": -3.9503896236419678,
"logps/rejected": -4.500914573669434,
"loss": 2.7043,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -39.50389099121094,
"rewards/margins": 5.505251884460449,
"rewards/rejected": -45.00914764404297,
"step": 241
},
{
"epoch": 0.5473565168221657,
"grad_norm": 128.53086499808066,
"learning_rate": 4.0474788747712416e-07,
"logits/chosen": -0.8996694684028625,
"logits/rejected": -0.9028959274291992,
"logps/chosen": -3.932866096496582,
"logps/rejected": -4.37875509262085,
"loss": 3.589,
"rewards/accuracies": 0.703125,
"rewards/chosen": -39.32866287231445,
"rewards/margins": 4.458887100219727,
"rewards/rejected": -43.78754806518555,
"step": 242
},
{
"epoch": 0.549618320610687,
"grad_norm": 104.57144676128537,
"learning_rate": 4.0158266219473573e-07,
"logits/chosen": -0.8719525933265686,
"logits/rejected": -0.8880172371864319,
"logps/chosen": -3.7240490913391113,
"logps/rejected": -4.320034503936768,
"loss": 2.5879,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -37.24049377441406,
"rewards/margins": 5.959850311279297,
"rewards/rejected": -43.20034408569336,
"step": 243
},
{
"epoch": 0.5518801243992084,
"grad_norm": 121.72780985174033,
"learning_rate": 3.984173378052643e-07,
"logits/chosen": -0.8488632440567017,
"logits/rejected": -0.8424826264381409,
"logps/chosen": -3.662327289581299,
"logps/rejected": -4.250753879547119,
"loss": 2.558,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -36.623268127441406,
"rewards/margins": 5.884267807006836,
"rewards/rejected": -42.507537841796875,
"step": 244
},
{
"epoch": 0.5541419281877297,
"grad_norm": 176.66908023998735,
"learning_rate": 3.9525211252287585e-07,
"logits/chosen": -0.9206802248954773,
"logits/rejected": -0.938546895980835,
"logps/chosen": -3.8519883155822754,
"logps/rejected": -4.533115386962891,
"loss": 3.0243,
"rewards/accuracies": 0.734375,
"rewards/chosen": -38.51988220214844,
"rewards/margins": 6.811273574829102,
"rewards/rejected": -45.331153869628906,
"step": 245
},
{
"epoch": 0.556403731976251,
"grad_norm": 108.79709814447062,
"learning_rate": 3.920871845555305e-07,
"logits/chosen": -0.8708853721618652,
"logits/rejected": -0.8730578422546387,
"logps/chosen": -3.832918882369995,
"logps/rejected": -4.335785865783691,
"loss": 2.5306,
"rewards/accuracies": 0.78125,
"rewards/chosen": -38.32918930053711,
"rewards/margins": 5.0286712646484375,
"rewards/rejected": -43.35785675048828,
"step": 246
},
{
"epoch": 0.5586655357647724,
"grad_norm": 127.09481932058374,
"learning_rate": 3.8892275209256984e-07,
"logits/chosen": -0.921829342842102,
"logits/rejected": -0.9111767411231995,
"logps/chosen": -3.9879932403564453,
"logps/rejected": -4.458497047424316,
"loss": 2.9165,
"rewards/accuracies": 0.765625,
"rewards/chosen": -39.87993240356445,
"rewards/margins": 4.705035209655762,
"rewards/rejected": -44.58496856689453,
"step": 247
},
{
"epoch": 0.5609273395532938,
"grad_norm": 115.9590799116695,
"learning_rate": 3.8575901329230747e-07,
"logits/chosen": -0.8582264184951782,
"logits/rejected": -0.8617616891860962,
"logps/chosen": -3.9318342208862305,
"logps/rejected": -4.470717430114746,
"loss": 3.0595,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -39.31834411621094,
"rewards/margins": 5.388828277587891,
"rewards/rejected": -44.707176208496094,
"step": 248
},
{
"epoch": 0.5631891433418151,
"grad_norm": 128.3493722347937,
"learning_rate": 3.8259616626961886e-07,
"logits/chosen": -0.8590461015701294,
"logits/rejected": -0.8726236820220947,
"logps/chosen": -3.7069010734558105,
"logps/rejected": -4.159891128540039,
"loss": 2.6582,
"rewards/accuracies": 0.8125,
"rewards/chosen": -37.06901550292969,
"rewards/margins": 4.529898643493652,
"rewards/rejected": -41.59891128540039,
"step": 249
},
{
"epoch": 0.5654509471303364,
"grad_norm": 114.11597318565974,
"learning_rate": 3.794344090835362e-07,
"logits/chosen": -0.8618912100791931,
"logits/rejected": -0.8814125061035156,
"logps/chosen": -4.0398969650268555,
"logps/rejected": -4.569504737854004,
"loss": 2.945,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -40.39897155761719,
"rewards/margins": 5.2960710525512695,
"rewards/rejected": -45.695045471191406,
"step": 250
},
{
"epoch": 0.5677127509188578,
"grad_norm": 132.84283803977144,
"learning_rate": 3.7627393972484534e-07,
"logits/chosen": -0.9538972973823547,
"logits/rejected": -0.961841881275177,
"logps/chosen": -3.9805994033813477,
"logps/rejected": -4.373291969299316,
"loss": 3.459,
"rewards/accuracies": 0.734375,
"rewards/chosen": -39.805992126464844,
"rewards/margins": 3.9269251823425293,
"rewards/rejected": -43.73291778564453,
"step": 251
},
{
"epoch": 0.5699745547073791,
"grad_norm": 116.59079088621297,
"learning_rate": 3.7311495610368823e-07,
"logits/chosen": -0.9467366933822632,
"logits/rejected": -0.9687215089797974,
"logps/chosen": -4.043094158172607,
"logps/rejected": -4.563295364379883,
"loss": 3.0119,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -40.430938720703125,
"rewards/margins": 5.202012538909912,
"rewards/rejected": -45.63295364379883,
"step": 252
},
{
"epoch": 0.5722363584959005,
"grad_norm": 120.59136322495354,
"learning_rate": 3.699576560371689e-07,
"logits/chosen": -0.8889734745025635,
"logits/rejected": -0.9066051244735718,
"logps/chosen": -4.205962181091309,
"logps/rejected": -4.996251106262207,
"loss": 2.315,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -42.05961990356445,
"rewards/margins": 7.902889251708984,
"rewards/rejected": -49.96250915527344,
"step": 253
},
{
"epoch": 0.5744981622844219,
"grad_norm": 124.59506533641044,
"learning_rate": 3.66802237236966e-07,
"logits/chosen": -0.8749493956565857,
"logits/rejected": -0.8885746002197266,
"logps/chosen": -4.300434112548828,
"logps/rejected": -4.866487503051758,
"loss": 2.9432,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -43.00434112548828,
"rewards/margins": 5.66053581237793,
"rewards/rejected": -48.664878845214844,
"step": 254
},
{
"epoch": 0.5767599660729432,
"grad_norm": 118.16852020588865,
"learning_rate": 3.636488972969532e-07,
"logits/chosen": -0.9017617702484131,
"logits/rejected": -0.9131591320037842,
"logps/chosen": -4.101204872131348,
"logps/rejected": -4.681705474853516,
"loss": 2.6388,
"rewards/accuracies": 0.765625,
"rewards/chosen": -41.012046813964844,
"rewards/margins": 5.80500602722168,
"rewards/rejected": -46.817054748535156,
"step": 255
},
{
"epoch": 0.5790217698614645,
"grad_norm": 120.89498541698326,
"learning_rate": 3.604978336808244e-07,
"logits/chosen": -0.9948743581771851,
"logits/rejected": -1.0087530612945557,
"logps/chosen": -4.0368547439575195,
"logps/rejected": -4.631007194519043,
"loss": 2.8043,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -40.36854934692383,
"rewards/margins": 5.941521644592285,
"rewards/rejected": -46.31007385253906,
"step": 256
},
{
"epoch": 0.5812835736499858,
"grad_norm": 124.83239622588137,
"learning_rate": 3.5734924370972876e-07,
"logits/chosen": -0.9356947541236877,
"logits/rejected": -0.9560631513595581,
"logps/chosen": -4.138307571411133,
"logps/rejected": -4.669035911560059,
"loss": 3.0658,
"rewards/accuracies": 0.75,
"rewards/chosen": -41.38307189941406,
"rewards/margins": 5.307290554046631,
"rewards/rejected": -46.69036102294922,
"step": 257
},
{
"epoch": 0.5835453774385072,
"grad_norm": 138.50523829837766,
"learning_rate": 3.5420332454991504e-07,
"logits/chosen": -0.8820909261703491,
"logits/rejected": -0.8897730112075806,
"logps/chosen": -4.207208633422852,
"logps/rejected": -4.809183597564697,
"loss": 2.7913,
"rewards/accuracies": 0.765625,
"rewards/chosen": -42.07209014892578,
"rewards/margins": 6.019748210906982,
"rewards/rejected": -48.091835021972656,
"step": 258
},
{
"epoch": 0.5858071812270286,
"grad_norm": 110.39897739040883,
"learning_rate": 3.510602732003843e-07,
"logits/chosen": -0.9389081597328186,
"logits/rejected": -0.9693293571472168,
"logps/chosen": -4.315252780914307,
"logps/rejected": -4.980816841125488,
"loss": 2.5721,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -43.152530670166016,
"rewards/margins": 6.655643463134766,
"rewards/rejected": -49.80816650390625,
"step": 259
},
{
"epoch": 0.5880689850155499,
"grad_norm": 125.09334065686004,
"learning_rate": 3.4792028648055396e-07,
"logits/chosen": -0.8979041576385498,
"logits/rejected": -0.9275961518287659,
"logps/chosen": -4.101990699768066,
"logps/rejected": -4.697176933288574,
"loss": 2.7902,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -41.0199089050293,
"rewards/margins": 5.951866149902344,
"rewards/rejected": -46.971771240234375,
"step": 260
},
{
"epoch": 0.5903307888040712,
"grad_norm": 117.68527045074552,
"learning_rate": 3.447835610179327e-07,
"logits/chosen": -0.8862229585647583,
"logits/rejected": -0.899125337600708,
"logps/chosen": -4.087460994720459,
"logps/rejected": -4.825685501098633,
"loss": 2.6479,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -40.874610900878906,
"rewards/margins": 7.382248878479004,
"rewards/rejected": -48.25685501098633,
"step": 261
},
{
"epoch": 0.5925925925925926,
"grad_norm": 139.3400014797519,
"learning_rate": 3.416502932358079e-07,
"logits/chosen": -0.9597766399383545,
"logits/rejected": -0.9774207472801208,
"logps/chosen": -4.404236316680908,
"logps/rejected": -4.83123254776001,
"loss": 3.1974,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -44.04236602783203,
"rewards/margins": 4.269958019256592,
"rewards/rejected": -48.31232452392578,
"step": 262
},
{
"epoch": 0.5948543963811139,
"grad_norm": 150.23635516979436,
"learning_rate": 3.385206793409451e-07,
"logits/chosen": -0.8739109039306641,
"logits/rejected": -0.8933315873146057,
"logps/chosen": -3.9273197650909424,
"logps/rejected": -4.468226432800293,
"loss": 2.7657,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -39.273197174072266,
"rewards/margins": 5.4090657234191895,
"rewards/rejected": -44.68226623535156,
"step": 263
},
{
"epoch": 0.5971162001696353,
"grad_norm": 136.57829357030124,
"learning_rate": 3.3539491531130163e-07,
"logits/chosen": -0.9004377722740173,
"logits/rejected": -0.9124334454536438,
"logps/chosen": -4.114037036895752,
"logps/rejected": -4.8097944259643555,
"loss": 2.6883,
"rewards/accuracies": 0.828125,
"rewards/chosen": -41.14036560058594,
"rewards/margins": 6.95757532119751,
"rewards/rejected": -48.09794235229492,
"step": 264
},
{
"epoch": 0.5993780039581567,
"grad_norm": 134.6857649306004,
"learning_rate": 3.3227319688375426e-07,
"logits/chosen": -0.9540138840675354,
"logits/rejected": -0.9450901746749878,
"logps/chosen": -4.0871100425720215,
"logps/rejected": -4.629415035247803,
"loss": 2.8724,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -40.8711051940918,
"rewards/margins": 5.423047065734863,
"rewards/rejected": -46.294151306152344,
"step": 265
},
{
"epoch": 0.601639807746678,
"grad_norm": 137.41581633964907,
"learning_rate": 3.291557195418427e-07,
"logits/chosen": -0.9520595073699951,
"logits/rejected": -0.9495226144790649,
"logps/chosen": -3.8760974407196045,
"logps/rejected": -4.302947044372559,
"loss": 3.2724,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -38.76097869873047,
"rewards/margins": 4.268494606018066,
"rewards/rejected": -43.02947235107422,
"step": 266
},
{
"epoch": 0.6039016115351993,
"grad_norm": 144.7995816554034,
"learning_rate": 3.260426785035272e-07,
"logits/chosen": -0.9223634004592896,
"logits/rejected": -0.9228293895721436,
"logps/chosen": -3.9227046966552734,
"logps/rejected": -4.469911098480225,
"loss": 3.3138,
"rewards/accuracies": 0.765625,
"rewards/chosen": -39.227046966552734,
"rewards/margins": 5.472067356109619,
"rewards/rejected": -44.6991081237793,
"step": 267
},
{
"epoch": 0.6061634153237206,
"grad_norm": 108.29473766754211,
"learning_rate": 3.229342687089646e-07,
"logits/chosen": -0.9119688272476196,
"logits/rejected": -0.915400505065918,
"logps/chosen": -3.8088831901550293,
"logps/rejected": -4.450516223907471,
"loss": 2.4994,
"rewards/accuracies": 0.796875,
"rewards/chosen": -38.08883285522461,
"rewards/margins": 6.416332244873047,
"rewards/rejected": -44.505165100097656,
"step": 268
},
{
"epoch": 0.608425219112242,
"grad_norm": 114.65522312241735,
"learning_rate": 3.1983068480830143e-07,
"logits/chosen": -0.9089800715446472,
"logits/rejected": -0.9124254584312439,
"logps/chosen": -3.8074193000793457,
"logps/rejected": -4.448195934295654,
"loss": 2.7265,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -38.074195861816406,
"rewards/margins": 6.407771110534668,
"rewards/rejected": -44.481964111328125,
"step": 269
},
{
"epoch": 0.6106870229007634,
"grad_norm": 114.37970020373666,
"learning_rate": 3.1673212114948387e-07,
"logits/chosen": -0.8962373733520508,
"logits/rejected": -0.8947957158088684,
"logps/chosen": -3.66055965423584,
"logps/rejected": -4.304127216339111,
"loss": 2.4988,
"rewards/accuracies": 0.796875,
"rewards/chosen": -36.60559844970703,
"rewards/margins": 6.435674667358398,
"rewards/rejected": -43.0412712097168,
"step": 270
},
{
"epoch": 0.6129488266892847,
"grad_norm": 131.22698552322544,
"learning_rate": 3.1363877176608845e-07,
"logits/chosen": -0.8634111285209656,
"logits/rejected": -0.8854045271873474,
"logps/chosen": -3.583430290222168,
"logps/rejected": -4.167083740234375,
"loss": 2.9822,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -35.83430099487305,
"rewards/margins": 5.836535930633545,
"rewards/rejected": -41.67083740234375,
"step": 271
},
{
"epoch": 0.615210630477806,
"grad_norm": 118.13378200287389,
"learning_rate": 3.1055083036517076e-07,
"logits/chosen": -0.8832507133483887,
"logits/rejected": -0.8600270748138428,
"logps/chosen": -3.605767011642456,
"logps/rejected": -4.242787837982178,
"loss": 2.5501,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -36.05766677856445,
"rewards/margins": 6.370209217071533,
"rewards/rejected": -42.427879333496094,
"step": 272
},
{
"epoch": 0.6174724342663274,
"grad_norm": 150.22493651179883,
"learning_rate": 3.074684903151364e-07,
"logits/chosen": -0.7782445549964905,
"logits/rejected": -0.7658709287643433,
"logps/chosen": -3.3246827125549316,
"logps/rejected": -3.7869513034820557,
"loss": 2.6785,
"rewards/accuracies": 0.75,
"rewards/chosen": -33.246826171875,
"rewards/margins": 4.622686386108398,
"rewards/rejected": -37.86951446533203,
"step": 273
},
{
"epoch": 0.6197342380548487,
"grad_norm": 124.8554523177313,
"learning_rate": 3.0439194463363136e-07,
"logits/chosen": -0.8569799065589905,
"logits/rejected": -0.8484780192375183,
"logps/chosen": -3.430828094482422,
"logps/rejected": -3.913545608520508,
"loss": 3.1282,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -34.30828094482422,
"rewards/margins": 4.827174186706543,
"rewards/rejected": -39.13545608520508,
"step": 274
},
{
"epoch": 0.6219960418433701,
"grad_norm": 105.37259501282527,
"learning_rate": 3.0132138597545537e-07,
"logits/chosen": -0.8960826396942139,
"logits/rejected": -0.935992956161499,
"logps/chosen": -3.6094119548797607,
"logps/rejected": -4.329087734222412,
"loss": 2.4253,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -36.094120025634766,
"rewards/margins": 7.196761608123779,
"rewards/rejected": -43.29087829589844,
"step": 275
},
{
"epoch": 0.6242578456318915,
"grad_norm": 102.48158570587563,
"learning_rate": 2.982570066204981e-07,
"logits/chosen": -0.8868385553359985,
"logits/rejected": -0.8976235389709473,
"logps/chosen": -3.52752685546875,
"logps/rejected": -4.042696952819824,
"loss": 2.8358,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -35.275264739990234,
"rewards/margins": 5.151702880859375,
"rewards/rejected": -40.426971435546875,
"step": 276
},
{
"epoch": 0.6265196494204128,
"grad_norm": 152.73891214182655,
"learning_rate": 2.951989984616979e-07,
"logits/chosen": -0.8241417407989502,
"logits/rejected": -0.8495975136756897,
"logps/chosen": -3.6896445751190186,
"logps/rejected": -4.32491397857666,
"loss": 3.0859,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -36.89644241333008,
"rewards/margins": 6.352697849273682,
"rewards/rejected": -43.249141693115234,
"step": 277
},
{
"epoch": 0.6287814532089341,
"grad_norm": 112.80204422900222,
"learning_rate": 2.9214755299302584e-07,
"logits/chosen": -0.8538424968719482,
"logits/rejected": -0.8619410395622253,
"logps/chosen": -3.8360202312469482,
"logps/rejected": -4.50319766998291,
"loss": 2.0849,
"rewards/accuracies": 0.8671875,
"rewards/chosen": -38.36020278930664,
"rewards/margins": 6.671772480010986,
"rewards/rejected": -45.03197479248047,
"step": 278
},
{
"epoch": 0.6310432569974554,
"grad_norm": 128.85380711403621,
"learning_rate": 2.89102861297494e-07,
"logits/chosen": -0.8916823863983154,
"logits/rejected": -0.9155115485191345,
"logps/chosen": -3.7095577716827393,
"logps/rejected": -4.224562168121338,
"loss": 3.36,
"rewards/accuracies": 0.75,
"rewards/chosen": -37.095577239990234,
"rewards/margins": 5.150045394897461,
"rewards/rejected": -42.24562072753906,
"step": 279
},
{
"epoch": 0.6333050607859768,
"grad_norm": 164.97262791305909,
"learning_rate": 2.860651140351902e-07,
"logits/chosen": -0.887188196182251,
"logits/rejected": -0.8843110799789429,
"logps/chosen": -3.7880616188049316,
"logps/rejected": -4.440821647644043,
"loss": 2.8817,
"rewards/accuracies": 0.765625,
"rewards/chosen": -37.880611419677734,
"rewards/margins": 6.5276007652282715,
"rewards/rejected": -44.40821838378906,
"step": 280
},
{
"epoch": 0.6355668645744982,
"grad_norm": 120.88646729377493,
"learning_rate": 2.830345014313381e-07,
"logits/chosen": -0.828898549079895,
"logits/rejected": -0.8654926419258118,
"logps/chosen": -3.848175525665283,
"logps/rejected": -4.498141288757324,
"loss": 2.3091,
"rewards/accuracies": 0.765625,
"rewards/chosen": -38.48175811767578,
"rewards/margins": 6.499655246734619,
"rewards/rejected": -44.981414794921875,
"step": 281
},
{
"epoch": 0.6378286683630195,
"grad_norm": 138.22518292554588,
"learning_rate": 2.800112132643856e-07,
"logits/chosen": -0.8729172348976135,
"logits/rejected": -0.8878234028816223,
"logps/chosen": -3.8589096069335938,
"logps/rejected": -4.514438629150391,
"loss": 2.7701,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -38.58909225463867,
"rewards/margins": 6.5552897453308105,
"rewards/rejected": -45.144378662109375,
"step": 282
},
{
"epoch": 0.6400904721515408,
"grad_norm": 123.6765508241198,
"learning_rate": 2.7699543885412105e-07,
"logits/chosen": -0.8810731768608093,
"logits/rejected": -0.8965428471565247,
"logps/chosen": -3.979841947555542,
"logps/rejected": -4.676267623901367,
"loss": 2.5296,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -39.79841995239258,
"rewards/margins": 6.96425724029541,
"rewards/rejected": -46.762672424316406,
"step": 283
},
{
"epoch": 0.6423522759400622,
"grad_norm": 134.34352209400595,
"learning_rate": 2.7398736704981725e-07,
"logits/chosen": -0.8905003070831299,
"logits/rejected": -0.8742426037788391,
"logps/chosen": -4.015549182891846,
"logps/rejected": -4.602110385894775,
"loss": 2.6926,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -40.155487060546875,
"rewards/margins": 5.865612030029297,
"rewards/rejected": -46.0211067199707,
"step": 284
},
{
"epoch": 0.6446140797285835,
"grad_norm": 125.55351312091291,
"learning_rate": 2.709871862184063e-07,
"logits/chosen": -0.8608399629592896,
"logits/rejected": -0.8779529929161072,
"logps/chosen": -3.936886787414551,
"logps/rejected": -4.538551330566406,
"loss": 3.1564,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -39.36886215209961,
"rewards/margins": 6.016650199890137,
"rewards/rejected": -45.38551712036133,
"step": 285
},
{
"epoch": 0.6468758835171049,
"grad_norm": 116.6462838115517,
"learning_rate": 2.679950842326837e-07,
"logits/chosen": -0.9049277901649475,
"logits/rejected": -0.9126715064048767,
"logps/chosen": -4.160530090332031,
"logps/rejected": -4.841786861419678,
"loss": 2.5103,
"rewards/accuracies": 0.75,
"rewards/chosen": -41.60530471801758,
"rewards/margins": 6.812563896179199,
"rewards/rejected": -48.41786193847656,
"step": 286
},
{
"epoch": 0.6491376873056263,
"grad_norm": 125.7388754846288,
"learning_rate": 2.6501124845954363e-07,
"logits/chosen": -0.8767872452735901,
"logits/rejected": -0.8922024965286255,
"logps/chosen": -4.103570461273193,
"logps/rejected": -4.829570770263672,
"loss": 2.3212,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -41.03570556640625,
"rewards/margins": 7.260003089904785,
"rewards/rejected": -48.29570388793945,
"step": 287
},
{
"epoch": 0.6513994910941476,
"grad_norm": 123.45145960508628,
"learning_rate": 2.62035865748246e-07,
"logits/chosen": -0.8526559472084045,
"logits/rejected": -0.865902304649353,
"logps/chosen": -3.979776382446289,
"logps/rejected": -4.576243877410889,
"loss": 2.8045,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -39.797767639160156,
"rewards/margins": 5.964676856994629,
"rewards/rejected": -45.76243591308594,
"step": 288
},
{
"epoch": 0.6536612948826689,
"grad_norm": 165.09853723797895,
"learning_rate": 2.5906912241871554e-07,
"logits/chosen": -0.932748019695282,
"logits/rejected": -0.9441463947296143,
"logps/chosen": -4.250375270843506,
"logps/rejected": -4.883494853973389,
"loss": 2.7687,
"rewards/accuracies": 0.765625,
"rewards/chosen": -42.503753662109375,
"rewards/margins": 6.331197738647461,
"rewards/rejected": -48.83495330810547,
"step": 289
},
{
"epoch": 0.6559230986711903,
"grad_norm": 136.63193628571327,
"learning_rate": 2.561112042498753e-07,
"logits/chosen": -0.8068567514419556,
"logits/rejected": -0.8377366065979004,
"logps/chosen": -3.9496514797210693,
"logps/rejected": -4.45440673828125,
"loss": 3.4096,
"rewards/accuracies": 0.71875,
"rewards/chosen": -39.49651336669922,
"rewards/margins": 5.047552108764648,
"rewards/rejected": -44.5440673828125,
"step": 290
},
{
"epoch": 0.6581849024597116,
"grad_norm": 143.97363440746773,
"learning_rate": 2.5316229646801195e-07,
"logits/chosen": -0.8525142073631287,
"logits/rejected": -0.8822568655014038,
"logps/chosen": -4.472620010375977,
"logps/rejected": -5.035334587097168,
"loss": 2.9599,
"rewards/accuracies": 0.765625,
"rewards/chosen": -44.726200103759766,
"rewards/margins": 5.6271467208862305,
"rewards/rejected": -50.35334777832031,
"step": 291
},
{
"epoch": 0.660446706248233,
"grad_norm": 120.93371939289545,
"learning_rate": 2.5022258373517714e-07,
"logits/chosen": -0.9202491044998169,
"logits/rejected": -0.9317676424980164,
"logps/chosen": -4.282386779785156,
"logps/rejected": -4.909029483795166,
"loss": 2.3016,
"rewards/accuracies": 0.828125,
"rewards/chosen": -42.8238639831543,
"rewards/margins": 6.266423225402832,
"rewards/rejected": -49.09029006958008,
"step": 292
},
{
"epoch": 0.6627085100367544,
"grad_norm": 149.03147323251173,
"learning_rate": 2.4729225013762474e-07,
"logits/chosen": -0.9682255387306213,
"logits/rejected": -0.9804242253303528,
"logps/chosen": -4.4975714683532715,
"logps/rejected": -5.107451438903809,
"loss": 3.3597,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -44.9757194519043,
"rewards/margins": 6.098800182342529,
"rewards/rejected": -51.07451629638672,
"step": 293
},
{
"epoch": 0.6649703138252756,
"grad_norm": 147.62664173767308,
"learning_rate": 2.4437147917428203e-07,
"logits/chosen": -0.8548184633255005,
"logits/rejected": -0.8674319386482239,
"logps/chosen": -4.390334129333496,
"logps/rejected": -5.062026500701904,
"loss": 2.7968,
"rewards/accuracies": 0.796875,
"rewards/chosen": -43.90333938598633,
"rewards/margins": 6.716926574707031,
"rewards/rejected": -50.620262145996094,
"step": 294
},
{
"epoch": 0.667232117613797,
"grad_norm": 145.09780607922434,
"learning_rate": 2.414604537452595e-07,
"logits/chosen": -0.8391546607017517,
"logits/rejected": -0.8629494309425354,
"logps/chosen": -4.255443096160889,
"logps/rejected": -4.826220512390137,
"loss": 2.7128,
"rewards/accuracies": 0.78125,
"rewards/chosen": -42.55442810058594,
"rewards/margins": 5.707772731781006,
"rewards/rejected": -48.26219940185547,
"step": 295
},
{
"epoch": 0.6694939214023183,
"grad_norm": 141.08198535612613,
"learning_rate": 2.385593561403974e-07,
"logits/chosen": -0.8808133602142334,
"logits/rejected": -0.9036346673965454,
"logps/chosen": -4.058435440063477,
"logps/rejected": -4.680113792419434,
"loss": 2.5458,
"rewards/accuracies": 0.859375,
"rewards/chosen": -40.5843505859375,
"rewards/margins": 6.216782569885254,
"rewards/rejected": -46.8011360168457,
"step": 296
},
{
"epoch": 0.6717557251908397,
"grad_norm": 108.96502235601564,
"learning_rate": 2.3566836802785119e-07,
"logits/chosen": -0.8734185099601746,
"logits/rejected": -0.910306453704834,
"logps/chosen": -4.139810562133789,
"logps/rejected": -4.895854949951172,
"loss": 2.3129,
"rewards/accuracies": 0.8125,
"rewards/chosen": -41.398101806640625,
"rewards/margins": 7.560453414916992,
"rewards/rejected": -48.95855712890625,
"step": 297
},
{
"epoch": 0.6740175289793611,
"grad_norm": 137.86729078107237,
"learning_rate": 2.327876704427146e-07,
"logits/chosen": -0.8416418433189392,
"logits/rejected": -0.8470006585121155,
"logps/chosen": -4.119014263153076,
"logps/rejected": -4.593555927276611,
"loss": 3.1621,
"rewards/accuracies": 0.734375,
"rewards/chosen": -41.19013977050781,
"rewards/margins": 4.745421886444092,
"rewards/rejected": -45.93556213378906,
"step": 298
},
{
"epoch": 0.6762793327678824,
"grad_norm": 175.89823569204168,
"learning_rate": 2.2991744377568358e-07,
"logits/chosen": -0.8492337465286255,
"logits/rejected": -0.8457680344581604,
"logps/chosen": -4.260539531707764,
"logps/rejected": -4.814329147338867,
"loss": 2.9577,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -42.60539627075195,
"rewards/margins": 5.537896633148193,
"rewards/rejected": -48.14329528808594,
"step": 299
},
{
"epoch": 0.6785411365564037,
"grad_norm": 131.9500051592224,
"learning_rate": 2.270578677617601e-07,
"logits/chosen": -0.9049394130706787,
"logits/rejected": -0.9202775955200195,
"logps/chosen": -4.153038501739502,
"logps/rejected": -4.798150062561035,
"loss": 3.2734,
"rewards/accuracies": 0.75,
"rewards/chosen": -41.53038787841797,
"rewards/margins": 6.451115608215332,
"rewards/rejected": -47.981502532958984,
"step": 300
},
{
"epoch": 0.6808029403449251,
"grad_norm": 128.15329813087357,
"learning_rate": 2.242091214689971e-07,
"logits/chosen": -0.8781294226646423,
"logits/rejected": -0.9136564135551453,
"logps/chosen": -4.225987911224365,
"logps/rejected": -4.948566436767578,
"loss": 2.5272,
"rewards/accuracies": 0.828125,
"rewards/chosen": -42.2598876953125,
"rewards/margins": 7.225780487060547,
"rewards/rejected": -49.485660552978516,
"step": 301
},
{
"epoch": 0.6830647441334464,
"grad_norm": 149.91529967533276,
"learning_rate": 2.2137138328728456e-07,
"logits/chosen": -0.9418582916259766,
"logits/rejected": -0.9293465614318848,
"logps/chosen": -4.346358299255371,
"logps/rejected": -4.898774147033691,
"loss": 2.7408,
"rewards/accuracies": 0.78125,
"rewards/chosen": -43.46357727050781,
"rewards/margins": 5.524153232574463,
"rewards/rejected": -48.987735748291016,
"step": 302
},
{
"epoch": 0.6853265479219678,
"grad_norm": 108.47737776135804,
"learning_rate": 2.1854483091717974e-07,
"logits/chosen": -0.9234378337860107,
"logits/rejected": -0.9519913792610168,
"logps/chosen": -4.19830322265625,
"logps/rejected": -4.844261646270752,
"loss": 2.266,
"rewards/accuracies": 0.78125,
"rewards/chosen": -41.9830322265625,
"rewards/margins": 6.459583759307861,
"rewards/rejected": -48.4426155090332,
"step": 303
},
{
"epoch": 0.6875883517104892,
"grad_norm": 143.85162210524913,
"learning_rate": 2.1572964135877863e-07,
"logits/chosen": -0.9188116192817688,
"logits/rejected": -0.9410698413848877,
"logps/chosen": -4.3630805015563965,
"logps/rejected": -4.898950099945068,
"loss": 3.0578,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -43.63079833984375,
"rewards/margins": 5.358698844909668,
"rewards/rejected": -48.989498138427734,
"step": 304
},
{
"epoch": 0.6898501554990104,
"grad_norm": 122.12397072006037,
"learning_rate": 2.1292599090063245e-07,
"logits/chosen": -0.9438715577125549,
"logits/rejected": -0.9488154053688049,
"logps/chosen": -4.201948165893555,
"logps/rejected": -4.896744251251221,
"loss": 2.4426,
"rewards/accuracies": 0.828125,
"rewards/chosen": -42.01948165893555,
"rewards/margins": 6.94796085357666,
"rewards/rejected": -48.96744155883789,
"step": 305
},
{
"epoch": 0.6921119592875318,
"grad_norm": 144.4237668373174,
"learning_rate": 2.1013405510870824e-07,
"logits/chosen": -0.8521759510040283,
"logits/rejected": -0.8959603905677795,
"logps/chosen": -4.268230438232422,
"logps/rejected": -4.95402193069458,
"loss": 2.2676,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -42.68229675292969,
"rewards/margins": 6.857920169830322,
"rewards/rejected": -49.540225982666016,
"step": 306
},
{
"epoch": 0.6943737630760531,
"grad_norm": 121.44280758498674,
"learning_rate": 2.0735400881539494e-07,
"logits/chosen": -0.8913055658340454,
"logits/rejected": -0.9139821529388428,
"logps/chosen": -4.439169406890869,
"logps/rejected": -5.179335117340088,
"loss": 2.3341,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -44.39169692993164,
"rewards/margins": 7.4016547203063965,
"rewards/rejected": -51.79335021972656,
"step": 307
},
{
"epoch": 0.6966355668645745,
"grad_norm": 158.3173957292418,
"learning_rate": 2.0458602610855536e-07,
"logits/chosen": -0.9496070742607117,
"logits/rejected": -0.9575868844985962,
"logps/chosen": -4.370190620422363,
"logps/rejected": -4.980884552001953,
"loss": 2.4398,
"rewards/accuracies": 0.796875,
"rewards/chosen": -43.70191192626953,
"rewards/margins": 6.106935501098633,
"rewards/rejected": -49.80884552001953,
"step": 308
},
{
"epoch": 0.6988973706530959,
"grad_norm": 148.52467272675696,
"learning_rate": 2.0183028032062422e-07,
"logits/chosen": -0.9165297746658325,
"logits/rejected": -0.9382550120353699,
"logps/chosen": -4.426529884338379,
"logps/rejected": -5.0528764724731445,
"loss": 2.7146,
"rewards/accuracies": 0.828125,
"rewards/chosen": -44.265296936035156,
"rewards/margins": 6.263469219207764,
"rewards/rejected": -50.52876663208008,
"step": 309
},
{
"epoch": 0.7011591744416172,
"grad_norm": 132.68776312794716,
"learning_rate": 1.9908694401775473e-07,
"logits/chosen": -0.9458051323890686,
"logits/rejected": -0.9692423939704895,
"logps/chosen": -4.464923858642578,
"logps/rejected": -5.0837626457214355,
"loss": 2.6034,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -44.64923858642578,
"rewards/margins": 6.188381195068359,
"rewards/rejected": -50.837623596191406,
"step": 310
},
{
"epoch": 0.7034209782301385,
"grad_norm": 136.8201982483417,
"learning_rate": 1.9635618898901196e-07,
"logits/chosen": -0.921970784664154,
"logits/rejected": -0.939640998840332,
"logps/chosen": -4.886068820953369,
"logps/rejected": -5.573887825012207,
"loss": 2.8076,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -48.860687255859375,
"rewards/margins": 6.878194332122803,
"rewards/rejected": -55.73888397216797,
"step": 311
},
{
"epoch": 0.7056827820186599,
"grad_norm": 138.79561100336207,
"learning_rate": 1.9363818623561565e-07,
"logits/chosen": -0.8815241456031799,
"logits/rejected": -0.9167051315307617,
"logps/chosen": -4.46604585647583,
"logps/rejected": -5.128448009490967,
"loss": 2.467,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -44.66046142578125,
"rewards/margins": 6.624024391174316,
"rewards/rejected": -51.284481048583984,
"step": 312
},
{
"epoch": 0.7079445858071812,
"grad_norm": 139.63987542313842,
"learning_rate": 1.9093310596023108e-07,
"logits/chosen": -0.8783115148544312,
"logits/rejected": -0.886088490486145,
"logps/chosen": -4.325229167938232,
"logps/rejected": -5.129339694976807,
"loss": 2.4526,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -43.252296447753906,
"rewards/margins": 8.04110336303711,
"rewards/rejected": -51.29339599609375,
"step": 313
},
{
"epoch": 0.7102063895957026,
"grad_norm": 158.39745233921516,
"learning_rate": 1.8824111755631274e-07,
"logits/chosen": -0.9300839900970459,
"logits/rejected": -0.9606208801269531,
"logps/chosen": -4.352430820465088,
"logps/rejected": -4.94814920425415,
"loss": 2.7746,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -43.52430725097656,
"rewards/margins": 5.957186222076416,
"rewards/rejected": -49.48149490356445,
"step": 314
},
{
"epoch": 0.712468193384224,
"grad_norm": 175.08231727150573,
"learning_rate": 1.8556238959749457e-07,
"logits/chosen": -0.9153900146484375,
"logits/rejected": -0.9284498691558838,
"logps/chosen": -4.703487396240234,
"logps/rejected": -5.153081893920898,
"loss": 3.7552,
"rewards/accuracies": 0.703125,
"rewards/chosen": -47.034873962402344,
"rewards/margins": 4.495938777923584,
"rewards/rejected": -51.53081512451172,
"step": 315
},
{
"epoch": 0.7147299971727452,
"grad_norm": 171.25427946077767,
"learning_rate": 1.8289708982703562e-07,
"logits/chosen": -0.8872180581092834,
"logits/rejected": -0.8773900866508484,
"logps/chosen": -4.5176239013671875,
"logps/rejected": -5.215326309204102,
"loss": 3.269,
"rewards/accuracies": 0.734375,
"rewards/chosen": -45.17624282836914,
"rewards/margins": 6.977020740509033,
"rewards/rejected": -52.153263092041016,
"step": 316
},
{
"epoch": 0.7169918009612666,
"grad_norm": 167.8139506813274,
"learning_rate": 1.802453851473151e-07,
"logits/chosen": -0.9402052164077759,
"logits/rejected": -0.9389104247093201,
"logps/chosen": -4.668498516082764,
"logps/rejected": -5.336842060089111,
"loss": 2.5691,
"rewards/accuracies": 0.8125,
"rewards/chosen": -46.68498229980469,
"rewards/margins": 6.683432102203369,
"rewards/rejected": -53.36841583251953,
"step": 317
},
{
"epoch": 0.719253604749788,
"grad_norm": 145.08828046519986,
"learning_rate": 1.7760744160938093e-07,
"logits/chosen": -0.8834313154220581,
"logits/rejected": -0.899104118347168,
"logps/chosen": -4.476520538330078,
"logps/rejected": -5.283236980438232,
"loss": 2.5195,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -44.76520538330078,
"rewards/margins": 8.06716251373291,
"rewards/rejected": -52.832374572753906,
"step": 318
},
{
"epoch": 0.7215154085383093,
"grad_norm": 139.91997046332745,
"learning_rate": 1.7498342440255135e-07,
"logits/chosen": -0.9341943264007568,
"logits/rejected": -0.9333917498588562,
"logps/chosen": -4.627048015594482,
"logps/rejected": -5.251206874847412,
"loss": 2.5712,
"rewards/accuracies": 0.78125,
"rewards/chosen": -46.27048110961914,
"rewards/margins": 6.241583824157715,
"rewards/rejected": -52.51206588745117,
"step": 319
},
{
"epoch": 0.7237772123268307,
"grad_norm": 128.75616766030222,
"learning_rate": 1.7237349784407115e-07,
"logits/chosen": -0.9444934725761414,
"logits/rejected": -0.9463576674461365,
"logps/chosen": -4.7167863845825195,
"logps/rejected": -5.434706687927246,
"loss": 2.4956,
"rewards/accuracies": 0.78125,
"rewards/chosen": -47.16786575317383,
"rewards/margins": 7.179207801818848,
"rewards/rejected": -54.34707260131836,
"step": 320
},
{
"epoch": 0.726039016115352,
"grad_norm": 152.32529171434254,
"learning_rate": 1.6977782536882178e-07,
"logits/chosen": -0.8644733428955078,
"logits/rejected": -0.878294825553894,
"logps/chosen": -4.275421142578125,
"logps/rejected": -5.040489196777344,
"loss": 2.7444,
"rewards/accuracies": 0.78125,
"rewards/chosen": -42.75421142578125,
"rewards/margins": 7.650677680969238,
"rewards/rejected": -50.40489196777344,
"step": 321
},
{
"epoch": 0.7283008199038733,
"grad_norm": 114.93649081424365,
"learning_rate": 1.6719656951908708e-07,
"logits/chosen": -0.8660048246383667,
"logits/rejected": -0.8882208466529846,
"logps/chosen": -4.067705154418945,
"logps/rejected": -4.7624616622924805,
"loss": 2.4388,
"rewards/accuracies": 0.8125,
"rewards/chosen": -40.67705535888672,
"rewards/margins": 6.947561264038086,
"rewards/rejected": -47.62461853027344,
"step": 322
},
{
"epoch": 0.7305626236923947,
"grad_norm": 141.6695023872139,
"learning_rate": 1.6462989193437453e-07,
"logits/chosen": -0.9560823440551758,
"logits/rejected": -0.9642462730407715,
"logps/chosen": -4.5176310539245605,
"logps/rejected": -5.131880760192871,
"loss": 2.8127,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -45.17631149291992,
"rewards/margins": 6.142499923706055,
"rewards/rejected": -51.318809509277344,
"step": 323
},
{
"epoch": 0.732824427480916,
"grad_norm": 153.0336707141252,
"learning_rate": 1.6207795334129365e-07,
"logits/chosen": -0.9089516997337341,
"logits/rejected": -0.9075677394866943,
"logps/chosen": -4.768195152282715,
"logps/rejected": -5.365372180938721,
"loss": 2.9092,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -47.68195343017578,
"rewards/margins": 5.971770286560059,
"rewards/rejected": -53.65372085571289,
"step": 324
},
{
"epoch": 0.7350862312694374,
"grad_norm": 175.02628532285675,
"learning_rate": 1.5954091354349121e-07,
"logits/chosen": -0.93093341588974,
"logits/rejected": -0.9459247589111328,
"logps/chosen": -4.557867050170898,
"logps/rejected": -5.066596508026123,
"loss": 3.4744,
"rewards/accuracies": 0.75,
"rewards/chosen": -45.57866668701172,
"rewards/margins": 5.087299346923828,
"rewards/rejected": -50.66596603393555,
"step": 325
},
{
"epoch": 0.7373480350579588,
"grad_norm": 268.3403033769808,
"learning_rate": 1.5701893141164364e-07,
"logits/chosen": -0.9369128346443176,
"logits/rejected": -0.9450178742408752,
"logps/chosen": -4.78832483291626,
"logps/rejected": -5.510087490081787,
"loss": 3.4083,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -47.88325119018555,
"rewards/margins": 7.2176289558410645,
"rewards/rejected": -55.10087966918945,
"step": 326
},
{
"epoch": 0.73960983884648,
"grad_norm": 144.09484817777712,
"learning_rate": 1.545121648735093e-07,
"logits/chosen": -0.9158852100372314,
"logits/rejected": -0.924169659614563,
"logps/chosen": -4.634927749633789,
"logps/rejected": -5.217785358428955,
"loss": 3.0842,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -46.349273681640625,
"rewards/margins": 5.828577518463135,
"rewards/rejected": -52.17784881591797,
"step": 327
},
{
"epoch": 0.7418716426350014,
"grad_norm": 134.18959418719092,
"learning_rate": 1.5202077090403863e-07,
"logits/chosen": -0.9410818815231323,
"logits/rejected": -0.9246065616607666,
"logps/chosen": -4.225207805633545,
"logps/rejected": -4.864658355712891,
"loss": 2.5935,
"rewards/accuracies": 0.8125,
"rewards/chosen": -42.2520751953125,
"rewards/margins": 6.394504547119141,
"rewards/rejected": -48.646583557128906,
"step": 328
},
{
"epoch": 0.7441334464235227,
"grad_norm": 157.68518289039383,
"learning_rate": 1.495449055155443e-07,
"logits/chosen": -0.9306075572967529,
"logits/rejected": -0.942533552646637,
"logps/chosen": -4.558164119720459,
"logps/rejected": -5.3390398025512695,
"loss": 2.4075,
"rewards/accuracies": 0.796875,
"rewards/chosen": -45.581642150878906,
"rewards/margins": 7.808758735656738,
"rewards/rejected": -53.390403747558594,
"step": 329
},
{
"epoch": 0.7463952502120441,
"grad_norm": 155.53781883249212,
"learning_rate": 1.4708472374793112e-07,
"logits/chosen": -0.9006601572036743,
"logits/rejected": -0.9074862599372864,
"logps/chosen": -4.6048903465271,
"logps/rejected": -5.098773002624512,
"loss": 3.4763,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -46.04890441894531,
"rewards/margins": 4.938818454742432,
"rewards/rejected": -50.98772430419922,
"step": 330
},
{
"epoch": 0.7486570540005655,
"grad_norm": 156.32875339642223,
"learning_rate": 1.4464037965898878e-07,
"logits/chosen": -0.8546017408370972,
"logits/rejected": -0.8647469878196716,
"logps/chosen": -4.482880115509033,
"logps/rejected": -5.083220481872559,
"loss": 2.9646,
"rewards/accuracies": 0.765625,
"rewards/chosen": -44.82880401611328,
"rewards/margins": 6.003401756286621,
"rewards/rejected": -50.83220672607422,
"step": 331
},
{
"epoch": 0.7509188577890868,
"grad_norm": 131.67726299773977,
"learning_rate": 1.4221202631474282e-07,
"logits/chosen": -0.8612452745437622,
"logits/rejected": -0.8679234385490417,
"logps/chosen": -4.424932479858398,
"logps/rejected": -5.060441970825195,
"loss": 2.7243,
"rewards/accuracies": 0.78125,
"rewards/chosen": -44.249324798583984,
"rewards/margins": 6.355095863342285,
"rewards/rejected": -50.60442352294922,
"step": 332
},
{
"epoch": 0.7531806615776081,
"grad_norm": 143.74537092522394,
"learning_rate": 1.3979981577987113e-07,
"logits/chosen": -0.9003939628601074,
"logits/rejected": -0.8934139013290405,
"logps/chosen": -4.204031467437744,
"logps/rejected": -4.884526252746582,
"loss": 2.7886,
"rewards/accuracies": 0.734375,
"rewards/chosen": -42.040313720703125,
"rewards/margins": 6.804945945739746,
"rewards/rejected": -48.84525680541992,
"step": 333
},
{
"epoch": 0.7554424653661295,
"grad_norm": 122.3096159489272,
"learning_rate": 1.374038991081807e-07,
"logits/chosen": -0.9354572892189026,
"logits/rejected": -0.9418012499809265,
"logps/chosen": -4.416646957397461,
"logps/rejected": -4.966562747955322,
"loss": 2.9822,
"rewards/accuracies": 0.734375,
"rewards/chosen": -44.16646957397461,
"rewards/margins": 5.49915885925293,
"rewards/rejected": -49.665626525878906,
"step": 334
},
{
"epoch": 0.7577042691546508,
"grad_norm": 125.3467664124325,
"learning_rate": 1.3502442633314882e-07,
"logits/chosen": -0.8854781985282898,
"logits/rejected": -0.8959544897079468,
"logps/chosen": -3.8627703189849854,
"logps/rejected": -4.476377010345459,
"loss": 2.4221,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -38.62770462036133,
"rewards/margins": 6.136070728302002,
"rewards/rejected": -44.76377487182617,
"step": 335
},
{
"epoch": 0.7599660729431722,
"grad_norm": 130.27731954527835,
"learning_rate": 1.3266154645852815e-07,
"logits/chosen": -0.8756478428840637,
"logits/rejected": -0.8797450065612793,
"logps/chosen": -4.367222785949707,
"logps/rejected": -4.935546875,
"loss": 2.7571,
"rewards/accuracies": 0.765625,
"rewards/chosen": -43.6722297668457,
"rewards/margins": 5.683239936828613,
"rewards/rejected": -49.35546875,
"step": 336
},
{
"epoch": 0.7622278767316936,
"grad_norm": 174.47595303975626,
"learning_rate": 1.303154074490152e-07,
"logits/chosen": -0.9251211881637573,
"logits/rejected": -0.9095232486724854,
"logps/chosen": -4.15577507019043,
"logps/rejected": -4.788844108581543,
"loss": 3.0771,
"rewards/accuracies": 0.71875,
"rewards/chosen": -41.55774688720703,
"rewards/margins": 6.330696105957031,
"rewards/rejected": -47.88844680786133,
"step": 337
},
{
"epoch": 0.7644896805202148,
"grad_norm": 139.20935050711142,
"learning_rate": 1.2798615622098616e-07,
"logits/chosen": -0.9291560649871826,
"logits/rejected": -0.9244073033332825,
"logps/chosen": -4.082375526428223,
"logps/rejected": -4.767295837402344,
"loss": 2.935,
"rewards/accuracies": 0.75,
"rewards/chosen": -40.823753356933594,
"rewards/margins": 6.8492045402526855,
"rewards/rejected": -47.67295837402344,
"step": 338
},
{
"epoch": 0.7667514843087362,
"grad_norm": 125.85713373053201,
"learning_rate": 1.2567393863329523e-07,
"logits/chosen": -0.9064013957977295,
"logits/rejected": -0.9375932216644287,
"logps/chosen": -4.197393417358398,
"logps/rejected": -4.932326793670654,
"loss": 2.4315,
"rewards/accuracies": 0.8125,
"rewards/chosen": -41.973934173583984,
"rewards/margins": 7.349334239959717,
"rewards/rejected": -49.323272705078125,
"step": 339
},
{
"epoch": 0.7690132880972576,
"grad_norm": 113.88331415159809,
"learning_rate": 1.233788994781423e-07,
"logits/chosen": -0.9322744607925415,
"logits/rejected": -0.9696506261825562,
"logps/chosen": -3.992047071456909,
"logps/rejected": -4.617818355560303,
"loss": 2.3658,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -39.920467376708984,
"rewards/margins": 6.257713317871094,
"rewards/rejected": -46.178184509277344,
"step": 340
},
{
"epoch": 0.7712750918857789,
"grad_norm": 120.76771779346096,
"learning_rate": 1.2110118247200468e-07,
"logits/chosen": -0.930842399597168,
"logits/rejected": -0.9461864829063416,
"logps/chosen": -4.024172782897949,
"logps/rejected": -4.681670188903809,
"loss": 2.253,
"rewards/accuracies": 0.828125,
"rewards/chosen": -40.24173355102539,
"rewards/margins": 6.574971675872803,
"rewards/rejected": -46.81669998168945,
"step": 341
},
{
"epoch": 0.7735368956743003,
"grad_norm": 136.22802399303367,
"learning_rate": 1.1884093024663933e-07,
"logits/chosen": -0.9333779811859131,
"logits/rejected": -0.9390580058097839,
"logps/chosen": -3.7476558685302734,
"logps/rejected": -4.544672012329102,
"loss": 2.7483,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -37.47655487060547,
"rewards/margins": 7.970164775848389,
"rewards/rejected": -45.446720123291016,
"step": 342
},
{
"epoch": 0.7757986994628217,
"grad_norm": 142.58791407306794,
"learning_rate": 1.1659828434014886e-07,
"logits/chosen": -0.9368746280670166,
"logits/rejected": -0.9193394780158997,
"logps/chosen": -3.7733166217803955,
"logps/rejected": -4.509119033813477,
"loss": 2.5863,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -37.73316955566406,
"rewards/margins": 7.358019828796387,
"rewards/rejected": -45.0911865234375,
"step": 343
},
{
"epoch": 0.7780605032513429,
"grad_norm": 149.16462513003228,
"learning_rate": 1.143733851881203e-07,
"logits/chosen": -0.969507098197937,
"logits/rejected": -0.9765860438346863,
"logps/chosen": -4.0373382568359375,
"logps/rejected": -4.758094310760498,
"loss": 2.5659,
"rewards/accuracies": 0.8125,
"rewards/chosen": -40.373382568359375,
"rewards/margins": 7.207555770874023,
"rewards/rejected": -47.5809440612793,
"step": 344
},
{
"epoch": 0.7803223070398643,
"grad_norm": 124.92145191514574,
"learning_rate": 1.1216637211483005e-07,
"logits/chosen": -0.9140538573265076,
"logits/rejected": -0.9331907629966736,
"logps/chosen": -3.9364168643951416,
"logps/rejected": -4.495926856994629,
"loss": 2.7829,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -39.36417007446289,
"rewards/margins": 5.595102310180664,
"rewards/rejected": -44.959266662597656,
"step": 345
},
{
"epoch": 0.7825841108283856,
"grad_norm": 139.19218112831186,
"learning_rate": 1.0997738332451936e-07,
"logits/chosen": -0.9063421487808228,
"logits/rejected": -0.9176337718963623,
"logps/chosen": -4.21934175491333,
"logps/rejected": -4.815896034240723,
"loss": 2.6989,
"rewards/accuracies": 0.765625,
"rewards/chosen": -42.19342041015625,
"rewards/margins": 5.965543270111084,
"rewards/rejected": -48.15896224975586,
"step": 346
},
{
"epoch": 0.784845914616907,
"grad_norm": 135.08565556440382,
"learning_rate": 1.0780655589274031e-07,
"logits/chosen": -0.9613451957702637,
"logits/rejected": -0.9540661573410034,
"logps/chosen": -3.9844415187835693,
"logps/rejected": -4.568908214569092,
"loss": 2.3105,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -39.844417572021484,
"rewards/margins": 5.844663619995117,
"rewards/rejected": -45.68907928466797,
"step": 347
},
{
"epoch": 0.7871077184054284,
"grad_norm": 140.13866385079996,
"learning_rate": 1.056540257577712e-07,
"logits/chosen": -0.8759354948997498,
"logits/rejected": -0.8963236808776855,
"logps/chosen": -4.536995887756348,
"logps/rejected": -5.269956111907959,
"loss": 2.1934,
"rewards/accuracies": 0.859375,
"rewards/chosen": -45.369956970214844,
"rewards/margins": 7.329606056213379,
"rewards/rejected": -52.69956970214844,
"step": 348
},
{
"epoch": 0.7893695221939496,
"grad_norm": 137.27660659020637,
"learning_rate": 1.0351992771210554e-07,
"logits/chosen": -0.9132465720176697,
"logits/rejected": -0.9206264615058899,
"logps/chosen": -4.071958541870117,
"logps/rejected": -4.736346244812012,
"loss": 2.6102,
"rewards/accuracies": 0.828125,
"rewards/chosen": -40.719581604003906,
"rewards/margins": 6.643874168395996,
"rewards/rejected": -47.36345291137695,
"step": 349
},
{
"epoch": 0.791631325982471,
"grad_norm": 142.1622013900452,
"learning_rate": 1.0140439539400953e-07,
"logits/chosen": -0.8662968277931213,
"logits/rejected": -0.9013144969940186,
"logps/chosen": -4.007244110107422,
"logps/rejected": -4.624824523925781,
"loss": 3.0147,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -40.07244110107422,
"rewards/margins": 6.175803184509277,
"rewards/rejected": -46.24824523925781,
"step": 350
},
{
"epoch": 0.7938931297709924,
"grad_norm": 129.3739896140384,
"learning_rate": 9.930756127915488e-08,
"logits/chosen": -0.9286041259765625,
"logits/rejected": -0.9355084896087646,
"logps/chosen": -4.044810771942139,
"logps/rejected": -4.674356460571289,
"loss": 2.6907,
"rewards/accuracies": 0.78125,
"rewards/chosen": -40.44810485839844,
"rewards/margins": 6.295462131500244,
"rewards/rejected": -46.743568420410156,
"step": 351
},
{
"epoch": 0.7961549335595137,
"grad_norm": 154.70829465365628,
"learning_rate": 9.722955667232242e-08,
"logits/chosen": -0.9570465683937073,
"logits/rejected": -0.9680700302124023,
"logps/chosen": -4.297163963317871,
"logps/rejected": -4.780937194824219,
"loss": 3.2322,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -42.971641540527344,
"rewards/margins": 4.837734222412109,
"rewards/rejected": -47.80936813354492,
"step": 352
},
{
"epoch": 0.7984167373480351,
"grad_norm": 140.78323504174404,
"learning_rate": 9.517051169918016e-08,
"logits/chosen": -0.9370065331459045,
"logits/rejected": -0.9510276317596436,
"logps/chosen": -4.016280651092529,
"logps/rejected": -4.564591407775879,
"loss": 3.2779,
"rewards/accuracies": 0.75,
"rewards/chosen": -40.162811279296875,
"rewards/margins": 5.483105182647705,
"rewards/rejected": -45.64591979980469,
"step": 353
},
{
"epoch": 0.8006785411365565,
"grad_norm": 142.37033106886284,
"learning_rate": 9.313055529813412e-08,
"logits/chosen": -0.8857989311218262,
"logits/rejected": -0.9076879620552063,
"logps/chosen": -4.098158836364746,
"logps/rejected": -4.799932479858398,
"loss": 2.3024,
"rewards/accuracies": 0.8125,
"rewards/chosen": -40.98158645629883,
"rewards/margins": 7.017735004425049,
"rewards/rejected": -47.99932098388672,
"step": 354
},
{
"epoch": 0.8029403449250777,
"grad_norm": 144.22040244633794,
"learning_rate": 9.110981521225532e-08,
"logits/chosen": -0.9384421706199646,
"logits/rejected": -0.9499157667160034,
"logps/chosen": -4.076770782470703,
"logps/rejected": -4.640554428100586,
"loss": 2.9348,
"rewards/accuracies": 0.765625,
"rewards/chosen": -40.76770782470703,
"rewards/margins": 5.637840747833252,
"rewards/rejected": -46.40555191040039,
"step": 355
},
{
"epoch": 0.8052021487135991,
"grad_norm": 139.7935847003589,
"learning_rate": 8.910841798127884e-08,
"logits/chosen": -0.9020113945007324,
"logits/rejected": -0.9301177263259888,
"logps/chosen": -4.16035270690918,
"logps/rejected": -4.824099063873291,
"loss": 2.5111,
"rewards/accuracies": 0.796875,
"rewards/chosen": -41.60352325439453,
"rewards/margins": 6.637460708618164,
"rewards/rejected": -48.240989685058594,
"step": 356
},
{
"epoch": 0.8074639525021204,
"grad_norm": 166.03846448720964,
"learning_rate": 8.712648893368139e-08,
"logits/chosen": -0.9206175208091736,
"logits/rejected": -0.9502934217453003,
"logps/chosen": -4.101649761199951,
"logps/rejected": -4.884461879730225,
"loss": 2.5017,
"rewards/accuracies": 0.765625,
"rewards/chosen": -41.01649475097656,
"rewards/margins": 7.828126907348633,
"rewards/rejected": -48.844627380371094,
"step": 357
},
{
"epoch": 0.8097257562906418,
"grad_norm": 117.89459898307335,
"learning_rate": 8.516415217883186e-08,
"logits/chosen": -0.9100026488304138,
"logits/rejected": -0.9124536514282227,
"logps/chosen": -4.00990629196167,
"logps/rejected": -4.74934196472168,
"loss": 2.6175,
"rewards/accuracies": 0.828125,
"rewards/chosen": -40.099063873291016,
"rewards/margins": 7.394357681274414,
"rewards/rejected": -47.4934196472168,
"step": 358
},
{
"epoch": 0.8119875600791632,
"grad_norm": 151.06740224882444,
"learning_rate": 8.32215305992209e-08,
"logits/chosen": -0.9616566896438599,
"logits/rejected": -0.9742845296859741,
"logps/chosen": -3.94974422454834,
"logps/rejected": -4.5588812828063965,
"loss": 2.8848,
"rewards/accuracies": 0.796875,
"rewards/chosen": -39.49744415283203,
"rewards/margins": 6.091368675231934,
"rewards/rejected": -45.58881759643555,
"step": 359
},
{
"epoch": 0.8142493638676844,
"grad_norm": 116.28513294682625,
"learning_rate": 8.129874584276448e-08,
"logits/chosen": -0.9059348702430725,
"logits/rejected": -0.9224525690078735,
"logps/chosen": -4.079035758972168,
"logps/rejected": -4.834011554718018,
"loss": 2.1432,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -40.79035949707031,
"rewards/margins": 7.549759864807129,
"rewards/rejected": -48.34011459350586,
"step": 360
},
{
"epoch": 0.8165111676562058,
"grad_norm": 143.4855751446256,
"learning_rate": 7.939591831518746e-08,
"logits/chosen": -0.943411648273468,
"logits/rejected": -0.9577879905700684,
"logps/chosen": -4.06253719329834,
"logps/rejected": -4.622920513153076,
"loss": 2.308,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -40.625370025634766,
"rewards/margins": 5.603834629058838,
"rewards/rejected": -46.22920608520508,
"step": 361
},
{
"epoch": 0.8187729714447272,
"grad_norm": 132.12698029254315,
"learning_rate": 7.751316717248304e-08,
"logits/chosen": -0.9082808494567871,
"logits/rejected": -0.9305973052978516,
"logps/chosen": -4.398627281188965,
"logps/rejected": -5.260112285614014,
"loss": 2.4971,
"rewards/accuracies": 0.796875,
"rewards/chosen": -43.986270904541016,
"rewards/margins": 8.614850044250488,
"rewards/rejected": -52.60112762451172,
"step": 362
},
{
"epoch": 0.8210347752332485,
"grad_norm": 158.043389904261,
"learning_rate": 7.565061031345142e-08,
"logits/chosen": -0.9185335040092468,
"logits/rejected": -0.9299246072769165,
"logps/chosen": -4.59241247177124,
"logps/rejected": -5.313713550567627,
"loss": 2.3348,
"rewards/accuracies": 0.8125,
"rewards/chosen": -45.92412185668945,
"rewards/margins": 7.213016033172607,
"rewards/rejected": -53.137142181396484,
"step": 363
},
{
"epoch": 0.8232965790217699,
"grad_norm": 137.14995559881837,
"learning_rate": 7.380836437231686e-08,
"logits/chosen": -0.9011315107345581,
"logits/rejected": -0.9010403752326965,
"logps/chosen": -4.083221435546875,
"logps/rejected": -4.828488349914551,
"loss": 2.3463,
"rewards/accuracies": 0.8125,
"rewards/chosen": -40.83221435546875,
"rewards/margins": 7.452672481536865,
"rewards/rejected": -48.284889221191406,
"step": 364
},
{
"epoch": 0.8255583828102913,
"grad_norm": 133.9434857418435,
"learning_rate": 7.198654471142371e-08,
"logits/chosen": -0.9325624704360962,
"logits/rejected": -0.9269375205039978,
"logps/chosen": -4.175022602081299,
"logps/rejected": -5.013765335083008,
"loss": 2.1937,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -41.75022506713867,
"rewards/margins": 8.387434959411621,
"rewards/rejected": -50.13766098022461,
"step": 365
},
{
"epoch": 0.8278201865988125,
"grad_norm": 140.3898107041387,
"learning_rate": 7.01852654140132e-08,
"logits/chosen": -0.954879105091095,
"logits/rejected": -0.9756340980529785,
"logps/chosen": -4.5630106925964355,
"logps/rejected": -5.309413433074951,
"loss": 2.3981,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -45.63011169433594,
"rewards/margins": 7.464024066925049,
"rewards/rejected": -53.0941276550293,
"step": 366
},
{
"epoch": 0.8300819903873339,
"grad_norm": 138.4057371685046,
"learning_rate": 6.840463927707833e-08,
"logits/chosen": -0.9294202923774719,
"logits/rejected": -0.9419483542442322,
"logps/chosen": -4.580535888671875,
"logps/rejected": -5.202334880828857,
"loss": 2.7198,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -45.80535888671875,
"rewards/margins": 6.217983245849609,
"rewards/rejected": -52.023345947265625,
"step": 367
},
{
"epoch": 0.8323437941758552,
"grad_norm": 128.96927985102926,
"learning_rate": 6.664477780430138e-08,
"logits/chosen": -0.9347717761993408,
"logits/rejected": -0.945314884185791,
"logps/chosen": -4.429632663726807,
"logps/rejected": -4.995570659637451,
"loss": 2.9752,
"rewards/accuracies": 0.765625,
"rewards/chosen": -44.29632568359375,
"rewards/margins": 5.659379959106445,
"rewards/rejected": -49.955711364746094,
"step": 368
},
{
"epoch": 0.8346055979643766,
"grad_norm": 152.34642547867944,
"learning_rate": 6.49057911990711e-08,
"logits/chosen": -0.8949201107025146,
"logits/rejected": -0.9076350927352905,
"logps/chosen": -4.397095680236816,
"logps/rejected": -4.981942176818848,
"loss": 2.9721,
"rewards/accuracies": 0.75,
"rewards/chosen": -43.970951080322266,
"rewards/margins": 5.848470687866211,
"rewards/rejected": -49.81943130493164,
"step": 369
},
{
"epoch": 0.836867401752898,
"grad_norm": 132.0391102685107,
"learning_rate": 6.318778835758189e-08,
"logits/chosen": -0.92762690782547,
"logits/rejected": -0.9354040026664734,
"logps/chosen": -4.483278274536133,
"logps/rejected": -5.1504974365234375,
"loss": 1.8653,
"rewards/accuracies": 0.8828125,
"rewards/chosen": -44.83277893066406,
"rewards/margins": 6.672185897827148,
"rewards/rejected": -51.50497055053711,
"step": 370
},
{
"epoch": 0.8391292055414192,
"grad_norm": 179.951771990511,
"learning_rate": 6.149087686201433e-08,
"logits/chosen": -0.9428873062133789,
"logits/rejected": -0.9634348154067993,
"logps/chosen": -4.341042518615723,
"logps/rejected": -4.949177265167236,
"loss": 3.3993,
"rewards/accuracies": 0.75,
"rewards/chosen": -43.41042709350586,
"rewards/margins": 6.081344127655029,
"rewards/rejected": -49.49176788330078,
"step": 371
},
{
"epoch": 0.8413910093299406,
"grad_norm": 136.21427347850883,
"learning_rate": 5.98151629737988e-08,
"logits/chosen": -0.9433773756027222,
"logits/rejected": -0.943168044090271,
"logps/chosen": -4.414024829864502,
"logps/rejected": -5.185835838317871,
"loss": 2.3556,
"rewards/accuracies": 0.796875,
"rewards/chosen": -44.14024353027344,
"rewards/margins": 7.718109607696533,
"rewards/rejected": -51.85835647583008,
"step": 372
},
{
"epoch": 0.843652813118462,
"grad_norm": 127.45396317271563,
"learning_rate": 5.816075162696097e-08,
"logits/chosen": -0.9678685069084167,
"logits/rejected": -0.9940780401229858,
"logps/chosen": -4.39580774307251,
"logps/rejected": -5.012912273406982,
"loss": 2.2762,
"rewards/accuracies": 0.8125,
"rewards/chosen": -43.958072662353516,
"rewards/margins": 6.171045303344727,
"rewards/rejected": -50.129119873046875,
"step": 373
},
{
"epoch": 0.8459146169069833,
"grad_norm": 123.86661129091185,
"learning_rate": 5.6527746421551046e-08,
"logits/chosen": -0.9064250588417053,
"logits/rejected": -0.9173108339309692,
"logps/chosen": -4.327992916107178,
"logps/rejected": -5.055395603179932,
"loss": 2.4613,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -43.279930114746094,
"rewards/margins": 7.274028778076172,
"rewards/rejected": -50.553955078125,
"step": 374
},
{
"epoch": 0.8481764206955047,
"grad_norm": 136.01804435251455,
"learning_rate": 5.4916249617156064e-08,
"logits/chosen": -0.9181968569755554,
"logits/rejected": -0.9360796213150024,
"logps/chosen": -4.141705513000488,
"logps/rejected": -4.756865978240967,
"loss": 2.774,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -41.41705322265625,
"rewards/margins": 6.151602745056152,
"rewards/rejected": -47.568660736083984,
"step": 375
},
{
"epoch": 0.8504382244840261,
"grad_norm": 135.3387610667954,
"learning_rate": 5.332636212649646e-08,
"logits/chosen": -0.8991196155548096,
"logits/rejected": -0.915702223777771,
"logps/chosen": -4.379838466644287,
"logps/rejected": -5.094522953033447,
"loss": 2.1719,
"rewards/accuracies": 0.828125,
"rewards/chosen": -43.79838562011719,
"rewards/margins": 7.146846771240234,
"rewards/rejected": -50.945228576660156,
"step": 376
},
{
"epoch": 0.8527000282725473,
"grad_norm": 164.19828475720155,
"learning_rate": 5.17581835091069e-08,
"logits/chosen": -0.9365058541297913,
"logits/rejected": -0.9663807153701782,
"logps/chosen": -4.514606475830078,
"logps/rejected": -5.128344535827637,
"loss": 3.0464,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -45.14606475830078,
"rewards/margins": 6.137386322021484,
"rewards/rejected": -51.28345489501953,
"step": 377
},
{
"epoch": 0.8549618320610687,
"grad_norm": 138.669954265479,
"learning_rate": 5.02118119651016e-08,
"logits/chosen": -0.9410414099693298,
"logits/rejected": -0.9501762390136719,
"logps/chosen": -4.367845058441162,
"logps/rejected": -5.0047101974487305,
"loss": 3.1673,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -43.67845153808594,
"rewards/margins": 6.368653297424316,
"rewards/rejected": -50.04710388183594,
"step": 378
},
{
"epoch": 0.85722363584959,
"grad_norm": 153.057632250685,
"learning_rate": 4.868734432902526e-08,
"logits/chosen": -1.0021592378616333,
"logits/rejected": -0.9952703714370728,
"logps/chosen": -4.49019718170166,
"logps/rejected": -5.29477071762085,
"loss": 3.0216,
"rewards/accuracies": 0.78125,
"rewards/chosen": -44.90196990966797,
"rewards/margins": 8.045737266540527,
"rewards/rejected": -52.94770812988281,
"step": 379
},
{
"epoch": 0.8594854396381114,
"grad_norm": 139.1146559906999,
"learning_rate": 4.7184876063789134e-08,
"logits/chosen": -0.9506573677062988,
"logits/rejected": -0.9560145139694214,
"logps/chosen": -3.926301956176758,
"logps/rejected": -4.576600074768066,
"loss": 2.563,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -39.263023376464844,
"rewards/margins": 6.5029826164245605,
"rewards/rejected": -45.7660026550293,
"step": 380
},
{
"epoch": 0.8617472434266328,
"grad_norm": 136.57207110844007,
"learning_rate": 4.570450125469314e-08,
"logits/chosen": -0.9335479140281677,
"logits/rejected": -0.9474495649337769,
"logps/chosen": -4.52652645111084,
"logps/rejected": -5.3495774269104,
"loss": 2.4878,
"rewards/accuracies": 0.828125,
"rewards/chosen": -45.2652587890625,
"rewards/margins": 8.230509757995605,
"rewards/rejected": -53.49578094482422,
"step": 381
},
{
"epoch": 0.864009047215154,
"grad_norm": 149.81502524090894,
"learning_rate": 4.424631260353378e-08,
"logits/chosen": -0.9694351553916931,
"logits/rejected": -0.9859524369239807,
"logps/chosen": -4.307926654815674,
"logps/rejected": -4.940521240234375,
"loss": 2.7104,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -43.07926940917969,
"rewards/margins": 6.325945854187012,
"rewards/rejected": -49.40521240234375,
"step": 382
},
{
"epoch": 0.8662708510036754,
"grad_norm": 121.3426641959867,
"learning_rate": 4.281040142280008e-08,
"logits/chosen": -0.9893457889556885,
"logits/rejected": -0.9991154670715332,
"logps/chosen": -4.156393527984619,
"logps/rejected": -4.968776226043701,
"loss": 1.989,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -41.56393814086914,
"rewards/margins": 8.123825073242188,
"rewards/rejected": -49.68776321411133,
"step": 383
},
{
"epoch": 0.8685326547921968,
"grad_norm": 141.4337079496108,
"learning_rate": 4.1396857629954286e-08,
"logits/chosen": -0.9534589052200317,
"logits/rejected": -0.9696213603019714,
"logps/chosen": -4.799047470092773,
"logps/rejected": -5.520049571990967,
"loss": 2.7878,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -47.99047088623047,
"rewards/margins": 7.210024833679199,
"rewards/rejected": -55.20050048828125,
"step": 384
},
{
"epoch": 0.8707944585807181,
"grad_norm": 119.73253748242631,
"learning_rate": 4.000576974180232e-08,
"logits/chosen": -0.9004536271095276,
"logits/rejected": -0.9263263940811157,
"logps/chosen": -4.2699875831604,
"logps/rejected": -4.922300338745117,
"loss": 2.9088,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -42.69987487792969,
"rewards/margins": 6.523127555847168,
"rewards/rejected": -49.22300720214844,
"step": 385
},
{
"epoch": 0.8730562623692395,
"grad_norm": 132.99002186990265,
"learning_rate": 3.8637224868950066e-08,
"logits/chosen": -0.9017341136932373,
"logits/rejected": -0.9102005958557129,
"logps/chosen": -4.248313903808594,
"logps/rejected": -4.877220630645752,
"loss": 2.8312,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -42.48313522338867,
"rewards/margins": 6.2890706062316895,
"rewards/rejected": -48.77220153808594,
"step": 386
},
{
"epoch": 0.8753180661577609,
"grad_norm": 140.2576422089798,
"learning_rate": 3.729130871034885e-08,
"logits/chosen": -0.9371786713600159,
"logits/rejected": -0.9420756101608276,
"logps/chosen": -4.37814998626709,
"logps/rejected": -5.035106182098389,
"loss": 2.8047,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -43.78150177001953,
"rewards/margins": 6.569563865661621,
"rewards/rejected": -50.3510627746582,
"step": 387
},
{
"epoch": 0.8775798699462821,
"grad_norm": 175.47044378770292,
"learning_rate": 3.596810554792888e-08,
"logits/chosen": -0.9239012598991394,
"logits/rejected": -0.9475809335708618,
"logps/chosen": -4.3749284744262695,
"logps/rejected": -5.061702251434326,
"loss": 3.1355,
"rewards/accuracies": 0.734375,
"rewards/chosen": -43.7492790222168,
"rewards/margins": 6.867737770080566,
"rewards/rejected": -50.61702346801758,
"step": 388
},
{
"epoch": 0.8798416737348035,
"grad_norm": 136.44044081625452,
"learning_rate": 3.466769824132116e-08,
"logits/chosen": -0.9199025630950928,
"logits/rejected": -0.9204123020172119,
"logps/chosen": -4.2924580574035645,
"logps/rejected": -4.982794761657715,
"loss": 2.3706,
"rewards/accuracies": 0.828125,
"rewards/chosen": -42.924583435058594,
"rewards/margins": 6.903364181518555,
"rewards/rejected": -49.82794189453125,
"step": 389
},
{
"epoch": 0.8821034775233249,
"grad_norm": 157.13088383312373,
"learning_rate": 3.339016822266925e-08,
"logits/chosen": -0.8951210975646973,
"logits/rejected": -0.9262260794639587,
"logps/chosen": -4.462003707885742,
"logps/rejected": -5.319886684417725,
"loss": 1.8385,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -44.62003707885742,
"rewards/margins": 8.57883071899414,
"rewards/rejected": -53.19886779785156,
"step": 390
},
{
"epoch": 0.8843652813118462,
"grad_norm": 145.27269622289882,
"learning_rate": 3.213559549152958e-08,
"logits/chosen": -0.9537985920906067,
"logits/rejected": -0.9690415859222412,
"logps/chosen": -4.21071195602417,
"logps/rejected": -4.958610534667969,
"loss": 2.7635,
"rewards/accuracies": 0.765625,
"rewards/chosen": -42.10711669921875,
"rewards/margins": 7.478985786437988,
"rewards/rejected": -49.58610153198242,
"step": 391
},
{
"epoch": 0.8866270851003676,
"grad_norm": 152.2072238338678,
"learning_rate": 3.090405860986203e-08,
"logits/chosen": -0.9644224643707275,
"logits/rejected": -0.9988764524459839,
"logps/chosen": -4.434269428253174,
"logps/rejected": -5.323245048522949,
"loss": 2.3393,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -44.34269332885742,
"rewards/margins": 8.889755249023438,
"rewards/rejected": -53.23244857788086,
"step": 392
},
{
"epoch": 0.8888888888888888,
"grad_norm": 133.85280588404478,
"learning_rate": 2.9695634697110315e-08,
"logits/chosen": -0.9042102694511414,
"logits/rejected": -0.9273264408111572,
"logps/chosen": -4.202421188354492,
"logps/rejected": -5.033628463745117,
"loss": 2.664,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -42.02421569824219,
"rewards/margins": 8.31207275390625,
"rewards/rejected": -50.33628463745117,
"step": 393
},
{
"epoch": 0.8911506926774102,
"grad_norm": 139.50976820633048,
"learning_rate": 2.8510399425372766e-08,
"logits/chosen": -0.9206915497779846,
"logits/rejected": -0.9092394113540649,
"logps/chosen": -4.3293986320495605,
"logps/rejected": -4.970660209655762,
"loss": 2.7218,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -43.293983459472656,
"rewards/margins": 6.412619113922119,
"rewards/rejected": -49.70660400390625,
"step": 394
},
{
"epoch": 0.8934124964659316,
"grad_norm": 142.6192511354523,
"learning_rate": 2.734842701466329e-08,
"logits/chosen": -0.9256288409233093,
"logits/rejected": -0.9244977235794067,
"logps/chosen": -4.661899566650391,
"logps/rejected": -5.342780113220215,
"loss": 2.4201,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -46.61899948120117,
"rewards/margins": 6.808799743652344,
"rewards/rejected": -53.42779541015625,
"step": 395
},
{
"epoch": 0.8956743002544529,
"grad_norm": 130.63925825670492,
"learning_rate": 2.6209790228264438e-08,
"logits/chosen": -0.9332349300384521,
"logits/rejected": -0.94581139087677,
"logps/chosen": -4.036855220794678,
"logps/rejected": -4.773642539978027,
"loss": 2.2623,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -40.368553161621094,
"rewards/margins": 7.367873191833496,
"rewards/rejected": -47.736427307128906,
"step": 396
},
{
"epoch": 0.8979361040429743,
"grad_norm": 149.5991916133793,
"learning_rate": 2.5094560368170305e-08,
"logits/chosen": -0.9196925163269043,
"logits/rejected": -0.9395575523376465,
"logps/chosen": -4.5655694007873535,
"logps/rejected": -5.217185974121094,
"loss": 2.5713,
"rewards/accuracies": 0.796875,
"rewards/chosen": -45.65568923950195,
"rewards/margins": 6.516168594360352,
"rewards/rejected": -52.17185974121094,
"step": 397
},
{
"epoch": 0.9001979078314957,
"grad_norm": 122.6578201059457,
"learning_rate": 2.4002807270621893e-08,
"logits/chosen": -0.9552274942398071,
"logits/rejected": -0.9657354950904846,
"logps/chosen": -4.322449207305908,
"logps/rejected": -4.962361812591553,
"loss": 2.6024,
"rewards/accuracies": 0.828125,
"rewards/chosen": -43.2244987487793,
"rewards/margins": 6.399123191833496,
"rewards/rejected": -49.623619079589844,
"step": 398
},
{
"epoch": 0.9024597116200169,
"grad_norm": 135.84460405011131,
"learning_rate": 2.293459930173354e-08,
"logits/chosen": -0.9458591341972351,
"logits/rejected": -0.9692145586013794,
"logps/chosen": -4.452592849731445,
"logps/rejected": -5.130153179168701,
"loss": 2.783,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -44.52592468261719,
"rewards/margins": 6.775611400604248,
"rewards/rejected": -51.30153274536133,
"step": 399
},
{
"epoch": 0.9047215154085383,
"grad_norm": 171.33604881928616,
"learning_rate": 2.189000335321256e-08,
"logits/chosen": -0.9176933765411377,
"logits/rejected": -0.9229288101196289,
"logps/chosen": -4.287893295288086,
"logps/rejected": -4.882048606872559,
"loss": 3.0622,
"rewards/accuracies": 0.78125,
"rewards/chosen": -42.87893295288086,
"rewards/margins": 5.941554069519043,
"rewards/rejected": -48.82048797607422,
"step": 400
},
{
"epoch": 0.9069833191970597,
"grad_norm": 131.77434980590826,
"learning_rate": 2.086908483816954e-08,
"logits/chosen": -0.9492596387863159,
"logits/rejected": -0.9559190273284912,
"logps/chosen": -4.549674987792969,
"logps/rejected": -5.201348304748535,
"loss": 2.4757,
"rewards/accuracies": 0.828125,
"rewards/chosen": -45.49674606323242,
"rewards/margins": 6.516733169555664,
"rewards/rejected": -52.01348114013672,
"step": 401
},
{
"epoch": 0.909245122985581,
"grad_norm": 132.06347666424753,
"learning_rate": 1.9871907687022717e-08,
"logits/chosen": -0.916560173034668,
"logits/rejected": -0.9371925592422485,
"logps/chosen": -4.1877241134643555,
"logps/rejected": -4.8005266189575195,
"loss": 2.6123,
"rewards/accuracies": 0.765625,
"rewards/chosen": -41.87724685668945,
"rewards/margins": 6.128022193908691,
"rewards/rejected": -48.005271911621094,
"step": 402
},
{
"epoch": 0.9115069267741024,
"grad_norm": 114.509047735518,
"learning_rate": 1.889853434349451e-08,
"logits/chosen": -0.9288345575332642,
"logits/rejected": -0.9471941590309143,
"logps/chosen": -4.192251682281494,
"logps/rejected": -4.938650131225586,
"loss": 2.434,
"rewards/accuracies": 0.78125,
"rewards/chosen": -41.922515869140625,
"rewards/margins": 7.4639787673950195,
"rewards/rejected": -49.386497497558594,
"step": 403
},
{
"epoch": 0.9137687305626236,
"grad_norm": 139.51897990590004,
"learning_rate": 1.7949025760701164e-08,
"logits/chosen": -0.9225287437438965,
"logits/rejected": -0.9274791479110718,
"logps/chosen": -4.604381561279297,
"logps/rejected": -5.195613861083984,
"loss": 2.6384,
"rewards/accuracies": 0.828125,
"rewards/chosen": -46.043819427490234,
"rewards/margins": 5.912320137023926,
"rewards/rejected": -51.956138610839844,
"step": 404
},
{
"epoch": 0.916030534351145,
"grad_norm": 128.19324678370154,
"learning_rate": 1.7023441397336023e-08,
"logits/chosen": -0.9489941596984863,
"logits/rejected": -0.9579771757125854,
"logps/chosen": -4.172736167907715,
"logps/rejected": -4.906558513641357,
"loss": 2.4065,
"rewards/accuracies": 0.8125,
"rewards/chosen": -41.727359771728516,
"rewards/margins": 7.338226318359375,
"rewards/rejected": -49.065582275390625,
"step": 405
},
{
"epoch": 0.9182923381396664,
"grad_norm": 127.71243008762988,
"learning_rate": 1.6121839213945854e-08,
"logits/chosen": -0.9154041409492493,
"logits/rejected": -0.9540258049964905,
"logps/chosen": -4.24996280670166,
"logps/rejected": -5.038878440856934,
"loss": 2.6564,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -42.499629974365234,
"rewards/margins": 7.889162063598633,
"rewards/rejected": -50.3887939453125,
"step": 406
},
{
"epoch": 0.9205541419281877,
"grad_norm": 146.10675693844263,
"learning_rate": 1.5244275669301777e-08,
"logits/chosen": -0.955981969833374,
"logits/rejected": -0.9593254923820496,
"logps/chosen": -4.389744758605957,
"logps/rejected": -5.067453384399414,
"loss": 2.8744,
"rewards/accuracies": 0.78125,
"rewards/chosen": -43.8974494934082,
"rewards/margins": 6.777082920074463,
"rewards/rejected": -50.67453384399414,
"step": 407
},
{
"epoch": 0.9228159457167091,
"grad_norm": 132.95610789058466,
"learning_rate": 1.4390805716863398e-08,
"logits/chosen": -0.9074594378471375,
"logits/rejected": -0.9208613634109497,
"logps/chosen": -4.289839744567871,
"logps/rejected": -4.873122692108154,
"loss": 3.0576,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -42.898399353027344,
"rewards/margins": 5.832827091217041,
"rewards/rejected": -48.73122787475586,
"step": 408
},
{
"epoch": 0.9250777495052305,
"grad_norm": 141.9755163132043,
"learning_rate": 1.3561482801337908e-08,
"logits/chosen": -0.9116663336753845,
"logits/rejected": -0.9385542273521423,
"logps/chosen": -4.232028007507324,
"logps/rejected": -4.991069793701172,
"loss": 2.9291,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -42.320281982421875,
"rewards/margins": 7.590411186218262,
"rewards/rejected": -49.91069412231445,
"step": 409
},
{
"epoch": 0.9273395532937517,
"grad_norm": 133.18959451570092,
"learning_rate": 1.2756358855332904e-08,
"logits/chosen": -0.9445152282714844,
"logits/rejected": -0.9605578184127808,
"logps/chosen": -4.202373504638672,
"logps/rejected": -4.8074846267700195,
"loss": 3.1204,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -42.02373504638672,
"rewards/margins": 6.051117897033691,
"rewards/rejected": -48.074851989746094,
"step": 410
},
{
"epoch": 0.9296013570822731,
"grad_norm": 136.12472128004111,
"learning_rate": 1.1975484296105154e-08,
"logits/chosen": -0.9164653420448303,
"logits/rejected": -0.9311988353729248,
"logps/chosen": -4.378890037536621,
"logps/rejected": -5.060423851013184,
"loss": 2.8484,
"rewards/accuracies": 0.78125,
"rewards/chosen": -43.78889846801758,
"rewards/margins": 6.815339088439941,
"rewards/rejected": -50.6042366027832,
"step": 411
},
{
"epoch": 0.9318631608707945,
"grad_norm": 137.33443947595077,
"learning_rate": 1.1218908022402374e-08,
"logits/chosen": -0.9297804832458496,
"logits/rejected": -0.9439125061035156,
"logps/chosen": -4.097784996032715,
"logps/rejected": -4.829689025878906,
"loss": 2.4591,
"rewards/accuracies": 0.796875,
"rewards/chosen": -40.977848052978516,
"rewards/margins": 7.319035530090332,
"rewards/rejected": -48.2968864440918,
"step": 412
},
{
"epoch": 0.9341249646593158,
"grad_norm": 145.19557689058706,
"learning_rate": 1.0486677411402079e-08,
"logits/chosen": -0.9909257888793945,
"logits/rejected": -0.9965202212333679,
"logps/chosen": -4.445742607116699,
"logps/rejected": -5.312036037445068,
"loss": 2.6238,
"rewards/accuracies": 0.796875,
"rewards/chosen": -44.45742416381836,
"rewards/margins": 8.66294002532959,
"rewards/rejected": -53.120365142822266,
"step": 413
},
{
"epoch": 0.9363867684478372,
"grad_norm": 135.59114017765043,
"learning_rate": 9.778838315744353e-09,
"logits/chosen": -0.9647377133369446,
"logits/rejected": -0.9831647872924805,
"logps/chosen": -4.492733478546143,
"logps/rejected": -5.175955295562744,
"loss": 2.6291,
"rewards/accuracies": 0.8125,
"rewards/chosen": -44.927330017089844,
"rewards/margins": 6.832226753234863,
"rewards/rejected": -51.75955581665039,
"step": 414
},
{
"epoch": 0.9386485722363584,
"grad_norm": 145.5793767400526,
"learning_rate": 9.095435060660595e-09,
"logits/chosen": -0.9024043679237366,
"logits/rejected": -0.917569100856781,
"logps/chosen": -4.358269691467285,
"logps/rejected": -5.034271717071533,
"loss": 2.8645,
"rewards/accuracies": 0.796875,
"rewards/chosen": -43.58269500732422,
"rewards/margins": 6.760016441345215,
"rewards/rejected": -50.34271240234375,
"step": 415
},
{
"epoch": 0.9409103760248798,
"grad_norm": 162.49589370243754,
"learning_rate": 8.436510441197864e-09,
"logits/chosen": -0.9422574043273926,
"logits/rejected": -0.9609728455543518,
"logps/chosen": -4.340670585632324,
"logps/rejected": -5.023505210876465,
"loss": 2.9033,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -43.406707763671875,
"rewards/margins": 6.82834529876709,
"rewards/rejected": -50.23505401611328,
"step": 416
},
{
"epoch": 0.9431721798134012,
"grad_norm": 224.81877042117685,
"learning_rate": 7.802105719539076e-09,
"logits/chosen": -0.9420458078384399,
"logits/rejected": -0.9551193118095398,
"logps/chosen": -4.549409866333008,
"logps/rejected": -5.181853771209717,
"loss": 3.3733,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -45.49409866333008,
"rewards/margins": 6.324440002441406,
"rewards/rejected": -51.818538665771484,
"step": 417
},
{
"epoch": 0.9454339836019225,
"grad_norm": 131.4283806000118,
"learning_rate": 7.1922606224192e-09,
"logits/chosen": -0.9589974880218506,
"logits/rejected": -0.9696003198623657,
"logps/chosen": -4.475660800933838,
"logps/rejected": -5.16171407699585,
"loss": 2.5232,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -44.756614685058594,
"rewards/margins": 6.860527992248535,
"rewards/rejected": -51.61713790893555,
"step": 418
},
{
"epoch": 0.9476957873904439,
"grad_norm": 148.50211503722525,
"learning_rate": 6.6070133386372906e-09,
"logits/chosen": -0.9348170161247253,
"logits/rejected": -0.9493433237075806,
"logps/chosen": -4.343303680419922,
"logps/rejected": -4.941909313201904,
"loss": 3.0315,
"rewards/accuracies": 0.78125,
"rewards/chosen": -43.43303680419922,
"rewards/margins": 5.9860520362854,
"rewards/rejected": -49.419090270996094,
"step": 419
},
{
"epoch": 0.9499575911789653,
"grad_norm": 129.14740181782477,
"learning_rate": 6.046400516665384e-09,
"logits/chosen": -0.957095742225647,
"logits/rejected": -0.9551052451133728,
"logps/chosen": -4.2565999031066895,
"logps/rejected": -4.9523539543151855,
"loss": 3.1614,
"rewards/accuracies": 0.734375,
"rewards/chosen": -42.566001892089844,
"rewards/margins": 6.9575371742248535,
"rewards/rejected": -49.523536682128906,
"step": 420
},
{
"epoch": 0.9522193949674865,
"grad_norm": 122.48845625064148,
"learning_rate": 5.510457262353396e-09,
"logits/chosen": -0.9842012524604797,
"logits/rejected": -1.0115524530410767,
"logps/chosen": -4.26042366027832,
"logps/rejected": -4.900167942047119,
"loss": 2.4057,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -42.60423278808594,
"rewards/margins": 6.397446155548096,
"rewards/rejected": -49.00168228149414,
"step": 421
},
{
"epoch": 0.9544811987560079,
"grad_norm": 130.67374788625062,
"learning_rate": 4.9992171367309265e-09,
"logits/chosen": -0.9512357711791992,
"logits/rejected": -0.9497030377388,
"logps/chosen": -4.133967399597168,
"logps/rejected": -4.79262638092041,
"loss": 2.3627,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -41.33967208862305,
"rewards/margins": 6.586594581604004,
"rewards/rejected": -47.92626953125,
"step": 422
},
{
"epoch": 0.9567430025445293,
"grad_norm": 170.07916949000563,
"learning_rate": 4.5127121539052955e-09,
"logits/chosen": -0.9652352333068848,
"logits/rejected": -0.9730237722396851,
"logps/chosen": -4.610488414764404,
"logps/rejected": -5.30706262588501,
"loss": 2.5704,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -46.10488510131836,
"rewards/margins": 6.9657416343688965,
"rewards/rejected": -53.07062911987305,
"step": 423
},
{
"epoch": 0.9590048063330506,
"grad_norm": 148.77436953597567,
"learning_rate": 4.050972779057327e-09,
"logits/chosen": -0.8603891730308533,
"logits/rejected": -0.883198618888855,
"logps/chosen": -4.0621185302734375,
"logps/rejected": -4.761756896972656,
"loss": 2.5931,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -40.62118911743164,
"rewards/margins": 6.996386528015137,
"rewards/rejected": -47.61757278442383,
"step": 424
},
{
"epoch": 0.961266610121572,
"grad_norm": 150.51477842196144,
"learning_rate": 3.6140279265330477e-09,
"logits/chosen": -0.9070014357566833,
"logits/rejected": -0.9290311336517334,
"logps/chosen": -4.51793909072876,
"logps/rejected": -5.157177448272705,
"loss": 2.8884,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -45.17938995361328,
"rewards/margins": 6.392387390136719,
"rewards/rejected": -51.57177734375,
"step": 425
},
{
"epoch": 0.9635284139100933,
"grad_norm": 136.67894069927104,
"learning_rate": 3.2019049580335853e-09,
"logits/chosen": -0.9470658898353577,
"logits/rejected": -0.9471170902252197,
"logps/chosen": -4.178645133972168,
"logps/rejected": -4.754918098449707,
"loss": 3.3311,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -41.78644943237305,
"rewards/margins": 5.762726783752441,
"rewards/rejected": -47.54917907714844,
"step": 426
},
{
"epoch": 0.9657902176986146,
"grad_norm": 118.06690739543247,
"learning_rate": 2.814629680901337e-09,
"logits/chosen": -0.9594217538833618,
"logits/rejected": -0.9807270169258118,
"logps/chosen": -4.4059553146362305,
"logps/rejected": -5.0466628074646,
"loss": 2.3717,
"rewards/accuracies": 0.8125,
"rewards/chosen": -44.05955505371094,
"rewards/margins": 6.407071113586426,
"rewards/rejected": -50.46662902832031,
"step": 427
},
{
"epoch": 0.968052021487136,
"grad_norm": 150.5152498317493,
"learning_rate": 2.4522263465041937e-09,
"logits/chosen": -0.9186062812805176,
"logits/rejected": -0.9451611042022705,
"logps/chosen": -4.349206447601318,
"logps/rejected": -5.054396629333496,
"loss": 2.2936,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -43.49205780029297,
"rewards/margins": 7.051908016204834,
"rewards/rejected": -50.54396438598633,
"step": 428
},
{
"epoch": 0.9703138252756573,
"grad_norm": 135.25076850145902,
"learning_rate": 2.114717648716713e-09,
"logits/chosen": -0.8935461044311523,
"logits/rejected": -0.9095126986503601,
"logps/chosen": -4.378687381744385,
"logps/rejected": -5.177103042602539,
"loss": 2.34,
"rewards/accuracies": 0.8359375,
"rewards/chosen": -43.786869049072266,
"rewards/margins": 7.984163761138916,
"rewards/rejected": -51.771034240722656,
"step": 429
},
{
"epoch": 0.9725756290641787,
"grad_norm": 152.01906534523854,
"learning_rate": 1.802124722499121e-09,
"logits/chosen": -0.9317042231559753,
"logits/rejected": -0.9441834688186646,
"logps/chosen": -4.452950477600098,
"logps/rejected": -5.174241065979004,
"loss": 2.7273,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -44.529502868652344,
"rewards/margins": 7.2129082679748535,
"rewards/rejected": -51.742408752441406,
"step": 430
},
{
"epoch": 0.9748374328527001,
"grad_norm": 129.07592379619018,
"learning_rate": 1.5144671425737499e-09,
"logits/chosen": -0.9220924377441406,
"logits/rejected": -0.9323858022689819,
"logps/chosen": -4.173183441162109,
"logps/rejected": -4.836048603057861,
"loss": 2.9168,
"rewards/accuracies": 0.78125,
"rewards/chosen": -41.731834411621094,
"rewards/margins": 6.628646373748779,
"rewards/rejected": -48.36048126220703,
"step": 431
},
{
"epoch": 0.9770992366412213,
"grad_norm": 122.42308357023245,
"learning_rate": 1.251762922199484e-09,
"logits/chosen": -0.8762988448143005,
"logits/rejected": -0.9018377065658569,
"logps/chosen": -4.394649982452393,
"logps/rejected": -5.167266845703125,
"loss": 2.0131,
"rewards/accuracies": 0.859375,
"rewards/chosen": -43.94649887084961,
"rewards/margins": 7.726165771484375,
"rewards/rejected": -51.67266845703125,
"step": 432
},
{
"epoch": 0.9793610404297427,
"grad_norm": 144.42537157796136,
"learning_rate": 1.0140285120433744e-09,
"logits/chosen": -0.9518988132476807,
"logits/rejected": -0.9752581715583801,
"logps/chosen": -4.419306755065918,
"logps/rejected": -5.068571090698242,
"loss": 2.8378,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -44.19306564331055,
"rewards/margins": 6.492642402648926,
"rewards/rejected": -50.685707092285156,
"step": 433
},
{
"epoch": 0.9816228442182641,
"grad_norm": 144.35990425477752,
"learning_rate": 8.012787991508396e-10,
"logits/chosen": -0.9084798693656921,
"logits/rejected": -0.9402381181716919,
"logps/chosen": -4.297061920166016,
"logps/rejected": -5.11636209487915,
"loss": 2.7992,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -42.97062301635742,
"rewards/margins": 8.192997932434082,
"rewards/rejected": -51.16362380981445,
"step": 434
},
{
"epoch": 0.9838846480067854,
"grad_norm": 134.7783472158385,
"learning_rate": 6.135271060133007e-10,
"logits/chosen": -0.8788937330245972,
"logits/rejected": -0.8922220468521118,
"logps/chosen": -4.292323112487793,
"logps/rejected": -4.958512783050537,
"loss": 2.7245,
"rewards/accuracies": 0.765625,
"rewards/chosen": -42.9232292175293,
"rewards/margins": 6.6618971824646,
"rewards/rejected": -49.585121154785156,
"step": 435
},
{
"epoch": 0.9861464517953068,
"grad_norm": 131.05692298332755,
"learning_rate": 4.50785189733871e-10,
"logits/chosen": -0.8994375467300415,
"logits/rejected": -0.9363196492195129,
"logps/chosen": -4.137233257293701,
"logps/rejected": -4.894649982452393,
"loss": 2.0871,
"rewards/accuracies": 0.8515625,
"rewards/chosen": -41.372337341308594,
"rewards/margins": 7.574166297912598,
"rewards/rejected": -48.946495056152344,
"step": 436
},
{
"epoch": 0.988408255583828,
"grad_norm": 141.9667964600976,
"learning_rate": 3.1306324129118935e-10,
"logits/chosen": -0.9034903049468994,
"logits/rejected": -0.9206139445304871,
"logps/chosen": -4.409141540527344,
"logps/rejected": -5.078451156616211,
"loss": 2.5653,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -44.09141540527344,
"rewards/margins": 6.693098068237305,
"rewards/rejected": -50.784515380859375,
"step": 437
},
{
"epoch": 0.9906700593723494,
"grad_norm": 149.43690667028594,
"learning_rate": 2.003698849011748e-10,
"logits/chosen": -0.9702510237693787,
"logits/rejected": -0.992080807685852,
"logps/chosen": -4.582041263580322,
"logps/rejected": -5.16063928604126,
"loss": 2.9004,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -45.82041931152344,
"rewards/margins": 5.78598165512085,
"rewards/rejected": -51.60639953613281,
"step": 438
},
{
"epoch": 0.9929318631608708,
"grad_norm": 151.45409397929544,
"learning_rate": 1.1271217747714779e-10,
"logits/chosen": -0.9387862086296082,
"logits/rejected": -0.9742698669433594,
"logps/chosen": -4.416835784912109,
"logps/rejected": -5.033830165863037,
"loss": 2.8039,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -44.168357849121094,
"rewards/margins": 6.169943809509277,
"rewards/rejected": -50.33830642700195,
"step": 439
},
{
"epoch": 0.9951936669493922,
"grad_norm": 117.95433018808065,
"learning_rate": 5.0095608187739055e-11,
"logits/chosen": -0.9022542238235474,
"logits/rejected": -0.921513557434082,
"logps/chosen": -4.125314712524414,
"logps/rejected": -4.81278133392334,
"loss": 2.5334,
"rewards/accuracies": 0.765625,
"rewards/chosen": -41.25314712524414,
"rewards/margins": 6.874664306640625,
"rewards/rejected": -48.127811431884766,
"step": 440
},
{
"epoch": 0.9974554707379135,
"grad_norm": 150.85789440344882,
"learning_rate": 1.2524098113209092e-11,
"logits/chosen": -0.9529531002044678,
"logits/rejected": -0.9607404470443726,
"logps/chosen": -4.348971366882324,
"logps/rejected": -4.960037708282471,
"loss": 3.1636,
"rewards/accuracies": 0.796875,
"rewards/chosen": -43.48971176147461,
"rewards/margins": 6.110668182373047,
"rewards/rejected": -49.600379943847656,
"step": 441
},
{
"epoch": 0.9997172745264349,
"grad_norm": 133.66353463529444,
"learning_rate": 0.0,
"logits/chosen": -0.9448285698890686,
"logits/rejected": -0.951061487197876,
"logps/chosen": -4.208832263946533,
"logps/rejected": -4.896453857421875,
"loss": 2.7111,
"rewards/accuracies": 0.78125,
"rewards/chosen": -42.088321685791016,
"rewards/margins": 6.876214027404785,
"rewards/rejected": -48.96453857421875,
"step": 442
},
{
"epoch": 0.9997172745264349,
"eval_logits/chosen": -0.9250581860542297,
"eval_logits/rejected": -0.9405222535133362,
"eval_logps/chosen": -4.356727600097656,
"eval_logps/rejected": -5.03577995300293,
"eval_loss": 2.5820231437683105,
"eval_rewards/accuracies": 0.7914438843727112,
"eval_rewards/chosen": -43.567283630371094,
"eval_rewards/margins": 6.790517330169678,
"eval_rewards/rejected": -50.35779571533203,
"eval_runtime": 64.9654,
"eval_samples_per_second": 45.855,
"eval_steps_per_second": 2.878,
"step": 442
},
{
"epoch": 0.9997172745264349,
"step": 442,
"total_flos": 134366991482880.0,
"train_loss": 3.371998559026157,
"train_runtime": 3776.6556,
"train_samples_per_second": 14.984,
"train_steps_per_second": 0.117
}
],
"logging_steps": 1.0,
"max_steps": 442,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 134366991482880.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}