law_dpo3 / trainer_state.json
zzunyang's picture
Upload folder using huggingface_hub
2062b25 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.491866769945778,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.061967467079783116,
"grad_norm": 0.6070870757102966,
"learning_rate": 4e-05,
"logits/chosen": -2.0001754760742188,
"logits/rejected": -1.449440598487854,
"logps/chosen": -374.65521240234375,
"logps/rejected": -215.3085479736328,
"loss": 1.007,
"rewards/accuracies": 0.44062501192092896,
"rewards/chosen": -0.3046182096004486,
"rewards/margins": -0.20184263586997986,
"rewards/rejected": -0.10277555137872696,
"step": 20
},
{
"epoch": 0.12393493415956623,
"grad_norm": 0.5136411190032959,
"learning_rate": 8e-05,
"logits/chosen": -2.083824872970581,
"logits/rejected": -1.584017038345337,
"logps/chosen": -341.329833984375,
"logps/rejected": -208.3067169189453,
"loss": 0.1907,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 2.0636544227600098,
"rewards/margins": 2.9626474380493164,
"rewards/rejected": -0.8989933133125305,
"step": 40
},
{
"epoch": 0.18590240123934934,
"grad_norm": 0.18788862228393555,
"learning_rate": 0.00012,
"logits/chosen": -2.0708529949188232,
"logits/rejected": -1.5524569749832153,
"logps/chosen": -329.73193359375,
"logps/rejected": -221.080078125,
"loss": 0.0732,
"rewards/accuracies": 0.984375,
"rewards/chosen": 2.1646170616149902,
"rewards/margins": 4.800443649291992,
"rewards/rejected": -2.635826826095581,
"step": 60
},
{
"epoch": 0.24786986831913246,
"grad_norm": 0.2149907350540161,
"learning_rate": 0.00016,
"logits/chosen": -1.964525580406189,
"logits/rejected": -1.425443172454834,
"logps/chosen": -337.01165771484375,
"logps/rejected": -236.92935180664062,
"loss": 0.0384,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": 1.6233104467391968,
"rewards/margins": 6.25473690032959,
"rewards/rejected": -4.631426811218262,
"step": 80
},
{
"epoch": 0.30983733539891556,
"grad_norm": 0.13132674992084503,
"learning_rate": 0.0002,
"logits/chosen": -1.8194172382354736,
"logits/rejected": -1.3340699672698975,
"logps/chosen": -329.0172424316406,
"logps/rejected": -260.6822814941406,
"loss": 0.024,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": 0.2860015332698822,
"rewards/margins": 7.288111686706543,
"rewards/rejected": -7.002110958099365,
"step": 100
},
{
"epoch": 0.3718048024786987,
"grad_norm": 0.06768889725208282,
"learning_rate": 0.00019999177886783194,
"logits/chosen": -1.818981409072876,
"logits/rejected": -1.3484697341918945,
"logps/chosen": -359.87005615234375,
"logps/rejected": -294.05047607421875,
"loss": 0.021,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": 0.2960149049758911,
"rewards/margins": 8.185277938842773,
"rewards/rejected": -7.889264106750488,
"step": 120
},
{
"epoch": 0.4337722695584818,
"grad_norm": 0.00373012013733387,
"learning_rate": 0.000199967116823068,
"logits/chosen": -1.747314453125,
"logits/rejected": -1.209826946258545,
"logps/chosen": -356.72686767578125,
"logps/rejected": -287.92205810546875,
"loss": 0.012,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20157980918884277,
"rewards/margins": 8.92736530303955,
"rewards/rejected": -8.725785255432129,
"step": 140
},
{
"epoch": 0.4957397366382649,
"grad_norm": 0.08832018822431564,
"learning_rate": 0.00019992601792070679,
"logits/chosen": -1.760593056678772,
"logits/rejected": -1.227081060409546,
"logps/chosen": -359.7059326171875,
"logps/rejected": -307.3652648925781,
"loss": 0.0121,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.5851167440414429,
"rewards/margins": 9.88296890258789,
"rewards/rejected": -10.468085289001465,
"step": 160
},
{
"epoch": 0.557707203718048,
"grad_norm": 0.12635135650634766,
"learning_rate": 0.00019986848891833845,
"logits/chosen": -1.6951453685760498,
"logits/rejected": -1.1247837543487549,
"logps/chosen": -369.36383056640625,
"logps/rejected": -313.21380615234375,
"loss": 0.0159,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -2.165026903152466,
"rewards/margins": 9.382209777832031,
"rewards/rejected": -11.547235488891602,
"step": 180
},
{
"epoch": 0.6196746707978311,
"grad_norm": 0.5119428038597107,
"learning_rate": 0.00019979453927503364,
"logits/chosen": -1.5557712316513062,
"logits/rejected": -0.9883753657341003,
"logps/chosen": -378.3529357910156,
"logps/rejected": -338.2301330566406,
"loss": 0.0109,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -1.9073299169540405,
"rewards/margins": 10.27137565612793,
"rewards/rejected": -12.178706169128418,
"step": 200
},
{
"epoch": 0.6816421378776143,
"grad_norm": 0.012499742209911346,
"learning_rate": 0.0001997041811497882,
"logits/chosen": -1.639301061630249,
"logits/rejected": -1.059734582901001,
"logps/chosen": -403.56439208984375,
"logps/rejected": -362.4933776855469,
"loss": 0.0113,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.50722599029541,
"rewards/margins": 11.781638145446777,
"rewards/rejected": -16.288862228393555,
"step": 220
},
{
"epoch": 0.7436096049573974,
"grad_norm": 0.015822602435946465,
"learning_rate": 0.00019959742939952392,
"logits/chosen": -1.801640510559082,
"logits/rejected": -1.2558636665344238,
"logps/chosen": -358.8158264160156,
"logps/rejected": -329.281494140625,
"loss": 0.0085,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -1.591296911239624,
"rewards/margins": 11.404090881347656,
"rewards/rejected": -12.995388984680176,
"step": 240
},
{
"epoch": 0.8055770720371804,
"grad_norm": 0.06576687842607498,
"learning_rate": 0.00019947430157664576,
"logits/chosen": -1.816361427307129,
"logits/rejected": -1.3142831325531006,
"logps/chosen": -375.107421875,
"logps/rejected": -361.25567626953125,
"loss": 0.0121,
"rewards/accuracies": 0.984375,
"rewards/chosen": -2.420842409133911,
"rewards/margins": 11.270395278930664,
"rewards/rejected": -13.691238403320312,
"step": 260
},
{
"epoch": 0.8675445391169636,
"grad_norm": 0.01211523823440075,
"learning_rate": 0.00019933481792615583,
"logits/chosen": -1.7951005697250366,
"logits/rejected": -1.256089448928833,
"logps/chosen": -363.334228515625,
"logps/rejected": -335.49615478515625,
"loss": 0.0069,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -1.6655162572860718,
"rewards/margins": 11.434516906738281,
"rewards/rejected": -13.1000337600708,
"step": 280
},
{
"epoch": 0.9295120061967467,
"grad_norm": 0.005867226514965296,
"learning_rate": 0.0001991790013823246,
"logits/chosen": -1.8247705698013306,
"logits/rejected": -1.2836697101593018,
"logps/chosen": -373.73175048828125,
"logps/rejected": -328.99371337890625,
"loss": 0.0072,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.960078239440918,
"rewards/margins": 11.281866073608398,
"rewards/rejected": -13.241943359375,
"step": 300
},
{
"epoch": 0.9914794732765299,
"grad_norm": 0.11168529838323593,
"learning_rate": 0.0001990068775649202,
"logits/chosen": -1.8314838409423828,
"logits/rejected": -1.3281538486480713,
"logps/chosen": -362.94549560546875,
"logps/rejected": -310.90692138671875,
"loss": 0.0109,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.7653158903121948,
"rewards/margins": 10.92064094543457,
"rewards/rejected": -11.685956001281738,
"step": 320
},
{
"epoch": 1.053446940356313,
"grad_norm": 0.053166139870882034,
"learning_rate": 0.00019881847477499557,
"logits/chosen": -1.8288739919662476,
"logits/rejected": -1.2687069177627563,
"logps/chosen": -379.93914794921875,
"logps/rejected": -346.6662902832031,
"loss": 0.007,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3435510993003845,
"rewards/margins": 12.371174812316895,
"rewards/rejected": -12.714726448059082,
"step": 340
},
{
"epoch": 1.115414407436096,
"grad_norm": 0.007846315391361713,
"learning_rate": 0.0001986138239902355,
"logits/chosen": -1.8146957159042358,
"logits/rejected": -1.1931467056274414,
"logps/chosen": -361.128173828125,
"logps/rejected": -333.5379333496094,
"loss": 0.0035,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -0.7167718410491943,
"rewards/margins": 13.46613597869873,
"rewards/rejected": -14.182907104492188,
"step": 360
},
{
"epoch": 1.1773818745158793,
"grad_norm": 0.0029342020861804485,
"learning_rate": 0.00019839295885986296,
"logits/chosen": -1.8402125835418701,
"logits/rejected": -1.3026095628738403,
"logps/chosen": -367.6770935058594,
"logps/rejected": -334.61505126953125,
"loss": 0.0035,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.48788753151893616,
"rewards/margins": 12.355894088745117,
"rewards/rejected": -12.843780517578125,
"step": 380
},
{
"epoch": 1.2393493415956622,
"grad_norm": 0.0005422068061307073,
"learning_rate": 0.00019815591569910654,
"logits/chosen": -1.781711220741272,
"logits/rejected": -1.2187694311141968,
"logps/chosen": -368.02130126953125,
"logps/rejected": -336.0605163574219,
"loss": 0.004,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.474712073802948,
"rewards/margins": 13.070175170898438,
"rewards/rejected": -13.544886589050293,
"step": 400
},
{
"epoch": 1.3013168086754454,
"grad_norm": 0.004247570876032114,
"learning_rate": 0.0001979027334832293,
"logits/chosen": -1.729142189025879,
"logits/rejected": -1.1420295238494873,
"logps/chosen": -363.62261962890625,
"logps/rejected": -350.509765625,
"loss": 0.0046,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9223267436027527,
"rewards/margins": 14.022272109985352,
"rewards/rejected": -14.944600105285645,
"step": 420
},
{
"epoch": 1.3632842757552286,
"grad_norm": 0.025411546230316162,
"learning_rate": 0.00019763345384112043,
"logits/chosen": -1.6916519403457642,
"logits/rejected": -1.1293952465057373,
"logps/chosen": -368.69122314453125,
"logps/rejected": -357.363037109375,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.3415491580963135,
"rewards/margins": 13.441192626953125,
"rewards/rejected": -14.782742500305176,
"step": 440
},
{
"epoch": 1.4252517428350115,
"grad_norm": 0.023552559316158295,
"learning_rate": 0.00019734812104845047,
"logits/chosen": -1.6404588222503662,
"logits/rejected": -1.0976492166519165,
"logps/chosen": -358.5830993652344,
"logps/rejected": -323.82977294921875,
"loss": 0.004,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1879071146249771,
"rewards/margins": 11.893779754638672,
"rewards/rejected": -12.081686019897461,
"step": 460
},
{
"epoch": 1.4872192099147947,
"grad_norm": 0.04839726537466049,
"learning_rate": 0.0001970467820203915,
"logits/chosen": -1.4514319896697998,
"logits/rejected": -0.7945712208747864,
"logps/chosen": -395.62109375,
"logps/rejected": -361.99224853515625,
"loss": 0.0052,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -2.660977602005005,
"rewards/margins": 13.56675910949707,
"rewards/rejected": -16.227737426757812,
"step": 480
},
{
"epoch": 1.549186676994578,
"grad_norm": 0.04717102646827698,
"learning_rate": 0.00019672948630390294,
"logits/chosen": -1.6030662059783936,
"logits/rejected": -1.008603811264038,
"logps/chosen": -382.2178955078125,
"logps/rejected": -384.981201171875,
"loss": 0.0185,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -3.418046474456787,
"rewards/margins": 14.233471870422363,
"rewards/rejected": -17.65151596069336,
"step": 500
},
{
"epoch": 1.6111541440743609,
"grad_norm": 0.022282173857092857,
"learning_rate": 0.00019639628606958533,
"logits/chosen": -1.943267822265625,
"logits/rejected": -1.5064051151275635,
"logps/chosen": -350.5743408203125,
"logps/rejected": -292.48321533203125,
"loss": 0.0043,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -0.508022129535675,
"rewards/margins": 10.412274360656738,
"rewards/rejected": -10.920295715332031,
"step": 520
},
{
"epoch": 1.673121611154144,
"grad_norm": 0.009392939507961273,
"learning_rate": 0.00019604723610310194,
"logits/chosen": -1.932124376296997,
"logits/rejected": -1.507216215133667,
"logps/chosen": -366.7988586425781,
"logps/rejected": -342.846923828125,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8672822713851929,
"rewards/margins": 11.667869567871094,
"rewards/rejected": -12.535151481628418,
"step": 540
},
{
"epoch": 1.7350890782339272,
"grad_norm": 0.008884243667125702,
"learning_rate": 0.00019568239379617088,
"logits/chosen": -1.8822323083877563,
"logits/rejected": -1.4790470600128174,
"logps/chosen": -364.321044921875,
"logps/rejected": -341.40081787109375,
"loss": 0.0035,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -1.8236000537872314,
"rewards/margins": 12.299530982971191,
"rewards/rejected": -14.123130798339844,
"step": 560
},
{
"epoch": 1.7970565453137102,
"grad_norm": 0.0044061969965696335,
"learning_rate": 0.00019530181913712872,
"logits/chosen": -1.926490068435669,
"logits/rejected": -1.4624470472335815,
"logps/chosen": -372.48468017578125,
"logps/rejected": -331.5034484863281,
"loss": 0.0055,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -1.4063794612884521,
"rewards/margins": 12.16389274597168,
"rewards/rejected": -13.570272445678711,
"step": 580
},
{
"epoch": 1.8590240123934936,
"grad_norm": 0.028566114604473114,
"learning_rate": 0.00019490557470106686,
"logits/chosen": -1.92436945438385,
"logits/rejected": -1.499299168586731,
"logps/chosen": -355.2225646972656,
"logps/rejected": -351.27313232421875,
"loss": 0.0043,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -1.2374690771102905,
"rewards/margins": 13.03515338897705,
"rewards/rejected": -14.272623062133789,
"step": 600
},
{
"epoch": 1.9209914794732765,
"grad_norm": 0.006185224745422602,
"learning_rate": 0.00019449372563954293,
"logits/chosen": -1.9587417840957642,
"logits/rejected": -1.4495702981948853,
"logps/chosen": -383.0813903808594,
"logps/rejected": -355.744873046875,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.4742207527160645,
"rewards/margins": 13.494425773620605,
"rewards/rejected": -15.968646049499512,
"step": 620
},
{
"epoch": 1.9829589465530595,
"grad_norm": 0.006004327442497015,
"learning_rate": 0.00019406633966986828,
"logits/chosen": -1.9453758001327515,
"logits/rejected": -1.512027621269226,
"logps/chosen": -392.6808166503906,
"logps/rejected": -378.18316650390625,
"loss": 0.0018,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.3614554405212402,
"rewards/margins": 13.526113510131836,
"rewards/rejected": -15.88757038116455,
"step": 640
},
{
"epoch": 2.044926413632843,
"grad_norm": 0.013266593217849731,
"learning_rate": 0.00019362348706397373,
"logits/chosen": -1.9494597911834717,
"logits/rejected": -1.4765260219573975,
"logps/chosen": -373.5834045410156,
"logps/rejected": -355.810546875,
"loss": 0.0021,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -2.2433362007141113,
"rewards/margins": 13.272119522094727,
"rewards/rejected": -15.51545524597168,
"step": 660
},
{
"epoch": 2.106893880712626,
"grad_norm": 0.0013421621406450868,
"learning_rate": 0.0001931652406368554,
"logits/chosen": -1.879929542541504,
"logits/rejected": -1.4265925884246826,
"logps/chosen": -377.5626220703125,
"logps/rejected": -365.1024475097656,
"loss": 0.0016,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -2.054849624633789,
"rewards/margins": 14.068676948547363,
"rewards/rejected": -16.123525619506836,
"step": 680
},
{
"epoch": 2.168861347792409,
"grad_norm": 0.0016059954650700092,
"learning_rate": 0.0001926916757346022,
"logits/chosen": -1.8783481121063232,
"logits/rejected": -1.4017314910888672,
"logps/chosen": -375.7680969238281,
"logps/rejected": -356.9335021972656,
"loss": 0.0024,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -2.0512871742248535,
"rewards/margins": 14.513456344604492,
"rewards/rejected": -16.564743041992188,
"step": 700
},
{
"epoch": 2.230828814872192,
"grad_norm": 0.0020687805954366922,
"learning_rate": 0.00019220287022200707,
"logits/chosen": -1.8722127676010132,
"logits/rejected": -1.4170135259628296,
"logps/chosen": -360.9228515625,
"logps/rejected": -376.93304443359375,
"loss": 0.0024,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -2.443851947784424,
"rewards/margins": 15.007545471191406,
"rewards/rejected": -17.451396942138672,
"step": 720
},
{
"epoch": 2.292796281951975,
"grad_norm": 0.03182324767112732,
"learning_rate": 0.00019169890446976454,
"logits/chosen": -1.8520162105560303,
"logits/rejected": -1.316450834274292,
"logps/chosen": -392.74285888671875,
"logps/rejected": -379.98138427734375,
"loss": 0.0013,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -2.4773547649383545,
"rewards/margins": 15.281835556030273,
"rewards/rejected": -17.75918960571289,
"step": 740
},
{
"epoch": 2.3547637490317586,
"grad_norm": 0.015935391187667847,
"learning_rate": 0.0001911798613412557,
"logits/chosen": -1.8732004165649414,
"logits/rejected": -1.374529480934143,
"logps/chosen": -386.89178466796875,
"logps/rejected": -386.22894287109375,
"loss": 0.0034,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -2.536558151245117,
"rewards/margins": 15.137763977050781,
"rewards/rejected": -17.6743221282959,
"step": 760
},
{
"epoch": 2.4167312161115415,
"grad_norm": 0.00028358056442812085,
"learning_rate": 0.0001906458261789238,
"logits/chosen": -1.8395631313323975,
"logits/rejected": -1.3308550119400024,
"logps/chosen": -388.93792724609375,
"logps/rejected": -391.17559814453125,
"loss": 0.0018,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -2.6551461219787598,
"rewards/margins": 15.461560249328613,
"rewards/rejected": -18.116708755493164,
"step": 780
},
{
"epoch": 2.4786986831913245,
"grad_norm": 0.001103501650504768,
"learning_rate": 0.0001900968867902419,
"logits/chosen": -1.8540499210357666,
"logits/rejected": -1.3438807725906372,
"logps/chosen": -397.89093017578125,
"logps/rejected": -393.6608581542969,
"loss": 0.0015,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -2.684976100921631,
"rewards/margins": 15.562596321105957,
"rewards/rejected": -18.247573852539062,
"step": 800
},
{
"epoch": 2.5406661502711074,
"grad_norm": 0.05029486119747162,
"learning_rate": 0.0001895331334332753,
"logits/chosen": -1.8151705265045166,
"logits/rejected": -1.3103126287460327,
"logps/chosen": -396.3746643066406,
"logps/rejected": -391.5860900878906,
"loss": 0.0037,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -3.1363155841827393,
"rewards/margins": 15.38147258758545,
"rewards/rejected": -18.51778793334961,
"step": 820
},
{
"epoch": 2.602633617350891,
"grad_norm": 0.0015266811242327094,
"learning_rate": 0.0001889546588018412,
"logits/chosen": -1.850388765335083,
"logits/rejected": -1.3118959665298462,
"logps/chosen": -381.0390319824219,
"logps/rejected": -371.218505859375,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.7308974266052246,
"rewards/margins": 15.474958419799805,
"rewards/rejected": -18.205854415893555,
"step": 840
},
{
"epoch": 2.664601084430674,
"grad_norm": 0.010239909403026104,
"learning_rate": 0.00018836155801026753,
"logits/chosen": -1.8376766443252563,
"logits/rejected": -1.337482213973999,
"logps/chosen": -380.15032958984375,
"logps/rejected": -385.6625061035156,
"loss": 0.0059,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -2.8081612586975098,
"rewards/margins": 15.317975997924805,
"rewards/rejected": -18.12613868713379,
"step": 860
},
{
"epoch": 2.726568551510457,
"grad_norm": 0.005239796359091997,
"learning_rate": 0.00018775392857775432,
"logits/chosen": -1.8260116577148438,
"logits/rejected": -1.3371708393096924,
"logps/chosen": -386.72052001953125,
"logps/rejected": -393.1973571777344,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4543259143829346,
"rewards/margins": 15.405393600463867,
"rewards/rejected": -18.859722137451172,
"step": 880
},
{
"epoch": 2.78853601859024,
"grad_norm": 0.0014312748098745942,
"learning_rate": 0.00018713187041233896,
"logits/chosen": -1.8437349796295166,
"logits/rejected": -1.295083999633789,
"logps/chosen": -396.12713623046875,
"logps/rejected": -400.5750427246094,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4062328338623047,
"rewards/margins": 17.027809143066406,
"rewards/rejected": -20.434043884277344,
"step": 900
},
{
"epoch": 2.850503485670023,
"grad_norm": 0.03151211887598038,
"learning_rate": 0.00018649548579446936,
"logits/chosen": -1.8418632745742798,
"logits/rejected": -1.3832991123199463,
"logps/chosen": -387.4415588378906,
"logps/rejected": -418.4268493652344,
"loss": 0.0036,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -3.485564708709717,
"rewards/margins": 15.658266067504883,
"rewards/rejected": -19.14383316040039,
"step": 920
},
{
"epoch": 2.9124709527498065,
"grad_norm": 0.003437014762312174,
"learning_rate": 0.00018584487936018661,
"logits/chosen": -1.957241415977478,
"logits/rejected": -1.4707096815109253,
"logps/chosen": -370.52734375,
"logps/rejected": -367.0068054199219,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7640680074691772,
"rewards/margins": 14.591270446777344,
"rewards/rejected": -16.3553409576416,
"step": 940
},
{
"epoch": 2.9744384198295895,
"grad_norm": 0.0018515066476538777,
"learning_rate": 0.00018518015808392045,
"logits/chosen": -1.8616878986358643,
"logits/rejected": -1.3850669860839844,
"logps/chosen": -370.74847412109375,
"logps/rejected": -395.7770690917969,
"loss": 0.0019,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.756985664367676,
"rewards/margins": 15.77873420715332,
"rewards/rejected": -18.53571891784668,
"step": 960
},
{
"epoch": 3.0364058869093724,
"grad_norm": 0.0055403695441782475,
"learning_rate": 0.00018450143126090015,
"logits/chosen": -1.9129266738891602,
"logits/rejected": -1.4352341890335083,
"logps/chosen": -378.54547119140625,
"logps/rejected": -389.22955322265625,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.454970359802246,
"rewards/margins": 15.567869186401367,
"rewards/rejected": -18.022838592529297,
"step": 980
},
{
"epoch": 3.098373353989156,
"grad_norm": 0.0003845282772090286,
"learning_rate": 0.00018380881048918405,
"logits/chosen": -1.955512285232544,
"logits/rejected": -1.4428436756134033,
"logps/chosen": -375.7381286621094,
"logps/rejected": -373.1043701171875,
"loss": 0.0023,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9304916858673096,
"rewards/margins": 15.572137832641602,
"rewards/rejected": -17.502628326416016,
"step": 1000
},
{
"epoch": 3.1603408210689388,
"grad_norm": 0.000813652528449893,
"learning_rate": 0.00018310240965131041,
"logits/chosen": -1.9499313831329346,
"logits/rejected": -1.4106732606887817,
"logps/chosen": -363.78314208984375,
"logps/rejected": -364.62835693359375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.788172721862793,
"rewards/margins": 15.584823608398438,
"rewards/rejected": -17.372997283935547,
"step": 1020
},
{
"epoch": 3.2223082881487217,
"grad_norm": 0.0015642641810700297,
"learning_rate": 0.00018238234489557215,
"logits/chosen": -1.9376710653305054,
"logits/rejected": -1.4058828353881836,
"logps/chosen": -391.0188903808594,
"logps/rejected": -384.52716064453125,
"loss": 0.0023,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -1.709324598312378,
"rewards/margins": 16.003910064697266,
"rewards/rejected": -17.713237762451172,
"step": 1040
},
{
"epoch": 3.284275755228505,
"grad_norm": 0.013190961442887783,
"learning_rate": 0.00018164873461691986,
"logits/chosen": -1.9225285053253174,
"logits/rejected": -1.4039231538772583,
"logps/chosen": -389.7248840332031,
"logps/rejected": -403.44891357421875,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.2535457611083984,
"rewards/margins": 17.14788818359375,
"rewards/rejected": -19.401432037353516,
"step": 1060
},
{
"epoch": 3.346243222308288,
"grad_norm": 0.0009441258735023439,
"learning_rate": 0.00018090169943749476,
"logits/chosen": -1.9266620874404907,
"logits/rejected": -1.3820419311523438,
"logps/chosen": -377.3229064941406,
"logps/rejected": -394.3813171386719,
"loss": 0.0012,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -2.6834962368011475,
"rewards/margins": 16.853666305541992,
"rewards/rejected": -19.537160873413086,
"step": 1080
},
{
"epoch": 3.4082106893880715,
"grad_norm": 0.000891213770955801,
"learning_rate": 0.00018014136218679567,
"logits/chosen": -1.8898261785507202,
"logits/rejected": -1.3582581281661987,
"logps/chosen": -367.8475341796875,
"logps/rejected": -381.94219970703125,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.8650197982788086,
"rewards/margins": 16.576953887939453,
"rewards/rejected": -19.441974639892578,
"step": 1100
},
{
"epoch": 3.4701781564678544,
"grad_norm": 0.0021270292345434427,
"learning_rate": 0.00017936784788148328,
"logits/chosen": -1.9054046869277954,
"logits/rejected": -1.3137685060501099,
"logps/chosen": -396.55718994140625,
"logps/rejected": -399.8603515625,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.9427146911621094,
"rewards/margins": 17.294252395629883,
"rewards/rejected": -20.236968994140625,
"step": 1120
},
{
"epoch": 3.5321456235476374,
"grad_norm": 0.0006443614838644862,
"learning_rate": 0.00017858128370482426,
"logits/chosen": -1.8784294128417969,
"logits/rejected": -1.3266098499298096,
"logps/chosen": -376.5830993652344,
"logps/rejected": -384.6981506347656,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.870404005050659,
"rewards/margins": 17.322202682495117,
"rewards/rejected": -20.192609786987305,
"step": 1140
},
{
"epoch": 3.5941130906274203,
"grad_norm": 0.0011427829740568995,
"learning_rate": 0.00017778179898577973,
"logits/chosen": -1.8605209589004517,
"logits/rejected": -1.3551753759384155,
"logps/chosen": -393.83099365234375,
"logps/rejected": -431.01824951171875,
"loss": 0.0044,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.188037872314453,
"rewards/margins": 17.58969497680664,
"rewards/rejected": -21.77773094177246,
"step": 1160
},
{
"epoch": 3.6560805577072037,
"grad_norm": 0.00015023932792246342,
"learning_rate": 0.00017696952517774062,
"logits/chosen": -1.8713442087173462,
"logits/rejected": -1.2884734869003296,
"logps/chosen": -389.5274658203125,
"logps/rejected": -406.44696044921875,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -3.2542433738708496,
"rewards/margins": 18.175609588623047,
"rewards/rejected": -21.429855346679688,
"step": 1180
},
{
"epoch": 3.7180480247869867,
"grad_norm": 0.0034171934239566326,
"learning_rate": 0.00017614459583691346,
"logits/chosen": -1.8342435359954834,
"logits/rejected": -1.33168625831604,
"logps/chosen": -392.7457275390625,
"logps/rejected": -424.7430725097656,
"loss": 0.0012,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.033926963806152,
"rewards/margins": 17.532773971557617,
"rewards/rejected": -21.566701889038086,
"step": 1200
},
{
"epoch": 3.78001549186677,
"grad_norm": 0.00014497939264401793,
"learning_rate": 0.00017530714660036112,
"logits/chosen": -1.8120412826538086,
"logits/rejected": -1.2837426662445068,
"logps/chosen": -400.38055419921875,
"logps/rejected": -432.98175048828125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.632486343383789,
"rewards/margins": 18.09763526916504,
"rewards/rejected": -21.730119705200195,
"step": 1220
},
{
"epoch": 3.841982958946553,
"grad_norm": 0.00035277256392873824,
"learning_rate": 0.0001744573151637007,
"logits/chosen": -1.7961149215698242,
"logits/rejected": -1.2880661487579346,
"logps/chosen": -389.3721618652344,
"logps/rejected": -458.435546875,
"loss": 0.0036,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.226949214935303,
"rewards/margins": 18.70314598083496,
"rewards/rejected": -22.930095672607422,
"step": 1240
},
{
"epoch": 3.903950426026336,
"grad_norm": 0.0018203147919848561,
"learning_rate": 0.0001735952412584635,
"logits/chosen": -1.8189284801483154,
"logits/rejected": -1.2755413055419922,
"logps/chosen": -403.92608642578125,
"logps/rejected": -437.57470703125,
"loss": 0.0023,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.302323818206787,
"rewards/margins": 18.439044952392578,
"rewards/rejected": -22.741369247436523,
"step": 1260
},
{
"epoch": 3.9659178931061194,
"grad_norm": 0.000810753321275115,
"learning_rate": 0.00017272106662911973,
"logits/chosen": -1.8001739978790283,
"logits/rejected": -1.2190439701080322,
"logps/chosen": -392.6038513183594,
"logps/rejected": -409.79754638671875,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.5174388885498047,
"rewards/margins": 18.15955924987793,
"rewards/rejected": -21.676998138427734,
"step": 1280
},
{
"epoch": 4.027885360185903,
"grad_norm": 0.0008877617656253278,
"learning_rate": 0.00017183493500977278,
"logits/chosen": -1.7996867895126343,
"logits/rejected": -1.2403078079223633,
"logps/chosen": -376.8688659667969,
"logps/rejected": -401.3122863769531,
"loss": 0.0012,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -3.8793225288391113,
"rewards/margins": 17.706012725830078,
"rewards/rejected": -21.58533477783203,
"step": 1300
},
{
"epoch": 4.089852827265686,
"grad_norm": 0.0007201443077065051,
"learning_rate": 0.0001709369921005258,
"logits/chosen": -1.7817294597625732,
"logits/rejected": -1.3144575357437134,
"logps/chosen": -362.8156433105469,
"logps/rejected": -421.5276794433594,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -3.907405376434326,
"rewards/margins": 17.486907958984375,
"rewards/rejected": -21.394317626953125,
"step": 1320
},
{
"epoch": 4.151820294345469,
"grad_norm": 0.0004134229675401002,
"learning_rate": 0.00017002738554352552,
"logits/chosen": -1.7647602558135986,
"logits/rejected": -1.2397964000701904,
"logps/chosen": -400.63525390625,
"logps/rejected": -434.27734375,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.650538444519043,
"rewards/margins": 17.86612319946289,
"rewards/rejected": -22.516660690307617,
"step": 1340
},
{
"epoch": 4.213787761425252,
"grad_norm": 0.0018414207734167576,
"learning_rate": 0.00016910626489868649,
"logits/chosen": -1.8098886013031006,
"logits/rejected": -1.2557048797607422,
"logps/chosen": -403.9068908691406,
"logps/rejected": -441.5738220214844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.8825366497039795,
"rewards/margins": 19.2824764251709,
"rewards/rejected": -23.165014266967773,
"step": 1360
},
{
"epoch": 4.275755228505035,
"grad_norm": 0.000604189292062074,
"learning_rate": 0.00016817378161909996,
"logits/chosen": -1.7331501245498657,
"logits/rejected": -1.1988348960876465,
"logps/chosen": -379.48004150390625,
"logps/rejected": -416.23504638671875,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -4.858603477478027,
"rewards/margins": 17.692523956298828,
"rewards/rejected": -22.551128387451172,
"step": 1380
},
{
"epoch": 4.337722695584818,
"grad_norm": 0.0018184883520007133,
"learning_rate": 0.0001672300890261317,
"logits/chosen": -1.786969780921936,
"logits/rejected": -1.1631317138671875,
"logps/chosen": -399.63836669921875,
"logps/rejected": -406.0413513183594,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.497194766998291,
"rewards/margins": 17.606014251708984,
"rewards/rejected": -22.103206634521484,
"step": 1400
},
{
"epoch": 4.3996901626646014,
"grad_norm": 0.0004817396984435618,
"learning_rate": 0.0001662753422842123,
"logits/chosen": -1.803607702255249,
"logits/rejected": -1.2023392915725708,
"logps/chosen": -397.8926086425781,
"logps/rejected": -415.9464416503906,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.3522844314575195,
"rewards/margins": 18.28469467163086,
"rewards/rejected": -22.636978149414062,
"step": 1420
},
{
"epoch": 4.461657629744384,
"grad_norm": 0.0003521572216413915,
"learning_rate": 0.00016530969837532487,
"logits/chosen": -1.745550513267517,
"logits/rejected": -1.2345880270004272,
"logps/chosen": -398.3353271484375,
"logps/rejected": -455.84991455078125,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.580657005310059,
"rewards/margins": 18.520645141601562,
"rewards/rejected": -23.101301193237305,
"step": 1440
},
{
"epoch": 4.523625096824167,
"grad_norm": 0.001398236840032041,
"learning_rate": 0.00016433331607319343,
"logits/chosen": -1.7653003931045532,
"logits/rejected": -1.2409374713897705,
"logps/chosen": -390.4782409667969,
"logps/rejected": -445.02203369140625,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.244819641113281,
"rewards/margins": 19.066150665283203,
"rewards/rejected": -23.31096839904785,
"step": 1460
},
{
"epoch": 4.58559256390395,
"grad_norm": 0.0006393153453245759,
"learning_rate": 0.00016334635591717703,
"logits/chosen": -1.7738897800445557,
"logits/rejected": -1.2459341287612915,
"logps/chosen": -405.1599426269531,
"logps/rejected": -465.34796142578125,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.869115114212036,
"rewards/margins": 20.113529205322266,
"rewards/rejected": -23.98264503479004,
"step": 1480
},
{
"epoch": 4.647560030983733,
"grad_norm": 0.0002729636325966567,
"learning_rate": 0.00016234898018587337,
"logits/chosen": -1.7716586589813232,
"logits/rejected": -1.156842589378357,
"logps/chosen": -400.9200439453125,
"logps/rejected": -419.4234924316406,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.553537368774414,
"rewards/margins": 18.427448272705078,
"rewards/rejected": -22.980987548828125,
"step": 1500
},
{
"epoch": 4.709527498063517,
"grad_norm": 0.0016045222291722894,
"learning_rate": 0.00016134135287043669,
"logits/chosen": -1.7796188592910767,
"logits/rejected": -1.1779518127441406,
"logps/chosen": -407.48773193359375,
"logps/rejected": -439.03143310546875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.231381416320801,
"rewards/margins": 19.530107498168945,
"rewards/rejected": -23.761486053466797,
"step": 1520
},
{
"epoch": 4.7714949651433,
"grad_norm": 0.0001898371265269816,
"learning_rate": 0.00016032363964761363,
"logits/chosen": -1.7506084442138672,
"logits/rejected": -1.1158758401870728,
"logps/chosen": -412.0704650878906,
"logps/rejected": -419.58477783203125,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.735566139221191,
"rewards/margins": 18.557144165039062,
"rewards/rejected": -23.292709350585938,
"step": 1540
},
{
"epoch": 4.833462432223083,
"grad_norm": 0.0011102559510618448,
"learning_rate": 0.00015929600785250257,
"logits/chosen": -1.772351861000061,
"logits/rejected": -1.199371576309204,
"logps/chosen": -411.6983337402344,
"logps/rejected": -456.08526611328125,
"loss": 0.0044,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.344552516937256,
"rewards/margins": 19.66854476928711,
"rewards/rejected": -24.01309585571289,
"step": 1560
},
{
"epoch": 4.895429899302866,
"grad_norm": 0.0002147419872926548,
"learning_rate": 0.0001582586264510396,
"logits/chosen": -1.7624610662460327,
"logits/rejected": -1.1555306911468506,
"logps/chosen": -392.86846923828125,
"logps/rejected": -411.6356506347656,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.8805503845214844,
"rewards/margins": 18.482906341552734,
"rewards/rejected": -22.36345672607422,
"step": 1580
},
{
"epoch": 4.957397366382649,
"grad_norm": 0.00014843855751678348,
"learning_rate": 0.00015721166601221698,
"logits/chosen": -1.7433449029922485,
"logits/rejected": -1.1605427265167236,
"logps/chosen": -402.5615539550781,
"logps/rejected": -437.72601318359375,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.379772186279297,
"rewards/margins": 19.26140022277832,
"rewards/rejected": -23.641170501708984,
"step": 1600
},
{
"epoch": 5.019364833462432,
"grad_norm": 9.896748815663159e-05,
"learning_rate": 0.0001561552986800375,
"logits/chosen": -1.7666635513305664,
"logits/rejected": -1.2081592082977295,
"logps/chosen": -409.02685546875,
"logps/rejected": -462.6644592285156,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.824324607849121,
"rewards/margins": 19.418132781982422,
"rewards/rejected": -24.242456436157227,
"step": 1620
},
{
"epoch": 5.081332300542216,
"grad_norm": 6.193404988152906e-05,
"learning_rate": 0.00015508969814521025,
"logits/chosen": -1.7530428171157837,
"logits/rejected": -1.2155699729919434,
"logps/chosen": -396.701171875,
"logps/rejected": -438.2998046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.757896423339844,
"rewards/margins": 18.720035552978516,
"rewards/rejected": -23.47793197631836,
"step": 1640
},
{
"epoch": 5.143299767621999,
"grad_norm": 0.0005012938636355102,
"learning_rate": 0.00015401503961659204,
"logits/chosen": -1.76808762550354,
"logits/rejected": -1.2039562463760376,
"logps/chosen": -416.18133544921875,
"logps/rejected": -471.65032958984375,
"loss": 0.0055,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -4.3714799880981445,
"rewards/margins": 20.104217529296875,
"rewards/rejected": -24.475696563720703,
"step": 1660
},
{
"epoch": 5.205267234701782,
"grad_norm": 0.0007204354042187333,
"learning_rate": 0.00015293149979237876,
"logits/chosen": -1.700727105140686,
"logits/rejected": -1.1688693761825562,
"logps/chosen": -395.04620361328125,
"logps/rejected": -459.3890686035156,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.376019477844238,
"rewards/margins": 19.267929077148438,
"rewards/rejected": -24.643945693969727,
"step": 1680
},
{
"epoch": 5.267234701781565,
"grad_norm": 0.00012067196075804532,
"learning_rate": 0.00015183925683105254,
"logits/chosen": -1.7348114252090454,
"logits/rejected": -1.1479172706604004,
"logps/chosen": -411.1114807128906,
"logps/rejected": -467.02777099609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.358091354370117,
"rewards/margins": 20.035839080810547,
"rewards/rejected": -24.393932342529297,
"step": 1700
},
{
"epoch": 5.329202168861348,
"grad_norm": 0.0015901889419183135,
"learning_rate": 0.00015073849032208822,
"logits/chosen": -1.7161178588867188,
"logits/rejected": -1.1550828218460083,
"logps/chosen": -408.5069885253906,
"logps/rejected": -455.2245178222656,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.192176342010498,
"rewards/margins": 19.474624633789062,
"rewards/rejected": -24.66680145263672,
"step": 1720
},
{
"epoch": 5.3911696359411305,
"grad_norm": 2.9804143196088262e-05,
"learning_rate": 0.00014962938125642503,
"logits/chosen": -1.7266225814819336,
"logits/rejected": -1.1720420122146606,
"logps/chosen": -404.70721435546875,
"logps/rejected": -468.11956787109375,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.117176532745361,
"rewards/margins": 19.882728576660156,
"rewards/rejected": -24.99990463256836,
"step": 1740
},
{
"epoch": 5.453137103020914,
"grad_norm": 0.001581120421178639,
"learning_rate": 0.00014851211199670721,
"logits/chosen": -1.7630701065063477,
"logits/rejected": -1.1630027294158936,
"logps/chosen": -387.80364990234375,
"logps/rejected": -445.5340270996094,
"loss": 0.0076,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -4.650803565979004,
"rewards/margins": 19.620697021484375,
"rewards/rejected": -24.271501541137695,
"step": 1760
},
{
"epoch": 5.515104570100697,
"grad_norm": 7.492147415177897e-05,
"learning_rate": 0.00014738686624729986,
"logits/chosen": -1.7199184894561768,
"logits/rejected": -1.1519477367401123,
"logps/chosen": -398.6278991699219,
"logps/rejected": -449.28826904296875,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.650136947631836,
"rewards/margins": 19.344139099121094,
"rewards/rejected": -23.99427604675293,
"step": 1780
},
{
"epoch": 5.57707203718048,
"grad_norm": 0.0007189544849097729,
"learning_rate": 0.00014625382902408356,
"logits/chosen": -1.7485740184783936,
"logits/rejected": -1.15171217918396,
"logps/chosen": -413.4642639160156,
"logps/rejected": -454.82623291015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.690885543823242,
"rewards/margins": 19.775279998779297,
"rewards/rejected": -24.466161727905273,
"step": 1800
},
{
"epoch": 5.639039504260263,
"grad_norm": 9.353666246170178e-05,
"learning_rate": 0.00014511318662403347,
"logits/chosen": -1.7578392028808594,
"logits/rejected": -1.1830543279647827,
"logps/chosen": -395.25433349609375,
"logps/rejected": -461.00128173828125,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.259980201721191,
"rewards/margins": 20.097646713256836,
"rewards/rejected": -24.35762596130371,
"step": 1820
},
{
"epoch": 5.701006971340046,
"grad_norm": 0.00011017426731996238,
"learning_rate": 0.00014396512659458824,
"logits/chosen": -1.718340277671814,
"logits/rejected": -1.1603585481643677,
"logps/chosen": -397.50201416015625,
"logps/rejected": -441.17120361328125,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.123129844665527,
"rewards/margins": 18.981271743774414,
"rewards/rejected": -24.104402542114258,
"step": 1840
},
{
"epoch": 5.76297443841983,
"grad_norm": 0.0007490446441806853,
"learning_rate": 0.0001428098377028126,
"logits/chosen": -1.7352231740951538,
"logits/rejected": -1.1633882522583008,
"logps/chosen": -395.93719482421875,
"logps/rejected": -450.5420837402344,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.510663032531738,
"rewards/margins": 20.08230972290039,
"rewards/rejected": -24.59296989440918,
"step": 1860
},
{
"epoch": 5.824941905499613,
"grad_norm": 0.002562998328357935,
"learning_rate": 0.0001416475099043599,
"logits/chosen": -1.7280263900756836,
"logits/rejected": -1.0888252258300781,
"logps/chosen": -383.5231628417969,
"logps/rejected": -423.22735595703125,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.361128330230713,
"rewards/margins": 19.707561492919922,
"rewards/rejected": -24.06869125366211,
"step": 1880
},
{
"epoch": 5.886909372579396,
"grad_norm": 0.0003409655182622373,
"learning_rate": 0.00014047833431223938,
"logits/chosen": -1.7228466272354126,
"logits/rejected": -1.1678210496902466,
"logps/chosen": -427.7156677246094,
"logps/rejected": -484.9002990722656,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.072082996368408,
"rewards/margins": 19.94878387451172,
"rewards/rejected": -25.0208683013916,
"step": 1900
},
{
"epoch": 5.948876839659179,
"grad_norm": 3.485321212792769e-05,
"learning_rate": 0.00013930250316539238,
"logits/chosen": -1.7439708709716797,
"logits/rejected": -1.1591265201568604,
"logps/chosen": -409.28485107421875,
"logps/rejected": -464.5729064941406,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.592177867889404,
"rewards/margins": 20.056758880615234,
"rewards/rejected": -24.64893913269043,
"step": 1920
},
{
"epoch": 6.010844306738962,
"grad_norm": 0.0024052930530160666,
"learning_rate": 0.00013812020979708418,
"logits/chosen": -1.766571044921875,
"logits/rejected": -1.1335632801055908,
"logps/chosen": -409.98095703125,
"logps/rejected": -432.7437438964844,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.963695526123047,
"rewards/margins": 19.679019927978516,
"rewards/rejected": -24.642715454101562,
"step": 1940
},
{
"epoch": 6.072811773818745,
"grad_norm": 7.735176041023806e-05,
"learning_rate": 0.00013693164860311565,
"logits/chosen": -1.7631984949111938,
"logits/rejected": -1.1198147535324097,
"logps/chosen": -398.9923400878906,
"logps/rejected": -429.88861083984375,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.000827312469482,
"rewards/margins": 20.33033561706543,
"rewards/rejected": -24.331165313720703,
"step": 1960
},
{
"epoch": 6.134779240898529,
"grad_norm": 0.0003688503638841212,
"learning_rate": 0.0001357370150098601,
"logits/chosen": -1.7265870571136475,
"logits/rejected": -1.1435579061508179,
"logps/chosen": -390.2747497558594,
"logps/rejected": -457.9873962402344,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.666455268859863,
"rewards/margins": 20.30272102355957,
"rewards/rejected": -24.969173431396484,
"step": 1980
},
{
"epoch": 6.196746707978312,
"grad_norm": 0.0016685057198628783,
"learning_rate": 0.00013453650544213076,
"logits/chosen": -1.7364275455474854,
"logits/rejected": -1.1212728023529053,
"logps/chosen": -404.72869873046875,
"logps/rejected": -440.9786071777344,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -4.581490993499756,
"rewards/margins": 19.78643035888672,
"rewards/rejected": -24.367919921875,
"step": 2000
},
{
"epoch": 6.258714175058095,
"grad_norm": 0.00023198116105049849,
"learning_rate": 0.00013333031729088419,
"logits/chosen": -1.7448314428329468,
"logits/rejected": -1.1462557315826416,
"logps/chosen": -401.00048828125,
"logps/rejected": -452.0621032714844,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.24946928024292,
"rewards/margins": 20.46927833557129,
"rewards/rejected": -24.718748092651367,
"step": 2020
},
{
"epoch": 6.3206816421378775,
"grad_norm": 0.00022464637004304677,
"learning_rate": 0.00013211864888076457,
"logits/chosen": -1.691931962966919,
"logits/rejected": -1.16156005859375,
"logps/chosen": -417.93585205078125,
"logps/rejected": -468.42791748046875,
"loss": 0.0044,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -5.851279258728027,
"rewards/margins": 19.037456512451172,
"rewards/rejected": -24.888734817504883,
"step": 2040
},
{
"epoch": 6.3826491092176605,
"grad_norm": 0.0001370076060993597,
"learning_rate": 0.00013090169943749476,
"logits/chosen": -1.7306629419326782,
"logits/rejected": -1.16789972782135,
"logps/chosen": -400.44989013671875,
"logps/rejected": -461.5997009277344,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.793812274932861,
"rewards/margins": 20.2277889251709,
"rewards/rejected": -25.02159881591797,
"step": 2060
},
{
"epoch": 6.4446165762974434,
"grad_norm": 0.0007584911654703319,
"learning_rate": 0.00012967966905511906,
"logits/chosen": -1.7538254261016846,
"logits/rejected": -1.1523357629776,
"logps/chosen": -400.55078125,
"logps/rejected": -457.19439697265625,
"loss": 0.0033,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.016867637634277,
"rewards/margins": 20.043991088867188,
"rewards/rejected": -25.06085777282715,
"step": 2080
},
{
"epoch": 6.506584043377227,
"grad_norm": 0.00025258222012780607,
"learning_rate": 0.00012845275866310324,
"logits/chosen": -1.709283471107483,
"logits/rejected": -1.1272356510162354,
"logps/chosen": -393.4644775390625,
"logps/rejected": -445.11932373046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.725881576538086,
"rewards/margins": 20.157442092895508,
"rewards/rejected": -24.88332176208496,
"step": 2100
},
{
"epoch": 6.56855151045701,
"grad_norm": 0.0005373629392124712,
"learning_rate": 0.00012722116999329712,
"logits/chosen": -1.7319450378417969,
"logits/rejected": -1.146323323249817,
"logps/chosen": -400.94219970703125,
"logps/rejected": -457.70294189453125,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.707498073577881,
"rewards/margins": 19.930648803710938,
"rewards/rejected": -24.638147354125977,
"step": 2120
},
{
"epoch": 6.630518977536793,
"grad_norm": 3.2575491786701605e-05,
"learning_rate": 0.0001259851055467653,
"logits/chosen": -1.7204310894012451,
"logits/rejected": -1.1470435857772827,
"logps/chosen": -407.14794921875,
"logps/rejected": -463.16937255859375,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.401209831237793,
"rewards/margins": 19.731382369995117,
"rewards/rejected": -25.132593154907227,
"step": 2140
},
{
"epoch": 6.692486444616576,
"grad_norm": 4.120891753700562e-05,
"learning_rate": 0.00012474476856049144,
"logits/chosen": -1.758186936378479,
"logits/rejected": -1.0516242980957031,
"logps/chosen": -422.578125,
"logps/rejected": -450.13360595703125,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.05043888092041,
"rewards/margins": 20.296903610229492,
"rewards/rejected": -25.347341537475586,
"step": 2160
},
{
"epoch": 6.754453911696359,
"grad_norm": 0.0018112401012331247,
"learning_rate": 0.00012350036297396154,
"logits/chosen": -1.7569530010223389,
"logits/rejected": -1.1236534118652344,
"logps/chosen": -398.9664001464844,
"logps/rejected": -440.2588806152344,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.544419288635254,
"rewards/margins": 20.12918472290039,
"rewards/rejected": -24.673603057861328,
"step": 2180
},
{
"epoch": 6.816421378776143,
"grad_norm": 0.0009737831423990428,
"learning_rate": 0.00012225209339563145,
"logits/chosen": -1.709917664527893,
"logits/rejected": -1.1064178943634033,
"logps/chosen": -414.5459899902344,
"logps/rejected": -465.4837341308594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.264222145080566,
"rewards/margins": 20.37704849243164,
"rewards/rejected": -25.64126968383789,
"step": 2200
},
{
"epoch": 6.878388845855926,
"grad_norm": 0.000668133026920259,
"learning_rate": 0.00012100016506928493,
"logits/chosen": -1.733787178993225,
"logits/rejected": -1.1450860500335693,
"logps/chosen": -403.2812805175781,
"logps/rejected": -477.0782165527344,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.641029357910156,
"rewards/margins": 21.0471134185791,
"rewards/rejected": -25.68814468383789,
"step": 2220
},
{
"epoch": 6.940356312935709,
"grad_norm": 0.00028338556876406074,
"learning_rate": 0.00011974478384028672,
"logits/chosen": -1.703685998916626,
"logits/rejected": -1.0926717519760132,
"logps/chosen": -415.73248291015625,
"logps/rejected": -474.7493591308594,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.833617210388184,
"rewards/margins": 19.839744567871094,
"rewards/rejected": -25.67336082458496,
"step": 2240
},
{
"epoch": 7.002323780015492,
"grad_norm": 9.248249261872843e-05,
"learning_rate": 0.00011848615612173688,
"logits/chosen": -1.727691888809204,
"logits/rejected": -1.1385018825531006,
"logps/chosen": -404.37158203125,
"logps/rejected": -455.1560974121094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.189269065856934,
"rewards/margins": 20.383289337158203,
"rewards/rejected": -25.572555541992188,
"step": 2260
},
{
"epoch": 7.064291247095275,
"grad_norm": 1.9335082470206544e-05,
"learning_rate": 0.0001172244888605319,
"logits/chosen": -1.687378168106079,
"logits/rejected": -1.1057562828063965,
"logps/chosen": -406.32733154296875,
"logps/rejected": -474.8482360839844,
"loss": 0.0033,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -4.787657737731934,
"rewards/margins": 20.789146423339844,
"rewards/rejected": -25.576807022094727,
"step": 2280
},
{
"epoch": 7.126258714175058,
"grad_norm": 8.403878018725663e-05,
"learning_rate": 0.00011595998950333793,
"logits/chosen": -1.6789989471435547,
"logits/rejected": -1.1095144748687744,
"logps/chosen": -409.31524658203125,
"logps/rejected": -472.5364685058594,
"loss": 0.0033,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.127674579620361,
"rewards/margins": 20.548160552978516,
"rewards/rejected": -25.675832748413086,
"step": 2300
},
{
"epoch": 7.188226181254842,
"grad_norm": 0.0001840272598201409,
"learning_rate": 0.00011469286596248181,
"logits/chosen": -1.7186450958251953,
"logits/rejected": -1.0815023183822632,
"logps/chosen": -402.4718322753906,
"logps/rejected": -446.8160095214844,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.929797172546387,
"rewards/margins": 20.37470245361328,
"rewards/rejected": -25.304500579833984,
"step": 2320
},
{
"epoch": 7.2501936483346245,
"grad_norm": 0.00030283021624200046,
"learning_rate": 0.00011342332658176555,
"logits/chosen": -1.7267248630523682,
"logits/rejected": -1.1029185056686401,
"logps/chosen": -407.1277160644531,
"logps/rejected": -443.208251953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.898409843444824,
"rewards/margins": 19.7962589263916,
"rewards/rejected": -24.69466781616211,
"step": 2340
},
{
"epoch": 7.3121611154144075,
"grad_norm": 0.000179938884684816,
"learning_rate": 0.00011221521661813197,
"logits/chosen": -1.7125059366226196,
"logits/rejected": -1.107881784439087,
"logps/chosen": -411.54571533203125,
"logps/rejected": -468.47821044921875,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.634856700897217,
"rewards/margins": 20.49616050720215,
"rewards/rejected": -26.131017684936523,
"step": 2360
},
{
"epoch": 7.3741285824941905,
"grad_norm": 0.00018190982518717647,
"learning_rate": 0.0001109415670719721,
"logits/chosen": -1.6849457025527954,
"logits/rejected": -1.0680724382400513,
"logps/chosen": -408.02587890625,
"logps/rejected": -460.41015625,
"loss": 0.0033,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -4.917786598205566,
"rewards/margins": 20.782718658447266,
"rewards/rejected": -25.700504302978516,
"step": 2380
},
{
"epoch": 7.436096049573973,
"grad_norm": 0.00010547572310315445,
"learning_rate": 0.00010966611848443176,
"logits/chosen": -1.6835496425628662,
"logits/rejected": -1.0897111892700195,
"logps/chosen": -407.20318603515625,
"logps/rejected": -464.83935546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.396719932556152,
"rewards/margins": 20.730510711669922,
"rewards/rejected": -26.127233505249023,
"step": 2400
},
{
"epoch": 7.498063516653756,
"grad_norm": 0.0002746889949776232,
"learning_rate": 0.00010838908056813919,
"logits/chosen": -1.7222875356674194,
"logits/rejected": -1.0569690465927124,
"logps/chosen": -397.06500244140625,
"logps/rejected": -429.73663330078125,
"loss": 0.0033,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -4.969448566436768,
"rewards/margins": 20.237773895263672,
"rewards/rejected": -25.20722007751465,
"step": 2420
},
{
"epoch": 7.56003098373354,
"grad_norm": 0.0010378537699580193,
"learning_rate": 0.00010711066329704423,
"logits/chosen": -1.7328182458877563,
"logits/rejected": -1.0489845275878906,
"logps/chosen": -410.6394958496094,
"logps/rejected": -457.23126220703125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.752233505249023,
"rewards/margins": 20.957183837890625,
"rewards/rejected": -25.70941734313965,
"step": 2440
},
{
"epoch": 7.621998450813323,
"grad_norm": 0.00035315402783453465,
"learning_rate": 0.00010583107687189388,
"logits/chosen": -1.7303959131240845,
"logits/rejected": -1.0627490282058716,
"logps/chosen": -394.2586364746094,
"logps/rejected": -438.1336975097656,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.017716407775879,
"rewards/margins": 20.087886810302734,
"rewards/rejected": -25.105602264404297,
"step": 2460
},
{
"epoch": 7.683965917893106,
"grad_norm": 5.2913201216142625e-05,
"learning_rate": 0.00010455053168567064,
"logits/chosen": -1.701934814453125,
"logits/rejected": -1.0837266445159912,
"logps/chosen": -411.44390869140625,
"logps/rejected": -451.9497985839844,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.648865699768066,
"rewards/margins": 20.401885986328125,
"rewards/rejected": -26.050750732421875,
"step": 2480
},
{
"epoch": 7.745933384972889,
"grad_norm": 0.0004144099075347185,
"learning_rate": 0.00010326923828899894,
"logits/chosen": -1.66423761844635,
"logits/rejected": -1.0931271314620972,
"logps/chosen": -413.04266357421875,
"logps/rejected": -468.1424255371094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.445749282836914,
"rewards/margins": 20.35373306274414,
"rewards/rejected": -25.799480438232422,
"step": 2500
},
{
"epoch": 7.807900852052672,
"grad_norm": 0.0005614625406451523,
"learning_rate": 0.00010198740735552596,
"logits/chosen": -1.7007503509521484,
"logits/rejected": -1.0203969478607178,
"logps/chosen": -409.26434326171875,
"logps/rejected": -450.35284423828125,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.619626522064209,
"rewards/margins": 20.54979133605957,
"rewards/rejected": -26.169414520263672,
"step": 2520
},
{
"epoch": 7.869868319132456,
"grad_norm": 0.00046529798419214785,
"learning_rate": 0.00010070524964728218,
"logits/chosen": -1.6950366497039795,
"logits/rejected": -1.0599762201309204,
"logps/chosen": -388.9576416015625,
"logps/rejected": -438.4559020996094,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.69763708114624,
"rewards/margins": 19.549518585205078,
"rewards/rejected": -25.247156143188477,
"step": 2540
},
{
"epoch": 7.931835786212239,
"grad_norm": 0.0005010979948565364,
"learning_rate": 9.942297598002714e-05,
"logits/chosen": -1.6910135746002197,
"logits/rejected": -1.088746190071106,
"logps/chosen": -409.673583984375,
"logps/rejected": -460.9344177246094,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.090248107910156,
"rewards/margins": 20.458660125732422,
"rewards/rejected": -25.548908233642578,
"step": 2560
},
{
"epoch": 7.993803253292022,
"grad_norm": 2.1018489860580303e-05,
"learning_rate": 9.814079718858677e-05,
"logits/chosen": -1.6951793432235718,
"logits/rejected": -1.1038161516189575,
"logps/chosen": -427.29669189453125,
"logps/rejected": -482.02362060546875,
"loss": 0.0065,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.359341621398926,
"rewards/margins": 20.788881301879883,
"rewards/rejected": -26.148223876953125,
"step": 2580
},
{
"epoch": 8.055770720371806,
"grad_norm": 0.00020114157814532518,
"learning_rate": 9.685892409218717e-05,
"logits/chosen": -1.702978491783142,
"logits/rejected": -1.0864311456680298,
"logps/chosen": -405.50567626953125,
"logps/rejected": -455.3516540527344,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.847678184509277,
"rewards/margins": 20.718107223510742,
"rewards/rejected": -25.565786361694336,
"step": 2600
},
{
"epoch": 8.117738187451588,
"grad_norm": 0.00014650092634838074,
"learning_rate": 9.557756745979138e-05,
"logits/chosen": -1.692112922668457,
"logits/rejected": -1.106385588645935,
"logps/chosen": -400.7706298828125,
"logps/rejected": -458.6825256347656,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.732221603393555,
"rewards/margins": 21.058570861816406,
"rewards/rejected": -25.79079246520996,
"step": 2620
},
{
"epoch": 8.179705654531372,
"grad_norm": 0.0003632131847552955,
"learning_rate": 9.429693797544388e-05,
"logits/chosen": -1.727189302444458,
"logits/rejected": -1.0760419368743896,
"logps/chosen": -401.86767578125,
"logps/rejected": -446.3102111816406,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.723801612854004,
"rewards/margins": 20.717304229736328,
"rewards/rejected": -25.441104888916016,
"step": 2640
},
{
"epoch": 8.241673121611154,
"grad_norm": 0.00047560204984620214,
"learning_rate": 9.301724620362973e-05,
"logits/chosen": -1.7449928522109985,
"logits/rejected": -1.0541192293167114,
"logps/chosen": -409.01959228515625,
"logps/rejected": -449.57666015625,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.505074501037598,
"rewards/margins": 20.396114349365234,
"rewards/rejected": -25.901187896728516,
"step": 2660
},
{
"epoch": 8.303640588690937,
"grad_norm": 0.0010067891562357545,
"learning_rate": 9.173870255465275e-05,
"logits/chosen": -1.7413511276245117,
"logits/rejected": -1.073628544807434,
"logps/chosen": -413.9063415527344,
"logps/rejected": -457.25042724609375,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.8417158126831055,
"rewards/margins": 20.952346801757812,
"rewards/rejected": -25.7940616607666,
"step": 2680
},
{
"epoch": 8.36560805577072,
"grad_norm": 0.0007608987507410347,
"learning_rate": 9.046151725003931e-05,
"logits/chosen": -1.738470435142517,
"logits/rejected": -1.118428111076355,
"logps/chosen": -406.96368408203125,
"logps/rejected": -458.2310485839844,
"loss": 0.0033,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.3068695068359375,
"rewards/margins": 20.518783569335938,
"rewards/rejected": -25.825653076171875,
"step": 2700
},
{
"epoch": 8.427575522850503,
"grad_norm": 0.00037170801078900695,
"learning_rate": 8.918590028797327e-05,
"logits/chosen": -1.6667039394378662,
"logits/rejected": -1.076485276222229,
"logps/chosen": -417.1942443847656,
"logps/rejected": -475.34478759765625,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.047384262084961,
"rewards/margins": 21.4394588470459,
"rewards/rejected": -26.48684310913086,
"step": 2720
},
{
"epoch": 8.489542989930287,
"grad_norm": 0.00017155329987872392,
"learning_rate": 8.791206140876746e-05,
"logits/chosen": -1.6952327489852905,
"logits/rejected": -1.0440196990966797,
"logps/chosen": -390.47991943359375,
"logps/rejected": -446.51611328125,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.408968448638916,
"rewards/margins": 20.748926162719727,
"rewards/rejected": -25.157894134521484,
"step": 2740
},
{
"epoch": 8.55151045701007,
"grad_norm": 4.225455268169753e-05,
"learning_rate": 8.664021006037762e-05,
"logits/chosen": -1.7128692865371704,
"logits/rejected": -1.0821470022201538,
"logps/chosen": -424.44549560546875,
"logps/rejected": -469.12652587890625,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.529724597930908,
"rewards/margins": 20.326000213623047,
"rewards/rejected": -25.855722427368164,
"step": 2760
},
{
"epoch": 8.613477924089853,
"grad_norm": 0.0004146189312450588,
"learning_rate": 8.537055536396439e-05,
"logits/chosen": -1.7189327478408813,
"logits/rejected": -1.1234623193740845,
"logps/chosen": -413.88092041015625,
"logps/rejected": -489.74432373046875,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.031737327575684,
"rewards/margins": 20.76127815246582,
"rewards/rejected": -26.793010711669922,
"step": 2780
},
{
"epoch": 8.675445391169635,
"grad_norm": 0.0011191857047379017,
"learning_rate": 8.410330607950913e-05,
"logits/chosen": -1.6889803409576416,
"logits/rejected": -1.0510902404785156,
"logps/chosen": -409.9695739746094,
"logps/rejected": -461.45257568359375,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.462882041931152,
"rewards/margins": 20.715688705444336,
"rewards/rejected": -26.178569793701172,
"step": 2800
},
{
"epoch": 8.737412858249419,
"grad_norm": 0.0015039819991216063,
"learning_rate": 8.283867057148902e-05,
"logits/chosen": -1.6871960163116455,
"logits/rejected": -1.1272326707839966,
"logps/chosen": -424.3963928222656,
"logps/rejected": -478.30535888671875,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.292850971221924,
"rewards/margins": 20.825016021728516,
"rewards/rejected": -26.117868423461914,
"step": 2820
},
{
"epoch": 8.799380325329203,
"grad_norm": 0.00024371009203605354,
"learning_rate": 8.157685677461708e-05,
"logits/chosen": -1.7314860820770264,
"logits/rejected": -1.0632710456848145,
"logps/chosen": -411.5020446777344,
"logps/rejected": -450.3389587402344,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.654230117797852,
"rewards/margins": 21.339710235595703,
"rewards/rejected": -25.993938446044922,
"step": 2840
},
{
"epoch": 8.861347792408985,
"grad_norm": 0.0004402414197102189,
"learning_rate": 8.031807215965337e-05,
"logits/chosen": -1.7364399433135986,
"logits/rejected": -1.0983723402023315,
"logps/chosen": -417.08746337890625,
"logps/rejected": -472.83984375,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.4446940422058105,
"rewards/margins": 21.18663215637207,
"rewards/rejected": -26.63132667541504,
"step": 2860
},
{
"epoch": 8.923315259488769,
"grad_norm": 0.00047181983245536685,
"learning_rate": 7.906252369929154e-05,
"logits/chosen": -1.6905673742294312,
"logits/rejected": -1.084665060043335,
"logps/chosen": -393.9977111816406,
"logps/rejected": -455.0557556152344,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.183560371398926,
"rewards/margins": 20.739307403564453,
"rewards/rejected": -25.922870635986328,
"step": 2880
},
{
"epoch": 8.98528272656855,
"grad_norm": 0.0003129359392914921,
"learning_rate": 7.781041783412845e-05,
"logits/chosen": -1.6950937509536743,
"logits/rejected": -1.0535084009170532,
"logps/chosen": -418.62701416015625,
"logps/rejected": -476.28387451171875,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.857310771942139,
"rewards/margins": 21.914113998413086,
"rewards/rejected": -26.771427154541016,
"step": 2900
},
{
"epoch": 9.047250193648335,
"grad_norm": 0.0004019307089038193,
"learning_rate": 7.656196043872012e-05,
"logits/chosen": -1.7096707820892334,
"logits/rejected": -1.1031239032745361,
"logps/chosen": -416.05206298828125,
"logps/rejected": -494.614990234375,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.534869194030762,
"rewards/margins": 21.93942642211914,
"rewards/rejected": -27.474294662475586,
"step": 2920
},
{
"epoch": 9.109217660728119,
"grad_norm": 0.0007387935766018927,
"learning_rate": 7.531735678773171e-05,
"logits/chosen": -1.7090095281600952,
"logits/rejected": -1.0878323316574097,
"logps/chosen": -400.01513671875,
"logps/rejected": -477.05535888671875,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.275289058685303,
"rewards/margins": 21.69790267944336,
"rewards/rejected": -26.973194122314453,
"step": 2940
},
{
"epoch": 9.1711851278079,
"grad_norm": 0.00027141955797560513,
"learning_rate": 7.407681152218535e-05,
"logits/chosen": -1.6808192729949951,
"logits/rejected": -1.0295798778533936,
"logps/chosen": -404.32513427734375,
"logps/rejected": -460.8975524902344,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.207651615142822,
"rewards/margins": 20.58077049255371,
"rewards/rejected": -25.788421630859375,
"step": 2960
},
{
"epoch": 9.233152594887684,
"grad_norm": 0.0005088089383207262,
"learning_rate": 7.284052861581288e-05,
"logits/chosen": -1.7368125915527344,
"logits/rejected": -1.0655357837677002,
"logps/chosen": -410.697021484375,
"logps/rejected": -453.0840759277344,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.395773410797119,
"rewards/margins": 20.73539924621582,
"rewards/rejected": -26.13117027282715,
"step": 2980
},
{
"epoch": 9.295120061967467,
"grad_norm": 0.0002143807359971106,
"learning_rate": 7.160871134151775e-05,
"logits/chosen": -1.6661646366119385,
"logits/rejected": -1.092222809791565,
"logps/chosen": -405.39154052734375,
"logps/rejected": -485.67578125,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.456831455230713,
"rewards/margins": 21.252620697021484,
"rewards/rejected": -26.70945167541504,
"step": 3000
},
{
"epoch": 9.35708752904725,
"grad_norm": 8.41324872453697e-05,
"learning_rate": 7.038156223795224e-05,
"logits/chosen": -1.7362842559814453,
"logits/rejected": -1.082162857055664,
"logps/chosen": -410.0975646972656,
"logps/rejected": -466.8894958496094,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.814949989318848,
"rewards/margins": 21.61594009399414,
"rewards/rejected": -26.430889129638672,
"step": 3020
},
{
"epoch": 9.419054996127032,
"grad_norm": 2.4985982236103155e-05,
"learning_rate": 6.915928307621584e-05,
"logits/chosen": -1.7000200748443604,
"logits/rejected": -1.0128730535507202,
"logps/chosen": -417.96405029296875,
"logps/rejected": -461.15362548828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.359194278717041,
"rewards/margins": 21.4404296875,
"rewards/rejected": -25.79962158203125,
"step": 3040
},
{
"epoch": 9.481022463206816,
"grad_norm": 0.0002187406353186816,
"learning_rate": 6.794207482667918e-05,
"logits/chosen": -1.6875083446502686,
"logits/rejected": -1.0425808429718018,
"logps/chosen": -409.68170166015625,
"logps/rejected": -456.98114013671875,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.01973295211792,
"rewards/margins": 20.8963623046875,
"rewards/rejected": -25.916095733642578,
"step": 3060
},
{
"epoch": 9.5429899302866,
"grad_norm": 0.0001037058827932924,
"learning_rate": 6.673013762594022e-05,
"logits/chosen": -1.6812347173690796,
"logits/rejected": -1.0920425653457642,
"logps/chosen": -409.3445129394531,
"logps/rejected": -463.01702880859375,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.606844425201416,
"rewards/margins": 20.97027015686035,
"rewards/rejected": -26.57711410522461,
"step": 3080
},
{
"epoch": 9.604957397366382,
"grad_norm": 6.546611984958872e-05,
"learning_rate": 6.552367074391708e-05,
"logits/chosen": -1.6708405017852783,
"logits/rejected": -1.0272510051727295,
"logps/chosen": -421.3130798339844,
"logps/rejected": -468.8424377441406,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.817858695983887,
"rewards/margins": 21.14541244506836,
"rewards/rejected": -26.963272094726562,
"step": 3100
},
{
"epoch": 9.666924864446166,
"grad_norm": 0.0009899769211187959,
"learning_rate": 6.432287255108363e-05,
"logits/chosen": -1.7139580249786377,
"logits/rejected": -1.0682191848754883,
"logps/chosen": -415.08154296875,
"logps/rejected": -463.1947326660156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.88477087020874,
"rewards/margins": 20.44330596923828,
"rewards/rejected": -26.328075408935547,
"step": 3120
},
{
"epoch": 9.728892331525948,
"grad_norm": 0.0010677826358005404,
"learning_rate": 6.312794048585286e-05,
"logits/chosen": -1.6608006954193115,
"logits/rejected": -1.0799270868301392,
"logps/chosen": -393.5787353515625,
"logps/rejected": -458.1851501464844,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.194777488708496,
"rewards/margins": 20.60002899169922,
"rewards/rejected": -25.7948055267334,
"step": 3140
},
{
"epoch": 9.790859798605732,
"grad_norm": 0.00037055814755149186,
"learning_rate": 6.193907102211358e-05,
"logits/chosen": -1.700254201889038,
"logits/rejected": -1.149086594581604,
"logps/chosen": -414.83575439453125,
"logps/rejected": -480.109375,
"loss": 0.0054,
"rewards/accuracies": 0.984375,
"rewards/chosen": -6.013056755065918,
"rewards/margins": 20.352540969848633,
"rewards/rejected": -26.3655948638916,
"step": 3160
},
{
"epoch": 9.852827265685516,
"grad_norm": 0.00012906199845019728,
"learning_rate": 6.075645963692567e-05,
"logits/chosen": -1.6764156818389893,
"logits/rejected": -1.0942738056182861,
"logps/chosen": -410.2710876464844,
"logps/rejected": -480.7608337402344,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.723294734954834,
"rewards/margins": 21.212993621826172,
"rewards/rejected": -26.936288833618164,
"step": 3180
},
{
"epoch": 9.914794732765298,
"grad_norm": 9.71817207755521e-05,
"learning_rate": 5.9580300778379087e-05,
"logits/chosen": -1.6972318887710571,
"logits/rejected": -1.06034255027771,
"logps/chosen": -414.45697021484375,
"logps/rejected": -478.67608642578125,
"loss": 0.0054,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -4.851905822753906,
"rewards/margins": 22.140657424926758,
"rewards/rejected": -26.992563247680664,
"step": 3200
},
{
"epoch": 9.976762199845082,
"grad_norm": 0.0005355001194402575,
"learning_rate": 5.8410787833622414e-05,
"logits/chosen": -1.701051950454712,
"logits/rejected": -1.0390212535858154,
"logps/chosen": -392.62689208984375,
"logps/rejected": -438.70660400390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.652411937713623,
"rewards/margins": 21.09701156616211,
"rewards/rejected": -25.749425888061523,
"step": 3220
},
{
"epoch": 10.038729666924864,
"grad_norm": 0.0007227555033750832,
"learning_rate": 5.724811309706547e-05,
"logits/chosen": -1.7204704284667969,
"logits/rejected": -1.0700039863586426,
"logps/chosen": -430.43206787109375,
"logps/rejected": -488.071044921875,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.367037296295166,
"rewards/margins": 21.72504425048828,
"rewards/rejected": -27.092077255249023,
"step": 3240
},
{
"epoch": 10.100697134004648,
"grad_norm": 0.00017314284923486412,
"learning_rate": 5.6092467738761776e-05,
"logits/chosen": -1.6834897994995117,
"logits/rejected": -1.0887248516082764,
"logps/chosen": -416.51348876953125,
"logps/rejected": -469.4505920410156,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.5038862228393555,
"rewards/margins": 21.196359634399414,
"rewards/rejected": -26.700244903564453,
"step": 3260
},
{
"epoch": 10.162664601084431,
"grad_norm": 0.00027020045672543347,
"learning_rate": 5.494404177297595e-05,
"logits/chosen": -1.696730613708496,
"logits/rejected": -1.0611952543258667,
"logps/chosen": -399.0355529785156,
"logps/rejected": -449.93646240234375,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.055383682250977,
"rewards/margins": 20.96977996826172,
"rewards/rejected": -26.025165557861328,
"step": 3280
},
{
"epoch": 10.224632068164214,
"grad_norm": 0.0003596362948883325,
"learning_rate": 5.380302402694104e-05,
"logits/chosen": -1.7198495864868164,
"logits/rejected": -1.0654425621032715,
"logps/chosen": -390.9352722167969,
"logps/rejected": -453.2206115722656,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.068055629730225,
"rewards/margins": 20.917200088500977,
"rewards/rejected": -25.98525619506836,
"step": 3300
},
{
"epoch": 10.286599535243997,
"grad_norm": 2.4758495783316903e-05,
"learning_rate": 5.266960210981089e-05,
"logits/chosen": -1.664912462234497,
"logits/rejected": -1.0661206245422363,
"logps/chosen": -402.9308166503906,
"logps/rejected": -467.4169921875,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.694643020629883,
"rewards/margins": 21.313457489013672,
"rewards/rejected": -27.008098602294922,
"step": 3320
},
{
"epoch": 10.34856700232378,
"grad_norm": 0.00036736109177581966,
"learning_rate": 5.15439623818132e-05,
"logits/chosen": -1.7021472454071045,
"logits/rejected": -1.1036940813064575,
"logps/chosen": -395.59149169921875,
"logps/rejected": -463.43316650390625,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.990979194641113,
"rewards/margins": 20.853925704956055,
"rewards/rejected": -26.84490394592285,
"step": 3340
},
{
"epoch": 10.410534469403563,
"grad_norm": 0.00021753676992375404,
"learning_rate": 5.042628992360755e-05,
"logits/chosen": -1.6948877573013306,
"logits/rejected": -1.0948389768600464,
"logps/chosen": -417.33160400390625,
"logps/rejected": -491.01483154296875,
"loss": 0.0033,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -5.629961967468262,
"rewards/margins": 21.473012924194336,
"rewards/rejected": -27.102975845336914,
"step": 3360
},
{
"epoch": 10.472501936483347,
"grad_norm": 0.0005015567876398563,
"learning_rate": 4.9316768505853864e-05,
"logits/chosen": -1.7080516815185547,
"logits/rejected": -1.0318862199783325,
"logps/chosen": -397.1073913574219,
"logps/rejected": -439.6314392089844,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.5096540451049805,
"rewards/margins": 20.36575698852539,
"rewards/rejected": -25.875408172607422,
"step": 3380
},
{
"epoch": 10.53446940356313,
"grad_norm": 0.000426275102654472,
"learning_rate": 4.8215580558996546e-05,
"logits/chosen": -1.6764377355575562,
"logits/rejected": -1.0771383047103882,
"logps/chosen": -404.91937255859375,
"logps/rejected": -485.12548828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.389082908630371,
"rewards/margins": 21.155742645263672,
"rewards/rejected": -26.54482650756836,
"step": 3400
},
{
"epoch": 10.596436870642913,
"grad_norm": 0.00011274849384790286,
"learning_rate": 4.7122907143268645e-05,
"logits/chosen": -1.7037220001220703,
"logits/rejected": -1.0873366594314575,
"logps/chosen": -417.3395080566406,
"logps/rejected": -485.4212951660156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.40346622467041,
"rewards/margins": 21.43330955505371,
"rewards/rejected": -26.836772918701172,
"step": 3420
},
{
"epoch": 10.658404337722695,
"grad_norm": 0.0008545616874471307,
"learning_rate": 4.603892791892157e-05,
"logits/chosen": -1.7251865863800049,
"logits/rejected": -1.1108168363571167,
"logps/chosen": -409.8521423339844,
"logps/rejected": -483.19329833984375,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.274342060089111,
"rewards/margins": 22.360143661499023,
"rewards/rejected": -26.634485244750977,
"step": 3440
},
{
"epoch": 10.720371804802479,
"grad_norm": 0.0002442661498207599,
"learning_rate": 4.4963821116684645e-05,
"logits/chosen": -1.7168834209442139,
"logits/rejected": -1.0469696521759033,
"logps/chosen": -410.9766540527344,
"logps/rejected": -462.96759033203125,
"loss": 0.0043,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.01826286315918,
"rewards/margins": 21.594696044921875,
"rewards/rejected": -26.612957000732422,
"step": 3460
},
{
"epoch": 10.782339271882261,
"grad_norm": 2.5067949536605738e-05,
"learning_rate": 4.3897763508460235e-05,
"logits/chosen": -1.6555604934692383,
"logits/rejected": -1.067326307296753,
"logps/chosen": -411.1241149902344,
"logps/rejected": -471.122314453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.384194374084473,
"rewards/margins": 20.667926788330078,
"rewards/rejected": -26.052120208740234,
"step": 3480
},
{
"epoch": 10.844306738962045,
"grad_norm": 9.07514404389076e-05,
"learning_rate": 4.284093037825829e-05,
"logits/chosen": -1.7002710103988647,
"logits/rejected": -1.0244972705841064,
"logps/chosen": -396.713623046875,
"logps/rejected": -450.4693298339844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.3324480056762695,
"rewards/margins": 20.980426788330078,
"rewards/rejected": -26.312875747680664,
"step": 3500
},
{
"epoch": 10.906274206041829,
"grad_norm": 0.0001592998596606776,
"learning_rate": 4.179349549337557e-05,
"logits/chosen": -1.704119086265564,
"logits/rejected": -1.0116019248962402,
"logps/chosen": -402.82666015625,
"logps/rejected": -443.30157470703125,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.895948886871338,
"rewards/margins": 21.18239402770996,
"rewards/rejected": -26.07834243774414,
"step": 3520
},
{
"epoch": 10.96824167312161,
"grad_norm": 1.9538027117960155e-05,
"learning_rate": 4.075563107582472e-05,
"logits/chosen": -1.668092966079712,
"logits/rejected": -1.065983533859253,
"logps/chosen": -398.3217468261719,
"logps/rejected": -477.6726989746094,
"loss": 0.0054,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -5.416517734527588,
"rewards/margins": 21.412036895751953,
"rewards/rejected": -26.82855224609375,
"step": 3540
},
{
"epoch": 11.030209140201395,
"grad_norm": 5.915413566981442e-05,
"learning_rate": 3.9727507774016635e-05,
"logits/chosen": -1.6671562194824219,
"logits/rejected": -1.0572084188461304,
"logps/chosen": -400.4344177246094,
"logps/rejected": -474.96038818359375,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.950907230377197,
"rewards/margins": 20.902238845825195,
"rewards/rejected": -26.8531494140625,
"step": 3560
},
{
"epoch": 11.092176607281177,
"grad_norm": 0.0006108521483838558,
"learning_rate": 3.8709294634702376e-05,
"logits/chosen": -1.7030471563339233,
"logits/rejected": -1.0317370891571045,
"logps/chosen": -398.74090576171875,
"logps/rejected": -459.75,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.71872615814209,
"rewards/margins": 22.286239624023438,
"rewards/rejected": -27.00496482849121,
"step": 3580
},
{
"epoch": 11.15414407436096,
"grad_norm": 0.000467544246930629,
"learning_rate": 3.770115907517773e-05,
"logits/chosen": -1.6686887741088867,
"logits/rejected": -1.0782063007354736,
"logps/chosen": -406.98138427734375,
"logps/rejected": -482.86572265625,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.86759090423584,
"rewards/margins": 21.316923141479492,
"rewards/rejected": -27.184513092041016,
"step": 3600
},
{
"epoch": 11.216111541440744,
"grad_norm": 0.0004900813801214099,
"learning_rate": 3.670326685575632e-05,
"logits/chosen": -1.7124903202056885,
"logits/rejected": -1.0398648977279663,
"logps/chosen": -415.08648681640625,
"logps/rejected": -477.70709228515625,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.952596187591553,
"rewards/margins": 22.07376480102539,
"rewards/rejected": -27.026357650756836,
"step": 3620
},
{
"epoch": 11.278079008520526,
"grad_norm": 0.0002428332227282226,
"learning_rate": 3.571578205251459e-05,
"logits/chosen": -1.7211148738861084,
"logits/rejected": -1.1097770929336548,
"logps/chosen": -406.6622009277344,
"logps/rejected": -460.78643798828125,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.563107490539551,
"rewards/margins": 21.05852699279785,
"rewards/rejected": -26.621633529663086,
"step": 3640
},
{
"epoch": 11.34004647560031,
"grad_norm": 0.0004079696664121002,
"learning_rate": 3.4738867030314235e-05,
"logits/chosen": -1.7017863988876343,
"logits/rejected": -1.0735719203948975,
"logps/chosen": -414.16339111328125,
"logps/rejected": -490.61944580078125,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.069756507873535,
"rewards/margins": 22.46738052368164,
"rewards/rejected": -27.53713607788086,
"step": 3660
},
{
"epoch": 11.402013942680092,
"grad_norm": 0.0001673255901550874,
"learning_rate": 3.377268241610555e-05,
"logits/chosen": -1.692521095275879,
"logits/rejected": -1.0149263143539429,
"logps/chosen": -412.38507080078125,
"logps/rejected": -467.0577697753906,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.867552280426025,
"rewards/margins": 20.83139991760254,
"rewards/rejected": -26.698949813842773,
"step": 3680
},
{
"epoch": 11.463981409759876,
"grad_norm": 0.00012532217078842223,
"learning_rate": 3.2817387072516726e-05,
"logits/chosen": -1.7133913040161133,
"logits/rejected": -1.1119440793991089,
"logps/chosen": -401.7035217285156,
"logps/rejected": -476.5845642089844,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.1463212966918945,
"rewards/margins": 22.046228408813477,
"rewards/rejected": -27.192550659179688,
"step": 3700
},
{
"epoch": 11.52594887683966,
"grad_norm": 0.0002491988998372108,
"learning_rate": 3.18731380717334e-05,
"logits/chosen": -1.6776504516601562,
"logits/rejected": -1.0443401336669922,
"logps/chosen": -402.75933837890625,
"logps/rejected": -455.70068359375,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.209097385406494,
"rewards/margins": 21.239925384521484,
"rewards/rejected": -26.449024200439453,
"step": 3720
},
{
"epoch": 11.587916343919442,
"grad_norm": 0.0005044552381150424,
"learning_rate": 3.0940090669672215e-05,
"logits/chosen": -1.6772470474243164,
"logits/rejected": -1.0744705200195312,
"logps/chosen": -400.09912109375,
"logps/rejected": -477.5372619628906,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.949058532714844,
"rewards/margins": 21.821866989135742,
"rewards/rejected": -26.770925521850586,
"step": 3740
},
{
"epoch": 11.649883810999226,
"grad_norm": 4.5204073103377596e-05,
"learning_rate": 3.001839828045342e-05,
"logits/chosen": -1.7325446605682373,
"logits/rejected": -1.063987135887146,
"logps/chosen": -415.75592041015625,
"logps/rejected": -452.0940856933594,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.492778778076172,
"rewards/margins": 20.81328582763672,
"rewards/rejected": -26.30606460571289,
"step": 3760
},
{
"epoch": 11.711851278079008,
"grad_norm": 0.0002700432378333062,
"learning_rate": 2.9108212451176033e-05,
"logits/chosen": -1.7305303812026978,
"logits/rejected": -1.083184003829956,
"logps/chosen": -400.70635986328125,
"logps/rejected": -472.36114501953125,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.0615034103393555,
"rewards/margins": 22.031635284423828,
"rewards/rejected": -27.093135833740234,
"step": 3780
},
{
"epoch": 11.773818745158792,
"grad_norm": 0.00013194057100918144,
"learning_rate": 2.8209682837000072e-05,
"logits/chosen": -1.6789268255233765,
"logits/rejected": -1.0528620481491089,
"logps/chosen": -403.6865539550781,
"logps/rejected": -479.7601623535156,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.470952033996582,
"rewards/margins": 21.67144775390625,
"rewards/rejected": -27.14239501953125,
"step": 3800
},
{
"epoch": 11.835786212238574,
"grad_norm": 0.0002364068350289017,
"learning_rate": 2.7322957176539777e-05,
"logits/chosen": -1.6753734350204468,
"logits/rejected": -1.0195820331573486,
"logps/chosen": -417.6498107910156,
"logps/rejected": -472.09844970703125,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.569521903991699,
"rewards/margins": 20.978273391723633,
"rewards/rejected": -26.54779624938965,
"step": 3820
},
{
"epoch": 11.897753679318358,
"grad_norm": 0.00013174403284210712,
"learning_rate": 2.6448181267572226e-05,
"logits/chosen": -1.6455790996551514,
"logits/rejected": -1.046744465827942,
"logps/chosen": -410.19134521484375,
"logps/rejected": -483.3438415527344,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.728828430175781,
"rewards/margins": 21.940776824951172,
"rewards/rejected": -27.669601440429688,
"step": 3840
},
{
"epoch": 11.959721146398142,
"grad_norm": 0.00042892919736914337,
"learning_rate": 2.5585498943064724e-05,
"logits/chosen": -1.6926710605621338,
"logits/rejected": -1.0491944551467896,
"logps/chosen": -415.20550537109375,
"logps/rejected": -482.228271484375,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.751172065734863,
"rewards/margins": 21.466909408569336,
"rewards/rejected": -27.21807861328125,
"step": 3860
},
{
"epoch": 12.021688613477924,
"grad_norm": 8.727656677365303e-05,
"learning_rate": 2.4735052047525398e-05,
"logits/chosen": -1.7163196802139282,
"logits/rejected": -1.059697151184082,
"logps/chosen": -422.93359375,
"logps/rejected": -472.23583984375,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.125914573669434,
"rewards/margins": 21.549646377563477,
"rewards/rejected": -26.675561904907227,
"step": 3880
},
{
"epoch": 12.083656080557708,
"grad_norm": 5.139048516866751e-05,
"learning_rate": 2.389698041368089e-05,
"logits/chosen": -1.682549238204956,
"logits/rejected": -1.0410518646240234,
"logps/chosen": -419.48529052734375,
"logps/rejected": -488.83154296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.320895195007324,
"rewards/margins": 22.32204246520996,
"rewards/rejected": -27.6429386138916,
"step": 3900
},
{
"epoch": 12.14562354763749,
"grad_norm": 0.00013814242265652865,
"learning_rate": 2.3071421839484554e-05,
"logits/chosen": -1.6900997161865234,
"logits/rejected": -1.0404036045074463,
"logps/chosen": -399.94854736328125,
"logps/rejected": -466.58642578125,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.632657051086426,
"rewards/margins": 21.346328735351562,
"rewards/rejected": -26.978984832763672,
"step": 3920
},
{
"epoch": 12.207591014717273,
"grad_norm": 0.0001951899757841602,
"learning_rate": 2.2258512065459448e-05,
"logits/chosen": -1.6699708700180054,
"logits/rejected": -1.058363437652588,
"logps/chosen": -421.36419677734375,
"logps/rejected": -490.47100830078125,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.72733211517334,
"rewards/margins": 21.7630672454834,
"rewards/rejected": -27.490398406982422,
"step": 3940
},
{
"epoch": 12.269558481797057,
"grad_norm": 0.001167879207059741,
"learning_rate": 2.1458384752379357e-05,
"logits/chosen": -1.6963287591934204,
"logits/rejected": -1.078595757484436,
"logps/chosen": -400.4660339355469,
"logps/rejected": -470.71710205078125,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.399907112121582,
"rewards/margins": 21.62917709350586,
"rewards/rejected": -27.02908706665039,
"step": 3960
},
{
"epoch": 12.33152594887684,
"grad_norm": 9.643881639931351e-06,
"learning_rate": 2.067117145929216e-05,
"logits/chosen": -1.688515305519104,
"logits/rejected": -1.08303964138031,
"logps/chosen": -402.33795166015625,
"logps/rejected": -477.7525329589844,
"loss": 0.0033,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -4.999421119689941,
"rewards/margins": 22.334285736083984,
"rewards/rejected": -27.333709716796875,
"step": 3980
},
{
"epoch": 12.393493415956623,
"grad_norm": 0.0006664241082035005,
"learning_rate": 1.9897001621888434e-05,
"logits/chosen": -1.7171924114227295,
"logits/rejected": -1.0485467910766602,
"logps/chosen": -409.967529296875,
"logps/rejected": -477.21551513671875,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.805159568786621,
"rewards/margins": 22.3187198638916,
"rewards/rejected": -27.123876571655273,
"step": 4000
},
{
"epoch": 12.455460883036405,
"grad_norm": 5.3627591114491224e-06,
"learning_rate": 1.913600253121919e-05,
"logits/chosen": -1.677496314048767,
"logits/rejected": -1.0768311023712158,
"logps/chosen": -421.8292541503906,
"logps/rejected": -494.90606689453125,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.249929904937744,
"rewards/margins": 21.906986236572266,
"rewards/rejected": -27.15691566467285,
"step": 4020
},
{
"epoch": 12.51742835011619,
"grad_norm": 3.554378781700507e-05,
"learning_rate": 1.838829931276653e-05,
"logits/chosen": -1.6907306909561157,
"logits/rejected": -1.0432696342468262,
"logps/chosen": -398.9062805175781,
"logps/rejected": -465.7071228027344,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -4.694939136505127,
"rewards/margins": 22.108684539794922,
"rewards/rejected": -26.80362319946289,
"step": 4040
},
{
"epoch": 12.579395817195973,
"grad_norm": 6.133209535619244e-05,
"learning_rate": 1.7654014905870098e-05,
"logits/chosen": -1.6698366403579712,
"logits/rejected": -1.0069531202316284,
"logps/chosen": -417.49237060546875,
"logps/rejected": -470.18902587890625,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.820713520050049,
"rewards/margins": 21.33327865600586,
"rewards/rejected": -27.15399169921875,
"step": 4060
},
{
"epoch": 12.641363284275755,
"grad_norm": 0.00020697916625067592,
"learning_rate": 1.6933270043513083e-05,
"logits/chosen": -1.677680253982544,
"logits/rejected": -1.0464431047439575,
"logps/chosen": -408.2115478515625,
"logps/rejected": -478.3711853027344,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.883364200592041,
"rewards/margins": 21.521183013916016,
"rewards/rejected": -27.404544830322266,
"step": 4080
},
{
"epoch": 12.703330751355539,
"grad_norm": 0.00018397132225800306,
"learning_rate": 1.622618323247087e-05,
"logits/chosen": -1.6993494033813477,
"logits/rejected": -1.0857021808624268,
"logps/chosen": -405.2132873535156,
"logps/rejected": -485.60321044921875,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.594387531280518,
"rewards/margins": 21.590347290039062,
"rewards/rejected": -27.184734344482422,
"step": 4100
},
{
"epoch": 12.765298218435321,
"grad_norm": 0.00029773233109153807,
"learning_rate": 1.553287073382609e-05,
"logits/chosen": -1.7119516134262085,
"logits/rejected": -1.0656880140304565,
"logps/chosen": -405.5570373535156,
"logps/rejected": -462.2611389160156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.116886615753174,
"rewards/margins": 21.456085205078125,
"rewards/rejected": -26.57297134399414,
"step": 4120
},
{
"epoch": 12.827265685515105,
"grad_norm": 0.0001080308502423577,
"learning_rate": 1.485344654385239e-05,
"logits/chosen": -1.6709296703338623,
"logits/rejected": -1.053741693496704,
"logps/chosen": -428.66839599609375,
"logps/rejected": -500.01092529296875,
"loss": 0.0043,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.277214050292969,
"rewards/margins": 22.146846771240234,
"rewards/rejected": -28.424060821533203,
"step": 4140
},
{
"epoch": 12.889233152594887,
"grad_norm": 6.432453665183857e-05,
"learning_rate": 1.418802237527106e-05,
"logits/chosen": -1.68827223777771,
"logits/rejected": -1.0494086742401123,
"logps/chosen": -424.75286865234375,
"logps/rejected": -481.1043395996094,
"loss": 0.0033,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.699560165405273,
"rewards/margins": 21.662763595581055,
"rewards/rejected": -27.362323760986328,
"step": 4160
},
{
"epoch": 12.95120061967467,
"grad_norm": 0.0004029480624012649,
"learning_rate": 1.3536707638882872e-05,
"logits/chosen": -1.6849908828735352,
"logits/rejected": -1.0281345844268799,
"logps/chosen": -419.80010986328125,
"logps/rejected": -460.801025390625,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.415833950042725,
"rewards/margins": 20.73134422302246,
"rewards/rejected": -26.147180557250977,
"step": 4180
},
{
"epoch": 13.013168086754455,
"grad_norm": 0.0002039131213678047,
"learning_rate": 1.289960942557844e-05,
"logits/chosen": -1.686678171157837,
"logits/rejected": -1.041481852531433,
"logps/chosen": -418.22686767578125,
"logps/rejected": -488.3094787597656,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.922072410583496,
"rewards/margins": 21.746536254882812,
"rewards/rejected": -27.66861343383789,
"step": 4200
},
{
"epoch": 13.075135553834237,
"grad_norm": 0.00016347317432519048,
"learning_rate": 1.2276832488730094e-05,
"logits/chosen": -1.7182451486587524,
"logits/rejected": -1.0532605648040771,
"logps/chosen": -441.8271484375,
"logps/rejected": -510.87628173828125,
"loss": 0.0043,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.668587684631348,
"rewards/margins": 22.97989273071289,
"rewards/rejected": -28.648479461669922,
"step": 4220
},
{
"epoch": 13.13710302091402,
"grad_norm": 0.00020034710178151727,
"learning_rate": 1.1668479226967965e-05,
"logits/chosen": -1.6925156116485596,
"logits/rejected": -1.0687302350997925,
"logps/chosen": -399.3315124511719,
"logps/rejected": -474.7539978027344,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.459714889526367,
"rewards/margins": 21.628223419189453,
"rewards/rejected": -27.087936401367188,
"step": 4240
},
{
"epoch": 13.199070487993803,
"grad_norm": 0.00026680485461838543,
"learning_rate": 1.1074649667343506e-05,
"logits/chosen": -1.6791460514068604,
"logits/rejected": -1.0547727346420288,
"logps/chosen": -412.1854553222656,
"logps/rejected": -474.461181640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.000552654266357,
"rewards/margins": 21.563953399658203,
"rewards/rejected": -26.564502716064453,
"step": 4260
},
{
"epoch": 13.261037955073586,
"grad_norm": 9.416981629328802e-05,
"learning_rate": 1.0495441448882571e-05,
"logits/chosen": -1.6752477884292603,
"logits/rejected": -1.0648829936981201,
"logps/chosen": -413.24609375,
"logps/rejected": -496.79327392578125,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.392674922943115,
"rewards/margins": 22.125301361083984,
"rewards/rejected": -27.51797866821289,
"step": 4280
},
{
"epoch": 13.32300542215337,
"grad_norm": 0.00027022938593290746,
"learning_rate": 9.930949806531509e-06,
"logits/chosen": -1.6898155212402344,
"logits/rejected": -1.0595139265060425,
"logps/chosen": -410.2594299316406,
"logps/rejected": -469.86883544921875,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.0617547035217285,
"rewards/margins": 21.85466194152832,
"rewards/rejected": -26.916418075561523,
"step": 4300
},
{
"epoch": 13.384972889233152,
"grad_norm": 5.3291834774427116e-05,
"learning_rate": 9.38126755549832e-06,
"logits/chosen": -1.6853482723236084,
"logits/rejected": -1.0476603507995605,
"logps/chosen": -411.350830078125,
"logps/rejected": -470.8251037597656,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.509891510009766,
"rewards/margins": 21.421506881713867,
"rewards/rejected": -26.931400299072266,
"step": 4320
},
{
"epoch": 13.446940356312936,
"grad_norm": 8.903396519599482e-05,
"learning_rate": 8.846485075991728e-06,
"logits/chosen": -1.6736446619033813,
"logits/rejected": -1.0330798625946045,
"logps/chosen": -417.89044189453125,
"logps/rejected": -477.1280822753906,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.157768726348877,
"rewards/margins": 21.815839767456055,
"rewards/rejected": -26.973608016967773,
"step": 4340
},
{
"epoch": 13.508907823392718,
"grad_norm": 0.0006522313342429698,
"learning_rate": 8.326690298360639e-06,
"logits/chosen": -1.679149866104126,
"logits/rejected": -1.0622096061706543,
"logps/chosen": -403.9975891113281,
"logps/rejected": -478.72174072265625,
"loss": 0.0043,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.251322269439697,
"rewards/margins": 21.613903045654297,
"rewards/rejected": -26.8652286529541,
"step": 4360
},
{
"epoch": 13.570875290472502,
"grad_norm": 0.0001527480490040034,
"learning_rate": 7.821968688636383e-06,
"logits/chosen": -1.7000373601913452,
"logits/rejected": -1.0500789880752563,
"logps/chosen": -400.9742431640625,
"logps/rejected": -477.05450439453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.700057506561279,
"rewards/margins": 21.45535659790039,
"rewards/rejected": -27.155414581298828,
"step": 4380
},
{
"epoch": 13.632842757552286,
"grad_norm": 0.0005368488491512835,
"learning_rate": 7.332403234480223e-06,
"logits/chosen": -1.683445692062378,
"logits/rejected": -1.0166078805923462,
"logps/chosen": -401.72607421875,
"logps/rejected": -456.4202575683594,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.469435691833496,
"rewards/margins": 21.11139488220215,
"rewards/rejected": -26.580829620361328,
"step": 4400
},
{
"epoch": 13.694810224632068,
"grad_norm": 0.0005580181023105979,
"learning_rate": 6.858074431538164e-06,
"logits/chosen": -1.6824891567230225,
"logits/rejected": -1.0271477699279785,
"logps/chosen": -399.6391296386719,
"logps/rejected": -451.330078125,
"loss": 0.0022,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.243688583374023,
"rewards/margins": 21.208574295043945,
"rewards/rejected": -26.452260971069336,
"step": 4420
},
{
"epoch": 13.756777691711852,
"grad_norm": null,
"learning_rate": 6.421646080196197e-06,
"logits/chosen": -1.6686054468154907,
"logits/rejected": -1.0693179368972778,
"logps/chosen": -401.59844970703125,
"logps/rejected": -474.7311096191406,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.454672336578369,
"rewards/margins": 21.39242172241211,
"rewards/rejected": -26.847095489501953,
"step": 4440
},
{
"epoch": 13.818745158791634,
"grad_norm": 1.7149226550827734e-05,
"learning_rate": 5.9772507736462145e-06,
"logits/chosen": -1.710008978843689,
"logits/rejected": -1.0888980627059937,
"logps/chosen": -407.61260986328125,
"logps/rejected": -481.07550048828125,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.6366987228393555,
"rewards/margins": 21.678539276123047,
"rewards/rejected": -27.315237045288086,
"step": 4460
},
{
"epoch": 13.880712625871418,
"grad_norm": 2.4136075808200985e-05,
"learning_rate": 5.54831493606015e-06,
"logits/chosen": -1.6713101863861084,
"logits/rejected": -1.0732184648513794,
"logps/chosen": -424.976806640625,
"logps/rejected": -506.0423889160156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.765892028808594,
"rewards/margins": 22.11074447631836,
"rewards/rejected": -27.876636505126953,
"step": 4480
},
{
"epoch": 13.9426800929512,
"grad_norm": 7.025560626061633e-05,
"learning_rate": 5.134909094202267e-06,
"logits/chosen": -1.699441909790039,
"logits/rejected": -1.0467607975006104,
"logps/chosen": -401.03375244140625,
"logps/rejected": -447.85308837890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.419959545135498,
"rewards/margins": 20.893884658813477,
"rewards/rejected": -26.313846588134766,
"step": 4500
},
{
"epoch": 14.004647560030984,
"grad_norm": 0.0002559265703894198,
"learning_rate": 4.7371012213538235e-06,
"logits/chosen": -1.6893657445907593,
"logits/rejected": -1.0456167459487915,
"logps/chosen": -425.73895263671875,
"logps/rejected": -486.43890380859375,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.418589115142822,
"rewards/margins": 22.638408660888672,
"rewards/rejected": -28.0570011138916,
"step": 4520
},
{
"epoch": 14.066615027110767,
"grad_norm": 0.00043519827886484563,
"learning_rate": 4.35495672613685e-06,
"logits/chosen": -1.6840267181396484,
"logits/rejected": -1.0660759210586548,
"logps/chosen": -420.65692138671875,
"logps/rejected": -481.805419921875,
"loss": 0.0065,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -5.826098442077637,
"rewards/margins": 21.706336975097656,
"rewards/rejected": -27.53243637084961,
"step": 4540
},
{
"epoch": 14.12858249419055,
"grad_norm": 0.0004038415208924562,
"learning_rate": 3.988538441759382e-06,
"logits/chosen": -1.673048973083496,
"logits/rejected": -1.0200636386871338,
"logps/chosen": -403.9557189941406,
"logps/rejected": -461.65179443359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.021474361419678,
"rewards/margins": 21.59840965270996,
"rewards/rejected": -26.619884490966797,
"step": 4560
},
{
"epoch": 14.190549961270333,
"grad_norm": 0.00038054597098380327,
"learning_rate": 3.637906615684328e-06,
"logits/chosen": -1.6679537296295166,
"logits/rejected": -1.0269415378570557,
"logps/chosen": -410.174072265625,
"logps/rejected": -484.68865966796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.3633928298950195,
"rewards/margins": 22.351978302001953,
"rewards/rejected": -27.715368270874023,
"step": 4580
},
{
"epoch": 14.252517428350115,
"grad_norm": 5.562596925301477e-05,
"learning_rate": 3.3031188997233676e-06,
"logits/chosen": -1.6873247623443604,
"logits/rejected": -1.0105091333389282,
"logps/chosen": -405.04132080078125,
"logps/rejected": -454.36920166015625,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.410122871398926,
"rewards/margins": 21.17348289489746,
"rewards/rejected": -26.583606719970703,
"step": 4600
},
{
"epoch": 14.3144848954299,
"grad_norm": 4.7735171392560005e-05,
"learning_rate": 2.9842303405577366e-06,
"logits/chosen": -1.6932716369628906,
"logits/rejected": -1.026926040649414,
"logps/chosen": -416.610595703125,
"logps/rejected": -469.50335693359375,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.13016414642334,
"rewards/margins": 20.862241744995117,
"rewards/rejected": -26.99240493774414,
"step": 4620
},
{
"epoch": 14.376452362509683,
"grad_norm": 0.00047004391672089696,
"learning_rate": 2.6812933706872545e-06,
"logits/chosen": -1.6934292316436768,
"logits/rejected": -1.063394546508789,
"logps/chosen": -415.4750061035156,
"logps/rejected": -489.5491638183594,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.2422404289245605,
"rewards/margins": 22.516773223876953,
"rewards/rejected": -27.759014129638672,
"step": 4640
},
{
"epoch": 14.438419829589465,
"grad_norm": 0.0008643981418572366,
"learning_rate": 2.394357799809277e-06,
"logits/chosen": -1.735192894935608,
"logits/rejected": -1.069784164428711,
"logps/chosen": -409.0735168457031,
"logps/rejected": -455.7366638183594,
"loss": 0.0022,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.419035911560059,
"rewards/margins": 21.468860626220703,
"rewards/rejected": -26.887897491455078,
"step": 4660
},
{
"epoch": 14.500387296669249,
"grad_norm": 0.0002557814004831016,
"learning_rate": 2.123470806628858e-06,
"logits/chosen": -1.6932361125946045,
"logits/rejected": -1.03562331199646,
"logps/chosen": -404.10223388671875,
"logps/rejected": -452.8517150878906,
"loss": 0.0043,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.449051856994629,
"rewards/margins": 21.111392974853516,
"rewards/rejected": -26.560443878173828,
"step": 4680
},
{
"epoch": 14.562354763749031,
"grad_norm": 0.00017765916709322482,
"learning_rate": 1.868676931101465e-06,
"logits/chosen": -1.6715888977050781,
"logits/rejected": -1.057328462600708,
"logps/chosen": -411.4977111816406,
"logps/rejected": -486.6917419433594,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.450153827667236,
"rewards/margins": 22.20999526977539,
"rewards/rejected": -27.6601505279541,
"step": 4700
},
{
"epoch": 14.624322230828815,
"grad_norm": 0.0006002355949021876,
"learning_rate": 1.6300180671096288e-06,
"logits/chosen": -1.6742595434188843,
"logits/rejected": -1.0468966960906982,
"logps/chosen": -414.0707092285156,
"logps/rejected": -482.42657470703125,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.493812561035156,
"rewards/margins": 21.657306671142578,
"rewards/rejected": -27.151119232177734,
"step": 4720
},
{
"epoch": 14.686289697908599,
"grad_norm": 0.00020658239373005927,
"learning_rate": 1.4075334555746055e-06,
"logits/chosen": -1.662987470626831,
"logits/rejected": -1.016445279121399,
"logps/chosen": -407.02423095703125,
"logps/rejected": -467.1194763183594,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.999020576477051,
"rewards/margins": 20.836938858032227,
"rewards/rejected": -26.83595848083496,
"step": 4740
},
{
"epoch": 14.748257164988381,
"grad_norm": 6.777382805012167e-05,
"learning_rate": 1.2012596780043627e-06,
"logits/chosen": -1.6404949426651,
"logits/rejected": -1.0619919300079346,
"logps/chosen": -394.98443603515625,
"logps/rejected": -479.7742614746094,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.746143341064453,
"rewards/margins": 21.60362434387207,
"rewards/rejected": -27.349767684936523,
"step": 4760
},
{
"epoch": 14.810224632068165,
"grad_norm": 0.00017278394079767168,
"learning_rate": 1.011230650478634e-06,
"logits/chosen": -1.6573286056518555,
"logits/rejected": -1.0122966766357422,
"logps/chosen": -396.2731018066406,
"logps/rejected": -456.626220703125,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.127909183502197,
"rewards/margins": 21.664600372314453,
"rewards/rejected": -26.79250717163086,
"step": 4780
},
{
"epoch": 14.872192099147947,
"grad_norm": 0.00017635834228713065,
"learning_rate": 8.374776180724575e-07,
"logits/chosen": -1.7095073461532593,
"logits/rejected": -1.0201966762542725,
"logps/chosen": -402.76763916015625,
"logps/rejected": -461.19903564453125,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.404868125915527,
"rewards/margins": 21.330501556396484,
"rewards/rejected": -26.735370635986328,
"step": 4800
},
{
"epoch": 14.93415956622773,
"grad_norm": 0.0006217029877007008,
"learning_rate": 6.800291497187083e-07,
"logits/chosen": -1.7389657497406006,
"logits/rejected": -1.0253870487213135,
"logps/chosen": -406.7480163574219,
"logps/rejected": -461.8179626464844,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.920414447784424,
"rewards/margins": 21.916866302490234,
"rewards/rejected": -26.8372802734375,
"step": 4820
},
{
"epoch": 14.996127033307513,
"grad_norm": 0.0001935044419951737,
"learning_rate": 5.389111335107556e-07,
"logits/chosen": -1.696392297744751,
"logits/rejected": -1.0922819375991821,
"logps/chosen": -414.5367736816406,
"logps/rejected": -476.94012451171875,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.998685359954834,
"rewards/margins": 21.558393478393555,
"rewards/rejected": -27.557079315185547,
"step": 4840
},
{
"epoch": 15.058094500387297,
"grad_norm": 4.989042645320296e-05,
"learning_rate": 4.1414677244584477e-07,
"logits/chosen": -1.690422773361206,
"logits/rejected": -1.0694575309753418,
"logps/chosen": -417.68487548828125,
"logps/rejected": -490.20989990234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.380603313446045,
"rewards/margins": 21.939071655273438,
"rewards/rejected": -27.31967544555664,
"step": 4860
},
{
"epoch": 15.12006196746708,
"grad_norm": 0.0008857127977535129,
"learning_rate": 3.0575658061001713e-07,
"logits/chosen": -1.692728042602539,
"logits/rejected": -1.0653448104858398,
"logps/chosen": -414.1552734375,
"logps/rejected": -490.3134765625,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -6.1294355392456055,
"rewards/margins": 21.649871826171875,
"rewards/rejected": -27.779308319091797,
"step": 4880
},
{
"epoch": 15.182029434546862,
"grad_norm": 7.71297054598108e-05,
"learning_rate": 2.1375837980512904e-07,
"logits/chosen": -1.687190294265747,
"logits/rejected": -1.0721074342727661,
"logps/chosen": -410.22161865234375,
"logps/rejected": -491.24835205078125,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.182304859161377,
"rewards/margins": 22.23093032836914,
"rewards/rejected": -27.41323471069336,
"step": 4900
},
{
"epoch": 15.243996901626646,
"grad_norm": 0.00017248830408789217,
"learning_rate": 1.38167296618541e-07,
"logits/chosen": -1.682885766029358,
"logits/rejected": -1.0524094104766846,
"logps/chosen": -410.17681884765625,
"logps/rejected": -472.13885498046875,
"loss": 0.0011,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.831109046936035,
"rewards/margins": 21.399702072143555,
"rewards/rejected": -27.230810165405273,
"step": 4920
},
{
"epoch": 15.305964368706428,
"grad_norm": 0.0008164289756678045,
"learning_rate": 7.899575993597363e-08,
"logits/chosen": -1.6627308130264282,
"logits/rejected": -0.9520984888076782,
"logps/chosen": -395.6473693847656,
"logps/rejected": -433.9269104003906,
"loss": 0.0043,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -5.2272748947143555,
"rewards/margins": 20.858642578125,
"rewards/rejected": -26.08591651916504,
"step": 4940
},
{
"epoch": 15.367931835786212,
"grad_norm": 0.00019182954565621912,
"learning_rate": 3.6253498897886873e-08,
"logits/chosen": -1.6554197072982788,
"logits/rejected": -1.0059171915054321,
"logps/chosen": -394.91973876953125,
"logps/rejected": -451.76312255859375,
"loss": 0.0022,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -5.4583234786987305,
"rewards/margins": 21.091644287109375,
"rewards/rejected": -26.549968719482422,
"step": 4960
},
{
"epoch": 15.429899302865996,
"grad_norm": 0.00014239229494705796,
"learning_rate": 9.947541299837327e-09,
"logits/chosen": -1.7060569524765015,
"logits/rejected": -1.0418967008590698,
"logps/chosen": -427.88525390625,
"logps/rejected": -482.782958984375,
"loss": 0.0043,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -5.570733070373535,
"rewards/margins": 21.934314727783203,
"rewards/rejected": -27.505046844482422,
"step": 4980
},
{
"epoch": 15.491866769945778,
"grad_norm": 0.0005336150643415749,
"learning_rate": 8.221243689154889e-11,
"logits/chosen": -1.6255543231964111,
"logits/rejected": -1.027090311050415,
"logps/chosen": -393.7467956542969,
"logps/rejected": -484.93804931640625,
"loss": 0.0033,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -5.538996696472168,
"rewards/margins": 21.719022750854492,
"rewards/rejected": -27.25801658630371,
"step": 5000
}
],
"logging_steps": 20,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 16,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}