DPO_Chat-zephyr-7b-sft-full / trainer_state.json
TTTXXX01's picture
Model save
835f331 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1019,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009813542688910696,
"grad_norm": 12.871247750249635,
"learning_rate": 4.9019607843137254e-09,
"logits/chosen": 5327.5185546875,
"logits/rejected": 3678.846435546875,
"logps/chosen": -222.31866455078125,
"logps/rejected": -157.3788299560547,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.009813542688910697,
"grad_norm": 14.081428099593134,
"learning_rate": 4.901960784313725e-08,
"logits/chosen": 5387.51123046875,
"logits/rejected": 4719.13525390625,
"logps/chosen": -280.157958984375,
"logps/rejected": -244.06271362304688,
"loss": 0.6931,
"rewards/accuracies": 0.40740740299224854,
"rewards/chosen": -0.014360553584992886,
"rewards/margins": -0.05316641554236412,
"rewards/rejected": 0.03880586475133896,
"step": 10
},
{
"epoch": 0.019627085377821395,
"grad_norm": 10.990615121540667,
"learning_rate": 9.80392156862745e-08,
"logits/chosen": 4691.1123046875,
"logits/rejected": 4289.6572265625,
"logps/chosen": -243.6353302001953,
"logps/rejected": -236.8662872314453,
"loss": 0.6931,
"rewards/accuracies": 0.46666669845581055,
"rewards/chosen": -0.03324734792113304,
"rewards/margins": -0.0356144905090332,
"rewards/rejected": 0.0023671439848840237,
"step": 20
},
{
"epoch": 0.029440628066732092,
"grad_norm": 12.484022522013351,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": 5969.29296875,
"logits/rejected": 5405.775390625,
"logps/chosen": -284.97119140625,
"logps/rejected": -282.4980163574219,
"loss": 0.6922,
"rewards/accuracies": 0.6250000596046448,
"rewards/chosen": 0.2023317515850067,
"rewards/margins": 0.21659104526042938,
"rewards/rejected": -0.014259283430874348,
"step": 30
},
{
"epoch": 0.03925417075564279,
"grad_norm": 13.51105908880634,
"learning_rate": 1.96078431372549e-07,
"logits/chosen": 5424.30859375,
"logits/rejected": 4093.165283203125,
"logps/chosen": -278.38232421875,
"logps/rejected": -219.98922729492188,
"loss": 0.6901,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.5105813145637512,
"rewards/margins": 0.6657305955886841,
"rewards/rejected": -0.15514932572841644,
"step": 40
},
{
"epoch": 0.04906771344455348,
"grad_norm": 12.244316305452477,
"learning_rate": 2.4509803921568627e-07,
"logits/chosen": 5819.39111328125,
"logits/rejected": 4993.8203125,
"logps/chosen": -267.16241455078125,
"logps/rejected": -275.3472595214844,
"loss": 0.6865,
"rewards/accuracies": 0.6833333969116211,
"rewards/chosen": 1.6877946853637695,
"rewards/margins": 1.0000646114349365,
"rewards/rejected": 0.6877301931381226,
"step": 50
},
{
"epoch": 0.058881256133464184,
"grad_norm": 11.729075229552288,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": 6246.43115234375,
"logits/rejected": 5279.3232421875,
"logps/chosen": -293.96044921875,
"logps/rejected": -250.30880737304688,
"loss": 0.6794,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 4.434187889099121,
"rewards/margins": 2.814074993133545,
"rewards/rejected": 1.6201130151748657,
"step": 60
},
{
"epoch": 0.06869479882237488,
"grad_norm": 11.58284928755517,
"learning_rate": 3.431372549019608e-07,
"logits/chosen": 5581.76318359375,
"logits/rejected": 5016.42333984375,
"logps/chosen": -273.4932556152344,
"logps/rejected": -272.8643493652344,
"loss": 0.6728,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": 4.158146858215332,
"rewards/margins": 5.391061782836914,
"rewards/rejected": -1.2329151630401611,
"step": 70
},
{
"epoch": 0.07850834151128558,
"grad_norm": 12.989777400848494,
"learning_rate": 3.92156862745098e-07,
"logits/chosen": 5730.53759765625,
"logits/rejected": 4633.5458984375,
"logps/chosen": -269.62908935546875,
"logps/rejected": -244.82156372070312,
"loss": 0.6613,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 1.3705421686172485,
"rewards/margins": 8.697429656982422,
"rewards/rejected": -7.326887607574463,
"step": 80
},
{
"epoch": 0.08832188420019627,
"grad_norm": 14.220448041653073,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": 5785.2666015625,
"logits/rejected": 5267.29931640625,
"logps/chosen": -262.34014892578125,
"logps/rejected": -285.23370361328125,
"loss": 0.6375,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -1.5268057584762573,
"rewards/margins": 10.330537796020508,
"rewards/rejected": -11.857342720031738,
"step": 90
},
{
"epoch": 0.09813542688910697,
"grad_norm": 15.81206433163477,
"learning_rate": 4.901960784313725e-07,
"logits/chosen": 5834.7822265625,
"logits/rejected": 4743.5556640625,
"logps/chosen": -311.53265380859375,
"logps/rejected": -305.3698425292969,
"loss": 0.6278,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -14.513681411743164,
"rewards/margins": 19.83308219909668,
"rewards/rejected": -34.346763610839844,
"step": 100
},
{
"epoch": 0.10794896957801767,
"grad_norm": 18.563361241995352,
"learning_rate": 4.999061090193831e-07,
"logits/chosen": 5575.4599609375,
"logits/rejected": 5340.49658203125,
"logps/chosen": -277.6549987792969,
"logps/rejected": -278.158447265625,
"loss": 0.6341,
"rewards/accuracies": 0.6166666746139526,
"rewards/chosen": -7.583652496337891,
"rewards/margins": 10.811137199401855,
"rewards/rejected": -18.394786834716797,
"step": 110
},
{
"epoch": 0.11776251226692837,
"grad_norm": 22.063065551890407,
"learning_rate": 4.995247977764035e-07,
"logits/chosen": 5714.29443359375,
"logits/rejected": 5232.7041015625,
"logps/chosen": -276.466552734375,
"logps/rejected": -295.88800048828125,
"loss": 0.6269,
"rewards/accuracies": 0.6583333015441895,
"rewards/chosen": -20.445241928100586,
"rewards/margins": 17.259180068969727,
"rewards/rejected": -37.70441818237305,
"step": 120
},
{
"epoch": 0.12757605495583907,
"grad_norm": 21.227897979315813,
"learning_rate": 4.988506452457066e-07,
"logits/chosen": 5282.2646484375,
"logits/rejected": 4814.9853515625,
"logps/chosen": -284.6465759277344,
"logps/rejected": -329.804931640625,
"loss": 0.6032,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -43.58851623535156,
"rewards/margins": 24.183361053466797,
"rewards/rejected": -67.7718734741211,
"step": 130
},
{
"epoch": 0.13738959764474976,
"grad_norm": 19.667832090255832,
"learning_rate": 4.9788444260996e-07,
"logits/chosen": 5482.5751953125,
"logits/rejected": 5381.85107421875,
"logps/chosen": -307.1512451171875,
"logps/rejected": -342.03619384765625,
"loss": 0.6036,
"rewards/accuracies": 0.6666667461395264,
"rewards/chosen": -47.92987823486328,
"rewards/margins": 22.427753448486328,
"rewards/rejected": -70.3576431274414,
"step": 140
},
{
"epoch": 0.14720314033366044,
"grad_norm": 25.463823735637064,
"learning_rate": 4.96627323800647e-07,
"logits/chosen": 5556.36572265625,
"logits/rejected": 4525.91796875,
"logps/chosen": -339.99114990234375,
"logps/rejected": -357.9053649902344,
"loss": 0.5659,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -65.89563751220703,
"rewards/margins": 47.135643005371094,
"rewards/rejected": -113.0312728881836,
"step": 150
},
{
"epoch": 0.15701668302257116,
"grad_norm": 26.781231387453232,
"learning_rate": 4.95080764167289e-07,
"logits/chosen": 6055.6474609375,
"logits/rejected": 5491.48046875,
"logps/chosen": -350.4269104003906,
"logps/rejected": -381.8998107910156,
"loss": 0.5603,
"rewards/accuracies": 0.6083333492279053,
"rewards/chosen": -62.65166473388672,
"rewards/margins": 36.19008255004883,
"rewards/rejected": -98.84175109863281,
"step": 160
},
{
"epoch": 0.16683022571148184,
"grad_norm": 28.113973023052374,
"learning_rate": 4.932465787459808e-07,
"logits/chosen": 5991.466796875,
"logits/rejected": 5234.6416015625,
"logps/chosen": -302.66656494140625,
"logps/rejected": -343.98358154296875,
"loss": 0.5567,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -39.85801315307617,
"rewards/margins": 54.232994079589844,
"rewards/rejected": -94.09100341796875,
"step": 170
},
{
"epoch": 0.17664376840039253,
"grad_norm": 33.999159471041786,
"learning_rate": 4.911269201292724e-07,
"logits/chosen": 5687.16943359375,
"logits/rejected": 5025.896484375,
"logps/chosen": -303.44134521484375,
"logps/rejected": -364.39190673828125,
"loss": 0.5816,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -62.167022705078125,
"rewards/margins": 51.05744552612305,
"rewards/rejected": -113.2244644165039,
"step": 180
},
{
"epoch": 0.18645731108930325,
"grad_norm": 29.641088692190937,
"learning_rate": 4.887242759398945e-07,
"logits/chosen": 6036.60205078125,
"logits/rejected": 5355.47216796875,
"logps/chosen": -337.2464294433594,
"logps/rejected": -388.3368835449219,
"loss": 0.5383,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -57.11214065551758,
"rewards/margins": 51.385711669921875,
"rewards/rejected": -108.49784851074219,
"step": 190
},
{
"epoch": 0.19627085377821393,
"grad_norm": 44.68902740164567,
"learning_rate": 4.860414659112948e-07,
"logits/chosen": 6272.4951171875,
"logits/rejected": 5538.49609375,
"logps/chosen": -370.70849609375,
"logps/rejected": -407.4710998535156,
"loss": 0.5638,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -84.62608337402344,
"rewards/margins": 62.12910079956055,
"rewards/rejected": -146.75518798828125,
"step": 200
},
{
"epoch": 0.20608439646712462,
"grad_norm": 25.411968927521745,
"learning_rate": 4.830816385784104e-07,
"logits/chosen": 4968.16015625,
"logits/rejected": 4779.7099609375,
"logps/chosen": -331.57757568359375,
"logps/rejected": -343.7427062988281,
"loss": 0.5589,
"rewards/accuracies": 0.6583333611488342,
"rewards/chosen": -78.5858383178711,
"rewards/margins": 39.72159957885742,
"rewards/rejected": -118.30744934082031,
"step": 210
},
{
"epoch": 0.21589793915603533,
"grad_norm": 37.405063992584424,
"learning_rate": 4.798482675825602e-07,
"logits/chosen": 5361.2626953125,
"logits/rejected": 5484.0341796875,
"logps/chosen": -311.9710388183594,
"logps/rejected": -405.7643127441406,
"loss": 0.5245,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -71.02371215820312,
"rewards/margins": 85.1280746459961,
"rewards/rejected": -156.1517791748047,
"step": 220
},
{
"epoch": 0.22571148184494602,
"grad_norm": 50.62400555207534,
"learning_rate": 4.7634514759479275e-07,
"logits/chosen": 6291.7314453125,
"logits/rejected": 4984.1982421875,
"logps/chosen": -361.0018615722656,
"logps/rejected": -410.3404846191406,
"loss": 0.5001,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -79.9288330078125,
"rewards/margins": 78.11891174316406,
"rewards/rejected": -158.04774475097656,
"step": 230
},
{
"epoch": 0.23552502453385674,
"grad_norm": 27.660916558165752,
"learning_rate": 4.7257638986247684e-07,
"logits/chosen": 6535.8984375,
"logits/rejected": 5374.02294921875,
"logps/chosen": -426.83148193359375,
"logps/rejected": -457.632080078125,
"loss": 0.516,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -109.15034484863281,
"rewards/margins": 92.73652648925781,
"rewards/rejected": -201.88687133789062,
"step": 240
},
{
"epoch": 0.24533856722276742,
"grad_norm": 31.148002019742822,
"learning_rate": 4.685464173843574e-07,
"logits/chosen": 5497.865234375,
"logits/rejected": 4737.041015625,
"logps/chosen": -371.4256591796875,
"logps/rejected": -383.71661376953125,
"loss": 0.5543,
"rewards/accuracies": 0.6166666746139526,
"rewards/chosen": -120.2467269897461,
"rewards/margins": 42.96506881713867,
"rewards/rejected": -163.21180725097656,
"step": 250
},
{
"epoch": 0.25515210991167814,
"grad_norm": 31.571604145354506,
"learning_rate": 4.6425995971974265e-07,
"logits/chosen": 5646.7626953125,
"logits/rejected": 5109.78369140625,
"logps/chosen": -389.2139587402344,
"logps/rejected": -417.52374267578125,
"loss": 0.5557,
"rewards/accuracies": 0.6333333253860474,
"rewards/chosen": -108.80330657958984,
"rewards/margins": 49.87716293334961,
"rewards/rejected": -158.6804656982422,
"step": 260
},
{
"epoch": 0.2649656526005888,
"grad_norm": 35.26099277826172,
"learning_rate": 4.597220474379125e-07,
"logits/chosen": 5891.14990234375,
"logits/rejected": 4710.44384765625,
"logps/chosen": -349.8431701660156,
"logps/rejected": -394.3140869140625,
"loss": 0.5564,
"rewards/accuracies": 0.6416667103767395,
"rewards/chosen": -90.45411682128906,
"rewards/margins": 59.028533935546875,
"rewards/rejected": -149.48263549804688,
"step": 270
},
{
"epoch": 0.2747791952894995,
"grad_norm": 48.24359930236473,
"learning_rate": 4.549380062142627e-07,
"logits/chosen": 5449.0,
"logits/rejected": 4662.09521484375,
"logps/chosen": -345.41461181640625,
"logps/rejected": -420.5967712402344,
"loss": 0.5258,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -90.24136352539062,
"rewards/margins": 88.40531158447266,
"rewards/rejected": -178.64669799804688,
"step": 280
},
{
"epoch": 0.2845927379784102,
"grad_norm": 29.807942673069554,
"learning_rate": 4.499134505801141e-07,
"logits/chosen": 6478.8251953125,
"logits/rejected": 5145.69580078125,
"logps/chosen": -425.2914123535156,
"logps/rejected": -475.61224365234375,
"loss": 0.5069,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -114.30363464355469,
"rewards/margins": 90.24967193603516,
"rewards/rejected": -204.55331420898438,
"step": 290
},
{
"epoch": 0.2944062806673209,
"grad_norm": 35.28572084013644,
"learning_rate": 4.4465427733352124e-07,
"logits/chosen": 5390.82275390625,
"logits/rejected": 5010.67919921875,
"logps/chosen": -404.703125,
"logps/rejected": -445.12921142578125,
"loss": 0.5624,
"rewards/accuracies": 0.6416667103767395,
"rewards/chosen": -142.99755859375,
"rewards/margins": 53.75908279418945,
"rewards/rejected": -196.75662231445312,
"step": 300
},
{
"epoch": 0.3042198233562316,
"grad_norm": 40.084769146081335,
"learning_rate": 4.391666586188145e-07,
"logits/chosen": 5972.5166015625,
"logits/rejected": 5158.81103515625,
"logps/chosen": -387.8962707519531,
"logps/rejected": -440.2579040527344,
"loss": 0.515,
"rewards/accuracies": 0.6916667222976685,
"rewards/chosen": -106.87181091308594,
"rewards/margins": 80.19728088378906,
"rewards/rejected": -187.06912231445312,
"step": 310
},
{
"epoch": 0.3140333660451423,
"grad_norm": 28.62016537121461,
"learning_rate": 4.3345703468299634e-07,
"logits/chosen": 5544.9384765625,
"logits/rejected": 4833.5224609375,
"logps/chosen": -360.482421875,
"logps/rejected": -389.13385009765625,
"loss": 0.5356,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -99.99230194091797,
"rewards/margins": 53.11186599731445,
"rewards/rejected": -153.10415649414062,
"step": 320
},
{
"epoch": 0.323846908734053,
"grad_norm": 59.032426150017336,
"learning_rate": 4.275321063174936e-07,
"logits/chosen": 5484.0458984375,
"logits/rejected": 4950.25537109375,
"logps/chosen": -403.98785400390625,
"logps/rejected": -519.8248291015625,
"loss": 0.4914,
"rewards/accuracies": 0.7333333492279053,
"rewards/chosen": -141.48928833007812,
"rewards/margins": 106.94425964355469,
"rewards/rejected": -248.4335479736328,
"step": 330
},
{
"epoch": 0.3336604514229637,
"grad_norm": 31.879122250786,
"learning_rate": 4.2139882699413613e-07,
"logits/chosen": 5405.72265625,
"logits/rejected": 4280.78857421875,
"logps/chosen": -441.2384338378906,
"logps/rejected": -502.58209228515625,
"loss": 0.5114,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -169.95379638671875,
"rewards/margins": 94.90937805175781,
"rewards/rejected": -264.8631286621094,
"step": 340
},
{
"epoch": 0.3434739941118744,
"grad_norm": 33.81913218379805,
"learning_rate": 4.1506439470459056e-07,
"logits/chosen": 6440.4052734375,
"logits/rejected": 4974.0732421875,
"logps/chosen": -476.2796325683594,
"logps/rejected": -503.81915283203125,
"loss": 0.4891,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -159.8102569580078,
"rewards/margins": 94.52180480957031,
"rewards/rejected": -254.3320770263672,
"step": 350
},
{
"epoch": 0.35328753680078506,
"grad_norm": 27.826546606872803,
"learning_rate": 4.085362435128262e-07,
"logits/chosen": 5557.1865234375,
"logits/rejected": 5118.06640625,
"logps/chosen": -378.259033203125,
"logps/rejected": -449.55145263671875,
"loss": 0.5236,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -135.6257781982422,
"rewards/margins": 67.34868621826172,
"rewards/rejected": -202.97447204589844,
"step": 360
},
{
"epoch": 0.3631010794896958,
"grad_norm": 35.17289058393036,
"learning_rate": 4.0182203483052825e-07,
"logits/chosen": 6366.83056640625,
"logits/rejected": 5257.0703125,
"logps/chosen": -399.1999206542969,
"logps/rejected": -466.55255126953125,
"loss": 0.4778,
"rewards/accuracies": 0.8083333969116211,
"rewards/chosen": -125.0527572631836,
"rewards/margins": 93.51255798339844,
"rewards/rejected": -218.56527709960938,
"step": 370
},
{
"epoch": 0.3729146221786065,
"grad_norm": 38.44052471860425,
"learning_rate": 3.949296484256959e-07,
"logits/chosen": 5621.7138671875,
"logits/rejected": 5390.65478515625,
"logps/chosen": -457.83837890625,
"logps/rejected": -548.638427734375,
"loss": 0.5489,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -193.8985137939453,
"rewards/margins": 95.32550048828125,
"rewards/rejected": -289.2240295410156,
"step": 380
},
{
"epoch": 0.38272816486751715,
"grad_norm": 36.66931449629898,
"learning_rate": 3.8786717317497875e-07,
"logits/chosen": 5111.90576171875,
"logits/rejected": 4626.9228515625,
"logps/chosen": -434.62835693359375,
"logps/rejected": -526.7476806640625,
"loss": 0.4832,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -197.80410766601562,
"rewards/margins": 103.44625091552734,
"rewards/rejected": -301.2503662109375,
"step": 390
},
{
"epoch": 0.39254170755642787,
"grad_norm": 57.59980893341699,
"learning_rate": 3.806428975706042e-07,
"logits/chosen": 6388.87158203125,
"logits/rejected": 4657.2216796875,
"logps/chosen": -454.86175537109375,
"logps/rejected": -485.8328552246094,
"loss": 0.4911,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -156.0497283935547,
"rewards/margins": 103.42845153808594,
"rewards/rejected": -259.4781494140625,
"step": 400
},
{
"epoch": 0.4023552502453386,
"grad_norm": 39.54551097407698,
"learning_rate": 3.7326529999303633e-07,
"logits/chosen": 6277.59228515625,
"logits/rejected": 5186.5234375,
"logps/chosen": -436.134521484375,
"logps/rejected": -489.72021484375,
"loss": 0.5039,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -157.28512573242188,
"rewards/margins": 91.26525115966797,
"rewards/rejected": -248.55038452148438,
"step": 410
},
{
"epoch": 0.41216879293424924,
"grad_norm": 28.9828512823159,
"learning_rate": 3.6574303876078366e-07,
"logits/chosen": 6166.8349609375,
"logits/rejected": 5749.53759765625,
"logps/chosen": -429.44842529296875,
"logps/rejected": -501.61553955078125,
"loss": 0.5346,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -146.19595336914062,
"rewards/margins": 76.49540710449219,
"rewards/rejected": -222.6913604736328,
"step": 420
},
{
"epoch": 0.42198233562315995,
"grad_norm": 38.109059334795454,
"learning_rate": 3.5808494196903117e-07,
"logits/chosen": 5872.1611328125,
"logits/rejected": 5257.36962890625,
"logps/chosen": -426.01629638671875,
"logps/rejected": -524.32470703125,
"loss": 0.4893,
"rewards/accuracies": 0.7249999642372131,
"rewards/chosen": -147.16848754882812,
"rewards/margins": 97.02165222167969,
"rewards/rejected": -244.1901397705078,
"step": 430
},
{
"epoch": 0.43179587831207067,
"grad_norm": 36.48516188691919,
"learning_rate": 3.5029999712902387e-07,
"logits/chosen": 5825.1708984375,
"logits/rejected": 5375.9892578125,
"logps/chosen": -412.44866943359375,
"logps/rejected": -492.4132385253906,
"loss": 0.5089,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -137.3380584716797,
"rewards/margins": 78.8681640625,
"rewards/rejected": -216.2062225341797,
"step": 440
},
{
"epoch": 0.44160942100098133,
"grad_norm": 32.455988556697044,
"learning_rate": 3.4239734062036067e-07,
"logits/chosen": 5395.5947265625,
"logits/rejected": 4995.49267578125,
"logps/chosen": -377.55206298828125,
"logps/rejected": -480.9095764160156,
"loss": 0.504,
"rewards/accuracies": 0.8166666030883789,
"rewards/chosen": -116.18603515625,
"rewards/margins": 91.22578430175781,
"rewards/rejected": -207.4118194580078,
"step": 450
},
{
"epoch": 0.45142296368989204,
"grad_norm": 40.54519554942862,
"learning_rate": 3.343862469685755e-07,
"logits/chosen": 5598.1201171875,
"logits/rejected": 5239.931640625,
"logps/chosen": -418.7256774902344,
"logps/rejected": -504.2228088378906,
"loss": 0.492,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -155.0460662841797,
"rewards/margins": 88.58525848388672,
"rewards/rejected": -243.63131713867188,
"step": 460
},
{
"epoch": 0.46123650637880276,
"grad_norm": 63.388353478656484,
"learning_rate": 3.2627611796059283e-07,
"logits/chosen": 6118.2041015625,
"logits/rejected": 4867.0166015625,
"logps/chosen": -513.0584716796875,
"logps/rejected": -580.1785278320312,
"loss": 0.4852,
"rewards/accuracies": 0.6833333373069763,
"rewards/chosen": -215.5017547607422,
"rewards/margins": 102.20874786376953,
"rewards/rejected": -317.71051025390625,
"step": 470
},
{
"epoch": 0.47105004906771347,
"grad_norm": 53.006728665147484,
"learning_rate": 3.1807647161082797e-07,
"logits/chosen": 6796.0439453125,
"logits/rejected": 4991.43505859375,
"logps/chosen": -505.2491760253906,
"logps/rejected": -591.8897094726562,
"loss": 0.467,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -224.52243041992188,
"rewards/margins": 122.7418441772461,
"rewards/rejected": -347.2642822265625,
"step": 480
},
{
"epoch": 0.48086359175662413,
"grad_norm": 45.433668803945835,
"learning_rate": 3.097969309908847e-07,
"logits/chosen": 6198.6357421875,
"logits/rejected": 4938.93701171875,
"logps/chosen": -537.33154296875,
"logps/rejected": -604.7870483398438,
"loss": 0.4907,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -252.5094451904297,
"rewards/margins": 102.22807312011719,
"rewards/rejected": -354.7375183105469,
"step": 490
},
{
"epoch": 0.49067713444553485,
"grad_norm": 40.30222762902004,
"learning_rate": 3.01447212935957e-07,
"logits/chosen": 5542.91015625,
"logits/rejected": 4886.0283203125,
"logps/chosen": -515.2832641601562,
"logps/rejected": -611.3238525390625,
"loss": 0.4764,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -230.98831176757812,
"rewards/margins": 120.3188247680664,
"rewards/rejected": -351.30718994140625,
"step": 500
},
{
"epoch": 0.5004906771344455,
"grad_norm": 37.25943332077513,
"learning_rate": 2.930371166411915e-07,
"logits/chosen": 6290.35107421875,
"logits/rejected": 5406.603515625,
"logps/chosen": -496.71923828125,
"logps/rejected": -568.7760009765625,
"loss": 0.5204,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -171.8844451904297,
"rewards/margins": 106.14483642578125,
"rewards/rejected": -278.02923583984375,
"step": 510
},
{
"epoch": 0.5103042198233563,
"grad_norm": 34.219197608369974,
"learning_rate": 2.845765121613912e-07,
"logits/chosen": 5363.45361328125,
"logits/rejected": 4926.47705078125,
"logps/chosen": -400.9844665527344,
"logps/rejected": -468.4186096191406,
"loss": 0.4843,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -137.99566650390625,
"rewards/margins": 86.77244567871094,
"rewards/rejected": -224.7681121826172,
"step": 520
},
{
"epoch": 0.5201177625122669,
"grad_norm": 51.08115197166243,
"learning_rate": 2.760753288275598e-07,
"logits/chosen": 6380.15380859375,
"logits/rejected": 5523.56103515625,
"logps/chosen": -411.37030029296875,
"logps/rejected": -479.7333984375,
"loss": 0.524,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -120.66414642333984,
"rewards/margins": 82.25830841064453,
"rewards/rejected": -202.92245483398438,
"step": 530
},
{
"epoch": 0.5299313052011776,
"grad_norm": 32.10452041832907,
"learning_rate": 2.675435435938788e-07,
"logits/chosen": 5805.7861328125,
"logits/rejected": 4628.6015625,
"logps/chosen": -400.0195617675781,
"logps/rejected": -493.15631103515625,
"loss": 0.4989,
"rewards/accuracies": 0.7916666269302368,
"rewards/chosen": -143.0798797607422,
"rewards/margins": 111.36392974853516,
"rewards/rejected": -254.4438018798828,
"step": 540
},
{
"epoch": 0.5397448478900884,
"grad_norm": 35.50341902811831,
"learning_rate": 2.5899116932879534e-07,
"logits/chosen": 5951.2255859375,
"logits/rejected": 5129.73291015625,
"logps/chosen": -436.9695739746094,
"logps/rejected": -521.4527587890625,
"loss": 0.4679,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -170.31277465820312,
"rewards/margins": 103.4649658203125,
"rewards/rejected": -273.77777099609375,
"step": 550
},
{
"epoch": 0.549558390578999,
"grad_norm": 38.923123039929806,
"learning_rate": 2.504282430639594e-07,
"logits/chosen": 5168.88427734375,
"logits/rejected": 4690.22412109375,
"logps/chosen": -454.4593811035156,
"logps/rejected": -523.0364990234375,
"loss": 0.5234,
"rewards/accuracies": 0.6500000357627869,
"rewards/chosen": -206.7684783935547,
"rewards/margins": 83.95912170410156,
"rewards/rejected": -290.72760009765625,
"step": 560
},
{
"epoch": 0.5593719332679097,
"grad_norm": 35.9015213923173,
"learning_rate": 2.418648142148056e-07,
"logits/chosen": 5650.38818359375,
"logits/rejected": 4686.87158203125,
"logps/chosen": -421.58416748046875,
"logps/rejected": -519.3839721679688,
"loss": 0.4912,
"rewards/accuracies": 0.783333420753479,
"rewards/chosen": -161.98081970214844,
"rewards/margins": 115.42814636230469,
"rewards/rejected": -277.4089660644531,
"step": 570
},
{
"epoch": 0.5691854759568205,
"grad_norm": 35.377631368601875,
"learning_rate": 2.3331093278659906e-07,
"logits/chosen": 6001.3486328125,
"logits/rejected": 5075.9619140625,
"logps/chosen": -444.90069580078125,
"logps/rejected": -534.0222778320312,
"loss": 0.4834,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -150.86428833007812,
"rewards/margins": 100.94398498535156,
"rewards/rejected": -251.80825805664062,
"step": 580
},
{
"epoch": 0.5789990186457311,
"grad_norm": 33.72847930978894,
"learning_rate": 2.247766375797906e-07,
"logits/chosen": 6150.4951171875,
"logits/rejected": 5650.3603515625,
"logps/chosen": -447.9390563964844,
"logps/rejected": -580.2978515625,
"loss": 0.459,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -173.01307678222656,
"rewards/margins": 128.7034454345703,
"rewards/rejected": -301.71649169921875,
"step": 590
},
{
"epoch": 0.5888125613346418,
"grad_norm": 38.272687769078246,
"learning_rate": 2.1627194440852142e-07,
"logits/chosen": 5934.83935546875,
"logits/rejected": 5138.47705078125,
"logps/chosen": -510.39532470703125,
"logps/rejected": -600.4871826171875,
"loss": 0.516,
"rewards/accuracies": 0.7583333849906921,
"rewards/chosen": -226.30844116210938,
"rewards/margins": 95.1800765991211,
"rewards/rejected": -321.4884948730469,
"step": 600
},
{
"epoch": 0.5986261040235525,
"grad_norm": 42.72545572301978,
"learning_rate": 2.0780683434610413e-07,
"logits/chosen": 5760.5244140625,
"logits/rejected": 4755.18798828125,
"logps/chosen": -520.7589721679688,
"logps/rejected": -605.10546875,
"loss": 0.4979,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -230.55068969726562,
"rewards/margins": 103.2475357055664,
"rewards/rejected": -333.7981872558594,
"step": 610
},
{
"epoch": 0.6084396467124632,
"grad_norm": 30.800890611965162,
"learning_rate": 1.993912420112756e-07,
"logits/chosen": 6323.02978515625,
"logits/rejected": 5290.75927734375,
"logps/chosen": -529.4403686523438,
"logps/rejected": -628.4583129882812,
"loss": 0.5034,
"rewards/accuracies": 0.75,
"rewards/chosen": -250.78018188476562,
"rewards/margins": 104.14213562011719,
"rewards/rejected": -354.92236328125,
"step": 620
},
{
"epoch": 0.6182531894013739,
"grad_norm": 33.05143266331657,
"learning_rate": 1.9103504390896944e-07,
"logits/chosen": 6340.01025390625,
"logits/rejected": 5427.24755859375,
"logps/chosen": -559.9760131835938,
"logps/rejected": -633.686767578125,
"loss": 0.4884,
"rewards/accuracies": 0.7583334445953369,
"rewards/chosen": -264.83856201171875,
"rewards/margins": 85.63264465332031,
"rewards/rejected": -350.47125244140625,
"step": 630
},
{
"epoch": 0.6280667320902846,
"grad_norm": 39.56952674823438,
"learning_rate": 1.8274804683928913e-07,
"logits/chosen": 5424.0146484375,
"logits/rejected": 4903.7958984375,
"logps/chosen": -535.6927490234375,
"logps/rejected": -647.5748901367188,
"loss": 0.4892,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -269.99102783203125,
"rewards/margins": 118.67204284667969,
"rewards/rejected": -388.6630554199219,
"step": 640
},
{
"epoch": 0.6378802747791953,
"grad_norm": 55.248010597812424,
"learning_rate": 1.745399763882881e-07,
"logits/chosen": 5793.76953125,
"logits/rejected": 4353.64794921875,
"logps/chosen": -535.369140625,
"logps/rejected": -589.8530883789062,
"loss": 0.4828,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -241.2775421142578,
"rewards/margins": 122.86383056640625,
"rewards/rejected": -364.1413879394531,
"step": 650
},
{
"epoch": 0.647693817468106,
"grad_norm": 38.643520028392174,
"learning_rate": 1.664204655140607e-07,
"logits/chosen": 6159.14306640625,
"logits/rejected": 4976.43994140625,
"logps/chosen": -499.28851318359375,
"logps/rejected": -561.6052856445312,
"loss": 0.495,
"rewards/accuracies": 0.7166666984558105,
"rewards/chosen": -212.77145385742188,
"rewards/margins": 93.96333312988281,
"rewards/rejected": -306.73480224609375,
"step": 660
},
{
"epoch": 0.6575073601570167,
"grad_norm": 35.07622366892728,
"learning_rate": 1.5839904324154273e-07,
"logits/chosen": 5574.2802734375,
"logits/rejected": 4987.9404296875,
"logps/chosen": -466.86346435546875,
"logps/rejected": -580.9351196289062,
"loss": 0.4938,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -179.73626708984375,
"rewards/margins": 136.65802001953125,
"rewards/rejected": -316.3943176269531,
"step": 670
},
{
"epoch": 0.6673209028459274,
"grad_norm": 33.542518567077636,
"learning_rate": 1.5048512347928564e-07,
"logits/chosen": 6700.78515625,
"logits/rejected": 5496.53662109375,
"logps/chosen": -503.79290771484375,
"logps/rejected": -590.5035400390625,
"loss": 0.4429,
"rewards/accuracies": 0.7499999403953552,
"rewards/chosen": -193.58926391601562,
"rewards/margins": 135.96713256835938,
"rewards/rejected": -329.556396484375,
"step": 680
},
{
"epoch": 0.677134445534838,
"grad_norm": 34.78474391764019,
"learning_rate": 1.426879939713322e-07,
"logits/chosen": 5514.447265625,
"logits/rejected": 4842.81640625,
"logps/chosen": -472.7972717285156,
"logps/rejected": -572.2882690429688,
"loss": 0.5124,
"rewards/accuracies": 0.7333332896232605,
"rewards/chosen": -220.92953491210938,
"rewards/margins": 110.0331039428711,
"rewards/rejected": -330.9626159667969,
"step": 690
},
{
"epoch": 0.6869479882237488,
"grad_norm": 39.067983682803174,
"learning_rate": 1.350168053971577e-07,
"logits/chosen": 5970.7685546875,
"logits/rejected": 5311.5283203125,
"logps/chosen": -452.698974609375,
"logps/rejected": -518.3038330078125,
"loss": 0.4982,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -172.0897979736328,
"rewards/margins": 97.96910095214844,
"rewards/rejected": -270.05889892578125,
"step": 700
},
{
"epoch": 0.6967615309126595,
"grad_norm": 40.38105094076238,
"learning_rate": 1.2748056063246994e-07,
"logits/chosen": 5575.70458984375,
"logits/rejected": 5063.31884765625,
"logps/chosen": -460.80413818359375,
"logps/rejected": -541.817138671875,
"loss": 0.5068,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -161.07498168945312,
"rewards/margins": 101.1529541015625,
"rewards/rejected": -262.2279357910156,
"step": 710
},
{
"epoch": 0.7065750736015701,
"grad_norm": 34.54330420809715,
"learning_rate": 1.2008810418347093e-07,
"logits/chosen": 5857.0908203125,
"logits/rejected": 5070.1689453125,
"logps/chosen": -448.393798828125,
"logps/rejected": -511.27130126953125,
"loss": 0.4955,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -160.17967224121094,
"rewards/margins": 94.85597229003906,
"rewards/rejected": -255.03564453125,
"step": 720
},
{
"epoch": 0.7163886162904809,
"grad_norm": 36.199421295115016,
"learning_rate": 1.128481118069799e-07,
"logits/chosen": 5848.61279296875,
"logits/rejected": 4546.04296875,
"logps/chosen": -461.7185974121094,
"logps/rejected": -540.6113891601562,
"loss": 0.4906,
"rewards/accuracies": 0.6916666626930237,
"rewards/chosen": -193.16610717773438,
"rewards/margins": 106.0004653930664,
"rewards/rejected": -299.16656494140625,
"step": 730
},
{
"epoch": 0.7262021589793916,
"grad_norm": 58.298464182056584,
"learning_rate": 1.0576908032860088e-07,
"logits/chosen": 5177.734375,
"logits/rejected": 4254.4931640625,
"logps/chosen": -439.21923828125,
"logps/rejected": -490.22210693359375,
"loss": 0.4902,
"rewards/accuracies": 0.73333340883255,
"rewards/chosen": -179.5984344482422,
"rewards/margins": 106.27181243896484,
"rewards/rejected": -285.8702697753906,
"step": 740
},
{
"epoch": 0.7360157016683022,
"grad_norm": 47.06791612169973,
"learning_rate": 9.88593176708827e-08,
"logits/chosen": 5833.16748046875,
"logits/rejected": 4599.1416015625,
"logps/chosen": -447.70770263671875,
"logps/rejected": -503.156005859375,
"loss": 0.4893,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -168.36907958984375,
"rewards/margins": 97.07537078857422,
"rewards/rejected": -265.4444580078125,
"step": 750
},
{
"epoch": 0.745829244357213,
"grad_norm": 37.1160086085217,
"learning_rate": 9.212693310317479e-08,
"logits/chosen": 5141.75390625,
"logits/rejected": 4296.54833984375,
"logps/chosen": -440.88067626953125,
"logps/rejected": -532.016845703125,
"loss": 0.509,
"rewards/accuracies": 0.7000000476837158,
"rewards/chosen": -181.9151153564453,
"rewards/margins": 105.42694091796875,
"rewards/rejected": -287.34210205078125,
"step": 760
},
{
"epoch": 0.7556427870461236,
"grad_norm": 43.097373457610466,
"learning_rate": 8.557982772462138e-08,
"logits/chosen": 5532.06689453125,
"logits/rejected": 4944.3828125,
"logps/chosen": -424.0889587402344,
"logps/rejected": -537.22802734375,
"loss": 0.4679,
"rewards/accuracies": 0.8083333969116211,
"rewards/chosen": -165.0626983642578,
"rewards/margins": 116.6352767944336,
"rewards/rejected": -281.6979675292969,
"step": 770
},
{
"epoch": 0.7654563297350343,
"grad_norm": 51.31695327547084,
"learning_rate": 7.922568519146425e-08,
"logits/chosen": 5383.9931640625,
"logits/rejected": 4821.4970703125,
"logps/chosen": -442.91583251953125,
"logps/rejected": -547.6976928710938,
"loss": 0.4878,
"rewards/accuracies": 0.6833333969116211,
"rewards/chosen": -190.0575714111328,
"rewards/margins": 98.0093002319336,
"rewards/rejected": -288.06683349609375,
"step": 780
},
{
"epoch": 0.7752698724239451,
"grad_norm": 40.87283215033227,
"learning_rate": 7.307196269953444e-08,
"logits/chosen": 5953.62646484375,
"logits/rejected": 4360.71435546875,
"logps/chosen": -468.15301513671875,
"logps/rejected": -554.8399658203125,
"loss": 0.4513,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -173.2440185546875,
"rewards/margins": 138.4357147216797,
"rewards/rejected": -311.67974853515625,
"step": 790
},
{
"epoch": 0.7850834151128557,
"grad_norm": 53.01816227936955,
"learning_rate": 6.712588223251809e-08,
"logits/chosen": 5890.1064453125,
"logits/rejected": 5068.29052734375,
"logps/chosen": -507.1546936035156,
"logps/rejected": -587.9667358398438,
"loss": 0.4932,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -197.91702270507812,
"rewards/margins": 104.29020690917969,
"rewards/rejected": -302.20721435546875,
"step": 800
},
{
"epoch": 0.7948969578017664,
"grad_norm": 39.36718486541899,
"learning_rate": 6.139442208626517e-08,
"logits/chosen": 5642.1572265625,
"logits/rejected": 5064.44140625,
"logps/chosen": -466.017822265625,
"logps/rejected": -542.1941528320312,
"loss": 0.5086,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -191.46595764160156,
"rewards/margins": 93.6746597290039,
"rewards/rejected": -285.140625,
"step": 810
},
{
"epoch": 0.8047105004906772,
"grad_norm": 32.25150411325172,
"learning_rate": 5.5884308679090525e-08,
"logits/chosen": 6617.20166015625,
"logits/rejected": 5841.89990234375,
"logps/chosen": -489.13140869140625,
"logps/rejected": -556.3676147460938,
"loss": 0.4687,
"rewards/accuracies": 0.7666667103767395,
"rewards/chosen": -174.83181762695312,
"rewards/margins": 96.47708129882812,
"rewards/rejected": -271.30889892578125,
"step": 820
},
{
"epoch": 0.8145240431795878,
"grad_norm": 42.66355926716236,
"learning_rate": 5.060200865767605e-08,
"logits/chosen": 5482.3115234375,
"logits/rejected": 4349.36181640625,
"logps/chosen": -489.5411071777344,
"logps/rejected": -519.50439453125,
"loss": 0.4808,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -183.46530151367188,
"rewards/margins": 100.88998413085938,
"rewards/rejected": -284.3552551269531,
"step": 830
},
{
"epoch": 0.8243375858684985,
"grad_norm": 32.60506546982286,
"learning_rate": 4.555372130784102e-08,
"logits/chosen": 6099.6806640625,
"logits/rejected": 5423.52294921875,
"logps/chosen": -430.18377685546875,
"logps/rejected": -559.9306640625,
"loss": 0.4656,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -174.9810028076172,
"rewards/margins": 115.120361328125,
"rewards/rejected": -290.10137939453125,
"step": 840
},
{
"epoch": 0.8341511285574092,
"grad_norm": 28.976991791091827,
"learning_rate": 4.0745371279084976e-08,
"logits/chosen": 6144.4287109375,
"logits/rejected": 5391.69189453125,
"logps/chosen": -456.33270263671875,
"logps/rejected": -532.3670654296875,
"loss": 0.5206,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -177.6209716796875,
"rewards/margins": 89.14666748046875,
"rewards/rejected": -266.76763916015625,
"step": 850
},
{
"epoch": 0.8439646712463199,
"grad_norm": 35.3584148587086,
"learning_rate": 3.6182601631443596e-08,
"logits/chosen": 6054.46142578125,
"logits/rejected": 5496.1396484375,
"logps/chosen": -492.4789123535156,
"logps/rejected": -586.8856811523438,
"loss": 0.462,
"rewards/accuracies": 0.7916666269302368,
"rewards/chosen": -178.38107299804688,
"rewards/margins": 115.93116760253906,
"rewards/rejected": -294.312255859375,
"step": 860
},
{
"epoch": 0.8537782139352306,
"grad_norm": 34.93609119738404,
"learning_rate": 3.187076721281595e-08,
"logits/chosen": 5244.7314453125,
"logits/rejected": 4227.8193359375,
"logps/chosen": -435.866943359375,
"logps/rejected": -531.3182983398438,
"loss": 0.4827,
"rewards/accuracies": 0.7416667342185974,
"rewards/chosen": -182.42454528808594,
"rewards/margins": 120.93087005615234,
"rewards/rejected": -303.35540771484375,
"step": 870
},
{
"epoch": 0.8635917566241413,
"grad_norm": 42.21210418756789,
"learning_rate": 2.7814928374537334e-08,
"logits/chosen": 6968.44384765625,
"logits/rejected": 5644.8955078125,
"logps/chosen": -539.173828125,
"logps/rejected": -613.7080078125,
"loss": 0.4725,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": -196.6251220703125,
"rewards/margins": 135.3355255126953,
"rewards/rejected": -331.96063232421875,
"step": 880
},
{
"epoch": 0.873405299313052,
"grad_norm": 48.62475030995162,
"learning_rate": 2.4019845032570875e-08,
"logits/chosen": 6289.82763671875,
"logits/rejected": 4878.1728515625,
"logps/chosen": -469.8004455566406,
"logps/rejected": -565.7530517578125,
"loss": 0.4788,
"rewards/accuracies": 0.8166667222976685,
"rewards/chosen": -185.22061157226562,
"rewards/margins": 133.4977264404297,
"rewards/rejected": -318.7183532714844,
"step": 890
},
{
"epoch": 0.8832188420019627,
"grad_norm": 35.719224006833315,
"learning_rate": 2.0489971081290193e-08,
"logits/chosen": 5738.51318359375,
"logits/rejected": 4603.50439453125,
"logps/chosen": -483.54791259765625,
"logps/rejected": -549.1290283203125,
"loss": 0.4808,
"rewards/accuracies": 0.7249999642372131,
"rewards/chosen": -206.58328247070312,
"rewards/margins": 97.61729431152344,
"rewards/rejected": -304.2005920410156,
"step": 900
},
{
"epoch": 0.8930323846908734,
"grad_norm": 43.132524531606194,
"learning_rate": 1.7229449166406477e-08,
"logits/chosen": 5693.8486328125,
"logits/rejected": 4534.4052734375,
"logps/chosen": -469.5682067871094,
"logps/rejected": -569.1848754882812,
"loss": 0.4531,
"rewards/accuracies": 0.8083332777023315,
"rewards/chosen": -204.99484252929688,
"rewards/margins": 132.3409881591797,
"rewards/rejected": -337.3358154296875,
"step": 910
},
{
"epoch": 0.9028459273797841,
"grad_norm": 34.73048158998948,
"learning_rate": 1.4242105823176837e-08,
"logits/chosen": 6962.6904296875,
"logits/rejected": 5748.6943359375,
"logps/chosen": -525.45068359375,
"logps/rejected": -564.1856689453125,
"loss": 0.4516,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -192.100830078125,
"rewards/margins": 99.60045623779297,
"rewards/rejected": -291.7012939453125,
"step": 920
},
{
"epoch": 0.9126594700686947,
"grad_norm": 34.47893895251098,
"learning_rate": 1.1531446985597604e-08,
"logits/chosen": 5990.88525390625,
"logits/rejected": 5583.560546875,
"logps/chosen": -485.2509765625,
"logps/rejected": -572.419921875,
"loss": 0.4586,
"rewards/accuracies": 0.8250001072883606,
"rewards/chosen": -192.66342163085938,
"rewards/margins": 110.31678771972656,
"rewards/rejected": -302.9801940917969,
"step": 930
},
{
"epoch": 0.9224730127576055,
"grad_norm": 38.65102653124819,
"learning_rate": 9.100653871854963e-09,
"logits/chosen": 5348.1103515625,
"logits/rejected": 4875.837890625,
"logps/chosen": -461.71697998046875,
"logps/rejected": -564.880126953125,
"loss": 0.4878,
"rewards/accuracies": 0.7583333253860474,
"rewards/chosen": -193.52760314941406,
"rewards/margins": 111.4560546875,
"rewards/rejected": -304.98358154296875,
"step": 940
},
{
"epoch": 0.9322865554465162,
"grad_norm": 35.36010410132843,
"learning_rate": 6.9525792508597634e-09,
"logits/chosen": 5099.234375,
"logits/rejected": 4961.53466796875,
"logps/chosen": -456.7210388183594,
"logps/rejected": -571.4191284179688,
"loss": 0.496,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -211.7642364501953,
"rewards/margins": 97.53651428222656,
"rewards/rejected": -309.30072021484375,
"step": 950
},
{
"epoch": 0.9421000981354269,
"grad_norm": 60.3061901160388,
"learning_rate": 5.089744094249837e-09,
"logits/chosen": 6198.19091796875,
"logits/rejected": 5164.39013671875,
"logps/chosen": -477.7798767089844,
"logps/rejected": -606.0765991210938,
"loss": 0.4522,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -207.751953125,
"rewards/margins": 137.81539916992188,
"rewards/rejected": -345.56732177734375,
"step": 960
},
{
"epoch": 0.9519136408243376,
"grad_norm": 38.92788774449534,
"learning_rate": 3.5143346177878565e-09,
"logits/chosen": 6070.90673828125,
"logits/rejected": 5626.92578125,
"logps/chosen": -508.4833068847656,
"logps/rejected": -613.7086791992188,
"loss": 0.489,
"rewards/accuracies": 0.7750000357627869,
"rewards/chosen": -202.98733520507812,
"rewards/margins": 117.0799560546875,
"rewards/rejected": -320.0672912597656,
"step": 970
},
{
"epoch": 0.9617271835132483,
"grad_norm": 54.09136710237634,
"learning_rate": 2.2281997156273213e-09,
"logits/chosen": 6383.44775390625,
"logits/rejected": 5800.46484375,
"logps/chosen": -531.06884765625,
"logps/rejected": -637.2273559570312,
"loss": 0.5199,
"rewards/accuracies": 0.7500000596046448,
"rewards/chosen": -214.9575958251953,
"rewards/margins": 97.85444641113281,
"rewards/rejected": -312.81207275390625,
"step": 980
},
{
"epoch": 0.971540726202159,
"grad_norm": 37.58754664399141,
"learning_rate": 1.2328487904580131e-09,
"logits/chosen": 5965.31982421875,
"logits/rejected": 4487.17431640625,
"logps/chosen": -527.6492919921875,
"logps/rejected": -595.473876953125,
"loss": 0.4605,
"rewards/accuracies": 0.7833333611488342,
"rewards/chosen": -219.84896850585938,
"rewards/margins": 120.89280700683594,
"rewards/rejected": -340.7417907714844,
"step": 990
},
{
"epoch": 0.9813542688910697,
"grad_norm": 29.779875569619673,
"learning_rate": 5.29449982077046e-10,
"logits/chosen": 5757.50439453125,
"logits/rejected": 5476.619140625,
"logps/chosen": -460.2972106933594,
"logps/rejected": -551.8410034179688,
"loss": 0.4694,
"rewards/accuracies": 0.7249999642372131,
"rewards/chosen": -186.51734924316406,
"rewards/margins": 90.38455963134766,
"rewards/rejected": -276.90191650390625,
"step": 1000
},
{
"epoch": 0.9911678115799804,
"grad_norm": 27.069636754429258,
"learning_rate": 1.1882879646485379e-10,
"logits/chosen": 6565.1044921875,
"logits/rejected": 5212.9794921875,
"logps/chosen": -543.57421875,
"logps/rejected": -614.15185546875,
"loss": 0.4409,
"rewards/accuracies": 0.7416666746139526,
"rewards/chosen": -200.32835388183594,
"rewards/margins": 119.9702377319336,
"rewards/rejected": -320.298583984375,
"step": 1010
},
{
"epoch": 1.0,
"step": 1019,
"total_flos": 0.0,
"train_loss": 0.5212765811586988,
"train_runtime": 13234.9919,
"train_samples_per_second": 4.619,
"train_steps_per_second": 0.077
}
],
"logging_steps": 10,
"max_steps": 1019,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}