zephyr-7b-dpo-qlora / trainer_state.json
L1nkee's picture
Model save
e952b41 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 3821,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00026171159382360636,
"grad_norm": 2.4548187255859375,
"learning_rate": 1.3054830287206268e-08,
"logits/chosen": -2.4529099464416504,
"logits/rejected": -2.357592821121216,
"logps/chosen": -290.4953308105469,
"logps/rejected": -374.6131591796875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0026171159382360636,
"grad_norm": 2.4077019691467285,
"learning_rate": 1.3054830287206266e-07,
"logits/chosen": -2.281538724899292,
"logits/rejected": -2.181558132171631,
"logps/chosen": -279.58404541015625,
"logps/rejected": -245.3978729248047,
"loss": 0.6931,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": -0.00010121504601556808,
"rewards/margins": 3.008971361850854e-05,
"rewards/rejected": -0.00013130476872902364,
"step": 10
},
{
"epoch": 0.005234231876472127,
"grad_norm": 2.513380527496338,
"learning_rate": 2.610966057441253e-07,
"logits/chosen": -2.2868807315826416,
"logits/rejected": -2.13252329826355,
"logps/chosen": -305.4561462402344,
"logps/rejected": -237.63320922851562,
"loss": 0.6924,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.003805191023275256,
"rewards/margins": 0.0015581005718559027,
"rewards/rejected": 0.00224709021858871,
"step": 20
},
{
"epoch": 0.007851347814708191,
"grad_norm": 2.3212156295776367,
"learning_rate": 3.9164490861618804e-07,
"logits/chosen": -2.271130084991455,
"logits/rejected": -2.2239737510681152,
"logps/chosen": -251.1245574951172,
"logps/rejected": -251.23959350585938,
"loss": 0.6925,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.012071892619132996,
"rewards/margins": 0.0013887647073715925,
"rewards/rejected": 0.010683128610253334,
"step": 30
},
{
"epoch": 0.010468463752944255,
"grad_norm": 1.9601231813430786,
"learning_rate": 5.221932114882506e-07,
"logits/chosen": -2.166848659515381,
"logits/rejected": -2.1315042972564697,
"logps/chosen": -216.14614868164062,
"logps/rejected": -221.613037109375,
"loss": 0.6916,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.018966395407915115,
"rewards/margins": 0.003130494151264429,
"rewards/rejected": 0.015835899859666824,
"step": 40
},
{
"epoch": 0.01308557969118032,
"grad_norm": 2.087867259979248,
"learning_rate": 6.527415143603135e-07,
"logits/chosen": -2.212444305419922,
"logits/rejected": -2.173527956008911,
"logps/chosen": -266.7479553222656,
"logps/rejected": -234.2369384765625,
"loss": 0.6908,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.029817840084433556,
"rewards/margins": 0.004731935448944569,
"rewards/rejected": 0.02508590742945671,
"step": 50
},
{
"epoch": 0.015702695629416383,
"grad_norm": 2.1439507007598877,
"learning_rate": 7.832898172323761e-07,
"logits/chosen": -2.169727325439453,
"logits/rejected": -2.1059727668762207,
"logps/chosen": -252.1941375732422,
"logps/rejected": -226.5443115234375,
"loss": 0.69,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.032556358724832535,
"rewards/margins": 0.006436903961002827,
"rewards/rejected": 0.026119451969861984,
"step": 60
},
{
"epoch": 0.018319811567652448,
"grad_norm": 2.062434673309326,
"learning_rate": 9.138381201044387e-07,
"logits/chosen": -2.3089659214019775,
"logits/rejected": -2.1857352256774902,
"logps/chosen": -271.83209228515625,
"logps/rejected": -246.536376953125,
"loss": 0.6874,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.042484961450099945,
"rewards/margins": 0.011770189739763737,
"rewards/rejected": 0.03071477636694908,
"step": 70
},
{
"epoch": 0.02093692750588851,
"grad_norm": 2.387071132659912,
"learning_rate": 1.0443864229765013e-06,
"logits/chosen": -2.2028708457946777,
"logits/rejected": -2.112464427947998,
"logps/chosen": -257.39007568359375,
"logps/rejected": -246.71914672851562,
"loss": 0.6875,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.038935337215662,
"rewards/margins": 0.011790206655859947,
"rewards/rejected": 0.027145132422447205,
"step": 80
},
{
"epoch": 0.023554043444124574,
"grad_norm": 2.23413348197937,
"learning_rate": 1.1749347258485642e-06,
"logits/chosen": -2.209949493408203,
"logits/rejected": -2.136049509048462,
"logps/chosen": -249.93435668945312,
"logps/rejected": -234.4086151123047,
"loss": 0.6838,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0436517670750618,
"rewards/margins": 0.01937195286154747,
"rewards/rejected": 0.02427981235086918,
"step": 90
},
{
"epoch": 0.02617115938236064,
"grad_norm": 2.0995676517486572,
"learning_rate": 1.305483028720627e-06,
"logits/chosen": -2.250483751296997,
"logits/rejected": -2.1787195205688477,
"logps/chosen": -246.7249755859375,
"logps/rejected": -230.78726196289062,
"loss": 0.6809,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.04966636374592781,
"rewards/margins": 0.025339443236589432,
"rewards/rejected": 0.02432691864669323,
"step": 100
},
{
"epoch": 0.02617115938236064,
"eval_logits/chosen": -2.1487553119659424,
"eval_logits/rejected": -2.0557751655578613,
"eval_logps/chosen": -259.4205627441406,
"eval_logps/rejected": -241.98690795898438,
"eval_loss": 0.6806859970092773,
"eval_rewards/accuracies": 0.6579999923706055,
"eval_rewards/chosen": 0.051943570375442505,
"eval_rewards/margins": 0.026209397241473198,
"eval_rewards/rejected": 0.02573416940867901,
"eval_runtime": 1628.166,
"eval_samples_per_second": 1.228,
"eval_steps_per_second": 0.154,
"step": 100
},
{
"epoch": 0.028788275320596704,
"grad_norm": 2.388845682144165,
"learning_rate": 1.4360313315926894e-06,
"logits/chosen": -2.241703987121582,
"logits/rejected": -2.1246142387390137,
"logps/chosen": -284.23797607421875,
"logps/rejected": -239.15945434570312,
"loss": 0.6769,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.05070078372955322,
"rewards/margins": 0.034246720373630524,
"rewards/rejected": 0.016454065218567848,
"step": 110
},
{
"epoch": 0.031405391258832765,
"grad_norm": 2.2597317695617676,
"learning_rate": 1.5665796344647521e-06,
"logits/chosen": -2.2740626335144043,
"logits/rejected": -2.1607062816619873,
"logps/chosen": -287.3439025878906,
"logps/rejected": -272.5523681640625,
"loss": 0.6695,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.05601048469543457,
"rewards/margins": 0.049991391599178314,
"rewards/rejected": 0.00601908378303051,
"step": 120
},
{
"epoch": 0.03402250719706883,
"grad_norm": 2.850572109222412,
"learning_rate": 1.6971279373368146e-06,
"logits/chosen": -2.2888760566711426,
"logits/rejected": -2.1915862560272217,
"logps/chosen": -250.40792846679688,
"logps/rejected": -254.3070831298828,
"loss": 0.664,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.046189673244953156,
"rewards/margins": 0.06280346214771271,
"rewards/rejected": -0.016613787040114403,
"step": 130
},
{
"epoch": 0.036639623135304895,
"grad_norm": 2.827874183654785,
"learning_rate": 1.8276762402088774e-06,
"logits/chosen": -2.284651041030884,
"logits/rejected": -2.0745301246643066,
"logps/chosen": -272.6170349121094,
"logps/rejected": -229.8065185546875,
"loss": 0.6623,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.02093890681862831,
"rewards/margins": 0.06832924485206604,
"rewards/rejected": -0.047390345484018326,
"step": 140
},
{
"epoch": 0.03925673907354096,
"grad_norm": 3.0178773403167725,
"learning_rate": 1.9582245430809403e-06,
"logits/chosen": -2.2895896434783936,
"logits/rejected": -2.168560028076172,
"logps/chosen": -283.7343444824219,
"logps/rejected": -248.0662078857422,
"loss": 0.6607,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.0016987703274935484,
"rewards/margins": 0.07390020042657852,
"rewards/rejected": -0.0722014307975769,
"step": 150
},
{
"epoch": 0.04187385501177702,
"grad_norm": 3.1758434772491455,
"learning_rate": 2.0887728459530026e-06,
"logits/chosen": -2.209841251373291,
"logits/rejected": -2.150552988052368,
"logps/chosen": -262.22222900390625,
"logps/rejected": -269.7354431152344,
"loss": 0.6631,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0545073039829731,
"rewards/margins": 0.07355803996324539,
"rewards/rejected": -0.1280653327703476,
"step": 160
},
{
"epoch": 0.04449097095001309,
"grad_norm": 3.9003469944000244,
"learning_rate": 2.2193211488250653e-06,
"logits/chosen": -2.1911962032318115,
"logits/rejected": -2.110661506652832,
"logps/chosen": -227.29232788085938,
"logps/rejected": -236.85684204101562,
"loss": 0.6631,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.0682467371225357,
"rewards/margins": 0.07278834283351898,
"rewards/rejected": -0.14103509485721588,
"step": 170
},
{
"epoch": 0.04710808688824915,
"grad_norm": 4.6043620109558105,
"learning_rate": 2.3498694516971284e-06,
"logits/chosen": -2.207650661468506,
"logits/rejected": -2.1264257431030273,
"logps/chosen": -273.5342712402344,
"logps/rejected": -269.66876220703125,
"loss": 0.6548,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.17898549139499664,
"rewards/margins": 0.09451718628406525,
"rewards/rejected": -0.2735026478767395,
"step": 180
},
{
"epoch": 0.04972520282648522,
"grad_norm": 4.391633987426758,
"learning_rate": 2.4804177545691907e-06,
"logits/chosen": -2.28275990486145,
"logits/rejected": -2.165226936340332,
"logps/chosen": -282.49395751953125,
"logps/rejected": -268.0941467285156,
"loss": 0.6364,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.12487462908029556,
"rewards/margins": 0.13966041803359985,
"rewards/rejected": -0.26453500986099243,
"step": 190
},
{
"epoch": 0.05234231876472128,
"grad_norm": 5.526188373565674,
"learning_rate": 2.610966057441254e-06,
"logits/chosen": -2.2079992294311523,
"logits/rejected": -2.086857318878174,
"logps/chosen": -267.3877258300781,
"logps/rejected": -241.93032836914062,
"loss": 0.6438,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.17729662358760834,
"rewards/margins": 0.12684157490730286,
"rewards/rejected": -0.3041382431983948,
"step": 200
},
{
"epoch": 0.05234231876472128,
"eval_logits/chosen": -2.1026034355163574,
"eval_logits/rejected": -2.0144851207733154,
"eval_logps/chosen": -283.6620788574219,
"eval_logps/rejected": -278.8497009277344,
"eval_loss": 0.6350578665733337,
"eval_rewards/accuracies": 0.6800000071525574,
"eval_rewards/chosen": -0.1904720813035965,
"eval_rewards/margins": 0.1524215042591095,
"eval_rewards/rejected": -0.3428936302661896,
"eval_runtime": 1725.681,
"eval_samples_per_second": 1.159,
"eval_steps_per_second": 0.145,
"step": 200
},
{
"epoch": 0.05495943470295734,
"grad_norm": 4.445857524871826,
"learning_rate": 2.741514360313316e-06,
"logits/chosen": -2.2572193145751953,
"logits/rejected": -2.1230695247650146,
"logps/chosen": -278.8066101074219,
"logps/rejected": -266.4608459472656,
"loss": 0.6154,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1674547642469406,
"rewards/margins": 0.19266971945762634,
"rewards/rejected": -0.36012452840805054,
"step": 210
},
{
"epoch": 0.05757655064119341,
"grad_norm": 6.93760871887207,
"learning_rate": 2.872062663185379e-06,
"logits/chosen": -2.1456832885742188,
"logits/rejected": -2.0984296798706055,
"logps/chosen": -272.310546875,
"logps/rejected": -267.6050720214844,
"loss": 0.6093,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.18377810716629028,
"rewards/margins": 0.21743841469287872,
"rewards/rejected": -0.4012165069580078,
"step": 220
},
{
"epoch": 0.06019366657942947,
"grad_norm": 11.938814163208008,
"learning_rate": 3.0026109660574416e-06,
"logits/chosen": -2.285099744796753,
"logits/rejected": -2.1850409507751465,
"logps/chosen": -349.00244140625,
"logps/rejected": -328.79248046875,
"loss": 0.6575,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.4292556345462799,
"rewards/margins": 0.15273931622505188,
"rewards/rejected": -0.5819950103759766,
"step": 230
},
{
"epoch": 0.06281078251766553,
"grad_norm": 10.254609107971191,
"learning_rate": 3.1331592689295043e-06,
"logits/chosen": -2.1714115142822266,
"logits/rejected": -2.1014115810394287,
"logps/chosen": -335.92095947265625,
"logps/rejected": -342.66046142578125,
"loss": 0.6287,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.39719343185424805,
"rewards/margins": 0.21806569397449493,
"rewards/rejected": -0.6152591109275818,
"step": 240
},
{
"epoch": 0.06542789845590159,
"grad_norm": 5.5169548988342285,
"learning_rate": 3.263707571801567e-06,
"logits/chosen": -2.180147647857666,
"logits/rejected": -2.158116579055786,
"logps/chosen": -300.8548278808594,
"logps/rejected": -300.14276123046875,
"loss": 0.6065,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.4140791893005371,
"rewards/margins": 0.2470981329679489,
"rewards/rejected": -0.6611773371696472,
"step": 250
},
{
"epoch": 0.06804501439413765,
"grad_norm": 8.337285995483398,
"learning_rate": 3.3942558746736293e-06,
"logits/chosen": -2.213538646697998,
"logits/rejected": -2.089621067047119,
"logps/chosen": -320.1325378417969,
"logps/rejected": -318.0543518066406,
"loss": 0.6311,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.478567510843277,
"rewards/margins": 0.1991441696882248,
"rewards/rejected": -0.6777117252349854,
"step": 260
},
{
"epoch": 0.07066213033237373,
"grad_norm": 4.801840305328369,
"learning_rate": 3.524804177545692e-06,
"logits/chosen": -2.1710739135742188,
"logits/rejected": -2.1057240962982178,
"logps/chosen": -291.7244873046875,
"logps/rejected": -287.46148681640625,
"loss": 0.598,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.3111107349395752,
"rewards/margins": 0.27230924367904663,
"rewards/rejected": -0.583419919013977,
"step": 270
},
{
"epoch": 0.07327924627060979,
"grad_norm": 6.222449779510498,
"learning_rate": 3.6553524804177547e-06,
"logits/chosen": -2.206630229949951,
"logits/rejected": -2.086956739425659,
"logps/chosen": -302.73236083984375,
"logps/rejected": -307.13238525390625,
"loss": 0.6125,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.438936710357666,
"rewards/margins": 0.25869089365005493,
"rewards/rejected": -0.6976275444030762,
"step": 280
},
{
"epoch": 0.07589636220884585,
"grad_norm": 4.925107955932617,
"learning_rate": 3.7859007832898174e-06,
"logits/chosen": -2.1828970909118652,
"logits/rejected": -2.1146278381347656,
"logps/chosen": -345.37225341796875,
"logps/rejected": -351.8639221191406,
"loss": 0.5993,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.5980569124221802,
"rewards/margins": 0.31618183851242065,
"rewards/rejected": -0.9142388105392456,
"step": 290
},
{
"epoch": 0.07851347814708191,
"grad_norm": 9.889649391174316,
"learning_rate": 3.9164490861618806e-06,
"logits/chosen": -2.195115327835083,
"logits/rejected": -2.0654757022857666,
"logps/chosen": -292.487060546875,
"logps/rejected": -312.0032043457031,
"loss": 0.5829,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.48439502716064453,
"rewards/margins": 0.3321578800678253,
"rewards/rejected": -0.8165529370307922,
"step": 300
},
{
"epoch": 0.07851347814708191,
"eval_logits/chosen": -2.134176731109619,
"eval_logits/rejected": -2.0507938861846924,
"eval_logps/chosen": -309.2386474609375,
"eval_logps/rejected": -315.8948974609375,
"eval_loss": 0.6071631908416748,
"eval_rewards/accuracies": 0.6779999732971191,
"eval_rewards/chosen": -0.44623735547065735,
"eval_rewards/margins": 0.26710858941078186,
"eval_rewards/rejected": -0.7133459448814392,
"eval_runtime": 1583.0322,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 300
},
{
"epoch": 0.08113059408531798,
"grad_norm": 7.277074813842773,
"learning_rate": 4.046997389033943e-06,
"logits/chosen": -2.3288633823394775,
"logits/rejected": -2.2150847911834717,
"logps/chosen": -340.07061767578125,
"logps/rejected": -320.6544494628906,
"loss": 0.575,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.5125407576560974,
"rewards/margins": 0.3419082760810852,
"rewards/rejected": -0.8544490933418274,
"step": 310
},
{
"epoch": 0.08374771002355404,
"grad_norm": 8.367517471313477,
"learning_rate": 4.177545691906005e-06,
"logits/chosen": -2.236116647720337,
"logits/rejected": -2.1389379501342773,
"logps/chosen": -327.5163879394531,
"logps/rejected": -331.678955078125,
"loss": 0.6384,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6931655406951904,
"rewards/margins": 0.21217937767505646,
"rewards/rejected": -0.9053448438644409,
"step": 320
},
{
"epoch": 0.08636482596179011,
"grad_norm": 4.962728977203369,
"learning_rate": 4.308093994778068e-06,
"logits/chosen": -2.129941463470459,
"logits/rejected": -2.0937793254852295,
"logps/chosen": -362.57940673828125,
"logps/rejected": -357.7266845703125,
"loss": 0.631,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0307652950286865,
"rewards/margins": 0.1927981674671173,
"rewards/rejected": -1.2235634326934814,
"step": 330
},
{
"epoch": 0.08898194190002617,
"grad_norm": 5.679270267486572,
"learning_rate": 4.4386422976501306e-06,
"logits/chosen": -2.1559038162231445,
"logits/rejected": -2.1013383865356445,
"logps/chosen": -398.1541442871094,
"logps/rejected": -407.53564453125,
"loss": 0.6093,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0855846405029297,
"rewards/margins": 0.30799657106399536,
"rewards/rejected": -1.3935811519622803,
"step": 340
},
{
"epoch": 0.09159905783826224,
"grad_norm": 6.67193603515625,
"learning_rate": 4.569190600522193e-06,
"logits/chosen": -1.9782785177230835,
"logits/rejected": -1.8998119831085205,
"logps/chosen": -430.1962890625,
"logps/rejected": -446.568359375,
"loss": 0.6298,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.410946011543274,
"rewards/margins": 0.2784286439418793,
"rewards/rejected": -1.6893746852874756,
"step": 350
},
{
"epoch": 0.0942161737764983,
"grad_norm": 6.463137149810791,
"learning_rate": 4.699738903394257e-06,
"logits/chosen": -1.8780485391616821,
"logits/rejected": -1.8210369348526,
"logps/chosen": -398.168212890625,
"logps/rejected": -414.5179748535156,
"loss": 0.5745,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.5719618797302246,
"rewards/margins": 0.37264296412467957,
"rewards/rejected": -1.9446048736572266,
"step": 360
},
{
"epoch": 0.09683328971473436,
"grad_norm": 9.000224113464355,
"learning_rate": 4.8302872062663196e-06,
"logits/chosen": -1.8958412408828735,
"logits/rejected": -1.7773358821868896,
"logps/chosen": -420.69122314453125,
"logps/rejected": -423.59820556640625,
"loss": 0.5596,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.4388353824615479,
"rewards/margins": 0.4672966003417969,
"rewards/rejected": -1.9061321020126343,
"step": 370
},
{
"epoch": 0.09945040565297043,
"grad_norm": 8.802579879760742,
"learning_rate": 4.9608355091383814e-06,
"logits/chosen": -1.8825538158416748,
"logits/rejected": -1.7338628768920898,
"logps/chosen": -393.0355529785156,
"logps/rejected": -406.0670166015625,
"loss": 0.5634,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9511327743530273,
"rewards/margins": 0.46367520093917847,
"rewards/rejected": -1.4148077964782715,
"step": 380
},
{
"epoch": 0.1020675215912065,
"grad_norm": 13.02705192565918,
"learning_rate": 4.9999488562447675e-06,
"logits/chosen": -1.7599290609359741,
"logits/rejected": -1.6825551986694336,
"logps/chosen": -410.8023376464844,
"logps/rejected": -442.77069091796875,
"loss": 0.5788,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.298668622970581,
"rewards/margins": 0.5200435519218445,
"rewards/rejected": -1.8187124729156494,
"step": 390
},
{
"epoch": 0.10468463752944256,
"grad_norm": 13.382437705993652,
"learning_rate": 4.999698361256577e-06,
"logits/chosen": -1.7113679647445679,
"logits/rejected": -1.6131651401519775,
"logps/chosen": -458.7721252441406,
"logps/rejected": -455.18988037109375,
"loss": 0.6201,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.9338643550872803,
"rewards/margins": 0.3708694577217102,
"rewards/rejected": -2.3047337532043457,
"step": 400
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -1.7201672792434692,
"eval_logits/rejected": -1.637402057647705,
"eval_logps/chosen": -413.682861328125,
"eval_logps/rejected": -439.98870849609375,
"eval_loss": 0.5892496705055237,
"eval_rewards/accuracies": 0.684499979019165,
"eval_rewards/chosen": -1.4906798601150513,
"eval_rewards/margins": 0.4636039435863495,
"eval_rewards/rejected": -1.9542837142944336,
"eval_runtime": 1582.6906,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 400
},
{
"epoch": 0.10730175346767862,
"grad_norm": 5.950835704803467,
"learning_rate": 4.999239142174581e-06,
"logits/chosen": -1.8206182718276978,
"logits/rejected": -1.76302969455719,
"logps/chosen": -344.7633056640625,
"logps/rejected": -375.6517639160156,
"loss": 0.6138,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.0250967741012573,
"rewards/margins": 0.3263424336910248,
"rewards/rejected": -1.3514392375946045,
"step": 410
},
{
"epoch": 0.10991886940591468,
"grad_norm": 7.535224437713623,
"learning_rate": 4.99857123734344e-06,
"logits/chosen": -1.721353530883789,
"logits/rejected": -1.5896873474121094,
"logps/chosen": -316.44989013671875,
"logps/rejected": -364.8625183105469,
"loss": 0.5309,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.8779315948486328,
"rewards/margins": 0.5743580460548401,
"rewards/rejected": -1.4522895812988281,
"step": 420
},
{
"epoch": 0.11253598534415074,
"grad_norm": 7.4984517097473145,
"learning_rate": 4.997694702533016e-06,
"logits/chosen": -1.5961456298828125,
"logits/rejected": -1.5111815929412842,
"logps/chosen": -389.32958984375,
"logps/rejected": -431.39727783203125,
"loss": 0.5386,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.118457555770874,
"rewards/margins": 0.6571696400642395,
"rewards/rejected": -1.7756273746490479,
"step": 430
},
{
"epoch": 0.11515310128238682,
"grad_norm": 8.746241569519043,
"learning_rate": 4.996609610933713e-06,
"logits/chosen": -1.7189710140228271,
"logits/rejected": -1.6461889743804932,
"logps/chosen": -359.4372253417969,
"logps/rejected": -380.6383361816406,
"loss": 0.5959,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8736177682876587,
"rewards/margins": 0.4956343173980713,
"rewards/rejected": -1.3692519664764404,
"step": 440
},
{
"epoch": 0.11777021722062288,
"grad_norm": 10.564971923828125,
"learning_rate": 4.995316053150366e-06,
"logits/chosen": -1.5438826084136963,
"logits/rejected": -1.46512770652771,
"logps/chosen": -384.11663818359375,
"logps/rejected": -418.8710021972656,
"loss": 0.5508,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1636836528778076,
"rewards/margins": 0.5783780813217163,
"rewards/rejected": -1.7420616149902344,
"step": 450
},
{
"epoch": 0.12038733315885894,
"grad_norm": 11.805535316467285,
"learning_rate": 4.9938141371946815e-06,
"logits/chosen": -1.4091435670852661,
"logits/rejected": -1.3203108310699463,
"logps/chosen": -540.69287109375,
"logps/rejected": -591.8529052734375,
"loss": 0.5454,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.756392478942871,
"rewards/margins": 0.657163679599762,
"rewards/rejected": -3.4135565757751465,
"step": 460
},
{
"epoch": 0.123004449097095,
"grad_norm": 9.790424346923828,
"learning_rate": 4.992103988476206e-06,
"logits/chosen": -1.5596047639846802,
"logits/rejected": -1.4357885122299194,
"logps/chosen": -443.34210205078125,
"logps/rejected": -497.86248779296875,
"loss": 0.5384,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.9961204528808594,
"rewards/margins": 0.6699923276901245,
"rewards/rejected": -2.6661131381988525,
"step": 470
},
{
"epoch": 0.12562156503533106,
"grad_norm": 5.368061542510986,
"learning_rate": 4.990185749791866e-06,
"logits/chosen": -1.7159227132797241,
"logits/rejected": -1.5995564460754395,
"logps/chosen": -349.0379943847656,
"logps/rejected": -414.6690979003906,
"loss": 0.5272,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9223679304122925,
"rewards/margins": 0.6385096907615662,
"rewards/rejected": -1.560877799987793,
"step": 480
},
{
"epoch": 0.12823868097356714,
"grad_norm": 14.292590141296387,
"learning_rate": 4.9880595813140395e-06,
"logits/chosen": -1.705514669418335,
"logits/rejected": -1.5731843709945679,
"logps/chosen": -385.6563720703125,
"logps/rejected": -410.178955078125,
"loss": 0.5619,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.0085517168045044,
"rewards/margins": 0.5483412742614746,
"rewards/rejected": -1.5568931102752686,
"step": 490
},
{
"epoch": 0.13085579691180318,
"grad_norm": 8.968839645385742,
"learning_rate": 4.985725660577184e-06,
"logits/chosen": -1.5214301347732544,
"logits/rejected": -1.3472332954406738,
"logps/chosen": -424.3779296875,
"logps/rejected": -453.65496826171875,
"loss": 0.5798,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.5070832967758179,
"rewards/margins": 0.7050014734268188,
"rewards/rejected": -2.2120845317840576,
"step": 500
},
{
"epoch": 0.13085579691180318,
"eval_logits/chosen": -1.3375675678253174,
"eval_logits/rejected": -1.204632043838501,
"eval_logps/chosen": -395.8431701660156,
"eval_logps/rejected": -444.97088623046875,
"eval_loss": 0.5667398571968079,
"eval_rewards/accuracies": 0.7020000219345093,
"eval_rewards/chosen": -1.3122824430465698,
"eval_rewards/margins": 0.6918234825134277,
"eval_rewards/rejected": -2.004106044769287,
"eval_runtime": 1590.0362,
"eval_samples_per_second": 1.258,
"eval_steps_per_second": 0.157,
"step": 500
},
{
"epoch": 0.13347291285003926,
"grad_norm": 18.905487060546875,
"learning_rate": 4.983184182463009e-06,
"logits/chosen": -1.4677969217300415,
"logits/rejected": -1.3418806791305542,
"logps/chosen": -405.25433349609375,
"logps/rejected": -447.4716796875,
"loss": 0.5465,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.2714810371398926,
"rewards/margins": 0.791718602180481,
"rewards/rejected": -2.063199520111084,
"step": 510
},
{
"epoch": 0.1360900287882753,
"grad_norm": 14.580382347106934,
"learning_rate": 4.980435359184203e-06,
"logits/chosen": -1.5167419910430908,
"logits/rejected": -1.4652340412139893,
"logps/chosen": -385.03106689453125,
"logps/rejected": -418.76470947265625,
"loss": 0.6033,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.12017822265625,
"rewards/margins": 0.5001946687698364,
"rewards/rejected": -1.6203731298446655,
"step": 520
},
{
"epoch": 0.13870714472651138,
"grad_norm": 6.707192420959473,
"learning_rate": 4.9774794202667236e-06,
"logits/chosen": -1.4496371746063232,
"logits/rejected": -1.4301903247833252,
"logps/chosen": -357.41998291015625,
"logps/rejected": -416.819091796875,
"loss": 0.5662,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9647631645202637,
"rewards/margins": 0.5089520215988159,
"rewards/rejected": -1.47371506690979,
"step": 530
},
{
"epoch": 0.14132426066474746,
"grad_norm": 11.740205764770508,
"learning_rate": 4.974316612530615e-06,
"logits/chosen": -1.251502513885498,
"logits/rejected": -1.0849131345748901,
"logps/chosen": -420.04437255859375,
"logps/rejected": -463.4141540527344,
"loss": 0.4635,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -1.4166905879974365,
"rewards/margins": 0.9010774493217468,
"rewards/rejected": -2.317768096923828,
"step": 540
},
{
"epoch": 0.1439413766029835,
"grad_norm": 13.1097412109375,
"learning_rate": 4.970947200069416e-06,
"logits/chosen": -1.1829249858856201,
"logits/rejected": -1.1032363176345825,
"logps/chosen": -457.55242919921875,
"logps/rejected": -504.1400451660156,
"loss": 0.6298,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.7912673950195312,
"rewards/margins": 0.6638978719711304,
"rewards/rejected": -2.455165386199951,
"step": 550
},
{
"epoch": 0.14655849254121958,
"grad_norm": 5.982730388641357,
"learning_rate": 4.967371464228096e-06,
"logits/chosen": -1.4941701889038086,
"logits/rejected": -1.3918794393539429,
"logps/chosen": -399.75579833984375,
"logps/rejected": -464.8260192871094,
"loss": 0.529,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4229604005813599,
"rewards/margins": 0.6473892331123352,
"rewards/rejected": -2.0703494548797607,
"step": 560
},
{
"epoch": 0.14917560847945563,
"grad_norm": 8.946754455566406,
"learning_rate": 4.963589703579569e-06,
"logits/chosen": -1.5534603595733643,
"logits/rejected": -1.4113835096359253,
"logps/chosen": -472.273681640625,
"logps/rejected": -499.8206481933594,
"loss": 0.5544,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.722957968711853,
"rewards/margins": 0.6225946545600891,
"rewards/rejected": -2.345552921295166,
"step": 570
},
{
"epoch": 0.1517927244176917,
"grad_norm": 10.59168529510498,
"learning_rate": 4.9596022338997615e-06,
"logits/chosen": -1.4608839750289917,
"logits/rejected": -1.2478128671646118,
"logps/chosen": -492.757080078125,
"logps/rejected": -528.3729248046875,
"loss": 0.5337,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9507982730865479,
"rewards/margins": 0.7664415240287781,
"rewards/rejected": -2.7172398567199707,
"step": 580
},
{
"epoch": 0.15440984035592778,
"grad_norm": 6.83862829208374,
"learning_rate": 4.955409388141243e-06,
"logits/chosen": -1.38741135597229,
"logits/rejected": -1.2707990407943726,
"logps/chosen": -408.0611572265625,
"logps/rejected": -441.0558166503906,
"loss": 0.5872,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.50876784324646,
"rewards/margins": 0.5663283467292786,
"rewards/rejected": -2.0750961303710938,
"step": 590
},
{
"epoch": 0.15702695629416383,
"grad_norm": 5.814328193664551,
"learning_rate": 4.951011516405429e-06,
"logits/chosen": -1.5240575075149536,
"logits/rejected": -1.4667167663574219,
"logps/chosen": -355.43487548828125,
"logps/rejected": -398.4039001464844,
"loss": 0.5395,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.0472333431243896,
"rewards/margins": 0.579511821269989,
"rewards/rejected": -1.6267452239990234,
"step": 600
},
{
"epoch": 0.15702695629416383,
"eval_logits/chosen": -1.2780898809432983,
"eval_logits/rejected": -1.1445080041885376,
"eval_logps/chosen": -386.18792724609375,
"eval_logps/rejected": -426.8257751464844,
"eval_loss": 0.5524141192436218,
"eval_rewards/accuracies": 0.703000009059906,
"eval_rewards/chosen": -1.2157301902770996,
"eval_rewards/margins": 0.6069244146347046,
"eval_rewards/rejected": -1.8226546049118042,
"eval_runtime": 1594.9357,
"eval_samples_per_second": 1.254,
"eval_steps_per_second": 0.157,
"step": 600
},
{
"epoch": 0.1596440722323999,
"grad_norm": 9.641666412353516,
"learning_rate": 4.946408985913344e-06,
"logits/chosen": -1.3044933080673218,
"logits/rejected": -1.1979707479476929,
"logps/chosen": -385.60382080078125,
"logps/rejected": -458.82373046875,
"loss": 0.4937,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.398618221282959,
"rewards/margins": 0.9089972376823425,
"rewards/rejected": -2.3076155185699463,
"step": 610
},
{
"epoch": 0.16226118817063595,
"grad_norm": 15.917795181274414,
"learning_rate": 4.941602180974958e-06,
"logits/chosen": -1.1524051427841187,
"logits/rejected": -0.8983421325683594,
"logps/chosen": -500.45880126953125,
"logps/rejected": -547.509765625,
"loss": 0.5479,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.1275360584259033,
"rewards/margins": 1.056699275970459,
"rewards/rejected": -3.1842353343963623,
"step": 620
},
{
"epoch": 0.16487830410887203,
"grad_norm": 10.020596504211426,
"learning_rate": 4.936591502957101e-06,
"logits/chosen": -1.091322422027588,
"logits/rejected": -0.9337531328201294,
"logps/chosen": -459.0081481933594,
"logps/rejected": -551.312255859375,
"loss": 0.5273,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.1100564002990723,
"rewards/margins": 1.0251764059066772,
"rewards/rejected": -3.135232925415039,
"step": 630
},
{
"epoch": 0.16749542004710807,
"grad_norm": 11.079347610473633,
"learning_rate": 4.931377370249946e-06,
"logits/chosen": -1.0583717823028564,
"logits/rejected": -0.8333446383476257,
"logps/chosen": -591.6964111328125,
"logps/rejected": -650.4478149414062,
"loss": 0.5474,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.2645416259765625,
"rewards/margins": 0.8453465700149536,
"rewards/rejected": -4.109888076782227,
"step": 640
},
{
"epoch": 0.17011253598534415,
"grad_norm": 15.071267127990723,
"learning_rate": 4.925960218232073e-06,
"logits/chosen": -1.0942609310150146,
"logits/rejected": -0.9619698524475098,
"logps/chosen": -564.9473876953125,
"logps/rejected": -646.3541259765625,
"loss": 0.5535,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.0973496437072754,
"rewards/margins": 0.9102842211723328,
"rewards/rejected": -4.007634162902832,
"step": 650
},
{
"epoch": 0.17272965192358022,
"grad_norm": 12.748456001281738,
"learning_rate": 4.920340499234116e-06,
"logits/chosen": -1.2404229640960693,
"logits/rejected": -1.0331848859786987,
"logps/chosen": -465.51190185546875,
"logps/rejected": -491.4901428222656,
"loss": 0.5536,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.9967219829559326,
"rewards/margins": 0.611833393573761,
"rewards/rejected": -2.608555555343628,
"step": 660
},
{
"epoch": 0.17534676786181627,
"grad_norm": 9.929147720336914,
"learning_rate": 4.914518682500995e-06,
"logits/chosen": -1.4407885074615479,
"logits/rejected": -1.2742736339569092,
"logps/chosen": -468.04180908203125,
"logps/rejected": -500.22406005859375,
"loss": 0.5299,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.8461347818374634,
"rewards/margins": 0.7285885810852051,
"rewards/rejected": -2.574723243713379,
"step": 670
},
{
"epoch": 0.17796388380005235,
"grad_norm": 7.518274784088135,
"learning_rate": 4.9084952541527315e-06,
"logits/chosen": -1.2451258897781372,
"logits/rejected": -1.081993818283081,
"logps/chosen": -526.33740234375,
"logps/rejected": -566.2431640625,
"loss": 0.5029,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.5310587882995605,
"rewards/margins": 0.8698722124099731,
"rewards/rejected": -3.400930881500244,
"step": 680
},
{
"epoch": 0.1805809997382884,
"grad_norm": 9.582080841064453,
"learning_rate": 4.902270717143858e-06,
"logits/chosen": -1.1439664363861084,
"logits/rejected": -1.0512256622314453,
"logps/chosen": -527.6976318359375,
"logps/rejected": -677.5025634765625,
"loss": 0.4169,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.915743350982666,
"rewards/margins": 1.3812744617462158,
"rewards/rejected": -4.297018051147461,
"step": 690
},
{
"epoch": 0.18319811567652447,
"grad_norm": 6.840953350067139,
"learning_rate": 4.895845591221427e-06,
"logits/chosen": -1.0450663566589355,
"logits/rejected": -0.9686886072158813,
"logps/chosen": -566.4291381835938,
"logps/rejected": -659.4857177734375,
"loss": 0.5278,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.1321775913238525,
"rewards/margins": 0.9749795794487,
"rewards/rejected": -4.1071577072143555,
"step": 700
},
{
"epoch": 0.18319811567652447,
"eval_logits/chosen": -0.8394417762756348,
"eval_logits/rejected": -0.6998772025108337,
"eval_logps/chosen": -578.4380493164062,
"eval_logps/rejected": -649.6521606445312,
"eval_loss": 0.5335752964019775,
"eval_rewards/accuracies": 0.7264999747276306,
"eval_rewards/chosen": -3.138230800628662,
"eval_rewards/margins": 0.9126878976821899,
"eval_rewards/rejected": -4.050919055938721,
"eval_runtime": 1582.5904,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 700
},
{
"epoch": 0.18581523161476055,
"grad_norm": 13.642376899719238,
"learning_rate": 4.8892204128816e-06,
"logits/chosen": -1.055434226989746,
"logits/rejected": -0.9338513612747192,
"logps/chosen": -568.8038940429688,
"logps/rejected": -652.264404296875,
"loss": 0.4835,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.012946844100952,
"rewards/margins": 0.9706674814224243,
"rewards/rejected": -3.983614683151245,
"step": 710
},
{
"epoch": 0.1884323475529966,
"grad_norm": 12.86258316040039,
"learning_rate": 4.882395735324864e-06,
"logits/chosen": -0.9849473834037781,
"logits/rejected": -0.825210452079773,
"logps/chosen": -602.6118774414062,
"logps/rejected": -685.4152221679688,
"loss": 0.4941,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.3656258583068848,
"rewards/margins": 1.0001194477081299,
"rewards/rejected": -4.3657450675964355,
"step": 720
},
{
"epoch": 0.19104946349123267,
"grad_norm": 16.07886505126953,
"learning_rate": 4.87537212840983e-06,
"logits/chosen": -0.8920208811759949,
"logits/rejected": -0.7630427479743958,
"logps/chosen": -712.3763427734375,
"logps/rejected": -763.3992919921875,
"loss": 0.5946,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -4.461544036865234,
"rewards/margins": 0.8164655566215515,
"rewards/rejected": -5.27800989151001,
"step": 730
},
{
"epoch": 0.19366657942946872,
"grad_norm": 13.588937759399414,
"learning_rate": 4.8681501786056545e-06,
"logits/chosen": -1.0917376279830933,
"logits/rejected": -0.9343040585517883,
"logps/chosen": -462.0271911621094,
"logps/rejected": -513.8302001953125,
"loss": 0.4765,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.3388774394989014,
"rewards/margins": 0.840278148651123,
"rewards/rejected": -3.179155111312866,
"step": 740
},
{
"epoch": 0.1962836953677048,
"grad_norm": 14.81460189819336,
"learning_rate": 4.860730488943068e-06,
"logits/chosen": -1.0682129859924316,
"logits/rejected": -1.003150224685669,
"logps/chosen": -441.767333984375,
"logps/rejected": -530.33349609375,
"loss": 0.4932,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.0735113620758057,
"rewards/margins": 0.9047049283981323,
"rewards/rejected": -2.9782164096832275,
"step": 750
},
{
"epoch": 0.19890081130594087,
"grad_norm": 11.297514915466309,
"learning_rate": 4.853113678964022e-06,
"logits/chosen": -1.09920072555542,
"logits/rejected": -1.0103808641433716,
"logps/chosen": -463.56842041015625,
"logps/rejected": -555.3057861328125,
"loss": 0.4855,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.9122604131698608,
"rewards/margins": 0.9966481924057007,
"rewards/rejected": -2.9089083671569824,
"step": 760
},
{
"epoch": 0.20151792724417691,
"grad_norm": 6.034220218658447,
"learning_rate": 4.845300384669958e-06,
"logits/chosen": -1.2919018268585205,
"logits/rejected": -1.1613837480545044,
"logps/chosen": -394.8362121582031,
"logps/rejected": -444.3602600097656,
"loss": 0.5366,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.4172089099884033,
"rewards/margins": 0.7238287329673767,
"rewards/rejected": -2.141037702560425,
"step": 770
},
{
"epoch": 0.204135043182413,
"grad_norm": 10.259706497192383,
"learning_rate": 4.837291258468701e-06,
"logits/chosen": -1.4326658248901367,
"logits/rejected": -1.291441798210144,
"logps/chosen": -437.5020446777344,
"logps/rejected": -488.41802978515625,
"loss": 0.5435,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.5230964422225952,
"rewards/margins": 0.7579478025436401,
"rewards/rejected": -2.2810444831848145,
"step": 780
},
{
"epoch": 0.20675215912064904,
"grad_norm": 8.649606704711914,
"learning_rate": 4.829086969119984e-06,
"logits/chosen": -1.3141874074935913,
"logits/rejected": -1.3148314952850342,
"logps/chosen": -416.85675048828125,
"logps/rejected": -487.4302673339844,
"loss": 0.5649,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.662632703781128,
"rewards/margins": 0.6876001954078674,
"rewards/rejected": -2.3502330780029297,
"step": 790
},
{
"epoch": 0.2093692750588851,
"grad_norm": 10.932877540588379,
"learning_rate": 4.820688201679605e-06,
"logits/chosen": -1.563239574432373,
"logits/rejected": -1.2844064235687256,
"logps/chosen": -417.02923583984375,
"logps/rejected": -430.236572265625,
"loss": 0.4969,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5448291301727295,
"rewards/margins": 0.7562888264656067,
"rewards/rejected": -2.3011181354522705,
"step": 800
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -1.2524374723434448,
"eval_logits/rejected": -1.1249934434890747,
"eval_logps/chosen": -448.3450012207031,
"eval_logps/rejected": -507.1189270019531,
"eval_loss": 0.5242464542388916,
"eval_rewards/accuracies": 0.7245000004768372,
"eval_rewards/chosen": -1.8373013734817505,
"eval_rewards/margins": 0.788284957408905,
"eval_rewards/rejected": -2.6255862712860107,
"eval_runtime": 1587.9898,
"eval_samples_per_second": 1.259,
"eval_steps_per_second": 0.157,
"step": 800
},
{
"epoch": 0.21198639099712116,
"grad_norm": 9.641347885131836,
"learning_rate": 4.8120956574422315e-06,
"logits/chosen": -1.4272372722625732,
"logits/rejected": -1.4065699577331543,
"logps/chosen": -465.9081115722656,
"logps/rejected": -525.2551879882812,
"loss": 0.5804,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.9209187030792236,
"rewards/margins": 0.6635769605636597,
"rewards/rejected": -2.5844955444335938,
"step": 810
},
{
"epoch": 0.21460350693535724,
"grad_norm": 10.180907249450684,
"learning_rate": 4.803310053882831e-06,
"logits/chosen": -1.4811725616455078,
"logits/rejected": -1.4760491847991943,
"logps/chosen": -394.45654296875,
"logps/rejected": -489.70751953125,
"loss": 0.524,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.6524245738983154,
"rewards/margins": 0.8287159204483032,
"rewards/rejected": -2.481140613555908,
"step": 820
},
{
"epoch": 0.2172206228735933,
"grad_norm": 16.830577850341797,
"learning_rate": 4.794332124596775e-06,
"logits/chosen": -1.5256417989730835,
"logits/rejected": -1.4408817291259766,
"logps/chosen": -442.7582092285156,
"logps/rejected": -510.44903564453125,
"loss": 0.6008,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.7438207864761353,
"rewards/margins": 0.7136639356613159,
"rewards/rejected": -2.4574849605560303,
"step": 830
},
{
"epoch": 0.21983773881182936,
"grad_norm": 9.129151344299316,
"learning_rate": 4.785162619238575e-06,
"logits/chosen": -1.5453526973724365,
"logits/rejected": -1.4078967571258545,
"logps/chosen": -414.2274475097656,
"logps/rejected": -470.25115966796875,
"loss": 0.5318,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.6049034595489502,
"rewards/margins": 0.8330914378166199,
"rewards/rejected": -2.437994956970215,
"step": 840
},
{
"epoch": 0.22245485475006543,
"grad_norm": 9.432477951049805,
"learning_rate": 4.775802303459288e-06,
"logits/chosen": -1.3889689445495605,
"logits/rejected": -1.3114643096923828,
"logps/chosen": -435.85797119140625,
"logps/rejected": -512.47705078125,
"loss": 0.5479,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.8816181421279907,
"rewards/margins": 0.8134034276008606,
"rewards/rejected": -2.695021629333496,
"step": 850
},
{
"epoch": 0.22507197068830148,
"grad_norm": 9.855865478515625,
"learning_rate": 4.766251958842589e-06,
"logits/chosen": -1.3516982793807983,
"logits/rejected": -1.249561071395874,
"logps/chosen": -436.2867736816406,
"logps/rejected": -491.6578674316406,
"loss": 0.5388,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.637629508972168,
"rewards/margins": 0.6592670679092407,
"rewards/rejected": -2.296896457672119,
"step": 860
},
{
"epoch": 0.22768908662653756,
"grad_norm": 6.627049446105957,
"learning_rate": 4.7565123828395066e-06,
"logits/chosen": -1.2943369150161743,
"logits/rejected": -1.1972229480743408,
"logps/chosen": -425.5354919433594,
"logps/rejected": -497.001953125,
"loss": 0.5271,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.6972328424453735,
"rewards/margins": 0.7206228971481323,
"rewards/rejected": -2.417855739593506,
"step": 870
},
{
"epoch": 0.23030620256477363,
"grad_norm": 8.324175834655762,
"learning_rate": 4.746584388701831e-06,
"logits/chosen": -1.277025818824768,
"logits/rejected": -1.2231056690216064,
"logps/chosen": -471.81292724609375,
"logps/rejected": -548.893798828125,
"loss": 0.4897,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.0869038105010986,
"rewards/margins": 0.9123502969741821,
"rewards/rejected": -2.999253749847412,
"step": 880
},
{
"epoch": 0.23292331850300968,
"grad_norm": 16.953506469726562,
"learning_rate": 4.736468805414218e-06,
"logits/chosen": -1.0355793237686157,
"logits/rejected": -0.9944553375244141,
"logps/chosen": -496.1917419433594,
"logps/rejected": -595.2552490234375,
"loss": 0.6087,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.4879398345947266,
"rewards/margins": 0.8530998229980469,
"rewards/rejected": -3.3410396575927734,
"step": 890
},
{
"epoch": 0.23554043444124576,
"grad_norm": 16.194597244262695,
"learning_rate": 4.7261664776249595e-06,
"logits/chosen": -0.8758188486099243,
"logits/rejected": -0.7574308514595032,
"logps/chosen": -454.79925537109375,
"logps/rejected": -559.8719482421875,
"loss": 0.4794,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.2598698139190674,
"rewards/margins": 1.1400644779205322,
"rewards/rejected": -3.3999342918395996,
"step": 900
},
{
"epoch": 0.23554043444124576,
"eval_logits/chosen": -0.9944241046905518,
"eval_logits/rejected": -0.8587719798088074,
"eval_logps/chosen": -465.2022399902344,
"eval_logps/rejected": -527.2197875976562,
"eval_loss": 0.5245745778083801,
"eval_rewards/accuracies": 0.7254999876022339,
"eval_rewards/chosen": -2.005873441696167,
"eval_rewards/margins": 0.8207210302352905,
"eval_rewards/rejected": -2.826594591140747,
"eval_runtime": 1581.4306,
"eval_samples_per_second": 1.265,
"eval_steps_per_second": 0.158,
"step": 900
},
{
"epoch": 0.2381575503794818,
"grad_norm": 9.985879898071289,
"learning_rate": 4.715678265575463e-06,
"logits/chosen": -1.2251172065734863,
"logits/rejected": -1.0086650848388672,
"logps/chosen": -471.8104553222656,
"logps/rejected": -480.17510986328125,
"loss": 0.5554,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8755344152450562,
"rewards/margins": 0.6405603885650635,
"rewards/rejected": -2.516094923019409,
"step": 910
},
{
"epoch": 0.24077466631771788,
"grad_norm": 8.599123001098633,
"learning_rate": 4.705005045028415e-06,
"logits/chosen": -1.0334179401397705,
"logits/rejected": -0.8868207931518555,
"logps/chosen": -470.3492126464844,
"logps/rejected": -531.8078002929688,
"loss": 0.5416,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.072953701019287,
"rewards/margins": 0.7891368865966797,
"rewards/rejected": -2.862090587615967,
"step": 920
},
{
"epoch": 0.24339178225595393,
"grad_norm": 10.509832382202148,
"learning_rate": 4.694147707194659e-06,
"logits/chosen": -1.0835293531417847,
"logits/rejected": -0.979448139667511,
"logps/chosen": -508.9417419433594,
"logps/rejected": -565.7315063476562,
"loss": 0.5485,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -2.3716158866882324,
"rewards/margins": 0.7661987543106079,
"rewards/rejected": -3.137814998626709,
"step": 930
},
{
"epoch": 0.24600889819419,
"grad_norm": 6.1488494873046875,
"learning_rate": 4.683107158658782e-06,
"logits/chosen": -1.15608811378479,
"logits/rejected": -1.035563349723816,
"logps/chosen": -482.892578125,
"logps/rejected": -541.1561279296875,
"loss": 0.4962,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.9270384311676025,
"rewards/margins": 0.8703593015670776,
"rewards/rejected": -2.7973976135253906,
"step": 940
},
{
"epoch": 0.24862601413242608,
"grad_norm": 7.954674243927002,
"learning_rate": 4.671884321303407e-06,
"logits/chosen": -1.3060693740844727,
"logits/rejected": -1.1917331218719482,
"logps/chosen": -448.90484619140625,
"logps/rejected": -514.2643432617188,
"loss": 0.507,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.0268380641937256,
"rewards/margins": 0.797137975692749,
"rewards/rejected": -2.8239760398864746,
"step": 950
},
{
"epoch": 0.2512431300706621,
"grad_norm": 8.678433418273926,
"learning_rate": 4.660480132232224e-06,
"logits/chosen": -1.2542150020599365,
"logits/rejected": -1.151984453201294,
"logps/chosen": -519.7725830078125,
"logps/rejected": -568.0989990234375,
"loss": 0.5622,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.4804420471191406,
"rewards/margins": 0.7246801853179932,
"rewards/rejected": -3.2051219940185547,
"step": 960
},
{
"epoch": 0.25386024600889817,
"grad_norm": 19.324024200439453,
"learning_rate": 4.6488955436917414e-06,
"logits/chosen": -1.0881518125534058,
"logits/rejected": -0.8402830958366394,
"logps/chosen": -611.234130859375,
"logps/rejected": -676.3607177734375,
"loss": 0.522,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.3131954669952393,
"rewards/margins": 1.1177177429199219,
"rewards/rejected": -4.430912971496582,
"step": 970
},
{
"epoch": 0.2564773619471343,
"grad_norm": 5.164144992828369,
"learning_rate": 4.6371315229917644e-06,
"logits/chosen": -0.9441936612129211,
"logits/rejected": -0.7941098213195801,
"logps/chosen": -666.5744018554688,
"logps/rejected": -751.6942138671875,
"loss": 0.5046,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.8323254585266113,
"rewards/margins": 1.055508017539978,
"rewards/rejected": -4.887833118438721,
"step": 980
},
{
"epoch": 0.2590944778853703,
"grad_norm": 19.875019073486328,
"learning_rate": 4.625189052424638e-06,
"logits/chosen": -0.7852658033370972,
"logits/rejected": -0.6114678382873535,
"logps/chosen": -659.4119873046875,
"logps/rejected": -755.3154907226562,
"loss": 0.4696,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.203821182250977,
"rewards/margins": 1.186416745185852,
"rewards/rejected": -5.390236854553223,
"step": 990
},
{
"epoch": 0.26171159382360637,
"grad_norm": 7.329054355621338,
"learning_rate": 4.613069129183218e-06,
"logits/chosen": -1.0289504528045654,
"logits/rejected": -0.8489596247673035,
"logps/chosen": -642.2366943359375,
"logps/rejected": -696.42138671875,
"loss": 0.5261,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.388699769973755,
"rewards/margins": 0.904699444770813,
"rewards/rejected": -4.293398857116699,
"step": 1000
},
{
"epoch": 0.26171159382360637,
"eval_logits/chosen": -0.819309413433075,
"eval_logits/rejected": -0.6716277003288269,
"eval_logps/chosen": -553.1188354492188,
"eval_logps/rejected": -624.8492431640625,
"eval_loss": 0.5109167098999023,
"eval_rewards/accuracies": 0.7394999861717224,
"eval_rewards/chosen": -2.8850390911102295,
"eval_rewards/margins": 0.9178500771522522,
"eval_rewards/rejected": -3.802889108657837,
"eval_runtime": 1590.9987,
"eval_samples_per_second": 1.257,
"eval_steps_per_second": 0.157,
"step": 1000
},
{
"epoch": 0.2643287097618425,
"grad_norm": 11.057512283325195,
"learning_rate": 4.600772765277607e-06,
"logits/chosen": -0.9332865476608276,
"logits/rejected": -0.8343530893325806,
"logps/chosen": -501.17291259765625,
"logps/rejected": -588.1627197265625,
"loss": 0.491,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.676569700241089,
"rewards/margins": 0.9215409159660339,
"rewards/rejected": -3.5981109142303467,
"step": 1010
},
{
"epoch": 0.2669458257000785,
"grad_norm": 15.726754188537598,
"learning_rate": 4.588300987450652e-06,
"logits/chosen": -1.0615359544754028,
"logits/rejected": -0.9324240684509277,
"logps/chosen": -485.42474365234375,
"logps/rejected": -538.6219482421875,
"loss": 0.524,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.318000316619873,
"rewards/margins": 0.8668109774589539,
"rewards/rejected": -3.1848113536834717,
"step": 1020
},
{
"epoch": 0.26956294163831457,
"grad_norm": 8.744333267211914,
"learning_rate": 4.5756548370922136e-06,
"logits/chosen": -1.0338976383209229,
"logits/rejected": -0.9279731512069702,
"logps/chosen": -438.6556091308594,
"logps/rejected": -530.7064208984375,
"loss": 0.4732,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.980308175086975,
"rewards/margins": 1.0102787017822266,
"rewards/rejected": -2.990586757659912,
"step": 1030
},
{
"epoch": 0.2721800575765506,
"grad_norm": 18.558218002319336,
"learning_rate": 4.562835370152206e-06,
"logits/chosen": -1.0217876434326172,
"logits/rejected": -0.7763082385063171,
"logps/chosen": -592.7269897460938,
"logps/rejected": -723.8841552734375,
"loss": 0.4507,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.9158542156219482,
"rewards/margins": 1.6223560571670532,
"rewards/rejected": -4.538210391998291,
"step": 1040
},
{
"epoch": 0.2747971735147867,
"grad_norm": 13.077289581298828,
"learning_rate": 4.54984365705243e-06,
"logits/chosen": -0.9829725027084351,
"logits/rejected": -0.8464131355285645,
"logps/chosen": -542.563720703125,
"logps/rejected": -691.7203369140625,
"loss": 0.4689,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.7414119243621826,
"rewards/margins": 1.5575182437896729,
"rewards/rejected": -4.2989301681518555,
"step": 1050
},
{
"epoch": 0.27741428945302277,
"grad_norm": 8.922407150268555,
"learning_rate": 4.536680782597191e-06,
"logits/chosen": -1.0383745431900024,
"logits/rejected": -0.9313938021659851,
"logps/chosen": -409.2937316894531,
"logps/rejected": -496.7439880371094,
"loss": 0.582,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.7329803705215454,
"rewards/margins": 0.931407630443573,
"rewards/rejected": -2.6643881797790527,
"step": 1060
},
{
"epoch": 0.2800314053912588,
"grad_norm": 11.105772018432617,
"learning_rate": 4.523347845882718e-06,
"logits/chosen": -1.2218599319458008,
"logits/rejected": -0.9990504384040833,
"logps/chosen": -422.4734802246094,
"logps/rejected": -471.9090270996094,
"loss": 0.4615,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3439210653305054,
"rewards/margins": 0.9874658584594727,
"rewards/rejected": -2.3313870429992676,
"step": 1070
},
{
"epoch": 0.2826485213294949,
"grad_norm": 8.625237464904785,
"learning_rate": 4.50984596020539e-06,
"logits/chosen": -0.8810186386108398,
"logits/rejected": -0.8003977537155151,
"logps/chosen": -499.4249572753906,
"logps/rejected": -544.4656982421875,
"loss": 0.5526,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.1937453746795654,
"rewards/margins": 0.7729299664497375,
"rewards/rejected": -2.9666755199432373,
"step": 1080
},
{
"epoch": 0.28526563726773096,
"grad_norm": 6.088487148284912,
"learning_rate": 4.4961762529687745e-06,
"logits/chosen": -1.0772597789764404,
"logits/rejected": -0.9252904653549194,
"logps/chosen": -473.551025390625,
"logps/rejected": -539.3673095703125,
"loss": 0.509,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.1322884559631348,
"rewards/margins": 0.8234016299247742,
"rewards/rejected": -2.955690383911133,
"step": 1090
},
{
"epoch": 0.287882753205967,
"grad_norm": 7.50374174118042,
"learning_rate": 4.482339865589492e-06,
"logits/chosen": -1.0490493774414062,
"logits/rejected": -0.8567570447921753,
"logps/chosen": -522.2603149414062,
"logps/rejected": -541.4241943359375,
"loss": 0.6001,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.5611276626586914,
"rewards/margins": 0.630212128162384,
"rewards/rejected": -3.1913397312164307,
"step": 1100
},
{
"epoch": 0.287882753205967,
"eval_logits/chosen": -0.8244917988777161,
"eval_logits/rejected": -0.6634394526481628,
"eval_logps/chosen": -513.6636352539062,
"eval_logps/rejected": -577.7298583984375,
"eval_loss": 0.5050398707389832,
"eval_rewards/accuracies": 0.737500011920929,
"eval_rewards/chosen": -2.490487813949585,
"eval_rewards/margins": 0.8412085175514221,
"eval_rewards/rejected": -3.3316965103149414,
"eval_runtime": 1582.1916,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 1100
},
{
"epoch": 0.2904998691442031,
"grad_norm": 6.5892252922058105,
"learning_rate": 4.468337953401909e-06,
"logits/chosen": -1.082047700881958,
"logits/rejected": -1.00830078125,
"logps/chosen": -517.5233764648438,
"logps/rejected": -578.21923828125,
"loss": 0.5718,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.4717154502868652,
"rewards/margins": 0.6562873721122742,
"rewards/rejected": -3.128002882003784,
"step": 1110
},
{
"epoch": 0.29311698508243916,
"grad_norm": 8.295710563659668,
"learning_rate": 4.45417168556166e-06,
"logits/chosen": -1.0961480140686035,
"logits/rejected": -0.9785219430923462,
"logps/chosen": -446.82977294921875,
"logps/rejected": -531.4866333007812,
"loss": 0.4985,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.056777000427246,
"rewards/margins": 0.8543047904968262,
"rewards/rejected": -2.911081552505493,
"step": 1120
},
{
"epoch": 0.2957341010206752,
"grad_norm": 9.79799747467041,
"learning_rate": 4.439842244948036e-06,
"logits/chosen": -1.0972565412521362,
"logits/rejected": -0.9272262454032898,
"logps/chosen": -524.0053100585938,
"logps/rejected": -611.5361938476562,
"loss": 0.5547,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.6295433044433594,
"rewards/margins": 0.9316496849060059,
"rewards/rejected": -3.5611929893493652,
"step": 1130
},
{
"epoch": 0.29835121695891126,
"grad_norm": 31.351959228515625,
"learning_rate": 4.425350828065204e-06,
"logits/chosen": -1.2044315338134766,
"logits/rejected": -0.9763747453689575,
"logps/chosen": -536.931640625,
"logps/rejected": -602.4816284179688,
"loss": 0.4757,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.528210163116455,
"rewards/margins": 1.1348176002502441,
"rewards/rejected": -3.6630280017852783,
"step": 1140
},
{
"epoch": 0.30096833289714736,
"grad_norm": 13.463656425476074,
"learning_rate": 4.410698644942303e-06,
"logits/chosen": -1.2988972663879395,
"logits/rejected": -1.1377476453781128,
"logps/chosen": -515.65380859375,
"logps/rejected": -611.4737548828125,
"loss": 0.4582,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.433767795562744,
"rewards/margins": 1.198464274406433,
"rewards/rejected": -3.632232189178467,
"step": 1150
},
{
"epoch": 0.3035854488353834,
"grad_norm": 13.99889850616455,
"learning_rate": 4.395886919032406e-06,
"logits/chosen": -1.2166707515716553,
"logits/rejected": -1.0688965320587158,
"logps/chosen": -525.2330932617188,
"logps/rejected": -608.0958862304688,
"loss": 0.4977,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.6150729656219482,
"rewards/margins": 1.093909502029419,
"rewards/rejected": -3.708981990814209,
"step": 1160
},
{
"epoch": 0.30620256477361946,
"grad_norm": 15.137212753295898,
"learning_rate": 4.380916887110366e-06,
"logits/chosen": -1.260829210281372,
"logits/rejected": -1.0494515895843506,
"logps/chosen": -565.7590942382812,
"logps/rejected": -643.6331787109375,
"loss": 0.5416,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.06677508354187,
"rewards/margins": 1.1827514171600342,
"rewards/rejected": -4.249526500701904,
"step": 1170
},
{
"epoch": 0.30881968071185556,
"grad_norm": 9.361719131469727,
"learning_rate": 4.365789799169539e-06,
"logits/chosen": -1.1176745891571045,
"logits/rejected": -1.1530115604400635,
"logps/chosen": -499.57470703125,
"logps/rejected": -600.5289306640625,
"loss": 0.5294,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.5301427841186523,
"rewards/margins": 0.9564053416252136,
"rewards/rejected": -3.4865479469299316,
"step": 1180
},
{
"epoch": 0.3114367966500916,
"grad_norm": 16.095386505126953,
"learning_rate": 4.350506918317416e-06,
"logits/chosen": -1.39276921749115,
"logits/rejected": -1.2349721193313599,
"logps/chosen": -434.96197509765625,
"logps/rejected": -517.3797607421875,
"loss": 0.4972,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.8744901418685913,
"rewards/margins": 0.8959289789199829,
"rewards/rejected": -2.770418882369995,
"step": 1190
},
{
"epoch": 0.31405391258832765,
"grad_norm": 9.526153564453125,
"learning_rate": 4.335069520670149e-06,
"logits/chosen": -1.138518214225769,
"logits/rejected": -1.0583736896514893,
"logps/chosen": -489.06463623046875,
"logps/rejected": -578.7098999023438,
"loss": 0.5911,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.59414005279541,
"rewards/margins": 0.8143970370292664,
"rewards/rejected": -3.4085373878479004,
"step": 1200
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -1.1191861629486084,
"eval_logits/rejected": -0.9870566725730896,
"eval_logps/chosen": -491.9688415527344,
"eval_logps/rejected": -566.8433837890625,
"eval_loss": 0.4982523024082184,
"eval_rewards/accuracies": 0.7384999990463257,
"eval_rewards/chosen": -2.2735395431518555,
"eval_rewards/margins": 0.9492914080619812,
"eval_rewards/rejected": -3.2228307723999023,
"eval_runtime": 1581.9568,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 1200
},
{
"epoch": 0.3166710285265637,
"grad_norm": 7.82928991317749,
"learning_rate": 4.319478895246e-06,
"logits/chosen": -1.2663036584854126,
"logits/rejected": -1.0805227756500244,
"logps/chosen": -465.62579345703125,
"logps/rejected": -534.9702758789062,
"loss": 0.4779,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.185610294342041,
"rewards/margins": 0.9405841827392578,
"rewards/rejected": -3.126194477081299,
"step": 1210
},
{
"epoch": 0.3192881444647998,
"grad_norm": 14.477264404296875,
"learning_rate": 4.303736343857704e-06,
"logits/chosen": -1.2190170288085938,
"logits/rejected": -1.10158371925354,
"logps/chosen": -563.8433227539062,
"logps/rejected": -729.468017578125,
"loss": 0.4598,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.0673668384552,
"rewards/margins": 1.5377466678619385,
"rewards/rejected": -4.6051130294799805,
"step": 1220
},
{
"epoch": 0.32190526040303585,
"grad_norm": 12.053476333618164,
"learning_rate": 4.287843181003772e-06,
"logits/chosen": -1.2529375553131104,
"logits/rejected": -1.0980578660964966,
"logps/chosen": -634.8171997070312,
"logps/rejected": -681.6033935546875,
"loss": 0.5584,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.410432815551758,
"rewards/margins": 0.9311054944992065,
"rewards/rejected": -4.341538429260254,
"step": 1230
},
{
"epoch": 0.3245223763412719,
"grad_norm": 8.703567504882812,
"learning_rate": 4.27180073375873e-06,
"logits/chosen": -1.3562941551208496,
"logits/rejected": -1.2354745864868164,
"logps/chosen": -528.8560791015625,
"logps/rejected": -573.3671875,
"loss": 0.5288,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.4158124923706055,
"rewards/margins": 0.8610725402832031,
"rewards/rejected": -3.2768847942352295,
"step": 1240
},
{
"epoch": 0.327139492279508,
"grad_norm": 5.352301120758057,
"learning_rate": 4.255610341662304e-06,
"logits/chosen": -1.3785821199417114,
"logits/rejected": -1.1847844123840332,
"logps/chosen": -459.38037109375,
"logps/rejected": -520.2581787109375,
"loss": 0.5513,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.04193115234375,
"rewards/margins": 0.8080309629440308,
"rewards/rejected": -2.849961757659912,
"step": 1250
},
{
"epoch": 0.32975660821774405,
"grad_norm": 10.721965789794922,
"learning_rate": 4.2392733566075764e-06,
"logits/chosen": -1.393056869506836,
"logits/rejected": -1.2600148916244507,
"logps/chosen": -454.79547119140625,
"logps/rejected": -493.84405517578125,
"loss": 0.5687,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9885499477386475,
"rewards/margins": 0.5462992787361145,
"rewards/rejected": -2.534849166870117,
"step": 1260
},
{
"epoch": 0.3323737241559801,
"grad_norm": 7.625462532043457,
"learning_rate": 4.2227911427280975e-06,
"logits/chosen": -1.2262321710586548,
"logits/rejected": -1.0494863986968994,
"logps/chosen": -501.2200622558594,
"logps/rejected": -556.45849609375,
"loss": 0.5118,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -2.510348081588745,
"rewards/margins": 0.879997730255127,
"rewards/rejected": -3.390345811843872,
"step": 1270
},
{
"epoch": 0.33499084009421615,
"grad_norm": 10.538474082946777,
"learning_rate": 4.206165076283983e-06,
"logits/chosen": -1.1859577894210815,
"logits/rejected": -1.030694842338562,
"logps/chosen": -578.6294555664062,
"logps/rejected": -669.954833984375,
"loss": 0.4772,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.32794189453125,
"rewards/margins": 1.1238527297973633,
"rewards/rejected": -4.451794624328613,
"step": 1280
},
{
"epoch": 0.33760795603245225,
"grad_norm": 12.202057838439941,
"learning_rate": 4.189396545546995e-06,
"logits/chosen": -1.1280696392059326,
"logits/rejected": -0.9983634948730469,
"logps/chosen": -617.6463623046875,
"logps/rejected": -723.6165771484375,
"loss": 0.4987,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.684889554977417,
"rewards/margins": 1.2597671747207642,
"rewards/rejected": -4.9446563720703125,
"step": 1290
},
{
"epoch": 0.3402250719706883,
"grad_norm": 13.77889633178711,
"learning_rate": 4.172486950684627e-06,
"logits/chosen": -1.147534728050232,
"logits/rejected": -1.0692282915115356,
"logps/chosen": -597.9291381835938,
"logps/rejected": -696.7471923828125,
"loss": 0.5345,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.4435715675354004,
"rewards/margins": 1.0198230743408203,
"rewards/rejected": -4.463394641876221,
"step": 1300
},
{
"epoch": 0.3402250719706883,
"eval_logits/chosen": -0.9910528063774109,
"eval_logits/rejected": -0.8540138006210327,
"eval_logps/chosen": -616.756591796875,
"eval_logps/rejected": -717.8565063476562,
"eval_loss": 0.5000870227813721,
"eval_rewards/accuracies": 0.7450000047683716,
"eval_rewards/chosen": -3.5214157104492188,
"eval_rewards/margins": 1.2115455865859985,
"eval_rewards/rejected": -4.732961654663086,
"eval_runtime": 1582.4164,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 1300
},
{
"epoch": 0.34284218790892435,
"grad_norm": 7.990820407867432,
"learning_rate": 4.155437703643182e-06,
"logits/chosen": -1.223760724067688,
"logits/rejected": -1.0244286060333252,
"logps/chosen": -568.0660400390625,
"logps/rejected": -653.6463623046875,
"loss": 0.5074,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.2144699096679688,
"rewards/margins": 1.1462501287460327,
"rewards/rejected": -4.360719680786133,
"step": 1310
},
{
"epoch": 0.34545930384716045,
"grad_norm": 11.47555160522461,
"learning_rate": 4.138250228029882e-06,
"logits/chosen": -1.2019230127334595,
"logits/rejected": -1.108407735824585,
"logps/chosen": -539.8394775390625,
"logps/rejected": -651.2247314453125,
"loss": 0.4882,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.8300833702087402,
"rewards/margins": 1.0788848400115967,
"rewards/rejected": -3.908967971801758,
"step": 1320
},
{
"epoch": 0.3480764197853965,
"grad_norm": 5.791493892669678,
"learning_rate": 4.120925958993994e-06,
"logits/chosen": -1.1601794958114624,
"logits/rejected": -1.067439317703247,
"logps/chosen": -501.59100341796875,
"logps/rejected": -596.958984375,
"loss": 0.5485,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.677966356277466,
"rewards/margins": 0.9972420930862427,
"rewards/rejected": -3.675208568572998,
"step": 1330
},
{
"epoch": 0.35069353572363254,
"grad_norm": 13.772578239440918,
"learning_rate": 4.103466343106999e-06,
"logits/chosen": -1.3145829439163208,
"logits/rejected": -1.1666548252105713,
"logps/chosen": -544.5247192382812,
"logps/rejected": -613.3075561523438,
"loss": 0.5066,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.751533031463623,
"rewards/margins": 0.9478660821914673,
"rewards/rejected": -3.69939923286438,
"step": 1340
},
{
"epoch": 0.35331065166186865,
"grad_norm": 10.991410255432129,
"learning_rate": 4.085872838241797e-06,
"logits/chosen": -1.2049071788787842,
"logits/rejected": -1.057510495185852,
"logps/chosen": -566.255859375,
"logps/rejected": -625.919921875,
"loss": 0.5737,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.0400071144104004,
"rewards/margins": 0.797439694404602,
"rewards/rejected": -3.837446928024292,
"step": 1350
},
{
"epoch": 0.3559277676001047,
"grad_norm": 10.892525672912598,
"learning_rate": 4.06814691345098e-06,
"logits/chosen": -1.1951572895050049,
"logits/rejected": -1.0272043943405151,
"logps/chosen": -564.4779052734375,
"logps/rejected": -638.2157592773438,
"loss": 0.4798,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.090005397796631,
"rewards/margins": 0.9581144452095032,
"rewards/rejected": -4.048120021820068,
"step": 1360
},
{
"epoch": 0.35854488353834074,
"grad_norm": 10.314249992370605,
"learning_rate": 4.050290048844171e-06,
"logits/chosen": -1.2887331247329712,
"logits/rejected": -1.192067265510559,
"logps/chosen": -587.147705078125,
"logps/rejected": -671.292236328125,
"loss": 0.5371,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -3.1862995624542236,
"rewards/margins": 0.8927896618843079,
"rewards/rejected": -4.079089164733887,
"step": 1370
},
{
"epoch": 0.3611619994765768,
"grad_norm": 9.613045692443848,
"learning_rate": 4.032303735464422e-06,
"logits/chosen": -1.3485846519470215,
"logits/rejected": -1.1314103603363037,
"logps/chosen": -622.3319091796875,
"logps/rejected": -722.0323486328125,
"loss": 0.4515,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.5042572021484375,
"rewards/margins": 1.2276619672775269,
"rewards/rejected": -4.731919288635254,
"step": 1380
},
{
"epoch": 0.3637791154148129,
"grad_norm": 13.027145385742188,
"learning_rate": 4.014189475163727e-06,
"logits/chosen": -1.1553242206573486,
"logits/rejected": -1.0384472608566284,
"logps/chosen": -590.9238891601562,
"logps/rejected": -693.7357788085938,
"loss": 0.4786,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.3637802600860596,
"rewards/margins": 1.1092660427093506,
"rewards/rejected": -4.47304630279541,
"step": 1390
},
{
"epoch": 0.36639623135304894,
"grad_norm": 13.17509651184082,
"learning_rate": 3.995948780477605e-06,
"logits/chosen": -1.3155999183654785,
"logits/rejected": -1.1753036975860596,
"logps/chosen": -568.0977783203125,
"logps/rejected": -626.4328002929688,
"loss": 0.5291,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.994460344314575,
"rewards/margins": 0.8298677206039429,
"rewards/rejected": -3.8243279457092285,
"step": 1400
},
{
"epoch": 0.36639623135304894,
"eval_logits/chosen": -1.2062066793441772,
"eval_logits/rejected": -1.0816161632537842,
"eval_logps/chosen": -543.2669677734375,
"eval_logps/rejected": -619.3544921875,
"eval_loss": 0.4986713230609894,
"eval_rewards/accuracies": 0.7475000023841858,
"eval_rewards/chosen": -2.786520004272461,
"eval_rewards/margins": 0.9614222049713135,
"eval_rewards/rejected": -3.7479424476623535,
"eval_runtime": 1581.7334,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 1400
},
{
"epoch": 0.369013347291285,
"grad_norm": 11.276718139648438,
"learning_rate": 3.977583174498816e-06,
"logits/chosen": -1.2875537872314453,
"logits/rejected": -1.158477544784546,
"logps/chosen": -564.5877075195312,
"logps/rejected": -679.6033935546875,
"loss": 0.3861,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -3.009406089782715,
"rewards/margins": 1.3703964948654175,
"rewards/rejected": -4.379802227020264,
"step": 1410
},
{
"epoch": 0.3716304632295211,
"grad_norm": 12.995461463928223,
"learning_rate": 3.959094190750172e-06,
"logits/chosen": -1.2381871938705444,
"logits/rejected": -1.0811434984207153,
"logps/chosen": -635.1746826171875,
"logps/rejected": -741.3914184570312,
"loss": 0.4966,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.5006301403045654,
"rewards/margins": 1.3320776224136353,
"rewards/rejected": -4.832707405090332,
"step": 1420
},
{
"epoch": 0.37424757916775714,
"grad_norm": 12.000993728637695,
"learning_rate": 3.9404833730564975e-06,
"logits/chosen": -1.029284119606018,
"logits/rejected": -0.9086493253707886,
"logps/chosen": -637.097900390625,
"logps/rejected": -769.3482666015625,
"loss": 0.4966,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.841416120529175,
"rewards/margins": 1.4375669956207275,
"rewards/rejected": -5.278983116149902,
"step": 1430
},
{
"epoch": 0.3768646951059932,
"grad_norm": 32.742244720458984,
"learning_rate": 3.921752275415712e-06,
"logits/chosen": -1.1185188293457031,
"logits/rejected": -1.0146936178207397,
"logps/chosen": -668.8628540039062,
"logps/rejected": -822.2618408203125,
"loss": 0.4455,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -4.184566020965576,
"rewards/margins": 1.7391846179962158,
"rewards/rejected": -5.923750877380371,
"step": 1440
},
{
"epoch": 0.37948181104422923,
"grad_norm": 9.557754516601562,
"learning_rate": 3.902902461869079e-06,
"logits/chosen": -1.2069816589355469,
"logits/rejected": -1.0695757865905762,
"logps/chosen": -616.6637573242188,
"logps/rejected": -743.1511840820312,
"loss": 0.5608,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.7714271545410156,
"rewards/margins": 1.4299030303955078,
"rewards/rejected": -5.201330184936523,
"step": 1450
},
{
"epoch": 0.38209892698246534,
"grad_norm": 15.0908784866333,
"learning_rate": 3.883935506370605e-06,
"logits/chosen": -1.3125331401824951,
"logits/rejected": -1.2091357707977295,
"logps/chosen": -542.5530395507812,
"logps/rejected": -604.1723022460938,
"loss": 0.573,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.894252300262451,
"rewards/margins": 0.8939558863639832,
"rewards/rejected": -3.788208484649658,
"step": 1460
},
{
"epoch": 0.3847160429207014,
"grad_norm": 4.927533149719238,
"learning_rate": 3.864852992655617e-06,
"logits/chosen": -1.4343416690826416,
"logits/rejected": -1.3349957466125488,
"logps/chosen": -462.3154296875,
"logps/rejected": -552.6721801757812,
"loss": 0.4451,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -2.1374027729034424,
"rewards/margins": 1.026588797569275,
"rewards/rejected": -3.1639912128448486,
"step": 1470
},
{
"epoch": 0.38733315885893743,
"grad_norm": 10.096611976623535,
"learning_rate": 3.845656514108516e-06,
"logits/chosen": -1.3481743335723877,
"logits/rejected": -1.198091745376587,
"logps/chosen": -515.7351684570312,
"logps/rejected": -568.5164184570312,
"loss": 0.473,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.5670249462127686,
"rewards/margins": 1.077282190322876,
"rewards/rejected": -3.6443073749542236,
"step": 1480
},
{
"epoch": 0.38995027479717354,
"grad_norm": 13.104360580444336,
"learning_rate": 3.826347673629738e-06,
"logits/chosen": -1.2953057289123535,
"logits/rejected": -1.1235467195510864,
"logps/chosen": -528.2382202148438,
"logps/rejected": -629.0557861328125,
"loss": 0.502,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.7498528957366943,
"rewards/margins": 1.271679162979126,
"rewards/rejected": -4.02153205871582,
"step": 1490
},
{
"epoch": 0.3925673907354096,
"grad_norm": 16.11091423034668,
"learning_rate": 3.8069280835019062e-06,
"logits/chosen": -1.3702330589294434,
"logits/rejected": -1.2128852605819702,
"logps/chosen": -517.0223999023438,
"logps/rejected": -641.86962890625,
"loss": 0.4495,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.527927875518799,
"rewards/margins": 1.4398820400238037,
"rewards/rejected": -3.9678096771240234,
"step": 1500
},
{
"epoch": 0.3925673907354096,
"eval_logits/chosen": -1.3215845823287964,
"eval_logits/rejected": -1.1934013366699219,
"eval_logps/chosen": -510.6183776855469,
"eval_logps/rejected": -609.4039306640625,
"eval_loss": 0.5144493579864502,
"eval_rewards/accuracies": 0.7329999804496765,
"eval_rewards/chosen": -2.460034132003784,
"eval_rewards/margins": 1.1884018182754517,
"eval_rewards/rejected": -3.6484363079071045,
"eval_runtime": 1581.1744,
"eval_samples_per_second": 1.265,
"eval_steps_per_second": 0.158,
"step": 1500
},
{
"epoch": 0.39518450667364563,
"grad_norm": 14.719096183776855,
"learning_rate": 3.7873993652552077e-06,
"logits/chosen": -1.391238808631897,
"logits/rejected": -1.3038396835327148,
"logps/chosen": -469.97021484375,
"logps/rejected": -566.0907592773438,
"loss": 0.5832,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -2.3631210327148438,
"rewards/margins": 1.0193054676055908,
"rewards/rejected": -3.3824265003204346,
"step": 1510
},
{
"epoch": 0.39780162261188173,
"grad_norm": 10.856192588806152,
"learning_rate": 3.7677631495319953e-06,
"logits/chosen": -1.5884860754013062,
"logits/rejected": -1.493381381034851,
"logps/chosen": -435.23876953125,
"logps/rejected": -504.7139587402344,
"loss": 0.5117,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.7161569595336914,
"rewards/margins": 0.8754542469978333,
"rewards/rejected": -2.5916106700897217,
"step": 1520
},
{
"epoch": 0.4004187385501178,
"grad_norm": 7.564871311187744,
"learning_rate": 3.748021075950633e-06,
"logits/chosen": -1.6664730310440063,
"logits/rejected": -1.5948269367218018,
"logps/chosen": -443.94915771484375,
"logps/rejected": -495.65643310546875,
"loss": 0.5975,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.69313645362854,
"rewards/margins": 0.6239147186279297,
"rewards/rejected": -2.3170511722564697,
"step": 1530
},
{
"epoch": 0.40303585448835383,
"grad_norm": 11.641048431396484,
"learning_rate": 3.7281747929685824e-06,
"logits/chosen": -1.4826856851577759,
"logits/rejected": -1.3677144050598145,
"logps/chosen": -416.457275390625,
"logps/rejected": -480.78118896484375,
"loss": 0.5409,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7890077829360962,
"rewards/margins": 0.8001953959465027,
"rewards/rejected": -2.589203119277954,
"step": 1540
},
{
"epoch": 0.4056529704265899,
"grad_norm": 8.30944538116455,
"learning_rate": 3.7082259577447604e-06,
"logits/chosen": -1.5535342693328857,
"logits/rejected": -1.4495346546173096,
"logps/chosen": -458.6407165527344,
"logps/rejected": -524.59423828125,
"loss": 0.4768,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8754609823226929,
"rewards/margins": 0.8596477508544922,
"rewards/rejected": -2.7351086139678955,
"step": 1550
},
{
"epoch": 0.408270086364826,
"grad_norm": 11.737417221069336,
"learning_rate": 3.6881762360011688e-06,
"logits/chosen": -1.492614984512329,
"logits/rejected": -1.2994787693023682,
"logps/chosen": -562.4512939453125,
"logps/rejected": -632.38818359375,
"loss": 0.5106,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.8106777667999268,
"rewards/margins": 1.069096326828003,
"rewards/rejected": -3.879774570465088,
"step": 1560
},
{
"epoch": 0.410887202303062,
"grad_norm": 13.686888694763184,
"learning_rate": 3.668027301883802e-06,
"logits/chosen": -1.384356141090393,
"logits/rejected": -1.24310302734375,
"logps/chosen": -584.9012451171875,
"logps/rejected": -698.5891723632812,
"loss": 0.4669,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.33648419380188,
"rewards/margins": 1.2853710651397705,
"rewards/rejected": -4.621855735778809,
"step": 1570
},
{
"epoch": 0.4135043182412981,
"grad_norm": 8.767433166503906,
"learning_rate": 3.64778083782286e-06,
"logits/chosen": -1.2861278057098389,
"logits/rejected": -1.2656017541885376,
"logps/chosen": -606.9259033203125,
"logps/rejected": -742.4180908203125,
"loss": 0.577,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.5447895526885986,
"rewards/margins": 1.0646626949310303,
"rewards/rejected": -4.609452247619629,
"step": 1580
},
{
"epoch": 0.4161214341795342,
"grad_norm": 9.344108581542969,
"learning_rate": 3.627438534392268e-06,
"logits/chosen": -1.4602949619293213,
"logits/rejected": -1.431760311126709,
"logps/chosen": -516.3866577148438,
"logps/rejected": -624.79248046875,
"loss": 0.4808,
"rewards/accuracies": 0.78125,
"rewards/chosen": -2.784986972808838,
"rewards/margins": 1.016643762588501,
"rewards/rejected": -3.801631212234497,
"step": 1590
},
{
"epoch": 0.4187385501177702,
"grad_norm": 8.162246704101562,
"learning_rate": 3.607002090168506e-06,
"logits/chosen": -1.3787976503372192,
"logits/rejected": -1.3049051761627197,
"logps/chosen": -540.64697265625,
"logps/rejected": -601.52392578125,
"loss": 0.5586,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -2.764016628265381,
"rewards/margins": 0.8225992321968079,
"rewards/rejected": -3.586615800857544,
"step": 1600
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -1.3065768480300903,
"eval_logits/rejected": -1.1838239431381226,
"eval_logps/chosen": -514.4846801757812,
"eval_logps/rejected": -594.8329467773438,
"eval_loss": 0.493674635887146,
"eval_rewards/accuracies": 0.7429999709129333,
"eval_rewards/chosen": -2.4986977577209473,
"eval_rewards/margins": 1.004028558731079,
"eval_rewards/rejected": -3.5027263164520264,
"eval_runtime": 1581.6015,
"eval_samples_per_second": 1.265,
"eval_steps_per_second": 0.158,
"step": 1600
},
{
"epoch": 0.4213556660560063,
"grad_norm": 5.075291156768799,
"learning_rate": 3.586473211588787e-06,
"logits/chosen": -1.412367582321167,
"logits/rejected": -1.3295118808746338,
"logps/chosen": -484.82781982421875,
"logps/rejected": -605.0839233398438,
"loss": 0.4373,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.4015586376190186,
"rewards/margins": 1.136573076248169,
"rewards/rejected": -3.5381317138671875,
"step": 1610
},
{
"epoch": 0.4239727819942423,
"grad_norm": 14.283614158630371,
"learning_rate": 3.5658536128085623e-06,
"logits/chosen": -1.3783130645751953,
"logits/rejected": -1.2037944793701172,
"logps/chosen": -573.0740966796875,
"logps/rejected": -635.7131958007812,
"loss": 0.5834,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -3.1027450561523438,
"rewards/margins": 0.9221722483634949,
"rewards/rejected": -4.024917125701904,
"step": 1620
},
{
"epoch": 0.4265898979324784,
"grad_norm": 9.143744468688965,
"learning_rate": 3.545145015558399e-06,
"logits/chosen": -1.0775479078292847,
"logits/rejected": -1.0551337003707886,
"logps/chosen": -561.3260498046875,
"logps/rejected": -659.1534423828125,
"loss": 0.4713,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.2980690002441406,
"rewards/margins": 1.12051522731781,
"rewards/rejected": -4.41858434677124,
"step": 1630
},
{
"epoch": 0.42920701387071447,
"grad_norm": 5.6775970458984375,
"learning_rate": 3.5243491490002056e-06,
"logits/chosen": -1.1355875730514526,
"logits/rejected": -1.0401685237884521,
"logps/chosen": -623.6513671875,
"logps/rejected": -720.0593872070312,
"loss": 0.5648,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.7002768516540527,
"rewards/margins": 1.0166988372802734,
"rewards/rejected": -4.716975688934326,
"step": 1640
},
{
"epoch": 0.4318241298089505,
"grad_norm": 8.781460762023926,
"learning_rate": 3.503467749582857e-06,
"logits/chosen": -1.1672102212905884,
"logits/rejected": -0.9630798101425171,
"logps/chosen": -615.6878051757812,
"logps/rejected": -661.8714599609375,
"loss": 0.5984,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -3.6454033851623535,
"rewards/margins": 0.8021238446235657,
"rewards/rejected": -4.4475274085998535,
"step": 1650
},
{
"epoch": 0.4344412457471866,
"grad_norm": 17.43423843383789,
"learning_rate": 3.4825025608971947e-06,
"logits/chosen": -1.0625419616699219,
"logits/rejected": -0.9829475283622742,
"logps/chosen": -560.93896484375,
"logps/rejected": -651.3380126953125,
"loss": 0.5128,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.390519618988037,
"rewards/margins": 0.8660950660705566,
"rewards/rejected": -4.256614685058594,
"step": 1660
},
{
"epoch": 0.43705836168542267,
"grad_norm": 6.469093322753906,
"learning_rate": 3.4614553335304407e-06,
"logits/chosen": -1.1648396253585815,
"logits/rejected": -0.9342324137687683,
"logps/chosen": -580.8231201171875,
"logps/rejected": -656.4859008789062,
"loss": 0.4754,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.1200098991394043,
"rewards/margins": 1.074102759361267,
"rewards/rejected": -4.194112300872803,
"step": 1670
},
{
"epoch": 0.4396754776236587,
"grad_norm": 9.7577486038208,
"learning_rate": 3.4403278249200222e-06,
"logits/chosen": -1.2089046239852905,
"logits/rejected": -0.9787479639053345,
"logps/chosen": -566.4661254882812,
"logps/rejected": -651.2072143554688,
"loss": 0.449,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -2.8394370079040527,
"rewards/margins": 1.2666107416152954,
"rewards/rejected": -4.106047630310059,
"step": 1680
},
{
"epoch": 0.44229259356189476,
"grad_norm": 17.261714935302734,
"learning_rate": 3.4191217992068293e-06,
"logits/chosen": -1.2693849802017212,
"logits/rejected": -1.0484362840652466,
"logps/chosen": -608.1680908203125,
"logps/rejected": -667.4273681640625,
"loss": 0.5398,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.3012607097625732,
"rewards/margins": 1.0486795902252197,
"rewards/rejected": -4.349940299987793,
"step": 1690
},
{
"epoch": 0.44490970950013087,
"grad_norm": 12.436171531677246,
"learning_rate": 3.3978390270879056e-06,
"logits/chosen": -1.1414504051208496,
"logits/rejected": -1.039656400680542,
"logps/chosen": -577.3287353515625,
"logps/rejected": -704.9564208984375,
"loss": 0.4895,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.674631118774414,
"rewards/margins": 1.2074581384658813,
"rewards/rejected": -4.882089138031006,
"step": 1700
},
{
"epoch": 0.44490970950013087,
"eval_logits/chosen": -1.106369972229004,
"eval_logits/rejected": -0.9647883176803589,
"eval_logps/chosen": -626.7305297851562,
"eval_logps/rejected": -725.0693969726562,
"eval_loss": 0.4948273003101349,
"eval_rewards/accuracies": 0.7294999957084656,
"eval_rewards/chosen": -3.6211562156677246,
"eval_rewards/margins": 1.183934211730957,
"eval_rewards/rejected": -4.805090427398682,
"eval_runtime": 1583.0831,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 1700
},
{
"epoch": 0.4475268254383669,
"grad_norm": 13.189878463745117,
"learning_rate": 3.3764812856685995e-06,
"logits/chosen": -1.2679539918899536,
"logits/rejected": -1.2505522966384888,
"logps/chosen": -562.8509521484375,
"logps/rejected": -688.5130615234375,
"loss": 0.4914,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.3842010498046875,
"rewards/margins": 1.1209660768508911,
"rewards/rejected": -4.505166530609131,
"step": 1710
},
{
"epoch": 0.45014394137660296,
"grad_norm": 10.39622974395752,
"learning_rate": 3.3550503583141726e-06,
"logits/chosen": -1.4502748250961304,
"logits/rejected": -1.3018438816070557,
"logps/chosen": -608.05517578125,
"logps/rejected": -713.238037109375,
"loss": 0.5054,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.393193006515503,
"rewards/margins": 1.203611135482788,
"rewards/rejected": -4.596804618835449,
"step": 1720
},
{
"epoch": 0.45276105731483907,
"grad_norm": 9.055331230163574,
"learning_rate": 3.3335480345008907e-06,
"logits/chosen": -1.3674051761627197,
"logits/rejected": -1.2791941165924072,
"logps/chosen": -541.0001831054688,
"logps/rejected": -634.0692749023438,
"loss": 0.4663,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.8097071647644043,
"rewards/margins": 1.1776827573776245,
"rewards/rejected": -3.9873898029327393,
"step": 1730
},
{
"epoch": 0.4553781732530751,
"grad_norm": 11.12189769744873,
"learning_rate": 3.3119761096666055e-06,
"logits/chosen": -1.4076497554779053,
"logits/rejected": -1.251043438911438,
"logps/chosen": -572.8175659179688,
"logps/rejected": -628.5023803710938,
"loss": 0.5592,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -2.9791908264160156,
"rewards/margins": 0.8598833084106445,
"rewards/rejected": -3.839073896408081,
"step": 1740
},
{
"epoch": 0.45799528919131116,
"grad_norm": 8.793429374694824,
"learning_rate": 3.290336385060832e-06,
"logits/chosen": -1.5849800109863281,
"logits/rejected": -1.3935880661010742,
"logps/chosen": -543.2794189453125,
"logps/rejected": -622.4841918945312,
"loss": 0.5157,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -2.9448046684265137,
"rewards/margins": 0.9939256906509399,
"rewards/rejected": -3.9387307167053223,
"step": 1750
},
{
"epoch": 0.46061240512954726,
"grad_norm": 11.67789077758789,
"learning_rate": 3.268630667594348e-06,
"logits/chosen": -1.357431173324585,
"logits/rejected": -1.3213989734649658,
"logps/chosen": -582.795654296875,
"logps/rejected": -671.8525390625,
"loss": 0.4753,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.2719674110412598,
"rewards/margins": 1.1332902908325195,
"rewards/rejected": -4.4052581787109375,
"step": 1760
},
{
"epoch": 0.4632295210677833,
"grad_norm": 10.402399063110352,
"learning_rate": 3.2468607696883147e-06,
"logits/chosen": -1.3948209285736084,
"logits/rejected": -1.33302640914917,
"logps/chosen": -597.5047607421875,
"logps/rejected": -718.071044921875,
"loss": 0.4639,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.4439845085144043,
"rewards/margins": 1.1630891561508179,
"rewards/rejected": -4.607073783874512,
"step": 1770
},
{
"epoch": 0.46584663700601936,
"grad_norm": 8.766528129577637,
"learning_rate": 3.225028509122944e-06,
"logits/chosen": -1.4917911291122437,
"logits/rejected": -1.3801844120025635,
"logps/chosen": -554.3919677734375,
"logps/rejected": -640.0030517578125,
"loss": 0.5114,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.1797173023223877,
"rewards/margins": 0.9688693881034851,
"rewards/rejected": -4.148587226867676,
"step": 1780
},
{
"epoch": 0.4684637529442554,
"grad_norm": 15.830283164978027,
"learning_rate": 3.2031357088857083e-06,
"logits/chosen": -1.43815279006958,
"logits/rejected": -1.3467546701431274,
"logps/chosen": -635.6055908203125,
"logps/rejected": -746.7036743164062,
"loss": 0.5194,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.6045749187469482,
"rewards/margins": 1.145918607711792,
"rewards/rejected": -4.75049352645874,
"step": 1790
},
{
"epoch": 0.4710808688824915,
"grad_norm": 15.555473327636719,
"learning_rate": 3.181184197019127e-06,
"logits/chosen": -1.162718653678894,
"logits/rejected": -1.0536067485809326,
"logps/chosen": -632.3372802734375,
"logps/rejected": -805.0314331054688,
"loss": 0.485,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -4.021511554718018,
"rewards/margins": 1.4656221866607666,
"rewards/rejected": -5.487133979797363,
"step": 1800
},
{
"epoch": 0.4710808688824915,
"eval_logits/chosen": -1.1612821817398071,
"eval_logits/rejected": -1.0275636911392212,
"eval_logps/chosen": -666.7680053710938,
"eval_logps/rejected": -767.4140625,
"eval_loss": 0.4885352551937103,
"eval_rewards/accuracies": 0.7524999976158142,
"eval_rewards/chosen": -4.021530628204346,
"eval_rewards/margins": 1.2070072889328003,
"eval_rewards/rejected": -5.228537082672119,
"eval_runtime": 1582.3542,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 1800
},
{
"epoch": 0.47369798482072756,
"grad_norm": 19.3488712310791,
"learning_rate": 3.159175806468126e-06,
"logits/chosen": -1.1995022296905518,
"logits/rejected": -1.0088884830474854,
"logps/chosen": -648.7389526367188,
"logps/rejected": -741.3822021484375,
"loss": 0.4856,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -4.001690864562988,
"rewards/margins": 1.2280538082122803,
"rewards/rejected": -5.229744911193848,
"step": 1810
},
{
"epoch": 0.4763151007589636,
"grad_norm": 12.791426658630371,
"learning_rate": 3.1371123749269804e-06,
"logits/chosen": -1.2882441282272339,
"logits/rejected": -1.2072982788085938,
"logps/chosen": -709.89990234375,
"logps/rejected": -785.1103515625,
"loss": 0.5788,
"rewards/accuracies": 0.6875,
"rewards/chosen": -4.266518592834473,
"rewards/margins": 0.9120771288871765,
"rewards/rejected": -5.178596496582031,
"step": 1820
},
{
"epoch": 0.4789322166971997,
"grad_norm": 13.390192985534668,
"learning_rate": 3.114995744685877e-06,
"logits/chosen": -1.2616273164749146,
"logits/rejected": -1.2146679162979126,
"logps/chosen": -636.3683471679688,
"logps/rejected": -724.3128051757812,
"loss": 0.4985,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.891195774078369,
"rewards/margins": 1.0095303058624268,
"rewards/rejected": -4.900726318359375,
"step": 1830
},
{
"epoch": 0.48154933263543576,
"grad_norm": 6.8147382736206055,
"learning_rate": 3.0928277624770743e-06,
"logits/chosen": -1.444204330444336,
"logits/rejected": -1.2651526927947998,
"logps/chosen": -641.5262451171875,
"logps/rejected": -739.78271484375,
"loss": 0.496,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.5686423778533936,
"rewards/margins": 1.2668758630752563,
"rewards/rejected": -4.835517883300781,
"step": 1840
},
{
"epoch": 0.4841664485736718,
"grad_norm": 7.521138668060303,
"learning_rate": 3.070610279320708e-06,
"logits/chosen": -1.4095209836959839,
"logits/rejected": -1.2402890920639038,
"logps/chosen": -645.0303955078125,
"logps/rejected": -744.2957153320312,
"loss": 0.44,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.5808544158935547,
"rewards/margins": 1.2336914539337158,
"rewards/rejected": -4.81454610824585,
"step": 1850
},
{
"epoch": 0.48678356451190785,
"grad_norm": 5.6627326011657715,
"learning_rate": 3.0483451503702264e-06,
"logits/chosen": -1.3021334409713745,
"logits/rejected": -1.2168514728546143,
"logps/chosen": -695.7568969726562,
"logps/rejected": -792.8882446289062,
"loss": 0.5355,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -4.097269535064697,
"rewards/margins": 1.1725034713745117,
"rewards/rejected": -5.269773006439209,
"step": 1860
},
{
"epoch": 0.48940068045014395,
"grad_norm": 7.5200653076171875,
"learning_rate": 3.0260342347574916e-06,
"logits/chosen": -1.2390494346618652,
"logits/rejected": -1.080314040184021,
"logps/chosen": -702.23681640625,
"logps/rejected": -841.40234375,
"loss": 0.4339,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -4.300023555755615,
"rewards/margins": 1.5825563669204712,
"rewards/rejected": -5.882579803466797,
"step": 1870
},
{
"epoch": 0.49201779638838,
"grad_norm": 11.655389785766602,
"learning_rate": 3.0036793954375358e-06,
"logits/chosen": -1.2104527950286865,
"logits/rejected": -1.0451383590698242,
"logps/chosen": -758.5480346679688,
"logps/rejected": -856.0066528320312,
"loss": 0.4526,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -4.885684967041016,
"rewards/margins": 1.3967931270599365,
"rewards/rejected": -6.282477855682373,
"step": 1880
},
{
"epoch": 0.49463491232661605,
"grad_norm": 12.27940845489502,
"learning_rate": 2.981282499033009e-06,
"logits/chosen": -1.2786993980407715,
"logits/rejected": -1.154975175857544,
"logps/chosen": -723.9029541015625,
"logps/rejected": -814.64892578125,
"loss": 0.5139,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.477005958557129,
"rewards/margins": 1.1614168882369995,
"rewards/rejected": -5.63842248916626,
"step": 1890
},
{
"epoch": 0.49725202826485215,
"grad_norm": 7.594533920288086,
"learning_rate": 2.9588454156783163e-06,
"logits/chosen": -1.377384066581726,
"logits/rejected": -1.2067997455596924,
"logps/chosen": -656.7349243164062,
"logps/rejected": -785.1405029296875,
"loss": 0.4387,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.754779815673828,
"rewards/margins": 1.5012847185134888,
"rewards/rejected": -5.256064414978027,
"step": 1900
},
{
"epoch": 0.49725202826485215,
"eval_logits/chosen": -1.2419021129608154,
"eval_logits/rejected": -1.1075339317321777,
"eval_logps/chosen": -645.9786376953125,
"eval_logps/rejected": -748.0073852539062,
"eval_loss": 0.4897352159023285,
"eval_rewards/accuracies": 0.7459999918937683,
"eval_rewards/chosen": -3.8136374950408936,
"eval_rewards/margins": 1.2208337783813477,
"eval_rewards/rejected": -5.034470558166504,
"eval_runtime": 1582.9561,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 1900
},
{
"epoch": 0.4998691442030882,
"grad_norm": 12.187158584594727,
"learning_rate": 2.9363700188634597e-06,
"logits/chosen": -1.3125779628753662,
"logits/rejected": -1.1751724481582642,
"logps/chosen": -683.4190063476562,
"logps/rejected": -763.2318115234375,
"loss": 0.4992,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -4.184360027313232,
"rewards/margins": 1.1687465906143188,
"rewards/rejected": -5.353107452392578,
"step": 1910
},
{
"epoch": 0.5024862601413242,
"grad_norm": 13.572044372558594,
"learning_rate": 2.9138581852776053e-06,
"logits/chosen": -1.2750508785247803,
"logits/rejected": -1.1450278759002686,
"logps/chosen": -725.9456176757812,
"logps/rejected": -845.9400634765625,
"loss": 0.51,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -4.662615776062012,
"rewards/margins": 1.3297778367996216,
"rewards/rejected": -5.992393970489502,
"step": 1920
},
{
"epoch": 0.5051033760795604,
"grad_norm": 7.278676509857178,
"learning_rate": 2.8913117946523805e-06,
"logits/chosen": -1.3194019794464111,
"logits/rejected": -1.127657175064087,
"logps/chosen": -717.7136840820312,
"logps/rejected": -793.281982421875,
"loss": 0.4772,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.51358699798584,
"rewards/margins": 1.136637806892395,
"rewards/rejected": -5.650224685668945,
"step": 1930
},
{
"epoch": 0.5077204920177963,
"grad_norm": 11.58935260772705,
"learning_rate": 2.8687327296049126e-06,
"logits/chosen": -1.3775203227996826,
"logits/rejected": -1.272014856338501,
"logps/chosen": -640.1255493164062,
"logps/rejected": -746.0568237304688,
"loss": 0.5109,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.8106136322021484,
"rewards/margins": 1.1573151350021362,
"rewards/rejected": -4.967928886413574,
"step": 1940
},
{
"epoch": 0.5103376079560324,
"grad_norm": 11.089369773864746,
"learning_rate": 2.8461228754806376e-06,
"logits/chosen": -1.464210867881775,
"logits/rejected": -1.2921464443206787,
"logps/chosen": -602.5208740234375,
"logps/rejected": -670.5046997070312,
"loss": 0.5279,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.2676520347595215,
"rewards/margins": 0.9385223388671875,
"rewards/rejected": -4.206175327301025,
"step": 1950
},
{
"epoch": 0.5129547238942685,
"grad_norm": 9.643657684326172,
"learning_rate": 2.823484120195865e-06,
"logits/chosen": -1.5414282083511353,
"logits/rejected": -1.351072907447815,
"logps/chosen": -588.3757934570312,
"logps/rejected": -664.9412841796875,
"loss": 0.4289,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.1064059734344482,
"rewards/margins": 1.1096309423446655,
"rewards/rejected": -4.216037273406982,
"step": 1960
},
{
"epoch": 0.5155718398325045,
"grad_norm": 12.020472526550293,
"learning_rate": 2.8008183540801486e-06,
"logits/chosen": -1.2709811925888062,
"logits/rejected": -1.1159567832946777,
"logps/chosen": -681.0741577148438,
"logps/rejected": -740.572509765625,
"loss": 0.5049,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -4.075676918029785,
"rewards/margins": 1.0485068559646606,
"rewards/rejected": -5.124184608459473,
"step": 1970
},
{
"epoch": 0.5181889557707406,
"grad_norm": 14.415773391723633,
"learning_rate": 2.7781274697184353e-06,
"logits/chosen": -1.0758979320526123,
"logits/rejected": -1.0921449661254883,
"logps/chosen": -699.5072631835938,
"logps/rejected": -847.3580932617188,
"loss": 0.53,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.702380180358887,
"rewards/margins": 1.275566816329956,
"rewards/rejected": -5.9779462814331055,
"step": 1980
},
{
"epoch": 0.5208060717089767,
"grad_norm": 8.904159545898438,
"learning_rate": 2.7554133617930397e-06,
"logits/chosen": -1.1817315816879272,
"logits/rejected": -1.0418232679367065,
"logps/chosen": -711.0933227539062,
"logps/rejected": -818.5221557617188,
"loss": 0.5067,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -4.63338565826416,
"rewards/margins": 1.2256052494049072,
"rewards/rejected": -5.858990669250488,
"step": 1990
},
{
"epoch": 0.5234231876472127,
"grad_norm": 13.392192840576172,
"learning_rate": 2.7326779269254363e-06,
"logits/chosen": -1.3758423328399658,
"logits/rejected": -1.1924948692321777,
"logps/chosen": -736.4212646484375,
"logps/rejected": -802.3355712890625,
"loss": 0.4613,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.471527099609375,
"rewards/margins": 1.2190699577331543,
"rewards/rejected": -5.690597057342529,
"step": 2000
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -1.1241540908813477,
"eval_logits/rejected": -0.9858745336532593,
"eval_logps/chosen": -721.045654296875,
"eval_logps/rejected": -814.3307495117188,
"eval_loss": 0.494096577167511,
"eval_rewards/accuracies": 0.7409999966621399,
"eval_rewards/chosen": -4.56430721282959,
"eval_rewards/margins": 1.133396863937378,
"eval_rewards/rejected": -5.697703838348389,
"eval_runtime": 1582.7861,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 2000
},
{
"epoch": 0.5260403035854488,
"grad_norm": 9.46854019165039,
"learning_rate": 2.7099230635178954e-06,
"logits/chosen": -1.2051199674606323,
"logits/rejected": -1.154191017150879,
"logps/chosen": -719.0904541015625,
"logps/rejected": -816.6644287109375,
"loss": 0.5094,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -4.54345703125,
"rewards/margins": 1.0406770706176758,
"rewards/rejected": -5.584134101867676,
"step": 2010
},
{
"epoch": 0.528657419523685,
"grad_norm": 10.047252655029297,
"learning_rate": 2.6871506715949608e-06,
"logits/chosen": -1.3990776538848877,
"logits/rejected": -1.2440322637557983,
"logps/chosen": -642.8494873046875,
"logps/rejected": -737.8421020507812,
"loss": 0.4694,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.8995118141174316,
"rewards/margins": 1.1307276487350464,
"rewards/rejected": -5.030240058898926,
"step": 2020
},
{
"epoch": 0.5312745354619209,
"grad_norm": 9.14461898803711,
"learning_rate": 2.6643626526448063e-06,
"logits/chosen": -1.4923516511917114,
"logits/rejected": -1.3187439441680908,
"logps/chosen": -675.9149169921875,
"logps/rejected": -767.320068359375,
"loss": 0.4364,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.8115932941436768,
"rewards/margins": 1.337866187095642,
"rewards/rejected": -5.149459362030029,
"step": 2030
},
{
"epoch": 0.533891651400157,
"grad_norm": 10.868741989135742,
"learning_rate": 2.6415609094604562e-06,
"logits/chosen": -1.2933242321014404,
"logits/rejected": -1.2320917844772339,
"logps/chosen": -715.2813720703125,
"logps/rejected": -814.0577392578125,
"loss": 0.4635,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -4.448263168334961,
"rewards/margins": 1.1784393787384033,
"rewards/rejected": -5.626702308654785,
"step": 2040
},
{
"epoch": 0.5365087673383931,
"grad_norm": 13.911977767944336,
"learning_rate": 2.618747345980904e-06,
"logits/chosen": -1.2531259059906006,
"logits/rejected": -1.0494439601898193,
"logps/chosen": -810.5252685546875,
"logps/rejected": -883.2147216796875,
"loss": 0.5478,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -5.621924877166748,
"rewards/margins": 1.2322816848754883,
"rewards/rejected": -6.8542070388793945,
"step": 2050
},
{
"epoch": 0.5391258832766291,
"grad_norm": 5.480376243591309,
"learning_rate": 2.595923867132136e-06,
"logits/chosen": -1.294999599456787,
"logits/rejected": -1.1645463705062866,
"logps/chosen": -820.8541870117188,
"logps/rejected": -930.9850463867188,
"loss": 0.4806,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -5.399382591247559,
"rewards/margins": 1.3490444421768188,
"rewards/rejected": -6.748426914215088,
"step": 2060
},
{
"epoch": 0.5417429992148652,
"grad_norm": 8.86125373840332,
"learning_rate": 2.5730923786680672e-06,
"logits/chosen": -1.2584960460662842,
"logits/rejected": -1.2339789867401123,
"logps/chosen": -752.2889404296875,
"logps/rejected": -858.5090942382812,
"loss": 0.545,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -5.008147716522217,
"rewards/margins": 0.9858999252319336,
"rewards/rejected": -5.99404764175415,
"step": 2070
},
{
"epoch": 0.5443601151531012,
"grad_norm": 6.723228931427002,
"learning_rate": 2.5502547870114137e-06,
"logits/chosen": -1.3701547384262085,
"logits/rejected": -1.22635018825531,
"logps/chosen": -713.291748046875,
"logps/rejected": -783.838623046875,
"loss": 0.5206,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.579865455627441,
"rewards/margins": 1.0079872608184814,
"rewards/rejected": -5.587852954864502,
"step": 2080
},
{
"epoch": 0.5469772310913373,
"grad_norm": 25.50057601928711,
"learning_rate": 2.527412999094507e-06,
"logits/chosen": -1.343469262123108,
"logits/rejected": -1.181768774986267,
"logps/chosen": -755.4833374023438,
"logps/rejected": -878.3968505859375,
"loss": 0.4539,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.6183061599731445,
"rewards/margins": 1.3548907041549683,
"rewards/rejected": -5.973196983337402,
"step": 2090
},
{
"epoch": 0.5495943470295734,
"grad_norm": 15.880741119384766,
"learning_rate": 2.504568922200064e-06,
"logits/chosen": -1.2968125343322754,
"logits/rejected": -1.1565604209899902,
"logps/chosen": -708.8741455078125,
"logps/rejected": -828.703125,
"loss": 0.4939,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.676494598388672,
"rewards/margins": 1.3593757152557373,
"rewards/rejected": -6.035870552062988,
"step": 2100
},
{
"epoch": 0.5495943470295734,
"eval_logits/chosen": -1.2699260711669922,
"eval_logits/rejected": -1.1445250511169434,
"eval_logps/chosen": -729.02099609375,
"eval_logps/rejected": -829.7324829101562,
"eval_loss": 0.4877359867095947,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -4.6440606117248535,
"eval_rewards/margins": 1.2076616287231445,
"eval_rewards/rejected": -5.85172176361084,
"eval_runtime": 1582.8734,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 2100
},
{
"epoch": 0.5522114629678094,
"grad_norm": 8.566736221313477,
"learning_rate": 2.4817244638019333e-06,
"logits/chosen": -1.408442735671997,
"logits/rejected": -1.2569968700408936,
"logps/chosen": -724.885498046875,
"logps/rejected": -797.1796875,
"loss": 0.5195,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -4.491677284240723,
"rewards/margins": 1.1345134973526,
"rewards/rejected": -5.626191139221191,
"step": 2110
},
{
"epoch": 0.5548285789060455,
"grad_norm": 12.80423355102539,
"learning_rate": 2.4588815314058155e-06,
"logits/chosen": -1.4017808437347412,
"logits/rejected": -1.3316090106964111,
"logps/chosen": -649.954345703125,
"logps/rejected": -722.61083984375,
"loss": 0.4704,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.044957160949707,
"rewards/margins": 1.0687569379806519,
"rewards/rejected": -5.11371374130249,
"step": 2120
},
{
"epoch": 0.5574456948442816,
"grad_norm": 9.04439926147461,
"learning_rate": 2.4360420323899922e-06,
"logits/chosen": -1.4811222553253174,
"logits/rejected": -1.3649004697799683,
"logps/chosen": -638.9652099609375,
"logps/rejected": -694.5174560546875,
"loss": 0.5689,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.63200306892395,
"rewards/margins": 0.8811875581741333,
"rewards/rejected": -4.513190746307373,
"step": 2130
},
{
"epoch": 0.5600628107825176,
"grad_norm": 7.143604278564453,
"learning_rate": 2.4132078738460585e-06,
"logits/chosen": -1.5229297876358032,
"logits/rejected": -1.3823162317276,
"logps/chosen": -608.2154541015625,
"logps/rejected": -684.1043090820312,
"loss": 0.4571,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.4388651847839355,
"rewards/margins": 1.1439706087112427,
"rewards/rejected": -4.582836151123047,
"step": 2140
},
{
"epoch": 0.5626799267207537,
"grad_norm": 12.864585876464844,
"learning_rate": 2.3903809624196826e-06,
"logits/chosen": -1.4179461002349854,
"logits/rejected": -1.2970348596572876,
"logps/chosen": -606.7810668945312,
"logps/rejected": -675.0203857421875,
"loss": 0.531,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.7310357093811035,
"rewards/margins": 1.007622480392456,
"rewards/rejected": -4.7386579513549805,
"step": 2150
},
{
"epoch": 0.5652970426589898,
"grad_norm": 21.30654525756836,
"learning_rate": 2.3675632041513978e-06,
"logits/chosen": -1.4896656274795532,
"logits/rejected": -1.2570466995239258,
"logps/chosen": -715.8450317382812,
"logps/rejected": -766.1806640625,
"loss": 0.5043,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.289364814758301,
"rewards/margins": 1.15054190158844,
"rewards/rejected": -5.439906597137451,
"step": 2160
},
{
"epoch": 0.5679141585972258,
"grad_norm": 10.769180297851562,
"learning_rate": 2.3447565043174533e-06,
"logits/chosen": -1.3695234060287476,
"logits/rejected": -1.2086126804351807,
"logps/chosen": -677.4241943359375,
"logps/rejected": -737.0889892578125,
"loss": 0.5263,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -4.191761016845703,
"rewards/margins": 0.9847186803817749,
"rewards/rejected": -5.176480293273926,
"step": 2170
},
{
"epoch": 0.5705312745354619,
"grad_norm": 9.414416313171387,
"learning_rate": 2.321962767270724e-06,
"logits/chosen": -1.4165534973144531,
"logits/rejected": -1.287619709968567,
"logps/chosen": -636.1911010742188,
"logps/rejected": -688.1452026367188,
"loss": 0.5339,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.863328456878662,
"rewards/margins": 0.8915898203849792,
"rewards/rejected": -4.754918575286865,
"step": 2180
},
{
"epoch": 0.573148390473698,
"grad_norm": 7.431769847869873,
"learning_rate": 2.299183896281692e-06,
"logits/chosen": -1.3594014644622803,
"logits/rejected": -1.2485125064849854,
"logps/chosen": -615.8621826171875,
"logps/rejected": -705.419921875,
"loss": 0.5003,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.6098227500915527,
"rewards/margins": 0.9186857342720032,
"rewards/rejected": -4.52850866317749,
"step": 2190
},
{
"epoch": 0.575765506411934,
"grad_norm": 7.507672309875488,
"learning_rate": 2.2764217933795297e-06,
"logits/chosen": -1.4956846237182617,
"logits/rejected": -1.378463864326477,
"logps/chosen": -583.9468994140625,
"logps/rejected": -668.4165649414062,
"loss": 0.4782,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.225862503051758,
"rewards/margins": 1.049989104270935,
"rewards/rejected": -4.275851726531982,
"step": 2200
},
{
"epoch": 0.575765506411934,
"eval_logits/chosen": -1.3664623498916626,
"eval_logits/rejected": -1.2438994646072388,
"eval_logps/chosen": -592.4716186523438,
"eval_logps/rejected": -673.7171020507812,
"eval_loss": 0.4812549352645874,
"eval_rewards/accuracies": 0.7484999895095825,
"eval_rewards/chosen": -3.27856707572937,
"eval_rewards/margins": 1.0130008459091187,
"eval_rewards/rejected": -4.291567802429199,
"eval_runtime": 1583.0348,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 2200
},
{
"epoch": 0.5783826223501701,
"grad_norm": 5.585911273956299,
"learning_rate": 2.2536783591932786e-06,
"logits/chosen": -1.5299404859542847,
"logits/rejected": -1.3702765703201294,
"logps/chosen": -628.8416137695312,
"logps/rejected": -697.6521606445312,
"loss": 0.5367,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.5344300270080566,
"rewards/margins": 0.8697922825813293,
"rewards/rejected": -4.404222011566162,
"step": 2210
},
{
"epoch": 0.5809997382884062,
"grad_norm": 10.258821487426758,
"learning_rate": 2.230955492793149e-06,
"logits/chosen": -1.3099156618118286,
"logits/rejected": -1.2557871341705322,
"logps/chosen": -661.6383056640625,
"logps/rejected": -749.5225830078125,
"loss": 0.5803,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.8788514137268066,
"rewards/margins": 1.0083012580871582,
"rewards/rejected": -4.887152671813965,
"step": 2220
},
{
"epoch": 0.5836168542266422,
"grad_norm": 6.397181987762451,
"learning_rate": 2.208255091531947e-06,
"logits/chosen": -1.3133596181869507,
"logits/rejected": -1.229486107826233,
"logps/chosen": -645.9674682617188,
"logps/rejected": -744.9403076171875,
"loss": 0.459,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.690619707107544,
"rewards/margins": 1.3393402099609375,
"rewards/rejected": -5.0299601554870605,
"step": 2230
},
{
"epoch": 0.5862339701648783,
"grad_norm": 17.43966293334961,
"learning_rate": 2.1855790508866435e-06,
"logits/chosen": -1.4071886539459229,
"logits/rejected": -1.3069418668746948,
"logps/chosen": -654.7639770507812,
"logps/rejected": -755.3448486328125,
"loss": 0.5125,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.5785202980041504,
"rewards/margins": 1.1814624071121216,
"rewards/rejected": -4.759982109069824,
"step": 2240
},
{
"epoch": 0.5888510861031143,
"grad_norm": 6.193937301635742,
"learning_rate": 2.162929264300107e-06,
"logits/chosen": -1.4731229543685913,
"logits/rejected": -1.3733452558517456,
"logps/chosen": -552.7366333007812,
"logps/rejected": -673.0077514648438,
"loss": 0.4034,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -2.83868670463562,
"rewards/margins": 1.426152229309082,
"rewards/rejected": -4.264839172363281,
"step": 2250
},
{
"epoch": 0.5914682020413504,
"grad_norm": 11.390615463256836,
"learning_rate": 2.1403076230230006e-06,
"logits/chosen": -1.4542146921157837,
"logits/rejected": -1.3438105583190918,
"logps/chosen": -562.8676147460938,
"logps/rejected": -622.1143798828125,
"loss": 0.5682,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.917558193206787,
"rewards/margins": 0.870224118232727,
"rewards/rejected": -3.7877821922302246,
"step": 2260
},
{
"epoch": 0.5940853179795865,
"grad_norm": 7.33363151550293,
"learning_rate": 2.11771601595586e-06,
"logits/chosen": -1.5080881118774414,
"logits/rejected": -1.3906389474868774,
"logps/chosen": -563.2923583984375,
"logps/rejected": -613.376220703125,
"loss": 0.4969,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.813906669616699,
"rewards/margins": 1.0224448442459106,
"rewards/rejected": -3.836350917816162,
"step": 2270
},
{
"epoch": 0.5967024339178225,
"grad_norm": 16.42394256591797,
"learning_rate": 2.0951563294913737e-06,
"logits/chosen": -1.4487019777297974,
"logits/rejected": -1.2721295356750488,
"logps/chosen": -563.4182739257812,
"logps/rejected": -641.2840576171875,
"loss": 0.4499,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.0196499824523926,
"rewards/margins": 1.0839171409606934,
"rewards/rejected": -4.103566646575928,
"step": 2280
},
{
"epoch": 0.5993195498560586,
"grad_norm": 8.327413558959961,
"learning_rate": 2.0726304473568693e-06,
"logits/chosen": -1.3579986095428467,
"logits/rejected": -1.2439508438110352,
"logps/chosen": -605.872314453125,
"logps/rejected": -684.9946899414062,
"loss": 0.4622,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.47078013420105,
"rewards/margins": 1.1140120029449463,
"rewards/rejected": -4.584792137145996,
"step": 2290
},
{
"epoch": 0.6019366657942947,
"grad_norm": 10.106181144714355,
"learning_rate": 2.050140250457023e-06,
"logits/chosen": -1.427367091178894,
"logits/rejected": -1.222748875617981,
"logps/chosen": -683.3574829101562,
"logps/rejected": -780.22509765625,
"loss": 0.4682,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.059211730957031,
"rewards/margins": 1.3089077472686768,
"rewards/rejected": -5.368119716644287,
"step": 2300
},
{
"epoch": 0.6019366657942947,
"eval_logits/chosen": -1.19517183303833,
"eval_logits/rejected": -1.0666708946228027,
"eval_logps/chosen": -680.9020385742188,
"eval_logps/rejected": -799.8125610351562,
"eval_loss": 0.48845258355140686,
"eval_rewards/accuracies": 0.7455000281333923,
"eval_rewards/chosen": -4.162871837615967,
"eval_rewards/margins": 1.3896511793136597,
"eval_rewards/rejected": -5.552523612976074,
"eval_runtime": 1582.9401,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 2300
},
{
"epoch": 0.6045537817325307,
"grad_norm": 16.23677635192871,
"learning_rate": 2.0276876167168042e-06,
"logits/chosen": -1.1654255390167236,
"logits/rejected": -1.065344214439392,
"logps/chosen": -639.0543212890625,
"logps/rejected": -735.3461303710938,
"loss": 0.5693,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.146035194396973,
"rewards/margins": 1.2893322706222534,
"rewards/rejected": -5.435367584228516,
"step": 2310
},
{
"epoch": 0.6071708976707668,
"grad_norm": 8.372858047485352,
"learning_rate": 2.0052744209246682e-06,
"logits/chosen": -1.2929928302764893,
"logits/rejected": -1.1719223260879517,
"logps/chosen": -675.3624877929688,
"logps/rejected": -769.5217895507812,
"loss": 0.4909,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.201422214508057,
"rewards/margins": 1.270340919494629,
"rewards/rejected": -5.471763610839844,
"step": 2320
},
{
"epoch": 0.6097880136090029,
"grad_norm": 10.845846176147461,
"learning_rate": 1.9829025345760127e-06,
"logits/chosen": -1.3178662061691284,
"logits/rejected": -1.2613779306411743,
"logps/chosen": -671.625732421875,
"logps/rejected": -765.293701171875,
"loss": 0.5304,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.9272491931915283,
"rewards/margins": 0.9973451495170593,
"rewards/rejected": -4.924594402313232,
"step": 2330
},
{
"epoch": 0.6124051295472389,
"grad_norm": 8.701257705688477,
"learning_rate": 1.9605738257169115e-06,
"logits/chosen": -1.2995572090148926,
"logits/rejected": -1.156830906867981,
"logps/chosen": -596.4144897460938,
"logps/rejected": -711.1951904296875,
"loss": 0.4876,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.6349895000457764,
"rewards/margins": 1.2283639907836914,
"rewards/rejected": -4.863353252410889,
"step": 2340
},
{
"epoch": 0.615022245485475,
"grad_norm": 10.194114685058594,
"learning_rate": 1.9382901587881275e-06,
"logits/chosen": -1.3707363605499268,
"logits/rejected": -1.2504899501800537,
"logps/chosen": -594.62109375,
"logps/rejected": -689.7848510742188,
"loss": 0.4225,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.3595714569091797,
"rewards/margins": 1.2741920948028564,
"rewards/rejected": -4.633763313293457,
"step": 2350
},
{
"epoch": 0.6176393614237111,
"grad_norm": 13.661014556884766,
"learning_rate": 1.916053394469437e-06,
"logits/chosen": -1.4088404178619385,
"logits/rejected": -1.216277837753296,
"logps/chosen": -611.9348754882812,
"logps/rejected": -714.869873046875,
"loss": 0.4988,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.4869492053985596,
"rewards/margins": 1.1751216650009155,
"rewards/rejected": -4.662070274353027,
"step": 2360
},
{
"epoch": 0.6202564773619471,
"grad_norm": 8.576478004455566,
"learning_rate": 1.8938653895242604e-06,
"logits/chosen": -1.350233793258667,
"logits/rejected": -1.172080159187317,
"logps/chosen": -622.2972412109375,
"logps/rejected": -720.3619384765625,
"loss": 0.4285,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.581017255783081,
"rewards/margins": 1.2686632871627808,
"rewards/rejected": -4.8496809005737305,
"step": 2370
},
{
"epoch": 0.6228735933001832,
"grad_norm": 10.128191947937012,
"learning_rate": 1.8717279966446267e-06,
"logits/chosen": -1.1970648765563965,
"logits/rejected": -1.1168218851089478,
"logps/chosen": -642.9293823242188,
"logps/rejected": -753.526123046875,
"loss": 0.4417,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.944753646850586,
"rewards/margins": 1.2011665105819702,
"rewards/rejected": -5.145920753479004,
"step": 2380
},
{
"epoch": 0.6254907092384192,
"grad_norm": 6.274438381195068,
"learning_rate": 1.8496430642964698e-06,
"logits/chosen": -1.257615327835083,
"logits/rejected": -1.1408653259277344,
"logps/chosen": -668.20361328125,
"logps/rejected": -755.4804077148438,
"loss": 0.504,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.9589202404022217,
"rewards/margins": 1.1090304851531982,
"rewards/rejected": -5.06795072555542,
"step": 2390
},
{
"epoch": 0.6281078251766553,
"grad_norm": 6.87614107131958,
"learning_rate": 1.827612436565286e-06,
"logits/chosen": -1.2781522274017334,
"logits/rejected": -1.12553071975708,
"logps/chosen": -630.0660400390625,
"logps/rejected": -739.07177734375,
"loss": 0.4582,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.671879291534424,
"rewards/margins": 1.2957801818847656,
"rewards/rejected": -4.967658996582031,
"step": 2400
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -1.1734751462936401,
"eval_logits/rejected": -1.0476148128509521,
"eval_logps/chosen": -638.953369140625,
"eval_logps/rejected": -742.967529296875,
"eval_loss": 0.4858860671520233,
"eval_rewards/accuracies": 0.7459999918937683,
"eval_rewards/chosen": -3.74338436126709,
"eval_rewards/margins": 1.2406867742538452,
"eval_rewards/rejected": -4.984071254730225,
"eval_runtime": 1583.062,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 2400
},
{
"epoch": 0.6307249411148914,
"grad_norm": 26.042829513549805,
"learning_rate": 1.8056379530021492e-06,
"logits/chosen": -1.3303982019424438,
"logits/rejected": -1.2409372329711914,
"logps/chosen": -617.5029296875,
"logps/rejected": -694.068359375,
"loss": 0.5426,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.733428955078125,
"rewards/margins": 1.014293909072876,
"rewards/rejected": -4.747722625732422,
"step": 2410
},
{
"epoch": 0.6333420570531274,
"grad_norm": 28.547435760498047,
"learning_rate": 1.7837214484701154e-06,
"logits/chosen": -1.376987099647522,
"logits/rejected": -1.2434319257736206,
"logps/chosen": -573.7362060546875,
"logps/rejected": -679.9476318359375,
"loss": 0.457,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.2147457599639893,
"rewards/margins": 1.295339584350586,
"rewards/rejected": -4.510085105895996,
"step": 2420
},
{
"epoch": 0.6359591729913635,
"grad_norm": 13.484001159667969,
"learning_rate": 1.7618647529910043e-06,
"logits/chosen": -1.3759535551071167,
"logits/rejected": -1.2478660345077515,
"logps/chosen": -576.2813720703125,
"logps/rejected": -690.5980834960938,
"loss": 0.4886,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.1959147453308105,
"rewards/margins": 1.2616323232650757,
"rewards/rejected": -4.457546710968018,
"step": 2430
},
{
"epoch": 0.6385762889295996,
"grad_norm": 9.300795555114746,
"learning_rate": 1.7400696915925996e-06,
"logits/chosen": -1.3617148399353027,
"logits/rejected": -1.17989182472229,
"logps/chosen": -608.29052734375,
"logps/rejected": -669.2510986328125,
"loss": 0.515,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.3776187896728516,
"rewards/margins": 1.1922826766967773,
"rewards/rejected": -4.569901943206787,
"step": 2440
},
{
"epoch": 0.6411934048678356,
"grad_norm": 14.334630012512207,
"learning_rate": 1.718338084156254e-06,
"logits/chosen": -1.30707585811615,
"logits/rejected": -1.1524229049682617,
"logps/chosen": -616.8284912109375,
"logps/rejected": -702.1578369140625,
"loss": 0.4582,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.30047869682312,
"rewards/margins": 1.238687515258789,
"rewards/rejected": -4.53916597366333,
"step": 2450
},
{
"epoch": 0.6438105208060717,
"grad_norm": 12.695516586303711,
"learning_rate": 1.6966717452649372e-06,
"logits/chosen": -1.3930574655532837,
"logits/rejected": -1.2220638990402222,
"logps/chosen": -628.47802734375,
"logps/rejected": -705.5762939453125,
"loss": 0.434,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.5162925720214844,
"rewards/margins": 1.2897061109542847,
"rewards/rejected": -4.805999279022217,
"step": 2460
},
{
"epoch": 0.6464276367443078,
"grad_norm": 8.12979793548584,
"learning_rate": 1.6750724840517103e-06,
"logits/chosen": -1.309929609298706,
"logits/rejected": -1.2359564304351807,
"logps/chosen": -624.3968505859375,
"logps/rejected": -734.6637573242188,
"loss": 0.5156,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.716935634613037,
"rewards/margins": 1.0615085363388062,
"rewards/rejected": -4.778443813323975,
"step": 2470
},
{
"epoch": 0.6490447526825438,
"grad_norm": 12.232304573059082,
"learning_rate": 1.6535421040486686e-06,
"logits/chosen": -1.1189963817596436,
"logits/rejected": -1.0223686695098877,
"logps/chosen": -641.5491943359375,
"logps/rejected": -750.5919189453125,
"loss": 0.4207,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.883084535598755,
"rewards/margins": 1.4329438209533691,
"rewards/rejected": -5.316028118133545,
"step": 2480
},
{
"epoch": 0.6516618686207799,
"grad_norm": 12.076385498046875,
"learning_rate": 1.6320824030363458e-06,
"logits/chosen": -1.2061289548873901,
"logits/rejected": -1.1477553844451904,
"logps/chosen": -615.6104736328125,
"logps/rejected": -732.1640625,
"loss": 0.4687,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.807772159576416,
"rewards/margins": 1.32088303565979,
"rewards/rejected": -5.1286540031433105,
"step": 2490
},
{
"epoch": 0.654278984559016,
"grad_norm": 16.170991897583008,
"learning_rate": 1.6106951728936028e-06,
"logits/chosen": -1.3174595832824707,
"logits/rejected": -1.1845439672470093,
"logps/chosen": -622.0203857421875,
"logps/rejected": -727.765869140625,
"loss": 0.4948,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.6488277912139893,
"rewards/margins": 1.1369919776916504,
"rewards/rejected": -4.785820007324219,
"step": 2500
},
{
"epoch": 0.654278984559016,
"eval_logits/chosen": -1.1780551671981812,
"eval_logits/rejected": -1.0471770763397217,
"eval_logps/chosen": -625.8917846679688,
"eval_logps/rejected": -728.1769409179688,
"eval_loss": 0.48169419169425964,
"eval_rewards/accuracies": 0.7425000071525574,
"eval_rewards/chosen": -3.6127684116363525,
"eval_rewards/margins": 1.223397970199585,
"eval_rewards/rejected": -4.8361663818359375,
"eval_runtime": 1582.5825,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 2500
},
{
"epoch": 0.656896100497252,
"grad_norm": 8.167618751525879,
"learning_rate": 1.5893821994479996e-06,
"logits/chosen": -1.3206676244735718,
"logits/rejected": -1.2007882595062256,
"logps/chosen": -629.7269287109375,
"logps/rejected": -714.9498901367188,
"loss": 0.4634,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.523585796356201,
"rewards/margins": 1.2378898859024048,
"rewards/rejected": -4.761475563049316,
"step": 2510
},
{
"epoch": 0.6595132164354881,
"grad_norm": 8.540782928466797,
"learning_rate": 1.5681452623266868e-06,
"logits/chosen": -1.3026401996612549,
"logits/rejected": -1.052721381187439,
"logps/chosen": -674.0380249023438,
"logps/rejected": -757.1068115234375,
"loss": 0.4692,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.850877046585083,
"rewards/margins": 1.4135150909423828,
"rewards/rejected": -5.264391899108887,
"step": 2520
},
{
"epoch": 0.6621303323737242,
"grad_norm": 7.330903053283691,
"learning_rate": 1.5469861348078014e-06,
"logits/chosen": -1.2844129800796509,
"logits/rejected": -1.130133867263794,
"logps/chosen": -647.9868774414062,
"logps/rejected": -788.3101806640625,
"loss": 0.4094,
"rewards/accuracies": 0.78125,
"rewards/chosen": -4.012332439422607,
"rewards/margins": 1.5098378658294678,
"rewards/rejected": -5.522170066833496,
"step": 2530
},
{
"epoch": 0.6647474483119602,
"grad_norm": 11.852506637573242,
"learning_rate": 1.5259065836724035e-06,
"logits/chosen": -1.1574808359146118,
"logits/rejected": -1.0841835737228394,
"logps/chosen": -666.4259033203125,
"logps/rejected": -808.6561279296875,
"loss": 0.415,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -4.217291831970215,
"rewards/margins": 1.4833755493164062,
"rewards/rejected": -5.700667381286621,
"step": 2540
},
{
"epoch": 0.6673645642501963,
"grad_norm": 28.528125762939453,
"learning_rate": 1.5049083690569456e-06,
"logits/chosen": -1.2304925918579102,
"logits/rejected": -1.1360973119735718,
"logps/chosen": -650.3413696289062,
"logps/rejected": -789.5084228515625,
"loss": 0.4926,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -4.129073619842529,
"rewards/margins": 1.4358268976211548,
"rewards/rejected": -5.5649003982543945,
"step": 2550
},
{
"epoch": 0.6699816801884323,
"grad_norm": 12.270853042602539,
"learning_rate": 1.4839932443063057e-06,
"logits/chosen": -1.2561757564544678,
"logits/rejected": -1.1032966375350952,
"logps/chosen": -674.009765625,
"logps/rejected": -758.3587036132812,
"loss": 0.4531,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.820760726928711,
"rewards/margins": 1.4071722030639648,
"rewards/rejected": -5.227932453155518,
"step": 2560
},
{
"epoch": 0.6725987961266684,
"grad_norm": 18.18036460876465,
"learning_rate": 1.4631629558273803e-06,
"logits/chosen": -1.313661813735962,
"logits/rejected": -1.1960715055465698,
"logps/chosen": -597.9262084960938,
"logps/rejected": -680.4265747070312,
"loss": 0.6359,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -3.5625762939453125,
"rewards/margins": 0.9416546821594238,
"rewards/rejected": -4.504230976104736,
"step": 2570
},
{
"epoch": 0.6752159120649045,
"grad_norm": 5.7035298347473145,
"learning_rate": 1.4424192429432657e-06,
"logits/chosen": -1.433161973953247,
"logits/rejected": -1.3360836505889893,
"logps/chosen": -541.0769653320312,
"logps/rejected": -661.7882690429688,
"loss": 0.4728,
"rewards/accuracies": 0.71875,
"rewards/chosen": -2.8529908657073975,
"rewards/margins": 1.1742432117462158,
"rewards/rejected": -4.027234077453613,
"step": 2580
},
{
"epoch": 0.6778330280031405,
"grad_norm": 10.68341064453125,
"learning_rate": 1.421763837748016e-06,
"logits/chosen": -1.3998830318450928,
"logits/rejected": -1.3137894868850708,
"logps/chosen": -547.9580078125,
"logps/rejected": -663.7359619140625,
"loss": 0.4378,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -2.9742558002471924,
"rewards/margins": 1.242157220840454,
"rewards/rejected": -4.2164130210876465,
"step": 2590
},
{
"epoch": 0.6804501439413766,
"grad_norm": 9.300664901733398,
"learning_rate": 1.401198464962021e-06,
"logits/chosen": -1.3978197574615479,
"logits/rejected": -1.243277668952942,
"logps/chosen": -609.5036010742188,
"logps/rejected": -684.1070556640625,
"loss": 0.4588,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.410266876220703,
"rewards/margins": 1.1315726041793823,
"rewards/rejected": -4.541839599609375,
"step": 2600
},
{
"epoch": 0.6804501439413766,
"eval_logits/chosen": -1.239994764328003,
"eval_logits/rejected": -1.1158030033111572,
"eval_logps/chosen": -624.4171142578125,
"eval_logps/rejected": -730.1331176757812,
"eval_loss": 0.48536595702171326,
"eval_rewards/accuracies": 0.7429999709129333,
"eval_rewards/chosen": -3.598021984100342,
"eval_rewards/margins": 1.2577056884765625,
"eval_rewards/rejected": -4.855727672576904,
"eval_runtime": 1583.4838,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 2600
},
{
"epoch": 0.6830672598796127,
"grad_norm": 7.191524028778076,
"learning_rate": 1.3807248417879896e-06,
"logits/chosen": -1.4120800495147705,
"logits/rejected": -1.2935571670532227,
"logps/chosen": -636.7886962890625,
"logps/rejected": -754.2188110351562,
"loss": 0.4331,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.6377861499786377,
"rewards/margins": 1.3760086297988892,
"rewards/rejected": -5.013794898986816,
"step": 2610
},
{
"epoch": 0.6856843758178487,
"grad_norm": 31.748910903930664,
"learning_rate": 1.3603446777675665e-06,
"logits/chosen": -1.2162339687347412,
"logits/rejected": -1.0921854972839355,
"logps/chosen": -682.8983154296875,
"logps/rejected": -786.9542846679688,
"loss": 0.5535,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -4.19767427444458,
"rewards/margins": 1.2615830898284912,
"rewards/rejected": -5.45925760269165,
"step": 2620
},
{
"epoch": 0.6883014917560848,
"grad_norm": 5.8050408363342285,
"learning_rate": 1.3400596746385817e-06,
"logits/chosen": -1.3429977893829346,
"logits/rejected": -1.1918339729309082,
"logps/chosen": -684.1312255859375,
"logps/rejected": -784.0355224609375,
"loss": 0.4981,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -4.149628639221191,
"rewards/margins": 1.270444393157959,
"rewards/rejected": -5.420073509216309,
"step": 2630
},
{
"epoch": 0.6909186076943209,
"grad_norm": 11.632438659667969,
"learning_rate": 1.3198715261929587e-06,
"logits/chosen": -1.3046501874923706,
"logits/rejected": -1.1579644680023193,
"logps/chosen": -673.1400146484375,
"logps/rejected": -799.3960571289062,
"loss": 0.4126,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -4.367093086242676,
"rewards/margins": 1.3987313508987427,
"rewards/rejected": -5.765824317932129,
"step": 2640
},
{
"epoch": 0.6935357236325569,
"grad_norm": 11.619688034057617,
"learning_rate": 1.2997819181352823e-06,
"logits/chosen": -1.3125207424163818,
"logits/rejected": -1.147236704826355,
"logps/chosen": -736.60009765625,
"logps/rejected": -873.1632690429688,
"loss": 0.425,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -4.3982462882995605,
"rewards/margins": 1.6112134456634521,
"rewards/rejected": -6.009459495544434,
"step": 2650
},
{
"epoch": 0.696152839570793,
"grad_norm": 18.199705123901367,
"learning_rate": 1.2797925279420454e-06,
"logits/chosen": -1.274456262588501,
"logits/rejected": -1.1420204639434814,
"logps/chosen": -735.5014038085938,
"logps/rejected": -867.34912109375,
"loss": 0.4631,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -4.637304782867432,
"rewards/margins": 1.4379584789276123,
"rewards/rejected": -6.075263023376465,
"step": 2660
},
{
"epoch": 0.6987699555090291,
"grad_norm": 12.624371528625488,
"learning_rate": 1.2599050247215764e-06,
"logits/chosen": -1.2420213222503662,
"logits/rejected": -1.128442406654358,
"logps/chosen": -703.1854248046875,
"logps/rejected": -820.5750732421875,
"loss": 0.4843,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -4.453032493591309,
"rewards/margins": 1.4042234420776367,
"rewards/rejected": -5.857255935668945,
"step": 2670
},
{
"epoch": 0.7013870714472651,
"grad_norm": 12.940664291381836,
"learning_rate": 1.2401210690746705e-06,
"logits/chosen": -1.2617765665054321,
"logits/rejected": -1.117337703704834,
"logps/chosen": -708.4508056640625,
"logps/rejected": -800.98046875,
"loss": 0.505,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -4.413388252258301,
"rewards/margins": 1.2580486536026,
"rewards/rejected": -5.671436786651611,
"step": 2680
},
{
"epoch": 0.7040041873855012,
"grad_norm": 9.87313175201416,
"learning_rate": 1.2204423129559306e-06,
"logits/chosen": -1.303589105606079,
"logits/rejected": -1.2480074167251587,
"logps/chosen": -684.2908935546875,
"logps/rejected": -803.417724609375,
"loss": 0.4903,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -4.188497543334961,
"rewards/margins": 1.2328007221221924,
"rewards/rejected": -5.421298027038574,
"step": 2690
},
{
"epoch": 0.7066213033237373,
"grad_norm": 20.87190818786621,
"learning_rate": 1.20087039953583e-06,
"logits/chosen": -1.3425134420394897,
"logits/rejected": -1.2293293476104736,
"logps/chosen": -675.4927368164062,
"logps/rejected": -778.1267700195312,
"loss": 0.5354,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.113883018493652,
"rewards/margins": 1.2898164987564087,
"rewards/rejected": -5.403698921203613,
"step": 2700
},
{
"epoch": 0.7066213033237373,
"eval_logits/chosen": -1.1949888467788696,
"eval_logits/rejected": -1.071972131729126,
"eval_logps/chosen": -677.2342529296875,
"eval_logps/rejected": -781.0516967773438,
"eval_loss": 0.4856647849082947,
"eval_rewards/accuracies": 0.7444999814033508,
"eval_rewards/chosen": -4.126193523406982,
"eval_rewards/margins": 1.2387206554412842,
"eval_rewards/rejected": -5.3649139404296875,
"eval_runtime": 1583.0821,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 2700
},
{
"epoch": 0.7092384192619733,
"grad_norm": 11.783329963684082,
"learning_rate": 1.181406963063507e-06,
"logits/chosen": -1.2694941759109497,
"logits/rejected": -1.2013720273971558,
"logps/chosen": -656.7574462890625,
"logps/rejected": -778.4323120117188,
"loss": 0.4917,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.9103951454162598,
"rewards/margins": 1.222727656364441,
"rewards/rejected": -5.13312292098999,
"step": 2710
},
{
"epoch": 0.7118555352002094,
"grad_norm": 7.543945789337158,
"learning_rate": 1.1620536287303052e-06,
"logits/chosen": -1.3921228647232056,
"logits/rejected": -1.2788370847702026,
"logps/chosen": -665.0604248046875,
"logps/rejected": -739.6992797851562,
"loss": 0.5404,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.739894390106201,
"rewards/margins": 1.048128604888916,
"rewards/rejected": -4.788022994995117,
"step": 2720
},
{
"epoch": 0.7144726511384454,
"grad_norm": 10.31935977935791,
"learning_rate": 1.1428120125340717e-06,
"logits/chosen": -1.3381649255752563,
"logits/rejected": -1.1965323686599731,
"logps/chosen": -610.3319091796875,
"logps/rejected": -734.8633422851562,
"loss": 0.382,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.563275098800659,
"rewards/margins": 1.659253478050232,
"rewards/rejected": -5.222528457641602,
"step": 2730
},
{
"epoch": 0.7170897670766815,
"grad_norm": 8.897191047668457,
"learning_rate": 1.123683721144223e-06,
"logits/chosen": -1.3344717025756836,
"logits/rejected": -1.2315446138381958,
"logps/chosen": -659.2275390625,
"logps/rejected": -775.5537109375,
"loss": 0.4248,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -3.7568366527557373,
"rewards/margins": 1.4870188236236572,
"rewards/rejected": -5.243854999542236,
"step": 2740
},
{
"epoch": 0.7197068830149176,
"grad_norm": 7.804056167602539,
"learning_rate": 1.1046703517675848e-06,
"logits/chosen": -1.3492381572723389,
"logits/rejected": -1.2712219953536987,
"logps/chosen": -637.6734619140625,
"logps/rejected": -759.937744140625,
"loss": 0.5158,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.8261501789093018,
"rewards/margins": 1.1614983081817627,
"rewards/rejected": -4.987648963928223,
"step": 2750
},
{
"epoch": 0.7223239989531536,
"grad_norm": 12.590385437011719,
"learning_rate": 1.085773492015028e-06,
"logits/chosen": -1.3491287231445312,
"logits/rejected": -1.1939796209335327,
"logps/chosen": -623.6396484375,
"logps/rejected": -731.343994140625,
"loss": 0.4266,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.7736754417419434,
"rewards/margins": 1.4199963808059692,
"rewards/rejected": -5.193671226501465,
"step": 2760
},
{
"epoch": 0.7249411148913897,
"grad_norm": 18.164583206176758,
"learning_rate": 1.0669947197689034e-06,
"logits/chosen": -1.3109443187713623,
"logits/rejected": -1.1731343269348145,
"logps/chosen": -682.9834594726562,
"logps/rejected": -781.8642578125,
"loss": 0.5067,
"rewards/accuracies": 0.75,
"rewards/chosen": -4.103693962097168,
"rewards/margins": 1.2765446901321411,
"rewards/rejected": -5.3802385330200195,
"step": 2770
},
{
"epoch": 0.7275582308296258,
"grad_norm": 9.202481269836426,
"learning_rate": 1.048335603051291e-06,
"logits/chosen": -1.3194820880889893,
"logits/rejected": -1.1984010934829712,
"logps/chosen": -711.1646728515625,
"logps/rejected": -828.0762939453125,
"loss": 0.4534,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -4.235322952270508,
"rewards/margins": 1.4504263401031494,
"rewards/rejected": -5.685749530792236,
"step": 2780
},
{
"epoch": 0.7301753467678618,
"grad_norm": 11.143651962280273,
"learning_rate": 1.0297976998930665e-06,
"logits/chosen": -1.3201286792755127,
"logits/rejected": -1.204737901687622,
"logps/chosen": -666.336669921875,
"logps/rejected": -790.0221557617188,
"loss": 0.454,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -4.058554172515869,
"rewards/margins": 1.5139802694320679,
"rewards/rejected": -5.572534561157227,
"step": 2790
},
{
"epoch": 0.7327924627060979,
"grad_norm": 9.879022598266602,
"learning_rate": 1.0113825582038078e-06,
"logits/chosen": -1.3698309659957886,
"logits/rejected": -1.259245753288269,
"logps/chosen": -664.8604736328125,
"logps/rejected": -768.4254760742188,
"loss": 0.4782,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.9975292682647705,
"rewards/margins": 1.2025644779205322,
"rewards/rejected": -5.2000932693481445,
"step": 2800
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -1.2732905149459839,
"eval_logits/rejected": -1.1544142961502075,
"eval_logps/chosen": -650.2978515625,
"eval_logps/rejected": -755.7132568359375,
"eval_loss": 0.48224759101867676,
"eval_rewards/accuracies": 0.7459999918937683,
"eval_rewards/chosen": -3.85683012008667,
"eval_rewards/margins": 1.2546993494033813,
"eval_rewards/rejected": -5.111529350280762,
"eval_runtime": 1582.0799,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 2800
},
{
"epoch": 0.735409578644334,
"grad_norm": 10.106141090393066,
"learning_rate": 9.930917156425477e-07,
"logits/chosen": -1.3600491285324097,
"logits/rejected": -1.2630369663238525,
"logps/chosen": -658.0189208984375,
"logps/rejected": -777.9553833007812,
"loss": 0.5103,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.969266176223755,
"rewards/margins": 1.2290141582489014,
"rewards/rejected": -5.198280334472656,
"step": 2810
},
{
"epoch": 0.73802669458257,
"grad_norm": 17.606903076171875,
"learning_rate": 9.749266994893756e-07,
"logits/chosen": -1.3096013069152832,
"logits/rejected": -1.1738802194595337,
"logps/chosen": -621.0303955078125,
"logps/rejected": -706.1574096679688,
"loss": 0.5693,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.8049144744873047,
"rewards/margins": 0.9849715232849121,
"rewards/rejected": -4.789886474609375,
"step": 2820
},
{
"epoch": 0.7406438105208061,
"grad_norm": 14.5999174118042,
"learning_rate": 9.56889026517913e-07,
"logits/chosen": -1.3722821474075317,
"logits/rejected": -1.2702796459197998,
"logps/chosen": -643.744873046875,
"logps/rejected": -727.7888793945312,
"loss": 0.5005,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.8248817920684814,
"rewards/margins": 1.1005748510360718,
"rewards/rejected": -4.925456523895264,
"step": 2830
},
{
"epoch": 0.7432609264590422,
"grad_norm": 9.080976486206055,
"learning_rate": 9.389802028686617e-07,
"logits/chosen": -1.44389009475708,
"logits/rejected": -1.3529036045074463,
"logps/chosen": -636.1320190429688,
"logps/rejected": -695.2192993164062,
"loss": 0.5432,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.754615068435669,
"rewards/margins": 0.9210684895515442,
"rewards/rejected": -4.67568302154541,
"step": 2840
},
{
"epoch": 0.7458780423972782,
"grad_norm": 10.234068870544434,
"learning_rate": 9.212017239232427e-07,
"logits/chosen": -1.3670417070388794,
"logits/rejected": -1.234607458114624,
"logps/chosen": -646.2342529296875,
"logps/rejected": -762.1930541992188,
"loss": 0.4571,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.7331900596618652,
"rewards/margins": 1.3892850875854492,
"rewards/rejected": -5.122475624084473,
"step": 2850
},
{
"epoch": 0.7484951583355143,
"grad_norm": 9.236120223999023,
"learning_rate": 9.03555074179533e-07,
"logits/chosen": -1.3387397527694702,
"logits/rejected": -1.3080878257751465,
"logps/chosen": -629.9612426757812,
"logps/rejected": -772.1923828125,
"loss": 0.4375,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.732499599456787,
"rewards/margins": 1.3648207187652588,
"rewards/rejected": -5.097320556640625,
"step": 2860
},
{
"epoch": 0.7511122742737504,
"grad_norm": 9.961119651794434,
"learning_rate": 8.860417271277067e-07,
"logits/chosen": -1.4475278854370117,
"logits/rejected": -1.3916738033294678,
"logps/chosen": -651.3665161132812,
"logps/rejected": -753.1768798828125,
"loss": 0.4777,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.8218841552734375,
"rewards/margins": 1.0978153944015503,
"rewards/rejected": -4.919699192047119,
"step": 2870
},
{
"epoch": 0.7537293902119864,
"grad_norm": 11.848674774169922,
"learning_rate": 8.686631451272029e-07,
"logits/chosen": -1.408536672592163,
"logits/rejected": -1.2724813222885132,
"logps/chosen": -660.1790161132812,
"logps/rejected": -770.7523193359375,
"loss": 0.4821,
"rewards/accuracies": 0.71875,
"rewards/chosen": -4.0685577392578125,
"rewards/margins": 1.3600984811782837,
"rewards/rejected": -5.428656101226807,
"step": 2880
},
{
"epoch": 0.7563465061502225,
"grad_norm": 8.26095199584961,
"learning_rate": 8.514207792846168e-07,
"logits/chosen": -1.429696798324585,
"logits/rejected": -1.319048523902893,
"logps/chosen": -654.4461669921875,
"logps/rejected": -754.9195556640625,
"loss": 0.4719,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -4.036130905151367,
"rewards/margins": 1.2892920970916748,
"rewards/rejected": -5.325423240661621,
"step": 2890
},
{
"epoch": 0.7589636220884585,
"grad_norm": 8.51677417755127,
"learning_rate": 8.343160693325356e-07,
"logits/chosen": -1.330289602279663,
"logits/rejected": -1.227264642715454,
"logps/chosen": -666.0051879882812,
"logps/rejected": -786.2086791992188,
"loss": 0.5135,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -4.104722023010254,
"rewards/margins": 1.2456867694854736,
"rewards/rejected": -5.350409507751465,
"step": 2900
},
{
"epoch": 0.7589636220884585,
"eval_logits/chosen": -1.296103835105896,
"eval_logits/rejected": -1.1772661209106445,
"eval_logps/chosen": -659.640625,
"eval_logps/rejected": -767.6243896484375,
"eval_loss": 0.4806530177593231,
"eval_rewards/accuracies": 0.7475000023841858,
"eval_rewards/chosen": -3.9502570629119873,
"eval_rewards/margins": 1.2803831100463867,
"eval_rewards/rejected": -5.230639457702637,
"eval_runtime": 1581.8876,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 2900
},
{
"epoch": 0.7615807380266946,
"grad_norm": 7.22358512878418,
"learning_rate": 8.173504435093174e-07,
"logits/chosen": -1.3467257022857666,
"logits/rejected": -1.1828867197036743,
"logps/chosen": -630.05615234375,
"logps/rejected": -734.7799072265625,
"loss": 0.4723,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.8855319023132324,
"rewards/margins": 1.3687862157821655,
"rewards/rejected": -5.254318714141846,
"step": 2910
},
{
"epoch": 0.7641978539649307,
"grad_norm": 8.41374683380127,
"learning_rate": 8.00525318439836e-07,
"logits/chosen": -1.383126974105835,
"logits/rejected": -1.2696807384490967,
"logps/chosen": -666.6580810546875,
"logps/rejected": -765.6370849609375,
"loss": 0.5407,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.899862766265869,
"rewards/margins": 1.0650051832199097,
"rewards/rejected": -4.964868068695068,
"step": 2920
},
{
"epoch": 0.7668149699031667,
"grad_norm": 7.640721321105957,
"learning_rate": 7.838420990171927e-07,
"logits/chosen": -1.4619683027267456,
"logits/rejected": -1.3194328546524048,
"logps/chosen": -648.1679077148438,
"logps/rejected": -739.3377685546875,
"loss": 0.503,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.7884438037872314,
"rewards/margins": 1.1673457622528076,
"rewards/rejected": -4.955790042877197,
"step": 2930
},
{
"epoch": 0.7694320858414028,
"grad_norm": 6.787487030029297,
"learning_rate": 7.673021782854084e-07,
"logits/chosen": -1.3260360956192017,
"logits/rejected": -1.2002280950546265,
"logps/chosen": -652.7042236328125,
"logps/rejected": -749.501220703125,
"loss": 0.4834,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.9212698936462402,
"rewards/margins": 1.38629150390625,
"rewards/rejected": -5.307560920715332,
"step": 2940
},
{
"epoch": 0.7720492017796389,
"grad_norm": 12.033089637756348,
"learning_rate": 7.509069373231039e-07,
"logits/chosen": -1.3529894351959229,
"logits/rejected": -1.241321086883545,
"logps/chosen": -645.5877075195312,
"logps/rejected": -721.0560913085938,
"loss": 0.5339,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.9297664165496826,
"rewards/margins": 1.0031468868255615,
"rewards/rejected": -4.932913780212402,
"step": 2950
},
{
"epoch": 0.7746663177178749,
"grad_norm": 5.963113784790039,
"learning_rate": 7.346577451281822e-07,
"logits/chosen": -1.3783152103424072,
"logits/rejected": -1.3106451034545898,
"logps/chosen": -633.4246215820312,
"logps/rejected": -740.6929931640625,
"loss": 0.4649,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.7124524116516113,
"rewards/margins": 1.314141035079956,
"rewards/rejected": -5.0265936851501465,
"step": 2960
},
{
"epoch": 0.777283433656111,
"grad_norm": 10.886439323425293,
"learning_rate": 7.185559585035138e-07,
"logits/chosen": -1.4251292943954468,
"logits/rejected": -1.27992844581604,
"logps/chosen": -660.6417846679688,
"logps/rejected": -765.4755249023438,
"loss": 0.4755,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.779376268386841,
"rewards/margins": 1.207291841506958,
"rewards/rejected": -4.986666679382324,
"step": 2970
},
{
"epoch": 0.7799005495943471,
"grad_norm": 7.657388210296631,
"learning_rate": 7.026029219436504e-07,
"logits/chosen": -1.442101240158081,
"logits/rejected": -1.2947075366973877,
"logps/chosen": -608.75390625,
"logps/rejected": -730.2647705078125,
"loss": 0.4571,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.5647411346435547,
"rewards/margins": 1.344158411026001,
"rewards/rejected": -4.908899784088135,
"step": 2980
},
{
"epoch": 0.7825176655325831,
"grad_norm": 8.993943214416504,
"learning_rate": 6.867999675225523e-07,
"logits/chosen": -1.4737087488174438,
"logits/rejected": -1.3463550806045532,
"logps/chosen": -584.3572387695312,
"logps/rejected": -691.84130859375,
"loss": 0.4735,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.5355136394500732,
"rewards/margins": 1.2167141437530518,
"rewards/rejected": -4.752227783203125,
"step": 2990
},
{
"epoch": 0.7851347814708192,
"grad_norm": 9.367300033569336,
"learning_rate": 6.711484147823663e-07,
"logits/chosen": -1.3742111921310425,
"logits/rejected": -1.2975659370422363,
"logps/chosen": -587.8654174804688,
"logps/rejected": -723.0962524414062,
"loss": 0.4613,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.5372672080993652,
"rewards/margins": 1.3036311864852905,
"rewards/rejected": -4.840898513793945,
"step": 3000
},
{
"epoch": 0.7851347814708192,
"eval_logits/chosen": -1.312312126159668,
"eval_logits/rejected": -1.1940079927444458,
"eval_logps/chosen": -629.1587524414062,
"eval_logps/rejected": -726.3348999023438,
"eval_loss": 0.47825390100479126,
"eval_rewards/accuracies": 0.7544999718666077,
"eval_rewards/chosen": -3.645437717437744,
"eval_rewards/margins": 1.1723082065582275,
"eval_rewards/rejected": -4.817745685577393,
"eval_runtime": 1582.4909,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 3000
},
{
"epoch": 0.7877518974090553,
"grad_norm": 10.457430839538574,
"learning_rate": 6.556495706232413e-07,
"logits/chosen": -1.3809382915496826,
"logits/rejected": -1.31160569190979,
"logps/chosen": -637.9448852539062,
"logps/rejected": -725.9009399414062,
"loss": 0.5419,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.6930198669433594,
"rewards/margins": 1.064775824546814,
"rewards/rejected": -4.757795810699463,
"step": 3010
},
{
"epoch": 0.7903690133472913,
"grad_norm": 10.19312858581543,
"learning_rate": 6.403047291942057e-07,
"logits/chosen": -1.3200327157974243,
"logits/rejected": -1.176688551902771,
"logps/chosen": -585.4561157226562,
"logps/rejected": -678.8817138671875,
"loss": 0.4657,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.592146396636963,
"rewards/margins": 1.1788526773452759,
"rewards/rejected": -4.770998954772949,
"step": 3020
},
{
"epoch": 0.7929861292855274,
"grad_norm": 10.267387390136719,
"learning_rate": 6.251151717851023e-07,
"logits/chosen": -1.3837000131607056,
"logits/rejected": -1.2987123727798462,
"logps/chosen": -583.4530029296875,
"logps/rejected": -682.2948608398438,
"loss": 0.4972,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.5015385150909424,
"rewards/margins": 1.1210825443267822,
"rewards/rejected": -4.622621059417725,
"step": 3030
},
{
"epoch": 0.7956032452237635,
"grad_norm": 6.017190933227539,
"learning_rate": 6.100821667196041e-07,
"logits/chosen": -1.5390170812606812,
"logits/rejected": -1.3035577535629272,
"logps/chosen": -615.9330444335938,
"logps/rejected": -661.38037109375,
"loss": 0.4849,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.404139757156372,
"rewards/margins": 1.1094605922698975,
"rewards/rejected": -4.5136003494262695,
"step": 3040
},
{
"epoch": 0.7982203611619995,
"grad_norm": 7.074293613433838,
"learning_rate": 5.952069692493062e-07,
"logits/chosen": -1.3566112518310547,
"logits/rejected": -1.2582927942276,
"logps/chosen": -576.7879638671875,
"logps/rejected": -713.3567504882812,
"loss": 0.4162,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.4851574897766113,
"rewards/margins": 1.3315637111663818,
"rewards/rejected": -4.816721439361572,
"step": 3050
},
{
"epoch": 0.8008374771002356,
"grad_norm": 10.666440963745117,
"learning_rate": 5.80490821448918e-07,
"logits/chosen": -1.3122138977050781,
"logits/rejected": -1.2932623624801636,
"logps/chosen": -627.7955322265625,
"logps/rejected": -814.8255615234375,
"loss": 0.4273,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.653444290161133,
"rewards/margins": 1.4460790157318115,
"rewards/rejected": -5.099523067474365,
"step": 3060
},
{
"epoch": 0.8034545930384716,
"grad_norm": 9.67546558380127,
"learning_rate": 5.659349521125459e-07,
"logits/chosen": -1.4693623781204224,
"logits/rejected": -1.4153467416763306,
"logps/chosen": -655.6878662109375,
"logps/rejected": -741.1046752929688,
"loss": 0.5202,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.7438597679138184,
"rewards/margins": 1.0592159032821655,
"rewards/rejected": -4.803075790405273,
"step": 3070
},
{
"epoch": 0.8060717089767077,
"grad_norm": 7.3843865394592285,
"learning_rate": 5.5154057665109e-07,
"logits/chosen": -1.4262231588363647,
"logits/rejected": -1.2912954092025757,
"logps/chosen": -656.511962890625,
"logps/rejected": -770.8526611328125,
"loss": 0.492,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.956704616546631,
"rewards/margins": 1.4080426692962646,
"rewards/rejected": -5.364747524261475,
"step": 3080
},
{
"epoch": 0.8086888249149438,
"grad_norm": 7.734601020812988,
"learning_rate": 5.373088969907586e-07,
"logits/chosen": -1.4562673568725586,
"logits/rejected": -1.2930481433868408,
"logps/chosen": -664.8043823242188,
"logps/rejected": -734.4459228515625,
"loss": 0.4492,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.8809738159179688,
"rewards/margins": 1.1982970237731934,
"rewards/rejected": -5.079270362854004,
"step": 3090
},
{
"epoch": 0.8113059408531798,
"grad_norm": 8.992291450500488,
"learning_rate": 5.23241101472709e-07,
"logits/chosen": -1.376122236251831,
"logits/rejected": -1.2622703313827515,
"logps/chosen": -655.6407470703125,
"logps/rejected": -750.63818359375,
"loss": 0.4904,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.794213056564331,
"rewards/margins": 1.1705824136734009,
"rewards/rejected": -4.9647955894470215,
"step": 3100
},
{
"epoch": 0.8113059408531798,
"eval_logits/chosen": -1.2847076654434204,
"eval_logits/rejected": -1.1654341220855713,
"eval_logps/chosen": -653.8601684570312,
"eval_logps/rejected": -760.7857055664062,
"eval_loss": 0.4786832332611084,
"eval_rewards/accuracies": 0.7534999847412109,
"eval_rewards/chosen": -3.892453193664551,
"eval_rewards/margins": 1.2698006629943848,
"eval_rewards/rejected": -5.162253379821777,
"eval_runtime": 1582.3299,
"eval_samples_per_second": 1.264,
"eval_steps_per_second": 0.158,
"step": 3100
},
{
"epoch": 0.8139230567914159,
"grad_norm": 9.587347030639648,
"learning_rate": 5.09338364753818e-07,
"logits/chosen": -1.4489879608154297,
"logits/rejected": -1.285023808479309,
"logps/chosen": -667.4219360351562,
"logps/rejected": -779.0870361328125,
"loss": 0.5309,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.8209068775177,
"rewards/margins": 1.2750650644302368,
"rewards/rejected": -5.095972061157227,
"step": 3110
},
{
"epoch": 0.816540172729652,
"grad_norm": 9.464641571044922,
"learning_rate": 4.956018477086005e-07,
"logits/chosen": -1.4130933284759521,
"logits/rejected": -1.2669379711151123,
"logps/chosen": -666.3409423828125,
"logps/rejected": -763.8473510742188,
"loss": 0.5186,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.9732048511505127,
"rewards/margins": 1.2480720281600952,
"rewards/rejected": -5.221276760101318,
"step": 3120
},
{
"epoch": 0.819157288667888,
"grad_norm": 11.495794296264648,
"learning_rate": 4.820326973322764e-07,
"logits/chosen": -1.332169771194458,
"logits/rejected": -1.248384952545166,
"logps/chosen": -647.7191162109375,
"logps/rejected": -768.2628173828125,
"loss": 0.519,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -3.9689033031463623,
"rewards/margins": 1.2653844356536865,
"rewards/rejected": -5.234287738800049,
"step": 3130
},
{
"epoch": 0.821774404606124,
"grad_norm": 13.087740898132324,
"learning_rate": 4.686320466449981e-07,
"logits/chosen": -1.3172610998153687,
"logits/rejected": -1.1501275300979614,
"logps/chosen": -616.0255126953125,
"logps/rejected": -763.9444580078125,
"loss": 0.4235,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.7686569690704346,
"rewards/margins": 1.5006383657455444,
"rewards/rejected": -5.2692952156066895,
"step": 3140
},
{
"epoch": 0.8243915205443602,
"grad_norm": 7.49388313293457,
"learning_rate": 4.554010145972418e-07,
"logits/chosen": -1.4834858179092407,
"logits/rejected": -1.3113733530044556,
"logps/chosen": -653.4777221679688,
"logps/rejected": -768.167236328125,
"loss": 0.5502,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.8982043266296387,
"rewards/margins": 1.2591768503189087,
"rewards/rejected": -5.157381534576416,
"step": 3150
},
{
"epoch": 0.8270086364825961,
"grad_norm": 9.854113578796387,
"learning_rate": 4.4234070597637455e-07,
"logits/chosen": -1.33645761013031,
"logits/rejected": -1.2524462938308716,
"logps/chosen": -650.8976440429688,
"logps/rejected": -750.16796875,
"loss": 0.5245,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.754307270050049,
"rewards/margins": 1.112375020980835,
"rewards/rejected": -4.866682052612305,
"step": 3160
},
{
"epoch": 0.8296257524208323,
"grad_norm": 6.365306377410889,
"learning_rate": 4.2945221131440783e-07,
"logits/chosen": -1.3347636461257935,
"logits/rejected": -1.1535985469818115,
"logps/chosen": -623.1888427734375,
"logps/rejected": -730.3753662109375,
"loss": 0.4036,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -3.575324296951294,
"rewards/margins": 1.3884552717208862,
"rewards/rejected": -4.963778972625732,
"step": 3170
},
{
"epoch": 0.8322428683590684,
"grad_norm": 10.03058910369873,
"learning_rate": 4.167366067969381e-07,
"logits/chosen": -1.4377801418304443,
"logits/rejected": -1.3695132732391357,
"logps/chosen": -578.0193481445312,
"logps/rejected": -705.9163208007812,
"loss": 0.5003,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.5517592430114746,
"rewards/margins": 1.0413546562194824,
"rewards/rejected": -4.593113899230957,
"step": 3180
},
{
"epoch": 0.8348599842973043,
"grad_norm": 5.4473876953125,
"learning_rate": 4.041949541732826e-07,
"logits/chosen": -1.4346039295196533,
"logits/rejected": -1.3620599508285522,
"logps/chosen": -621.5526733398438,
"logps/rejected": -724.8057861328125,
"loss": 0.4962,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -3.5769572257995605,
"rewards/margins": 1.1758462190628052,
"rewards/rejected": -4.752803325653076,
"step": 3190
},
{
"epoch": 0.8374771002355405,
"grad_norm": 11.63176441192627,
"learning_rate": 3.9182830066782614e-07,
"logits/chosen": -1.3472647666931152,
"logits/rejected": -1.330804467201233,
"logps/chosen": -613.8287963867188,
"logps/rejected": -755.5730590820312,
"loss": 0.4706,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.5616798400878906,
"rewards/margins": 1.2930762767791748,
"rewards/rejected": -4.854755878448486,
"step": 3200
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -1.3301115036010742,
"eval_logits/rejected": -1.2139098644256592,
"eval_logps/chosen": -613.1915283203125,
"eval_logps/rejected": -714.2922973632812,
"eval_loss": 0.4755466878414154,
"eval_rewards/accuracies": 0.7524999976158142,
"eval_rewards/chosen": -3.4857659339904785,
"eval_rewards/margins": 1.2115534543991089,
"eval_rewards/rejected": -4.697319507598877,
"eval_runtime": 1591.0618,
"eval_samples_per_second": 1.257,
"eval_steps_per_second": 0.157,
"step": 3200
},
{
"epoch": 0.8400942161737766,
"grad_norm": 6.076069355010986,
"learning_rate": 3.796376788925771e-07,
"logits/chosen": -1.354421854019165,
"logits/rejected": -1.2948524951934814,
"logps/chosen": -597.9942016601562,
"logps/rejected": -688.4937133789062,
"loss": 0.4835,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.366337299346924,
"rewards/margins": 1.1454763412475586,
"rewards/rejected": -4.511813163757324,
"step": 3210
},
{
"epoch": 0.8427113321120125,
"grad_norm": 7.7770161628723145,
"learning_rate": 3.676241067609465e-07,
"logits/chosen": -1.432493805885315,
"logits/rejected": -1.3264445066452026,
"logps/chosen": -639.21142578125,
"logps/rejected": -703.9612426757812,
"loss": 0.5345,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.4550623893737793,
"rewards/margins": 1.0796080827713013,
"rewards/rejected": -4.534670352935791,
"step": 3220
},
{
"epoch": 0.8453284480502486,
"grad_norm": 15.151912689208984,
"learning_rate": 3.5578858740274976e-07,
"logits/chosen": -1.369700312614441,
"logits/rejected": -1.2841273546218872,
"logps/chosen": -613.7825927734375,
"logps/rejected": -702.5382080078125,
"loss": 0.5013,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.5211188793182373,
"rewards/margins": 1.0141242742538452,
"rewards/rejected": -4.535243034362793,
"step": 3230
},
{
"epoch": 0.8479455639884846,
"grad_norm": 11.273491859436035,
"learning_rate": 3.44132109080447e-07,
"logits/chosen": -1.5450893640518188,
"logits/rejected": -1.3859776258468628,
"logps/chosen": -600.0426025390625,
"logps/rejected": -696.7976684570312,
"loss": 0.4268,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.348644256591797,
"rewards/margins": 1.3427956104278564,
"rewards/rejected": -4.691439628601074,
"step": 3240
},
{
"epoch": 0.8505626799267207,
"grad_norm": 7.93589973449707,
"learning_rate": 3.3265564510662344e-07,
"logits/chosen": -1.4886678457260132,
"logits/rejected": -1.3508810997009277,
"logps/chosen": -630.6646728515625,
"logps/rejected": -736.4685668945312,
"loss": 0.4298,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.374323606491089,
"rewards/margins": 1.3093299865722656,
"rewards/rejected": -4.683653354644775,
"step": 3250
},
{
"epoch": 0.8531797958649568,
"grad_norm": 17.22613525390625,
"learning_rate": 3.213601537627195e-07,
"logits/chosen": -1.3882197141647339,
"logits/rejected": -1.2806700468063354,
"logps/chosen": -637.3515014648438,
"logps/rejected": -732.1290283203125,
"loss": 0.5388,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.802922010421753,
"rewards/margins": 1.1185327768325806,
"rewards/rejected": -4.921454906463623,
"step": 3260
},
{
"epoch": 0.8557969118031928,
"grad_norm": 14.590155601501465,
"learning_rate": 3.1024657821901063e-07,
"logits/chosen": -1.4339938163757324,
"logits/rejected": -1.3640520572662354,
"logps/chosen": -598.9895629882812,
"logps/rejected": -707.3479614257812,
"loss": 0.4939,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.513320207595825,
"rewards/margins": 1.2418614625930786,
"rewards/rejected": -4.755181789398193,
"step": 3270
},
{
"epoch": 0.8584140277414289,
"grad_norm": 12.836420059204102,
"learning_rate": 2.9931584645585654e-07,
"logits/chosen": -1.3704925775527954,
"logits/rejected": -1.3468918800354004,
"logps/chosen": -630.4825439453125,
"logps/rejected": -753.4363403320312,
"loss": 0.4829,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.5602269172668457,
"rewards/margins": 1.2000439167022705,
"rewards/rejected": -4.760270595550537,
"step": 3280
},
{
"epoch": 0.861031143679665,
"grad_norm": 5.077442169189453,
"learning_rate": 2.885688711862136e-07,
"logits/chosen": -1.4080350399017334,
"logits/rejected": -1.3973747491836548,
"logps/chosen": -639.9348754882812,
"logps/rejected": -779.698974609375,
"loss": 0.5049,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.790173053741455,
"rewards/margins": 1.416594386100769,
"rewards/rejected": -5.206767559051514,
"step": 3290
},
{
"epoch": 0.863648259617901,
"grad_norm": 6.491420269012451,
"learning_rate": 2.7800654977942486e-07,
"logits/chosen": -1.3865365982055664,
"logits/rejected": -1.275614857673645,
"logps/chosen": -620.8766479492188,
"logps/rejected": -736.026611328125,
"loss": 0.519,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.636903762817383,
"rewards/margins": 1.1893088817596436,
"rewards/rejected": -4.8262128829956055,
"step": 3300
},
{
"epoch": 0.863648259617901,
"eval_logits/chosen": -1.3147002458572388,
"eval_logits/rejected": -1.1985875368118286,
"eval_logps/chosen": -633.2412109375,
"eval_logps/rejected": -738.4901123046875,
"eval_loss": 0.4761752188205719,
"eval_rewards/accuracies": 0.7524999976158142,
"eval_rewards/chosen": -3.686262607574463,
"eval_rewards/margins": 1.2530354261398315,
"eval_rewards/rejected": -4.939297676086426,
"eval_runtime": 1583.9739,
"eval_samples_per_second": 1.263,
"eval_steps_per_second": 0.158,
"step": 3300
},
{
"epoch": 0.8662653755561371,
"grad_norm": 19.23656463623047,
"learning_rate": 2.6762976418628797e-07,
"logits/chosen": -1.409970998764038,
"logits/rejected": -1.2858604192733765,
"logps/chosen": -586.6197509765625,
"logps/rejected": -659.4794921875,
"loss": 0.4991,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.5687224864959717,
"rewards/margins": 1.1649234294891357,
"rewards/rejected": -4.733645915985107,
"step": 3310
},
{
"epoch": 0.8688824914943732,
"grad_norm": 9.18576431274414,
"learning_rate": 2.5743938086541354e-07,
"logits/chosen": -1.3930479288101196,
"logits/rejected": -1.2830320596694946,
"logps/chosen": -632.0997314453125,
"logps/rejected": -728.1014404296875,
"loss": 0.5094,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.6787362098693848,
"rewards/margins": 1.2093604803085327,
"rewards/rejected": -4.888096809387207,
"step": 3320
},
{
"epoch": 0.8714996074326092,
"grad_norm": 14.547459602355957,
"learning_rate": 2.4743625071087574e-07,
"logits/chosen": -1.539896011352539,
"logits/rejected": -1.3763868808746338,
"logps/chosen": -630.2366943359375,
"logps/rejected": -743.1935424804688,
"loss": 0.4683,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.5275490283966064,
"rewards/margins": 1.4253052473068237,
"rewards/rejected": -4.952854156494141,
"step": 3330
},
{
"epoch": 0.8741167233708453,
"grad_norm": 10.659337043762207,
"learning_rate": 2.3762120898116498e-07,
"logits/chosen": -1.4196306467056274,
"logits/rejected": -1.3246088027954102,
"logps/chosen": -650.2425537109375,
"logps/rejected": -748.0562744140625,
"loss": 0.4816,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.8152756690979004,
"rewards/margins": 1.0581191778182983,
"rewards/rejected": -4.873394966125488,
"step": 3340
},
{
"epoch": 0.8767338393090814,
"grad_norm": 7.751733779907227,
"learning_rate": 2.2799507522944048e-07,
"logits/chosen": -1.364739179611206,
"logits/rejected": -1.269736409187317,
"logps/chosen": -621.5891723632812,
"logps/rejected": -746.6184692382812,
"loss": 0.4506,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.5590720176696777,
"rewards/margins": 1.3328505754470825,
"rewards/rejected": -4.891922473907471,
"step": 3350
},
{
"epoch": 0.8793509552473174,
"grad_norm": 8.421998023986816,
"learning_rate": 2.1855865323510056e-07,
"logits/chosen": -1.4129679203033447,
"logits/rejected": -1.2285114526748657,
"logps/chosen": -637.7249755859375,
"logps/rejected": -789.73779296875,
"loss": 0.4224,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.650952100753784,
"rewards/margins": 1.57891047000885,
"rewards/rejected": -5.229863166809082,
"step": 3360
},
{
"epoch": 0.8819680711855535,
"grad_norm": 7.72039270401001,
"learning_rate": 2.0931273093666575e-07,
"logits/chosen": -1.3570077419281006,
"logits/rejected": -1.2234935760498047,
"logps/chosen": -617.6724243164062,
"logps/rejected": -736.6064453125,
"loss": 0.4279,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -3.775437831878662,
"rewards/margins": 1.3919856548309326,
"rewards/rejected": -5.167424201965332,
"step": 3370
},
{
"epoch": 0.8845851871237895,
"grad_norm": 12.305501937866211,
"learning_rate": 2.002580803659873e-07,
"logits/chosen": -1.3679782152175903,
"logits/rejected": -1.2431459426879883,
"logps/chosen": -638.044921875,
"logps/rejected": -742.0861206054688,
"loss": 0.469,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.8360562324523926,
"rewards/margins": 1.2222352027893066,
"rewards/rejected": -5.058291435241699,
"step": 3380
},
{
"epoch": 0.8872023030620256,
"grad_norm": 6.246249675750732,
"learning_rate": 1.913954575837826e-07,
"logits/chosen": -1.414441466331482,
"logits/rejected": -1.2020373344421387,
"logps/chosen": -651.7365112304688,
"logps/rejected": -722.0252685546875,
"loss": 0.4699,
"rewards/accuracies": 0.75,
"rewards/chosen": -3.7970893383026123,
"rewards/margins": 1.1998703479766846,
"rewards/rejected": -4.996959209442139,
"step": 3390
},
{
"epoch": 0.8898194190002617,
"grad_norm": 8.567130088806152,
"learning_rate": 1.827256026165028e-07,
"logits/chosen": -1.42445707321167,
"logits/rejected": -1.2594492435455322,
"logps/chosen": -675.4766235351562,
"logps/rejected": -760.4920654296875,
"loss": 0.4446,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.605090618133545,
"rewards/margins": 1.4216583967208862,
"rewards/rejected": -5.026749134063721,
"step": 3400
},
{
"epoch": 0.8898194190002617,
"eval_logits/chosen": -1.2863726615905762,
"eval_logits/rejected": -1.1676445007324219,
"eval_logps/chosen": -646.8135375976562,
"eval_logps/rejected": -755.2252197265625,
"eval_loss": 0.476205974817276,
"eval_rewards/accuracies": 0.7534999847412109,
"eval_rewards/chosen": -3.821986436843872,
"eval_rewards/margins": 1.284662127494812,
"eval_rewards/rejected": -5.1066484451293945,
"eval_runtime": 1584.3019,
"eval_samples_per_second": 1.262,
"eval_steps_per_second": 0.158,
"step": 3400
},
{
"epoch": 0.8924365349384977,
"grad_norm": 9.226576805114746,
"learning_rate": 1.7424923939454274e-07,
"logits/chosen": -1.3717691898345947,
"logits/rejected": -1.211722493171692,
"logps/chosen": -668.0113525390625,
"logps/rejected": -759.1041259765625,
"loss": 0.4317,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.867192506790161,
"rewards/margins": 1.3442370891571045,
"rewards/rejected": -5.211429119110107,
"step": 3410
},
{
"epoch": 0.8950536508767338,
"grad_norm": 19.934711456298828,
"learning_rate": 1.6596707569179304e-07,
"logits/chosen": -1.4707757234573364,
"logits/rejected": -1.3320478200912476,
"logps/chosen": -664.7568969726562,
"logps/rejected": -757.882568359375,
"loss": 0.4794,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.836893081665039,
"rewards/margins": 1.2826611995697021,
"rewards/rejected": -5.119554042816162,
"step": 3420
},
{
"epoch": 0.8976707668149699,
"grad_norm": 10.98475170135498,
"learning_rate": 1.578798030665385e-07,
"logits/chosen": -1.4144407510757446,
"logits/rejected": -1.2425600290298462,
"logps/chosen": -652.3900146484375,
"logps/rejected": -788.5800170898438,
"loss": 0.431,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.7777092456817627,
"rewards/margins": 1.5329736471176147,
"rewards/rejected": -5.310682773590088,
"step": 3430
},
{
"epoch": 0.9002878827532059,
"grad_norm": 8.336905479431152,
"learning_rate": 1.499880968037165e-07,
"logits/chosen": -1.3964354991912842,
"logits/rejected": -1.2669428586959839,
"logps/chosen": -629.9685668945312,
"logps/rejected": -719.1429443359375,
"loss": 0.5023,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.744743824005127,
"rewards/margins": 1.254372000694275,
"rewards/rejected": -4.999115467071533,
"step": 3440
},
{
"epoch": 0.902904998691442,
"grad_norm": 13.396973609924316,
"learning_rate": 1.4229261585852805e-07,
"logits/chosen": -1.426830530166626,
"logits/rejected": -1.3484631776809692,
"logps/chosen": -632.507568359375,
"logps/rejected": -737.5188598632812,
"loss": 0.4408,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -3.6420531272888184,
"rewards/margins": 1.3079338073730469,
"rewards/rejected": -4.949986934661865,
"step": 3450
},
{
"epoch": 0.9055221146296781,
"grad_norm": 12.532068252563477,
"learning_rate": 1.3479400280141886e-07,
"logits/chosen": -1.3498502969741821,
"logits/rejected": -1.3053711652755737,
"logps/chosen": -631.3232421875,
"logps/rejected": -761.801025390625,
"loss": 0.4882,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.851062059402466,
"rewards/margins": 1.3252581357955933,
"rewards/rejected": -5.1763200759887695,
"step": 3460
},
{
"epoch": 0.9081392305679141,
"grad_norm": 9.887036323547363,
"learning_rate": 1.2749288376442044e-07,
"logits/chosen": -1.4401438236236572,
"logits/rejected": -1.2534105777740479,
"logps/chosen": -664.2423706054688,
"logps/rejected": -740.9425048828125,
"loss": 0.4562,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -3.6783053874969482,
"rewards/margins": 1.3420859575271606,
"rewards/rejected": -5.020391464233398,
"step": 3470
},
{
"epoch": 0.9107563465061502,
"grad_norm": 8.95275592803955,
"learning_rate": 1.203898683888713e-07,
"logits/chosen": -1.438189148902893,
"logits/rejected": -1.3202855587005615,
"logps/chosen": -631.1199951171875,
"logps/rejected": -736.70458984375,
"loss": 0.5591,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -3.8719773292541504,
"rewards/margins": 1.087269902229309,
"rewards/rejected": -4.959246635437012,
"step": 3480
},
{
"epoch": 0.9133734624443863,
"grad_norm": 9.944456100463867,
"learning_rate": 1.1348554977451132e-07,
"logits/chosen": -1.4648234844207764,
"logits/rejected": -1.3383164405822754,
"logps/chosen": -654.9630737304688,
"logps/rejected": -742.1627807617188,
"loss": 0.5072,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.731118679046631,
"rewards/margins": 1.2008678913116455,
"rewards/rejected": -4.9319868087768555,
"step": 3490
},
{
"epoch": 0.9159905783826223,
"grad_norm": 6.555506229400635,
"learning_rate": 1.0678050442995802e-07,
"logits/chosen": -1.4295841455459595,
"logits/rejected": -1.2433637380599976,
"logps/chosen": -656.9864501953125,
"logps/rejected": -733.3885498046875,
"loss": 0.5378,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.7627615928649902,
"rewards/margins": 1.2152563333511353,
"rewards/rejected": -4.978017807006836,
"step": 3500
},
{
"epoch": 0.9159905783826223,
"eval_logits/chosen": -1.3105764389038086,
"eval_logits/rejected": -1.1933128833770752,
"eval_logps/chosen": -640.232666015625,
"eval_logps/rejected": -749.0795288085938,
"eval_loss": 0.47588038444519043,
"eval_rewards/accuracies": 0.753000020980835,
"eval_rewards/chosen": -3.7561774253845215,
"eval_rewards/margins": 1.2890138626098633,
"eval_rewards/rejected": -5.045191287994385,
"eval_runtime": 1584.5871,
"eval_samples_per_second": 1.262,
"eval_steps_per_second": 0.158,
"step": 3500
},
{
"epoch": 0.9186076943208584,
"grad_norm": 10.787802696228027,
"learning_rate": 1.0027529222456755e-07,
"logits/chosen": -1.4012380838394165,
"logits/rejected": -1.240236759185791,
"logps/chosen": -624.1384887695312,
"logps/rejected": -739.2446899414062,
"loss": 0.4361,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.699873447418213,
"rewards/margins": 1.3274140357971191,
"rewards/rejected": -5.027287483215332,
"step": 3510
},
{
"epoch": 0.9212248102590945,
"grad_norm": 8.844759941101074,
"learning_rate": 9.397045634168766e-08,
"logits/chosen": -1.4404442310333252,
"logits/rejected": -1.366231083869934,
"logps/chosen": -637.5411987304688,
"logps/rejected": -778.2499389648438,
"loss": 0.4573,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.691349506378174,
"rewards/margins": 1.4043484926223755,
"rewards/rejected": -5.09569787979126,
"step": 3520
},
{
"epoch": 0.9238419261973305,
"grad_norm": 17.08158302307129,
"learning_rate": 8.78665232332998e-08,
"logits/chosen": -1.366356611251831,
"logits/rejected": -1.2875096797943115,
"logps/chosen": -614.8327026367188,
"logps/rejected": -738.70263671875,
"loss": 0.4711,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.841702938079834,
"rewards/margins": 1.2470335960388184,
"rewards/rejected": -5.0887370109558105,
"step": 3530
},
{
"epoch": 0.9264590421355666,
"grad_norm": 7.980059623718262,
"learning_rate": 8.196400257606208e-08,
"logits/chosen": -1.4690799713134766,
"logits/rejected": -1.3165347576141357,
"logps/chosen": -652.5242919921875,
"logps/rejected": -803.9196166992188,
"loss": 0.4108,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -3.6954245567321777,
"rewards/margins": 1.5657516717910767,
"rewards/rejected": -5.261176109313965,
"step": 3540
},
{
"epoch": 0.9290761580738026,
"grad_norm": 10.548364639282227,
"learning_rate": 7.626338722875076e-08,
"logits/chosen": -1.4087066650390625,
"logits/rejected": -1.3460147380828857,
"logps/chosen": -621.637451171875,
"logps/rejected": -753.1847534179688,
"loss": 0.4788,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.676053524017334,
"rewards/margins": 1.3061879873275757,
"rewards/rejected": -4.982241630554199,
"step": 3550
},
{
"epoch": 0.9316932740120387,
"grad_norm": 6.714722633361816,
"learning_rate": 7.076515319110688e-08,
"logits/chosen": -1.4119006395339966,
"logits/rejected": -1.3059628009796143,
"logps/chosen": -620.5264892578125,
"logps/rejected": -717.8798828125,
"loss": 0.5122,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.6425423622131348,
"rewards/margins": 1.3905235528945923,
"rewards/rejected": -5.0330657958984375,
"step": 3560
},
{
"epoch": 0.9343103899502748,
"grad_norm": 7.078303813934326,
"learning_rate": 6.54697595640899e-08,
"logits/chosen": -1.4229375123977661,
"logits/rejected": -1.312577486038208,
"logps/chosen": -663.3248291015625,
"logps/rejected": -762.3607177734375,
"loss": 0.4733,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.7122771739959717,
"rewards/margins": 1.2408705949783325,
"rewards/rejected": -4.953147888183594,
"step": 3570
},
{
"epoch": 0.9369275058885108,
"grad_norm": 10.152729988098145,
"learning_rate": 6.037764851154426e-08,
"logits/chosen": -1.4115734100341797,
"logits/rejected": -1.3518728017807007,
"logps/chosen": -633.0020751953125,
"logps/rejected": -763.7161865234375,
"loss": 0.5006,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.6908345222473145,
"rewards/margins": 1.2587831020355225,
"rewards/rejected": -4.949617862701416,
"step": 3580
},
{
"epoch": 0.9395446218267469,
"grad_norm": 6.685621738433838,
"learning_rate": 5.548924522327748e-08,
"logits/chosen": -1.401659369468689,
"logits/rejected": -1.2891366481781006,
"logps/chosen": -619.3512573242188,
"logps/rejected": -733.2835083007812,
"loss": 0.4568,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.577338457107544,
"rewards/margins": 1.285097599029541,
"rewards/rejected": -4.862435817718506,
"step": 3590
},
{
"epoch": 0.942161737764983,
"grad_norm": 13.303339958190918,
"learning_rate": 5.0804957879556915e-08,
"logits/chosen": -1.3285419940948486,
"logits/rejected": -1.2414448261260986,
"logps/chosen": -591.6680908203125,
"logps/rejected": -724.0158081054688,
"loss": 0.4506,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.662086009979248,
"rewards/margins": 1.2775993347167969,
"rewards/rejected": -4.939684867858887,
"step": 3600
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -1.3114778995513916,
"eval_logits/rejected": -1.1944193840026855,
"eval_logps/chosen": -635.4866943359375,
"eval_logps/rejected": -744.0071411132812,
"eval_loss": 0.4759487509727478,
"eval_rewards/accuracies": 0.7534999847412109,
"eval_rewards/chosen": -3.7087175846099854,
"eval_rewards/margins": 1.2857497930526733,
"eval_rewards/rejected": -4.994467735290527,
"eval_runtime": 1593.4546,
"eval_samples_per_second": 1.255,
"eval_steps_per_second": 0.157,
"step": 3600
},
{
"epoch": 0.944778853703219,
"grad_norm": 9.378813743591309,
"learning_rate": 4.632517761702815e-08,
"logits/chosen": -1.3526476621627808,
"logits/rejected": -1.2188866138458252,
"logps/chosen": -608.4993286132812,
"logps/rejected": -749.1359252929688,
"loss": 0.4235,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.732194423675537,
"rewards/margins": 1.5286829471588135,
"rewards/rejected": -5.26087760925293,
"step": 3610
},
{
"epoch": 0.9473959696414551,
"grad_norm": 11.812514305114746,
"learning_rate": 4.205027849605359e-08,
"logits/chosen": -1.379093050956726,
"logits/rejected": -1.2838860750198364,
"logps/chosen": -625.9976806640625,
"logps/rejected": -716.8546752929688,
"loss": 0.5215,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.781675338745117,
"rewards/margins": 1.2094558477401733,
"rewards/rejected": -4.991130828857422,
"step": 3620
},
{
"epoch": 0.9500130855796912,
"grad_norm": 9.726532936096191,
"learning_rate": 3.798061746947995e-08,
"logits/chosen": -1.4842337369918823,
"logits/rejected": -1.3593313694000244,
"logps/chosen": -630.210693359375,
"logps/rejected": -723.7564697265625,
"loss": 0.4643,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.6620631217956543,
"rewards/margins": 1.3267091512680054,
"rewards/rejected": -4.988772392272949,
"step": 3630
},
{
"epoch": 0.9526302015179272,
"grad_norm": 10.711019515991211,
"learning_rate": 3.411653435283158e-08,
"logits/chosen": -1.411237120628357,
"logits/rejected": -1.2247424125671387,
"logps/chosen": -640.1253662109375,
"logps/rejected": -703.46875,
"loss": 0.4769,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.6698365211486816,
"rewards/margins": 1.190108299255371,
"rewards/rejected": -4.8599443435668945,
"step": 3640
},
{
"epoch": 0.9552473174561633,
"grad_norm": 7.913322925567627,
"learning_rate": 3.04583517959367e-08,
"logits/chosen": -1.4571387767791748,
"logits/rejected": -1.3205856084823608,
"logps/chosen": -603.8260498046875,
"logps/rejected": -697.1012573242188,
"loss": 0.4523,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.5181479454040527,
"rewards/margins": 1.2571742534637451,
"rewards/rejected": -4.775322914123535,
"step": 3650
},
{
"epoch": 0.9578644333943994,
"grad_norm": 14.019694328308105,
"learning_rate": 2.7006375255985984e-08,
"logits/chosen": -1.3986600637435913,
"logits/rejected": -1.3672528266906738,
"logps/chosen": -645.9337158203125,
"logps/rejected": -747.8303833007812,
"loss": 0.5895,
"rewards/accuracies": 0.6875,
"rewards/chosen": -3.8122050762176514,
"rewards/margins": 1.0134917497634888,
"rewards/rejected": -4.82569694519043,
"step": 3660
},
{
"epoch": 0.9604815493326354,
"grad_norm": 11.917638778686523,
"learning_rate": 2.3760892972027328e-08,
"logits/chosen": -1.5023940801620483,
"logits/rejected": -1.3516952991485596,
"logps/chosen": -656.4842529296875,
"logps/rejected": -742.977783203125,
"loss": 0.5408,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -3.840627670288086,
"rewards/margins": 1.2045114040374756,
"rewards/rejected": -5.045139789581299,
"step": 3670
},
{
"epoch": 0.9630986652708715,
"grad_norm": 12.40701961517334,
"learning_rate": 2.072217594089765e-08,
"logits/chosen": -1.368158221244812,
"logits/rejected": -1.353477120399475,
"logps/chosen": -638.237060546875,
"logps/rejected": -766.6362915039062,
"loss": 0.4165,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.7673258781433105,
"rewards/margins": 1.40687096118927,
"rewards/rejected": -5.174197196960449,
"step": 3680
},
{
"epoch": 0.9657157812091076,
"grad_norm": 10.349726676940918,
"learning_rate": 1.789047789459375e-08,
"logits/chosen": -1.4667888879776,
"logits/rejected": -1.289011001586914,
"logps/chosen": -689.1021728515625,
"logps/rejected": -770.5333862304688,
"loss": 0.5123,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.7222888469696045,
"rewards/margins": 1.3046228885650635,
"rewards/rejected": -5.026911735534668,
"step": 3690
},
{
"epoch": 0.9683328971473436,
"grad_norm": 9.133951187133789,
"learning_rate": 1.5266035279088708e-08,
"logits/chosen": -1.3182872533798218,
"logits/rejected": -1.2082844972610474,
"logps/chosen": -675.6629638671875,
"logps/rejected": -779.8258666992188,
"loss": 0.4732,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -3.791264295578003,
"rewards/margins": 1.3038253784179688,
"rewards/rejected": -5.095089912414551,
"step": 3700
},
{
"epoch": 0.9683328971473436,
"eval_logits/chosen": -1.3108527660369873,
"eval_logits/rejected": -1.1938129663467407,
"eval_logps/chosen": -633.6405029296875,
"eval_logps/rejected": -741.50830078125,
"eval_loss": 0.4758020043373108,
"eval_rewards/accuracies": 0.7540000081062317,
"eval_rewards/chosen": -3.6902551651000977,
"eval_rewards/margins": 1.2792243957519531,
"eval_rewards/rejected": -4.969479560852051,
"eval_runtime": 1596.3049,
"eval_samples_per_second": 1.253,
"eval_steps_per_second": 0.157,
"step": 3700
},
{
"epoch": 0.9709500130855797,
"grad_norm": 13.42652702331543,
"learning_rate": 1.2849067234584623e-08,
"logits/chosen": -1.3121452331542969,
"logits/rejected": -1.2471363544464111,
"logps/chosen": -600.2552490234375,
"logps/rejected": -731.0992431640625,
"loss": 0.4679,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.6395773887634277,
"rewards/margins": 1.3849769830703735,
"rewards/rejected": -5.02455472946167,
"step": 3710
},
{
"epoch": 0.9735671290238157,
"grad_norm": 7.9909467697143555,
"learning_rate": 1.0639775577218625e-08,
"logits/chosen": -1.3058593273162842,
"logits/rejected": -1.1494419574737549,
"logps/chosen": -619.4830932617188,
"logps/rejected": -719.6461181640625,
"loss": 0.4847,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.703446865081787,
"rewards/margins": 1.3608992099761963,
"rewards/rejected": -5.0643463134765625,
"step": 3720
},
{
"epoch": 0.9761842449620518,
"grad_norm": 9.135430335998535,
"learning_rate": 8.638344782207486e-09,
"logits/chosen": -1.3282991647720337,
"logits/rejected": -1.2410192489624023,
"logps/chosen": -604.8692016601562,
"logps/rejected": -705.7180786132812,
"loss": 0.4839,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.5968875885009766,
"rewards/margins": 1.2363134622573853,
"rewards/rejected": -4.833200931549072,
"step": 3730
},
{
"epoch": 0.9788013609002879,
"grad_norm": 9.719812393188477,
"learning_rate": 6.84494196844715e-09,
"logits/chosen": -1.385698914527893,
"logits/rejected": -1.2765476703643799,
"logps/chosen": -639.7982788085938,
"logps/rejected": -776.7741088867188,
"loss": 0.4564,
"rewards/accuracies": 0.78125,
"rewards/chosen": -3.678164005279541,
"rewards/margins": 1.4851183891296387,
"rewards/rejected": -5.163282871246338,
"step": 3740
},
{
"epoch": 0.9814184768385239,
"grad_norm": 10.63892936706543,
"learning_rate": 5.259716884556121e-09,
"logits/chosen": -1.4346338510513306,
"logits/rejected": -1.3097164630889893,
"logps/chosen": -635.2260131835938,
"logps/rejected": -746.8245849609375,
"loss": 0.46,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -3.7078888416290283,
"rewards/margins": 1.264432668685913,
"rewards/rejected": -4.972321510314941,
"step": 3750
},
{
"epoch": 0.98403559277676,
"grad_norm": 9.860692024230957,
"learning_rate": 3.882801896372967e-09,
"logits/chosen": -1.433598279953003,
"logits/rejected": -1.3519879579544067,
"logps/chosen": -632.1627197265625,
"logps/rejected": -731.175048828125,
"loss": 0.5046,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.6687378883361816,
"rewards/margins": 1.3088436126708984,
"rewards/rejected": -4.97758150100708,
"step": 3760
},
{
"epoch": 0.9866527087149961,
"grad_norm": 11.127714157104492,
"learning_rate": 2.7143119759026614e-09,
"logits/chosen": -1.444789171218872,
"logits/rejected": -1.2916604280471802,
"logps/chosen": -641.5234985351562,
"logps/rejected": -745.976318359375,
"loss": 0.413,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -3.573714017868042,
"rewards/margins": 1.2991065979003906,
"rewards/rejected": -4.8728203773498535,
"step": 3770
},
{
"epoch": 0.9892698246532321,
"grad_norm": 12.815695762634277,
"learning_rate": 1.754344691717591e-09,
"logits/chosen": -1.3558757305145264,
"logits/rejected": -1.311942219734192,
"logps/chosen": -625.3250122070312,
"logps/rejected": -757.5562744140625,
"loss": 0.5029,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -3.7559523582458496,
"rewards/margins": 1.1017048358917236,
"rewards/rejected": -4.857657432556152,
"step": 3780
},
{
"epoch": 0.9918869405914682,
"grad_norm": 16.819839477539062,
"learning_rate": 1.0029802008096335e-09,
"logits/chosen": -1.3806864023208618,
"logits/rejected": -1.2327523231506348,
"logps/chosen": -646.9813842773438,
"logps/rejected": -753.9891357421875,
"loss": 0.4782,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -3.727670669555664,
"rewards/margins": 1.29872465133667,
"rewards/rejected": -5.026394844055176,
"step": 3790
},
{
"epoch": 0.9945040565297043,
"grad_norm": 15.355550765991211,
"learning_rate": 4.602812418974534e-10,
"logits/chosen": -1.4688384532928467,
"logits/rejected": -1.3403863906860352,
"logps/chosen": -656.9666748046875,
"logps/rejected": -763.0733642578125,
"loss": 0.5041,
"rewards/accuracies": 0.71875,
"rewards/chosen": -3.752246856689453,
"rewards/margins": 1.3061609268188477,
"rewards/rejected": -5.058407783508301,
"step": 3800
},
{
"epoch": 0.9945040565297043,
"eval_logits/chosen": -1.3093682527542114,
"eval_logits/rejected": -1.1921626329421997,
"eval_logps/chosen": -633.0255737304688,
"eval_logps/rejected": -740.7546997070312,
"eval_loss": 0.4757540225982666,
"eval_rewards/accuracies": 0.7544999718666077,
"eval_rewards/chosen": -3.6841063499450684,
"eval_rewards/margins": 1.2778375148773193,
"eval_rewards/rejected": -4.961943626403809,
"eval_runtime": 1598.9917,
"eval_samples_per_second": 1.251,
"eval_steps_per_second": 0.156,
"step": 3800
},
{
"epoch": 0.9971211724679403,
"grad_norm": 11.665594100952148,
"learning_rate": 1.2629313018819312e-10,
"logits/chosen": -1.3954050540924072,
"logits/rejected": -1.2826766967773438,
"logps/chosen": -617.3712158203125,
"logps/rejected": -716.8834228515625,
"loss": 0.4933,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -3.650341033935547,
"rewards/margins": 1.1511954069137573,
"rewards/rejected": -4.8015360832214355,
"step": 3810
},
{
"epoch": 0.9997382884061764,
"grad_norm": 16.381610870361328,
"learning_rate": 1.0437535929996855e-12,
"logits/chosen": -1.3866708278656006,
"logits/rejected": -1.2333214282989502,
"logps/chosen": -657.7440795898438,
"logps/rejected": -766.9520874023438,
"loss": 0.4571,
"rewards/accuracies": 0.8125,
"rewards/chosen": -3.7108898162841797,
"rewards/margins": 1.5390161275863647,
"rewards/rejected": -5.249906063079834,
"step": 3820
},
{
"epoch": 1.0,
"step": 3821,
"total_flos": 0.0,
"train_loss": 0.5140745321264819,
"train_runtime": 164862.9896,
"train_samples_per_second": 0.371,
"train_steps_per_second": 0.023
}
],
"logging_steps": 10,
"max_steps": 3821,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}