llama3.1-cpo_j-full-0911 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.9927766541462,
"eval_steps": 500,
"global_step": 2160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023114706732158336,
"grad_norm": 68.88048553466797,
"learning_rate": 4.629629629629629e-08,
"logits/chosen": -0.3351331651210785,
"logits/rejected": -0.3151743412017822,
"logps/chosen": -269.4203796386719,
"logps/rejected": -267.72064208984375,
"loss": 2.9236,
"nll_loss": 1.0532859563827515,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -26.94203758239746,
"rewards/margins": -0.1699729710817337,
"rewards/rejected": -26.77206802368164,
"step": 10
},
{
"epoch": 0.04622941346431667,
"grad_norm": 61.09861755371094,
"learning_rate": 9.259259259259258e-08,
"logits/chosen": -0.33865073323249817,
"logits/rejected": -0.3208921253681183,
"logps/chosen": -263.8262634277344,
"logps/rejected": -270.32977294921875,
"loss": 2.896,
"nll_loss": 0.9992793202400208,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -26.38262939453125,
"rewards/margins": 0.6503503918647766,
"rewards/rejected": -27.03297996520996,
"step": 20
},
{
"epoch": 0.06934412019647501,
"grad_norm": 64.75421142578125,
"learning_rate": 1.3888888888888888e-07,
"logits/chosen": -0.2800094485282898,
"logits/rejected": -0.2686631977558136,
"logps/chosen": -262.0818176269531,
"logps/rejected": -265.42999267578125,
"loss": 2.826,
"nll_loss": 1.124384880065918,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -26.20818519592285,
"rewards/margins": 0.33481523394584656,
"rewards/rejected": -26.54299545288086,
"step": 30
},
{
"epoch": 0.09245882692863334,
"grad_norm": 54.530216217041016,
"learning_rate": 1.8518518518518516e-07,
"logits/chosen": -0.328824520111084,
"logits/rejected": -0.3197949528694153,
"logps/chosen": -250.150146484375,
"logps/rejected": -252.0699005126953,
"loss": 2.7636,
"nll_loss": 1.1389970779418945,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -25.015010833740234,
"rewards/margins": 0.19197671115398407,
"rewards/rejected": -25.206989288330078,
"step": 40
},
{
"epoch": 0.11557353366079168,
"grad_norm": 54.73969650268555,
"learning_rate": 2.3148148148148148e-07,
"logits/chosen": -0.36699360609054565,
"logits/rejected": -0.344801664352417,
"logps/chosen": -259.365966796875,
"logps/rejected": -257.6177062988281,
"loss": 2.8769,
"nll_loss": 0.9557002782821655,
"rewards/accuracies": 0.5,
"rewards/chosen": -25.936599731445312,
"rewards/margins": -0.17483071982860565,
"rewards/rejected": -25.761768341064453,
"step": 50
},
{
"epoch": 0.13868824039295002,
"grad_norm": 61.527992248535156,
"learning_rate": 2.7777777777777776e-07,
"logits/chosen": -0.4444943368434906,
"logits/rejected": -0.43780913949012756,
"logps/chosen": -241.99569702148438,
"logps/rejected": -240.5470428466797,
"loss": 2.8199,
"nll_loss": 1.0306382179260254,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": -24.199569702148438,
"rewards/margins": -0.144865483045578,
"rewards/rejected": -24.054706573486328,
"step": 60
},
{
"epoch": 0.16180294712510834,
"grad_norm": 58.2850341796875,
"learning_rate": 3.2407407407407406e-07,
"logits/chosen": -0.5648446083068848,
"logits/rejected": -0.5444747805595398,
"logps/chosen": -224.255126953125,
"logps/rejected": -223.83773803710938,
"loss": 2.7692,
"nll_loss": 0.9458900690078735,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": -22.425512313842773,
"rewards/margins": -0.04173760861158371,
"rewards/rejected": -22.383777618408203,
"step": 70
},
{
"epoch": 0.1849176538572667,
"grad_norm": 50.89101028442383,
"learning_rate": 3.703703703703703e-07,
"logits/chosen": -0.7499346733093262,
"logits/rejected": -0.7246556282043457,
"logps/chosen": -214.29019165039062,
"logps/rejected": -215.6709442138672,
"loss": 2.4664,
"nll_loss": 0.8191965222358704,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -21.429019927978516,
"rewards/margins": 0.13807573914527893,
"rewards/rejected": -21.567096710205078,
"step": 80
},
{
"epoch": 0.208032360589425,
"grad_norm": 51.08415222167969,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -0.824606716632843,
"logits/rejected": -0.803991436958313,
"logps/chosen": -185.02096557617188,
"logps/rejected": -191.6359405517578,
"loss": 2.215,
"nll_loss": 0.6511534452438354,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -18.50209617614746,
"rewards/margins": 0.6614967584609985,
"rewards/rejected": -19.163593292236328,
"step": 90
},
{
"epoch": 0.23114706732158335,
"grad_norm": 50.10819625854492,
"learning_rate": 4.6296296296296297e-07,
"logits/chosen": -0.7869374752044678,
"logits/rejected": -0.7605717778205872,
"logps/chosen": -172.6743927001953,
"logps/rejected": -173.7969512939453,
"loss": 2.2028,
"nll_loss": 0.5232411623001099,
"rewards/accuracies": 0.515625,
"rewards/chosen": -17.267436981201172,
"rewards/margins": 0.1122552752494812,
"rewards/rejected": -17.379695892333984,
"step": 100
},
{
"epoch": 0.2542617740537417,
"grad_norm": 49.00399398803711,
"learning_rate": 5.092592592592593e-07,
"logits/chosen": -0.6167671084403992,
"logits/rejected": -0.5838115811347961,
"logps/chosen": -156.83273315429688,
"logps/rejected": -159.6825408935547,
"loss": 1.8947,
"nll_loss": 0.3989648222923279,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -15.683273315429688,
"rewards/margins": 0.2849821150302887,
"rewards/rejected": -15.968255996704102,
"step": 110
},
{
"epoch": 0.27737648078590005,
"grad_norm": 48.19024658203125,
"learning_rate": 5.555555555555555e-07,
"logits/chosen": -0.48373740911483765,
"logits/rejected": -0.46102485060691833,
"logps/chosen": -161.04762268066406,
"logps/rejected": -159.78451538085938,
"loss": 1.8634,
"nll_loss": 0.3991420865058899,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": -16.10476303100586,
"rewards/margins": -0.12630942463874817,
"rewards/rejected": -15.9784517288208,
"step": 120
},
{
"epoch": 0.30049118751805837,
"grad_norm": 63.570125579833984,
"learning_rate": 6.018518518518519e-07,
"logits/chosen": -0.5185505747795105,
"logits/rejected": -0.4863056242465973,
"logps/chosen": -154.00921630859375,
"logps/rejected": -161.2861785888672,
"loss": 1.8664,
"nll_loss": 0.3488847315311432,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -15.400922775268555,
"rewards/margins": 0.7276966571807861,
"rewards/rejected": -16.128618240356445,
"step": 130
},
{
"epoch": 0.3236058942502167,
"grad_norm": 55.390159606933594,
"learning_rate": 6.481481481481481e-07,
"logits/chosen": -0.5367673635482788,
"logits/rejected": -0.5227854251861572,
"logps/chosen": -144.9154815673828,
"logps/rejected": -148.911376953125,
"loss": 1.8519,
"nll_loss": 0.29890117049217224,
"rewards/accuracies": 0.546875,
"rewards/chosen": -14.491546630859375,
"rewards/margins": 0.39959025382995605,
"rewards/rejected": -14.891136169433594,
"step": 140
},
{
"epoch": 0.34672060098237506,
"grad_norm": 88.29100799560547,
"learning_rate": 6.944444444444444e-07,
"logits/chosen": -0.5234349370002747,
"logits/rejected": -0.5064178705215454,
"logps/chosen": -144.33682250976562,
"logps/rejected": -146.9467315673828,
"loss": 1.8867,
"nll_loss": 0.29581302404403687,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -14.433680534362793,
"rewards/margins": 0.2609911262989044,
"rewards/rejected": -14.694673538208008,
"step": 150
},
{
"epoch": 0.3698353077145334,
"grad_norm": 43.19578170776367,
"learning_rate": 7.407407407407406e-07,
"logits/chosen": -0.47395405173301697,
"logits/rejected": -0.4435350298881531,
"logps/chosen": -155.87083435058594,
"logps/rejected": -157.5062255859375,
"loss": 1.7061,
"nll_loss": 0.3032439351081848,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": -15.58708381652832,
"rewards/margins": 0.16353729367256165,
"rewards/rejected": -15.750622749328613,
"step": 160
},
{
"epoch": 0.3929500144466917,
"grad_norm": 54.197662353515625,
"learning_rate": 7.870370370370371e-07,
"logits/chosen": -0.4344661235809326,
"logits/rejected": -0.4211999475955963,
"logps/chosen": -155.08998107910156,
"logps/rejected": -160.6627655029297,
"loss": 1.5591,
"nll_loss": 0.2847481667995453,
"rewards/accuracies": 0.546875,
"rewards/chosen": -15.508997917175293,
"rewards/margins": 0.5572806000709534,
"rewards/rejected": -16.066280364990234,
"step": 170
},
{
"epoch": 0.41606472117885,
"grad_norm": 48.73773956298828,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": -0.42254990339279175,
"logits/rejected": -0.4155765473842621,
"logps/chosen": -149.37136840820312,
"logps/rejected": -154.17172241210938,
"loss": 1.61,
"nll_loss": 0.27371498942375183,
"rewards/accuracies": 0.546875,
"rewards/chosen": -14.93713665008545,
"rewards/margins": 0.48003578186035156,
"rewards/rejected": -15.4171724319458,
"step": 180
},
{
"epoch": 0.4391794279110084,
"grad_norm": 51.67360305786133,
"learning_rate": 8.796296296296296e-07,
"logits/chosen": -0.4299948811531067,
"logits/rejected": -0.4166909158229828,
"logps/chosen": -157.9515380859375,
"logps/rejected": -162.32485961914062,
"loss": 1.6692,
"nll_loss": 0.2900438606739044,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.795153617858887,
"rewards/margins": 0.4373341500759125,
"rewards/rejected": -16.232486724853516,
"step": 190
},
{
"epoch": 0.4622941346431667,
"grad_norm": 45.50596618652344,
"learning_rate": 9.259259259259259e-07,
"logits/chosen": -0.35690927505493164,
"logits/rejected": -0.34764981269836426,
"logps/chosen": -154.99716186523438,
"logps/rejected": -160.2298126220703,
"loss": 1.6466,
"nll_loss": 0.2945239543914795,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -15.499715805053711,
"rewards/margins": 0.5232647061347961,
"rewards/rejected": -16.022979736328125,
"step": 200
},
{
"epoch": 0.48540884137532503,
"grad_norm": 52.31976318359375,
"learning_rate": 9.722222222222222e-07,
"logits/chosen": -0.4234965443611145,
"logits/rejected": -0.39612382650375366,
"logps/chosen": -154.9087371826172,
"logps/rejected": -155.92794799804688,
"loss": 1.6004,
"nll_loss": 0.2901446223258972,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -15.490873336791992,
"rewards/margins": 0.10192202031612396,
"rewards/rejected": -15.592794418334961,
"step": 210
},
{
"epoch": 0.5085235481074833,
"grad_norm": 54.61393737792969,
"learning_rate": 9.979423868312756e-07,
"logits/chosen": -0.4337913393974304,
"logits/rejected": -0.4053143560886383,
"logps/chosen": -168.09202575683594,
"logps/rejected": -172.47401428222656,
"loss": 1.6616,
"nll_loss": 0.30150192975997925,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -16.809206008911133,
"rewards/margins": 0.43819671869277954,
"rewards/rejected": -17.24740219116211,
"step": 220
},
{
"epoch": 0.5316382548396418,
"grad_norm": 46.82304000854492,
"learning_rate": 9.927983539094649e-07,
"logits/chosen": -0.41667041182518005,
"logits/rejected": -0.3951401710510254,
"logps/chosen": -165.96499633789062,
"logps/rejected": -171.3835906982422,
"loss": 1.6745,
"nll_loss": 0.30009427666664124,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -16.596498489379883,
"rewards/margins": 0.5418606996536255,
"rewards/rejected": -17.13835906982422,
"step": 230
},
{
"epoch": 0.5547529615718001,
"grad_norm": 51.5750846862793,
"learning_rate": 9.876543209876542e-07,
"logits/chosen": -0.3943902254104614,
"logits/rejected": -0.3833962082862854,
"logps/chosen": -163.68643188476562,
"logps/rejected": -167.90953063964844,
"loss": 1.4982,
"nll_loss": 0.2821606993675232,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -16.368642807006836,
"rewards/margins": 0.42231208086013794,
"rewards/rejected": -16.79095458984375,
"step": 240
},
{
"epoch": 0.5778676683039584,
"grad_norm": 54.075496673583984,
"learning_rate": 9.825102880658436e-07,
"logits/chosen": -0.4583554267883301,
"logits/rejected": -0.4463082253932953,
"logps/chosen": -160.63284301757812,
"logps/rejected": -163.09634399414062,
"loss": 1.639,
"nll_loss": 0.25729092955589294,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": -16.063283920288086,
"rewards/margins": 0.24634972214698792,
"rewards/rejected": -16.309635162353516,
"step": 250
},
{
"epoch": 0.6009823750361167,
"grad_norm": 50.17490768432617,
"learning_rate": 9.77366255144033e-07,
"logits/chosen": -0.4777965545654297,
"logits/rejected": -0.4631553292274475,
"logps/chosen": -154.1898956298828,
"logps/rejected": -162.0362091064453,
"loss": 1.4771,
"nll_loss": 0.27278777956962585,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -15.418991088867188,
"rewards/margins": 0.7846304178237915,
"rewards/rejected": -16.2036190032959,
"step": 260
},
{
"epoch": 0.624097081768275,
"grad_norm": 44.40957260131836,
"learning_rate": 9.722222222222222e-07,
"logits/chosen": -0.48693957924842834,
"logits/rejected": -0.4778309762477875,
"logps/chosen": -162.27188110351562,
"logps/rejected": -169.07962036132812,
"loss": 1.5028,
"nll_loss": 0.2821035087108612,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -16.227190017700195,
"rewards/margins": 0.6807710528373718,
"rewards/rejected": -16.907960891723633,
"step": 270
},
{
"epoch": 0.6472117885004334,
"grad_norm": 50.629066467285156,
"learning_rate": 9.670781893004115e-07,
"logits/chosen": -0.39725005626678467,
"logits/rejected": -0.3660200238227844,
"logps/chosen": -158.48001098632812,
"logps/rejected": -167.71119689941406,
"loss": 1.4805,
"nll_loss": 0.2827926576137543,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -15.848001480102539,
"rewards/margins": 0.9231182932853699,
"rewards/rejected": -16.771120071411133,
"step": 280
},
{
"epoch": 0.6703264952325917,
"grad_norm": 55.39129638671875,
"learning_rate": 9.619341563786007e-07,
"logits/chosen": -0.5320179462432861,
"logits/rejected": -0.4930430054664612,
"logps/chosen": -166.970947265625,
"logps/rejected": -172.72909545898438,
"loss": 1.4575,
"nll_loss": 0.2989470362663269,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -16.697093963623047,
"rewards/margins": 0.5758152604103088,
"rewards/rejected": -17.272911071777344,
"step": 290
},
{
"epoch": 0.6934412019647501,
"grad_norm": 42.369606018066406,
"learning_rate": 9.567901234567902e-07,
"logits/chosen": -0.43348032236099243,
"logits/rejected": -0.4254017472267151,
"logps/chosen": -162.8667449951172,
"logps/rejected": -172.35897827148438,
"loss": 1.4884,
"nll_loss": 0.2910870611667633,
"rewards/accuracies": 0.59375,
"rewards/chosen": -16.286678314208984,
"rewards/margins": 0.9492223858833313,
"rewards/rejected": -17.235897064208984,
"step": 300
},
{
"epoch": 0.7165559086969084,
"grad_norm": 48.293399810791016,
"learning_rate": 9.516460905349794e-07,
"logits/chosen": -0.509886622428894,
"logits/rejected": -0.49991345405578613,
"logps/chosen": -173.03567504882812,
"logps/rejected": -176.65750122070312,
"loss": 1.5401,
"nll_loss": 0.30316367745399475,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -17.30356788635254,
"rewards/margins": 0.36218342185020447,
"rewards/rejected": -17.665752410888672,
"step": 310
},
{
"epoch": 0.7396706154290668,
"grad_norm": 45.7746467590332,
"learning_rate": 9.465020576131687e-07,
"logits/chosen": -0.503333568572998,
"logits/rejected": -0.4878058433532715,
"logps/chosen": -163.34519958496094,
"logps/rejected": -172.25938415527344,
"loss": 1.5247,
"nll_loss": 0.29550039768218994,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -16.33452033996582,
"rewards/margins": 0.89141845703125,
"rewards/rejected": -17.225940704345703,
"step": 320
},
{
"epoch": 0.7627853221612251,
"grad_norm": 48.05742645263672,
"learning_rate": 9.413580246913579e-07,
"logits/chosen": -0.5755558609962463,
"logits/rejected": -0.5767273902893066,
"logps/chosen": -158.17958068847656,
"logps/rejected": -165.14163208007812,
"loss": 1.4969,
"nll_loss": 0.2938057780265808,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -15.817957878112793,
"rewards/margins": 0.6962078809738159,
"rewards/rejected": -16.5141658782959,
"step": 330
},
{
"epoch": 0.7859000288933834,
"grad_norm": 45.862648010253906,
"learning_rate": 9.362139917695473e-07,
"logits/chosen": -0.6315797567367554,
"logits/rejected": -0.6231464147567749,
"logps/chosen": -164.8571014404297,
"logps/rejected": -170.53570556640625,
"loss": 1.3908,
"nll_loss": 0.28307533264160156,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -16.48571014404297,
"rewards/margins": 0.567859947681427,
"rewards/rejected": -17.053571701049805,
"step": 340
},
{
"epoch": 0.8090147356255417,
"grad_norm": 45.217002868652344,
"learning_rate": 9.310699588477366e-07,
"logits/chosen": -0.5783101320266724,
"logits/rejected": -0.5816030502319336,
"logps/chosen": -167.26516723632812,
"logps/rejected": -176.68746948242188,
"loss": 1.5036,
"nll_loss": 0.2909998297691345,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -16.726520538330078,
"rewards/margins": 0.9422298669815063,
"rewards/rejected": -17.66874885559082,
"step": 350
},
{
"epoch": 0.8321294423577,
"grad_norm": 56.84000778198242,
"learning_rate": 9.259259259259259e-07,
"logits/chosen": -0.5195820927619934,
"logits/rejected": -0.5026860237121582,
"logps/chosen": -171.53640747070312,
"logps/rejected": -177.3377227783203,
"loss": 1.5078,
"nll_loss": 0.29021695256233215,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -17.153636932373047,
"rewards/margins": 0.5801342725753784,
"rewards/rejected": -17.733774185180664,
"step": 360
},
{
"epoch": 0.8552441490898585,
"grad_norm": 50.610069274902344,
"learning_rate": 9.207818930041152e-07,
"logits/chosen": -0.49760836362838745,
"logits/rejected": -0.4677702784538269,
"logps/chosen": -161.1763153076172,
"logps/rejected": -171.69003295898438,
"loss": 1.3722,
"nll_loss": 0.26248103380203247,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -16.117631912231445,
"rewards/margins": 1.051371693611145,
"rewards/rejected": -17.169002532958984,
"step": 370
},
{
"epoch": 0.8783588558220168,
"grad_norm": 54.772438049316406,
"learning_rate": 9.156378600823045e-07,
"logits/chosen": -0.42570480704307556,
"logits/rejected": -0.4065491259098053,
"logps/chosen": -168.25025939941406,
"logps/rejected": -176.4032440185547,
"loss": 1.3843,
"nll_loss": 0.313023179769516,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -16.825023651123047,
"rewards/margins": 0.8152991533279419,
"rewards/rejected": -17.64032554626465,
"step": 380
},
{
"epoch": 0.9014735625541751,
"grad_norm": 50.42124557495117,
"learning_rate": 9.104938271604939e-07,
"logits/chosen": -0.43410390615463257,
"logits/rejected": -0.4136204719543457,
"logps/chosen": -165.08279418945312,
"logps/rejected": -176.14059448242188,
"loss": 1.4235,
"nll_loss": 0.27761662006378174,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -16.50827980041504,
"rewards/margins": 1.105778455734253,
"rewards/rejected": -17.614057540893555,
"step": 390
},
{
"epoch": 0.9245882692863334,
"grad_norm": 51.66304016113281,
"learning_rate": 9.053497942386831e-07,
"logits/chosen": -0.40831509232521057,
"logits/rejected": -0.3836323916912079,
"logps/chosen": -162.02064514160156,
"logps/rejected": -169.6013946533203,
"loss": 1.3933,
"nll_loss": 0.28827401995658875,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -16.20206642150879,
"rewards/margins": 0.7580735087394714,
"rewards/rejected": -16.960140228271484,
"step": 400
},
{
"epoch": 0.9477029760184917,
"grad_norm": 48.54574966430664,
"learning_rate": 9.002057613168724e-07,
"logits/chosen": -0.36130112409591675,
"logits/rejected": -0.35345903038978577,
"logps/chosen": -159.15536499023438,
"logps/rejected": -170.9656524658203,
"loss": 1.3593,
"nll_loss": 0.2898252308368683,
"rewards/accuracies": 0.6343749761581421,
"rewards/chosen": -15.915536880493164,
"rewards/margins": 1.181027889251709,
"rewards/rejected": -17.09656524658203,
"step": 410
},
{
"epoch": 0.9708176827506501,
"grad_norm": 43.59242248535156,
"learning_rate": 8.950617283950617e-07,
"logits/chosen": -0.4918903410434723,
"logits/rejected": -0.4697975516319275,
"logps/chosen": -165.565673828125,
"logps/rejected": -174.68519592285156,
"loss": 1.3598,
"nll_loss": 0.30875933170318604,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -16.556568145751953,
"rewards/margins": 0.9119526147842407,
"rewards/rejected": -17.468521118164062,
"step": 420
},
{
"epoch": 0.9939323894828085,
"grad_norm": 50.116798400878906,
"learning_rate": 8.89917695473251e-07,
"logits/chosen": -0.49847784638404846,
"logits/rejected": -0.5088882446289062,
"logps/chosen": -167.231201171875,
"logps/rejected": -177.6866455078125,
"loss": 1.4367,
"nll_loss": 0.28403669595718384,
"rewards/accuracies": 0.6343749761581421,
"rewards/chosen": -16.723121643066406,
"rewards/margins": 1.0455443859100342,
"rewards/rejected": -17.768667221069336,
"step": 430
},
{
"epoch": 0.9985553308292401,
"eval_logits/chosen": -0.4373142123222351,
"eval_logits/rejected": -0.40795600414276123,
"eval_logps/chosen": -170.67918395996094,
"eval_logps/rejected": -180.96241760253906,
"eval_loss": 1.392618179321289,
"eval_nll_loss": 0.3199608623981476,
"eval_rewards/accuracies": 0.656521737575531,
"eval_rewards/chosen": -17.067920684814453,
"eval_rewards/margins": 1.0283225774765015,
"eval_rewards/rejected": -18.096242904663086,
"eval_runtime": 77.5612,
"eval_samples_per_second": 23.543,
"eval_steps_per_second": 1.483,
"step": 432
},
{
"epoch": 1.0170470962149667,
"grad_norm": 35.45933151245117,
"learning_rate": 8.847736625514403e-07,
"logits/chosen": -0.45173630118370056,
"logits/rejected": -0.4663858413696289,
"logps/chosen": -160.457275390625,
"logps/rejected": -179.97222900390625,
"loss": 0.9484,
"nll_loss": 0.30594602227211,
"rewards/accuracies": 0.784375011920929,
"rewards/chosen": -16.045726776123047,
"rewards/margins": 1.9514964818954468,
"rewards/rejected": -17.997224807739258,
"step": 440
},
{
"epoch": 1.0401618029471251,
"grad_norm": 27.835773468017578,
"learning_rate": 8.796296296296296e-07,
"logits/chosen": -0.3361106514930725,
"logits/rejected": -0.3292810022830963,
"logps/chosen": -149.01544189453125,
"logps/rejected": -169.8839111328125,
"loss": 0.7764,
"nll_loss": 0.25240465998649597,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -14.901544570922852,
"rewards/margins": 2.086846351623535,
"rewards/rejected": -16.988391876220703,
"step": 450
},
{
"epoch": 1.0632765096792833,
"grad_norm": 32.76046371459961,
"learning_rate": 8.744855967078189e-07,
"logits/chosen": -0.4512772560119629,
"logits/rejected": -0.4271810054779053,
"logps/chosen": -152.64132690429688,
"logps/rejected": -174.70986938476562,
"loss": 0.7216,
"nll_loss": 0.25062257051467896,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.264132499694824,
"rewards/margins": 2.206853151321411,
"rewards/rejected": -17.470985412597656,
"step": 460
},
{
"epoch": 1.0863912164114418,
"grad_norm": 46.92816162109375,
"learning_rate": 8.693415637860082e-07,
"logits/chosen": -0.510484516620636,
"logits/rejected": -0.4754946827888489,
"logps/chosen": -151.33753967285156,
"logps/rejected": -175.41604614257812,
"loss": 0.7542,
"nll_loss": 0.2625353932380676,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -15.133753776550293,
"rewards/margins": 2.4078497886657715,
"rewards/rejected": -17.54160499572754,
"step": 470
},
{
"epoch": 1.1095059231436002,
"grad_norm": 45.01936721801758,
"learning_rate": 8.641975308641974e-07,
"logits/chosen": -0.5488854646682739,
"logits/rejected": -0.534773588180542,
"logps/chosen": -158.13259887695312,
"logps/rejected": -183.81103515625,
"loss": 0.7397,
"nll_loss": 0.23221275210380554,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -15.813260078430176,
"rewards/margins": 2.5678436756134033,
"rewards/rejected": -18.381103515625,
"step": 480
},
{
"epoch": 1.1326206298757584,
"grad_norm": 29.731250762939453,
"learning_rate": 8.590534979423868e-07,
"logits/chosen": -0.4209683835506439,
"logits/rejected": -0.40175366401672363,
"logps/chosen": -148.5663604736328,
"logps/rejected": -172.50228881835938,
"loss": 0.6839,
"nll_loss": 0.2801415026187897,
"rewards/accuracies": 0.846875011920929,
"rewards/chosen": -14.856637954711914,
"rewards/margins": 2.3935940265655518,
"rewards/rejected": -17.250232696533203,
"step": 490
},
{
"epoch": 1.1557353366079168,
"grad_norm": 35.19107437133789,
"learning_rate": 8.539094650205761e-07,
"logits/chosen": -0.5119351148605347,
"logits/rejected": -0.48603877425193787,
"logps/chosen": -147.54727172851562,
"logps/rejected": -172.57888793945312,
"loss": 0.7342,
"nll_loss": 0.24299657344818115,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -14.754727363586426,
"rewards/margins": 2.503164529800415,
"rewards/rejected": -17.257890701293945,
"step": 500
},
{
"epoch": 1.178850043340075,
"grad_norm": 36.37306213378906,
"learning_rate": 8.487654320987654e-07,
"logits/chosen": -0.5116412043571472,
"logits/rejected": -0.5097488164901733,
"logps/chosen": -152.76693725585938,
"logps/rejected": -173.20547485351562,
"loss": 0.7418,
"nll_loss": 0.2616187334060669,
"rewards/accuracies": 0.796875,
"rewards/chosen": -15.276693344116211,
"rewards/margins": 2.0438523292541504,
"rewards/rejected": -17.320547103881836,
"step": 510
},
{
"epoch": 1.2019647500722335,
"grad_norm": 32.158714294433594,
"learning_rate": 8.436213991769548e-07,
"logits/chosen": -0.41989222168922424,
"logits/rejected": -0.40580207109451294,
"logps/chosen": -160.35772705078125,
"logps/rejected": -186.72616577148438,
"loss": 0.7297,
"nll_loss": 0.2849249839782715,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -16.0357723236084,
"rewards/margins": 2.636845111846924,
"rewards/rejected": -18.672618865966797,
"step": 520
},
{
"epoch": 1.2250794568043917,
"grad_norm": 38.98585510253906,
"learning_rate": 8.38477366255144e-07,
"logits/chosen": -0.43002861738204956,
"logits/rejected": -0.43659868836402893,
"logps/chosen": -149.89114379882812,
"logps/rejected": -177.4897918701172,
"loss": 0.7001,
"nll_loss": 0.25785765051841736,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -14.989115715026855,
"rewards/margins": 2.7598659992218018,
"rewards/rejected": -17.748981475830078,
"step": 530
},
{
"epoch": 1.24819416353655,
"grad_norm": 33.50174331665039,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": -0.5792837142944336,
"logits/rejected": -0.5748234987258911,
"logps/chosen": -154.1841278076172,
"logps/rejected": -175.39093017578125,
"loss": 0.77,
"nll_loss": 0.28076162934303284,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -15.418413162231445,
"rewards/margins": 2.120678424835205,
"rewards/rejected": -17.539093017578125,
"step": 540
},
{
"epoch": 1.2713088702687085,
"grad_norm": 35.51890182495117,
"learning_rate": 8.281893004115226e-07,
"logits/chosen": -0.6797876358032227,
"logits/rejected": -0.6701671481132507,
"logps/chosen": -164.1734619140625,
"logps/rejected": -189.96820068359375,
"loss": 0.6452,
"nll_loss": 0.2875816822052002,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -16.417346954345703,
"rewards/margins": 2.579475164413452,
"rewards/rejected": -18.9968204498291,
"step": 550
},
{
"epoch": 1.2944235770008667,
"grad_norm": 36.58209228515625,
"learning_rate": 8.23045267489712e-07,
"logits/chosen": -0.6092251539230347,
"logits/rejected": -0.5988754630088806,
"logps/chosen": -150.59115600585938,
"logps/rejected": -178.7034149169922,
"loss": 0.7005,
"nll_loss": 0.26352283358573914,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": -15.059117317199707,
"rewards/margins": 2.811225652694702,
"rewards/rejected": -17.870342254638672,
"step": 560
},
{
"epoch": 1.3175382837330252,
"grad_norm": 38.884254455566406,
"learning_rate": 8.179012345679011e-07,
"logits/chosen": -0.5773380994796753,
"logits/rejected": -0.5545040369033813,
"logps/chosen": -159.92147827148438,
"logps/rejected": -186.68997192382812,
"loss": 0.7401,
"nll_loss": 0.26087266206741333,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.992147445678711,
"rewards/margins": 2.6768481731414795,
"rewards/rejected": -18.668996810913086,
"step": 570
},
{
"epoch": 1.3406529904651836,
"grad_norm": 43.70725631713867,
"learning_rate": 8.127572016460905e-07,
"logits/chosen": -0.5863763093948364,
"logits/rejected": -0.5670869946479797,
"logps/chosen": -157.2144012451172,
"logps/rejected": -184.8651123046875,
"loss": 0.72,
"nll_loss": 0.2669151723384857,
"rewards/accuracies": 0.8531249761581421,
"rewards/chosen": -15.721441268920898,
"rewards/margins": 2.7650701999664307,
"rewards/rejected": -18.486513137817383,
"step": 580
},
{
"epoch": 1.3637676971973418,
"grad_norm": 39.63798904418945,
"learning_rate": 8.076131687242798e-07,
"logits/chosen": -0.529544472694397,
"logits/rejected": -0.5398887395858765,
"logps/chosen": -148.3323974609375,
"logps/rejected": -174.19955444335938,
"loss": 0.6607,
"nll_loss": 0.24997957050800323,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -14.833239555358887,
"rewards/margins": 2.586716890335083,
"rewards/rejected": -17.419958114624023,
"step": 590
},
{
"epoch": 1.3868824039295,
"grad_norm": 36.14802169799805,
"learning_rate": 8.024691358024691e-07,
"logits/chosen": -0.441204309463501,
"logits/rejected": -0.4048687815666199,
"logps/chosen": -156.30531311035156,
"logps/rejected": -183.83956909179688,
"loss": 0.733,
"nll_loss": 0.2541951537132263,
"rewards/accuracies": 0.8343750238418579,
"rewards/chosen": -15.630529403686523,
"rewards/margins": 2.753427743911743,
"rewards/rejected": -18.38395881652832,
"step": 600
},
{
"epoch": 1.4099971106616584,
"grad_norm": 40.05307388305664,
"learning_rate": 7.973251028806583e-07,
"logits/chosen": -0.41722431778907776,
"logits/rejected": -0.4100796580314636,
"logps/chosen": -151.99453735351562,
"logps/rejected": -175.85577392578125,
"loss": 0.7682,
"nll_loss": 0.25730782747268677,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -15.199453353881836,
"rewards/margins": 2.3861212730407715,
"rewards/rejected": -17.585575103759766,
"step": 610
},
{
"epoch": 1.4331118173938169,
"grad_norm": 24.526100158691406,
"learning_rate": 7.921810699588477e-07,
"logits/chosen": -0.5749002695083618,
"logits/rejected": -0.5751099586486816,
"logps/chosen": -157.60520935058594,
"logps/rejected": -185.5096893310547,
"loss": 0.5956,
"nll_loss": 0.24547366797924042,
"rewards/accuracies": 0.8843749761581421,
"rewards/chosen": -15.760522842407227,
"rewards/margins": 2.790447473526001,
"rewards/rejected": -18.55097007751465,
"step": 620
},
{
"epoch": 1.456226524125975,
"grad_norm": 36.09085464477539,
"learning_rate": 7.870370370370371e-07,
"logits/chosen": -0.5282450914382935,
"logits/rejected": -0.5175204873085022,
"logps/chosen": -146.50106811523438,
"logps/rejected": -173.6673126220703,
"loss": 0.6405,
"nll_loss": 0.24812671542167664,
"rewards/accuracies": 0.859375,
"rewards/chosen": -14.650106430053711,
"rewards/margins": 2.7166221141815186,
"rewards/rejected": -17.366729736328125,
"step": 630
},
{
"epoch": 1.4793412308581335,
"grad_norm": 41.768348693847656,
"learning_rate": 7.818930041152262e-07,
"logits/chosen": -0.45312589406967163,
"logits/rejected": -0.4504320025444031,
"logps/chosen": -142.28053283691406,
"logps/rejected": -170.82095336914062,
"loss": 0.6841,
"nll_loss": 0.23785972595214844,
"rewards/accuracies": 0.871874988079071,
"rewards/chosen": -14.228052139282227,
"rewards/margins": 2.8540425300598145,
"rewards/rejected": -17.082096099853516,
"step": 640
},
{
"epoch": 1.502455937590292,
"grad_norm": 34.300228118896484,
"learning_rate": 7.767489711934156e-07,
"logits/chosen": -0.5092964172363281,
"logits/rejected": -0.5271193981170654,
"logps/chosen": -155.85000610351562,
"logps/rejected": -186.28884887695312,
"loss": 0.6303,
"nll_loss": 0.24494795501232147,
"rewards/accuracies": 0.878125011920929,
"rewards/chosen": -15.584999084472656,
"rewards/margins": 3.0438854694366455,
"rewards/rejected": -18.62888526916504,
"step": 650
},
{
"epoch": 1.5255706443224502,
"grad_norm": 33.022884368896484,
"learning_rate": 7.716049382716049e-07,
"logits/chosen": -0.5350406169891357,
"logits/rejected": -0.5363395810127258,
"logps/chosen": -147.15267944335938,
"logps/rejected": -174.66571044921875,
"loss": 0.7096,
"nll_loss": 0.24733343720436096,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": -14.7152681350708,
"rewards/margins": 2.751302480697632,
"rewards/rejected": -17.466571807861328,
"step": 660
},
{
"epoch": 1.5486853510546084,
"grad_norm": 53.42652130126953,
"learning_rate": 7.664609053497943e-07,
"logits/chosen": -0.6187707781791687,
"logits/rejected": -0.6232476234436035,
"logps/chosen": -158.1448211669922,
"logps/rejected": -187.09014892578125,
"loss": 0.6173,
"nll_loss": 0.22900207340717316,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -15.814483642578125,
"rewards/margins": 2.8945329189300537,
"rewards/rejected": -18.709014892578125,
"step": 670
},
{
"epoch": 1.5718000577867668,
"grad_norm": 40.11577606201172,
"learning_rate": 7.613168724279834e-07,
"logits/chosen": -0.5888317227363586,
"logits/rejected": -0.600538432598114,
"logps/chosen": -149.23678588867188,
"logps/rejected": -175.3176727294922,
"loss": 0.7099,
"nll_loss": 0.21695959568023682,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -14.923675537109375,
"rewards/margins": 2.6080896854400635,
"rewards/rejected": -17.531766891479492,
"step": 680
},
{
"epoch": 1.5949147645189252,
"grad_norm": 26.918350219726562,
"learning_rate": 7.561728395061728e-07,
"logits/chosen": -0.6150851845741272,
"logits/rejected": -0.6231178045272827,
"logps/chosen": -164.5893096923828,
"logps/rejected": -196.3010711669922,
"loss": 0.6595,
"nll_loss": 0.23331816494464874,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -16.45893096923828,
"rewards/margins": 3.171175479888916,
"rewards/rejected": -19.630107879638672,
"step": 690
},
{
"epoch": 1.6180294712510834,
"grad_norm": 33.39554214477539,
"learning_rate": 7.510288065843621e-07,
"logits/chosen": -0.5018739700317383,
"logits/rejected": -0.4825282692909241,
"logps/chosen": -149.8149871826172,
"logps/rejected": -177.98583984375,
"loss": 0.6348,
"nll_loss": 0.2212187498807907,
"rewards/accuracies": 0.8656250238418579,
"rewards/chosen": -14.981498718261719,
"rewards/margins": 2.817084789276123,
"rewards/rejected": -17.798583984375,
"step": 700
},
{
"epoch": 1.6411441779832419,
"grad_norm": 29.109973907470703,
"learning_rate": 7.458847736625515e-07,
"logits/chosen": -0.47257423400878906,
"logits/rejected": -0.4691304564476013,
"logps/chosen": -138.67837524414062,
"logps/rejected": -164.54855346679688,
"loss": 0.6175,
"nll_loss": 0.1982104480266571,
"rewards/accuracies": 0.831250011920929,
"rewards/chosen": -13.867838859558105,
"rewards/margins": 2.5870203971862793,
"rewards/rejected": -16.454858779907227,
"step": 710
},
{
"epoch": 1.6642588847154003,
"grad_norm": 38.35542678833008,
"learning_rate": 7.407407407407406e-07,
"logits/chosen": -0.6042996644973755,
"logits/rejected": -0.6067830324172974,
"logps/chosen": -144.49464416503906,
"logps/rejected": -169.24853515625,
"loss": 0.5938,
"nll_loss": 0.23023180663585663,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -14.449464797973633,
"rewards/margins": 2.4753904342651367,
"rewards/rejected": -16.924854278564453,
"step": 720
},
{
"epoch": 1.6873735914475585,
"grad_norm": 32.6804084777832,
"learning_rate": 7.3559670781893e-07,
"logits/chosen": -0.6318911910057068,
"logits/rejected": -0.623616099357605,
"logps/chosen": -151.0692596435547,
"logps/rejected": -178.22621154785156,
"loss": 0.6287,
"nll_loss": 0.20305195450782776,
"rewards/accuracies": 0.84375,
"rewards/chosen": -15.106924057006836,
"rewards/margins": 2.7156949043273926,
"rewards/rejected": -17.822620391845703,
"step": 730
},
{
"epoch": 1.7104882981797167,
"grad_norm": 33.47980499267578,
"learning_rate": 7.304526748971193e-07,
"logits/chosen": -0.5788182020187378,
"logits/rejected": -0.5648819208145142,
"logps/chosen": -162.39569091796875,
"logps/rejected": -193.59268188476562,
"loss": 0.5942,
"nll_loss": 0.21426251530647278,
"rewards/accuracies": 0.84375,
"rewards/chosen": -16.23956871032715,
"rewards/margins": 3.1196982860565186,
"rewards/rejected": -19.359268188476562,
"step": 740
},
{
"epoch": 1.7336030049118751,
"grad_norm": 37.14680099487305,
"learning_rate": 7.253086419753086e-07,
"logits/chosen": -0.5623105764389038,
"logits/rejected": -0.5381472110748291,
"logps/chosen": -139.84085083007812,
"logps/rejected": -167.0809326171875,
"loss": 0.598,
"nll_loss": 0.18970206379890442,
"rewards/accuracies": 0.8656250238418579,
"rewards/chosen": -13.984085083007812,
"rewards/margins": 2.7240078449249268,
"rewards/rejected": -16.708093643188477,
"step": 750
},
{
"epoch": 1.7567177116440336,
"grad_norm": 35.07746124267578,
"learning_rate": 7.201646090534979e-07,
"logits/chosen": -0.5330817103385925,
"logits/rejected": -0.540014386177063,
"logps/chosen": -153.24600219726562,
"logps/rejected": -185.0384063720703,
"loss": 0.6322,
"nll_loss": 0.198031947016716,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -15.324602127075195,
"rewards/margins": 3.1792402267456055,
"rewards/rejected": -18.503841400146484,
"step": 760
},
{
"epoch": 1.7798324183761918,
"grad_norm": 34.26885986328125,
"learning_rate": 7.150205761316872e-07,
"logits/chosen": -0.6087044477462769,
"logits/rejected": -0.599485456943512,
"logps/chosen": -145.72488403320312,
"logps/rejected": -171.98873901367188,
"loss": 0.6407,
"nll_loss": 0.18888258934020996,
"rewards/accuracies": 0.8187500238418579,
"rewards/chosen": -14.572488784790039,
"rewards/margins": 2.626385450363159,
"rewards/rejected": -17.19887351989746,
"step": 770
},
{
"epoch": 1.8029471251083502,
"grad_norm": 33.3639030456543,
"learning_rate": 7.098765432098766e-07,
"logits/chosen": -0.6275098323822021,
"logits/rejected": -0.6126091480255127,
"logps/chosen": -149.48826599121094,
"logps/rejected": -179.92613220214844,
"loss": 0.6014,
"nll_loss": 0.2067473828792572,
"rewards/accuracies": 0.890625,
"rewards/chosen": -14.948827743530273,
"rewards/margins": 3.0437865257263184,
"rewards/rejected": -17.99261474609375,
"step": 780
},
{
"epoch": 1.8260618318405086,
"grad_norm": 34.436153411865234,
"learning_rate": 7.047325102880658e-07,
"logits/chosen": -0.6325902938842773,
"logits/rejected": -0.6320141553878784,
"logps/chosen": -149.53546142578125,
"logps/rejected": -177.4294891357422,
"loss": 0.5987,
"nll_loss": 0.21218529343605042,
"rewards/accuracies": 0.840624988079071,
"rewards/chosen": -14.953544616699219,
"rewards/margins": 2.7894036769866943,
"rewards/rejected": -17.742948532104492,
"step": 790
},
{
"epoch": 1.8491765385726668,
"grad_norm": 41.68962097167969,
"learning_rate": 6.995884773662551e-07,
"logits/chosen": -0.5112544298171997,
"logits/rejected": -0.5018970370292664,
"logps/chosen": -139.74612426757812,
"logps/rejected": -170.65365600585938,
"loss": 0.5737,
"nll_loss": 0.18416205048561096,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": -13.97461223602295,
"rewards/margins": 3.0907552242279053,
"rewards/rejected": -17.065366744995117,
"step": 800
},
{
"epoch": 1.872291245304825,
"grad_norm": 34.62812423706055,
"learning_rate": 6.944444444444444e-07,
"logits/chosen": -0.5771014094352722,
"logits/rejected": -0.5736783146858215,
"logps/chosen": -149.42527770996094,
"logps/rejected": -179.3314666748047,
"loss": 0.6492,
"nll_loss": 0.19857726991176605,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": -14.942527770996094,
"rewards/margins": 2.990619421005249,
"rewards/rejected": -17.933147430419922,
"step": 810
},
{
"epoch": 1.8954059520369835,
"grad_norm": 27.703113555908203,
"learning_rate": 6.893004115226337e-07,
"logits/chosen": -0.6073204278945923,
"logits/rejected": -0.6056413054466248,
"logps/chosen": -151.15286254882812,
"logps/rejected": -184.02236938476562,
"loss": 0.5758,
"nll_loss": 0.20334260165691376,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -15.115285873413086,
"rewards/margins": 3.2869529724121094,
"rewards/rejected": -18.402238845825195,
"step": 820
},
{
"epoch": 1.918520658769142,
"grad_norm": 38.63829040527344,
"learning_rate": 6.84156378600823e-07,
"logits/chosen": -0.564698338508606,
"logits/rejected": -0.5553814172744751,
"logps/chosen": -141.9647216796875,
"logps/rejected": -167.49462890625,
"loss": 0.604,
"nll_loss": 0.19638094305992126,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -14.19647216796875,
"rewards/margins": 2.552992343902588,
"rewards/rejected": -16.74946403503418,
"step": 830
},
{
"epoch": 1.9416353655013001,
"grad_norm": 37.33395767211914,
"learning_rate": 6.790123456790123e-07,
"logits/chosen": -0.6794390678405762,
"logits/rejected": -0.6817184686660767,
"logps/chosen": -150.2278289794922,
"logps/rejected": -178.04473876953125,
"loss": 0.6078,
"nll_loss": 0.18291929364204407,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -15.022783279418945,
"rewards/margins": 2.781691074371338,
"rewards/rejected": -17.804473876953125,
"step": 840
},
{
"epoch": 1.9647500722334585,
"grad_norm": 33.96713638305664,
"learning_rate": 6.738683127572016e-07,
"logits/chosen": -0.716331422328949,
"logits/rejected": -0.7188450694084167,
"logps/chosen": -147.86050415039062,
"logps/rejected": -174.76864624023438,
"loss": 0.5987,
"nll_loss": 0.19556212425231934,
"rewards/accuracies": 0.878125011920929,
"rewards/chosen": -14.786050796508789,
"rewards/margins": 2.6908116340637207,
"rewards/rejected": -17.476863861083984,
"step": 850
},
{
"epoch": 1.987864778965617,
"grad_norm": 35.31864929199219,
"learning_rate": 6.687242798353909e-07,
"logits/chosen": -0.6668294668197632,
"logits/rejected": -0.6580954790115356,
"logps/chosen": -149.87158203125,
"logps/rejected": -180.49496459960938,
"loss": 0.5472,
"nll_loss": 0.1864423006772995,
"rewards/accuracies": 0.8656250238418579,
"rewards/chosen": -14.987157821655273,
"rewards/margins": 3.06233811378479,
"rewards/rejected": -18.049495697021484,
"step": 860
},
{
"epoch": 1.999422132331696,
"eval_logits/chosen": -0.5687969923019409,
"eval_logits/rejected": -0.5434355139732361,
"eval_logps/chosen": -162.90855407714844,
"eval_logps/rejected": -175.85232543945312,
"eval_loss": 1.2972584962844849,
"eval_nll_loss": 0.2148308902978897,
"eval_rewards/accuracies": 0.658695638179779,
"eval_rewards/chosen": -16.290855407714844,
"eval_rewards/margins": 1.2943781614303589,
"eval_rewards/rejected": -17.585235595703125,
"eval_runtime": 77.3685,
"eval_samples_per_second": 23.601,
"eval_steps_per_second": 1.486,
"step": 865
},
{
"epoch": 2.010979485697775,
"grad_norm": 11.489439964294434,
"learning_rate": 6.635802469135802e-07,
"logits/chosen": -0.6154376864433289,
"logits/rejected": -0.581082820892334,
"logps/chosen": -146.31674194335938,
"logps/rejected": -183.1867218017578,
"loss": 0.4233,
"nll_loss": 0.17745935916900635,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -14.631675720214844,
"rewards/margins": 3.6869969367980957,
"rewards/rejected": -18.318674087524414,
"step": 870
},
{
"epoch": 2.0340941924299334,
"grad_norm": 8.267936706542969,
"learning_rate": 6.584362139917695e-07,
"logits/chosen": -0.5296713709831238,
"logits/rejected": -0.5492919683456421,
"logps/chosen": -135.2528839111328,
"logps/rejected": -184.4834747314453,
"loss": 0.2554,
"nll_loss": 0.17692770063877106,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -13.525288581848145,
"rewards/margins": 4.923060417175293,
"rewards/rejected": -18.448348999023438,
"step": 880
},
{
"epoch": 2.057208899162092,
"grad_norm": 17.753084182739258,
"learning_rate": 6.532921810699589e-07,
"logits/chosen": -0.4458081126213074,
"logits/rejected": -0.45663532614707947,
"logps/chosen": -132.5780792236328,
"logps/rejected": -181.31776428222656,
"loss": 0.2358,
"nll_loss": 0.1446482390165329,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -13.257807731628418,
"rewards/margins": 4.87396764755249,
"rewards/rejected": -18.13177490234375,
"step": 890
},
{
"epoch": 2.0803236058942502,
"grad_norm": 9.170333862304688,
"learning_rate": 6.481481481481481e-07,
"logits/chosen": -0.4914008677005768,
"logits/rejected": -0.4894467890262604,
"logps/chosen": -139.57400512695312,
"logps/rejected": -189.27447509765625,
"loss": 0.2373,
"nll_loss": 0.1590987890958786,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -13.95740032196045,
"rewards/margins": 4.970045566558838,
"rewards/rejected": -18.927448272705078,
"step": 900
},
{
"epoch": 2.1034383126264085,
"grad_norm": 16.0671329498291,
"learning_rate": 6.430041152263375e-07,
"logits/chosen": -0.29768380522727966,
"logits/rejected": -0.3132530450820923,
"logps/chosen": -133.86160278320312,
"logps/rejected": -184.111083984375,
"loss": 0.2528,
"nll_loss": 0.1800731122493744,
"rewards/accuracies": 0.984375,
"rewards/chosen": -13.386159896850586,
"rewards/margins": 5.024949073791504,
"rewards/rejected": -18.411109924316406,
"step": 910
},
{
"epoch": 2.1265530193585667,
"grad_norm": 11.169416427612305,
"learning_rate": 6.378600823045267e-07,
"logits/chosen": -0.25930145382881165,
"logits/rejected": -0.2452802211046219,
"logps/chosen": -138.69859313964844,
"logps/rejected": -188.9458465576172,
"loss": 0.2369,
"nll_loss": 0.15493367612361908,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -13.86985969543457,
"rewards/margins": 5.024728298187256,
"rewards/rejected": -18.894588470458984,
"step": 920
},
{
"epoch": 2.1496677260907253,
"grad_norm": 20.787609100341797,
"learning_rate": 6.32716049382716e-07,
"logits/chosen": -0.4232078194618225,
"logits/rejected": -0.4213971197605133,
"logps/chosen": -133.97911071777344,
"logps/rejected": -183.1697540283203,
"loss": 0.2526,
"nll_loss": 0.17497238516807556,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -13.397911071777344,
"rewards/margins": 4.919064998626709,
"rewards/rejected": -18.31697654724121,
"step": 930
},
{
"epoch": 2.1727824328228835,
"grad_norm": 16.55530548095703,
"learning_rate": 6.275720164609053e-07,
"logits/chosen": -0.5225564241409302,
"logits/rejected": -0.5253915190696716,
"logps/chosen": -147.48667907714844,
"logps/rejected": -200.44107055664062,
"loss": 0.2383,
"nll_loss": 0.16094490885734558,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -14.748669624328613,
"rewards/margins": 5.295438766479492,
"rewards/rejected": -20.044105529785156,
"step": 940
},
{
"epoch": 2.1958971395550417,
"grad_norm": 25.473421096801758,
"learning_rate": 6.224279835390947e-07,
"logits/chosen": -0.6133296489715576,
"logits/rejected": -0.6065386533737183,
"logps/chosen": -147.1841583251953,
"logps/rejected": -198.28070068359375,
"loss": 0.2342,
"nll_loss": 0.17038078606128693,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -14.718416213989258,
"rewards/margins": 5.109654903411865,
"rewards/rejected": -19.82806968688965,
"step": 950
},
{
"epoch": 2.2190118462872004,
"grad_norm": 28.808799743652344,
"learning_rate": 6.172839506172839e-07,
"logits/chosen": -0.566586971282959,
"logits/rejected": -0.5580301284790039,
"logps/chosen": -141.78317260742188,
"logps/rejected": -189.71841430664062,
"loss": 0.2432,
"nll_loss": 0.16720861196517944,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -14.178317070007324,
"rewards/margins": 4.793524265289307,
"rewards/rejected": -18.97184181213379,
"step": 960
},
{
"epoch": 2.2421265530193586,
"grad_norm": 15.181388854980469,
"learning_rate": 6.121399176954732e-07,
"logits/chosen": -0.5153671503067017,
"logits/rejected": -0.49234214425086975,
"logps/chosen": -142.28048706054688,
"logps/rejected": -192.72178649902344,
"loss": 0.2565,
"nll_loss": 0.173838809132576,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -14.228050231933594,
"rewards/margins": 5.044127464294434,
"rewards/rejected": -19.272180557250977,
"step": 970
},
{
"epoch": 2.265241259751517,
"grad_norm": 10.162031173706055,
"learning_rate": 6.069958847736625e-07,
"logits/chosen": -0.3831091523170471,
"logits/rejected": -0.3817598521709442,
"logps/chosen": -142.67413330078125,
"logps/rejected": -191.6265106201172,
"loss": 0.2239,
"nll_loss": 0.15289117395877838,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -14.267412185668945,
"rewards/margins": 4.89523983001709,
"rewards/rejected": -19.16265296936035,
"step": 980
},
{
"epoch": 2.2883559664836755,
"grad_norm": 11.667806625366211,
"learning_rate": 6.018518518518519e-07,
"logits/chosen": -0.37663665413856506,
"logits/rejected": -0.36168596148490906,
"logps/chosen": -134.7302703857422,
"logps/rejected": -181.87161254882812,
"loss": 0.2179,
"nll_loss": 0.14360648393630981,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -13.473027229309082,
"rewards/margins": 4.714133262634277,
"rewards/rejected": -18.18716049194336,
"step": 990
},
{
"epoch": 2.3114706732158337,
"grad_norm": 13.98948860168457,
"learning_rate": 5.96707818930041e-07,
"logits/chosen": -0.35517022013664246,
"logits/rejected": -0.3607296645641327,
"logps/chosen": -143.46397399902344,
"logps/rejected": -196.64694213867188,
"loss": 0.2393,
"nll_loss": 0.16406962275505066,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -14.346399307250977,
"rewards/margins": 5.318297863006592,
"rewards/rejected": -19.664695739746094,
"step": 1000
},
{
"epoch": 2.334585379947992,
"grad_norm": 13.17771053314209,
"learning_rate": 5.915637860082304e-07,
"logits/chosen": -0.3597460389137268,
"logits/rejected": -0.36051079630851746,
"logps/chosen": -138.61643981933594,
"logps/rejected": -192.05581665039062,
"loss": 0.2306,
"nll_loss": 0.16202880442142487,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -13.86164379119873,
"rewards/margins": 5.343939304351807,
"rewards/rejected": -19.205581665039062,
"step": 1010
},
{
"epoch": 2.35770008668015,
"grad_norm": 13.457245826721191,
"learning_rate": 5.864197530864198e-07,
"logits/chosen": -0.4916199743747711,
"logits/rejected": -0.5020965933799744,
"logps/chosen": -147.89541625976562,
"logps/rejected": -199.31967163085938,
"loss": 0.2374,
"nll_loss": 0.16406235098838806,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -14.789543151855469,
"rewards/margins": 5.142425060272217,
"rewards/rejected": -19.931964874267578,
"step": 1020
},
{
"epoch": 2.3808147934123087,
"grad_norm": 13.335782051086426,
"learning_rate": 5.812757201646091e-07,
"logits/chosen": -0.39383864402770996,
"logits/rejected": -0.40474215149879456,
"logps/chosen": -133.04669189453125,
"logps/rejected": -180.41250610351562,
"loss": 0.242,
"nll_loss": 0.1537107676267624,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -13.304669380187988,
"rewards/margins": 4.736581802368164,
"rewards/rejected": -18.041250228881836,
"step": 1030
},
{
"epoch": 2.403929500144467,
"grad_norm": 6.159650802612305,
"learning_rate": 5.761316872427983e-07,
"logits/chosen": -0.6221314668655396,
"logits/rejected": -0.5792278051376343,
"logps/chosen": -147.80052185058594,
"logps/rejected": -199.4378662109375,
"loss": 0.2262,
"nll_loss": 0.151776522397995,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.780054092407227,
"rewards/margins": 5.163733005523682,
"rewards/rejected": -19.943782806396484,
"step": 1040
},
{
"epoch": 2.427044206876625,
"grad_norm": 12.739320755004883,
"learning_rate": 5.709876543209876e-07,
"logits/chosen": -0.5569005012512207,
"logits/rejected": -0.5471926927566528,
"logps/chosen": -150.28656005859375,
"logps/rejected": -203.32809448242188,
"loss": 0.2392,
"nll_loss": 0.15395130217075348,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -15.028657913208008,
"rewards/margins": 5.304154872894287,
"rewards/rejected": -20.332813262939453,
"step": 1050
},
{
"epoch": 2.4501589136087834,
"grad_norm": 10.99962329864502,
"learning_rate": 5.65843621399177e-07,
"logits/chosen": -0.6100250482559204,
"logits/rejected": -0.6070842146873474,
"logps/chosen": -144.28292846679688,
"logps/rejected": -192.26254272460938,
"loss": 0.2358,
"nll_loss": 0.16113388538360596,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -14.42829418182373,
"rewards/margins": 4.797961235046387,
"rewards/rejected": -19.226253509521484,
"step": 1060
},
{
"epoch": 2.473273620340942,
"grad_norm": 14.381885528564453,
"learning_rate": 5.606995884773662e-07,
"logits/chosen": -0.4229808747768402,
"logits/rejected": -0.4043405055999756,
"logps/chosen": -135.27508544921875,
"logps/rejected": -184.1940460205078,
"loss": 0.2726,
"nll_loss": 0.16423283517360687,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -13.527506828308105,
"rewards/margins": 4.8918962478637695,
"rewards/rejected": -18.419404983520508,
"step": 1070
},
{
"epoch": 2.4963883270731,
"grad_norm": 11.742487907409668,
"learning_rate": 5.555555555555555e-07,
"logits/chosen": -0.4398534297943115,
"logits/rejected": -0.43547695875167847,
"logps/chosen": -134.5975341796875,
"logps/rejected": -182.41848754882812,
"loss": 0.2452,
"nll_loss": 0.16178709268569946,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -13.459753036499023,
"rewards/margins": 4.782095909118652,
"rewards/rejected": -18.24184799194336,
"step": 1080
},
{
"epoch": 2.5195030338052584,
"grad_norm": 12.080589294433594,
"learning_rate": 5.504115226337448e-07,
"logits/chosen": -0.45496922731399536,
"logits/rejected": -0.45996856689453125,
"logps/chosen": -132.09829711914062,
"logps/rejected": -180.12393188476562,
"loss": 0.2284,
"nll_loss": 0.1582447737455368,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -13.209829330444336,
"rewards/margins": 4.80256462097168,
"rewards/rejected": -18.012393951416016,
"step": 1090
},
{
"epoch": 2.542617740537417,
"grad_norm": 24.479488372802734,
"learning_rate": 5.452674897119342e-07,
"logits/chosen": -0.36444956064224243,
"logits/rejected": -0.3619704842567444,
"logps/chosen": -141.44894409179688,
"logps/rejected": -194.81773376464844,
"loss": 0.2364,
"nll_loss": 0.17286133766174316,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -14.14489459991455,
"rewards/margins": 5.336878776550293,
"rewards/rejected": -19.48177146911621,
"step": 1100
},
{
"epoch": 2.5657324472695753,
"grad_norm": 12.051857948303223,
"learning_rate": 5.401234567901234e-07,
"logits/chosen": -0.45673027634620667,
"logits/rejected": -0.4733441472053528,
"logps/chosen": -136.0276336669922,
"logps/rejected": -188.5570068359375,
"loss": 0.2305,
"nll_loss": 0.1618407666683197,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -13.602763175964355,
"rewards/margins": 5.252939224243164,
"rewards/rejected": -18.855701446533203,
"step": 1110
},
{
"epoch": 2.5888471540017335,
"grad_norm": 10.467662811279297,
"learning_rate": 5.349794238683127e-07,
"logits/chosen": -0.4598791003227234,
"logits/rejected": -0.4583801329135895,
"logps/chosen": -137.6591033935547,
"logps/rejected": -189.61471557617188,
"loss": 0.2583,
"nll_loss": 0.16606256365776062,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -13.765910148620605,
"rewards/margins": 5.195560932159424,
"rewards/rejected": -18.961471557617188,
"step": 1120
},
{
"epoch": 2.611961860733892,
"grad_norm": 17.334087371826172,
"learning_rate": 5.29835390946502e-07,
"logits/chosen": -0.45638832449913025,
"logits/rejected": -0.4596933424472809,
"logps/chosen": -134.4242401123047,
"logps/rejected": -185.4617156982422,
"loss": 0.231,
"nll_loss": 0.15201494097709656,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -13.442425727844238,
"rewards/margins": 5.1037468910217285,
"rewards/rejected": -18.546171188354492,
"step": 1130
},
{
"epoch": 2.6350765674660503,
"grad_norm": 9.82776927947998,
"learning_rate": 5.246913580246914e-07,
"logits/chosen": -0.4979328513145447,
"logits/rejected": -0.4829026758670807,
"logps/chosen": -142.7810516357422,
"logps/rejected": -195.93936157226562,
"loss": 0.2197,
"nll_loss": 0.14758186042308807,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -14.278106689453125,
"rewards/margins": 5.315831184387207,
"rewards/rejected": -19.593936920166016,
"step": 1140
},
{
"epoch": 2.6581912741982086,
"grad_norm": 21.076847076416016,
"learning_rate": 5.195473251028807e-07,
"logits/chosen": -0.4889853894710541,
"logits/rejected": -0.4779161810874939,
"logps/chosen": -147.04873657226562,
"logps/rejected": -195.0872802734375,
"loss": 0.2223,
"nll_loss": 0.155166894197464,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.704874038696289,
"rewards/margins": 4.803854942321777,
"rewards/rejected": -19.50872802734375,
"step": 1150
},
{
"epoch": 2.681305980930367,
"grad_norm": 19.175827026367188,
"learning_rate": 5.144032921810699e-07,
"logits/chosen": -0.4997631013393402,
"logits/rejected": -0.4868396818637848,
"logps/chosen": -132.46238708496094,
"logps/rejected": -182.9662322998047,
"loss": 0.2392,
"nll_loss": 0.15937396883964539,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -13.246240615844727,
"rewards/margins": 5.050384521484375,
"rewards/rejected": -18.29662322998047,
"step": 1160
},
{
"epoch": 2.7044206876625254,
"grad_norm": 13.847294807434082,
"learning_rate": 5.092592592592593e-07,
"logits/chosen": -0.42537322640419006,
"logits/rejected": -0.40758857131004333,
"logps/chosen": -132.64317321777344,
"logps/rejected": -185.53622436523438,
"loss": 0.2315,
"nll_loss": 0.1639558970928192,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -13.264317512512207,
"rewards/margins": 5.289304733276367,
"rewards/rejected": -18.55362319946289,
"step": 1170
},
{
"epoch": 2.7275353943946836,
"grad_norm": 17.215343475341797,
"learning_rate": 5.041152263374485e-07,
"logits/chosen": -0.4605620503425598,
"logits/rejected": -0.47386521100997925,
"logps/chosen": -142.31393432617188,
"logps/rejected": -201.610107421875,
"loss": 0.2355,
"nll_loss": 0.1665884107351303,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.231393814086914,
"rewards/margins": 5.929617881774902,
"rewards/rejected": -20.161012649536133,
"step": 1180
},
{
"epoch": 2.750650101126842,
"grad_norm": 11.339929580688477,
"learning_rate": 4.989711934156378e-07,
"logits/chosen": -0.5646448731422424,
"logits/rejected": -0.5591720342636108,
"logps/chosen": -144.7230987548828,
"logps/rejected": -198.4960479736328,
"loss": 0.2296,
"nll_loss": 0.17730608582496643,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -14.472311019897461,
"rewards/margins": 5.377293109893799,
"rewards/rejected": -19.8496036529541,
"step": 1190
},
{
"epoch": 2.773764807859,
"grad_norm": 10.567920684814453,
"learning_rate": 4.938271604938271e-07,
"logits/chosen": -0.5628112554550171,
"logits/rejected": -0.5627862215042114,
"logps/chosen": -134.7103271484375,
"logps/rejected": -181.05490112304688,
"loss": 0.2401,
"nll_loss": 0.16600725054740906,
"rewards/accuracies": 0.984375,
"rewards/chosen": -13.471035957336426,
"rewards/margins": 4.634454250335693,
"rewards/rejected": -18.105487823486328,
"step": 1200
},
{
"epoch": 2.7968795145911587,
"grad_norm": 11.1284818649292,
"learning_rate": 4.886831275720165e-07,
"logits/chosen": -0.5333854556083679,
"logits/rejected": -0.5228737592697144,
"logps/chosen": -129.60784912109375,
"logps/rejected": -179.29922485351562,
"loss": 0.2237,
"nll_loss": 0.15326835215091705,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -12.960784912109375,
"rewards/margins": 4.969139099121094,
"rewards/rejected": -17.929922103881836,
"step": 1210
},
{
"epoch": 2.819994221323317,
"grad_norm": 10.869100570678711,
"learning_rate": 4.835390946502057e-07,
"logits/chosen": -0.4685629904270172,
"logits/rejected": -0.4411331117153168,
"logps/chosen": -137.3936767578125,
"logps/rejected": -190.50975036621094,
"loss": 0.2258,
"nll_loss": 0.16754138469696045,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -13.739367485046387,
"rewards/margins": 5.311608791351318,
"rewards/rejected": -19.050975799560547,
"step": 1220
},
{
"epoch": 2.843108928055475,
"grad_norm": 11.171156883239746,
"learning_rate": 4.783950617283951e-07,
"logits/chosen": -0.39593321084976196,
"logits/rejected": -0.3724592328071594,
"logps/chosen": -129.14064025878906,
"logps/rejected": -181.44851684570312,
"loss": 0.2196,
"nll_loss": 0.15831029415130615,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -12.914064407348633,
"rewards/margins": 5.230786323547363,
"rewards/rejected": -18.14484977722168,
"step": 1230
},
{
"epoch": 2.8662236347876338,
"grad_norm": 16.257095336914062,
"learning_rate": 4.732510288065844e-07,
"logits/chosen": -0.41909652948379517,
"logits/rejected": -0.4289626479148865,
"logps/chosen": -137.97906494140625,
"logps/rejected": -189.48602294921875,
"loss": 0.2401,
"nll_loss": 0.15598097443580627,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -13.797907829284668,
"rewards/margins": 5.15069580078125,
"rewards/rejected": -18.9486026763916,
"step": 1240
},
{
"epoch": 2.889338341519792,
"grad_norm": 24.864940643310547,
"learning_rate": 4.6810699588477364e-07,
"logits/chosen": -0.36290091276168823,
"logits/rejected": -0.34600576758384705,
"logps/chosen": -136.03607177734375,
"logps/rejected": -185.31668090820312,
"loss": 0.2201,
"nll_loss": 0.14870640635490417,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -13.603607177734375,
"rewards/margins": 4.9280619621276855,
"rewards/rejected": -18.53166961669922,
"step": 1250
},
{
"epoch": 2.91245304825195,
"grad_norm": 9.861152648925781,
"learning_rate": 4.6296296296296297e-07,
"logits/chosen": -0.43973201513290405,
"logits/rejected": -0.44227686524391174,
"logps/chosen": -139.79000854492188,
"logps/rejected": -191.3979949951172,
"loss": 0.2338,
"nll_loss": 0.15694692730903625,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -13.97900104522705,
"rewards/margins": 5.160799026489258,
"rewards/rejected": -19.139801025390625,
"step": 1260
},
{
"epoch": 2.935567754984109,
"grad_norm": 11.536057472229004,
"learning_rate": 4.5781893004115224e-07,
"logits/chosen": -0.4365859925746918,
"logits/rejected": -0.43007755279541016,
"logps/chosen": -143.85635375976562,
"logps/rejected": -197.02879333496094,
"loss": 0.2355,
"nll_loss": 0.15321387350559235,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -14.385635375976562,
"rewards/margins": 5.317243576049805,
"rewards/rejected": -19.702880859375,
"step": 1270
},
{
"epoch": 2.958682461716267,
"grad_norm": 18.637239456176758,
"learning_rate": 4.5267489711934156e-07,
"logits/chosen": -0.47489842772483826,
"logits/rejected": -0.4829436242580414,
"logps/chosen": -140.48260498046875,
"logps/rejected": -196.2875213623047,
"loss": 0.2461,
"nll_loss": 0.16315388679504395,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.048260688781738,
"rewards/margins": 5.5804924964904785,
"rewards/rejected": -19.628753662109375,
"step": 1280
},
{
"epoch": 2.9817971684484252,
"grad_norm": 13.219135284423828,
"learning_rate": 4.4753086419753083e-07,
"logits/chosen": -0.45336833596229553,
"logits/rejected": -0.44670405983924866,
"logps/chosen": -141.3701934814453,
"logps/rejected": -192.05670166015625,
"loss": 0.2244,
"nll_loss": 0.16718199849128723,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.137018203735352,
"rewards/margins": 5.0686516761779785,
"rewards/rejected": -19.205671310424805,
"step": 1290
},
{
"epoch": 2.997977463160936,
"eval_logits/chosen": -0.3714839220046997,
"eval_logits/rejected": -0.3428020179271698,
"eval_logps/chosen": -157.10519409179688,
"eval_logps/rejected": -172.1945343017578,
"eval_loss": 1.3861061334609985,
"eval_nll_loss": 0.20338018238544464,
"eval_rewards/accuracies": 0.656521737575531,
"eval_rewards/chosen": -15.710522651672363,
"eval_rewards/margins": 1.5089313983917236,
"eval_rewards/rejected": -17.219451904296875,
"eval_runtime": 77.2394,
"eval_samples_per_second": 23.641,
"eval_steps_per_second": 1.489,
"step": 1297
},
{
"epoch": 3.0049118751805834,
"grad_norm": 5.132666110992432,
"learning_rate": 4.4238683127572015e-07,
"logits/chosen": -0.44278082251548767,
"logits/rejected": -0.44281044602394104,
"logps/chosen": -141.17550659179688,
"logps/rejected": -196.56248474121094,
"loss": 0.2016,
"nll_loss": 0.15163448452949524,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -14.117551803588867,
"rewards/margins": 5.538697719573975,
"rewards/rejected": -19.656248092651367,
"step": 1300
},
{
"epoch": 3.028026581912742,
"grad_norm": 3.1660420894622803,
"learning_rate": 4.372427983539094e-07,
"logits/chosen": -0.40755367279052734,
"logits/rejected": -0.3970012962818146,
"logps/chosen": -125.93168640136719,
"logps/rejected": -186.09402465820312,
"loss": 0.1537,
"nll_loss": 0.13879674673080444,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.593169212341309,
"rewards/margins": 6.016233921051025,
"rewards/rejected": -18.609403610229492,
"step": 1310
},
{
"epoch": 3.0511412886449003,
"grad_norm": 3.5848960876464844,
"learning_rate": 4.320987654320987e-07,
"logits/chosen": -0.44615453481674194,
"logits/rejected": -0.43949246406555176,
"logps/chosen": -126.3210220336914,
"logps/rejected": -184.44094848632812,
"loss": 0.1556,
"nll_loss": 0.1318623572587967,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.632102012634277,
"rewards/margins": 5.811993598937988,
"rewards/rejected": -18.444095611572266,
"step": 1320
},
{
"epoch": 3.0742559953770585,
"grad_norm": 3.971622943878174,
"learning_rate": 4.2695473251028807e-07,
"logits/chosen": -0.34509214758872986,
"logits/rejected": -0.3416140079498291,
"logps/chosen": -130.82965087890625,
"logps/rejected": -189.31130981445312,
"loss": 0.1539,
"nll_loss": 0.13816341757774353,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -13.082966804504395,
"rewards/margins": 5.848166465759277,
"rewards/rejected": -18.931133270263672,
"step": 1330
},
{
"epoch": 3.097370702109217,
"grad_norm": 3.245117664337158,
"learning_rate": 4.218106995884774e-07,
"logits/chosen": -0.263519287109375,
"logits/rejected": -0.25365307927131653,
"logps/chosen": -128.29852294921875,
"logps/rejected": -189.9366455078125,
"loss": 0.1518,
"nll_loss": 0.13781467080116272,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.829852104187012,
"rewards/margins": 6.16381311416626,
"rewards/rejected": -18.99366569519043,
"step": 1340
},
{
"epoch": 3.1204854088413754,
"grad_norm": 4.314767837524414,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -0.2783138155937195,
"logits/rejected": -0.3006114363670349,
"logps/chosen": -128.49453735351562,
"logps/rejected": -187.8452606201172,
"loss": 0.1516,
"nll_loss": 0.14406827092170715,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.849452018737793,
"rewards/margins": 5.935072898864746,
"rewards/rejected": -18.784526824951172,
"step": 1350
},
{
"epoch": 3.1436001155735336,
"grad_norm": 2.8442511558532715,
"learning_rate": 4.11522633744856e-07,
"logits/chosen": -0.19675478339195251,
"logits/rejected": -0.18994562327861786,
"logps/chosen": -130.37368774414062,
"logps/rejected": -191.08071899414062,
"loss": 0.1502,
"nll_loss": 0.14177414774894714,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.037368774414062,
"rewards/margins": 6.070704936981201,
"rewards/rejected": -19.10807228088379,
"step": 1360
},
{
"epoch": 3.166714822305692,
"grad_norm": 4.321190357208252,
"learning_rate": 4.0637860082304526e-07,
"logits/chosen": -0.29594722390174866,
"logits/rejected": -0.2727283537387848,
"logps/chosen": -126.78936767578125,
"logps/rejected": -183.8494873046875,
"loss": 0.1495,
"nll_loss": 0.13010382652282715,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.678936958312988,
"rewards/margins": 5.706011772155762,
"rewards/rejected": -18.38494873046875,
"step": 1370
},
{
"epoch": 3.1898295290378504,
"grad_norm": 3.650377035140991,
"learning_rate": 4.0123456790123453e-07,
"logits/chosen": -0.37024635076522827,
"logits/rejected": -0.36072778701782227,
"logps/chosen": -134.62948608398438,
"logps/rejected": -194.2451171875,
"loss": 0.1556,
"nll_loss": 0.1394232213497162,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -13.4629487991333,
"rewards/margins": 5.9615631103515625,
"rewards/rejected": -19.42451286315918,
"step": 1380
},
{
"epoch": 3.2129442357700086,
"grad_norm": 5.636937141418457,
"learning_rate": 3.9609053497942385e-07,
"logits/chosen": -0.27522599697113037,
"logits/rejected": -0.27910444140434265,
"logps/chosen": -124.5965805053711,
"logps/rejected": -187.5218505859375,
"loss": 0.1484,
"nll_loss": 0.12636372447013855,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -12.459661483764648,
"rewards/margins": 6.292525768280029,
"rewards/rejected": -18.752187728881836,
"step": 1390
},
{
"epoch": 3.236058942502167,
"grad_norm": 3.8186678886413574,
"learning_rate": 3.909465020576131e-07,
"logits/chosen": -0.2928979992866516,
"logits/rejected": -0.2864636480808258,
"logps/chosen": -124.09950256347656,
"logps/rejected": -181.70155334472656,
"loss": 0.1549,
"nll_loss": 0.13333001732826233,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -12.409948348999023,
"rewards/margins": 5.7602057456970215,
"rewards/rejected": -18.170154571533203,
"step": 1400
},
{
"epoch": 3.2591736492343255,
"grad_norm": 3.9708776473999023,
"learning_rate": 3.8580246913580245e-07,
"logits/chosen": -0.3393842577934265,
"logits/rejected": -0.32439425587654114,
"logps/chosen": -130.1053009033203,
"logps/rejected": -188.5397491455078,
"loss": 0.1556,
"nll_loss": 0.13221554458141327,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.010530471801758,
"rewards/margins": 5.843444347381592,
"rewards/rejected": -18.853975296020508,
"step": 1410
},
{
"epoch": 3.2822883559664837,
"grad_norm": 3.5606882572174072,
"learning_rate": 3.806584362139917e-07,
"logits/chosen": -0.31585693359375,
"logits/rejected": -0.26836958527565,
"logps/chosen": -120.08418273925781,
"logps/rejected": -180.00120544433594,
"loss": 0.1471,
"nll_loss": 0.12899354100227356,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.008419036865234,
"rewards/margins": 5.991702079772949,
"rewards/rejected": -18.000120162963867,
"step": 1420
},
{
"epoch": 3.305403062698642,
"grad_norm": 3.3717777729034424,
"learning_rate": 3.7551440329218104e-07,
"logits/chosen": -0.23174750804901123,
"logits/rejected": -0.2522903382778168,
"logps/chosen": -131.6839599609375,
"logps/rejected": -198.05081176757812,
"loss": 0.1565,
"nll_loss": 0.13706137239933014,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.16839599609375,
"rewards/margins": 6.636684417724609,
"rewards/rejected": -19.80508041381836,
"step": 1430
},
{
"epoch": 3.3285177694308006,
"grad_norm": 3.782886028289795,
"learning_rate": 3.703703703703703e-07,
"logits/chosen": -0.3117191195487976,
"logits/rejected": -0.31785351037979126,
"logps/chosen": -131.83470153808594,
"logps/rejected": -189.18441772460938,
"loss": 0.1492,
"nll_loss": 0.12388783693313599,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.183469772338867,
"rewards/margins": 5.734971046447754,
"rewards/rejected": -18.918439865112305,
"step": 1440
},
{
"epoch": 3.351632476162959,
"grad_norm": 3.158254384994507,
"learning_rate": 3.6522633744855963e-07,
"logits/chosen": -0.3361268639564514,
"logits/rejected": -0.3252175748348236,
"logps/chosen": -128.30125427246094,
"logps/rejected": -186.31838989257812,
"loss": 0.1539,
"nll_loss": 0.13049830496311188,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -12.830126762390137,
"rewards/margins": 5.801713943481445,
"rewards/rejected": -18.631839752197266,
"step": 1450
},
{
"epoch": 3.374747182895117,
"grad_norm": 4.768058776855469,
"learning_rate": 3.6008230452674896e-07,
"logits/chosen": -0.23867249488830566,
"logits/rejected": -0.20122122764587402,
"logps/chosen": -123.92413330078125,
"logps/rejected": -186.30250549316406,
"loss": 0.1616,
"nll_loss": 0.14071312546730042,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.392415046691895,
"rewards/margins": 6.2378363609313965,
"rewards/rejected": -18.630252838134766,
"step": 1460
},
{
"epoch": 3.397861889627275,
"grad_norm": 3.911938428878784,
"learning_rate": 3.549382716049383e-07,
"logits/chosen": -0.2685008943080902,
"logits/rejected": -0.23969027400016785,
"logps/chosen": -127.1446304321289,
"logps/rejected": -186.02838134765625,
"loss": 0.1486,
"nll_loss": 0.12472818791866302,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -12.714462280273438,
"rewards/margins": 5.888378143310547,
"rewards/rejected": -18.602840423583984,
"step": 1470
},
{
"epoch": 3.420976596359434,
"grad_norm": 3.9447271823883057,
"learning_rate": 3.4979423868312755e-07,
"logits/chosen": -0.28780004382133484,
"logits/rejected": -0.290294349193573,
"logps/chosen": -127.8751449584961,
"logps/rejected": -189.95578002929688,
"loss": 0.1473,
"nll_loss": 0.13437309861183167,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.787514686584473,
"rewards/margins": 6.208063125610352,
"rewards/rejected": -18.99557876586914,
"step": 1480
},
{
"epoch": 3.444091303091592,
"grad_norm": 6.313704490661621,
"learning_rate": 3.446502057613169e-07,
"logits/chosen": -0.23013484477996826,
"logits/rejected": -0.23306229710578918,
"logps/chosen": -122.0789566040039,
"logps/rejected": -185.14695739746094,
"loss": 0.1478,
"nll_loss": 0.13203728199005127,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.207897186279297,
"rewards/margins": 6.306800842285156,
"rewards/rejected": -18.514698028564453,
"step": 1490
},
{
"epoch": 3.4672060098237503,
"grad_norm": 2.906285524368286,
"learning_rate": 3.3950617283950614e-07,
"logits/chosen": -0.3435348868370056,
"logits/rejected": -0.33539697527885437,
"logps/chosen": -123.60890197753906,
"logps/rejected": -183.1199493408203,
"loss": 0.1513,
"nll_loss": 0.13879191875457764,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.36089038848877,
"rewards/margins": 5.951104640960693,
"rewards/rejected": -18.311994552612305,
"step": 1500
},
{
"epoch": 3.4903207165559085,
"grad_norm": 2.990963935852051,
"learning_rate": 3.3436213991769547e-07,
"logits/chosen": -0.26741576194763184,
"logits/rejected": -0.273776650428772,
"logps/chosen": -129.36013793945312,
"logps/rejected": -186.50009155273438,
"loss": 0.1465,
"nll_loss": 0.14070597290992737,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -12.936014175415039,
"rewards/margins": 5.713995933532715,
"rewards/rejected": -18.650009155273438,
"step": 1510
},
{
"epoch": 3.513435423288067,
"grad_norm": 5.473604679107666,
"learning_rate": 3.2921810699588474e-07,
"logits/chosen": -0.28439709544181824,
"logits/rejected": -0.2706482410430908,
"logps/chosen": -123.5947265625,
"logps/rejected": -185.80001831054688,
"loss": 0.1509,
"nll_loss": 0.1402612030506134,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.359472274780273,
"rewards/margins": 6.220528602600098,
"rewards/rejected": -18.580001831054688,
"step": 1520
},
{
"epoch": 3.5365501300202253,
"grad_norm": 6.9896626472473145,
"learning_rate": 3.2407407407407406e-07,
"logits/chosen": -0.3721368908882141,
"logits/rejected": -0.3583984673023224,
"logps/chosen": -128.07249450683594,
"logps/rejected": -187.01959228515625,
"loss": 0.1538,
"nll_loss": 0.13780102133750916,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.807249069213867,
"rewards/margins": 5.894709587097168,
"rewards/rejected": -18.701961517333984,
"step": 1530
},
{
"epoch": 3.5596648367523835,
"grad_norm": 2.910080671310425,
"learning_rate": 3.1893004115226333e-07,
"logits/chosen": -0.3633486330509186,
"logits/rejected": -0.34488505125045776,
"logps/chosen": -125.72395324707031,
"logps/rejected": -184.29405212402344,
"loss": 0.1547,
"nll_loss": 0.1316194236278534,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -12.572395324707031,
"rewards/margins": 5.857010841369629,
"rewards/rejected": -18.429405212402344,
"step": 1540
},
{
"epoch": 3.582779543484542,
"grad_norm": 3.2864928245544434,
"learning_rate": 3.1378600823045266e-07,
"logits/chosen": -0.36337172985076904,
"logits/rejected": -0.3896876871585846,
"logps/chosen": -130.9540252685547,
"logps/rejected": -192.02456665039062,
"loss": 0.143,
"nll_loss": 0.12916973233222961,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.095403671264648,
"rewards/margins": 6.107052803039551,
"rewards/rejected": -19.202457427978516,
"step": 1550
},
{
"epoch": 3.6058942502167004,
"grad_norm": 9.098392486572266,
"learning_rate": 3.086419753086419e-07,
"logits/chosen": -0.26420459151268005,
"logits/rejected": -0.30124431848526,
"logps/chosen": -132.1412353515625,
"logps/rejected": -196.06668090820312,
"loss": 0.1472,
"nll_loss": 0.12210263311862946,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.214123725891113,
"rewards/margins": 6.392544269561768,
"rewards/rejected": -19.60666847229004,
"step": 1560
},
{
"epoch": 3.6290089569488586,
"grad_norm": 3.135023593902588,
"learning_rate": 3.0349794238683125e-07,
"logits/chosen": -0.2870226800441742,
"logits/rejected": -0.32922470569610596,
"logps/chosen": -127.20719909667969,
"logps/rejected": -187.71414184570312,
"loss": 0.1606,
"nll_loss": 0.13571253418922424,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.720720291137695,
"rewards/margins": 6.050693511962891,
"rewards/rejected": -18.771413803100586,
"step": 1570
},
{
"epoch": 3.6521236636810173,
"grad_norm": 2.965545892715454,
"learning_rate": 2.983539094650205e-07,
"logits/chosen": -0.2955471873283386,
"logits/rejected": -0.29221171140670776,
"logps/chosen": -120.03623962402344,
"logps/rejected": -177.8092041015625,
"loss": 0.141,
"nll_loss": 0.12610065937042236,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.003625869750977,
"rewards/margins": 5.777295112609863,
"rewards/rejected": -17.780920028686523,
"step": 1580
},
{
"epoch": 3.6752383704131755,
"grad_norm": 3.8427724838256836,
"learning_rate": 2.932098765432099e-07,
"logits/chosen": -0.294664204120636,
"logits/rejected": -0.315548837184906,
"logps/chosen": -126.55033874511719,
"logps/rejected": -186.32962036132812,
"loss": 0.1472,
"nll_loss": 0.1299527883529663,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.655034065246582,
"rewards/margins": 5.977927207946777,
"rewards/rejected": -18.63296127319336,
"step": 1590
},
{
"epoch": 3.6983530771453337,
"grad_norm": 3.386413335800171,
"learning_rate": 2.8806584362139917e-07,
"logits/chosen": -0.21596117317676544,
"logits/rejected": -0.20901863276958466,
"logps/chosen": -118.6823959350586,
"logps/rejected": -177.80654907226562,
"loss": 0.1584,
"nll_loss": 0.14362338185310364,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.86823844909668,
"rewards/margins": 5.912415504455566,
"rewards/rejected": -17.780656814575195,
"step": 1600
},
{
"epoch": 3.7214677838774923,
"grad_norm": 3.672924518585205,
"learning_rate": 2.829218106995885e-07,
"logits/chosen": -0.26348841190338135,
"logits/rejected": -0.262240469455719,
"logps/chosen": -124.21568298339844,
"logps/rejected": -183.1221466064453,
"loss": 0.1513,
"nll_loss": 0.11891283839941025,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -12.421568870544434,
"rewards/margins": 5.8906474113464355,
"rewards/rejected": -18.31221580505371,
"step": 1610
},
{
"epoch": 3.7445824906096505,
"grad_norm": 3.7650656700134277,
"learning_rate": 2.7777777777777776e-07,
"logits/chosen": -0.278475821018219,
"logits/rejected": -0.2345239669084549,
"logps/chosen": -123.59881591796875,
"logps/rejected": -183.743896484375,
"loss": 0.1518,
"nll_loss": 0.12711484730243683,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -12.359882354736328,
"rewards/margins": 6.014508247375488,
"rewards/rejected": -18.3743896484375,
"step": 1620
},
{
"epoch": 3.7676971973418087,
"grad_norm": 3.11409592628479,
"learning_rate": 2.726337448559671e-07,
"logits/chosen": -0.29814380407333374,
"logits/rejected": -0.28927913308143616,
"logps/chosen": -127.12947082519531,
"logps/rejected": -183.96328735351562,
"loss": 0.1502,
"nll_loss": 0.11745184659957886,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -12.712945938110352,
"rewards/margins": 5.683382987976074,
"rewards/rejected": -18.396331787109375,
"step": 1630
},
{
"epoch": 3.790811904073967,
"grad_norm": 4.140903949737549,
"learning_rate": 2.6748971193415635e-07,
"logits/chosen": -0.29099392890930176,
"logits/rejected": -0.3041759133338928,
"logps/chosen": -130.06552124023438,
"logps/rejected": -191.20046997070312,
"loss": 0.1509,
"nll_loss": 0.14280778169631958,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.006550788879395,
"rewards/margins": 6.11349630355835,
"rewards/rejected": -19.120046615600586,
"step": 1640
},
{
"epoch": 3.813926610806125,
"grad_norm": 8.86196231842041,
"learning_rate": 2.623456790123457e-07,
"logits/chosen": -0.2659907341003418,
"logits/rejected": -0.27678874135017395,
"logps/chosen": -126.56221008300781,
"logps/rejected": -185.51071166992188,
"loss": 0.1458,
"nll_loss": 0.1296006143093109,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.656221389770508,
"rewards/margins": 5.894850730895996,
"rewards/rejected": -18.551071166992188,
"step": 1650
},
{
"epoch": 3.837041317538284,
"grad_norm": 7.074207305908203,
"learning_rate": 2.5720164609053495e-07,
"logits/chosen": -0.2648230195045471,
"logits/rejected": -0.2591935098171234,
"logps/chosen": -117.26505279541016,
"logps/rejected": -177.61654663085938,
"loss": 0.1454,
"nll_loss": 0.13034331798553467,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.726505279541016,
"rewards/margins": 6.03515100479126,
"rewards/rejected": -17.761655807495117,
"step": 1660
},
{
"epoch": 3.860156024270442,
"grad_norm": 3.6986083984375,
"learning_rate": 2.5205761316872427e-07,
"logits/chosen": -0.3297143876552582,
"logits/rejected": -0.31857237219810486,
"logps/chosen": -133.59078979492188,
"logps/rejected": -194.1522979736328,
"loss": 0.156,
"nll_loss": 0.1323135942220688,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.359077453613281,
"rewards/margins": 6.056151390075684,
"rewards/rejected": -19.41522789001465,
"step": 1670
},
{
"epoch": 3.8832707310026002,
"grad_norm": 3.5342583656311035,
"learning_rate": 2.4691358024691354e-07,
"logits/chosen": -0.3504456877708435,
"logits/rejected": -0.3491267263889313,
"logps/chosen": -125.02303314208984,
"logps/rejected": -186.25491333007812,
"loss": 0.1414,
"nll_loss": 0.1284278929233551,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.502302169799805,
"rewards/margins": 6.123185157775879,
"rewards/rejected": -18.62548828125,
"step": 1680
},
{
"epoch": 3.906385437734759,
"grad_norm": 9.769820213317871,
"learning_rate": 2.4176954732510286e-07,
"logits/chosen": -0.3653779923915863,
"logits/rejected": -0.3362106382846832,
"logps/chosen": -135.67111206054688,
"logps/rejected": -198.921142578125,
"loss": 0.1563,
"nll_loss": 0.1389894187450409,
"rewards/accuracies": 1.0,
"rewards/chosen": -13.567111015319824,
"rewards/margins": 6.325002193450928,
"rewards/rejected": -19.892114639282227,
"step": 1690
},
{
"epoch": 3.929500144466917,
"grad_norm": 12.724737167358398,
"learning_rate": 2.366255144032922e-07,
"logits/chosen": -0.3556443452835083,
"logits/rejected": -0.33838778734207153,
"logps/chosen": -126.82794189453125,
"logps/rejected": -187.8473663330078,
"loss": 0.1457,
"nll_loss": 0.13801956176757812,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.682792663574219,
"rewards/margins": 6.101943016052246,
"rewards/rejected": -18.784738540649414,
"step": 1700
},
{
"epoch": 3.9526148511990753,
"grad_norm": 2.656416654586792,
"learning_rate": 2.3148148148148148e-07,
"logits/chosen": -0.3134855329990387,
"logits/rejected": -0.305325984954834,
"logps/chosen": -128.65797424316406,
"logps/rejected": -188.01309204101562,
"loss": 0.1369,
"nll_loss": 0.12594002485275269,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.865796089172363,
"rewards/margins": 5.935511589050293,
"rewards/rejected": -18.80130958557129,
"step": 1710
},
{
"epoch": 3.975729557931234,
"grad_norm": 12.101499557495117,
"learning_rate": 2.2633744855967078e-07,
"logits/chosen": -0.4090637266635895,
"logits/rejected": -0.3877164423465729,
"logps/chosen": -134.76638793945312,
"logps/rejected": -194.9758758544922,
"loss": 0.1532,
"nll_loss": 0.14175161719322205,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -13.476638793945312,
"rewards/margins": 6.020949840545654,
"rewards/rejected": -19.497589111328125,
"step": 1720
},
{
"epoch": 3.998844264663392,
"grad_norm": 6.0831708908081055,
"learning_rate": 2.2119341563786008e-07,
"logits/chosen": -0.3833851218223572,
"logits/rejected": -0.39498597383499146,
"logps/chosen": -129.8985595703125,
"logps/rejected": -187.89739990234375,
"loss": 0.1472,
"nll_loss": 0.12770399451255798,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -12.98985481262207,
"rewards/margins": 5.799884796142578,
"rewards/rejected": -18.78973960876465,
"step": 1730
},
{
"epoch": 3.998844264663392,
"eval_logits/chosen": -0.3029468059539795,
"eval_logits/rejected": -0.270137220621109,
"eval_logps/chosen": -146.46226501464844,
"eval_logps/rejected": -161.38487243652344,
"eval_loss": 1.4029475450515747,
"eval_nll_loss": 0.1876361072063446,
"eval_rewards/accuracies": 0.6521739363670349,
"eval_rewards/chosen": -14.646224975585938,
"eval_rewards/margins": 1.4922590255737305,
"eval_rewards/rejected": -16.138486862182617,
"eval_runtime": 77.4371,
"eval_samples_per_second": 23.58,
"eval_steps_per_second": 1.485,
"step": 1730
},
{
"epoch": 4.02195897139555,
"grad_norm": 1.9679253101348877,
"learning_rate": 2.1604938271604935e-07,
"logits/chosen": -0.3585730195045471,
"logits/rejected": -0.3200622498989105,
"logps/chosen": -118.93489074707031,
"logps/rejected": -183.91061401367188,
"loss": 0.1179,
"nll_loss": 0.1184120774269104,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.893487930297852,
"rewards/margins": 6.497572422027588,
"rewards/rejected": -18.391061782836914,
"step": 1740
},
{
"epoch": 4.045073678127709,
"grad_norm": 1.426239252090454,
"learning_rate": 2.109053497942387e-07,
"logits/chosen": -0.3198128640651703,
"logits/rejected": -0.3108198940753937,
"logps/chosen": -119.95533752441406,
"logps/rejected": -182.93043518066406,
"loss": 0.1218,
"nll_loss": 0.10763946920633316,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.99553394317627,
"rewards/margins": 6.29750919342041,
"rewards/rejected": -18.293041229248047,
"step": 1750
},
{
"epoch": 4.068188384859867,
"grad_norm": 1.8550798892974854,
"learning_rate": 2.05761316872428e-07,
"logits/chosen": -0.28298747539520264,
"logits/rejected": -0.2920450270175934,
"logps/chosen": -117.935791015625,
"logps/rejected": -186.0088653564453,
"loss": 0.1233,
"nll_loss": 0.11667722463607788,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.7935791015625,
"rewards/margins": 6.807305812835693,
"rewards/rejected": -18.60088539123535,
"step": 1760
},
{
"epoch": 4.091303091592025,
"grad_norm": 1.947771668434143,
"learning_rate": 2.0061728395061726e-07,
"logits/chosen": -0.21840214729309082,
"logits/rejected": -0.2067776620388031,
"logps/chosen": -115.0444564819336,
"logps/rejected": -179.38697814941406,
"loss": 0.1213,
"nll_loss": 0.1122204065322876,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.504446029663086,
"rewards/margins": 6.4342522621154785,
"rewards/rejected": -17.938695907592773,
"step": 1770
},
{
"epoch": 4.114417798324184,
"grad_norm": 1.8407361507415771,
"learning_rate": 1.9547325102880656e-07,
"logits/chosen": -0.29772254824638367,
"logits/rejected": -0.2754737138748169,
"logps/chosen": -113.61384582519531,
"logps/rejected": -177.0957489013672,
"loss": 0.1227,
"nll_loss": 0.10529961436986923,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.361384391784668,
"rewards/margins": 6.348191738128662,
"rewards/rejected": -17.709575653076172,
"step": 1780
},
{
"epoch": 4.137532505056342,
"grad_norm": 1.4201513528823853,
"learning_rate": 1.9032921810699586e-07,
"logits/chosen": -0.30481767654418945,
"logits/rejected": -0.2908991277217865,
"logps/chosen": -119.33686828613281,
"logps/rejected": -184.93646240234375,
"loss": 0.1227,
"nll_loss": 0.1168881431221962,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.933687210083008,
"rewards/margins": 6.559959411621094,
"rewards/rejected": -18.493648529052734,
"step": 1790
},
{
"epoch": 4.1606472117885005,
"grad_norm": 1.8120708465576172,
"learning_rate": 1.8518518518518516e-07,
"logits/chosen": -0.3080504834651947,
"logits/rejected": -0.30417922139167786,
"logps/chosen": -122.6048812866211,
"logps/rejected": -185.8119659423828,
"loss": 0.126,
"nll_loss": 0.12334553897380829,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.26048755645752,
"rewards/margins": 6.320708751678467,
"rewards/rejected": -18.581195831298828,
"step": 1800
},
{
"epoch": 4.183761918520659,
"grad_norm": 2.5624470710754395,
"learning_rate": 1.8004115226337448e-07,
"logits/chosen": -0.24937394261360168,
"logits/rejected": -0.2712889313697815,
"logps/chosen": -124.1614761352539,
"logps/rejected": -188.57559204101562,
"loss": 0.1226,
"nll_loss": 0.1163693517446518,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.41614818572998,
"rewards/margins": 6.44141149520874,
"rewards/rejected": -18.857561111450195,
"step": 1810
},
{
"epoch": 4.206876625252817,
"grad_norm": 1.5446466207504272,
"learning_rate": 1.7489711934156378e-07,
"logits/chosen": -0.23896384239196777,
"logits/rejected": -0.2415800839662552,
"logps/chosen": -119.49736022949219,
"logps/rejected": -185.11898803710938,
"loss": 0.1212,
"nll_loss": 0.11859021335840225,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.949737548828125,
"rewards/margins": 6.56216287612915,
"rewards/rejected": -18.511898040771484,
"step": 1820
},
{
"epoch": 4.229991331984976,
"grad_norm": 1.7995822429656982,
"learning_rate": 1.6975308641975307e-07,
"logits/chosen": -0.24105176329612732,
"logits/rejected": -0.21960768103599548,
"logps/chosen": -113.63651275634766,
"logps/rejected": -176.64730834960938,
"loss": 0.1216,
"nll_loss": 0.11322028934955597,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.363652229309082,
"rewards/margins": 6.301081657409668,
"rewards/rejected": -17.66473388671875,
"step": 1830
},
{
"epoch": 4.253106038717133,
"grad_norm": 1.7273714542388916,
"learning_rate": 1.6460905349794237e-07,
"logits/chosen": -0.253646582365036,
"logits/rejected": -0.26175594329833984,
"logps/chosen": -118.37306213378906,
"logps/rejected": -184.26153564453125,
"loss": 0.1206,
"nll_loss": 0.11956053972244263,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.837307929992676,
"rewards/margins": 6.5888471603393555,
"rewards/rejected": -18.4261531829834,
"step": 1840
},
{
"epoch": 4.276220745449292,
"grad_norm": 4.887149810791016,
"learning_rate": 1.5946502057613167e-07,
"logits/chosen": -0.2122907191514969,
"logits/rejected": -0.2090766876935959,
"logps/chosen": -113.57759094238281,
"logps/rejected": -174.99594116210938,
"loss": 0.1184,
"nll_loss": 0.10560585558414459,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -11.357759475708008,
"rewards/margins": 6.141837120056152,
"rewards/rejected": -17.499595642089844,
"step": 1850
},
{
"epoch": 4.299335452181451,
"grad_norm": 1.5595005750656128,
"learning_rate": 1.5432098765432096e-07,
"logits/chosen": -0.13843365013599396,
"logits/rejected": -0.1982315182685852,
"logps/chosen": -118.16423034667969,
"logps/rejected": -182.03799438476562,
"loss": 0.1211,
"nll_loss": 0.11699899286031723,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.816422462463379,
"rewards/margins": 6.387377738952637,
"rewards/rejected": -18.203800201416016,
"step": 1860
},
{
"epoch": 4.322450158913608,
"grad_norm": 2.2779886722564697,
"learning_rate": 1.4917695473251026e-07,
"logits/chosen": -0.265516459941864,
"logits/rejected": -0.2614438533782959,
"logps/chosen": -124.3641128540039,
"logps/rejected": -187.52963256835938,
"loss": 0.1261,
"nll_loss": 0.11983609199523926,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.43641185760498,
"rewards/margins": 6.316550254821777,
"rewards/rejected": -18.75296401977539,
"step": 1870
},
{
"epoch": 4.345564865645767,
"grad_norm": 2.2859365940093994,
"learning_rate": 1.4403292181069958e-07,
"logits/chosen": -0.25305554270744324,
"logits/rejected": -0.2473808228969574,
"logps/chosen": -124.98432922363281,
"logps/rejected": -187.47373962402344,
"loss": 0.1245,
"nll_loss": 0.12777109444141388,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.498431205749512,
"rewards/margins": 6.2489423751831055,
"rewards/rejected": -18.74737548828125,
"step": 1880
},
{
"epoch": 4.368679572377926,
"grad_norm": 1.4982426166534424,
"learning_rate": 1.3888888888888888e-07,
"logits/chosen": -0.2519396245479584,
"logits/rejected": -0.24396154284477234,
"logps/chosen": -117.44911193847656,
"logps/rejected": -182.5988006591797,
"loss": 0.1127,
"nll_loss": 0.1127076968550682,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.744911193847656,
"rewards/margins": 6.5149688720703125,
"rewards/rejected": -18.25988006591797,
"step": 1890
},
{
"epoch": 4.3917942791100835,
"grad_norm": 2.1417200565338135,
"learning_rate": 1.3374485596707818e-07,
"logits/chosen": -0.19052667915821075,
"logits/rejected": -0.1665157973766327,
"logps/chosen": -116.32462310791016,
"logps/rejected": -181.2820587158203,
"loss": 0.1205,
"nll_loss": 0.11788536608219147,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.632462501525879,
"rewards/margins": 6.495743751525879,
"rewards/rejected": -18.12820816040039,
"step": 1900
},
{
"epoch": 4.414908985842242,
"grad_norm": 1.5730674266815186,
"learning_rate": 1.2860082304526747e-07,
"logits/chosen": -0.28410059213638306,
"logits/rejected": -0.24584396183490753,
"logps/chosen": -126.806884765625,
"logps/rejected": -191.36875915527344,
"loss": 0.1188,
"nll_loss": 0.11963550001382828,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.680688858032227,
"rewards/margins": 6.456188201904297,
"rewards/rejected": -19.13687515258789,
"step": 1910
},
{
"epoch": 4.438023692574401,
"grad_norm": 2.536539077758789,
"learning_rate": 1.2345679012345677e-07,
"logits/chosen": -0.2129761278629303,
"logits/rejected": -0.1930898129940033,
"logps/chosen": -117.23963928222656,
"logps/rejected": -180.9163818359375,
"loss": 0.1262,
"nll_loss": 0.11052282154560089,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.72396469116211,
"rewards/margins": 6.367676258087158,
"rewards/rejected": -18.09164047241211,
"step": 1920
},
{
"epoch": 4.4611383993065585,
"grad_norm": 1.6419086456298828,
"learning_rate": 1.183127572016461e-07,
"logits/chosen": -0.18322396278381348,
"logits/rejected": -0.15920376777648926,
"logps/chosen": -116.58353424072266,
"logps/rejected": -184.9496307373047,
"loss": 0.114,
"nll_loss": 0.10174567997455597,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.658352851867676,
"rewards/margins": 6.836610317230225,
"rewards/rejected": -18.494962692260742,
"step": 1930
},
{
"epoch": 4.484253106038717,
"grad_norm": 2.5254459381103516,
"learning_rate": 1.1316872427983539e-07,
"logits/chosen": -0.20438556373119354,
"logits/rejected": -0.19316819310188293,
"logps/chosen": -111.71683502197266,
"logps/rejected": -176.36444091796875,
"loss": 0.1143,
"nll_loss": 0.10253375768661499,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.171684265136719,
"rewards/margins": 6.464761257171631,
"rewards/rejected": -17.636444091796875,
"step": 1940
},
{
"epoch": 4.507367812770876,
"grad_norm": 4.048756122589111,
"learning_rate": 1.0802469135802467e-07,
"logits/chosen": -0.20184461772441864,
"logits/rejected": -0.20470590889453888,
"logps/chosen": -112.52592468261719,
"logps/rejected": -176.77975463867188,
"loss": 0.122,
"nll_loss": 0.10450093448162079,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.252592086791992,
"rewards/margins": 6.4253830909729,
"rewards/rejected": -17.677974700927734,
"step": 1950
},
{
"epoch": 4.530482519503034,
"grad_norm": 1.5695422887802124,
"learning_rate": 1.02880658436214e-07,
"logits/chosen": -0.15921801328659058,
"logits/rejected": -0.16545803844928741,
"logps/chosen": -116.6390151977539,
"logps/rejected": -182.0139617919922,
"loss": 0.123,
"nll_loss": 0.11899758875370026,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.66390323638916,
"rewards/margins": 6.537497043609619,
"rewards/rejected": -18.201400756835938,
"step": 1960
},
{
"epoch": 4.553597226235192,
"grad_norm": 1.8795533180236816,
"learning_rate": 9.773662551440328e-08,
"logits/chosen": -0.21856431663036346,
"logits/rejected": -0.22739803791046143,
"logps/chosen": -111.40470123291016,
"logps/rejected": -175.14663696289062,
"loss": 0.1173,
"nll_loss": 0.10676850378513336,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.140469551086426,
"rewards/margins": 6.374191761016846,
"rewards/rejected": -17.514663696289062,
"step": 1970
},
{
"epoch": 4.576711932967351,
"grad_norm": 2.4999828338623047,
"learning_rate": 9.259259259259258e-08,
"logits/chosen": -0.16077259182929993,
"logits/rejected": -0.15148191154003143,
"logps/chosen": -112.52552795410156,
"logps/rejected": -175.3218994140625,
"loss": 0.122,
"nll_loss": 0.11213432252407074,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -11.25255298614502,
"rewards/margins": 6.279637336730957,
"rewards/rejected": -17.532190322875977,
"step": 1980
},
{
"epoch": 4.599826639699509,
"grad_norm": 2.170232057571411,
"learning_rate": 8.744855967078189e-08,
"logits/chosen": -0.20790553092956543,
"logits/rejected": -0.19387516379356384,
"logps/chosen": -117.14433288574219,
"logps/rejected": -181.39340209960938,
"loss": 0.1185,
"nll_loss": 0.11259637773036957,
"rewards/accuracies": 0.996874988079071,
"rewards/chosen": -11.714433670043945,
"rewards/margins": 6.424906253814697,
"rewards/rejected": -18.139341354370117,
"step": 1990
},
{
"epoch": 4.622941346431667,
"grad_norm": 2.0322587490081787,
"learning_rate": 8.230452674897118e-08,
"logits/chosen": -0.1339203268289566,
"logits/rejected": -0.14758563041687012,
"logps/chosen": -109.77425384521484,
"logps/rejected": -176.02438354492188,
"loss": 0.1248,
"nll_loss": 0.11588220298290253,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.977426528930664,
"rewards/margins": 6.6250104904174805,
"rewards/rejected": -17.602436065673828,
"step": 2000
},
{
"epoch": 4.646056053163825,
"grad_norm": 3.8062565326690674,
"learning_rate": 7.716049382716048e-08,
"logits/chosen": -0.25674083828926086,
"logits/rejected": -0.23061016201972961,
"logps/chosen": -122.008056640625,
"logps/rejected": -186.86663818359375,
"loss": 0.1177,
"nll_loss": 0.11457221210002899,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.200803756713867,
"rewards/margins": 6.485858917236328,
"rewards/rejected": -18.686664581298828,
"step": 2010
},
{
"epoch": 4.669170759895984,
"grad_norm": 1.300473928451538,
"learning_rate": 7.201646090534979e-08,
"logits/chosen": -0.12542086839675903,
"logits/rejected": -0.12564246356487274,
"logps/chosen": -112.2677993774414,
"logps/rejected": -177.23947143554688,
"loss": 0.1197,
"nll_loss": 0.10939665883779526,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.226778984069824,
"rewards/margins": 6.497168064117432,
"rewards/rejected": -17.723949432373047,
"step": 2020
},
{
"epoch": 4.692285466628142,
"grad_norm": 3.699575901031494,
"learning_rate": 6.687242798353909e-08,
"logits/chosen": -0.15934507548809052,
"logits/rejected": -0.15075993537902832,
"logps/chosen": -116.63383483886719,
"logps/rejected": -181.26510620117188,
"loss": 0.1222,
"nll_loss": 0.13159163296222687,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.663382530212402,
"rewards/margins": 6.463127136230469,
"rewards/rejected": -18.126508712768555,
"step": 2030
},
{
"epoch": 4.7154001733603,
"grad_norm": 3.081348180770874,
"learning_rate": 6.172839506172839e-08,
"logits/chosen": -0.2664518356323242,
"logits/rejected": -0.24538561701774597,
"logps/chosen": -122.5953140258789,
"logps/rejected": -189.40269470214844,
"loss": 0.122,
"nll_loss": 0.11068514734506607,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.259531021118164,
"rewards/margins": 6.680737495422363,
"rewards/rejected": -18.940269470214844,
"step": 2040
},
{
"epoch": 4.738514880092459,
"grad_norm": 1.9295371770858765,
"learning_rate": 5.6584362139917695e-08,
"logits/chosen": -0.3057961165904999,
"logits/rejected": -0.2679705023765564,
"logps/chosen": -119.34764099121094,
"logps/rejected": -184.24545288085938,
"loss": 0.1254,
"nll_loss": 0.11074963957071304,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.934765815734863,
"rewards/margins": 6.489781379699707,
"rewards/rejected": -18.424545288085938,
"step": 2050
},
{
"epoch": 4.7616295868246175,
"grad_norm": 1.486010193824768,
"learning_rate": 5.1440329218107e-08,
"logits/chosen": -0.17464767396450043,
"logits/rejected": -0.17597734928131104,
"logps/chosen": -118.97342681884766,
"logps/rejected": -184.82752990722656,
"loss": 0.116,
"nll_loss": 0.11164693534374237,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.89734172821045,
"rewards/margins": 6.585410118103027,
"rewards/rejected": -18.48275375366211,
"step": 2060
},
{
"epoch": 4.784744293556775,
"grad_norm": 1.5164188146591187,
"learning_rate": 4.629629629629629e-08,
"logits/chosen": -0.1697818785905838,
"logits/rejected": -0.17655737698078156,
"logps/chosen": -123.55452728271484,
"logps/rejected": -191.81411743164062,
"loss": 0.1175,
"nll_loss": 0.10650823265314102,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.355452537536621,
"rewards/margins": 6.8259596824646,
"rewards/rejected": -19.18140983581543,
"step": 2070
},
{
"epoch": 4.807859000288934,
"grad_norm": 2.9849853515625,
"learning_rate": 4.115226337448559e-08,
"logits/chosen": -0.1795181930065155,
"logits/rejected": -0.19433379173278809,
"logps/chosen": -118.71900939941406,
"logps/rejected": -185.427734375,
"loss": 0.1176,
"nll_loss": 0.11160220950841904,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.87190055847168,
"rewards/margins": 6.670874118804932,
"rewards/rejected": -18.542774200439453,
"step": 2080
},
{
"epoch": 4.8309737070210925,
"grad_norm": 1.8896292448043823,
"learning_rate": 3.6008230452674896e-08,
"logits/chosen": -0.20320720970630646,
"logits/rejected": -0.21179303526878357,
"logps/chosen": -121.1741714477539,
"logps/rejected": -189.61380004882812,
"loss": 0.1197,
"nll_loss": 0.12176340818405151,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.117416381835938,
"rewards/margins": 6.843962669372559,
"rewards/rejected": -18.961380004882812,
"step": 2090
},
{
"epoch": 4.85408841375325,
"grad_norm": 2.13209867477417,
"learning_rate": 3.086419753086419e-08,
"logits/chosen": -0.202679842710495,
"logits/rejected": -0.19807621836662292,
"logps/chosen": -121.65214538574219,
"logps/rejected": -187.36184692382812,
"loss": 0.1117,
"nll_loss": 0.1065160408616066,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.165216445922852,
"rewards/margins": 6.570970058441162,
"rewards/rejected": -18.736186981201172,
"step": 2100
},
{
"epoch": 4.877203120485409,
"grad_norm": 2.2168078422546387,
"learning_rate": 2.57201646090535e-08,
"logits/chosen": -0.20957596600055695,
"logits/rejected": -0.19148316979408264,
"logps/chosen": -112.11415100097656,
"logps/rejected": -176.92153930664062,
"loss": 0.12,
"nll_loss": 0.12247494608163834,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.211416244506836,
"rewards/margins": 6.480741024017334,
"rewards/rejected": -17.692157745361328,
"step": 2110
},
{
"epoch": 4.900317827217567,
"grad_norm": 1.704630970954895,
"learning_rate": 2.0576131687242796e-08,
"logits/chosen": -0.21424663066864014,
"logits/rejected": -0.24735161662101746,
"logps/chosen": -128.74917602539062,
"logps/rejected": -195.53347778320312,
"loss": 0.1239,
"nll_loss": 0.14099851250648499,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.874917984008789,
"rewards/margins": 6.678428649902344,
"rewards/rejected": -19.553346633911133,
"step": 2120
},
{
"epoch": 4.923432533949725,
"grad_norm": 2.0087478160858154,
"learning_rate": 1.5432098765432096e-08,
"logits/chosen": -0.1421460658311844,
"logits/rejected": -0.1667608767747879,
"logps/chosen": -113.1841812133789,
"logps/rejected": -177.54026794433594,
"loss": 0.1199,
"nll_loss": 0.12012244760990143,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.318418502807617,
"rewards/margins": 6.435610771179199,
"rewards/rejected": -17.7540283203125,
"step": 2130
},
{
"epoch": 4.946547240681884,
"grad_norm": 3.1608433723449707,
"learning_rate": 1.0288065843621398e-08,
"logits/chosen": -0.20297956466674805,
"logits/rejected": -0.18899144232273102,
"logps/chosen": -118.35282897949219,
"logps/rejected": -183.97708129882812,
"loss": 0.1174,
"nll_loss": 0.10903529822826385,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.835283279418945,
"rewards/margins": 6.562425136566162,
"rewards/rejected": -18.397706985473633,
"step": 2140
},
{
"epoch": 4.969661947414043,
"grad_norm": 1.8710432052612305,
"learning_rate": 5.144032921810699e-09,
"logits/chosen": -0.2684074640274048,
"logits/rejected": -0.22125348448753357,
"logps/chosen": -129.27749633789062,
"logps/rejected": -194.1197967529297,
"loss": 0.1183,
"nll_loss": 0.11009220033884048,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.927749633789062,
"rewards/margins": 6.484231472015381,
"rewards/rejected": -19.4119815826416,
"step": 2150
},
{
"epoch": 4.9927766541462,
"grad_norm": 1.882362961769104,
"learning_rate": 0.0,
"logits/chosen": -0.15148359537124634,
"logits/rejected": -0.1361338496208191,
"logps/chosen": -107.99227142333984,
"logps/rejected": -173.1522979736328,
"loss": 0.1143,
"nll_loss": 0.10342558473348618,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.799227714538574,
"rewards/margins": 6.516002655029297,
"rewards/rejected": -17.315229415893555,
"step": 2160
},
{
"epoch": 4.9927766541462,
"eval_logits/chosen": -0.14757364988327026,
"eval_logits/rejected": -0.11364421248435974,
"eval_logps/chosen": -141.49264526367188,
"eval_logps/rejected": -155.7095184326172,
"eval_loss": 1.4373149871826172,
"eval_nll_loss": 0.17254449427127838,
"eval_rewards/accuracies": 0.654347836971283,
"eval_rewards/chosen": -14.149263381958008,
"eval_rewards/margins": 1.4216874837875366,
"eval_rewards/rejected": -15.570951461791992,
"eval_runtime": 76.6761,
"eval_samples_per_second": 23.814,
"eval_steps_per_second": 1.5,
"step": 2160
},
{
"epoch": 4.9927766541462,
"step": 2160,
"total_flos": 0.0,
"train_loss": 0.5995175864961412,
"train_runtime": 46944.6998,
"train_samples_per_second": 5.898,
"train_steps_per_second": 0.046
}
],
"logging_steps": 10,
"max_steps": 2160,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}