sfulay's picture
Model save
a15b904 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9988571428571429,
"eval_steps": 50,
"global_step": 437,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022857142857142857,
"grad_norm": 8.11412987933583,
"learning_rate": 1.1363636363636363e-07,
"logits/chosen": -2.66959810256958,
"logits/rejected": -2.6077542304992676,
"logps/chosen": -296.6876220703125,
"logps/rejected": -254.7753448486328,
"loss": 0.6933,
"rewards/accuracies": 0.3812499940395355,
"rewards/chosen": -0.0003348872414790094,
"rewards/margins": -0.00012643556692637503,
"rewards/rejected": -0.0002084516454488039,
"step": 10
},
{
"epoch": 0.045714285714285714,
"grad_norm": 9.934680850734814,
"learning_rate": 2.2727272727272726e-07,
"logits/chosen": -2.6891586780548096,
"logits/rejected": -2.6237130165100098,
"logps/chosen": -294.1405334472656,
"logps/rejected": -254.05810546875,
"loss": 0.6922,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.0007635392248630524,
"rewards/margins": 0.0031762172002345324,
"rewards/rejected": -0.0024126782082021236,
"step": 20
},
{
"epoch": 0.06857142857142857,
"grad_norm": 8.46839233994518,
"learning_rate": 3.4090909090909085e-07,
"logits/chosen": -2.657853364944458,
"logits/rejected": -2.562720775604248,
"logps/chosen": -285.6708984375,
"logps/rejected": -247.06838989257812,
"loss": 0.6879,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.012036588974297047,
"rewards/margins": 0.01440697442740202,
"rewards/rejected": -0.002370386151596904,
"step": 30
},
{
"epoch": 0.09142857142857143,
"grad_norm": 10.38860192358711,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -2.6249475479125977,
"logits/rejected": -2.5956408977508545,
"logps/chosen": -269.8529357910156,
"logps/rejected": -273.1573791503906,
"loss": 0.6793,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.02421986497938633,
"rewards/margins": 0.020207645371556282,
"rewards/rejected": 0.004012218676507473,
"step": 40
},
{
"epoch": 0.11428571428571428,
"grad_norm": 8.812178369158405,
"learning_rate": 4.997124959943201e-07,
"logits/chosen": -2.6135976314544678,
"logits/rejected": -2.5756285190582275,
"logps/chosen": -305.51312255859375,
"logps/rejected": -271.22247314453125,
"loss": 0.6619,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.010274273343384266,
"rewards/margins": 0.067402184009552,
"rewards/rejected": -0.0776764526963234,
"step": 50
},
{
"epoch": 0.11428571428571428,
"eval_logits/chosen": -2.569648265838623,
"eval_logits/rejected": -2.4695067405700684,
"eval_logps/chosen": -277.47930908203125,
"eval_logps/rejected": -235.5894012451172,
"eval_loss": 0.6465452313423157,
"eval_rewards/accuracies": 0.693965494632721,
"eval_rewards/chosen": -0.018753662705421448,
"eval_rewards/margins": 0.1463788002729416,
"eval_rewards/rejected": -0.16513246297836304,
"eval_runtime": 90.334,
"eval_samples_per_second": 20.269,
"eval_steps_per_second": 0.321,
"step": 50
},
{
"epoch": 0.13714285714285715,
"grad_norm": 11.6151396248973,
"learning_rate": 4.979579212164186e-07,
"logits/chosen": -2.6002144813537598,
"logits/rejected": -2.501227855682373,
"logps/chosen": -298.59063720703125,
"logps/rejected": -302.35577392578125,
"loss": 0.6307,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.047010406851768494,
"rewards/margins": 0.16466036438941956,
"rewards/rejected": -0.21167078614234924,
"step": 60
},
{
"epoch": 0.16,
"grad_norm": 14.811620243521006,
"learning_rate": 4.946196886175515e-07,
"logits/chosen": -2.6356024742126465,
"logits/rejected": -2.5619795322418213,
"logps/chosen": -280.7294921875,
"logps/rejected": -274.32647705078125,
"loss": 0.6194,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.16827444732189178,
"rewards/margins": 0.20077195763587952,
"rewards/rejected": -0.3690463900566101,
"step": 70
},
{
"epoch": 0.18285714285714286,
"grad_norm": 16.993875634534675,
"learning_rate": 4.897191188239667e-07,
"logits/chosen": -2.642766237258911,
"logits/rejected": -2.582462787628174,
"logps/chosen": -321.19744873046875,
"logps/rejected": -298.94171142578125,
"loss": 0.6087,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.0975220575928688,
"rewards/margins": 0.27095723152160645,
"rewards/rejected": -0.36847931146621704,
"step": 80
},
{
"epoch": 0.2057142857142857,
"grad_norm": 17.671501985453162,
"learning_rate": 4.832875107981763e-07,
"logits/chosen": -2.4848644733428955,
"logits/rejected": -2.404571533203125,
"logps/chosen": -310.0569152832031,
"logps/rejected": -313.3294982910156,
"loss": 0.6012,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.345020592212677,
"rewards/margins": 0.33231958746910095,
"rewards/rejected": -0.6773402690887451,
"step": 90
},
{
"epoch": 0.22857142857142856,
"grad_norm": 17.978608029535753,
"learning_rate": 4.753659419387223e-07,
"logits/chosen": -1.396976351737976,
"logits/rejected": -1.3131816387176514,
"logps/chosen": -309.3507385253906,
"logps/rejected": -321.52056884765625,
"loss": 0.5843,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.6027094125747681,
"rewards/margins": 0.35499778389930725,
"rewards/rejected": -0.9577071070671082,
"step": 100
},
{
"epoch": 0.22857142857142856,
"eval_logits/chosen": -1.3850308656692505,
"eval_logits/rejected": -1.1914278268814087,
"eval_logps/chosen": -320.03228759765625,
"eval_logps/rejected": -321.1168518066406,
"eval_loss": 0.5719701647758484,
"eval_rewards/accuracies": 0.7456896305084229,
"eval_rewards/chosen": -0.44428348541259766,
"eval_rewards/margins": 0.5761240124702454,
"eval_rewards/rejected": -1.0204075574874878,
"eval_runtime": 89.9125,
"eval_samples_per_second": 20.364,
"eval_steps_per_second": 0.323,
"step": 100
},
{
"epoch": 0.25142857142857145,
"grad_norm": 30.473603379477254,
"learning_rate": 4.660050057270191e-07,
"logits/chosen": -1.5161502361297607,
"logits/rejected": -1.4007251262664795,
"logps/chosen": -326.7287292480469,
"logps/rejected": -381.71234130859375,
"loss": 0.5578,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.47587689757347107,
"rewards/margins": 0.4765087962150574,
"rewards/rejected": -0.9523857235908508,
"step": 110
},
{
"epoch": 0.2742857142857143,
"grad_norm": 16.07677825536776,
"learning_rate": 4.5526448859687144e-07,
"logits/chosen": -1.329465627670288,
"logits/rejected": -1.2029626369476318,
"logps/chosen": -313.04150390625,
"logps/rejected": -344.30377197265625,
"loss": 0.59,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.4427226483821869,
"rewards/margins": 0.44687420129776,
"rewards/rejected": -0.8895969390869141,
"step": 120
},
{
"epoch": 0.29714285714285715,
"grad_norm": 19.25626895436831,
"learning_rate": 4.432129880904388e-07,
"logits/chosen": -1.0987221002578735,
"logits/rejected": -0.7802125215530396,
"logps/chosen": -344.36712646484375,
"logps/rejected": -391.39154052734375,
"loss": 0.5477,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5906578302383423,
"rewards/margins": 0.5952860116958618,
"rewards/rejected": -1.1859437227249146,
"step": 130
},
{
"epoch": 0.32,
"grad_norm": 32.057510195911,
"learning_rate": 4.299274747394055e-07,
"logits/chosen": -0.759337306022644,
"logits/rejected": -0.5684966444969177,
"logps/chosen": -352.746826171875,
"logps/rejected": -371.2802734375,
"loss": 0.5676,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.540864884853363,
"rewards/margins": 0.43386125564575195,
"rewards/rejected": -0.9747260808944702,
"step": 140
},
{
"epoch": 0.34285714285714286,
"grad_norm": 31.167990046129457,
"learning_rate": 4.1549280046953653e-07,
"logits/chosen": -1.4192949533462524,
"logits/rejected": -0.8910300135612488,
"logps/chosen": -302.2892150878906,
"logps/rejected": -339.9471435546875,
"loss": 0.5509,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.45886653661727905,
"rewards/margins": 0.62995845079422,
"rewards/rejected": -1.0888248682022095,
"step": 150
},
{
"epoch": 0.34285714285714286,
"eval_logits/chosen": -1.355178952217102,
"eval_logits/rejected": -0.6634992361068726,
"eval_logps/chosen": -323.4683532714844,
"eval_logps/rejected": -345.5482482910156,
"eval_loss": 0.5466835498809814,
"eval_rewards/accuracies": 0.732758641242981,
"eval_rewards/chosen": -0.4786438047885895,
"eval_rewards/margins": 0.7860775589942932,
"eval_rewards/rejected": -1.264721393585205,
"eval_runtime": 90.391,
"eval_samples_per_second": 20.256,
"eval_steps_per_second": 0.321,
"step": 150
},
{
"epoch": 0.3657142857142857,
"grad_norm": 24.250974026024966,
"learning_rate": 4.000011566683401e-07,
"logits/chosen": -1.3669617176055908,
"logits/rejected": -0.9052613377571106,
"logps/chosen": -333.5279846191406,
"logps/rejected": -360.4810791015625,
"loss": 0.55,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6962090730667114,
"rewards/margins": 0.5538384318351746,
"rewards/rejected": -1.2500474452972412,
"step": 160
},
{
"epoch": 0.38857142857142857,
"grad_norm": 24.704591344446357,
"learning_rate": 3.8355148537705047e-07,
"logits/chosen": -1.3166093826293945,
"logits/rejected": -0.7781628966331482,
"logps/chosen": -343.19940185546875,
"logps/rejected": -390.14239501953125,
"loss": 0.5266,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6181753873825073,
"rewards/margins": 0.671627402305603,
"rewards/rejected": -1.2898027896881104,
"step": 170
},
{
"epoch": 0.4114285714285714,
"grad_norm": 22.05889500026718,
"learning_rate": 3.662488473675315e-07,
"logits/chosen": -1.254248023033142,
"logits/rejected": -0.8255653381347656,
"logps/chosen": -341.5926818847656,
"logps/rejected": -382.6631774902344,
"loss": 0.542,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7656871676445007,
"rewards/margins": 0.5109010338783264,
"rewards/rejected": -1.2765882015228271,
"step": 180
},
{
"epoch": 0.4342857142857143,
"grad_norm": 23.909926187935003,
"learning_rate": 3.48203751140067e-07,
"logits/chosen": -1.0761396884918213,
"logits/rejected": -0.6647660136222839,
"logps/chosen": -325.56683349609375,
"logps/rejected": -379.4165344238281,
"loss": 0.5356,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6315457820892334,
"rewards/margins": 0.5936378240585327,
"rewards/rejected": -1.2251836061477661,
"step": 190
},
{
"epoch": 0.45714285714285713,
"grad_norm": 20.73701530382175,
"learning_rate": 3.2953144712759537e-07,
"logits/chosen": -0.5213090181350708,
"logits/rejected": 0.013022899627685547,
"logps/chosen": -359.0880432128906,
"logps/rejected": -410.6299743652344,
"loss": 0.5275,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.927217960357666,
"rewards/margins": 0.6191995739936829,
"rewards/rejected": -1.5464175939559937,
"step": 200
},
{
"epoch": 0.45714285714285713,
"eval_logits/chosen": -1.0376836061477661,
"eval_logits/rejected": -0.08949049562215805,
"eval_logps/chosen": -327.8020935058594,
"eval_logps/rejected": -357.148681640625,
"eval_loss": 0.5396182537078857,
"eval_rewards/accuracies": 0.7715517282485962,
"eval_rewards/chosen": -0.5219810605049133,
"eval_rewards/margins": 0.8587445020675659,
"eval_rewards/rejected": -1.380725622177124,
"eval_runtime": 90.1866,
"eval_samples_per_second": 20.302,
"eval_steps_per_second": 0.322,
"step": 200
},
{
"epoch": 0.48,
"grad_norm": 23.237204035063847,
"learning_rate": 3.103511916141658e-07,
"logits/chosen": -0.9574594497680664,
"logits/rejected": -0.16778725385665894,
"logps/chosen": -348.38385009765625,
"logps/rejected": -397.10321044921875,
"loss": 0.5219,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.686768651008606,
"rewards/margins": 0.7565571665763855,
"rewards/rejected": -1.4433258771896362,
"step": 210
},
{
"epoch": 0.5028571428571429,
"grad_norm": 19.18259747527528,
"learning_rate": 2.9078548506882117e-07,
"logits/chosen": -0.37646159529685974,
"logits/rejected": 0.516203761100769,
"logps/chosen": -383.25811767578125,
"logps/rejected": -413.0987243652344,
"loss": 0.5326,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8871996998786926,
"rewards/margins": 0.6418129205703735,
"rewards/rejected": -1.529012680053711,
"step": 220
},
{
"epoch": 0.5257142857142857,
"grad_norm": 21.792119076799803,
"learning_rate": 2.709592897595191e-07,
"logits/chosen": -0.94196617603302,
"logits/rejected": -0.2367326021194458,
"logps/chosen": -343.68157958984375,
"logps/rejected": -391.78729248046875,
"loss": 0.5313,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.583425760269165,
"rewards/margins": 0.6397222280502319,
"rewards/rejected": -1.2231481075286865,
"step": 230
},
{
"epoch": 0.5485714285714286,
"grad_norm": 23.250665470567046,
"learning_rate": 2.509992316440332e-07,
"logits/chosen": -0.7590802907943726,
"logits/rejected": 0.23232534527778625,
"logps/chosen": -375.7669982910156,
"logps/rejected": -393.1439208984375,
"loss": 0.5083,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.661091685295105,
"rewards/margins": 0.7419286370277405,
"rewards/rejected": -1.4030205011367798,
"step": 240
},
{
"epoch": 0.5714285714285714,
"grad_norm": 23.283347220975337,
"learning_rate": 2.3103279163519918e-07,
"logits/chosen": 0.3902924656867981,
"logits/rejected": 1.2702767848968506,
"logps/chosen": -430.42510986328125,
"logps/rejected": -429.90240478515625,
"loss": 0.5665,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.4317457675933838,
"rewards/margins": 0.5207899808883667,
"rewards/rejected": -1.95253586769104,
"step": 250
},
{
"epoch": 0.5714285714285714,
"eval_logits/chosen": 0.2878158688545227,
"eval_logits/rejected": 1.5253632068634033,
"eval_logps/chosen": -384.02301025390625,
"eval_logps/rejected": -415.1577453613281,
"eval_loss": 0.5404527187347412,
"eval_rewards/accuracies": 0.7629310488700867,
"eval_rewards/chosen": -1.0841907262802124,
"eval_rewards/margins": 0.8766254782676697,
"eval_rewards/rejected": -1.9608159065246582,
"eval_runtime": 91.5614,
"eval_samples_per_second": 19.998,
"eval_steps_per_second": 0.317,
"step": 250
},
{
"epoch": 0.5942857142857143,
"grad_norm": 28.334390897274133,
"learning_rate": 2.1118749140573358e-07,
"logits/chosen": -0.2290700227022171,
"logits/rejected": 0.8288987278938293,
"logps/chosen": -408.00201416015625,
"logps/rejected": -452.023681640625,
"loss": 0.5197,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.9032201766967773,
"rewards/margins": 0.8297585248947144,
"rewards/rejected": -1.7329788208007812,
"step": 260
},
{
"epoch": 0.6171428571428571,
"grad_norm": 23.854460344353054,
"learning_rate": 1.9159007893272703e-07,
"logits/chosen": -0.17421701550483704,
"logits/rejected": 0.6511275172233582,
"logps/chosen": -342.1952209472656,
"logps/rejected": -396.54083251953125,
"loss": 0.5294,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.7613161206245422,
"rewards/margins": 0.7101620435714722,
"rewards/rejected": -1.4714782238006592,
"step": 270
},
{
"epoch": 0.64,
"grad_norm": 27.414501100794606,
"learning_rate": 1.7236571898357766e-07,
"logits/chosen": -0.08564956486225128,
"logits/rejected": 0.9427372813224792,
"logps/chosen": -367.49407958984375,
"logps/rejected": -409.3499755859375,
"loss": 0.518,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.8473467826843262,
"rewards/margins": 0.8089747428894043,
"rewards/rejected": -1.6563212871551514,
"step": 280
},
{
"epoch": 0.6628571428571428,
"grad_norm": 23.166387688948994,
"learning_rate": 1.5363719371356882e-07,
"logits/chosen": 0.06339935958385468,
"logits/rejected": 0.5719184875488281,
"logps/chosen": -360.3900451660156,
"logps/rejected": -409.3319091796875,
"loss": 0.5387,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9427051544189453,
"rewards/margins": 0.5575781464576721,
"rewards/rejected": -1.500283122062683,
"step": 290
},
{
"epoch": 0.6857142857142857,
"grad_norm": 28.449941474840944,
"learning_rate": 1.3552411848071565e-07,
"logits/chosen": -0.3682107627391815,
"logits/rejected": 0.5333132743835449,
"logps/chosen": -333.92449951171875,
"logps/rejected": -425.3599548339844,
"loss": 0.5202,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7675741314888,
"rewards/margins": 0.9044278264045715,
"rewards/rejected": -1.6720020771026611,
"step": 300
},
{
"epoch": 0.6857142857142857,
"eval_logits/chosen": -0.21915225684642792,
"eval_logits/rejected": 0.9934114813804626,
"eval_logps/chosen": -349.8433837890625,
"eval_logps/rejected": -392.549560546875,
"eval_loss": 0.5274777412414551,
"eval_rewards/accuracies": 0.7715517282485962,
"eval_rewards/chosen": -0.7423940300941467,
"eval_rewards/margins": 0.9923400282859802,
"eval_rewards/rejected": -1.734734058380127,
"eval_runtime": 90.6157,
"eval_samples_per_second": 20.206,
"eval_steps_per_second": 0.32,
"step": 300
},
{
"epoch": 0.7085714285714285,
"grad_norm": 21.530490280501045,
"learning_rate": 1.1814217788631473e-07,
"logits/chosen": -0.17677690088748932,
"logits/rejected": 0.3225722908973694,
"logps/chosen": -356.5892639160156,
"logps/rejected": -431.662109375,
"loss": 0.5328,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8150566816329956,
"rewards/margins": 0.6877792477607727,
"rewards/rejected": -1.502835988998413,
"step": 310
},
{
"epoch": 0.7314285714285714,
"grad_norm": 26.395907969219987,
"learning_rate": 1.0160238692045331e-07,
"logits/chosen": -0.2509198486804962,
"logits/rejected": 0.4886396527290344,
"logps/chosen": -359.4185485839844,
"logps/rejected": -413.3671875,
"loss": 0.535,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8422183990478516,
"rewards/margins": 0.7798849940299988,
"rewards/rejected": -1.6221033334732056,
"step": 320
},
{
"epoch": 0.7542857142857143,
"grad_norm": 25.409253983308545,
"learning_rate": 8.601038193139438e-08,
"logits/chosen": -0.13014790415763855,
"logits/rejected": 0.5298113822937012,
"logps/chosen": -378.82012939453125,
"logps/rejected": -402.17156982421875,
"loss": 0.5302,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.973560631275177,
"rewards/margins": 0.6031589508056641,
"rewards/rejected": -1.5767196416854858,
"step": 330
},
{
"epoch": 0.7771428571428571,
"grad_norm": 26.91341277336219,
"learning_rate": 7.146574594727572e-08,
"logits/chosen": -0.4229533076286316,
"logits/rejected": 0.5619645714759827,
"logps/chosen": -363.5431213378906,
"logps/rejected": -404.77935791015625,
"loss": 0.5192,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8485255241394043,
"rewards/margins": 0.8047366142272949,
"rewards/rejected": -1.6532618999481201,
"step": 340
},
{
"epoch": 0.8,
"grad_norm": 24.565353732521466,
"learning_rate": 5.8061372659157306e-08,
"logits/chosen": -0.17489977180957794,
"logits/rejected": 0.7508963346481323,
"logps/chosen": -371.7098693847656,
"logps/rejected": -395.7353210449219,
"loss": 0.5261,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8898698091506958,
"rewards/margins": 0.6660181879997253,
"rewards/rejected": -1.5558879375457764,
"step": 350
},
{
"epoch": 0.8,
"eval_logits/chosen": -0.2696850597858429,
"eval_logits/rejected": 0.9007923007011414,
"eval_logps/chosen": -351.73101806640625,
"eval_logps/rejected": -391.03973388671875,
"eval_loss": 0.5234382748603821,
"eval_rewards/accuracies": 0.7586206793785095,
"eval_rewards/chosen": -0.76127028465271,
"eval_rewards/margins": 0.9583660364151001,
"eval_rewards/rejected": -1.71963632106781,
"eval_runtime": 90.1922,
"eval_samples_per_second": 20.301,
"eval_steps_per_second": 0.322,
"step": 350
},
{
"epoch": 0.8228571428571428,
"grad_norm": 26.528490406026805,
"learning_rate": 4.5882873127531614e-08,
"logits/chosen": -0.18714679777622223,
"logits/rejected": 0.5642833113670349,
"logps/chosen": -389.9281921386719,
"logps/rejected": -425.6780700683594,
"loss": 0.5053,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8513727188110352,
"rewards/margins": 0.6271126866340637,
"rewards/rejected": -1.478485345840454,
"step": 360
},
{
"epoch": 0.8457142857142858,
"grad_norm": 26.160208453826513,
"learning_rate": 3.500802900154412e-08,
"logits/chosen": -0.11366554349660873,
"logits/rejected": 0.7036272287368774,
"logps/chosen": -315.89349365234375,
"logps/rejected": -380.2809143066406,
"loss": 0.5279,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.7633152008056641,
"rewards/margins": 0.7076437473297119,
"rewards/rejected": -1.470958948135376,
"step": 370
},
{
"epoch": 0.8685714285714285,
"grad_norm": 22.845593504615394,
"learning_rate": 2.550629574310309e-08,
"logits/chosen": -0.2266564667224884,
"logits/rejected": 0.5173102021217346,
"logps/chosen": -349.7990417480469,
"logps/rejected": -402.1055603027344,
"loss": 0.5236,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.7807506918907166,
"rewards/margins": 0.665580689907074,
"rewards/rejected": -1.4463313817977905,
"step": 380
},
{
"epoch": 0.8914285714285715,
"grad_norm": 22.30415183791987,
"learning_rate": 1.7438359028687983e-08,
"logits/chosen": -0.18654844164848328,
"logits/rejected": 0.17555546760559082,
"logps/chosen": -343.80609130859375,
"logps/rejected": -413.258544921875,
"loss": 0.5215,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8466179966926575,
"rewards/margins": 0.5847481489181519,
"rewards/rejected": -1.431365966796875,
"step": 390
},
{
"epoch": 0.9142857142857143,
"grad_norm": 25.36293321655948,
"learning_rate": 1.0855747162029361e-08,
"logits/chosen": -0.39323678612709045,
"logits/rejected": 0.6321589946746826,
"logps/chosen": -364.44781494140625,
"logps/rejected": -393.2674865722656,
"loss": 0.5343,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7503901720046997,
"rewards/margins": 0.7446303963661194,
"rewards/rejected": -1.4950206279754639,
"step": 400
},
{
"epoch": 0.9142857142857143,
"eval_logits/chosen": -0.3663737177848816,
"eval_logits/rejected": 0.7649080157279968,
"eval_logps/chosen": -345.4830017089844,
"eval_logps/rejected": -382.0352478027344,
"eval_loss": 0.5226701498031616,
"eval_rewards/accuracies": 0.7629310488700867,
"eval_rewards/chosen": -0.6987906098365784,
"eval_rewards/margins": 0.9308006763458252,
"eval_rewards/rejected": -1.6295913457870483,
"eval_runtime": 91.5738,
"eval_samples_per_second": 19.995,
"eval_steps_per_second": 0.317,
"step": 400
},
{
"epoch": 0.9371428571428572,
"grad_norm": 21.80106996482806,
"learning_rate": 5.8005019731033615e-09,
"logits/chosen": -0.4477500021457672,
"logits/rejected": 0.4871234893798828,
"logps/chosen": -340.3522033691406,
"logps/rejected": -401.26629638671875,
"loss": 0.5153,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.6757135391235352,
"rewards/margins": 0.9294708371162415,
"rewards/rejected": -1.6051843166351318,
"step": 410
},
{
"epoch": 0.96,
"grad_norm": 23.103516202896476,
"learning_rate": 2.3049103053431886e-09,
"logits/chosen": -0.3995040953159332,
"logits/rejected": 0.3316659927368164,
"logps/chosen": -386.6695861816406,
"logps/rejected": -383.6183776855469,
"loss": 0.5348,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.7931796312332153,
"rewards/margins": 0.6191332936286926,
"rewards/rejected": -1.4123131036758423,
"step": 420
},
{
"epoch": 0.9828571428571429,
"grad_norm": 22.91261742161674,
"learning_rate": 3.9129780600541397e-10,
"logits/chosen": -0.2376430332660675,
"logits/rejected": 0.6147540807723999,
"logps/chosen": -368.62451171875,
"logps/rejected": -394.5757751464844,
"loss": 0.5369,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8315681219100952,
"rewards/margins": 0.7056232690811157,
"rewards/rejected": -1.53719162940979,
"step": 430
},
{
"epoch": 0.9988571428571429,
"step": 437,
"total_flos": 0.0,
"train_loss": 0.5600041279258117,
"train_runtime": 11271.3049,
"train_samples_per_second": 4.968,
"train_steps_per_second": 0.039
}
],
"logging_steps": 10,
"max_steps": 437,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}