{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988571428571429, "eval_steps": 50, "global_step": 437, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022857142857142857, "grad_norm": 8.11412987933583, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.66959810256958, "logits/rejected": -2.6077542304992676, "logps/chosen": -296.6876220703125, "logps/rejected": -254.7753448486328, "loss": 0.6933, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.0003348872414790094, "rewards/margins": -0.00012643556692637503, "rewards/rejected": -0.0002084516454488039, "step": 10 }, { "epoch": 0.045714285714285714, "grad_norm": 9.934680850734814, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6891586780548096, "logits/rejected": -2.6237130165100098, "logps/chosen": -294.1405334472656, "logps/rejected": -254.05810546875, "loss": 0.6922, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0007635392248630524, "rewards/margins": 0.0031762172002345324, "rewards/rejected": -0.0024126782082021236, "step": 20 }, { "epoch": 0.06857142857142857, "grad_norm": 8.46839233994518, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.657853364944458, "logits/rejected": -2.562720775604248, "logps/chosen": -285.6708984375, "logps/rejected": -247.06838989257812, "loss": 0.6879, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.012036588974297047, "rewards/margins": 0.01440697442740202, "rewards/rejected": -0.002370386151596904, "step": 30 }, { "epoch": 0.09142857142857143, "grad_norm": 10.38860192358711, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.6249475479125977, "logits/rejected": -2.5956408977508545, "logps/chosen": -269.8529357910156, "logps/rejected": -273.1573791503906, "loss": 0.6793, "rewards/accuracies": 0.65625, "rewards/chosen": 0.02421986497938633, "rewards/margins": 0.020207645371556282, "rewards/rejected": 0.004012218676507473, "step": 40 }, { "epoch": 0.11428571428571428, "grad_norm": 8.812178369158405, "learning_rate": 4.997124959943201e-07, "logits/chosen": -2.6135976314544678, "logits/rejected": -2.5756285190582275, "logps/chosen": -305.51312255859375, "logps/rejected": -271.22247314453125, "loss": 0.6619, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.010274273343384266, "rewards/margins": 0.067402184009552, "rewards/rejected": -0.0776764526963234, "step": 50 }, { "epoch": 0.11428571428571428, "eval_logits/chosen": -2.569648265838623, "eval_logits/rejected": -2.4695067405700684, "eval_logps/chosen": -277.47930908203125, "eval_logps/rejected": -235.5894012451172, "eval_loss": 0.6465452313423157, "eval_rewards/accuracies": 0.693965494632721, "eval_rewards/chosen": -0.018753662705421448, "eval_rewards/margins": 0.1463788002729416, "eval_rewards/rejected": -0.16513246297836304, "eval_runtime": 90.334, "eval_samples_per_second": 20.269, "eval_steps_per_second": 0.321, "step": 50 }, { "epoch": 0.13714285714285715, "grad_norm": 11.6151396248973, "learning_rate": 4.979579212164186e-07, "logits/chosen": -2.6002144813537598, "logits/rejected": -2.501227855682373, "logps/chosen": -298.59063720703125, "logps/rejected": -302.35577392578125, "loss": 0.6307, "rewards/accuracies": 0.71875, "rewards/chosen": -0.047010406851768494, "rewards/margins": 0.16466036438941956, "rewards/rejected": -0.21167078614234924, "step": 60 }, { "epoch": 0.16, "grad_norm": 14.811620243521006, "learning_rate": 4.946196886175515e-07, "logits/chosen": -2.6356024742126465, "logits/rejected": -2.5619795322418213, "logps/chosen": -280.7294921875, "logps/rejected": -274.32647705078125, "loss": 0.6194, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.16827444732189178, "rewards/margins": 0.20077195763587952, "rewards/rejected": -0.3690463900566101, "step": 70 }, { "epoch": 0.18285714285714286, "grad_norm": 16.993875634534675, "learning_rate": 4.897191188239667e-07, "logits/chosen": -2.642766237258911, "logits/rejected": -2.582462787628174, "logps/chosen": -321.19744873046875, "logps/rejected": -298.94171142578125, "loss": 0.6087, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0975220575928688, "rewards/margins": 0.27095723152160645, "rewards/rejected": -0.36847931146621704, "step": 80 }, { "epoch": 0.2057142857142857, "grad_norm": 17.671501985453162, "learning_rate": 4.832875107981763e-07, "logits/chosen": -2.4848644733428955, "logits/rejected": -2.404571533203125, "logps/chosen": -310.0569152832031, "logps/rejected": -313.3294982910156, "loss": 0.6012, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.345020592212677, "rewards/margins": 0.33231958746910095, "rewards/rejected": -0.6773402690887451, "step": 90 }, { "epoch": 0.22857142857142856, "grad_norm": 17.978608029535753, "learning_rate": 4.753659419387223e-07, "logits/chosen": -1.396976351737976, "logits/rejected": -1.3131816387176514, "logps/chosen": -309.3507385253906, "logps/rejected": -321.52056884765625, "loss": 0.5843, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6027094125747681, "rewards/margins": 0.35499778389930725, "rewards/rejected": -0.9577071070671082, "step": 100 }, { "epoch": 0.22857142857142856, "eval_logits/chosen": -1.3850308656692505, "eval_logits/rejected": -1.1914278268814087, "eval_logps/chosen": -320.03228759765625, "eval_logps/rejected": -321.1168518066406, "eval_loss": 0.5719701647758484, "eval_rewards/accuracies": 0.7456896305084229, "eval_rewards/chosen": -0.44428348541259766, "eval_rewards/margins": 0.5761240124702454, "eval_rewards/rejected": -1.0204075574874878, "eval_runtime": 89.9125, "eval_samples_per_second": 20.364, "eval_steps_per_second": 0.323, "step": 100 }, { "epoch": 0.25142857142857145, "grad_norm": 30.473603379477254, "learning_rate": 4.660050057270191e-07, "logits/chosen": -1.5161502361297607, "logits/rejected": -1.4007251262664795, "logps/chosen": -326.7287292480469, "logps/rejected": -381.71234130859375, "loss": 0.5578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.47587689757347107, "rewards/margins": 0.4765087962150574, "rewards/rejected": -0.9523857235908508, "step": 110 }, { "epoch": 0.2742857142857143, "grad_norm": 16.07677825536776, "learning_rate": 4.5526448859687144e-07, "logits/chosen": -1.329465627670288, "logits/rejected": -1.2029626369476318, "logps/chosen": -313.04150390625, "logps/rejected": -344.30377197265625, "loss": 0.59, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4427226483821869, "rewards/margins": 0.44687420129776, "rewards/rejected": -0.8895969390869141, "step": 120 }, { "epoch": 0.29714285714285715, "grad_norm": 19.25626895436831, "learning_rate": 4.432129880904388e-07, "logits/chosen": -1.0987221002578735, "logits/rejected": -0.7802125215530396, "logps/chosen": -344.36712646484375, "logps/rejected": -391.39154052734375, "loss": 0.5477, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5906578302383423, "rewards/margins": 0.5952860116958618, "rewards/rejected": -1.1859437227249146, "step": 130 }, { "epoch": 0.32, "grad_norm": 32.057510195911, "learning_rate": 4.299274747394055e-07, "logits/chosen": -0.759337306022644, "logits/rejected": -0.5684966444969177, "logps/chosen": -352.746826171875, "logps/rejected": -371.2802734375, "loss": 0.5676, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.540864884853363, "rewards/margins": 0.43386125564575195, "rewards/rejected": -0.9747260808944702, "step": 140 }, { "epoch": 0.34285714285714286, "grad_norm": 31.167990046129457, "learning_rate": 4.1549280046953653e-07, "logits/chosen": -1.4192949533462524, "logits/rejected": -0.8910300135612488, "logps/chosen": -302.2892150878906, "logps/rejected": -339.9471435546875, "loss": 0.5509, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.45886653661727905, "rewards/margins": 0.62995845079422, "rewards/rejected": -1.0888248682022095, "step": 150 }, { "epoch": 0.34285714285714286, "eval_logits/chosen": -1.355178952217102, "eval_logits/rejected": -0.6634992361068726, "eval_logps/chosen": -323.4683532714844, "eval_logps/rejected": -345.5482482910156, "eval_loss": 0.5466835498809814, "eval_rewards/accuracies": 0.732758641242981, "eval_rewards/chosen": -0.4786438047885895, "eval_rewards/margins": 0.7860775589942932, "eval_rewards/rejected": -1.264721393585205, "eval_runtime": 90.391, "eval_samples_per_second": 20.256, "eval_steps_per_second": 0.321, "step": 150 }, { "epoch": 0.3657142857142857, "grad_norm": 24.250974026024966, "learning_rate": 4.000011566683401e-07, "logits/chosen": -1.3669617176055908, "logits/rejected": -0.9052613377571106, "logps/chosen": -333.5279846191406, "logps/rejected": -360.4810791015625, "loss": 0.55, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6962090730667114, "rewards/margins": 0.5538384318351746, "rewards/rejected": -1.2500474452972412, "step": 160 }, { "epoch": 0.38857142857142857, "grad_norm": 24.704591344446357, "learning_rate": 3.8355148537705047e-07, "logits/chosen": -1.3166093826293945, "logits/rejected": -0.7781628966331482, "logps/chosen": -343.19940185546875, "logps/rejected": -390.14239501953125, "loss": 0.5266, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6181753873825073, "rewards/margins": 0.671627402305603, "rewards/rejected": -1.2898027896881104, "step": 170 }, { "epoch": 0.4114285714285714, "grad_norm": 22.05889500026718, "learning_rate": 3.662488473675315e-07, "logits/chosen": -1.254248023033142, "logits/rejected": -0.8255653381347656, "logps/chosen": -341.5926818847656, "logps/rejected": -382.6631774902344, "loss": 0.542, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7656871676445007, "rewards/margins": 0.5109010338783264, "rewards/rejected": -1.2765882015228271, "step": 180 }, { "epoch": 0.4342857142857143, "grad_norm": 23.909926187935003, "learning_rate": 3.48203751140067e-07, "logits/chosen": -1.0761396884918213, "logits/rejected": -0.6647660136222839, "logps/chosen": -325.56683349609375, "logps/rejected": -379.4165344238281, "loss": 0.5356, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6315457820892334, "rewards/margins": 0.5936378240585327, "rewards/rejected": -1.2251836061477661, "step": 190 }, { "epoch": 0.45714285714285713, "grad_norm": 20.73701530382175, "learning_rate": 3.2953144712759537e-07, "logits/chosen": -0.5213090181350708, "logits/rejected": 0.013022899627685547, "logps/chosen": -359.0880432128906, "logps/rejected": -410.6299743652344, "loss": 0.5275, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.927217960357666, "rewards/margins": 0.6191995739936829, "rewards/rejected": -1.5464175939559937, "step": 200 }, { "epoch": 0.45714285714285713, "eval_logits/chosen": -1.0376836061477661, "eval_logits/rejected": -0.08949049562215805, "eval_logps/chosen": -327.8020935058594, "eval_logps/rejected": -357.148681640625, "eval_loss": 0.5396182537078857, "eval_rewards/accuracies": 0.7715517282485962, "eval_rewards/chosen": -0.5219810605049133, "eval_rewards/margins": 0.8587445020675659, "eval_rewards/rejected": -1.380725622177124, "eval_runtime": 90.1866, "eval_samples_per_second": 20.302, "eval_steps_per_second": 0.322, "step": 200 }, { "epoch": 0.48, "grad_norm": 23.237204035063847, "learning_rate": 3.103511916141658e-07, "logits/chosen": -0.9574594497680664, "logits/rejected": -0.16778725385665894, "logps/chosen": -348.38385009765625, "logps/rejected": -397.10321044921875, "loss": 0.5219, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.686768651008606, "rewards/margins": 0.7565571665763855, "rewards/rejected": -1.4433258771896362, "step": 210 }, { "epoch": 0.5028571428571429, "grad_norm": 19.18259747527528, "learning_rate": 2.9078548506882117e-07, "logits/chosen": -0.37646159529685974, "logits/rejected": 0.516203761100769, "logps/chosen": -383.25811767578125, "logps/rejected": -413.0987243652344, "loss": 0.5326, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8871996998786926, "rewards/margins": 0.6418129205703735, "rewards/rejected": -1.529012680053711, "step": 220 }, { "epoch": 0.5257142857142857, "grad_norm": 21.792119076799803, "learning_rate": 2.709592897595191e-07, "logits/chosen": -0.94196617603302, "logits/rejected": -0.2367326021194458, "logps/chosen": -343.68157958984375, "logps/rejected": -391.78729248046875, "loss": 0.5313, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.583425760269165, "rewards/margins": 0.6397222280502319, "rewards/rejected": -1.2231481075286865, "step": 230 }, { "epoch": 0.5485714285714286, "grad_norm": 23.250665470567046, "learning_rate": 2.509992316440332e-07, "logits/chosen": -0.7590802907943726, "logits/rejected": 0.23232534527778625, "logps/chosen": -375.7669982910156, "logps/rejected": -393.1439208984375, "loss": 0.5083, "rewards/accuracies": 0.75, "rewards/chosen": -0.661091685295105, "rewards/margins": 0.7419286370277405, "rewards/rejected": -1.4030205011367798, "step": 240 }, { "epoch": 0.5714285714285714, "grad_norm": 23.283347220975337, "learning_rate": 2.3103279163519918e-07, "logits/chosen": 0.3902924656867981, "logits/rejected": 1.2702767848968506, "logps/chosen": -430.42510986328125, "logps/rejected": -429.90240478515625, "loss": 0.5665, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.4317457675933838, "rewards/margins": 0.5207899808883667, "rewards/rejected": -1.95253586769104, "step": 250 }, { "epoch": 0.5714285714285714, "eval_logits/chosen": 0.2878158688545227, "eval_logits/rejected": 1.5253632068634033, "eval_logps/chosen": -384.02301025390625, "eval_logps/rejected": -415.1577453613281, "eval_loss": 0.5404527187347412, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -1.0841907262802124, "eval_rewards/margins": 0.8766254782676697, "eval_rewards/rejected": -1.9608159065246582, "eval_runtime": 91.5614, "eval_samples_per_second": 19.998, "eval_steps_per_second": 0.317, "step": 250 }, { "epoch": 0.5942857142857143, "grad_norm": 28.334390897274133, "learning_rate": 2.1118749140573358e-07, "logits/chosen": -0.2290700227022171, "logits/rejected": 0.8288987278938293, "logps/chosen": -408.00201416015625, "logps/rejected": -452.023681640625, "loss": 0.5197, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9032201766967773, "rewards/margins": 0.8297585248947144, "rewards/rejected": -1.7329788208007812, "step": 260 }, { "epoch": 0.6171428571428571, "grad_norm": 23.854460344353054, "learning_rate": 1.9159007893272703e-07, "logits/chosen": -0.17421701550483704, "logits/rejected": 0.6511275172233582, "logps/chosen": -342.1952209472656, "logps/rejected": -396.54083251953125, "loss": 0.5294, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7613161206245422, "rewards/margins": 0.7101620435714722, "rewards/rejected": -1.4714782238006592, "step": 270 }, { "epoch": 0.64, "grad_norm": 27.414501100794606, "learning_rate": 1.7236571898357766e-07, "logits/chosen": -0.08564956486225128, "logits/rejected": 0.9427372813224792, "logps/chosen": -367.49407958984375, "logps/rejected": -409.3499755859375, "loss": 0.518, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8473467826843262, "rewards/margins": 0.8089747428894043, "rewards/rejected": -1.6563212871551514, "step": 280 }, { "epoch": 0.6628571428571428, "grad_norm": 23.166387688948994, "learning_rate": 1.5363719371356882e-07, "logits/chosen": 0.06339935958385468, "logits/rejected": 0.5719184875488281, "logps/chosen": -360.3900451660156, "logps/rejected": -409.3319091796875, "loss": 0.5387, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9427051544189453, "rewards/margins": 0.5575781464576721, "rewards/rejected": -1.500283122062683, "step": 290 }, { "epoch": 0.6857142857142857, "grad_norm": 28.449941474840944, "learning_rate": 1.3552411848071565e-07, "logits/chosen": -0.3682107627391815, "logits/rejected": 0.5333132743835449, "logps/chosen": -333.92449951171875, "logps/rejected": -425.3599548339844, "loss": 0.5202, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7675741314888, "rewards/margins": 0.9044278264045715, "rewards/rejected": -1.6720020771026611, "step": 300 }, { "epoch": 0.6857142857142857, "eval_logits/chosen": -0.21915225684642792, "eval_logits/rejected": 0.9934114813804626, "eval_logps/chosen": -349.8433837890625, "eval_logps/rejected": -392.549560546875, "eval_loss": 0.5274777412414551, "eval_rewards/accuracies": 0.7715517282485962, "eval_rewards/chosen": -0.7423940300941467, "eval_rewards/margins": 0.9923400282859802, "eval_rewards/rejected": -1.734734058380127, "eval_runtime": 90.6157, "eval_samples_per_second": 20.206, "eval_steps_per_second": 0.32, "step": 300 }, { "epoch": 0.7085714285714285, "grad_norm": 21.530490280501045, "learning_rate": 1.1814217788631473e-07, "logits/chosen": -0.17677690088748932, "logits/rejected": 0.3225722908973694, "logps/chosen": -356.5892639160156, "logps/rejected": -431.662109375, "loss": 0.5328, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8150566816329956, "rewards/margins": 0.6877792477607727, "rewards/rejected": -1.502835988998413, "step": 310 }, { "epoch": 0.7314285714285714, "grad_norm": 26.395907969219987, "learning_rate": 1.0160238692045331e-07, "logits/chosen": -0.2509198486804962, "logits/rejected": 0.4886396527290344, "logps/chosen": -359.4185485839844, "logps/rejected": -413.3671875, "loss": 0.535, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8422183990478516, "rewards/margins": 0.7798849940299988, "rewards/rejected": -1.6221033334732056, "step": 320 }, { "epoch": 0.7542857142857143, "grad_norm": 25.409253983308545, "learning_rate": 8.601038193139438e-08, "logits/chosen": -0.13014790415763855, "logits/rejected": 0.5298113822937012, "logps/chosen": -378.82012939453125, "logps/rejected": -402.17156982421875, "loss": 0.5302, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.973560631275177, "rewards/margins": 0.6031589508056641, "rewards/rejected": -1.5767196416854858, "step": 330 }, { "epoch": 0.7771428571428571, "grad_norm": 26.91341277336219, "learning_rate": 7.146574594727572e-08, "logits/chosen": -0.4229533076286316, "logits/rejected": 0.5619645714759827, "logps/chosen": -363.5431213378906, "logps/rejected": -404.77935791015625, "loss": 0.5192, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8485255241394043, "rewards/margins": 0.8047366142272949, "rewards/rejected": -1.6532618999481201, "step": 340 }, { "epoch": 0.8, "grad_norm": 24.565353732521466, "learning_rate": 5.8061372659157306e-08, "logits/chosen": -0.17489977180957794, "logits/rejected": 0.7508963346481323, "logps/chosen": -371.7098693847656, "logps/rejected": -395.7353210449219, "loss": 0.5261, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8898698091506958, "rewards/margins": 0.6660181879997253, "rewards/rejected": -1.5558879375457764, "step": 350 }, { "epoch": 0.8, "eval_logits/chosen": -0.2696850597858429, "eval_logits/rejected": 0.9007923007011414, "eval_logps/chosen": -351.73101806640625, "eval_logps/rejected": -391.03973388671875, "eval_loss": 0.5234382748603821, "eval_rewards/accuracies": 0.7586206793785095, "eval_rewards/chosen": -0.76127028465271, "eval_rewards/margins": 0.9583660364151001, "eval_rewards/rejected": -1.71963632106781, "eval_runtime": 90.1922, "eval_samples_per_second": 20.301, "eval_steps_per_second": 0.322, "step": 350 }, { "epoch": 0.8228571428571428, "grad_norm": 26.528490406026805, "learning_rate": 4.5882873127531614e-08, "logits/chosen": -0.18714679777622223, "logits/rejected": 0.5642833113670349, "logps/chosen": -389.9281921386719, "logps/rejected": -425.6780700683594, "loss": 0.5053, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8513727188110352, "rewards/margins": 0.6271126866340637, "rewards/rejected": -1.478485345840454, "step": 360 }, { "epoch": 0.8457142857142858, "grad_norm": 26.160208453826513, "learning_rate": 3.500802900154412e-08, "logits/chosen": -0.11366554349660873, "logits/rejected": 0.7036272287368774, "logps/chosen": -315.89349365234375, "logps/rejected": -380.2809143066406, "loss": 0.5279, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7633152008056641, "rewards/margins": 0.7076437473297119, "rewards/rejected": -1.470958948135376, "step": 370 }, { "epoch": 0.8685714285714285, "grad_norm": 22.845593504615394, "learning_rate": 2.550629574310309e-08, "logits/chosen": -0.2266564667224884, "logits/rejected": 0.5173102021217346, "logps/chosen": -349.7990417480469, "logps/rejected": -402.1055603027344, "loss": 0.5236, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.7807506918907166, "rewards/margins": 0.665580689907074, "rewards/rejected": -1.4463313817977905, "step": 380 }, { "epoch": 0.8914285714285715, "grad_norm": 22.30415183791987, "learning_rate": 1.7438359028687983e-08, "logits/chosen": -0.18654844164848328, "logits/rejected": 0.17555546760559082, "logps/chosen": -343.80609130859375, "logps/rejected": -413.258544921875, "loss": 0.5215, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8466179966926575, "rewards/margins": 0.5847481489181519, "rewards/rejected": -1.431365966796875, "step": 390 }, { "epoch": 0.9142857142857143, "grad_norm": 25.36293321655948, "learning_rate": 1.0855747162029361e-08, "logits/chosen": -0.39323678612709045, "logits/rejected": 0.6321589946746826, "logps/chosen": -364.44781494140625, "logps/rejected": -393.2674865722656, "loss": 0.5343, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.7503901720046997, "rewards/margins": 0.7446303963661194, "rewards/rejected": -1.4950206279754639, "step": 400 }, { "epoch": 0.9142857142857143, "eval_logits/chosen": -0.3663737177848816, "eval_logits/rejected": 0.7649080157279968, "eval_logps/chosen": -345.4830017089844, "eval_logps/rejected": -382.0352478027344, "eval_loss": 0.5226701498031616, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -0.6987906098365784, "eval_rewards/margins": 0.9308006763458252, "eval_rewards/rejected": -1.6295913457870483, "eval_runtime": 91.5738, "eval_samples_per_second": 19.995, "eval_steps_per_second": 0.317, "step": 400 }, { "epoch": 0.9371428571428572, "grad_norm": 21.80106996482806, "learning_rate": 5.8005019731033615e-09, "logits/chosen": -0.4477500021457672, "logits/rejected": 0.4871234893798828, "logps/chosen": -340.3522033691406, "logps/rejected": -401.26629638671875, "loss": 0.5153, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6757135391235352, "rewards/margins": 0.9294708371162415, "rewards/rejected": -1.6051843166351318, "step": 410 }, { "epoch": 0.96, "grad_norm": 23.103516202896476, "learning_rate": 2.3049103053431886e-09, "logits/chosen": -0.3995040953159332, "logits/rejected": 0.3316659927368164, "logps/chosen": -386.6695861816406, "logps/rejected": -383.6183776855469, "loss": 0.5348, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7931796312332153, "rewards/margins": 0.6191332936286926, "rewards/rejected": -1.4123131036758423, "step": 420 }, { "epoch": 0.9828571428571429, "grad_norm": 22.91261742161674, "learning_rate": 3.9129780600541397e-10, "logits/chosen": -0.2376430332660675, "logits/rejected": 0.6147540807723999, "logps/chosen": -368.62451171875, "logps/rejected": -394.5757751464844, "loss": 0.5369, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8315681219100952, "rewards/margins": 0.7056232690811157, "rewards/rejected": -1.53719162940979, "step": 430 }, { "epoch": 0.9988571428571429, "step": 437, "total_flos": 0.0, "train_loss": 0.5600041279258117, "train_runtime": 11271.3049, "train_samples_per_second": 4.968, "train_steps_per_second": 0.039 } ], "logging_steps": 10, "max_steps": 437, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }