qwen2.5-7B-instruct-orpo / trainer_state.json
ptrdvn's picture
Upload folder using huggingface_hub
7fb9cab verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9982859101816935,
"eval_steps": 0,
"global_step": 182,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005485087418580734,
"grad_norm": 5.038297822406707,
"learning_rate": 2.6315789473684213e-07,
"logits/chosen": -0.3854110836982727,
"logits/rejected": -0.38843637704849243,
"logps/chosen": -0.5867404937744141,
"logps/rejected": -0.7349259853363037,
"loss": 0.8549,
"odds_ratio_loss": 8.495767593383789,
"rewards/accuracies": 0.328125,
"rewards/chosen": -0.07349259406328201,
"rewards/margins": -0.014818555675446987,
"rewards/rejected": -0.05867404490709305,
"sft_loss": 0.00530597660690546,
"step": 1
},
{
"epoch": 0.010970174837161468,
"grad_norm": 3.8688701991936516,
"learning_rate": 5.263157894736843e-07,
"logits/chosen": -0.4200110137462616,
"logits/rejected": -0.4337027370929718,
"logps/chosen": -0.5888247489929199,
"logps/rejected": -0.7141146659851074,
"loss": 0.8261,
"odds_ratio_loss": 8.218369483947754,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.07141146808862686,
"rewards/margins": -0.01252899132668972,
"rewards/rejected": -0.05888247489929199,
"sft_loss": 0.004305172245949507,
"step": 2
},
{
"epoch": 0.0164552622557422,
"grad_norm": 3.4925619509791903,
"learning_rate": 7.894736842105263e-07,
"logits/chosen": -0.38829293847084045,
"logits/rejected": -0.36370420455932617,
"logps/chosen": -0.6846970319747925,
"logps/rejected": -0.6908957362174988,
"loss": 0.8057,
"odds_ratio_loss": 8.025218963623047,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.06908956915140152,
"rewards/margins": -0.0006198745686560869,
"rewards/rejected": -0.06846970319747925,
"sft_loss": 0.003192172385752201,
"step": 3
},
{
"epoch": 0.021940349674322936,
"grad_norm": 5.442119869368371,
"learning_rate": 1.0526315789473685e-06,
"logits/chosen": -0.4157654047012329,
"logits/rejected": -0.4449327886104584,
"logps/chosen": -0.6106168031692505,
"logps/rejected": -0.7167029976844788,
"loss": 0.8331,
"odds_ratio_loss": 8.275010108947754,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.0716702938079834,
"rewards/margins": -0.010608619078993797,
"rewards/rejected": -0.06106168404221535,
"sft_loss": 0.005572030786424875,
"step": 4
},
{
"epoch": 0.027425437092903668,
"grad_norm": 3.9040280718139004,
"learning_rate": 1.3157894736842106e-06,
"logits/chosen": -0.4705536961555481,
"logits/rejected": -0.448209285736084,
"logps/chosen": -0.5760213136672974,
"logps/rejected": -0.6794447898864746,
"loss": 0.7994,
"odds_ratio_loss": 7.961273193359375,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.0679444819688797,
"rewards/margins": -0.010342349298298359,
"rewards/rejected": -0.057602137327194214,
"sft_loss": 0.003320502582937479,
"step": 5
},
{
"epoch": 0.0329105245114844,
"grad_norm": 3.9490673466980417,
"learning_rate": 1.5789473684210526e-06,
"logits/chosen": -0.3221694231033325,
"logits/rejected": -0.405836820602417,
"logps/chosen": -0.6044615507125854,
"logps/rejected": -0.7186299562454224,
"loss": 0.8315,
"odds_ratio_loss": 8.270588874816895,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.07186298817396164,
"rewards/margins": -0.011416830122470856,
"rewards/rejected": -0.060446158051490784,
"sft_loss": 0.004453588742762804,
"step": 6
},
{
"epoch": 0.03839561193006513,
"grad_norm": 4.211003798628622,
"learning_rate": 1.8421052631578948e-06,
"logits/chosen": -0.4302704334259033,
"logits/rejected": -0.48908236622810364,
"logps/chosen": -0.6016876697540283,
"logps/rejected": -0.6534877419471741,
"loss": 0.7601,
"odds_ratio_loss": 7.552478313446045,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.06534876674413681,
"rewards/margins": -0.005180003587156534,
"rewards/rejected": -0.06016876921057701,
"sft_loss": 0.004846740514039993,
"step": 7
},
{
"epoch": 0.04388069934864587,
"grad_norm": 4.5970681547716055,
"learning_rate": 2.105263157894737e-06,
"logits/chosen": -0.4125446677207947,
"logits/rejected": -0.48786473274230957,
"logps/chosen": -0.5902884602546692,
"logps/rejected": -0.6903232336044312,
"loss": 0.8077,
"odds_ratio_loss": 8.029266357421875,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.06903232634067535,
"rewards/margins": -0.010003475472331047,
"rewards/rejected": -0.05902884528040886,
"sft_loss": 0.004780109040439129,
"step": 8
},
{
"epoch": 0.049365786767226603,
"grad_norm": 3.9830322243241154,
"learning_rate": 2.368421052631579e-06,
"logits/chosen": -0.4158952832221985,
"logits/rejected": -0.4516904354095459,
"logps/chosen": -0.49427568912506104,
"logps/rejected": -0.7636269927024841,
"loss": 0.8995,
"odds_ratio_loss": 8.943946838378906,
"rewards/accuracies": 0.265625,
"rewards/chosen": -0.07636269927024841,
"rewards/margins": -0.02693513222038746,
"rewards/rejected": -0.049427565187215805,
"sft_loss": 0.005144394934177399,
"step": 9
},
{
"epoch": 0.054850874185807336,
"grad_norm": 3.963656319575701,
"learning_rate": 2.631578947368421e-06,
"logits/chosen": -0.4935796558856964,
"logits/rejected": -0.4773009419441223,
"logps/chosen": -0.5309734344482422,
"logps/rejected": -0.7864019870758057,
"loss": 0.914,
"odds_ratio_loss": 9.082796096801758,
"rewards/accuracies": 0.296875,
"rewards/chosen": -0.07864020019769669,
"rewards/margins": -0.025542862713336945,
"rewards/rejected": -0.05309733748435974,
"sft_loss": 0.005673164501786232,
"step": 10
},
{
"epoch": 0.06033596160438807,
"grad_norm": 6.086739375903808,
"learning_rate": 2.8947368421052634e-06,
"logits/chosen": -0.49898672103881836,
"logits/rejected": -0.5067213773727417,
"logps/chosen": -0.5432000756263733,
"logps/rejected": -0.728182315826416,
"loss": 0.8437,
"odds_ratio_loss": 8.347268104553223,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.07281822711229324,
"rewards/margins": -0.018498217687010765,
"rewards/rejected": -0.05432000756263733,
"sft_loss": 0.00892576016485691,
"step": 11
},
{
"epoch": 0.0658210490229688,
"grad_norm": 2.6128540943717016,
"learning_rate": 3.157894736842105e-06,
"logits/chosen": -0.5385469198226929,
"logits/rejected": -0.5363799333572388,
"logps/chosen": -0.6628677248954773,
"logps/rejected": -0.7211645245552063,
"loss": 0.8365,
"odds_ratio_loss": 8.324950218200684,
"rewards/accuracies": 0.296875,
"rewards/chosen": -0.07211645692586899,
"rewards/margins": -0.005829682573676109,
"rewards/rejected": -0.06628676503896713,
"sft_loss": 0.003960596397519112,
"step": 12
},
{
"epoch": 0.07130613644154954,
"grad_norm": 3.092946841589385,
"learning_rate": 3.421052631578948e-06,
"logits/chosen": -0.5176294445991516,
"logits/rejected": -0.5164329409599304,
"logps/chosen": -0.5450438857078552,
"logps/rejected": -0.6731284856796265,
"loss": 0.7842,
"odds_ratio_loss": 7.793478012084961,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06731285154819489,
"rewards/margins": -0.012808457016944885,
"rewards/rejected": -0.054504383355379105,
"sft_loss": 0.004814960993826389,
"step": 13
},
{
"epoch": 0.07679122386013026,
"grad_norm": 2.8558270742996137,
"learning_rate": 3.6842105263157896e-06,
"logits/chosen": -0.6181472539901733,
"logits/rejected": -0.5952631831169128,
"logps/chosen": -0.6915932893753052,
"logps/rejected": -0.753544807434082,
"loss": 0.8682,
"odds_ratio_loss": 8.637476921081543,
"rewards/accuracies": 0.328125,
"rewards/chosen": -0.07535447925329208,
"rewards/margins": -0.006195154972374439,
"rewards/rejected": -0.06915932893753052,
"sft_loss": 0.0044339620508253574,
"step": 14
},
{
"epoch": 0.082276311278711,
"grad_norm": 3.7864729296015134,
"learning_rate": 3.947368421052632e-06,
"logits/chosen": -0.6190563440322876,
"logits/rejected": -0.5924232006072998,
"logps/chosen": -0.5566739439964294,
"logps/rejected": -0.7037681937217712,
"loss": 0.8093,
"odds_ratio_loss": 8.019332885742188,
"rewards/accuracies": 0.328125,
"rewards/chosen": -0.07037682831287384,
"rewards/margins": -0.014709431678056717,
"rewards/rejected": -0.05566739663481712,
"sft_loss": 0.007347959093749523,
"step": 15
},
{
"epoch": 0.08776139869729174,
"grad_norm": 3.0120281360552097,
"learning_rate": 4.210526315789474e-06,
"logits/chosen": -0.6849666833877563,
"logits/rejected": -0.6525503396987915,
"logps/chosen": -0.5696557760238647,
"logps/rejected": -0.7036406993865967,
"loss": 0.8093,
"odds_ratio_loss": 8.039867401123047,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.0703640729188919,
"rewards/margins": -0.013398496434092522,
"rewards/rejected": -0.05696558207273483,
"sft_loss": 0.005266908556222916,
"step": 16
},
{
"epoch": 0.09324648611587247,
"grad_norm": 2.6504291324403315,
"learning_rate": 4.473684210526316e-06,
"logits/chosen": -0.641123354434967,
"logits/rejected": -0.6362437605857849,
"logps/chosen": -0.6884087324142456,
"logps/rejected": -0.7263792157173157,
"loss": 0.8353,
"odds_ratio_loss": 8.31103515625,
"rewards/accuracies": 0.2734375,
"rewards/chosen": -0.07263792306184769,
"rewards/margins": -0.0037970547564327717,
"rewards/rejected": -0.0688408762216568,
"sft_loss": 0.004170028492808342,
"step": 17
},
{
"epoch": 0.09873157353445321,
"grad_norm": 2.886807767840121,
"learning_rate": 4.736842105263158e-06,
"logits/chosen": -0.6371440291404724,
"logits/rejected": -0.6085755825042725,
"logps/chosen": -0.6927422881126404,
"logps/rejected": -0.706704318523407,
"loss": 0.8013,
"odds_ratio_loss": 7.961912155151367,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.07067042589187622,
"rewards/margins": -0.0013961929362267256,
"rewards/rejected": -0.06927423179149628,
"sft_loss": 0.005079690366983414,
"step": 18
},
{
"epoch": 0.10421666095303393,
"grad_norm": 5.880825411085245,
"learning_rate": 5e-06,
"logits/chosen": -0.6107761859893799,
"logits/rejected": -0.6409167647361755,
"logps/chosen": -0.628943681716919,
"logps/rejected": -0.6733644008636475,
"loss": 0.7634,
"odds_ratio_loss": 7.441443920135498,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06733644008636475,
"rewards/margins": -0.004442068748176098,
"rewards/rejected": -0.06289438158273697,
"sft_loss": 0.01928992196917534,
"step": 19
},
{
"epoch": 0.10970174837161467,
"grad_norm": 2.5047291630264965,
"learning_rate": 4.999535676028338e-06,
"logits/chosen": -0.53981614112854,
"logits/rejected": -0.5691806674003601,
"logps/chosen": -0.5706157088279724,
"logps/rejected": -0.6909669637680054,
"loss": 0.7906,
"odds_ratio_loss": 7.871767997741699,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.06909669935703278,
"rewards/margins": -0.012035122141242027,
"rewards/rejected": -0.0570615753531456,
"sft_loss": 0.003470680210739374,
"step": 20
},
{
"epoch": 0.11518683579019541,
"grad_norm": 2.731427921236176,
"learning_rate": 4.998142876590749e-06,
"logits/chosen": -0.6109368801116943,
"logits/rejected": -0.5822383165359497,
"logps/chosen": -0.5740761756896973,
"logps/rejected": -0.7196378111839294,
"loss": 0.8229,
"odds_ratio_loss": 8.185101509094238,
"rewards/accuracies": 0.3203125,
"rewards/chosen": -0.07196377962827682,
"rewards/margins": -0.014556167647242546,
"rewards/rejected": -0.05740761756896973,
"sft_loss": 0.004432335030287504,
"step": 21
},
{
"epoch": 0.12067192320877614,
"grad_norm": 2.6457518569851914,
"learning_rate": 4.9958221190553705e-06,
"logits/chosen": -0.6459155082702637,
"logits/rejected": -0.6224872469902039,
"logps/chosen": -0.9075093269348145,
"logps/rejected": -0.6858919858932495,
"loss": 0.7814,
"odds_ratio_loss": 7.773059844970703,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06858920305967331,
"rewards/margins": 0.02216172218322754,
"rewards/rejected": -0.09075092524290085,
"sft_loss": 0.004068214446306229,
"step": 22
},
{
"epoch": 0.12615701062735687,
"grad_norm": 2.3568903077737398,
"learning_rate": 4.992574265488883e-06,
"logits/chosen": -0.596120297908783,
"logits/rejected": -0.5599789619445801,
"logps/chosen": -0.6051114201545715,
"logps/rejected": -0.6642252206802368,
"loss": 0.7587,
"odds_ratio_loss": 7.555657386779785,
"rewards/accuracies": 0.3203125,
"rewards/chosen": -0.06642251461744308,
"rewards/margins": -0.0059113758616149426,
"rewards/rejected": -0.06051114201545715,
"sft_loss": 0.0031609206926077604,
"step": 23
},
{
"epoch": 0.1316420980459376,
"grad_norm": 2.4794879192215245,
"learning_rate": 4.988400522336304e-06,
"logits/chosen": -0.6102786064147949,
"logits/rejected": -0.5751500129699707,
"logps/chosen": -0.6058512926101685,
"logps/rejected": -0.6483108997344971,
"loss": 0.7446,
"odds_ratio_loss": 7.4141364097595215,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06483108550310135,
"rewards/margins": -0.004245956428349018,
"rewards/rejected": -0.060585130006074905,
"sft_loss": 0.003168251132592559,
"step": 24
},
{
"epoch": 0.13712718546451835,
"grad_norm": 2.6638728856099076,
"learning_rate": 4.9833024399728295e-06,
"logits/chosen": -0.6613443493843079,
"logits/rejected": -0.6328415870666504,
"logps/chosen": -0.6213655471801758,
"logps/rejected": -0.6468154191970825,
"loss": 0.7378,
"odds_ratio_loss": 7.336475372314453,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06468154489994049,
"rewards/margins": -0.0025449953973293304,
"rewards/rejected": -0.06213654577732086,
"sft_loss": 0.004186041187494993,
"step": 25
},
{
"epoch": 0.14261227288309908,
"grad_norm": 2.487669137174005,
"learning_rate": 4.9772819121279395e-06,
"logits/chosen": -0.6384425759315491,
"logits/rejected": -0.6189238429069519,
"logps/chosen": -0.5178334712982178,
"logps/rejected": -0.6201947927474976,
"loss": 0.7189,
"odds_ratio_loss": 7.1527557373046875,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.0620194748044014,
"rewards/margins": -0.010236131027340889,
"rewards/rejected": -0.05178334563970566,
"sft_loss": 0.00362430140376091,
"step": 26
},
{
"epoch": 0.1480973603016798,
"grad_norm": 2.3790278282662314,
"learning_rate": 4.970341175181957e-06,
"logits/chosen": -0.6167687177658081,
"logits/rejected": -0.6090523600578308,
"logps/chosen": -0.5448204278945923,
"logps/rejected": -0.660010576248169,
"loss": 0.7598,
"odds_ratio_loss": 7.5589399337768555,
"rewards/accuracies": 0.3203125,
"rewards/chosen": -0.06600106507539749,
"rewards/margins": -0.011519018560647964,
"rewards/rejected": -0.05448204651474953,
"sft_loss": 0.00385806686244905,
"step": 27
},
{
"epoch": 0.15358244772026053,
"grad_norm": 2.6617242551359275,
"learning_rate": 4.9624828073353144e-06,
"logits/chosen": -0.6628867387771606,
"logits/rejected": -0.642227053642273,
"logps/chosen": -0.5881469249725342,
"logps/rejected": -0.6607677340507507,
"loss": 0.7543,
"odds_ratio_loss": 7.503334045410156,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06607677042484283,
"rewards/margins": -0.007262084167450666,
"rewards/rejected": -0.05881468579173088,
"sft_loss": 0.003945589065551758,
"step": 28
},
{
"epoch": 0.15906753513884128,
"grad_norm": 5.959951004497131,
"learning_rate": 4.95370972765087e-06,
"logits/chosen": -0.6791335344314575,
"logits/rejected": -0.6436203122138977,
"logps/chosen": -0.6152352094650269,
"logps/rejected": -0.7523278594017029,
"loss": 0.853,
"odds_ratio_loss": 8.355252265930176,
"rewards/accuracies": 0.28125,
"rewards/chosen": -0.07523278146982193,
"rewards/margins": -0.013709258288145065,
"rewards/rejected": -0.06152352690696716,
"sft_loss": 0.017459843307733536,
"step": 29
},
{
"epoch": 0.164552622557422,
"grad_norm": 2.3211607809375328,
"learning_rate": 4.944025194969586e-06,
"logits/chosen": -0.6864458322525024,
"logits/rejected": -0.6491591930389404,
"logps/chosen": -0.591070294380188,
"logps/rejected": -0.6974666118621826,
"loss": 0.7936,
"odds_ratio_loss": 7.89208984375,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06974666565656662,
"rewards/margins": -0.010639630258083344,
"rewards/rejected": -0.05910703167319298,
"sft_loss": 0.004418414086103439,
"step": 30
},
{
"epoch": 0.17003770997600273,
"grad_norm": 2.4315181303090307,
"learning_rate": 4.933432806700004e-06,
"logits/chosen": -0.6587315797805786,
"logits/rejected": -0.6222696304321289,
"logps/chosen": -0.6303814053535461,
"logps/rejected": -0.7028146982192993,
"loss": 0.8067,
"odds_ratio_loss": 8.026537895202637,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.07028146833181381,
"rewards/margins": -0.007243326865136623,
"rewards/rejected": -0.06303813308477402,
"sft_loss": 0.004071646369993687,
"step": 31
},
{
"epoch": 0.17552279739458349,
"grad_norm": 2.210136151504232,
"learning_rate": 4.921936497481956e-06,
"logits/chosen": -0.7069422006607056,
"logits/rejected": -0.6679366230964661,
"logps/chosen": -0.5178747177124023,
"logps/rejected": -0.6546376347541809,
"loss": 0.7551,
"odds_ratio_loss": 7.520661354064941,
"rewards/accuracies": 0.3515625,
"rewards/chosen": -0.06546376645565033,
"rewards/margins": -0.013676293194293976,
"rewards/rejected": -0.05178747698664665,
"sft_loss": 0.003003381658345461,
"step": 32
},
{
"epoch": 0.1810078848131642,
"grad_norm": 3.4053506226819574,
"learning_rate": 4.909540537725007e-06,
"logits/chosen": -0.6776018142700195,
"logits/rejected": -0.6548346877098083,
"logps/chosen": -0.6834210157394409,
"logps/rejected": -0.6428653001785278,
"loss": 0.7378,
"odds_ratio_loss": 7.308858871459961,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.06428653001785278,
"rewards/margins": 0.004055576398968697,
"rewards/rejected": -0.06834210455417633,
"sft_loss": 0.006872784812003374,
"step": 33
},
{
"epoch": 0.18649297223174494,
"grad_norm": 2.539813335312958,
"learning_rate": 4.8962495320221714e-06,
"logits/chosen": -0.7110268473625183,
"logits/rejected": -0.6500384211540222,
"logps/chosen": -0.5525631308555603,
"logps/rejected": -0.673240065574646,
"loss": 0.7743,
"odds_ratio_loss": 7.700582027435303,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06732401251792908,
"rewards/margins": -0.012067697942256927,
"rewards/rejected": -0.05525631457567215,
"sft_loss": 0.0042497809045016766,
"step": 34
},
{
"epoch": 0.1919780596503257,
"grad_norm": 2.655970243707059,
"learning_rate": 4.8820684174394935e-06,
"logits/chosen": -0.7138683199882507,
"logits/rejected": -0.6818023920059204,
"logps/chosen": -0.5597530603408813,
"logps/rejected": -0.6535578966140747,
"loss": 0.7491,
"odds_ratio_loss": 7.4469194412231445,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.06535579264163971,
"rewards/margins": -0.009380483068525791,
"rewards/rejected": -0.0559752993285656,
"sft_loss": 0.004387051332741976,
"step": 35
},
{
"epoch": 0.19746314706890641,
"grad_norm": 2.5709022001566275,
"learning_rate": 4.867002461682129e-06,
"logits/chosen": -0.6977266073226929,
"logits/rejected": -0.6760712265968323,
"logps/chosen": -0.5894060134887695,
"logps/rejected": -0.7010530829429626,
"loss": 0.8056,
"odds_ratio_loss": 8.00703239440918,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.07010531425476074,
"rewards/margins": -0.011164708994328976,
"rewards/rejected": -0.058940596878528595,
"sft_loss": 0.004914172925055027,
"step": 36
},
{
"epoch": 0.20294823448748714,
"grad_norm": 2.3705551951599895,
"learning_rate": 4.851057261137608e-06,
"logits/chosen": -0.69024658203125,
"logits/rejected": -0.6706061363220215,
"logps/chosen": -0.5469837784767151,
"logps/rejected": -0.6902478933334351,
"loss": 0.7905,
"odds_ratio_loss": 7.859821319580078,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.06902480125427246,
"rewards/margins": -0.014326417818665504,
"rewards/rejected": -0.05469837784767151,
"sft_loss": 0.0044726720079779625,
"step": 37
},
{
"epoch": 0.20843332190606786,
"grad_norm": 5.234021982751859,
"learning_rate": 4.8342387387970105e-06,
"logits/chosen": -0.6684221625328064,
"logits/rejected": -0.6224137544631958,
"logps/chosen": -0.6020541191101074,
"logps/rejected": -0.6657444834709167,
"loss": 0.7536,
"odds_ratio_loss": 7.442554473876953,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.06657445430755615,
"rewards/margins": -0.0063690319657325745,
"rewards/rejected": -0.06020541861653328,
"sft_loss": 0.009305169805884361,
"step": 38
},
{
"epoch": 0.21391840932464862,
"grad_norm": 2.0334062957234833,
"learning_rate": 4.816553142054806e-06,
"logits/chosen": -0.6587538719177246,
"logits/rejected": -0.6190416216850281,
"logps/chosen": -0.5317018032073975,
"logps/rejected": -0.6559545993804932,
"loss": 0.7589,
"odds_ratio_loss": 7.557468414306641,
"rewards/accuracies": 0.328125,
"rewards/chosen": -0.06559545546770096,
"rewards/margins": -0.012425270862877369,
"rewards/rejected": -0.053170185536146164,
"sft_loss": 0.003142669564113021,
"step": 39
},
{
"epoch": 0.21940349674322934,
"grad_norm": 2.615502723756528,
"learning_rate": 4.798007040388212e-06,
"logits/chosen": -0.6537474393844604,
"logits/rejected": -0.6558958888053894,
"logps/chosen": -0.5594260692596436,
"logps/rejected": -0.7137230634689331,
"loss": 0.8176,
"odds_ratio_loss": 8.131279945373535,
"rewards/accuracies": 0.296875,
"rewards/chosen": -0.07137230783700943,
"rewards/margins": -0.015429697930812836,
"rewards/rejected": -0.055942609906196594,
"sft_loss": 0.004487923812121153,
"step": 40
},
{
"epoch": 0.22488858416181007,
"grad_norm": 3.147981874003909,
"learning_rate": 4.778607322916896e-06,
"logits/chosen": -0.7805840969085693,
"logits/rejected": -0.7332777976989746,
"logps/chosen": -0.5440269708633423,
"logps/rejected": -0.6570194959640503,
"loss": 0.755,
"odds_ratio_loss": 7.461404800415039,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.06570195406675339,
"rewards/margins": -0.011299250647425652,
"rewards/rejected": -0.05440270155668259,
"sft_loss": 0.008839858695864677,
"step": 41
},
{
"epoch": 0.23037367158039082,
"grad_norm": 2.4132096188646868,
"learning_rate": 4.7583611958439514e-06,
"logits/chosen": -0.6508989334106445,
"logits/rejected": -0.6227612495422363,
"logps/chosen": -0.6552981734275818,
"logps/rejected": -0.7332005500793457,
"loss": 0.8445,
"odds_ratio_loss": 8.40545654296875,
"rewards/accuracies": 0.2734375,
"rewards/chosen": -0.07332006096839905,
"rewards/margins": -0.007790243253111839,
"rewards/rejected": -0.06552981585264206,
"sft_loss": 0.0039908913895487785,
"step": 42
},
{
"epoch": 0.23585875899897155,
"grad_norm": 2.029276338522748,
"learning_rate": 4.7372761797790836e-06,
"logits/chosen": -0.6611707210540771,
"logits/rejected": -0.6425488591194153,
"logps/chosen": -0.5845383405685425,
"logps/rejected": -0.6395372748374939,
"loss": 0.7307,
"odds_ratio_loss": 7.273993968963623,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06395373493432999,
"rewards/margins": -0.005499903112649918,
"rewards/rejected": -0.05845382809638977,
"sft_loss": 0.003296809270977974,
"step": 43
},
{
"epoch": 0.24134384641755227,
"grad_norm": 2.334850850202144,
"learning_rate": 4.715360106945015e-06,
"logits/chosen": -0.6538242101669312,
"logits/rejected": -0.6651148796081543,
"logps/chosen": -0.5922138690948486,
"logps/rejected": -0.6789554357528687,
"loss": 0.7687,
"odds_ratio_loss": 7.646347999572754,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.0678955465555191,
"rewards/margins": -0.008674154058098793,
"rewards/rejected": -0.059221383184194565,
"sft_loss": 0.004072089679539204,
"step": 44
},
{
"epoch": 0.24682893383613302,
"grad_norm": 3.278246254509892,
"learning_rate": 4.6926211182681295e-06,
"logits/chosen": -0.7525214552879333,
"logits/rejected": -0.7238048315048218,
"logps/chosen": -0.6908121109008789,
"logps/rejected": -0.6656503677368164,
"loss": 0.7586,
"odds_ratio_loss": 7.535221099853516,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06656503677368164,
"rewards/margins": 0.0025161777157336473,
"rewards/rejected": -0.06908121705055237,
"sft_loss": 0.005099683068692684,
"step": 45
},
{
"epoch": 0.25231402125471375,
"grad_norm": 2.1664816320305897,
"learning_rate": 4.669067660354456e-06,
"logits/chosen": -0.6813299059867859,
"logits/rejected": -0.6507269144058228,
"logps/chosen": -0.5852293372154236,
"logps/rejected": -0.6381067633628845,
"loss": 0.726,
"odds_ratio_loss": 7.226728439331055,
"rewards/accuracies": 0.3515625,
"rewards/chosen": -0.06381067633628845,
"rewards/margins": -0.005287742242217064,
"rewards/rejected": -0.05852293595671654,
"sft_loss": 0.003340219147503376,
"step": 46
},
{
"epoch": 0.2577991086732945,
"grad_norm": 2.3755468836145535,
"learning_rate": 4.644708482352093e-06,
"logits/chosen": -0.7196726202964783,
"logits/rejected": -0.671117901802063,
"logps/chosen": -0.496429979801178,
"logps/rejected": -0.6285478472709656,
"loss": 0.7368,
"odds_ratio_loss": 7.332725524902344,
"rewards/accuracies": 0.3203125,
"rewards/chosen": -0.06285478919744492,
"rewards/margins": -0.013211790472269058,
"rewards/rejected": -0.04964299499988556,
"sft_loss": 0.003541831858456135,
"step": 47
},
{
"epoch": 0.2632841960918752,
"grad_norm": 1.882676786275039,
"learning_rate": 4.619552632701263e-06,
"logits/chosen": -0.7029179334640503,
"logits/rejected": -0.6741898059844971,
"logps/chosen": -0.5508084297180176,
"logps/rejected": -0.6012480854988098,
"loss": 0.6905,
"odds_ratio_loss": 6.879641532897949,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06012480705976486,
"rewards/margins": -0.005043962970376015,
"rewards/rejected": -0.055080845952034,
"sft_loss": 0.0025444331113249063,
"step": 48
},
{
"epoch": 0.2687692835104559,
"grad_norm": 2.3230672316134244,
"learning_rate": 4.5936094557731815e-06,
"logits/chosen": -0.6754356622695923,
"logits/rejected": -0.6543477177619934,
"logps/chosen": -0.5631517171859741,
"logps/rejected": -0.6714473366737366,
"loss": 0.7666,
"odds_ratio_loss": 7.623527526855469,
"rewards/accuracies": 0.328125,
"rewards/chosen": -0.0671447217464447,
"rewards/margins": -0.010829559527337551,
"rewards/rejected": -0.05631516873836517,
"sft_loss": 0.004294781945645809,
"step": 49
},
{
"epoch": 0.2742543709290367,
"grad_norm": 2.3609733896189353,
"learning_rate": 4.566888588399007e-06,
"logits/chosen": -0.6326004266738892,
"logits/rejected": -0.6783662438392639,
"logps/chosen": -0.6179240345954895,
"logps/rejected": -0.64500892162323,
"loss": 0.7366,
"odds_ratio_loss": 7.319670677185059,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.06450089812278748,
"rewards/margins": -0.002708489540964365,
"rewards/rejected": -0.06179240718483925,
"sft_loss": 0.0045843422412872314,
"step": 50
},
{
"epoch": 0.27973945834761743,
"grad_norm": 2.150795076150455,
"learning_rate": 4.539399956290152e-06,
"logits/chosen": -0.6697653532028198,
"logits/rejected": -0.6334677934646606,
"logps/chosen": -0.5712593197822571,
"logps/rejected": -0.5926052927970886,
"loss": 0.6852,
"odds_ratio_loss": 6.818970680236816,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.0592605359852314,
"rewards/margins": -0.002134602749720216,
"rewards/rejected": -0.05712592974305153,
"sft_loss": 0.003330723848193884,
"step": 51
},
{
"epoch": 0.28522454576619816,
"grad_norm": 2.2865734329168563,
"learning_rate": 4.511153770351288e-06,
"logits/chosen": -0.7038233280181885,
"logits/rejected": -0.6797916889190674,
"logps/chosen": -0.626400351524353,
"logps/rejected": -0.6849571466445923,
"loss": 0.7765,
"odds_ratio_loss": 7.725215435028076,
"rewards/accuracies": 0.328125,
"rewards/chosen": -0.06849571317434311,
"rewards/margins": -0.0058556715957820415,
"rewards/rejected": -0.06264004111289978,
"sft_loss": 0.00396262900903821,
"step": 52
},
{
"epoch": 0.2907096331847789,
"grad_norm": 2.3920025552439967,
"learning_rate": 4.482160522887404e-06,
"logits/chosen": -0.6559309363365173,
"logits/rejected": -0.6758289337158203,
"logps/chosen": -0.6616750955581665,
"logps/rejected": -0.6394928693771362,
"loss": 0.7281,
"odds_ratio_loss": 7.238113880157471,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.06394927203655243,
"rewards/margins": 0.0022182257380336523,
"rewards/rejected": -0.06616750359535217,
"sft_loss": 0.004315241239964962,
"step": 53
},
{
"epoch": 0.2961947206033596,
"grad_norm": 2.5117956914717032,
"learning_rate": 4.452430983706351e-06,
"logits/chosen": -0.7268598079681396,
"logits/rejected": -0.7092704772949219,
"logps/chosen": -0.549469530582428,
"logps/rejected": -0.6699557304382324,
"loss": 0.7657,
"odds_ratio_loss": 7.608267784118652,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06699557602405548,
"rewards/margins": -0.012048620730638504,
"rewards/rejected": -0.054946959018707275,
"sft_loss": 0.004888926167041063,
"step": 54
},
{
"epoch": 0.30167980802194033,
"grad_norm": 6.1164968534949375,
"learning_rate": 4.421976196118297e-06,
"logits/chosen": -0.6996357440948486,
"logits/rejected": -0.6125066876411438,
"logps/chosen": -0.5666108131408691,
"logps/rejected": -0.7606990933418274,
"loss": 0.8647,
"odds_ratio_loss": 7.164684295654297,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.0760699063539505,
"rewards/margins": -0.019408833235502243,
"rewards/rejected": -0.056661076843738556,
"sft_loss": 0.14825321733951569,
"step": 55
},
{
"epoch": 0.30716489544052106,
"grad_norm": 2.3449325969971553,
"learning_rate": 4.390807472833585e-06,
"logits/chosen": -0.7196471095085144,
"logits/rejected": -0.6497766375541687,
"logps/chosen": -0.5705811977386475,
"logps/rejected": -0.6152413487434387,
"loss": 0.7056,
"odds_ratio_loss": 7.016576766967773,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.061524130403995514,
"rewards/margins": -0.004466014914214611,
"rewards/rejected": -0.05705811828374863,
"sft_loss": 0.003966608550399542,
"step": 56
},
{
"epoch": 0.31264998285910184,
"grad_norm": 1.9758244727864838,
"learning_rate": 4.358936391760524e-06,
"logits/chosen": -0.6982048749923706,
"logits/rejected": -0.6489894986152649,
"logps/chosen": -0.7246107459068298,
"logps/rejected": -0.6382527947425842,
"loss": 0.7298,
"odds_ratio_loss": 7.263551235198975,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06382527947425842,
"rewards/margins": 0.008635802194476128,
"rewards/rejected": -0.0724610835313797,
"sft_loss": 0.0034022387117147446,
"step": 57
},
{
"epoch": 0.31813507027768256,
"grad_norm": 2.174135911231623,
"learning_rate": 4.32637479170467e-06,
"logits/chosen": -0.7073652148246765,
"logits/rejected": -0.6690998077392578,
"logps/chosen": -0.6008009314537048,
"logps/rejected": -0.6260058879852295,
"loss": 0.7136,
"odds_ratio_loss": 7.0961079597473145,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.06260059028863907,
"rewards/margins": -0.0025204988196492195,
"rewards/rejected": -0.060080088675022125,
"sft_loss": 0.00403111707419157,
"step": 58
},
{
"epoch": 0.3236201576962633,
"grad_norm": 2.168405994153748,
"learning_rate": 4.293134767971193e-06,
"logits/chosen": -0.7020463347434998,
"logits/rejected": -0.6942731142044067,
"logps/chosen": -0.5773230791091919,
"logps/rejected": -0.6443250179290771,
"loss": 0.7339,
"odds_ratio_loss": 7.302745342254639,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.06443249434232712,
"rewards/margins": -0.006700189784169197,
"rewards/rejected": -0.05773231014609337,
"sft_loss": 0.003663026262074709,
"step": 59
},
{
"epoch": 0.329105245114844,
"grad_norm": 2.185211880662804,
"learning_rate": 4.259228667871963e-06,
"logits/chosen": -0.6689984202384949,
"logits/rejected": -0.675392746925354,
"logps/chosen": -0.6228610277175903,
"logps/rejected": -0.6264554262161255,
"loss": 0.7157,
"odds_ratio_loss": 7.120779991149902,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06264554709196091,
"rewards/margins": -0.0003594460431486368,
"rewards/rejected": -0.062286097556352615,
"sft_loss": 0.0035854382440447807,
"step": 60
},
{
"epoch": 0.33459033253342474,
"grad_norm": 2.57070026430243,
"learning_rate": 4.22466908613903e-06,
"logits/chosen": -0.702937126159668,
"logits/rejected": -0.7012047171592712,
"logps/chosen": -0.7283627986907959,
"logps/rejected": -0.6880404949188232,
"loss": 0.7811,
"odds_ratio_loss": 7.746993064880371,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.0688040480017662,
"rewards/margins": 0.0040322281420230865,
"rewards/rejected": -0.07283627241849899,
"sft_loss": 0.006447223015129566,
"step": 61
},
{
"epoch": 0.34007541995200546,
"grad_norm": 4.244463596912575,
"learning_rate": 4.189468860246192e-06,
"logits/chosen": -0.6791540384292603,
"logits/rejected": -0.6635554432868958,
"logps/chosen": -0.6107332706451416,
"logps/rejected": -0.6247188448905945,
"loss": 0.7142,
"odds_ratio_loss": 7.000344753265381,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06247188523411751,
"rewards/margins": -0.001398557680658996,
"rewards/rejected": -0.06107332557439804,
"sft_loss": 0.014190858229994774,
"step": 62
},
{
"epoch": 0.34556050737058625,
"grad_norm": 2.025728542662067,
"learning_rate": 4.153641065640402e-06,
"logits/chosen": -0.693527340888977,
"logits/rejected": -0.6523553133010864,
"logps/chosen": -0.5719731450080872,
"logps/rejected": -0.6185833811759949,
"loss": 0.7056,
"odds_ratio_loss": 7.020816802978516,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.06185833364725113,
"rewards/margins": -0.004661021754145622,
"rewards/rejected": -0.057197313755750656,
"sft_loss": 0.003522280603647232,
"step": 63
},
{
"epoch": 0.35104559478916697,
"grad_norm": 2.295177803143149,
"learning_rate": 4.1171990108847705e-06,
"logits/chosen": -0.6868746280670166,
"logits/rejected": -0.6699408888816833,
"logps/chosen": -0.6467013359069824,
"logps/rejected": -0.6266091465950012,
"loss": 0.719,
"odds_ratio_loss": 7.1493024826049805,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06266091763973236,
"rewards/margins": 0.002009219955652952,
"rewards/rejected": -0.0646701380610466,
"sft_loss": 0.004113649483770132,
"step": 64
},
{
"epoch": 0.3565306822077477,
"grad_norm": 2.2636310766420693,
"learning_rate": 4.080156232714976e-06,
"logits/chosen": -0.6988135576248169,
"logits/rejected": -0.6200304627418518,
"logps/chosen": -0.560631513595581,
"logps/rejected": -0.6326756477355957,
"loss": 0.7288,
"odds_ratio_loss": 7.247946262359619,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06326755881309509,
"rewards/margins": -0.007204408757388592,
"rewards/rejected": -0.056063152849674225,
"sft_loss": 0.004002667032182217,
"step": 65
},
{
"epoch": 0.3620157696263284,
"grad_norm": 2.3726953806807543,
"learning_rate": 4.0425264910109245e-06,
"logits/chosen": -0.6342322826385498,
"logits/rejected": -0.6143465042114258,
"logps/chosen": -0.5673171877861023,
"logps/rejected": -0.6150503158569336,
"loss": 0.704,
"odds_ratio_loss": 6.996849060058594,
"rewards/accuracies": 0.3515625,
"rewards/chosen": -0.0615050233900547,
"rewards/margins": -0.004773305729031563,
"rewards/rejected": -0.05673171579837799,
"sft_loss": 0.004326392896473408,
"step": 66
},
{
"epoch": 0.36750085704490915,
"grad_norm": 2.3264926175994343,
"learning_rate": 4.004323763685511e-06,
"logits/chosen": -0.6961945295333862,
"logits/rejected": -0.6595050096511841,
"logps/chosen": -0.7507973909378052,
"logps/rejected": -0.6752977967262268,
"loss": 0.7631,
"odds_ratio_loss": 7.585504531860352,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.06752977520227432,
"rewards/margins": 0.0075499615631997585,
"rewards/rejected": -0.07507973909378052,
"sft_loss": 0.004573096055537462,
"step": 67
},
{
"epoch": 0.37298594446348987,
"grad_norm": 2.1356607945545245,
"learning_rate": 3.965562241492401e-06,
"logits/chosen": -0.6466645002365112,
"logits/rejected": -0.5876697301864624,
"logps/chosen": -0.6032683253288269,
"logps/rejected": -0.6798368692398071,
"loss": 0.7748,
"odds_ratio_loss": 7.713741779327393,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06798368692398071,
"rewards/margins": -0.007656855508685112,
"rewards/rejected": -0.06032683700323105,
"sft_loss": 0.0033814627677202225,
"step": 68
},
{
"epoch": 0.3784710318820706,
"grad_norm": 2.2504419735299632,
"learning_rate": 3.92625632275474e-06,
"logits/chosen": -0.6422359943389893,
"logits/rejected": -0.635747492313385,
"logps/chosen": -0.6667078137397766,
"logps/rejected": -0.6623092889785767,
"loss": 0.7479,
"odds_ratio_loss": 7.437192916870117,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06623092293739319,
"rewards/margins": 0.00043986161472275853,
"rewards/rejected": -0.06667079031467438,
"sft_loss": 0.004172264598309994,
"step": 69
},
{
"epoch": 0.3839561193006514,
"grad_norm": 2.305721199557074,
"learning_rate": 3.886420608016767e-06,
"logits/chosen": -0.6914317607879639,
"logits/rejected": -0.6497669219970703,
"logps/chosen": -0.6576675176620483,
"logps/rejected": -0.6327704191207886,
"loss": 0.72,
"odds_ratio_loss": 7.161937236785889,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.06327703595161438,
"rewards/margins": 0.002489713719114661,
"rewards/rejected": -0.06576675176620483,
"sft_loss": 0.003831625683233142,
"step": 70
},
{
"epoch": 0.3894412067192321,
"grad_norm": 2.3551007707163962,
"learning_rate": 3.846069894620306e-06,
"logits/chosen": -0.6762869358062744,
"logits/rejected": -0.689199686050415,
"logps/chosen": -0.5313611030578613,
"logps/rejected": -0.6086665391921997,
"loss": 0.6972,
"odds_ratio_loss": 6.926117420196533,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06086665764451027,
"rewards/margins": -0.007730549667030573,
"rewards/rejected": -0.05313611030578613,
"sft_loss": 0.0045737153850495815,
"step": 71
},
{
"epoch": 0.39492629413781283,
"grad_norm": 2.556752307102043,
"learning_rate": 3.80521917120816e-06,
"logits/chosen": -0.6555116772651672,
"logits/rejected": -0.6547709107398987,
"logps/chosen": -0.624289333820343,
"logps/rejected": -0.6435787081718445,
"loss": 0.7314,
"odds_ratio_loss": 7.269766330718994,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06435786932706833,
"rewards/margins": -0.001928933197632432,
"rewards/rejected": -0.06242894381284714,
"sft_loss": 0.00443875789642334,
"step": 72
},
{
"epoch": 0.40041138155639355,
"grad_norm": 2.62244580300382,
"learning_rate": 3.7638836121564414e-06,
"logits/chosen": -0.633210301399231,
"logits/rejected": -0.6068264245986938,
"logps/chosen": -0.6534138917922974,
"logps/rejected": -0.6456267833709717,
"loss": 0.7342,
"odds_ratio_loss": 7.288712501525879,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.06456267833709717,
"rewards/margins": 0.0007787136128172278,
"rewards/rejected": -0.06534139811992645,
"sft_loss": 0.005336450412869453,
"step": 73
},
{
"epoch": 0.4058964689749743,
"grad_norm": 2.375173958989858,
"learning_rate": 3.72207857193791e-06,
"logits/chosen": -0.6621152758598328,
"logits/rejected": -0.6223613023757935,
"logps/chosen": -0.5641170144081116,
"logps/rejected": -0.6517760157585144,
"loss": 0.7414,
"odds_ratio_loss": 7.36290979385376,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06517761200666428,
"rewards/margins": -0.00876590609550476,
"rewards/rejected": -0.056411705911159515,
"sft_loss": 0.005075749009847641,
"step": 74
},
{
"epoch": 0.411381556393555,
"grad_norm": 2.178626918834999,
"learning_rate": 3.679819579418414e-06,
"logits/chosen": -0.6223607659339905,
"logits/rejected": -0.6354334354400635,
"logps/chosen": -0.6071732640266418,
"logps/rejected": -0.664821207523346,
"loss": 0.7537,
"odds_ratio_loss": 7.4999518394470215,
"rewards/accuracies": 0.3515625,
"rewards/chosen": -0.06648211926221848,
"rewards/margins": -0.005764788947999477,
"rewards/rejected": -0.06071732938289642,
"sft_loss": 0.003658156841993332,
"step": 75
},
{
"epoch": 0.41686664381213573,
"grad_norm": 2.029172062754319,
"learning_rate": 3.6371223320885492e-06,
"logits/chosen": -0.6456342339515686,
"logits/rejected": -0.6101202368736267,
"logps/chosen": -0.5387505888938904,
"logps/rejected": -0.653649091720581,
"loss": 0.748,
"odds_ratio_loss": 7.446066379547119,
"rewards/accuracies": 0.28125,
"rewards/chosen": -0.06536491215229034,
"rewards/margins": -0.011489855125546455,
"rewards/rejected": -0.05387505888938904,
"sft_loss": 0.0033676247112452984,
"step": 76
},
{
"epoch": 0.4223517312307165,
"grad_norm": 2.566739156357152,
"learning_rate": 3.5940026902326825e-06,
"logits/chosen": -0.6308640837669373,
"logits/rejected": -0.5544230937957764,
"logps/chosen": -0.6081095933914185,
"logps/rejected": -0.6010054349899292,
"loss": 0.686,
"odds_ratio_loss": 6.817296028137207,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.06010054424405098,
"rewards/margins": 0.000710406806319952,
"rewards/rejected": -0.06081094965338707,
"sft_loss": 0.00426267646253109,
"step": 77
},
{
"epoch": 0.42783681864929723,
"grad_norm": 2.2168093785628424,
"learning_rate": 3.550476671037505e-06,
"logits/chosen": -0.6168922781944275,
"logits/rejected": -0.6250343322753906,
"logps/chosen": -0.7373520135879517,
"logps/rejected": -0.6632636785507202,
"loss": 0.7541,
"odds_ratio_loss": 7.504645824432373,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06632636487483978,
"rewards/margins": 0.0074088433757424355,
"rewards/rejected": -0.07373520731925964,
"sft_loss": 0.003666748758405447,
"step": 78
},
{
"epoch": 0.43332190606787796,
"grad_norm": 2.0671981983555767,
"learning_rate": 3.5065604426422995e-06,
"logits/chosen": -0.6872790455818176,
"logits/rejected": -0.6633861064910889,
"logps/chosen": -0.6065261363983154,
"logps/rejected": -0.6005845069885254,
"loss": 0.6867,
"odds_ratio_loss": 6.836608409881592,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06005845218896866,
"rewards/margins": 0.0005941606359556317,
"rewards/rejected": -0.06065261363983154,
"sft_loss": 0.003060600720345974,
"step": 79
},
{
"epoch": 0.4388069934864587,
"grad_norm": 2.2579388709063477,
"learning_rate": 3.462270318133136e-06,
"logits/chosen": -0.7127959728240967,
"logits/rejected": -0.7076155543327332,
"logps/chosen": -0.6201507449150085,
"logps/rejected": -0.6548338532447815,
"loss": 0.7463,
"odds_ratio_loss": 7.421444892883301,
"rewards/accuracies": 0.296875,
"rewards/chosen": -0.06548339128494263,
"rewards/margins": -0.003468305105343461,
"rewards/rejected": -0.06201507896184921,
"sft_loss": 0.0041392529383301735,
"step": 80
},
{
"epoch": 0.4442920809050394,
"grad_norm": 2.2666354495836587,
"learning_rate": 3.4176227494832305e-06,
"logits/chosen": -0.5993680953979492,
"logits/rejected": -0.5843874216079712,
"logps/chosen": -0.5722631216049194,
"logps/rejected": -0.6499100923538208,
"loss": 0.7394,
"odds_ratio_loss": 7.35059118270874,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.06499100476503372,
"rewards/margins": -0.007764694280922413,
"rewards/rejected": -0.05722631514072418,
"sft_loss": 0.004326997324824333,
"step": 81
},
{
"epoch": 0.44977716832362014,
"grad_norm": 2.283880288724398,
"learning_rate": 3.3726343214417023e-06,
"logits/chosen": -0.6333982348442078,
"logits/rejected": -0.6226303577423096,
"logps/chosen": -0.7987644672393799,
"logps/rejected": -0.652344822883606,
"loss": 0.734,
"odds_ratio_loss": 7.291712284088135,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.0652344822883606,
"rewards/margins": 0.014641974121332169,
"rewards/rejected": -0.07987645268440247,
"sft_loss": 0.0048220655880868435,
"step": 82
},
{
"epoch": 0.4552622557422009,
"grad_norm": 2.1155189965315406,
"learning_rate": 3.327321745373021e-06,
"logits/chosen": -0.6671404242515564,
"logits/rejected": -0.6946146488189697,
"logps/chosen": -0.5693039894104004,
"logps/rejected": -0.6252968907356262,
"loss": 0.7121,
"odds_ratio_loss": 7.0859150886535645,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.06252970546483994,
"rewards/margins": -0.005599300377070904,
"rewards/rejected": -0.05693039298057556,
"sft_loss": 0.003511702874675393,
"step": 83
},
{
"epoch": 0.46074734316078164,
"grad_norm": 1.9540588513738342,
"learning_rate": 3.2817018530494164e-06,
"logits/chosen": -0.6938011646270752,
"logits/rejected": -0.6819745302200317,
"logps/chosen": -0.6028576493263245,
"logps/rejected": -0.6305505037307739,
"loss": 0.7166,
"odds_ratio_loss": 7.133669853210449,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06305506080389023,
"rewards/margins": -0.0027692890726029873,
"rewards/rejected": -0.060285769402980804,
"sft_loss": 0.003260136814787984,
"step": 84
},
{
"epoch": 0.46623243057936237,
"grad_norm": 2.3068483623141813,
"learning_rate": 3.2357915903985605e-06,
"logits/chosen": -0.669509768486023,
"logits/rejected": -0.6838399171829224,
"logps/chosen": -0.7373601198196411,
"logps/rejected": -0.63920658826828,
"loss": 0.7284,
"odds_ratio_loss": 7.239907264709473,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06392066925764084,
"rewards/margins": 0.009815343655645847,
"rewards/rejected": -0.07373600453138351,
"sft_loss": 0.004396573640406132,
"step": 85
},
{
"epoch": 0.4717175179979431,
"grad_norm": 2.400639181271588,
"learning_rate": 3.1896080112088477e-06,
"logits/chosen": -0.6363841891288757,
"logits/rejected": -0.6474689245223999,
"logps/chosen": -0.6146495938301086,
"logps/rejected": -0.665170431137085,
"loss": 0.7518,
"odds_ratio_loss": 7.468950271606445,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06651704013347626,
"rewards/margins": -0.005052081309258938,
"rewards/rejected": -0.061464957892894745,
"sft_loss": 0.00492935162037611,
"step": 86
},
{
"epoch": 0.4772026054165238,
"grad_norm": 6.788892165244669,
"learning_rate": 3.143168270794612e-06,
"logits/chosen": -0.7378703355789185,
"logits/rejected": -0.7363454699516296,
"logps/chosen": -0.792299747467041,
"logps/rejected": -0.6177616119384766,
"loss": 0.6952,
"odds_ratio_loss": 6.825002670288086,
"rewards/accuracies": 0.4453125,
"rewards/chosen": -0.061776161193847656,
"rewards/margins": 0.017453810200095177,
"rewards/rejected": -0.07922996580600739,
"sft_loss": 0.012708396650850773,
"step": 87
},
{
"epoch": 0.48268769283510454,
"grad_norm": 4.194375970507543,
"learning_rate": 3.0964896196236217e-06,
"logits/chosen": -0.6821984052658081,
"logits/rejected": -0.6771599054336548,
"logps/chosen": -0.6080144643783569,
"logps/rejected": -0.6563798189163208,
"loss": 0.7478,
"odds_ratio_loss": 7.3482136726379395,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.0656379908323288,
"rewards/margins": -0.004836536943912506,
"rewards/rejected": -0.06080144643783569,
"sft_loss": 0.012991908006370068,
"step": 88
},
{
"epoch": 0.48817278025368527,
"grad_norm": 3.2712874288459868,
"learning_rate": 3.0495893969092395e-06,
"logits/chosen": -0.7360261082649231,
"logits/rejected": -0.648233950138092,
"logps/chosen": -0.6474156975746155,
"logps/rejected": -0.6273612976074219,
"loss": 0.7148,
"odds_ratio_loss": 7.092808723449707,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -0.0627361312508583,
"rewards/margins": 0.002005442278459668,
"rewards/rejected": -0.0647415742278099,
"sft_loss": 0.005477006547152996,
"step": 89
},
{
"epoch": 0.49365786767226605,
"grad_norm": 3.363992426087972,
"learning_rate": 3.0024850241696128e-06,
"logits/chosen": -0.6951555013656616,
"logits/rejected": -0.6976528167724609,
"logps/chosen": -0.6175632476806641,
"logps/rejected": -0.6432512402534485,
"loss": 0.7269,
"odds_ratio_loss": 7.175661087036133,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.06432512402534485,
"rewards/margins": -0.0025687951128929853,
"rewards/rejected": -0.061756327748298645,
"sft_loss": 0.009336180053651333,
"step": 90
},
{
"epoch": 0.4991429550908468,
"grad_norm": 2.4802416219258125,
"learning_rate": 2.9551939987562866e-06,
"logits/chosen": -0.6312170028686523,
"logits/rejected": -0.6572614908218384,
"logps/chosen": -0.590150773525238,
"logps/rejected": -0.6796594858169556,
"loss": 0.7688,
"odds_ratio_loss": 7.620222568511963,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06796594709157944,
"rewards/margins": -0.008950873278081417,
"rewards/rejected": -0.059015076607465744,
"sft_loss": 0.006825140677392483,
"step": 91
},
{
"epoch": 0.5046280425094275,
"grad_norm": 2.0888605610235156,
"learning_rate": 2.907733887354657e-06,
"logits/chosen": -0.7382671236991882,
"logits/rejected": -0.704073965549469,
"logps/chosen": -0.60980224609375,
"logps/rejected": -0.5962368249893188,
"loss": 0.6756,
"odds_ratio_loss": 6.72347354888916,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.05962368845939636,
"rewards/margins": 0.0013565376866608858,
"rewards/rejected": -0.06098022684454918,
"sft_loss": 0.0032284611370414495,
"step": 92
},
{
"epoch": 0.5101131299280083,
"grad_norm": 2.594959628383668,
"learning_rate": 2.8601223194586613e-06,
"logits/chosen": -0.6301463842391968,
"logits/rejected": -0.6508597731590271,
"logps/chosen": -0.6760177612304688,
"logps/rejected": -0.6208183169364929,
"loss": 0.6984,
"odds_ratio_loss": 6.931189060211182,
"rewards/accuracies": 0.4453125,
"rewards/chosen": -0.06208183616399765,
"rewards/margins": 0.005519941449165344,
"rewards/rejected": -0.067601777613163,
"sft_loss": 0.0052444045431911945,
"step": 93
},
{
"epoch": 0.515598217346589,
"grad_norm": 2.4177461424114957,
"learning_rate": 2.8123769808221407e-06,
"logits/chosen": -0.7098735570907593,
"logits/rejected": -0.694360613822937,
"logps/chosen": -0.6068992614746094,
"logps/rejected": -0.672624945640564,
"loss": 0.7656,
"odds_ratio_loss": 7.604451656341553,
"rewards/accuracies": 0.3203125,
"rewards/chosen": -0.06726250052452087,
"rewards/margins": -0.006572564598172903,
"rewards/rejected": -0.060689933598041534,
"sft_loss": 0.005188749171793461,
"step": 94
},
{
"epoch": 0.5210833047651697,
"grad_norm": 2.1914520317020125,
"learning_rate": 2.7645156068893075e-06,
"logits/chosen": -0.6409396529197693,
"logits/rejected": -0.6979643702507019,
"logps/chosen": -0.534928560256958,
"logps/rejected": -0.6406592726707458,
"loss": 0.7345,
"odds_ratio_loss": 7.306841850280762,
"rewards/accuracies": 0.3515625,
"rewards/chosen": -0.06406592577695847,
"rewards/margins": -0.010573070496320724,
"rewards/rejected": -0.05349285155534744,
"sft_loss": 0.0038562421686947346,
"step": 95
},
{
"epoch": 0.5265683921837504,
"grad_norm": 2.0840677499320335,
"learning_rate": 2.716555976206748e-06,
"logits/chosen": -0.7479691505432129,
"logits/rejected": -0.723030149936676,
"logps/chosen": -0.5895535349845886,
"logps/rejected": -0.6176682114601135,
"loss": 0.7002,
"odds_ratio_loss": 6.960841655731201,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.06176682561635971,
"rewards/margins": -0.0028114626184105873,
"rewards/rejected": -0.0589553564786911,
"sft_loss": 0.004080518614500761,
"step": 96
},
{
"epoch": 0.5320534796023312,
"grad_norm": 2.444265846509054,
"learning_rate": 2.6685159038194202e-06,
"logits/chosen": -0.6860970258712769,
"logits/rejected": -0.6489076018333435,
"logps/chosen": -0.61269611120224,
"logps/rejected": -0.6757325530052185,
"loss": 0.7617,
"odds_ratio_loss": 7.5671162605285645,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06757325679063797,
"rewards/margins": -0.006303644739091396,
"rewards/rejected": -0.061269611120224,
"sft_loss": 0.0049641747027635574,
"step": 97
},
{
"epoch": 0.5375385670209119,
"grad_norm": 2.128773042167142,
"learning_rate": 2.6204132346530936e-06,
"logits/chosen": -0.7502190470695496,
"logits/rejected": -0.7040455937385559,
"logps/chosen": -0.7278321981430054,
"logps/rejected": -0.6399706602096558,
"loss": 0.7272,
"odds_ratio_loss": 7.2354841232299805,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.0639970600605011,
"rewards/margins": 0.008786162361502647,
"rewards/rejected": -0.0727832242846489,
"sft_loss": 0.00368284760043025,
"step": 98
},
{
"epoch": 0.5430236544394926,
"grad_norm": 2.5919828332695833,
"learning_rate": 2.572265836885682e-06,
"logits/chosen": -0.7643656134605408,
"logits/rejected": -0.6870676279067993,
"logps/chosen": -0.7556079030036926,
"logps/rejected": -0.6754346489906311,
"loss": 0.7635,
"odds_ratio_loss": 7.5766706466674805,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06754346191883087,
"rewards/margins": 0.008017327636480331,
"rewards/rejected": -0.0755607932806015,
"sft_loss": 0.005802985280752182,
"step": 99
},
{
"epoch": 0.5485087418580734,
"grad_norm": 2.449701609589594,
"learning_rate": 2.524091595309952e-06,
"logits/chosen": -0.701339602470398,
"logits/rejected": -0.6497517824172974,
"logps/chosen": -0.6323676705360413,
"logps/rejected": -0.633608341217041,
"loss": 0.7136,
"odds_ratio_loss": 7.087641716003418,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.06336082518100739,
"rewards/margins": -0.00012405950110405684,
"rewards/rejected": -0.0632367730140686,
"sft_loss": 0.004797851666808128,
"step": 100
},
{
"epoch": 0.5539938292766541,
"grad_norm": 1.8980955014875789,
"learning_rate": 2.475908404690049e-06,
"logits/chosen": -0.7044223546981812,
"logits/rejected": -0.6829949021339417,
"logps/chosen": -0.5901881456375122,
"logps/rejected": -0.5841273665428162,
"loss": 0.6646,
"odds_ratio_loss": 6.614431381225586,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.05841274932026863,
"rewards/margins": 0.0006060737650841475,
"rewards/rejected": -0.0590188167989254,
"sft_loss": 0.0031673426274210215,
"step": 101
},
{
"epoch": 0.5594789166952349,
"grad_norm": 2.283805508955311,
"learning_rate": 2.427734163114319e-06,
"logits/chosen": -0.7973791360855103,
"logits/rejected": -0.7453299164772034,
"logps/chosen": -0.5993779301643372,
"logps/rejected": -0.6565759778022766,
"loss": 0.7469,
"odds_ratio_loss": 7.423154354095459,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.0656576007604599,
"rewards/margins": -0.0057198042050004005,
"rewards/rejected": -0.059937797486782074,
"sft_loss": 0.004618693143129349,
"step": 102
},
{
"epoch": 0.5649640041138155,
"grad_norm": 2.1097973408223165,
"learning_rate": 2.3795867653469072e-06,
"logits/chosen": -0.7150146961212158,
"logits/rejected": -0.6962917447090149,
"logps/chosen": -0.6036943793296814,
"logps/rejected": -0.6037816405296326,
"loss": 0.6859,
"odds_ratio_loss": 6.8253865242004395,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.06037816405296326,
"rewards/margins": -8.72323289513588e-06,
"rewards/rejected": -0.06036944314837456,
"sft_loss": 0.0033163288608193398,
"step": 103
},
{
"epoch": 0.5704490915323963,
"grad_norm": 2.261348542854153,
"learning_rate": 2.3314840961805806e-06,
"logits/chosen": -0.7404282689094543,
"logits/rejected": -0.7224586606025696,
"logps/chosen": -0.6297097206115723,
"logps/rejected": -0.6101536154747009,
"loss": 0.6902,
"odds_ratio_loss": 6.866119384765625,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06101536750793457,
"rewards/margins": 0.0019556120969355106,
"rewards/rejected": -0.06297097355127335,
"sft_loss": 0.0035803657956421375,
"step": 104
},
{
"epoch": 0.575934178950977,
"grad_norm": 2.3462527146244394,
"learning_rate": 2.2834440237932537e-06,
"logits/chosen": -0.7421097755432129,
"logits/rejected": -0.7392350435256958,
"logps/chosen": -0.5688321590423584,
"logps/rejected": -0.5941362380981445,
"loss": 0.676,
"odds_ratio_loss": 6.721547603607178,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.05941362306475639,
"rewards/margins": -0.0025304073933511972,
"rewards/rejected": -0.05688321590423584,
"sft_loss": 0.003852433292195201,
"step": 105
},
{
"epoch": 0.5814192663695578,
"grad_norm": 2.2368897755771844,
"learning_rate": 2.2354843931106933e-06,
"logits/chosen": -0.6827410459518433,
"logits/rejected": -0.6356707811355591,
"logps/chosen": -0.7240877747535706,
"logps/rejected": -0.6512930393218994,
"loss": 0.7339,
"odds_ratio_loss": 7.29488468170166,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06512930989265442,
"rewards/margins": 0.007279472425580025,
"rewards/rejected": -0.0724087804555893,
"sft_loss": 0.004439079202711582,
"step": 106
},
{
"epoch": 0.5869043537881385,
"grad_norm": 3.1022172612718313,
"learning_rate": 2.1876230191778598e-06,
"logits/chosen": -0.749907374382019,
"logits/rejected": -0.7236763834953308,
"logps/chosen": -0.6079933047294617,
"logps/rejected": -0.6530497074127197,
"loss": 0.7391,
"odds_ratio_loss": 7.280797481536865,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06530497968196869,
"rewards/margins": -0.004505641758441925,
"rewards/rejected": -0.06079933047294617,
"sft_loss": 0.01101373415440321,
"step": 107
},
{
"epoch": 0.5923894412067192,
"grad_norm": 2.197063400051544,
"learning_rate": 2.13987768054134e-06,
"logits/chosen": -0.6889519095420837,
"logits/rejected": -0.6868714094161987,
"logps/chosen": -0.6039581894874573,
"logps/rejected": -0.5748084187507629,
"loss": 0.6528,
"odds_ratio_loss": 6.491232395172119,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -0.057480841875076294,
"rewards/margins": 0.00291498308070004,
"rewards/rejected": -0.060395821928977966,
"sft_loss": 0.00366477994248271,
"step": 108
},
{
"epoch": 0.5978745286253,
"grad_norm": 2.159407846285504,
"learning_rate": 2.0922661126453436e-06,
"logits/chosen": -0.6941989064216614,
"logits/rejected": -0.6537268757820129,
"logps/chosen": -0.7240587472915649,
"logps/rejected": -0.6288081407546997,
"loss": 0.7097,
"odds_ratio_loss": 7.062224864959717,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.06288080662488937,
"rewards/margins": 0.00952506810426712,
"rewards/rejected": -0.0724058747291565,
"sft_loss": 0.0034658340737223625,
"step": 109
},
{
"epoch": 0.6033596160438807,
"grad_norm": 2.4398874181778822,
"learning_rate": 2.044806001243714e-06,
"logits/chosen": -0.7023935317993164,
"logits/rejected": -0.6535657644271851,
"logps/chosen": -0.6954001188278198,
"logps/rejected": -0.6107276082038879,
"loss": 0.6873,
"odds_ratio_loss": 6.822115898132324,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -0.061072759330272675,
"rewards/margins": 0.008467256091535091,
"rewards/rejected": -0.06954001635313034,
"sft_loss": 0.00511753186583519,
"step": 110
},
{
"epoch": 0.6088447034624614,
"grad_norm": 3.1443922922473795,
"learning_rate": 1.9975149758303885e-06,
"logits/chosen": -0.6652725338935852,
"logits/rejected": -0.6267548203468323,
"logps/chosen": -0.7502924799919128,
"logps/rejected": -0.6726891398429871,
"loss": 0.7606,
"odds_ratio_loss": 7.564757347106934,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.06726891547441483,
"rewards/margins": 0.007760328706353903,
"rewards/rejected": -0.07502924650907516,
"sft_loss": 0.004097915254533291,
"step": 111
},
{
"epoch": 0.6143297908810421,
"grad_norm": 3.640891877033526,
"learning_rate": 1.9504106030907605e-06,
"logits/chosen": -0.6462224125862122,
"logits/rejected": -0.661245584487915,
"logps/chosen": -0.6592689752578735,
"logps/rejected": -0.6112386584281921,
"loss": 0.6912,
"odds_ratio_loss": 6.873685836791992,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.06112387031316757,
"rewards/margins": 0.00480302982032299,
"rewards/rejected": -0.06592689454555511,
"sft_loss": 0.0038500106893479824,
"step": 112
},
{
"epoch": 0.6198148782996229,
"grad_norm": 2.7204380499722967,
"learning_rate": 1.9035103803763793e-06,
"logits/chosen": -0.6889287233352661,
"logits/rejected": -0.7191394567489624,
"logps/chosen": -0.6261990070343018,
"logps/rejected": -0.6758837699890137,
"loss": 0.7625,
"odds_ratio_loss": 7.566086769104004,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.06758838146924973,
"rewards/margins": -0.004968480207026005,
"rewards/rejected": -0.0626199021935463,
"sft_loss": 0.005911496467888355,
"step": 113
},
{
"epoch": 0.6252999657182037,
"grad_norm": 2.3221228206896543,
"learning_rate": 1.8568317292053894e-06,
"logits/chosen": -0.6986726522445679,
"logits/rejected": -0.7248036861419678,
"logps/chosen": -0.7008225917816162,
"logps/rejected": -0.6063806414604187,
"loss": 0.6896,
"odds_ratio_loss": 6.8465399742126465,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -0.06063806638121605,
"rewards/margins": 0.009444191120564938,
"rewards/rejected": -0.07008224725723267,
"sft_loss": 0.004963008686900139,
"step": 114
},
{
"epoch": 0.6307850531367843,
"grad_norm": 2.3323657507043585,
"learning_rate": 1.8103919887911525e-06,
"logits/chosen": -0.6923948526382446,
"logits/rejected": -0.6713389158248901,
"logps/chosen": -0.6527585983276367,
"logps/rejected": -0.6079282164573669,
"loss": 0.6887,
"odds_ratio_loss": 6.84266996383667,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.06079282611608505,
"rewards/margins": 0.004483034834265709,
"rewards/rejected": -0.06527585536241531,
"sft_loss": 0.004444844089448452,
"step": 115
},
{
"epoch": 0.6362701405553651,
"grad_norm": 2.2756656865836824,
"learning_rate": 1.7642084096014405e-06,
"logits/chosen": -0.6769150495529175,
"logits/rejected": -0.6519336104393005,
"logps/chosen": -0.6810208559036255,
"logps/rejected": -0.6118040680885315,
"loss": 0.6967,
"odds_ratio_loss": 6.925226211547852,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.06118040531873703,
"rewards/margins": 0.006921680178493261,
"rewards/rejected": -0.06810208410024643,
"sft_loss": 0.0041780658066272736,
"step": 116
},
{
"epoch": 0.6417552279739458,
"grad_norm": 4.533574316801059,
"learning_rate": 1.718298146950585e-06,
"logits/chosen": -0.6806620955467224,
"logits/rejected": -0.6420994400978088,
"logps/chosen": -0.6579238176345825,
"logps/rejected": -0.6758315563201904,
"loss": 0.7643,
"odds_ratio_loss": 7.476547718048096,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.06758316606283188,
"rewards/margins": -0.0017907717265188694,
"rewards/rejected": -0.06579238921403885,
"sft_loss": 0.016644245013594627,
"step": 117
},
{
"epoch": 0.6472403153925266,
"grad_norm": 2.4121811138925016,
"learning_rate": 1.6726782546269793e-06,
"logits/chosen": -0.7141998410224915,
"logits/rejected": -0.6570146083831787,
"logps/chosen": -0.6056728959083557,
"logps/rejected": -0.6163904666900635,
"loss": 0.7006,
"odds_ratio_loss": 6.9596381187438965,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.061639051884412766,
"rewards/margins": -0.0010717544937506318,
"rewards/rejected": -0.06056729331612587,
"sft_loss": 0.004627158399671316,
"step": 118
},
{
"epoch": 0.6527254028111072,
"grad_norm": 2.116647341589779,
"learning_rate": 1.6273656785582986e-06,
"logits/chosen": -0.7080201506614685,
"logits/rejected": -0.6555320024490356,
"logps/chosen": -0.6334236860275269,
"logps/rejected": -0.6177656650543213,
"loss": 0.7001,
"odds_ratio_loss": 6.9610371589660645,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.06177656352519989,
"rewards/margins": 0.0015658115735277534,
"rewards/rejected": -0.0633423700928688,
"sft_loss": 0.00395183265209198,
"step": 119
},
{
"epoch": 0.658210490229688,
"grad_norm": 2.10163675871464,
"learning_rate": 1.58237725051677e-06,
"logits/chosen": -0.6549379229545593,
"logits/rejected": -0.6431285738945007,
"logps/chosen": -0.5666630268096924,
"logps/rejected": -0.534731388092041,
"loss": 0.6123,
"odds_ratio_loss": 6.093177795410156,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.05347314849495888,
"rewards/margins": 0.0031931637786328793,
"rewards/rejected": -0.056666307151317596,
"sft_loss": 0.003024790436029434,
"step": 120
},
{
"epoch": 0.6636955776482688,
"grad_norm": 2.117301914309323,
"learning_rate": 1.5377296818668638e-06,
"logits/chosen": -0.6497895121574402,
"logits/rejected": -0.6862410306930542,
"logps/chosen": -0.5236161351203918,
"logps/rejected": -0.6108285188674927,
"loss": 0.7019,
"odds_ratio_loss": 6.986342430114746,
"rewards/accuracies": 0.3515625,
"rewards/chosen": -0.06108284741640091,
"rewards/margins": -0.008721234276890755,
"rewards/rejected": -0.052361615002155304,
"sft_loss": 0.0032780070323497057,
"step": 121
},
{
"epoch": 0.6691806650668495,
"grad_norm": 2.2927939430967093,
"learning_rate": 1.4934395573577016e-06,
"logits/chosen": -0.680001974105835,
"logits/rejected": -0.6553654670715332,
"logps/chosen": -0.5565463900566101,
"logps/rejected": -0.5977818965911865,
"loss": 0.681,
"odds_ratio_loss": 6.767004489898682,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.05977818742394447,
"rewards/margins": -0.004123550374060869,
"rewards/rejected": -0.05565463379025459,
"sft_loss": 0.004254105966538191,
"step": 122
},
{
"epoch": 0.6746657524854303,
"grad_norm": 1.9919891401234977,
"learning_rate": 1.449523328962496e-06,
"logits/chosen": -0.6618916988372803,
"logits/rejected": -0.6230502724647522,
"logps/chosen": -0.6067531108856201,
"logps/rejected": -0.6063209772109985,
"loss": 0.6881,
"odds_ratio_loss": 6.851033687591553,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06063209846615791,
"rewards/margins": 4.3215928599238396e-05,
"rewards/rejected": -0.06067531555891037,
"sft_loss": 0.0030309129506349564,
"step": 123
},
{
"epoch": 0.6801508399040109,
"grad_norm": 2.4998193508161006,
"learning_rate": 1.4059973097673187e-06,
"logits/chosen": -0.7269992232322693,
"logits/rejected": -0.6942679286003113,
"logps/chosen": -0.6811712980270386,
"logps/rejected": -0.6196709871292114,
"loss": 0.6976,
"odds_ratio_loss": 6.929691314697266,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.061967093497514725,
"rewards/margins": 0.006150041241198778,
"rewards/rejected": -0.06811713427305222,
"sft_loss": 0.004657902754843235,
"step": 124
},
{
"epoch": 0.6856359273225917,
"grad_norm": 2.1081004296697583,
"learning_rate": 1.3628776679114516e-06,
"logits/chosen": -0.7377051711082458,
"logits/rejected": -0.7069607973098755,
"logps/chosen": -0.8355605602264404,
"logps/rejected": -0.6344387531280518,
"loss": 0.7112,
"odds_ratio_loss": 7.074479103088379,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -0.0634438693523407,
"rewards/margins": 0.020112188532948494,
"rewards/rejected": -0.08355606347322464,
"sft_loss": 0.003787390887737274,
"step": 125
},
{
"epoch": 0.6911210147411725,
"grad_norm": 2.085036928586896,
"learning_rate": 1.3201804205815872e-06,
"logits/chosen": -0.6863076686859131,
"logits/rejected": -0.6810216307640076,
"logps/chosen": -0.7168741226196289,
"logps/rejected": -0.6459383964538574,
"loss": 0.7361,
"odds_ratio_loss": 7.325974464416504,
"rewards/accuracies": 0.3515625,
"rewards/chosen": -0.0645938366651535,
"rewards/margins": 0.0070935748517513275,
"rewards/rejected": -0.07168740779161453,
"sft_loss": 0.0035439669154584408,
"step": 126
},
{
"epoch": 0.6966061021597532,
"grad_norm": 2.351506031335298,
"learning_rate": 1.277921428062091e-06,
"logits/chosen": -0.6885042190551758,
"logits/rejected": -0.6516430974006653,
"logps/chosen": -0.6768959164619446,
"logps/rejected": -0.6008927822113037,
"loss": 0.6796,
"odds_ratio_loss": 6.754604339599609,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.06008927896618843,
"rewards/margins": 0.007600321434438229,
"rewards/rejected": -0.06768959760665894,
"sft_loss": 0.004147130064666271,
"step": 127
},
{
"epoch": 0.7020911895783339,
"grad_norm": 2.027558705672352,
"learning_rate": 1.2361163878435594e-06,
"logits/chosen": -0.6875728964805603,
"logits/rejected": -0.6000896096229553,
"logps/chosen": -0.6457569599151611,
"logps/rejected": -0.6216841340065002,
"loss": 0.7047,
"odds_ratio_loss": 7.014739036560059,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.062168411910533905,
"rewards/margins": 0.0024072853848338127,
"rewards/rejected": -0.06457570195198059,
"sft_loss": 0.0032098242081701756,
"step": 128
},
{
"epoch": 0.7075762769969146,
"grad_norm": 2.6891394023374913,
"learning_rate": 1.1947808287918406e-06,
"logits/chosen": -0.7073705792427063,
"logits/rejected": -0.6391410231590271,
"logps/chosen": -0.9505314826965332,
"logps/rejected": -0.6497754454612732,
"loss": 0.7381,
"odds_ratio_loss": 7.3174543380737305,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06497755646705627,
"rewards/margins": 0.030075591057538986,
"rewards/rejected": -0.09505314379930496,
"sft_loss": 0.006380516104400158,
"step": 129
},
{
"epoch": 0.7130613644154954,
"grad_norm": 2.7243982781242653,
"learning_rate": 1.153930105379695e-06,
"logits/chosen": -0.6403245329856873,
"logits/rejected": -0.6941784620285034,
"logps/chosen": -0.6206578016281128,
"logps/rejected": -0.6329513788223267,
"loss": 0.7166,
"odds_ratio_loss": 7.099569797515869,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.0632951408624649,
"rewards/margins": -0.0012293587205931544,
"rewards/rejected": -0.06206577643752098,
"sft_loss": 0.006686141714453697,
"step": 130
},
{
"epoch": 0.7185464518340761,
"grad_norm": 3.0501747152037226,
"learning_rate": 1.1135793919832336e-06,
"logits/chosen": -0.6880025863647461,
"logits/rejected": -0.6472803950309753,
"logps/chosen": -0.6972535252571106,
"logps/rejected": -0.6315404176712036,
"loss": 0.7149,
"odds_ratio_loss": 7.093357086181641,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06315404176712036,
"rewards/margins": 0.006571306847035885,
"rewards/rejected": -0.06972534954547882,
"sft_loss": 0.005599521566182375,
"step": 131
},
{
"epoch": 0.7240315392526568,
"grad_norm": 4.097059851270684,
"learning_rate": 1.0737436772452602e-06,
"logits/chosen": -0.6801282167434692,
"logits/rejected": -0.6225499510765076,
"logps/chosen": -0.6466732025146484,
"logps/rejected": -0.6455317139625549,
"loss": 0.7287,
"odds_ratio_loss": 7.189926624298096,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -0.0645531713962555,
"rewards/margins": 0.00011415383778512478,
"rewards/rejected": -0.06466732174158096,
"sft_loss": 0.00968949869275093,
"step": 132
},
{
"epoch": 0.7295166266712376,
"grad_norm": 2.1627454720493695,
"learning_rate": 1.0344377585076e-06,
"logits/chosen": -0.6447775363922119,
"logits/rejected": -0.6088653206825256,
"logps/chosen": -0.5911606550216675,
"logps/rejected": -0.5996113419532776,
"loss": 0.6819,
"odds_ratio_loss": 6.785092830657959,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.05996113270521164,
"rewards/margins": -0.000845066097099334,
"rewards/rejected": -0.05911606550216675,
"sft_loss": 0.00336868641898036,
"step": 133
},
{
"epoch": 0.7350017140898183,
"grad_norm": 2.3398426117994826,
"learning_rate": 9.956762363144892e-07,
"logits/chosen": -0.6249683499336243,
"logits/rejected": -0.6248048543930054,
"logps/chosen": -0.6350124478340149,
"logps/rejected": -0.5843938589096069,
"loss": 0.666,
"odds_ratio_loss": 6.617175579071045,
"rewards/accuracies": 0.4453125,
"rewards/chosen": -0.05843937769532204,
"rewards/margins": 0.005061861127614975,
"rewards/rejected": -0.06350124627351761,
"sft_loss": 0.004313306882977486,
"step": 134
},
{
"epoch": 0.7404868015083991,
"grad_norm": 2.241323764733524,
"learning_rate": 9.574735089890765e-07,
"logits/chosen": -0.6376866102218628,
"logits/rejected": -0.5942208766937256,
"logps/chosen": -0.6546105742454529,
"logps/rejected": -0.6053332090377808,
"loss": 0.6863,
"odds_ratio_loss": 6.821003437042236,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.060533322393894196,
"rewards/margins": 0.0049277422949671745,
"rewards/rejected": -0.06546106189489365,
"sft_loss": 0.004214235581457615,
"step": 135
},
{
"epoch": 0.7459718889269797,
"grad_norm": 2.2084551304631654,
"learning_rate": 9.198437672850249e-07,
"logits/chosen": -0.6654888391494751,
"logits/rejected": -0.616665244102478,
"logps/chosen": -0.6150586009025574,
"logps/rejected": -0.6242785453796387,
"loss": 0.7088,
"odds_ratio_loss": 7.0509033203125,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.062427863478660583,
"rewards/margins": -0.0009219995117746294,
"rewards/rejected": -0.06150586158037186,
"sft_loss": 0.0037482583429664373,
"step": 136
},
{
"epoch": 0.7514569763455605,
"grad_norm": 2.5791794615739603,
"learning_rate": 8.828009891152301e-07,
"logits/chosen": -0.6492202877998352,
"logits/rejected": -0.6373860836029053,
"logps/chosen": -0.6348594427108765,
"logps/rejected": -0.5977117419242859,
"loss": 0.6816,
"odds_ratio_loss": 6.770447731018066,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.05977117270231247,
"rewards/margins": 0.003714768448844552,
"rewards/rejected": -0.06348594278097153,
"sft_loss": 0.004584567621350288,
"step": 137
},
{
"epoch": 0.7569420637641412,
"grad_norm": 2.10960114347503,
"learning_rate": 8.463589343595976e-07,
"logits/chosen": -0.6831432580947876,
"logits/rejected": -0.6670156717300415,
"logps/chosen": -0.6033125519752502,
"logps/rejected": -0.589252769947052,
"loss": 0.6723,
"odds_ratio_loss": 6.688982009887695,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.05892528221011162,
"rewards/margins": 0.0014059704262763262,
"rewards/rejected": -0.06033124774694443,
"sft_loss": 0.0033750347793102264,
"step": 138
},
{
"epoch": 0.762427151182722,
"grad_norm": 2.2233884047037513,
"learning_rate": 8.105311397538085e-07,
"logits/chosen": -0.6301755905151367,
"logits/rejected": -0.6074115037918091,
"logps/chosen": -0.642350435256958,
"logps/rejected": -0.6004490852355957,
"loss": 0.6787,
"odds_ratio_loss": 6.747027397155762,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.06004491448402405,
"rewards/margins": 0.004190134350210428,
"rewards/rejected": -0.06423504650592804,
"sft_loss": 0.0040018935687839985,
"step": 139
},
{
"epoch": 0.7679122386013028,
"grad_norm": 2.1001393729716997,
"learning_rate": 7.753309138609705e-07,
"logits/chosen": -0.6419239044189453,
"logits/rejected": -0.6177327632904053,
"logps/chosen": -0.7455356121063232,
"logps/rejected": -0.6029459834098816,
"loss": 0.6764,
"odds_ratio_loss": 6.730374336242676,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -0.06029459461569786,
"rewards/margins": 0.014258962124586105,
"rewards/rejected": -0.07455356419086456,
"sft_loss": 0.003380353096872568,
"step": 140
},
{
"epoch": 0.7733973260198834,
"grad_norm": 1.955388109291546,
"learning_rate": 7.407713321280377e-07,
"logits/chosen": -0.6610326766967773,
"logits/rejected": -0.5916581749916077,
"logps/chosen": -0.6852571368217468,
"logps/rejected": -0.5728943347930908,
"loss": 0.6502,
"odds_ratio_loss": 6.471943378448486,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.057289429008960724,
"rewards/margins": 0.011236282996833324,
"rewards/rejected": -0.06852570921182632,
"sft_loss": 0.003054150380194187,
"step": 141
},
{
"epoch": 0.7788824134384642,
"grad_norm": 1.9633538931591568,
"learning_rate": 7.068652320288081e-07,
"logits/chosen": -0.5979795455932617,
"logits/rejected": -0.5797224640846252,
"logps/chosen": -0.5880254507064819,
"logps/rejected": -0.6247429847717285,
"loss": 0.7109,
"odds_ratio_loss": 7.07578706741333,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.06247429549694061,
"rewards/margins": -0.003671749262139201,
"rewards/rejected": -0.05880254879593849,
"sft_loss": 0.003300985088571906,
"step": 142
},
{
"epoch": 0.7843675008570449,
"grad_norm": 1.9235465695795355,
"learning_rate": 6.736252082953307e-07,
"logits/chosen": -0.5846747756004333,
"logits/rejected": -0.5656622052192688,
"logps/chosen": -0.5795245170593262,
"logps/rejected": -0.5894888639450073,
"loss": 0.6764,
"odds_ratio_loss": 6.733616352081299,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.05894888564944267,
"rewards/margins": -0.0009964264463633299,
"rewards/rejected": -0.057952456176280975,
"sft_loss": 0.0030053844675421715,
"step": 143
},
{
"epoch": 0.7898525882756257,
"grad_norm": 1.8461477291438253,
"learning_rate": 6.410636082394772e-07,
"logits/chosen": -0.6314021944999695,
"logits/rejected": -0.6141982078552246,
"logps/chosen": -0.7406495809555054,
"logps/rejected": -0.6059472560882568,
"loss": 0.6888,
"odds_ratio_loss": 6.85874605178833,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06059472635388374,
"rewards/margins": 0.013470232486724854,
"rewards/rejected": -0.0740649551153183,
"sft_loss": 0.0029607657343149185,
"step": 144
},
{
"epoch": 0.7953376756942063,
"grad_norm": 2.4596952552330196,
"learning_rate": 6.091925271664156e-07,
"logits/chosen": -0.6452285051345825,
"logits/rejected": -0.6411218047142029,
"logps/chosen": -0.6096831560134888,
"logps/rejected": -0.6008501648902893,
"loss": 0.6837,
"odds_ratio_loss": 6.7790327072143555,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.06008501723408699,
"rewards/margins": 0.0008833012543618679,
"rewards/rejected": -0.0609683133661747,
"sft_loss": 0.005798771046102047,
"step": 145
},
{
"epoch": 0.8008227631127871,
"grad_norm": 2.395026025025573,
"learning_rate": 5.780238038817035e-07,
"logits/chosen": -0.6628804802894592,
"logits/rejected": -0.6211153268814087,
"logps/chosen": -0.6752290725708008,
"logps/rejected": -0.6579370498657227,
"loss": 0.746,
"odds_ratio_loss": 7.417111873626709,
"rewards/accuracies": 0.328125,
"rewards/chosen": -0.0657937079668045,
"rewards/margins": 0.0017291962867602706,
"rewards/rejected": -0.06752290576696396,
"sft_loss": 0.004298758693039417,
"step": 146
},
{
"epoch": 0.8063078505313679,
"grad_norm": 2.068579141907181,
"learning_rate": 5.47569016293649e-07,
"logits/chosen": -0.6299046874046326,
"logits/rejected": -0.6121450662612915,
"logps/chosen": -0.8063814640045166,
"logps/rejected": -0.5979243516921997,
"loss": 0.6784,
"odds_ratio_loss": 6.7539262771606445,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -0.05979243293404579,
"rewards/margins": 0.020845718681812286,
"rewards/rejected": -0.08063814043998718,
"sft_loss": 0.0030515664257109165,
"step": 147
},
{
"epoch": 0.8117929379499486,
"grad_norm": 3.6637986589772233,
"learning_rate": 5.178394771125969e-07,
"logits/chosen": -0.6278913021087646,
"logits/rejected": -0.5689703226089478,
"logps/chosen": -0.7562225461006165,
"logps/rejected": -0.6262752413749695,
"loss": 0.7032,
"odds_ratio_loss": 6.891101837158203,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.06262752413749695,
"rewards/margins": 0.012994730845093727,
"rewards/rejected": -0.07562224566936493,
"sft_loss": 0.01412445493042469,
"step": 148
},
{
"epoch": 0.8172780253685293,
"grad_norm": 2.3203742871591144,
"learning_rate": 4.888462296487129e-07,
"logits/chosen": -0.6456272602081299,
"logits/rejected": -0.6417357325553894,
"logps/chosen": -0.6021786332130432,
"logps/rejected": -0.6396222114562988,
"loss": 0.7282,
"odds_ratio_loss": 7.244186878204346,
"rewards/accuracies": 0.3359375,
"rewards/chosen": -0.06396222114562988,
"rewards/margins": -0.0037443540059030056,
"rewards/rejected": -0.06021786853671074,
"sft_loss": 0.0037422627210617065,
"step": 149
},
{
"epoch": 0.82276311278711,
"grad_norm": 2.5512980917433157,
"learning_rate": 4.6060004370984763e-07,
"logits/chosen": -0.631007969379425,
"logits/rejected": -0.5933969020843506,
"logps/chosen": -0.8098146915435791,
"logps/rejected": -0.6024014353752136,
"loss": 0.6766,
"odds_ratio_loss": 6.713679790496826,
"rewards/accuracies": 0.4921875,
"rewards/chosen": -0.06024014204740524,
"rewards/margins": 0.02074132300913334,
"rewards/rejected": -0.08098147064447403,
"sft_loss": 0.005258024670183659,
"step": 150
},
{
"epoch": 0.8282482002056908,
"grad_norm": 2.198032076864518,
"learning_rate": 4.331114116009938e-07,
"logits/chosen": -0.6112239360809326,
"logits/rejected": -0.55472332239151,
"logps/chosen": -0.7825672030448914,
"logps/rejected": -0.6584952473640442,
"loss": 0.7438,
"odds_ratio_loss": 7.401620864868164,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06584953516721725,
"rewards/margins": 0.012407200410962105,
"rewards/rejected": -0.07825673371553421,
"sft_loss": 0.0036755804903805256,
"step": 151
},
{
"epoch": 0.8337332876242715,
"grad_norm": 2.062470910623532,
"learning_rate": 4.063905442268201e-07,
"logits/chosen": -0.6306271553039551,
"logits/rejected": -0.6064797043800354,
"logps/chosen": -0.6712342500686646,
"logps/rejected": -0.6343151926994324,
"loss": 0.7212,
"odds_ratio_loss": 7.176795959472656,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.063431516289711,
"rewards/margins": 0.0036919128615409136,
"rewards/rejected": -0.0671234279870987,
"sft_loss": 0.0035129080060869455,
"step": 152
},
{
"epoch": 0.8392183750428522,
"grad_norm": 2.3428435069292166,
"learning_rate": 3.80447367298738e-07,
"logits/chosen": -0.6870932579040527,
"logits/rejected": -0.6724913716316223,
"logps/chosen": -0.6094024777412415,
"logps/rejected": -0.6165374517440796,
"loss": 0.7051,
"odds_ratio_loss": 7.0048112869262695,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.0616537444293499,
"rewards/margins": -0.0007134978659451008,
"rewards/rejected": -0.06094024330377579,
"sft_loss": 0.004614294972270727,
"step": 153
},
{
"epoch": 0.844703462461433,
"grad_norm": 2.162714758637227,
"learning_rate": 3.5529151764790715e-07,
"logits/chosen": -0.6563050746917725,
"logits/rejected": -0.6312206983566284,
"logps/chosen": -0.7330479025840759,
"logps/rejected": -0.6632793545722961,
"loss": 0.7478,
"odds_ratio_loss": 7.4362664222717285,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.06632794439792633,
"rewards/margins": 0.006976846605539322,
"rewards/rejected": -0.07330479472875595,
"sft_loss": 0.004126606043428183,
"step": 154
},
{
"epoch": 0.8501885498800137,
"grad_norm": 2.061116429438832,
"learning_rate": 3.3093233964554464e-07,
"logits/chosen": -0.6102815866470337,
"logits/rejected": -0.6046280860900879,
"logps/chosen": -0.6141259670257568,
"logps/rejected": -0.5823516845703125,
"loss": 0.6607,
"odds_ratio_loss": 6.578813076019287,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -0.05823516845703125,
"rewards/margins": 0.0031774300150573254,
"rewards/rejected": -0.061412595212459564,
"sft_loss": 0.002771932166069746,
"step": 155
},
{
"epoch": 0.8556736372985945,
"grad_norm": 2.432594869696479,
"learning_rate": 3.0737888173187067e-07,
"logits/chosen": -0.6550750136375427,
"logits/rejected": -0.6305856108665466,
"logps/chosen": -0.6801073551177979,
"logps/rejected": -0.5993965268135071,
"loss": 0.6803,
"odds_ratio_loss": 6.759056568145752,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.05993964523077011,
"rewards/margins": 0.008071082644164562,
"rewards/rejected": -0.06801073253154755,
"sft_loss": 0.004404113162308931,
"step": 156
},
{
"epoch": 0.8611587247171751,
"grad_norm": 2.0454259039650977,
"learning_rate": 2.8463989305498596e-07,
"logits/chosen": -0.5875197649002075,
"logits/rejected": -0.6273292303085327,
"logps/chosen": -0.5868790149688721,
"logps/rejected": -0.5973340272903442,
"loss": 0.6798,
"odds_ratio_loss": 6.763144493103027,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.05973340570926666,
"rewards/margins": -0.0010455029550939798,
"rewards/rejected": -0.058687906712293625,
"sft_loss": 0.0034761279821395874,
"step": 157
},
{
"epoch": 0.8666438121357559,
"grad_norm": 2.158571117053065,
"learning_rate": 2.6272382022091704e-07,
"logits/chosen": -0.6387627124786377,
"logits/rejected": -0.6411675214767456,
"logps/chosen": -0.6698893308639526,
"logps/rejected": -0.5845687389373779,
"loss": 0.6596,
"odds_ratio_loss": 6.559039115905762,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -0.05845687910914421,
"rewards/margins": 0.008532052859663963,
"rewards/rejected": -0.06698893010616302,
"sft_loss": 0.0036818967200815678,
"step": 158
},
{
"epoch": 0.8721288995543367,
"grad_norm": 2.1296893473193985,
"learning_rate": 2.4163880415604913e-07,
"logits/chosen": -0.5978600978851318,
"logits/rejected": -0.5640897154808044,
"logps/chosen": -0.6199032664299011,
"logps/rejected": -0.6052448749542236,
"loss": 0.6879,
"odds_ratio_loss": 6.839813232421875,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06052448973059654,
"rewards/margins": 0.0014658391010016203,
"rewards/rejected": -0.06199032440781593,
"sft_loss": 0.003927412908524275,
"step": 159
},
{
"epoch": 0.8776139869729174,
"grad_norm": 2.155495858759351,
"learning_rate": 2.2139267708310457e-07,
"logits/chosen": -0.676996111869812,
"logits/rejected": -0.6428037285804749,
"logps/chosen": -0.6014530658721924,
"logps/rejected": -0.5981815457344055,
"loss": 0.6794,
"odds_ratio_loss": 6.754974365234375,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.05981815233826637,
"rewards/margins": 0.00032715569250285625,
"rewards/rejected": -0.060145311057567596,
"sft_loss": 0.0038914589677006006,
"step": 160
},
{
"epoch": 0.8830990743914982,
"grad_norm": 2.6138265531338565,
"learning_rate": 2.0199295961178893e-07,
"logits/chosen": -0.6286705136299133,
"logits/rejected": -0.6172723770141602,
"logps/chosen": -0.6388348937034607,
"logps/rejected": -0.5947953462600708,
"loss": 0.6768,
"odds_ratio_loss": 6.710721492767334,
"rewards/accuracies": 0.4140625,
"rewards/chosen": -0.05947953462600708,
"rewards/margins": 0.004403956700116396,
"rewards/rejected": -0.06388349086046219,
"sft_loss": 0.005766674876213074,
"step": 161
},
{
"epoch": 0.8885841618100788,
"grad_norm": 2.022101568731076,
"learning_rate": 1.8344685794519507e-07,
"logits/chosen": -0.6218512058258057,
"logits/rejected": -0.5689311027526855,
"logps/chosen": -0.6789796352386475,
"logps/rejected": -0.6389855742454529,
"loss": 0.7205,
"odds_ratio_loss": 7.172222137451172,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.06389855593442917,
"rewards/margins": 0.003999411594122648,
"rewards/rejected": -0.06789796054363251,
"sft_loss": 0.0032456754706799984,
"step": 162
},
{
"epoch": 0.8940692492286596,
"grad_norm": 2.2581765266272185,
"learning_rate": 1.6576126120299046e-07,
"logits/chosen": -0.6435633301734924,
"logits/rejected": -0.589432954788208,
"logps/chosen": -0.7249476313591003,
"logps/rejected": -0.6334548592567444,
"loss": 0.7181,
"odds_ratio_loss": 7.1389479637146,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06334548443555832,
"rewards/margins": 0.009149276651442051,
"rewards/rejected": -0.0724947601556778,
"sft_loss": 0.004166773054748774,
"step": 163
},
{
"epoch": 0.8995543366472403,
"grad_norm": 2.2259746913759004,
"learning_rate": 1.4894273886239208e-07,
"logits/chosen": -0.6249025464057922,
"logits/rejected": -0.5969172716140747,
"logps/chosen": -0.633272647857666,
"logps/rejected": -0.6413824558258057,
"loss": 0.726,
"odds_ratio_loss": 7.205265045166016,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06413824111223221,
"rewards/margins": -0.0008109819609671831,
"rewards/rejected": -0.06332726776599884,
"sft_loss": 0.005517784971743822,
"step": 164
},
{
"epoch": 0.905039424065821,
"grad_norm": 2.2159816572748468,
"learning_rate": 1.3299753831787193e-07,
"logits/chosen": -0.601813018321991,
"logits/rejected": -0.6476290822029114,
"logps/chosen": -0.6103772521018982,
"logps/rejected": -0.6375135183334351,
"loss": 0.7213,
"odds_ratio_loss": 7.173505783081055,
"rewards/accuracies": 0.3828125,
"rewards/chosen": -0.06375134736299515,
"rewards/margins": -0.002713626716285944,
"rewards/rejected": -0.06103772297501564,
"sft_loss": 0.003954712767153978,
"step": 165
},
{
"epoch": 0.9105245114844018,
"grad_norm": 1.987492056492753,
"learning_rate": 1.1793158256050708e-07,
"logits/chosen": -0.628088116645813,
"logits/rejected": -0.591625452041626,
"logps/chosen": -0.6703177690505981,
"logps/rejected": -0.5828872323036194,
"loss": 0.6629,
"odds_ratio_loss": 6.595256805419922,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -0.05828872323036194,
"rewards/margins": 0.008743051439523697,
"rewards/rejected": -0.06703177839517593,
"sft_loss": 0.003396927611902356,
"step": 166
},
{
"epoch": 0.9160095989029825,
"grad_norm": 2.059761656496877,
"learning_rate": 1.0375046797782868e-07,
"logits/chosen": -0.6672204732894897,
"logits/rejected": -0.6035845875740051,
"logps/chosen": -0.6069723963737488,
"logps/rejected": -0.6074884533882141,
"loss": 0.6911,
"odds_ratio_loss": 6.8784990310668945,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06074884906411171,
"rewards/margins": -5.1606097258627415e-05,
"rewards/rejected": -0.06069723516702652,
"sft_loss": 0.0032916096970438957,
"step": 167
},
{
"epoch": 0.9214946863215633,
"grad_norm": 1.8571804970978707,
"learning_rate": 9.045946227499298e-08,
"logits/chosen": -0.670376718044281,
"logits/rejected": -0.5822762846946716,
"logps/chosen": -0.7293166518211365,
"logps/rejected": -0.6415222883224487,
"loss": 0.726,
"odds_ratio_loss": 7.229472637176514,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.06415222585201263,
"rewards/margins": 0.008779437281191349,
"rewards/rejected": -0.072931669652462,
"sft_loss": 0.0030599033925682306,
"step": 168
},
{
"epoch": 0.926979773740144,
"grad_norm": 2.279667665781299,
"learning_rate": 7.806350251804484e-08,
"logits/chosen": -0.6420009732246399,
"logits/rejected": -0.5824066400527954,
"logps/chosen": -0.875396728515625,
"logps/rejected": -0.6270475387573242,
"loss": 0.7041,
"odds_ratio_loss": 7.000705242156982,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.06270475685596466,
"rewards/margins": 0.02483491040766239,
"rewards/rejected": -0.0875396728515625,
"sft_loss": 0.004043279215693474,
"step": 169
},
{
"epoch": 0.9324648611587247,
"grad_norm": 1.9899455007963414,
"learning_rate": 6.6567193299997e-08,
"logits/chosen": -0.6606361269950867,
"logits/rejected": -0.6265894174575806,
"logps/chosen": -0.8179676532745361,
"logps/rejected": -0.6608296036720276,
"loss": 0.7442,
"odds_ratio_loss": 7.407998085021973,
"rewards/accuracies": 0.34375,
"rewards/chosen": -0.06608295440673828,
"rewards/margins": 0.015713810920715332,
"rewards/rejected": -0.08179676532745361,
"sft_loss": 0.0034413619432598352,
"step": 170
},
{
"epoch": 0.9379499485773054,
"grad_norm": 2.432140341651671,
"learning_rate": 5.597480503041486e-08,
"logits/chosen": -0.6267792582511902,
"logits/rejected": -0.6068964004516602,
"logps/chosen": -0.6993312835693359,
"logps/rejected": -0.6519731283187866,
"loss": 0.7372,
"odds_ratio_loss": 7.303807258605957,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.06519731879234314,
"rewards/margins": 0.004735812544822693,
"rewards/rejected": -0.06993313133716583,
"sft_loss": 0.006868740543723106,
"step": 171
},
{
"epoch": 0.9434350359958862,
"grad_norm": 2.1615902439918186,
"learning_rate": 4.629027234912986e-08,
"logits/chosen": -0.662708044052124,
"logits/rejected": -0.6557428240776062,
"logps/chosen": -0.608113169670105,
"logps/rejected": -0.5925155878067017,
"loss": 0.6778,
"odds_ratio_loss": 6.741554260253906,
"rewards/accuracies": 0.390625,
"rewards/chosen": -0.059251561760902405,
"rewards/margins": 0.0015597562305629253,
"rewards/rejected": -0.06081131845712662,
"sft_loss": 0.003658043686300516,
"step": 172
},
{
"epoch": 0.948920123414467,
"grad_norm": 2.2861540937365947,
"learning_rate": 3.7517192664685844e-08,
"logits/chosen": -0.6392232179641724,
"logits/rejected": -0.6070412993431091,
"logps/chosen": -0.6354994177818298,
"logps/rejected": -0.628402590751648,
"loss": 0.7149,
"odds_ratio_loss": 7.101507186889648,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.06284026056528091,
"rewards/margins": 0.0007096900371834636,
"rewards/rejected": -0.0635499432682991,
"sft_loss": 0.004706856328994036,
"step": 173
},
{
"epoch": 0.9544052108330476,
"grad_norm": 2.3390784115932726,
"learning_rate": 2.9658824818044328e-08,
"logits/chosen": -0.6274293065071106,
"logits/rejected": -0.5854682326316833,
"logps/chosen": -0.7155969142913818,
"logps/rejected": -0.6342028975486755,
"loss": 0.7122,
"odds_ratio_loss": 7.073307514190674,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.06342029571533203,
"rewards/margins": 0.008139407262206078,
"rewards/rejected": -0.07155969738960266,
"sft_loss": 0.004918898921459913,
"step": 174
},
{
"epoch": 0.9598902982516284,
"grad_norm": 2.563514632765213,
"learning_rate": 2.2718087872060925e-08,
"logits/chosen": -0.6743865013122559,
"logits/rejected": -0.6293442845344543,
"logps/chosen": -0.7085027098655701,
"logps/rejected": -0.6935465931892395,
"loss": 0.7803,
"odds_ratio_loss": 7.746453285217285,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.06935466080904007,
"rewards/margins": 0.0014956118538975716,
"rewards/rejected": -0.07085027545690536,
"sft_loss": 0.0056967539712786674,
"step": 175
},
{
"epoch": 0.9653753856702091,
"grad_norm": 2.056659344905833,
"learning_rate": 1.6697560027171543e-08,
"logits/chosen": -0.620927631855011,
"logits/rejected": -0.5821961760520935,
"logps/chosen": -0.6993247866630554,
"logps/rejected": -0.5995112657546997,
"loss": 0.6786,
"odds_ratio_loss": 6.743234634399414,
"rewards/accuracies": 0.4453125,
"rewards/chosen": -0.05995112285017967,
"rewards/margins": 0.00998135469853878,
"rewards/rejected": -0.0699324831366539,
"sft_loss": 0.004281484521925449,
"step": 176
},
{
"epoch": 0.9708604730887899,
"grad_norm": 2.133552946595605,
"learning_rate": 1.1599477663696845e-08,
"logits/chosen": -0.6138472557067871,
"logits/rejected": -0.6400952339172363,
"logps/chosen": -0.6972988843917847,
"logps/rejected": -0.6593939661979675,
"loss": 0.7433,
"odds_ratio_loss": 7.3952131271362305,
"rewards/accuracies": 0.3984375,
"rewards/chosen": -0.06593939661979675,
"rewards/margins": 0.0037904919590801,
"rewards/rejected": -0.06972989439964294,
"sft_loss": 0.00373737677000463,
"step": 177
},
{
"epoch": 0.9763455605073705,
"grad_norm": 2.256606648624031,
"learning_rate": 7.425734511117e-09,
"logits/chosen": -0.6247913241386414,
"logits/rejected": -0.6045972108840942,
"logps/chosen": -0.5932058691978455,
"logps/rejected": -0.6002853512763977,
"loss": 0.685,
"odds_ratio_loss": 6.808387756347656,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06002853438258171,
"rewards/margins": -0.00070795021019876,
"rewards/rejected": -0.059320587664842606,
"sft_loss": 0.004185628145933151,
"step": 178
},
{
"epoch": 0.9818306479259513,
"grad_norm": 2.370987455595276,
"learning_rate": 4.17788094463023e-09,
"logits/chosen": -0.6684907674789429,
"logits/rejected": -0.613418698310852,
"logps/chosen": -0.6800941824913025,
"logps/rejected": -0.5861297845840454,
"loss": 0.6704,
"odds_ratio_loss": 6.643950462341309,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.05861297994852066,
"rewards/margins": 0.009396445006132126,
"rewards/rejected": -0.06800942122936249,
"sft_loss": 0.006054120138287544,
"step": 179
},
{
"epoch": 0.9873157353445321,
"grad_norm": 2.134555240071597,
"learning_rate": 1.857123409250705e-09,
"logits/chosen": -0.6588638424873352,
"logits/rejected": -0.6565414071083069,
"logps/chosen": -0.6550459265708923,
"logps/rejected": -0.6189512014389038,
"loss": 0.7047,
"odds_ratio_loss": 7.008230209350586,
"rewards/accuracies": 0.3671875,
"rewards/chosen": -0.06189511716365814,
"rewards/margins": 0.0036094679962843657,
"rewards/rejected": -0.06550458818674088,
"sft_loss": 0.003911715466529131,
"step": 180
},
{
"epoch": 0.9928008227631128,
"grad_norm": 2.189370604534784,
"learning_rate": 4.6432397166285e-10,
"logits/chosen": -0.6217331886291504,
"logits/rejected": -0.553974986076355,
"logps/chosen": -0.6876395344734192,
"logps/rejected": -0.5693202018737793,
"loss": 0.6418,
"odds_ratio_loss": 6.377911567687988,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.05693202465772629,
"rewards/margins": 0.011831930838525295,
"rewards/rejected": -0.06876395642757416,
"sft_loss": 0.003973289392888546,
"step": 181
},
{
"epoch": 0.9982859101816935,
"grad_norm": 2.1620947430558077,
"learning_rate": 0.0,
"logits/chosen": -0.7046372890472412,
"logits/rejected": -0.6097927689552307,
"logps/chosen": -0.9193150401115417,
"logps/rejected": -0.5992479920387268,
"loss": 0.6706,
"odds_ratio_loss": 6.670595645904541,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.05992480367422104,
"rewards/margins": 0.032006699591875076,
"rewards/rejected": -0.09193150699138641,
"sft_loss": 0.003579822601750493,
"step": 182
},
{
"epoch": 0.9982859101816935,
"step": 182,
"total_flos": 58779245903872.0,
"train_loss": 0.733595297559277,
"train_runtime": 13688.6483,
"train_samples_per_second": 1.704,
"train_steps_per_second": 0.013
}
],
"logging_steps": 1,
"max_steps": 182,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 182,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 58779245903872.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}