{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.32666394446712943, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016333197223356473, "grad_norm": 1.1593419313430786, "learning_rate": 4.9996340229869763e-05, "logits/chosen": -1.0346699953079224, "logits/rejected": -2.6695446968078613, "logps/chosen": -1.101169466972351, "logps/rejected": -1.0235556364059448, "loss": 1.178, "odds_ratio_loss": 0.768240213394165, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.11011694371700287, "rewards/margins": -0.007761377841234207, "rewards/rejected": -0.10235557705163956, "sft_loss": 1.101169466972351, "step": 5 }, { "epoch": 0.032666394446712946, "grad_norm": 1.1402696371078491, "learning_rate": 4.9985361990992455e-05, "logits/chosen": -0.8193758726119995, "logits/rejected": -2.842833995819092, "logps/chosen": -1.0217273235321045, "logps/rejected": -1.0100481510162354, "loss": 1.0927, "odds_ratio_loss": 0.7092453837394714, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10217274725437164, "rewards/margins": -0.0011679243762046099, "rewards/rejected": -0.10100481659173965, "sft_loss": 1.0217273235321045, "step": 10 }, { "epoch": 0.04899959167006942, "grad_norm": 1.3554140329360962, "learning_rate": 4.996706849759453e-05, "logits/chosen": -0.7696830034255981, "logits/rejected": -2.703421115875244, "logps/chosen": -1.0294106006622314, "logps/rejected": -1.0599125623703003, "loss": 1.0993, "odds_ratio_loss": 0.6984036564826965, "rewards/accuracies": 0.5, "rewards/chosen": -0.10294107347726822, "rewards/margins": 0.003050185739994049, "rewards/rejected": -0.10599125921726227, "sft_loss": 1.0294106006622314, "step": 15 }, { "epoch": 0.06533278889342589, "grad_norm": 1.406853199005127, "learning_rate": 4.9941465105674435e-05, "logits/chosen": -0.698488712310791, "logits/rejected": -2.808516263961792, "logps/chosen": -1.0651928186416626, "logps/rejected": -0.996777355670929, "loss": 1.1402, "odds_ratio_loss": 0.7504132390022278, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.10651928186416626, "rewards/margins": -0.006841552909463644, "rewards/rejected": -0.09967772662639618, "sft_loss": 1.0651928186416626, "step": 20 }, { "epoch": 0.08166598611678236, "grad_norm": 1.238513469696045, "learning_rate": 4.99085593114345e-05, "logits/chosen": -0.6306881904602051, "logits/rejected": -2.698362350463867, "logps/chosen": -1.0242538452148438, "logps/rejected": -1.0320734977722168, "loss": 1.0937, "odds_ratio_loss": 0.6945221424102783, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.10242538154125214, "rewards/margins": 0.0007819652673788369, "rewards/rejected": -0.10320734977722168, "sft_loss": 1.0242538452148438, "step": 25 }, { "epoch": 0.09799918334013884, "grad_norm": 1.9588699340820312, "learning_rate": 4.986836074908616e-05, "logits/chosen": -0.5534936189651489, "logits/rejected": -2.678889036178589, "logps/chosen": -1.1003000736236572, "logps/rejected": -1.0425399541854858, "loss": 1.1745, "odds_ratio_loss": 0.7416313290596008, "rewards/accuracies": 0.5, "rewards/chosen": -0.11003003269433975, "rewards/margins": -0.005776026751846075, "rewards/rejected": -0.10425399243831635, "sft_loss": 1.1003000736236572, "step": 30 }, { "epoch": 0.1143323805634953, "grad_norm": 1.1857821941375732, "learning_rate": 4.982088118802922e-05, "logits/chosen": -0.6129628419876099, "logits/rejected": -2.876364231109619, "logps/chosen": -0.9628928303718567, "logps/rejected": -0.952060341835022, "loss": 1.0336, "odds_ratio_loss": 0.7068274617195129, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.09628929197788239, "rewards/margins": -0.001083258306607604, "rewards/rejected": -0.09520602226257324, "sft_loss": 0.9628928303718567, "step": 35 }, { "epoch": 0.13066557778685178, "grad_norm": 1.33924400806427, "learning_rate": 4.976613452940604e-05, "logits/chosen": -0.7091814279556274, "logits/rejected": -2.6595664024353027, "logps/chosen": -0.89775550365448, "logps/rejected": -0.935952365398407, "loss": 0.9642, "odds_ratio_loss": 0.6644185781478882, "rewards/accuracies": 0.625, "rewards/chosen": -0.08977555483579636, "rewards/margins": 0.003819683101028204, "rewards/rejected": -0.0935952290892601, "sft_loss": 0.89775550365448, "step": 40 }, { "epoch": 0.14699877501020825, "grad_norm": 1.136667013168335, "learning_rate": 4.9704136802031485e-05, "logits/chosen": -0.7233263850212097, "logits/rejected": -2.6673388481140137, "logps/chosen": -0.9036429524421692, "logps/rejected": -0.9419530630111694, "loss": 0.9699, "odds_ratio_loss": 0.662254273891449, "rewards/accuracies": 0.625, "rewards/chosen": -0.09036429226398468, "rewards/margins": 0.0038310133386403322, "rewards/rejected": -0.09419531375169754, "sft_loss": 0.9036429524421692, "step": 45 }, { "epoch": 0.16333197223356472, "grad_norm": 1.4290399551391602, "learning_rate": 4.9634906157700036e-05, "logits/chosen": -0.6113058924674988, "logits/rejected": -2.705373764038086, "logps/chosen": -0.9550830125808716, "logps/rejected": -0.9817631840705872, "loss": 1.0223, "odds_ratio_loss": 0.6725824475288391, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09550830721855164, "rewards/margins": 0.002668013097718358, "rewards/rejected": -0.09817632287740707, "sft_loss": 0.9550830125808716, "step": 50 }, { "epoch": 0.17966516945692118, "grad_norm": 1.4945589303970337, "learning_rate": 4.955846286587122e-05, "logits/chosen": -0.647710919380188, "logits/rejected": -2.7208316326141357, "logps/chosen": -0.9462896585464478, "logps/rejected": -0.9507497549057007, "loss": 1.0148, "odds_ratio_loss": 0.6852728128433228, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0946289673447609, "rewards/margins": 0.0004460157942958176, "rewards/rejected": -0.09507497400045395, "sft_loss": 0.9462896585464478, "step": 55 }, { "epoch": 0.19599836668027767, "grad_norm": 1.3553156852722168, "learning_rate": 4.9474829307735115e-05, "logits/chosen": -0.6279743909835815, "logits/rejected": -2.6854496002197266, "logps/chosen": -0.8896121978759766, "logps/rejected": -0.9993942975997925, "loss": 0.9509, "odds_ratio_loss": 0.6129180192947388, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08896122127771378, "rewards/margins": 0.010978218168020248, "rewards/rejected": -0.09993942826986313, "sft_loss": 0.8896121978759766, "step": 60 }, { "epoch": 0.21233156390363414, "grad_norm": 1.4247411489486694, "learning_rate": 4.9384029969659573e-05, "logits/chosen": -0.698381781578064, "logits/rejected": -2.7967753410339355, "logps/chosen": -0.9455701112747192, "logps/rejected": -0.9564310908317566, "loss": 1.0147, "odds_ratio_loss": 0.6911824941635132, "rewards/accuracies": 0.5, "rewards/chosen": -0.0945570170879364, "rewards/margins": 0.0010860899928957224, "rewards/rejected": -0.09564311057329178, "sft_loss": 0.9455701112747192, "step": 65 }, { "epoch": 0.2286647611269906, "grad_norm": 1.6154508590698242, "learning_rate": 4.9286091436021015e-05, "logits/chosen": -0.595032274723053, "logits/rejected": -2.740784168243408, "logps/chosen": -0.9214372634887695, "logps/rejected": -1.0210167169570923, "loss": 0.9836, "odds_ratio_loss": 0.6212525963783264, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09214373677968979, "rewards/margins": 0.00995794590562582, "rewards/rejected": -0.10210166871547699, "sft_loss": 0.9214372634887695, "step": 70 }, { "epoch": 0.24499795835034707, "grad_norm": 1.8231102228164673, "learning_rate": 4.918104238142104e-05, "logits/chosen": -0.5186889171600342, "logits/rejected": -2.7576358318328857, "logps/chosen": -0.9789311289787292, "logps/rejected": -1.0169849395751953, "loss": 1.0454, "odds_ratio_loss": 0.6644267439842224, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.09789310395717621, "rewards/margins": 0.0038053765892982483, "rewards/rejected": -0.10169849544763565, "sft_loss": 0.9789311289787292, "step": 75 }, { "epoch": 0.26133115557370357, "grad_norm": 2.05936336517334, "learning_rate": 4.906891356229103e-05, "logits/chosen": -0.47254711389541626, "logits/rejected": -2.68280029296875, "logps/chosen": -0.9620391726493835, "logps/rejected": -1.0026928186416626, "loss": 1.0294, "odds_ratio_loss": 0.6731926202774048, "rewards/accuracies": 0.5, "rewards/chosen": -0.09620390832424164, "rewards/margins": 0.004065364599227905, "rewards/rejected": -0.10026928037405014, "sft_loss": 0.9620391726493835, "step": 80 }, { "epoch": 0.27766435279706003, "grad_norm": 1.6302626132965088, "learning_rate": 4.894973780788722e-05, "logits/chosen": -0.5315309762954712, "logits/rejected": -2.7227425575256348, "logps/chosen": -0.9423310160636902, "logps/rejected": -1.071781039237976, "loss": 1.0032, "odds_ratio_loss": 0.6090766191482544, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09423311054706573, "rewards/margins": 0.012944999150931835, "rewards/rejected": -0.10717810690402985, "sft_loss": 0.9423310160636902, "step": 85 }, { "epoch": 0.2939975500204165, "grad_norm": 1.7891793251037598, "learning_rate": 4.882355001067892e-05, "logits/chosen": -0.5275685787200928, "logits/rejected": -2.6127567291259766, "logps/chosen": -0.9351356625556946, "logps/rejected": -1.0083874464035034, "loss": 0.9995, "odds_ratio_loss": 0.6435801386833191, "rewards/accuracies": 0.625, "rewards/chosen": -0.09351354837417603, "rewards/margins": 0.0073251971043646336, "rewards/rejected": -0.10083875805139542, "sft_loss": 0.9351356625556946, "step": 90 }, { "epoch": 0.31033074724377296, "grad_norm": 1.852216124534607, "learning_rate": 4.869038711613259e-05, "logits/chosen": -0.5760647058486938, "logits/rejected": -2.722364664077759, "logps/chosen": -0.9301847219467163, "logps/rejected": -1.0749822854995728, "loss": 0.9903, "odds_ratio_loss": 0.601297914981842, "rewards/accuracies": 0.75, "rewards/chosen": -0.09301847964525223, "rewards/margins": 0.014479748904705048, "rewards/rejected": -0.10749821364879608, "sft_loss": 0.9301847219467163, "step": 95 }, { "epoch": 0.32666394446712943, "grad_norm": 1.8597307205200195, "learning_rate": 4.855028811189496e-05, "logits/chosen": -0.5871164202690125, "logits/rejected": -2.6969285011291504, "logps/chosen": -1.0701930522918701, "logps/rejected": -1.0556358098983765, "loss": 1.1427, "odds_ratio_loss": 0.7247555255889893, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.10701930522918701, "rewards/margins": -0.0014557272661477327, "rewards/rejected": -0.10556356608867645, "sft_loss": 1.0701930522918701, "step": 100 } ], "logging_steps": 5, "max_steps": 918, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 4.118738025735782e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }