{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.986666666666667, "eval_steps": 500, "global_step": 336, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08888888888888889, "grad_norm": 1.792281150817871, "kl": 0.15646514296531677, "learning_rate": 1.4705882352941177e-06, "logps/chosen": -164.1169189453125, "loss": 0.5043, "rewards/chosen": -0.0013721466064453125, "step": 10 }, { "epoch": 0.17777777777777778, "grad_norm": 1.7645841836929321, "kl": 0.23952817916870117, "learning_rate": 2.9411764705882355e-06, "logps/chosen": -161.21431884765624, "loss": 0.506, "rewards/chosen": -0.000222191633656621, "step": 20 }, { "epoch": 0.26666666666666666, "grad_norm": 1.6355303525924683, "kl": 0.2789602279663086, "learning_rate": 4.411764705882353e-06, "logps/chosen": -161.8250244140625, "loss": 0.5046, "rewards/chosen": 0.009497338533401489, "step": 30 }, { "epoch": 0.35555555555555557, "grad_norm": 1.8716518878936768, "kl": 0.5364601016044617, "learning_rate": 4.995131923687488e-06, "logps/chosen": -160.8272705078125, "loss": 0.5011, "rewards/chosen": 0.04936442375183105, "step": 40 }, { "epoch": 0.4444444444444444, "grad_norm": 1.7810463905334473, "kl": 1.5138229131698608, "learning_rate": 4.965451197130373e-06, "logps/chosen": -159.2620361328125, "loss": 0.5081, "rewards/chosen": 0.11891223192214966, "step": 50 }, { "epoch": 0.5333333333333333, "grad_norm": 2.2072041034698486, "kl": 3.1553397178649902, "learning_rate": 4.90911473983908e-06, "logps/chosen": -163.2381103515625, "loss": 0.5049, "rewards/chosen": 0.295930004119873, "step": 60 }, { "epoch": 0.6222222222222222, "grad_norm": 1.858336329460144, "kl": 6.15082311630249, "learning_rate": 4.826731644963705e-06, "logps/chosen": -160.2790771484375, "loss": 0.5238, "rewards/chosen": 0.517311429977417, "step": 70 }, { "epoch": 0.7111111111111111, "grad_norm": 2.1484944820404053, "kl": 9.094244003295898, "learning_rate": 4.71919261421297e-06, "logps/chosen": -165.65592041015626, "loss": 0.4949, "rewards/chosen": 0.9306423187255859, "step": 80 }, { "epoch": 0.8, "grad_norm": 1.9223041534423828, "kl": 12.533430099487305, "learning_rate": 4.587660327850203e-06, "logps/chosen": -147.84144287109376, "loss": 0.5327, "rewards/chosen": 1.1160342216491699, "step": 90 }, { "epoch": 0.8888888888888888, "grad_norm": 2.052889823913574, "kl": 15.455533981323242, "learning_rate": 4.43355687413747e-06, "logps/chosen": -141.32998046875, "loss": 0.5258, "rewards/chosen": 1.4277078628540039, "step": 100 }, { "epoch": 0.9777777777777777, "grad_norm": 1.8008378744125366, "kl": 18.05435562133789, "learning_rate": 4.258548374136976e-06, "logps/chosen": -142.55394287109374, "loss": 0.5491, "rewards/chosen": 1.5885238647460938, "step": 110 }, { "epoch": 1.0666666666666667, "grad_norm": 1.9117740392684937, "kl": 20.216676712036133, "learning_rate": 4.064526968101844e-06, "logps/chosen": -138.30635986328124, "loss": 0.5418, "rewards/chosen": 1.8262062072753906, "step": 120 }, { "epoch": 1.1555555555555554, "grad_norm": 2.0452499389648438, "kl": 22.620920181274414, "learning_rate": 3.853590358214119e-06, "logps/chosen": -142.5436767578125, "loss": 0.5414, "rewards/chosen": 2.02716178894043, "step": 130 }, { "epoch": 1.2444444444444445, "grad_norm": 1.5861475467681885, "kl": 21.474929809570312, "learning_rate": 3.6280191288478437e-06, "logps/chosen": -139.66346435546876, "loss": 0.5436, "rewards/chosen": 1.9418670654296875, "step": 140 }, { "epoch": 1.3333333333333333, "grad_norm": 2.040761947631836, "kl": 24.884174346923828, "learning_rate": 3.3902520895638674e-06, "logps/chosen": -144.4580078125, "loss": 0.5551, "rewards/chosen": 2.2067359924316405, "step": 150 }, { "epoch": 1.4222222222222223, "grad_norm": 1.4137938022613525, "kl": 24.920988082885742, "learning_rate": 3.142859907420615e-06, "logps/chosen": -134.41856689453124, "loss": 0.5452, "rewards/chosen": 2.2827281951904297, "step": 160 }, { "epoch": 1.511111111111111, "grad_norm": 1.9247097969055176, "kl": 27.40714454650879, "learning_rate": 2.8885173136805126e-06, "logps/chosen": -134.56304931640625, "loss": 0.5554, "rewards/chosen": 2.478318786621094, "step": 170 }, { "epoch": 1.6, "grad_norm": 1.7178384065628052, "kl": 26.36468505859375, "learning_rate": 2.629974185404951e-06, "logps/chosen": -140.7825927734375, "loss": 0.5482, "rewards/chosen": 2.3276464462280275, "step": 180 }, { "epoch": 1.6888888888888889, "grad_norm": 1.899584174156189, "kl": 27.777027130126953, "learning_rate": 2.3700258145950495e-06, "logps/chosen": -140.53466796875, "loss": 0.5618, "rewards/chosen": 2.424651336669922, "step": 190 }, { "epoch": 1.7777777777777777, "grad_norm": 1.7362196445465088, "kl": 27.96476173400879, "learning_rate": 2.1114826863194882e-06, "logps/chosen": -132.751318359375, "loss": 0.5549, "rewards/chosen": 2.5185680389404297, "step": 200 }, { "epoch": 1.8666666666666667, "grad_norm": 1.5568628311157227, "kl": 25.686458587646484, "learning_rate": 1.8571400925793855e-06, "logps/chosen": -141.460986328125, "loss": 0.4799, "rewards/chosen": 2.678207778930664, "step": 210 }, { "epoch": 1.9555555555555557, "grad_norm": 1.4741252660751343, "kl": 27.350994110107422, "learning_rate": 1.6097479104361328e-06, "logps/chosen": -138.36417236328126, "loss": 0.5164, "rewards/chosen": 2.5984352111816404, "step": 220 }, { "epoch": 2.0444444444444443, "grad_norm": 1.4206268787384033, "kl": 27.860248565673828, "learning_rate": 1.3719808711521573e-06, "logps/chosen": -140.159765625, "loss": 0.5315, "rewards/chosen": 2.6533029556274412, "step": 230 }, { "epoch": 2.1333333333333333, "grad_norm": 1.964991569519043, "kl": 28.185623168945312, "learning_rate": 1.1464096417858821e-06, "logps/chosen": -136.175439453125, "loss": 0.5194, "rewards/chosen": 2.697865676879883, "step": 240 }, { "epoch": 2.2222222222222223, "grad_norm": 2.0303757190704346, "kl": 29.453914642333984, "learning_rate": 9.354730318981561e-07, "logps/chosen": -124.939111328125, "loss": 0.5565, "rewards/chosen": 2.6518789291381837, "step": 250 }, { "epoch": 2.311111111111111, "grad_norm": 1.9753375053405762, "kl": 28.224218368530273, "learning_rate": 7.414516258630245e-07, "logps/chosen": -143.79691162109376, "loss": 0.4645, "rewards/chosen": 2.974603271484375, "step": 260 }, { "epoch": 2.4, "grad_norm": 1.5573641061782837, "kl": 28.22760581970215, "learning_rate": 5.664431258625305e-07, "logps/chosen": -135.832470703125, "loss": 0.5039, "rewards/chosen": 2.787263107299805, "step": 270 }, { "epoch": 2.488888888888889, "grad_norm": 1.8748925924301147, "kl": 29.364704132080078, "learning_rate": 4.123396721497977e-07, "logps/chosen": -132.555029296875, "loss": 0.5659, "rewards/chosen": 2.5855674743652344, "step": 280 }, { "epoch": 2.5777777777777775, "grad_norm": 1.9802337884902954, "kl": 29.822818756103516, "learning_rate": 2.8080738578703054e-07, "logps/chosen": -132.623876953125, "loss": 0.5254, "rewards/chosen": 2.7944366455078127, "step": 290 }, { "epoch": 2.6666666666666665, "grad_norm": 1.6388640403747559, "kl": 29.55389404296875, "learning_rate": 1.7326835503629542e-07, "logps/chosen": -132.94588623046874, "loss": 0.5661, "rewards/chosen": 2.6015438079833983, "step": 300 }, { "epoch": 2.7555555555555555, "grad_norm": 2.059163808822632, "kl": 28.346817016601562, "learning_rate": 9.088526016092142e-08, "logps/chosen": -142.41676025390626, "loss": 0.4982, "rewards/chosen": 2.8422063827514648, "step": 310 }, { "epoch": 2.8444444444444446, "grad_norm": 1.8374332189559937, "kl": 28.879934310913086, "learning_rate": 3.4548802869627806e-08, "logps/chosen": -133.9089599609375, "loss": 0.508, "rewards/chosen": 2.8151947021484376, "step": 320 }, { "epoch": 2.9333333333333336, "grad_norm": 1.6760042905807495, "kl": 28.577600479125977, "learning_rate": 4.868076312512515e-09, "logps/chosen": -133.6552001953125, "loss": 0.5269, "rewards/chosen": 2.694635200500488, "step": 330 } ], "logging_steps": 10, "max_steps": 336, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.5905450080116736e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }