{ "best_metric": 0.7562676668167114, "best_model_checkpoint": "./output/checkpoints/2024-05-27_09-00-27/checkpoint-100", "epoch": 1.0, "eval_steps": 100, "global_step": 198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025252525252525252, "grad_norm": 31.540218353271484, "learning_rate": 6e-06, "logits/chosen": 0.14386241137981415, "logits/rejected": -0.5877799391746521, "logps/chosen": -220.9837646484375, "logps/rejected": -171.7880096435547, "loss": 0.6932, "rewards/accuracies": 0.11249999701976776, "rewards/chosen": 0.004091186448931694, "rewards/margins": -7.568336877739057e-05, "rewards/rejected": 0.004166870377957821, "step": 5 }, { "epoch": 0.050505050505050504, "grad_norm": 23.380859375, "learning_rate": 1.6000000000000003e-05, "logits/chosen": -0.10832454264163971, "logits/rejected": -0.4757871627807617, "logps/chosen": -196.83120727539062, "logps/rejected": -171.42141723632812, "loss": 0.6684, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.33129221200942993, "rewards/margins": 0.08790162205696106, "rewards/rejected": 0.24339056015014648, "step": 10 }, { "epoch": 0.07575757575757576, "grad_norm": 23.615068435668945, "learning_rate": 2.6000000000000002e-05, "logits/chosen": -0.2714936137199402, "logits/rejected": -0.6976348161697388, "logps/chosen": -209.19827270507812, "logps/rejected": -173.0024871826172, "loss": 0.6943, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.1716283559799194, "rewards/margins": 0.2701141834259033, "rewards/rejected": 0.9015142321586609, "step": 15 }, { "epoch": 0.10101010101010101, "grad_norm": 23.117158889770508, "learning_rate": 3.6e-05, "logits/chosen": -0.4857279658317566, "logits/rejected": -0.9330118298530579, "logps/chosen": -202.5274200439453, "logps/rejected": -176.1457977294922, "loss": 0.6632, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.3160903453826904, "rewards/margins": 0.552257239818573, "rewards/rejected": 0.7638329863548279, "step": 20 }, { "epoch": 0.12626262626262627, "grad_norm": 31.966520309448242, "learning_rate": 3.997197144003557e-05, "logits/chosen": 0.038466982543468475, "logits/rejected": -0.503484845161438, "logps/chosen": -225.3045196533203, "logps/rejected": -201.3688201904297, "loss": 0.6399, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8800640106201172, "rewards/margins": 0.41275835037231445, "rewards/rejected": 0.46730566024780273, "step": 25 }, { "epoch": 0.15151515151515152, "grad_norm": 33.95348358154297, "learning_rate": 3.980097021028909e-05, "logits/chosen": -0.036555200815200806, "logits/rejected": -0.4494614005088806, "logps/chosen": -230.3584747314453, "logps/rejected": -197.8057098388672, "loss": 0.7822, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.15881267189979553, "rewards/margins": 0.23805825412273407, "rewards/rejected": -0.07924561202526093, "step": 30 }, { "epoch": 0.17676767676767677, "grad_norm": 30.72279930114746, "learning_rate": 3.947586836927601e-05, "logits/chosen": -0.24093489348888397, "logits/rejected": -0.8071072697639465, "logps/chosen": -206.206787109375, "logps/rejected": -173.3004608154297, "loss": 0.6694, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3433918356895447, "rewards/margins": 0.5247436761856079, "rewards/rejected": -0.18135181069374084, "step": 35 }, { "epoch": 0.20202020202020202, "grad_norm": 29.44301986694336, "learning_rate": 3.899919601485982e-05, "logits/chosen": -0.5006700754165649, "logits/rejected": -1.0141886472702026, "logps/chosen": -211.9564666748047, "logps/rejected": -185.8589630126953, "loss": 0.6946, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.9416486620903015, "rewards/margins": 0.5160013437271118, "rewards/rejected": 0.4256472587585449, "step": 40 }, { "epoch": 0.22727272727272727, "grad_norm": 21.33418083190918, "learning_rate": 3.837466283906112e-05, "logits/chosen": -0.37220650911331177, "logits/rejected": -1.1134871244430542, "logps/chosen": -203.7546844482422, "logps/rejected": -160.15695190429688, "loss": 0.7258, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.9558101892471313, "rewards/margins": 0.6899305582046509, "rewards/rejected": 1.2658796310424805, "step": 45 }, { "epoch": 0.25252525252525254, "grad_norm": 29.768083572387695, "learning_rate": 3.760712925746183e-05, "logits/chosen": -0.40102189779281616, "logits/rejected": -0.8553081750869751, "logps/chosen": -213.06222534179688, "logps/rejected": -184.9990997314453, "loss": 0.7829, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.8542065620422363, "rewards/margins": 1.1129443645477295, "rewards/rejected": 1.7412618398666382, "step": 50 }, { "epoch": 0.2777777777777778, "grad_norm": 21.650400161743164, "learning_rate": 3.6702568583128715e-05, "logits/chosen": -0.5856252908706665, "logits/rejected": -1.2530784606933594, "logps/chosen": -204.8536834716797, "logps/rejected": -151.60122680664062, "loss": 0.6457, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.6728694438934326, "rewards/margins": 1.0119760036468506, "rewards/rejected": 0.6608934998512268, "step": 55 }, { "epoch": 0.30303030303030304, "grad_norm": 25.246517181396484, "learning_rate": 3.566802053943705e-05, "logits/chosen": -0.56941157579422, "logits/rejected": -1.02021062374115, "logps/chosen": -209.837158203125, "logps/rejected": -172.53732299804688, "loss": 0.763, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.4966418743133545, "rewards/margins": 0.739094078540802, "rewards/rejected": 0.7575478553771973, "step": 60 }, { "epoch": 0.3282828282828283, "grad_norm": 17.691490173339844, "learning_rate": 3.451153647357965e-05, "logits/chosen": -0.05042291432619095, "logits/rejected": -0.6024894118309021, "logps/chosen": -234.263427734375, "logps/rejected": -189.74607849121094, "loss": 0.7844, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.7601999044418335, "rewards/margins": 0.520693302154541, "rewards/rejected": 1.2395066022872925, "step": 65 }, { "epoch": 0.35353535353535354, "grad_norm": 26.0878849029541, "learning_rate": 3.3242116697136015e-05, "logits/chosen": -0.10399909317493439, "logits/rejected": -0.5000227093696594, "logps/chosen": -210.8389434814453, "logps/rejected": -188.54824829101562, "loss": 0.7101, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.1393743753433228, "rewards/margins": 0.3506864905357361, "rewards/rejected": 0.7886878848075867, "step": 70 }, { "epoch": 0.3787878787878788, "grad_norm": 24.767913818359375, "learning_rate": 3.186964044134713e-05, "logits/chosen": -0.6071327328681946, "logits/rejected": -0.8622013926506042, "logps/chosen": -188.81149291992188, "logps/rejected": -173.66650390625, "loss": 0.769, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.9471501111984253, "rewards/margins": 0.3631848394870758, "rewards/rejected": 0.5839653611183167, "step": 75 }, { "epoch": 0.40404040404040403, "grad_norm": 27.299219131469727, "learning_rate": 3.0404788972217645e-05, "logits/chosen": -0.47333812713623047, "logits/rejected": -0.855948805809021, "logps/chosen": -232.19448852539062, "logps/rejected": -185.18539428710938, "loss": 0.7493, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.0035685300827026, "rewards/margins": 0.4641413688659668, "rewards/rejected": 0.5394272208213806, "step": 80 }, { "epoch": 0.4292929292929293, "grad_norm": 11.157441139221191, "learning_rate": 2.8858962463800163e-05, "logits/chosen": -0.3463514745235443, "logits/rejected": -1.0284178256988525, "logps/chosen": -227.76937866210938, "logps/rejected": -173.3926239013672, "loss": 0.7012, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.2513014078140259, "rewards/margins": 0.724143922328949, "rewards/rejected": 0.5271574854850769, "step": 85 }, { "epoch": 0.45454545454545453, "grad_norm": 21.36394500732422, "learning_rate": 2.7244191276593653e-05, "logits/chosen": -0.396036297082901, "logits/rejected": -0.6990352272987366, "logps/chosen": -198.6284942626953, "logps/rejected": -180.7591094970703, "loss": 0.7398, "rewards/accuracies": 0.5625, "rewards/chosen": 1.1550531387329102, "rewards/margins": 0.42906150221824646, "rewards/rejected": 0.7259916067123413, "step": 90 }, { "epoch": 0.4797979797979798, "grad_norm": 21.15500831604004, "learning_rate": 2.5573042331529846e-05, "logits/chosen": -0.5320285558700562, "logits/rejected": -0.9789053797721863, "logps/chosen": -216.59359741210938, "logps/rejected": -175.47976684570312, "loss": 0.7284, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1451869010925293, "rewards/margins": 0.508368968963623, "rewards/rejected": 0.6368179321289062, "step": 95 }, { "epoch": 0.5050505050505051, "grad_norm": 25.441944122314453, "learning_rate": 2.385852130818994e-05, "logits/chosen": -0.38838592171669006, "logits/rejected": -0.7203149795532227, "logps/chosen": -212.9621124267578, "logps/rejected": -187.50942993164062, "loss": 0.7677, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.5237963199615479, "rewards/margins": 0.5711702108383179, "rewards/rejected": 0.95262610912323, "step": 100 }, { "epoch": 0.5050505050505051, "eval_logits/chosen": 1.4398491382598877, "eval_logits/rejected": 1.2089641094207764, "eval_logps/chosen": -209.8059844970703, "eval_logps/rejected": -178.4928741455078, "eval_loss": 0.7562676668167114, "eval_rewards/accuracies": 0.59375, "eval_rewards/chosen": 1.1455074548721313, "eval_rewards/margins": 0.41021832823753357, "eval_rewards/rejected": 0.7352891564369202, "eval_runtime": 274.7063, "eval_samples_per_second": 2.33, "eval_steps_per_second": 0.146, "step": 100 }, { "epoch": 0.5303030303030303, "grad_norm": 18.52869415283203, "learning_rate": 2.2113971428391862e-05, "logits/chosen": -0.5178086757659912, "logits/rejected": -1.0226086378097534, "logps/chosen": -208.1871795654297, "logps/rejected": -171.2388458251953, "loss": 0.7824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9819921255111694, "rewards/margins": 0.44732803106307983, "rewards/rejected": 0.5346641540527344, "step": 105 }, { "epoch": 0.5555555555555556, "grad_norm": 18.219402313232422, "learning_rate": 2.0352969612862576e-05, "logits/chosen": -0.5200484395027161, "logits/rejected": -0.8673607707023621, "logps/chosen": -207.2660369873047, "logps/rejected": -179.528564453125, "loss": 0.6481, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8134071230888367, "rewards/margins": 0.5864471197128296, "rewards/rejected": 0.2269599884748459, "step": 110 }, { "epoch": 0.5808080808080808, "grad_norm": 22.682283401489258, "learning_rate": 1.858922081915378e-05, "logits/chosen": -0.3367950916290283, "logits/rejected": -0.7870457768440247, "logps/chosen": -187.28225708007812, "logps/rejected": -167.66563415527344, "loss": 0.6859, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.0122456550598145, "rewards/margins": 0.42765671014785767, "rewards/rejected": 0.5845889449119568, "step": 115 }, { "epoch": 0.6060606060606061, "grad_norm": 21.8991756439209, "learning_rate": 1.6836451383113923e-05, "logits/chosen": -0.6206024885177612, "logits/rejected": -1.1339770555496216, "logps/chosen": -216.4201202392578, "logps/rejected": -181.8843536376953, "loss": 0.7218, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1707611083984375, "rewards/margins": 0.4338921010494232, "rewards/rejected": 0.7368690371513367, "step": 120 }, { "epoch": 0.6313131313131313, "grad_norm": 26.37303352355957, "learning_rate": 1.5108302193984004e-05, "logits/chosen": -0.3522421717643738, "logits/rejected": -1.013496994972229, "logps/chosen": -231.60025024414062, "logps/rejected": -174.62521362304688, "loss": 0.631, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.5266729593276978, "rewards/margins": 0.6238378286361694, "rewards/rejected": 0.9028350710868835, "step": 125 }, { "epoch": 0.6565656565656566, "grad_norm": 19.788497924804688, "learning_rate": 1.3752556459724117e-05, "logits/chosen": -0.44988712668418884, "logits/rejected": -0.899565577507019, "logps/chosen": -206.1852264404297, "logps/rejected": -188.783447265625, "loss": 0.685, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.4883193969726562, "rewards/margins": 0.6105667352676392, "rewards/rejected": 0.8777526021003723, "step": 130 }, { "epoch": 0.6818181818181818, "grad_norm": 17.837421417236328, "learning_rate": 1.2102420272588178e-05, "logits/chosen": -0.269491583108902, "logits/rejected": -0.7996856570243835, "logps/chosen": -219.24240112304688, "logps/rejected": -178.4714813232422, "loss": 0.6356, "rewards/accuracies": 0.6875, "rewards/chosen": 1.4494140148162842, "rewards/margins": 0.8258872032165527, "rewards/rejected": 0.6235266923904419, "step": 135 }, { "epoch": 0.7070707070707071, "grad_norm": 30.42537498474121, "learning_rate": 1.0513746824428951e-05, "logits/chosen": -0.6660299897193909, "logits/rejected": -1.161827564239502, "logps/chosen": -225.38668823242188, "logps/rejected": -188.2053985595703, "loss": 0.7771, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.9760153889656067, "rewards/margins": 0.37082913517951965, "rewards/rejected": 0.6051862239837646, "step": 140 }, { "epoch": 0.7323232323232324, "grad_norm": 21.641803741455078, "learning_rate": 8.998899931103173e-06, "logits/chosen": -0.6154942512512207, "logits/rejected": -1.150761365890503, "logps/chosen": -205.3412628173828, "logps/rejected": -176.29917907714844, "loss": 0.6959, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9418373107910156, "rewards/margins": 0.5389925241470337, "rewards/rejected": 0.40284472703933716, "step": 145 }, { "epoch": 0.7575757575757576, "grad_norm": 17.89181137084961, "learning_rate": 7.569668854942815e-06, "logits/chosen": -0.2999069094657898, "logits/rejected": -0.8618285059928894, "logps/chosen": -208.3065948486328, "logps/rejected": -163.25619506835938, "loss": 0.6386, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9588847160339355, "rewards/margins": 0.6155614256858826, "rewards/rejected": 0.343323290348053, "step": 150 }, { "epoch": 0.7828282828282829, "grad_norm": 20.726341247558594, "learning_rate": 6.237176555082625e-06, "logits/chosen": -0.09209189563989639, "logits/rejected": -0.40321072936058044, "logps/chosen": -221.6671905517578, "logps/rejected": -206.04989624023438, "loss": 0.7131, "rewards/accuracies": 0.625, "rewards/chosen": 0.885414719581604, "rewards/margins": 0.5608201622962952, "rewards/rejected": 0.32459449768066406, "step": 155 }, { "epoch": 0.8080808080808081, "grad_norm": 16.55711555480957, "learning_rate": 5.0117931232775e-06, "logits/chosen": -0.19318969547748566, "logits/rejected": -0.7406075596809387, "logps/chosen": -207.5948944091797, "logps/rejected": -176.14010620117188, "loss": 0.6493, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8107932209968567, "rewards/margins": 0.5306531190872192, "rewards/rejected": 0.28014007210731506, "step": 160 }, { "epoch": 0.8333333333333334, "grad_norm": 17.550378799438477, "learning_rate": 3.903055078893489e-06, "logits/chosen": -0.05737120658159256, "logits/rejected": -0.6579563617706299, "logps/chosen": -220.5160675048828, "logps/rejected": -174.6505584716797, "loss": 0.7329, "rewards/accuracies": 0.625, "rewards/chosen": 0.9277563095092773, "rewards/margins": 0.7453610301017761, "rewards/rejected": 0.18239526450634003, "step": 165 }, { "epoch": 0.8585858585858586, "grad_norm": 33.4338493347168, "learning_rate": 2.919591151157475e-06, "logits/chosen": -0.6612521409988403, "logits/rejected": -1.0171642303466797, "logps/chosen": -191.87083435058594, "logps/rejected": -168.1193084716797, "loss": 0.7216, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.65160071849823, "rewards/margins": 0.4179254472255707, "rewards/rejected": 0.2336752861738205, "step": 170 }, { "epoch": 0.8838383838383839, "grad_norm": 18.22390365600586, "learning_rate": 2.069055126263433e-06, "logits/chosen": -0.7078531384468079, "logits/rejected": -0.9975414276123047, "logps/chosen": -177.2061004638672, "logps/rejected": -164.01715087890625, "loss": 0.7113, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7737497091293335, "rewards/margins": 0.5676254034042358, "rewards/rejected": 0.20612427592277527, "step": 175 }, { "epoch": 0.9090909090909091, "grad_norm": 17.45758056640625, "learning_rate": 1.3580662819512093e-06, "logits/chosen": -0.49047690629959106, "logits/rejected": -0.9685350656509399, "logps/chosen": -211.9286346435547, "logps/rejected": -177.24661254882812, "loss": 0.6725, "rewards/accuracies": 0.625, "rewards/chosen": 0.8009859323501587, "rewards/margins": 0.5601747632026672, "rewards/rejected": 0.24081113934516907, "step": 180 }, { "epoch": 0.9343434343434344, "grad_norm": 24.04217529296875, "learning_rate": 7.92157873124415e-07, "logits/chosen": -0.42594170570373535, "logits/rejected": -1.1140912771224976, "logps/chosen": -225.57559204101562, "logps/rejected": -178.7427520751953, "loss": 0.645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0804827213287354, "rewards/margins": 0.6804853081703186, "rewards/rejected": 0.39999738335609436, "step": 185 }, { "epoch": 0.9595959595959596, "grad_norm": 30.535690307617188, "learning_rate": 3.757340694169109e-07, "logits/chosen": -0.4965648651123047, "logits/rejected": -0.7092806696891785, "logps/chosen": -183.77874755859375, "logps/rejected": -176.67739868164062, "loss": 0.7775, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7033542394638062, "rewards/margins": 0.2690446078777313, "rewards/rejected": 0.4343096613883972, "step": 190 }, { "epoch": 0.9848484848484849, "grad_norm": 17.108673095703125, "learning_rate": 1.1203567984036101e-07, "logits/chosen": -0.38576817512512207, "logits/rejected": -1.0843629837036133, "logps/chosen": -222.65896606445312, "logps/rejected": -185.77786254882812, "loss": 0.5822, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.9926906824111938, "rewards/margins": 0.9567824602127075, "rewards/rejected": 0.03590827062726021, "step": 195 }, { "epoch": 1.0, "step": 198, "total_flos": 0.0, "train_loss": 0.7032270744593456, "train_runtime": 3353.7827, "train_samples_per_second": 0.943, "train_steps_per_second": 0.059 } ], "logging_steps": 5, "max_steps": 198, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }