diff --git "a/checkpoint-3000/trainer_state.json" "b/checkpoint-3000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-3000/trainer_state.json" @@ -0,0 +1,54021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.8328611898017, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 15.287191390991211, + "learning_rate": 1.5723270440251572e-09, + "logps/chosen": -36.293212890625, + "logps/rejected": -54.14521789550781, + "loss": 0.6931, + "losses/dpo": 0.6931471824645996, + "losses/sft": 1.333309292793274, + "losses/total": 0.6931471824645996, + "ref_logps/chosen": -36.293212890625, + "ref_logps/rejected": -54.14521789550781, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 16.00140953063965, + "learning_rate": 3.1446540880503143e-09, + "logps/chosen": -44.562477111816406, + "logps/rejected": -46.48662185668945, + "loss": 0.6931, + "losses/dpo": 0.6931471824645996, + "losses/sft": 0.9847710132598877, + "losses/total": 0.6931471824645996, + "ref_logps/chosen": -44.562477111816406, + "ref_logps/rejected": -46.48662185668945, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 16.550865173339844, + "learning_rate": 4.716981132075472e-09, + "logps/chosen": -41.267608642578125, + "logps/rejected": -55.37574768066406, + "loss": 0.6946, + "losses/dpo": 0.6985788941383362, + "losses/sft": 1.231009840965271, + "losses/total": 0.6985788941383362, + "ref_logps/chosen": -41.19807434082031, + "ref_logps/rejected": -55.33381652832031, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.006953191943466663, + "rewards/margins": -0.0027600531466305256, + "rewards/rejected": -0.0041931383311748505, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 19.647977828979492, + "learning_rate": 6.289308176100629e-09, + "logps/chosen": -31.251550674438477, + "logps/rejected": -41.84539031982422, + "loss": 0.6929, + "losses/dpo": 0.6954505443572998, + "losses/sft": 1.0519485473632812, + "losses/total": 0.6954505443572998, + "ref_logps/chosen": -31.226787567138672, + "ref_logps/rejected": -41.81562805175781, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.002476316411048174, + "rewards/margins": 0.0005000412929803133, + "rewards/rejected": -0.002976357936859131, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 13.318140983581543, + "learning_rate": 7.861635220125786e-09, + "logps/chosen": -32.866783142089844, + "logps/rejected": -37.394134521484375, + "loss": 0.6929, + "losses/dpo": 0.6889871954917908, + "losses/sft": 1.273924469947815, + "losses/total": 0.6889871954917908, + "ref_logps/chosen": -32.928836822509766, + "ref_logps/rejected": -37.45051193237305, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.006205156445503235, + "rewards/margins": 0.0005674933781847358, + "rewards/rejected": 0.005637663416564465, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 14.464860916137695, + "learning_rate": 9.433962264150943e-09, + "logps/chosen": -40.548370361328125, + "logps/rejected": -41.034645080566406, + "loss": 0.6927, + "losses/dpo": 0.6911370754241943, + "losses/sft": 1.591616153717041, + "losses/total": 0.6911370754241943, + "ref_logps/chosen": -40.54157257080078, + "ref_logps/rejected": -41.017581939697266, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.000679713673889637, + "rewards/margins": 0.0010262848809361458, + "rewards/rejected": -0.0017059982055798173, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 13.795360565185547, + "learning_rate": 1.1006289308176099e-08, + "logps/chosen": -40.470916748046875, + "logps/rejected": -42.38670349121094, + "loss": 0.6955, + "losses/dpo": 0.6914644837379456, + "losses/sft": 1.4545750617980957, + "losses/total": 0.6914644837379456, + "ref_logps/chosen": -40.477840423583984, + "ref_logps/rejected": -42.43923568725586, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0006923316977918148, + "rewards/margins": -0.004560908768326044, + "rewards/rejected": 0.005253240931779146, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 16.392980575561523, + "learning_rate": 1.2578616352201257e-08, + "logps/chosen": -42.82190704345703, + "logps/rejected": -50.23805236816406, + "loss": 0.6902, + "losses/dpo": 0.6934096217155457, + "losses/sft": 1.8958083391189575, + "losses/total": 0.6934096217155457, + "ref_logps/chosen": -42.789737701416016, + "ref_logps/rejected": -50.1453857421875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0032165111042559147, + "rewards/margins": 0.006050032563507557, + "rewards/rejected": -0.00926654227077961, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 14.590455055236816, + "learning_rate": 1.4150943396226414e-08, + "logps/chosen": -32.913726806640625, + "logps/rejected": -40.874839782714844, + "loss": 0.696, + "losses/dpo": 0.6927824020385742, + "losses/sft": 1.417920470237732, + "losses/total": 0.6927824020385742, + "ref_logps/chosen": -32.87944793701172, + "ref_logps/rejected": -40.89663314819336, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.003427929012104869, + "rewards/margins": -0.005606940947473049, + "rewards/rejected": 0.0021790117025375366, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 13.96774959564209, + "learning_rate": 1.5723270440251573e-08, + "logps/chosen": -33.643367767333984, + "logps/rejected": -39.197227478027344, + "loss": 0.6936, + "losses/dpo": 0.6882840394973755, + "losses/sft": 1.5314520597457886, + "losses/total": 0.6882840394973755, + "ref_logps/chosen": -33.65266036987305, + "ref_logps/rejected": -39.214569091796875, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0009294033516198397, + "rewards/margins": -0.0008048177696764469, + "rewards/rejected": 0.0017342206556349993, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 15.812729835510254, + "learning_rate": 1.729559748427673e-08, + "logps/chosen": -33.84188461303711, + "logps/rejected": -42.659942626953125, + "loss": 0.6923, + "losses/dpo": 0.6912386417388916, + "losses/sft": 1.217553973197937, + "losses/total": 0.6912386417388916, + "ref_logps/chosen": -33.8734130859375, + "ref_logps/rejected": -42.67314529418945, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0031529488041996956, + "rewards/margins": 0.0018325985874980688, + "rewards/rejected": 0.0013203503331169486, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 13.129110336303711, + "learning_rate": 1.8867924528301887e-08, + "logps/chosen": -35.366729736328125, + "logps/rejected": -36.78556823730469, + "loss": 0.6954, + "losses/dpo": 0.6976528167724609, + "losses/sft": 1.3461360931396484, + "losses/total": 0.6976528167724609, + "ref_logps/chosen": -35.39350891113281, + "ref_logps/rejected": -36.857025146484375, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0026777982711791992, + "rewards/margins": -0.004467433784157038, + "rewards/rejected": 0.007145232520997524, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 15.79524040222168, + "learning_rate": 2.044025157232704e-08, + "logps/chosen": -42.27802658081055, + "logps/rejected": -46.32032012939453, + "loss": 0.6964, + "losses/dpo": 0.704017162322998, + "losses/sft": 1.3899930715560913, + "losses/total": 0.704017162322998, + "ref_logps/chosen": -42.27894973754883, + "ref_logps/rejected": -46.3837890625, + "rewards/accuracies": 0.25, + "rewards/chosen": 9.242584928870201e-05, + "rewards/margins": -0.006254673004150391, + "rewards/rejected": 0.006347098853439093, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 13.720596313476562, + "learning_rate": 2.2012578616352197e-08, + "logps/chosen": -35.25041961669922, + "logps/rejected": -43.295013427734375, + "loss": 0.691, + "losses/dpo": 0.681664764881134, + "losses/sft": 1.2782015800476074, + "losses/total": 0.681664764881134, + "ref_logps/chosen": -35.20989227294922, + "ref_logps/rejected": -43.21023178100586, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.004052662290632725, + "rewards/margins": 0.004425554536283016, + "rewards/rejected": -0.00847821868956089, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 13.796195983886719, + "learning_rate": 2.3584905660377358e-08, + "logps/chosen": -35.010528564453125, + "logps/rejected": -36.66010665893555, + "loss": 0.6938, + "losses/dpo": 0.6871126890182495, + "losses/sft": 1.4939182996749878, + "losses/total": 0.6871126890182495, + "ref_logps/chosen": -35.017948150634766, + "ref_logps/rejected": -36.68010711669922, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0007417916785925627, + "rewards/margins": -0.001258552074432373, + "rewards/rejected": 0.0020003439858555794, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 13.905455589294434, + "learning_rate": 2.5157232704402515e-08, + "logps/chosen": -35.276771545410156, + "logps/rejected": -30.285860061645508, + "loss": 0.6959, + "losses/dpo": 0.6939616203308105, + "losses/sft": 1.1386065483093262, + "losses/total": 0.6939616203308105, + "ref_logps/chosen": -35.17827224731445, + "ref_logps/rejected": -30.2420597076416, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0098500307649374, + "rewards/margins": -0.005470088683068752, + "rewards/rejected": -0.004379943013191223, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 15.296125411987305, + "learning_rate": 2.672955974842767e-08, + "logps/chosen": -47.09111022949219, + "logps/rejected": -43.19539260864258, + "loss": 0.6922, + "losses/dpo": 0.6896570920944214, + "losses/sft": 1.3599190711975098, + "losses/total": 0.6896570920944214, + "ref_logps/chosen": -47.078372955322266, + "ref_logps/rejected": -43.1630859375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0012737751239910722, + "rewards/margins": 0.0019566123373806477, + "rewards/rejected": -0.0032303868792951107, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 16.253629684448242, + "learning_rate": 2.830188679245283e-08, + "logps/chosen": -35.313194274902344, + "logps/rejected": -49.410675048828125, + "loss": 0.6973, + "losses/dpo": 0.7041923999786377, + "losses/sft": 1.6521106958389282, + "losses/total": 0.7041923999786377, + "ref_logps/chosen": -35.28705978393555, + "ref_logps/rejected": -49.466217041015625, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.002613651566207409, + "rewards/margins": -0.008167792111635208, + "rewards/rejected": 0.005554139614105225, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 17.191499710083008, + "learning_rate": 2.987421383647799e-08, + "logps/chosen": -48.732940673828125, + "logps/rejected": -58.556121826171875, + "loss": 0.6931, + "losses/dpo": 0.6878706812858582, + "losses/sft": 1.5482776165008545, + "losses/total": 0.6878706812858582, + "ref_logps/chosen": -48.78472137451172, + "ref_logps/rejected": -58.60450744628906, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.005178308114409447, + "rewards/margins": 0.0003401516005396843, + "rewards/rejected": 0.004838156513869762, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 15.549980163574219, + "learning_rate": 3.1446540880503146e-08, + "logps/chosen": -36.84483337402344, + "logps/rejected": -39.059471130371094, + "loss": 0.6956, + "losses/dpo": 0.699995756149292, + "losses/sft": 1.1807096004486084, + "losses/total": 0.699995756149292, + "ref_logps/chosen": -36.83661651611328, + "ref_logps/rejected": -39.09865188598633, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.000821572495624423, + "rewards/margins": -0.004739719443023205, + "rewards/rejected": 0.003918147645890713, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 14.073959350585938, + "learning_rate": 3.30188679245283e-08, + "logps/chosen": -36.344810485839844, + "logps/rejected": -41.956016540527344, + "loss": 0.6923, + "losses/dpo": 0.6898228526115417, + "losses/sft": 1.222610354423523, + "losses/total": 0.6898228526115417, + "ref_logps/chosen": -36.35798263549805, + "ref_logps/rejected": -41.95195770263672, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0013175427448004484, + "rewards/margins": 0.0017231464153155684, + "rewards/rejected": -0.0004056035540997982, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 14.14978313446045, + "learning_rate": 3.459119496855346e-08, + "logps/chosen": -33.29254150390625, + "logps/rejected": -47.185489654541016, + "loss": 0.6939, + "losses/dpo": 0.6981321573257446, + "losses/sft": 1.4130489826202393, + "losses/total": 0.6981321573257446, + "ref_logps/chosen": -33.252281188964844, + "ref_logps/rejected": -47.157386779785156, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004025830887258053, + "rewards/margins": -0.0012156391749158502, + "rewards/rejected": -0.002810192294418812, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 14.848532676696777, + "learning_rate": 3.6163522012578617e-08, + "logps/chosen": -46.34131622314453, + "logps/rejected": -39.75196075439453, + "loss": 0.6976, + "losses/dpo": 0.696555495262146, + "losses/sft": 1.7955430746078491, + "losses/total": 0.696555495262146, + "ref_logps/chosen": -46.30574035644531, + "ref_logps/rejected": -39.8038330078125, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.003557533025741577, + "rewards/margins": -0.008745087310671806, + "rewards/rejected": 0.005187553353607655, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 13.658124923706055, + "learning_rate": 3.7735849056603774e-08, + "logps/chosen": -31.579673767089844, + "logps/rejected": -36.94577407836914, + "loss": 0.6991, + "losses/dpo": 0.6995179653167725, + "losses/sft": 1.3396552801132202, + "losses/total": 0.6995179653167725, + "ref_logps/chosen": -31.527324676513672, + "ref_logps/rejected": -37.01141357421875, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.005234760232269764, + "rewards/margins": -0.011798936873674393, + "rewards/rejected": 0.0065641761757433414, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 14.473611831665039, + "learning_rate": 3.930817610062893e-08, + "logps/chosen": -33.093265533447266, + "logps/rejected": -38.25944519042969, + "loss": 0.6981, + "losses/dpo": 0.6936898231506348, + "losses/sft": 1.3963024616241455, + "losses/total": 0.6936898231506348, + "ref_logps/chosen": -33.092132568359375, + "ref_logps/rejected": -38.356536865234375, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.00011318037286400795, + "rewards/margins": -0.009822163730859756, + "rewards/rejected": 0.009708983823657036, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 15.082950592041016, + "learning_rate": 4.088050314465408e-08, + "logps/chosen": -33.5494270324707, + "logps/rejected": -38.667518615722656, + "loss": 0.6905, + "losses/dpo": 0.6945425868034363, + "losses/sft": 1.6459039449691772, + "losses/total": 0.6945425868034363, + "ref_logps/chosen": -33.616607666015625, + "ref_logps/rejected": -38.67987823486328, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.006717693526297808, + "rewards/margins": 0.00548145454376936, + "rewards/rejected": 0.001236238982528448, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 16.236902236938477, + "learning_rate": 4.245283018867924e-08, + "logps/chosen": -40.37019348144531, + "logps/rejected": -49.955196380615234, + "loss": 0.6905, + "losses/dpo": 0.6870005130767822, + "losses/sft": 1.2192702293395996, + "losses/total": 0.6870005130767822, + "ref_logps/chosen": -40.37604904174805, + "ref_logps/rejected": -49.90593338012695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0005852816393598914, + "rewards/margins": 0.005511948373168707, + "rewards/rejected": -0.004926666617393494, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 14.55437183380127, + "learning_rate": 4.4025157232704395e-08, + "logps/chosen": -32.74169158935547, + "logps/rejected": -39.09827423095703, + "loss": 0.6934, + "losses/dpo": 0.6983916759490967, + "losses/sft": 1.4402568340301514, + "losses/total": 0.6983916759490967, + "ref_logps/chosen": -32.714149475097656, + "ref_logps/rejected": -39.075469970703125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0027539432048797607, + "rewards/margins": -0.0004735295078717172, + "rewards/rejected": -0.0022804138716310263, + "step": 28 + }, + { + "epoch": 0.03, + "grad_norm": 14.335480690002441, + "learning_rate": 4.559748427672955e-08, + "logps/chosen": -34.224037170410156, + "logps/rejected": -37.38675308227539, + "loss": 0.6936, + "losses/dpo": 0.6951794624328613, + "losses/sft": 1.4238932132720947, + "losses/total": 0.6951794624328613, + "ref_logps/chosen": -34.20547866821289, + "ref_logps/rejected": -37.376678466796875, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0018555226270109415, + "rewards/margins": -0.0008478106465190649, + "rewards/rejected": -0.0010077119804918766, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 15.342060089111328, + "learning_rate": 4.7169811320754715e-08, + "logps/chosen": -34.61055374145508, + "logps/rejected": -42.71800994873047, + "loss": 0.6876, + "losses/dpo": 0.6789344549179077, + "losses/sft": 1.1305980682373047, + "losses/total": 0.6789344549179077, + "ref_logps/chosen": -34.644195556640625, + "ref_logps/rejected": -42.638336181640625, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003364542266353965, + "rewards/margins": 0.01133134588599205, + "rewards/rejected": -0.007966804318130016, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 13.72602367401123, + "learning_rate": 4.874213836477987e-08, + "logps/chosen": -32.430580139160156, + "logps/rejected": -33.662437438964844, + "loss": 0.6934, + "losses/dpo": 0.69603031873703, + "losses/sft": 1.152692437171936, + "losses/total": 0.69603031873703, + "ref_logps/chosen": -32.425933837890625, + "ref_logps/rejected": -33.66048049926758, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0004647308960556984, + "rewards/margins": -0.00026935047935694456, + "rewards/rejected": -0.00019538099877536297, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 15.070570945739746, + "learning_rate": 5.031446540880503e-08, + "logps/chosen": -40.984222412109375, + "logps/rejected": -47.076194763183594, + "loss": 0.6897, + "losses/dpo": 0.6885380744934082, + "losses/sft": 1.491570234298706, + "losses/total": 0.6885380744934082, + "ref_logps/chosen": -41.050697326660156, + "ref_logps/rejected": -47.071189880371094, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.006647652946412563, + "rewards/margins": 0.00714834313839674, + "rewards/rejected": -0.0005006909486837685, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 13.632221221923828, + "learning_rate": 5.1886792452830186e-08, + "logps/chosen": -33.047698974609375, + "logps/rejected": -34.45159912109375, + "loss": 0.6958, + "losses/dpo": 0.689713716506958, + "losses/sft": 1.2142502069473267, + "losses/total": 0.689713716506958, + "ref_logps/chosen": -33.024322509765625, + "ref_logps/rejected": -34.47886657714844, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.002337571932002902, + "rewards/margins": -0.005064377095550299, + "rewards/rejected": 0.002726804930716753, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 13.262465476989746, + "learning_rate": 5.345911949685534e-08, + "logps/chosen": -30.140480041503906, + "logps/rejected": -34.00312805175781, + "loss": 0.6962, + "losses/dpo": 0.6963517665863037, + "losses/sft": 1.2056186199188232, + "losses/total": 0.6963517665863037, + "ref_logps/chosen": -30.103130340576172, + "ref_logps/rejected": -34.02630615234375, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.003735171165317297, + "rewards/margins": -0.006053084507584572, + "rewards/rejected": 0.0023179128766059875, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 15.316493034362793, + "learning_rate": 5.50314465408805e-08, + "logps/chosen": -36.81224822998047, + "logps/rejected": -51.80878448486328, + "loss": 0.6917, + "losses/dpo": 0.6966387629508972, + "losses/sft": 1.3699390888214111, + "losses/total": 0.6966387629508972, + "ref_logps/chosen": -36.79683303833008, + "ref_logps/rejected": -51.762454986572266, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0015415906673297286, + "rewards/margins": 0.0030912284273654222, + "rewards/rejected": -0.004632818978279829, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 13.993942260742188, + "learning_rate": 5.660377358490566e-08, + "logps/chosen": -32.81092834472656, + "logps/rejected": -35.80175018310547, + "loss": 0.6935, + "losses/dpo": 0.6949833035469055, + "losses/sft": 1.1595534086227417, + "losses/total": 0.6949833035469055, + "ref_logps/chosen": -32.79785919189453, + "ref_logps/rejected": -35.79462814331055, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00130692427046597, + "rewards/margins": -0.0005950063932687044, + "rewards/rejected": -0.000711917644366622, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 14.649563789367676, + "learning_rate": 5.8176100628930814e-08, + "logps/chosen": -36.67781066894531, + "logps/rejected": -47.96495819091797, + "loss": 0.6901, + "losses/dpo": 0.6957077980041504, + "losses/sft": 1.529431939125061, + "losses/total": 0.6957077980041504, + "ref_logps/chosen": -36.696502685546875, + "ref_logps/rejected": -47.91997146606445, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.001869445899501443, + "rewards/margins": 0.006367784459143877, + "rewards/rejected": -0.004498339258134365, + "step": 37 + }, + { + "epoch": 0.04, + "grad_norm": 14.203393936157227, + "learning_rate": 5.974842767295598e-08, + "logps/chosen": -41.23329162597656, + "logps/rejected": -35.483360290527344, + "loss": 0.693, + "losses/dpo": 0.7117766737937927, + "losses/sft": 1.2247445583343506, + "losses/total": 0.7117766737937927, + "ref_logps/chosen": -41.22758483886719, + "ref_logps/rejected": -35.47206115722656, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0005709344986826181, + "rewards/margins": 0.0005590622313320637, + "rewards/rejected": -0.0011299969628453255, + "step": 38 + }, + { + "epoch": 0.04, + "grad_norm": 15.450180053710938, + "learning_rate": 6.132075471698113e-08, + "logps/chosen": -39.377593994140625, + "logps/rejected": -46.454246520996094, + "loss": 0.6975, + "losses/dpo": 0.6997748613357544, + "losses/sft": 1.5618855953216553, + "losses/total": 0.6997748613357544, + "ref_logps/chosen": -39.30659866333008, + "ref_logps/rejected": -46.469581604003906, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.007099542301148176, + "rewards/margins": -0.008633071556687355, + "rewards/rejected": 0.001533529139123857, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 14.611263275146484, + "learning_rate": 6.289308176100629e-08, + "logps/chosen": -31.116464614868164, + "logps/rejected": -36.931434631347656, + "loss": 0.6954, + "losses/dpo": 0.6901916861534119, + "losses/sft": 1.1429109573364258, + "losses/total": 0.6901916861534119, + "ref_logps/chosen": -31.093990325927734, + "ref_logps/rejected": -36.953338623046875, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0022472681012004614, + "rewards/margins": -0.004437911324203014, + "rewards/rejected": 0.002190643921494484, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 13.562875747680664, + "learning_rate": 6.446540880503144e-08, + "logps/chosen": -33.89849853515625, + "logps/rejected": -40.63304138183594, + "loss": 0.6924, + "losses/dpo": 0.6974458694458008, + "losses/sft": 1.3313788175582886, + "losses/total": 0.6974458694458008, + "ref_logps/chosen": -33.87089538574219, + "ref_logps/rejected": -40.59001159667969, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.002760761883109808, + "rewards/margins": 0.001542425248771906, + "rewards/rejected": -0.004303187131881714, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 13.275260925292969, + "learning_rate": 6.60377358490566e-08, + "logps/chosen": -31.73421287536621, + "logps/rejected": -37.52004623413086, + "loss": 0.6971, + "losses/dpo": 0.6914031505584717, + "losses/sft": 1.2416191101074219, + "losses/total": 0.6914031505584717, + "ref_logps/chosen": -31.701637268066406, + "ref_logps/rejected": -37.56598663330078, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0032576103694736958, + "rewards/margins": -0.007851381786167622, + "rewards/rejected": 0.004593771882355213, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 15.27834701538086, + "learning_rate": 6.761006289308176e-08, + "logps/chosen": -38.450767517089844, + "logps/rejected": -45.00822448730469, + "loss": 0.6918, + "losses/dpo": 0.6955606937408447, + "losses/sft": 1.1744383573532104, + "losses/total": 0.6955606937408447, + "ref_logps/chosen": -38.456146240234375, + "ref_logps/rejected": -44.983848571777344, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0005378782516345382, + "rewards/margins": 0.002975050127133727, + "rewards/rejected": -0.002437171759083867, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 15.608879089355469, + "learning_rate": 6.918238993710692e-08, + "logps/chosen": -28.18182945251465, + "logps/rejected": -47.542015075683594, + "loss": 0.6946, + "losses/dpo": 0.6975482702255249, + "losses/sft": 0.8388431072235107, + "losses/total": 0.6975482702255249, + "ref_logps/chosen": -28.196659088134766, + "ref_logps/rejected": -47.58372497558594, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0014829186256974936, + "rewards/margins": -0.002688285894691944, + "rewards/rejected": 0.004171204753220081, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 14.393813133239746, + "learning_rate": 7.075471698113207e-08, + "logps/chosen": -38.44715118408203, + "logps/rejected": -42.931541442871094, + "loss": 0.6936, + "losses/dpo": 0.6936181783676147, + "losses/sft": 1.3454254865646362, + "losses/total": 0.6936181783676147, + "ref_logps/chosen": -38.42930221557617, + "ref_logps/rejected": -42.92062759399414, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0017848550342023373, + "rewards/margins": -0.0006935299606993794, + "rewards/rejected": -0.0010913253063336015, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 16.733362197875977, + "learning_rate": 7.232704402515723e-08, + "logps/chosen": -37.33485412597656, + "logps/rejected": -41.190853118896484, + "loss": 0.6903, + "losses/dpo": 0.6862717270851135, + "losses/sft": 1.1105400323867798, + "losses/total": 0.6862717270851135, + "ref_logps/chosen": -37.37635040283203, + "ref_logps/rejected": -41.17495346069336, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004149759188294411, + "rewards/margins": 0.0057398732751607895, + "rewards/rejected": -0.0015901147853583097, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 15.69994831085205, + "learning_rate": 7.389937106918238e-08, + "logps/chosen": -44.16721725463867, + "logps/rejected": -37.57732009887695, + "loss": 0.6932, + "losses/dpo": 0.6937659978866577, + "losses/sft": 1.4528712034225464, + "losses/total": 0.6937659978866577, + "ref_logps/chosen": -44.15205764770508, + "ref_logps/rejected": -37.56299591064453, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.001515796990133822, + "rewards/margins": -8.350861025974154e-05, + "rewards/rejected": -0.001432287972420454, + "step": 47 + }, + { + "epoch": 0.05, + "grad_norm": 15.128890991210938, + "learning_rate": 7.547169811320755e-08, + "logps/chosen": -44.43276596069336, + "logps/rejected": -46.47834777832031, + "loss": 0.6911, + "losses/dpo": 0.690741240978241, + "losses/sft": 1.3188337087631226, + "losses/total": 0.690741240978241, + "ref_logps/chosen": -44.44622802734375, + "ref_logps/rejected": -46.45027160644531, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.00134660629555583, + "rewards/margins": 0.0041544619016349316, + "rewards/rejected": -0.0028078556060791016, + "step": 48 + }, + { + "epoch": 0.05, + "grad_norm": 16.965360641479492, + "learning_rate": 7.70440251572327e-08, + "logps/chosen": -39.97547912597656, + "logps/rejected": -48.518775939941406, + "loss": 0.6885, + "losses/dpo": 0.6853836178779602, + "losses/sft": 1.380096673965454, + "losses/total": 0.6853836178779602, + "ref_logps/chosen": -40.01591491699219, + "ref_logps/rejected": -48.46247863769531, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004043960943818092, + "rewards/margins": 0.009673237800598145, + "rewards/rejected": -0.0056292773224413395, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 15.04533576965332, + "learning_rate": 7.861635220125786e-08, + "logps/chosen": -39.61756896972656, + "logps/rejected": -49.66996383666992, + "loss": 0.6942, + "losses/dpo": 0.6997429728507996, + "losses/sft": 1.417793869972229, + "losses/total": 0.6997429728507996, + "ref_logps/chosen": -39.60337829589844, + "ref_logps/rejected": -49.67588424682617, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0014191538793966174, + "rewards/margins": -0.0020113971550017595, + "rewards/rejected": 0.0005922436248511076, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 15.650992393493652, + "learning_rate": 8.018867924528301e-08, + "logps/chosen": -34.93170928955078, + "logps/rejected": -45.80840301513672, + "loss": 0.6917, + "losses/dpo": 0.6938539147377014, + "losses/sft": 1.172393798828125, + "losses/total": 0.6938539147377014, + "ref_logps/chosen": -34.88665771484375, + "ref_logps/rejected": -45.73363494873047, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004505685530602932, + "rewards/margins": 0.0029702871106565, + "rewards/rejected": -0.0074759721755981445, + "step": 51 + }, + { + "epoch": 0.05, + "grad_norm": 12.861350059509277, + "learning_rate": 8.176100628930816e-08, + "logps/chosen": -30.98968505859375, + "logps/rejected": -37.95505142211914, + "loss": 0.6912, + "losses/dpo": 0.6984187364578247, + "losses/sft": 1.143085241317749, + "losses/total": 0.6984187364578247, + "ref_logps/chosen": -31.017133712768555, + "ref_logps/rejected": -37.94151306152344, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0027447701431810856, + "rewards/margins": 0.004098537378013134, + "rewards/rejected": -0.0013537677004933357, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 15.835521697998047, + "learning_rate": 8.333333333333333e-08, + "logps/chosen": -41.60312271118164, + "logps/rejected": -39.74080276489258, + "loss": 0.6937, + "losses/dpo": 0.6793924570083618, + "losses/sft": 1.4429981708526611, + "losses/total": 0.6793924570083618, + "ref_logps/chosen": -41.5811767578125, + "ref_logps/rejected": -39.72795867919922, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.002194375265389681, + "rewards/margins": -0.0009097992442548275, + "rewards/rejected": -0.0012845754390582442, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 15.634050369262695, + "learning_rate": 8.490566037735848e-08, + "logps/chosen": -46.38857650756836, + "logps/rejected": -40.06861114501953, + "loss": 0.6962, + "losses/dpo": 0.6969350576400757, + "losses/sft": 1.2307177782058716, + "losses/total": 0.6969350576400757, + "ref_logps/chosen": -46.36830520629883, + "ref_logps/rejected": -40.10871505737305, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.002026684582233429, + "rewards/margins": -0.006037433166056871, + "rewards/rejected": 0.00401074904948473, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 14.544160842895508, + "learning_rate": 8.647798742138364e-08, + "logps/chosen": -32.85659408569336, + "logps/rejected": -40.02392578125, + "loss": 0.6964, + "losses/dpo": 0.689262866973877, + "losses/sft": 1.1000863313674927, + "losses/total": 0.689262866973877, + "ref_logps/chosen": -32.840023040771484, + "ref_logps/rejected": -40.07231140136719, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0016567648854106665, + "rewards/margins": -0.0064952559769153595, + "rewards/rejected": 0.004838490858674049, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 14.296195983886719, + "learning_rate": 8.805031446540879e-08, + "logps/chosen": -41.2823371887207, + "logps/rejected": -38.29021072387695, + "loss": 0.693, + "losses/dpo": 0.6974284648895264, + "losses/sft": 1.3975054025650024, + "losses/total": 0.6974284648895264, + "ref_logps/chosen": -41.26557922363281, + "ref_logps/rejected": -38.268707275390625, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0016760170692577958, + "rewards/margins": 0.0004747032653540373, + "rewards/rejected": -0.002150720451027155, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 14.8149995803833, + "learning_rate": 8.962264150943395e-08, + "logps/chosen": -33.73179626464844, + "logps/rejected": -41.118045806884766, + "loss": 0.6936, + "losses/dpo": 0.6961352229118347, + "losses/sft": 1.2129688262939453, + "losses/total": 0.6961352229118347, + "ref_logps/chosen": -33.73820877075195, + "ref_logps/rejected": -41.132591247558594, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0006412325892597437, + "rewards/margins": -0.0008131118956953287, + "rewards/rejected": 0.0014543444849550724, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 13.560029983520508, + "learning_rate": 9.11949685534591e-08, + "logps/chosen": -36.52886962890625, + "logps/rejected": -34.41619873046875, + "loss": 0.6932, + "losses/dpo": 0.6908233165740967, + "losses/sft": 1.0413261651992798, + "losses/total": 0.6908233165740967, + "ref_logps/chosen": -36.54736328125, + "ref_logps/rejected": -34.43437957763672, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0018498122226446867, + "rewards/margins": 3.185553941875696e-05, + "rewards/rejected": 0.0018179567996412516, + "step": 58 + }, + { + "epoch": 0.06, + "grad_norm": 15.819250106811523, + "learning_rate": 9.276729559748427e-08, + "logps/chosen": -42.62882614135742, + "logps/rejected": -42.30875015258789, + "loss": 0.6906, + "losses/dpo": 0.6812754273414612, + "losses/sft": 1.4014006853103638, + "losses/total": 0.6812754273414612, + "ref_logps/chosen": -42.63201904296875, + "ref_logps/rejected": -42.25872802734375, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00031913910061120987, + "rewards/margins": 0.005321601405739784, + "rewards/rejected": -0.005002461373806, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 13.456873893737793, + "learning_rate": 9.433962264150943e-08, + "logps/chosen": -30.847614288330078, + "logps/rejected": -33.43515396118164, + "loss": 0.6919, + "losses/dpo": 0.6852722764015198, + "losses/sft": 1.3770960569381714, + "losses/total": 0.6852722764015198, + "ref_logps/chosen": -30.881393432617188, + "ref_logps/rejected": -33.4415283203125, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003377854824066162, + "rewards/margins": 0.002740193158388138, + "rewards/rejected": 0.0006376623641699553, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 15.81886100769043, + "learning_rate": 9.59119496855346e-08, + "logps/chosen": -41.415042877197266, + "logps/rejected": -50.666526794433594, + "loss": 0.6946, + "losses/dpo": 0.7051351070404053, + "losses/sft": 1.4612579345703125, + "losses/total": 0.7051351070404053, + "ref_logps/chosen": -41.415164947509766, + "ref_logps/rejected": -50.69341278076172, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2171454727649689e-05, + "rewards/margins": -0.002675973577424884, + "rewards/rejected": 0.0026881457306444645, + "step": 61 + }, + { + "epoch": 0.06, + "grad_norm": 14.154650688171387, + "learning_rate": 9.748427672955974e-08, + "logps/chosen": -43.80131530761719, + "logps/rejected": -36.09864807128906, + "loss": 0.6868, + "losses/dpo": 0.6857415437698364, + "losses/sft": 2.0101566314697266, + "losses/total": 0.6857415437698364, + "ref_logps/chosen": -43.869712829589844, + "ref_logps/rejected": -36.038368225097656, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.006840109825134277, + "rewards/margins": 0.01286797784268856, + "rewards/rejected": -0.0060278684832155704, + "step": 62 + }, + { + "epoch": 0.06, + "grad_norm": 13.230729103088379, + "learning_rate": 9.905660377358491e-08, + "logps/chosen": -31.131973266601562, + "logps/rejected": -37.758209228515625, + "loss": 0.6926, + "losses/dpo": 0.6912837624549866, + "losses/sft": 1.1980384588241577, + "losses/total": 0.6912837624549866, + "ref_logps/chosen": -31.096126556396484, + "ref_logps/rejected": -37.710594177246094, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003584663849323988, + "rewards/margins": 0.0011769132688641548, + "rewards/rejected": -0.004761577118188143, + "step": 63 + }, + { + "epoch": 0.06, + "grad_norm": 17.403446197509766, + "learning_rate": 1.0062893081761006e-07, + "logps/chosen": -41.96482849121094, + "logps/rejected": -53.58015823364258, + "loss": 0.6955, + "losses/dpo": 0.6856638193130493, + "losses/sft": 1.075535774230957, + "losses/total": 0.6856638193130493, + "ref_logps/chosen": -41.92784118652344, + "ref_logps/rejected": -53.586753845214844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.003698870772495866, + "rewards/margins": -0.004358276724815369, + "rewards/rejected": 0.0006594061851501465, + "step": 64 + }, + { + "epoch": 0.06, + "grad_norm": 14.0162992477417, + "learning_rate": 1.0220125786163522e-07, + "logps/chosen": -23.421993255615234, + "logps/rejected": -43.844642639160156, + "loss": 0.6926, + "losses/dpo": 0.7009305953979492, + "losses/sft": 1.3693844079971313, + "losses/total": 0.7009305953979492, + "ref_logps/chosen": -23.462329864501953, + "ref_logps/rejected": -43.87372589111328, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.00403362512588501, + "rewards/margins": 0.0011253058910369873, + "rewards/rejected": 0.0029083192348480225, + "step": 65 + }, + { + "epoch": 0.06, + "grad_norm": 14.804617881774902, + "learning_rate": 1.0377358490566037e-07, + "logps/chosen": -39.25425338745117, + "logps/rejected": -40.46149444580078, + "loss": 0.6966, + "losses/dpo": 0.7018302083015442, + "losses/sft": 1.4890705347061157, + "losses/total": 0.7018302083015442, + "ref_logps/chosen": -39.18394088745117, + "ref_logps/rejected": -40.45819091796875, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0070314290933310986, + "rewards/margins": -0.006701166275888681, + "rewards/rejected": -0.00033026374876499176, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 13.262336730957031, + "learning_rate": 1.0534591194968554e-07, + "logps/chosen": -30.260812759399414, + "logps/rejected": -33.28879165649414, + "loss": 0.6948, + "losses/dpo": 0.6856677532196045, + "losses/sft": 0.8565500974655151, + "losses/total": 0.6856677532196045, + "ref_logps/chosen": -30.227874755859375, + "ref_logps/rejected": -33.28705596923828, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003294035093858838, + "rewards/margins": -0.0031204731203615665, + "rewards/rejected": -0.0001735622063279152, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 14.190892219543457, + "learning_rate": 1.0691823899371069e-07, + "logps/chosen": -31.034976959228516, + "logps/rejected": -41.680450439453125, + "loss": 0.6918, + "losses/dpo": 0.6886014938354492, + "losses/sft": 1.6718907356262207, + "losses/total": 0.6886014938354492, + "ref_logps/chosen": -31.038402557373047, + "ref_logps/rejected": -41.65570831298828, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00034259981475770473, + "rewards/margins": 0.0028164724353700876, + "rewards/rejected": -0.0024738728534430265, + "step": 68 + }, + { + "epoch": 0.07, + "grad_norm": 14.235614776611328, + "learning_rate": 1.0849056603773585e-07, + "logps/chosen": -38.295806884765625, + "logps/rejected": -40.39185333251953, + "loss": 0.6901, + "losses/dpo": 0.6912087202072144, + "losses/sft": 1.3809698820114136, + "losses/total": 0.6912087202072144, + "ref_logps/chosen": -38.35113525390625, + "ref_logps/rejected": -40.38447952270508, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0055329324677586555, + "rewards/margins": 0.006270730402320623, + "rewards/rejected": -0.0007377980509772897, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 14.858607292175293, + "learning_rate": 1.10062893081761e-07, + "logps/chosen": -36.434654235839844, + "logps/rejected": -42.416053771972656, + "loss": 0.6965, + "losses/dpo": 0.6960601806640625, + "losses/sft": 1.3788210153579712, + "losses/total": 0.6960601806640625, + "ref_logps/chosen": -36.4222526550293, + "ref_logps/rejected": -42.46902084350586, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0012397617101669312, + "rewards/margins": -0.006536379922181368, + "rewards/rejected": 0.005296617280691862, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 13.156291007995605, + "learning_rate": 1.1163522012578616e-07, + "logps/chosen": -24.545652389526367, + "logps/rejected": -35.784202575683594, + "loss": 0.6923, + "losses/dpo": 0.6898338794708252, + "losses/sft": 1.1819456815719604, + "losses/total": 0.6898338794708252, + "ref_logps/chosen": -24.52574920654297, + "ref_logps/rejected": -35.74748992919922, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.001990178134292364, + "rewards/margins": 0.0016812352696433663, + "rewards/rejected": -0.0036714139860123396, + "step": 71 + }, + { + "epoch": 0.07, + "grad_norm": 16.261690139770508, + "learning_rate": 1.1320754716981131e-07, + "logps/chosen": -30.026611328125, + "logps/rejected": -44.707176208496094, + "loss": 0.6919, + "losses/dpo": 0.6989429593086243, + "losses/sft": 1.3058668375015259, + "losses/total": 0.6989429593086243, + "ref_logps/chosen": -30.060457229614258, + "ref_logps/rejected": -44.71483612060547, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0033847063314169645, + "rewards/margins": 0.0026185151655226946, + "rewards/rejected": 0.0007661909912712872, + "step": 72 + }, + { + "epoch": 0.07, + "grad_norm": 14.12116813659668, + "learning_rate": 1.1477987421383648e-07, + "logps/chosen": -30.411203384399414, + "logps/rejected": -43.75214385986328, + "loss": 0.6931, + "losses/dpo": 0.6866962909698486, + "losses/sft": 0.9678082466125488, + "losses/total": 0.6866962909698486, + "ref_logps/chosen": -30.391040802001953, + "ref_logps/rejected": -43.72895812988281, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0020164516754448414, + "rewards/margins": 0.00030208518728613853, + "rewards/rejected": -0.002318537561222911, + "step": 73 + }, + { + "epoch": 0.07, + "grad_norm": 15.123022079467773, + "learning_rate": 1.1635220125786163e-07, + "logps/chosen": -47.57429504394531, + "logps/rejected": -44.82411193847656, + "loss": 0.6922, + "losses/dpo": 0.6984449028968811, + "losses/sft": 1.6888502836227417, + "losses/total": 0.6984449028968811, + "ref_logps/chosen": -47.54075241088867, + "ref_logps/rejected": -44.771324157714844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0033542518503963947, + "rewards/margins": 0.001925045158714056, + "rewards/rejected": -0.0052792965434491634, + "step": 74 + }, + { + "epoch": 0.07, + "grad_norm": 15.112242698669434, + "learning_rate": 1.1792452830188679e-07, + "logps/chosen": -35.57346725463867, + "logps/rejected": -47.27886962890625, + "loss": 0.6945, + "losses/dpo": 0.6906975507736206, + "losses/sft": 1.2371470928192139, + "losses/total": 0.6906975507736206, + "ref_logps/chosen": -35.503482818603516, + "ref_logps/rejected": -47.23637771606445, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006998538970947266, + "rewards/margins": -0.002749508712440729, + "rewards/rejected": -0.004249030724167824, + "step": 75 + }, + { + "epoch": 0.07, + "grad_norm": 15.0462007522583, + "learning_rate": 1.1949685534591195e-07, + "logps/chosen": -44.30376434326172, + "logps/rejected": -45.30217742919922, + "loss": 0.6937, + "losses/dpo": 0.7008363604545593, + "losses/sft": 1.2435119152069092, + "losses/total": 0.7008363604545593, + "ref_logps/chosen": -44.33555603027344, + "ref_logps/rejected": -45.34375, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.003179407212883234, + "rewards/margins": -0.0009777904488146305, + "rewards/rejected": 0.0041571976616978645, + "step": 76 + }, + { + "epoch": 0.07, + "grad_norm": 16.324993133544922, + "learning_rate": 1.210691823899371e-07, + "logps/chosen": -40.5933723449707, + "logps/rejected": -46.27098846435547, + "loss": 0.6981, + "losses/dpo": 0.694791316986084, + "losses/sft": 1.267210841178894, + "losses/total": 0.694791316986084, + "ref_logps/chosen": -40.50887680053711, + "ref_logps/rejected": -46.2847900390625, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.00844934955239296, + "rewards/margins": -0.009829377755522728, + "rewards/rejected": 0.0013800286687910557, + "step": 77 + }, + { + "epoch": 0.07, + "grad_norm": 15.344736099243164, + "learning_rate": 1.2264150943396226e-07, + "logps/chosen": -35.888084411621094, + "logps/rejected": -47.26556396484375, + "loss": 0.6926, + "losses/dpo": 0.6868501901626587, + "losses/sft": 1.6895915269851685, + "losses/total": 0.6868501901626587, + "ref_logps/chosen": -35.859771728515625, + "ref_logps/rejected": -47.22634506225586, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0028314590454101562, + "rewards/margins": 0.001090294448658824, + "rewards/rejected": -0.003921753726899624, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 16.02104949951172, + "learning_rate": 1.242138364779874e-07, + "logps/chosen": -51.301265716552734, + "logps/rejected": -40.96278381347656, + "loss": 0.6922, + "losses/dpo": 0.6897770166397095, + "losses/sft": 1.4705162048339844, + "losses/total": 0.6897770166397095, + "ref_logps/chosen": -51.24794006347656, + "ref_logps/rejected": -40.89030456542969, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00533277727663517, + "rewards/margins": 0.0019149510189890862, + "rewards/rejected": -0.007247728295624256, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 14.240321159362793, + "learning_rate": 1.2578616352201258e-07, + "logps/chosen": -38.717018127441406, + "logps/rejected": -43.53178405761719, + "loss": 0.6922, + "losses/dpo": 0.6946658492088318, + "losses/sft": 1.1348018646240234, + "losses/total": 0.6946658492088318, + "ref_logps/chosen": -38.72174072265625, + "ref_logps/rejected": -43.515045166015625, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.00047174119390547276, + "rewards/margins": 0.0021452102810144424, + "rewards/rejected": -0.0016734690871089697, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 15.212409973144531, + "learning_rate": 1.2735849056603773e-07, + "logps/chosen": -39.79193878173828, + "logps/rejected": -44.109989166259766, + "loss": 0.691, + "losses/dpo": 0.6931477785110474, + "losses/sft": 1.2996903657913208, + "losses/total": 0.6931477785110474, + "ref_logps/chosen": -39.76420593261719, + "ref_logps/rejected": -44.03834533691406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0027731030713766813, + "rewards/margins": 0.0043914709240198135, + "rewards/rejected": -0.007164573762565851, + "step": 81 + }, + { + "epoch": 0.08, + "grad_norm": 14.307190895080566, + "learning_rate": 1.2893081761006288e-07, + "logps/chosen": -39.145469665527344, + "logps/rejected": -42.11958312988281, + "loss": 0.6941, + "losses/dpo": 0.6958881616592407, + "losses/sft": 1.587785243988037, + "losses/total": 0.6958881616592407, + "ref_logps/chosen": -39.09599304199219, + "ref_logps/rejected": -42.08744812011719, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.004947358276695013, + "rewards/margins": -0.0017338457982987165, + "rewards/rejected": -0.00321351271122694, + "step": 82 + }, + { + "epoch": 0.08, + "grad_norm": 13.738253593444824, + "learning_rate": 1.3050314465408803e-07, + "logps/chosen": -34.46840286254883, + "logps/rejected": -32.53174591064453, + "loss": 0.6924, + "losses/dpo": 0.6979929804801941, + "losses/sft": 1.4558571577072144, + "losses/total": 0.6979929804801941, + "ref_logps/chosen": -34.381011962890625, + "ref_logps/rejected": -32.42715835571289, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.008738812990486622, + "rewards/margins": 0.0017202908638864756, + "rewards/rejected": -0.010459104552865028, + "step": 83 + }, + { + "epoch": 0.08, + "grad_norm": 14.270866394042969, + "learning_rate": 1.320754716981132e-07, + "logps/chosen": -36.263126373291016, + "logps/rejected": -32.43574523925781, + "loss": 0.6934, + "losses/dpo": 0.6902810335159302, + "losses/sft": 1.0569183826446533, + "losses/total": 0.6902810335159302, + "ref_logps/chosen": -36.21672439575195, + "ref_logps/rejected": -32.393104553222656, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.004640126600861549, + "rewards/margins": -0.0003763199783861637, + "rewards/rejected": -0.004263806156814098, + "step": 84 + }, + { + "epoch": 0.08, + "grad_norm": 13.939360618591309, + "learning_rate": 1.3364779874213836e-07, + "logps/chosen": -27.650066375732422, + "logps/rejected": -39.44580078125, + "loss": 0.6917, + "losses/dpo": 0.6909595727920532, + "losses/sft": 0.8875692486763, + "losses/total": 0.6909595727920532, + "ref_logps/chosen": -27.60795783996582, + "ref_logps/rejected": -39.37433624267578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004210913088172674, + "rewards/margins": 0.0029358803294599056, + "rewards/rejected": -0.007146793883293867, + "step": 85 + }, + { + "epoch": 0.08, + "grad_norm": 15.56495475769043, + "learning_rate": 1.352201257861635e-07, + "logps/chosen": -46.32075500488281, + "logps/rejected": -45.338584899902344, + "loss": 0.6895, + "losses/dpo": 0.6981450319290161, + "losses/sft": 1.4593093395233154, + "losses/total": 0.6981450319290161, + "ref_logps/chosen": -46.183692932128906, + "ref_logps/rejected": -45.12657165527344, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01370624266564846, + "rewards/margins": 0.007495338097214699, + "rewards/rejected": -0.02120158076286316, + "step": 86 + }, + { + "epoch": 0.08, + "grad_norm": 15.097458839416504, + "learning_rate": 1.3679245283018866e-07, + "logps/chosen": -39.066123962402344, + "logps/rejected": -45.30772399902344, + "loss": 0.6881, + "losses/dpo": 0.682632565498352, + "losses/sft": 1.6743701696395874, + "losses/total": 0.682632565498352, + "ref_logps/chosen": -38.98908233642578, + "ref_logps/rejected": -45.1273078918457, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.007704532239586115, + "rewards/margins": 0.010336978361010551, + "rewards/rejected": -0.01804151013493538, + "step": 87 + }, + { + "epoch": 0.08, + "grad_norm": 16.17900276184082, + "learning_rate": 1.3836477987421384e-07, + "logps/chosen": -25.759960174560547, + "logps/rejected": -41.42595291137695, + "loss": 0.6879, + "losses/dpo": 0.6917724609375, + "losses/sft": 1.3302946090698242, + "losses/total": 0.6917724609375, + "ref_logps/chosen": -25.710800170898438, + "ref_logps/rejected": -41.27067184448242, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004916036501526833, + "rewards/margins": 0.010611901059746742, + "rewards/rejected": -0.015527937561273575, + "step": 88 + }, + { + "epoch": 0.08, + "grad_norm": 15.869852066040039, + "learning_rate": 1.39937106918239e-07, + "logps/chosen": -37.822227478027344, + "logps/rejected": -47.868438720703125, + "loss": 0.6939, + "losses/dpo": 0.6867597699165344, + "losses/sft": 1.1006416082382202, + "losses/total": 0.6867597699165344, + "ref_logps/chosen": -37.74897003173828, + "ref_logps/rejected": -47.808197021484375, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.007325470447540283, + "rewards/margins": -0.0013015333097428083, + "rewards/rejected": -0.0060239373706281185, + "step": 89 + }, + { + "epoch": 0.08, + "grad_norm": 13.955927848815918, + "learning_rate": 1.4150943396226414e-07, + "logps/chosen": -29.049095153808594, + "logps/rejected": -53.24306869506836, + "loss": 0.6938, + "losses/dpo": 0.6909089088439941, + "losses/sft": 1.1799514293670654, + "losses/total": 0.6909089088439941, + "ref_logps/chosen": -28.953819274902344, + "ref_logps/rejected": -53.159454345703125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009527618065476418, + "rewards/margins": -0.001166248694062233, + "rewards/rejected": -0.008361369371414185, + "step": 90 + }, + { + "epoch": 0.09, + "grad_norm": 14.775657653808594, + "learning_rate": 1.430817610062893e-07, + "logps/chosen": -34.025177001953125, + "logps/rejected": -42.15654373168945, + "loss": 0.6929, + "losses/dpo": 0.7020770311355591, + "losses/sft": 1.0969116687774658, + "losses/total": 0.7020770311355591, + "ref_logps/chosen": -33.92861557006836, + "ref_logps/rejected": -42.053688049316406, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009656224399805069, + "rewards/margins": 0.0006290348246693611, + "rewards/rejected": -0.01028525922447443, + "step": 91 + }, + { + "epoch": 0.09, + "grad_norm": 14.214162826538086, + "learning_rate": 1.4465408805031447e-07, + "logps/chosen": -34.177757263183594, + "logps/rejected": -43.930545806884766, + "loss": 0.6881, + "losses/dpo": 0.6853083372116089, + "losses/sft": 0.9370521306991577, + "losses/total": 0.6853083372116089, + "ref_logps/chosen": -34.10899353027344, + "ref_logps/rejected": -43.759490966796875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.006876480299979448, + "rewards/margins": 0.010229195468127728, + "rewards/rejected": -0.017105676233768463, + "step": 92 + }, + { + "epoch": 0.09, + "grad_norm": 17.3109188079834, + "learning_rate": 1.4622641509433962e-07, + "logps/chosen": -40.37968444824219, + "logps/rejected": -49.63119125366211, + "loss": 0.6886, + "losses/dpo": 0.6832884550094604, + "losses/sft": 1.7671897411346436, + "losses/total": 0.6832884550094604, + "ref_logps/chosen": -40.22066116333008, + "ref_logps/rejected": -49.377174377441406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01590237021446228, + "rewards/margins": 0.00949946604669094, + "rewards/rejected": -0.02540183812379837, + "step": 93 + }, + { + "epoch": 0.09, + "grad_norm": 13.591806411743164, + "learning_rate": 1.4779874213836477e-07, + "logps/chosen": -34.08275604248047, + "logps/rejected": -32.793453216552734, + "loss": 0.6869, + "losses/dpo": 0.6806268095970154, + "losses/sft": 1.2122366428375244, + "losses/total": 0.6806268095970154, + "ref_logps/chosen": -34.06282043457031, + "ref_logps/rejected": -32.647281646728516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0019936240278184414, + "rewards/margins": 0.012623312883079052, + "rewards/rejected": -0.014616936445236206, + "step": 94 + }, + { + "epoch": 0.09, + "grad_norm": 14.241443634033203, + "learning_rate": 1.4937106918238992e-07, + "logps/chosen": -43.90531921386719, + "logps/rejected": -38.043739318847656, + "loss": 0.6917, + "losses/dpo": 0.6911299824714661, + "losses/sft": 1.7745474576950073, + "losses/total": 0.6911299824714661, + "ref_logps/chosen": -43.74835968017578, + "ref_logps/rejected": -37.856956481933594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.015696100890636444, + "rewards/margins": 0.0029826308600604534, + "rewards/rejected": -0.018678732216358185, + "step": 95 + }, + { + "epoch": 0.09, + "grad_norm": 15.963578224182129, + "learning_rate": 1.509433962264151e-07, + "logps/chosen": -33.27313995361328, + "logps/rejected": -43.820289611816406, + "loss": 0.6911, + "losses/dpo": 0.6779791712760925, + "losses/sft": 1.113797903060913, + "losses/total": 0.6779791712760925, + "ref_logps/chosen": -33.136817932128906, + "ref_logps/rejected": -43.64072036743164, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.01363192219287157, + "rewards/margins": 0.004325224086642265, + "rewards/rejected": -0.01795714534819126, + "step": 96 + }, + { + "epoch": 0.09, + "grad_norm": 15.912907600402832, + "learning_rate": 1.5251572327044024e-07, + "logps/chosen": -41.62563705444336, + "logps/rejected": -55.13963317871094, + "loss": 0.6969, + "losses/dpo": 0.6952822208404541, + "losses/sft": 1.8095005750656128, + "losses/total": 0.6952822208404541, + "ref_logps/chosen": -41.45159149169922, + "ref_logps/rejected": -55.037925720214844, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.01740449108183384, + "rewards/margins": -0.00723386462777853, + "rewards/rejected": -0.010170625522732735, + "step": 97 + }, + { + "epoch": 0.09, + "grad_norm": 15.8721342086792, + "learning_rate": 1.540880503144654e-07, + "logps/chosen": -48.8435173034668, + "logps/rejected": -45.31690979003906, + "loss": 0.6936, + "losses/dpo": 0.6874356269836426, + "losses/sft": 1.448114275932312, + "losses/total": 0.6874356269836426, + "ref_logps/chosen": -48.658470153808594, + "ref_logps/rejected": -45.13918685913086, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.018504882231354713, + "rewards/margins": -0.0007324693724513054, + "rewards/rejected": -0.017772411927580833, + "step": 98 + }, + { + "epoch": 0.09, + "grad_norm": 12.944564819335938, + "learning_rate": 1.5566037735849055e-07, + "logps/chosen": -30.8857364654541, + "logps/rejected": -35.680885314941406, + "loss": 0.6971, + "losses/dpo": 0.6944862008094788, + "losses/sft": 0.959463894367218, + "losses/total": 0.6944862008094788, + "ref_logps/chosen": -30.673364639282227, + "ref_logps/rejected": -35.54549789428711, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.021237188950181007, + "rewards/margins": -0.007698586210608482, + "rewards/rejected": -0.013538602739572525, + "step": 99 + }, + { + "epoch": 0.09, + "grad_norm": 14.438765525817871, + "learning_rate": 1.5723270440251572e-07, + "logps/chosen": -45.89412307739258, + "logps/rejected": -47.31294250488281, + "loss": 0.6932, + "losses/dpo": 0.695717453956604, + "losses/sft": 1.817944884300232, + "losses/total": 0.695717453956604, + "ref_logps/chosen": -45.63401794433594, + "ref_logps/rejected": -47.05308532714844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02601074054837227, + "rewards/margins": -2.5463057681918144e-05, + "rewards/rejected": -0.02598527818918228, + "step": 100 + }, + { + "epoch": 0.1, + "grad_norm": 16.887371063232422, + "learning_rate": 1.5880503144654087e-07, + "logps/chosen": -47.991641998291016, + "logps/rejected": -57.681365966796875, + "loss": 0.694, + "losses/dpo": 0.7069874405860901, + "losses/sft": 1.730445384979248, + "losses/total": 0.7069874405860901, + "ref_logps/chosen": -47.78053283691406, + "ref_logps/rejected": -57.486690521240234, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02111116796731949, + "rewards/margins": -0.0016439557075500488, + "rewards/rejected": -0.01946721225976944, + "step": 101 + }, + { + "epoch": 0.1, + "grad_norm": 15.702824592590332, + "learning_rate": 1.6037735849056602e-07, + "logps/chosen": -38.05979919433594, + "logps/rejected": -51.93192672729492, + "loss": 0.6956, + "losses/dpo": 0.6968280076980591, + "losses/sft": 1.1726560592651367, + "losses/total": 0.6968280076980591, + "ref_logps/chosen": -37.90008544921875, + "ref_logps/rejected": -51.82126235961914, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.01597147062420845, + "rewards/margins": -0.004904856905341148, + "rewards/rejected": -0.011066612787544727, + "step": 102 + }, + { + "epoch": 0.1, + "grad_norm": 14.684089660644531, + "learning_rate": 1.6194968553459117e-07, + "logps/chosen": -36.708221435546875, + "logps/rejected": -37.9134521484375, + "loss": 0.6868, + "losses/dpo": 0.6792709827423096, + "losses/sft": 1.3039885759353638, + "losses/total": 0.6792709827423096, + "ref_logps/chosen": -36.589111328125, + "ref_logps/rejected": -37.66527557373047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.011910857632756233, + "rewards/margins": 0.01290722656995058, + "rewards/rejected": -0.02481808327138424, + "step": 103 + }, + { + "epoch": 0.1, + "grad_norm": 16.684906005859375, + "learning_rate": 1.6352201257861632e-07, + "logps/chosen": -43.59468460083008, + "logps/rejected": -40.17591857910156, + "loss": 0.6898, + "losses/dpo": 0.6985071301460266, + "losses/sft": 1.7022027969360352, + "losses/total": 0.6985071301460266, + "ref_logps/chosen": -43.42711639404297, + "ref_logps/rejected": -39.94116973876953, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.016756635159254074, + "rewards/margins": 0.006718737538903952, + "rewards/rejected": -0.023475375026464462, + "step": 104 + }, + { + "epoch": 0.1, + "grad_norm": 14.9424409866333, + "learning_rate": 1.650943396226415e-07, + "logps/chosen": -39.1132926940918, + "logps/rejected": -45.54408264160156, + "loss": 0.6981, + "losses/dpo": 0.6998525857925415, + "losses/sft": 1.4071406126022339, + "losses/total": 0.6998525857925415, + "ref_logps/chosen": -38.84712600708008, + "ref_logps/rejected": -45.37468719482422, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.026616401970386505, + "rewards/margins": -0.009676598012447357, + "rewards/rejected": -0.016939803957939148, + "step": 105 + }, + { + "epoch": 0.1, + "grad_norm": 14.410820960998535, + "learning_rate": 1.6666666666666665e-07, + "logps/chosen": -34.343482971191406, + "logps/rejected": -38.83158874511719, + "loss": 0.6916, + "losses/dpo": 0.6919582486152649, + "losses/sft": 1.2401819229125977, + "losses/total": 0.6919582486152649, + "ref_logps/chosen": -34.14746856689453, + "ref_logps/rejected": -38.6029167175293, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.019601475447416306, + "rewards/margins": 0.00326578039675951, + "rewards/rejected": -0.02286725677549839, + "step": 106 + }, + { + "epoch": 0.1, + "grad_norm": 14.090592384338379, + "learning_rate": 1.682389937106918e-07, + "logps/chosen": -29.85173988342285, + "logps/rejected": -42.94633483886719, + "loss": 0.693, + "losses/dpo": 0.6914762258529663, + "losses/sft": 1.1770298480987549, + "losses/total": 0.6914762258529663, + "ref_logps/chosen": -29.699792861938477, + "ref_logps/rejected": -42.7900276184082, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.015194655396044254, + "rewards/margins": 0.00043619866482913494, + "rewards/rejected": -0.015630854293704033, + "step": 107 + }, + { + "epoch": 0.1, + "grad_norm": 15.35986328125, + "learning_rate": 1.6981132075471695e-07, + "logps/chosen": -42.16456985473633, + "logps/rejected": -45.15089416503906, + "loss": 0.6885, + "losses/dpo": 0.6948836445808411, + "losses/sft": 1.6860700845718384, + "losses/total": 0.6948836445808411, + "ref_logps/chosen": -41.986881256103516, + "ref_logps/rejected": -44.87779235839844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.01776862144470215, + "rewards/margins": 0.009541459381580353, + "rewards/rejected": -0.0273100808262825, + "step": 108 + }, + { + "epoch": 0.1, + "grad_norm": 13.9297513961792, + "learning_rate": 1.7138364779874213e-07, + "logps/chosen": -26.50198745727539, + "logps/rejected": -36.50336456298828, + "loss": 0.6885, + "losses/dpo": 0.6882284879684448, + "losses/sft": 1.036218285560608, + "losses/total": 0.6882284879684448, + "ref_logps/chosen": -26.43675422668457, + "ref_logps/rejected": -36.34293746948242, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0065233525820076466, + "rewards/margins": 0.009519243612885475, + "rewards/rejected": -0.016042595729231834, + "step": 109 + }, + { + "epoch": 0.1, + "grad_norm": 14.998244285583496, + "learning_rate": 1.7295597484276728e-07, + "logps/chosen": -36.74858093261719, + "logps/rejected": -43.12199401855469, + "loss": 0.6895, + "losses/dpo": 0.6972934603691101, + "losses/sft": 1.3377923965454102, + "losses/total": 0.6972934603691101, + "ref_logps/chosen": -36.58707809448242, + "ref_logps/rejected": -42.88616943359375, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016150321811437607, + "rewards/margins": 0.007431963458657265, + "rewards/rejected": -0.02358228713274002, + "step": 110 + }, + { + "epoch": 0.1, + "grad_norm": 13.215498924255371, + "learning_rate": 1.7452830188679243e-07, + "logps/chosen": -32.85944747924805, + "logps/rejected": -34.17639923095703, + "loss": 0.6879, + "losses/dpo": 0.6819064021110535, + "losses/sft": 1.4440749883651733, + "losses/total": 0.6819064021110535, + "ref_logps/chosen": -32.7509765625, + "ref_logps/rejected": -33.96112823486328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010846990160644054, + "rewards/margins": 0.010679739527404308, + "rewards/rejected": -0.021526731550693512, + "step": 111 + }, + { + "epoch": 0.11, + "grad_norm": 14.666735649108887, + "learning_rate": 1.7610062893081758e-07, + "logps/chosen": -39.56743240356445, + "logps/rejected": -45.411155700683594, + "loss": 0.6848, + "losses/dpo": 0.6862828731536865, + "losses/sft": 1.5352342128753662, + "losses/total": 0.6862828731536865, + "ref_logps/chosen": -39.37195587158203, + "ref_logps/rejected": -45.04680252075195, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.019547976553440094, + "rewards/margins": 0.016887344419956207, + "rewards/rejected": -0.036435317248106, + "step": 112 + }, + { + "epoch": 0.11, + "grad_norm": 14.673539161682129, + "learning_rate": 1.7767295597484276e-07, + "logps/chosen": -34.470298767089844, + "logps/rejected": -39.43095397949219, + "loss": 0.6918, + "losses/dpo": 0.6949492692947388, + "losses/sft": 1.1301006078720093, + "losses/total": 0.6949492692947388, + "ref_logps/chosen": -34.285438537597656, + "ref_logps/rejected": -39.21759033203125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.018486076965928078, + "rewards/margins": 0.002850135788321495, + "rewards/rejected": -0.021336212754249573, + "step": 113 + }, + { + "epoch": 0.11, + "grad_norm": 14.95534610748291, + "learning_rate": 1.792452830188679e-07, + "logps/chosen": -39.47813415527344, + "logps/rejected": -32.72713088989258, + "loss": 0.6953, + "losses/dpo": 0.6829326152801514, + "losses/sft": 1.2921595573425293, + "losses/total": 0.6829326152801514, + "ref_logps/chosen": -39.1935920715332, + "ref_logps/rejected": -32.48295211791992, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.028454016894102097, + "rewards/margins": -0.004036164842545986, + "rewards/rejected": -0.024417854845523834, + "step": 114 + }, + { + "epoch": 0.11, + "grad_norm": 13.93600082397461, + "learning_rate": 1.8081761006289306e-07, + "logps/chosen": -34.593177795410156, + "logps/rejected": -35.36073303222656, + "loss": 0.697, + "losses/dpo": 0.7119002938270569, + "losses/sft": 1.3550841808319092, + "losses/total": 0.7119002938270569, + "ref_logps/chosen": -34.31936264038086, + "ref_logps/rejected": -35.16082000732422, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.027381660416722298, + "rewards/margins": -0.007390107028186321, + "rewards/rejected": -0.01999155431985855, + "step": 115 + }, + { + "epoch": 0.11, + "grad_norm": 16.06708526611328, + "learning_rate": 1.823899371069182e-07, + "logps/chosen": -45.849769592285156, + "logps/rejected": -50.03912353515625, + "loss": 0.6977, + "losses/dpo": 0.6969462633132935, + "losses/sft": 1.747936487197876, + "losses/total": 0.6969462633132935, + "ref_logps/chosen": -45.534278869628906, + "ref_logps/rejected": -49.81377029418945, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.03154875710606575, + "rewards/margins": -0.009013607166707516, + "rewards/rejected": -0.02253514900803566, + "step": 116 + }, + { + "epoch": 0.11, + "grad_norm": 14.117581367492676, + "learning_rate": 1.8396226415094338e-07, + "logps/chosen": -34.1506233215332, + "logps/rejected": -47.759986877441406, + "loss": 0.6952, + "losses/dpo": 0.6971405744552612, + "losses/sft": 1.3164403438568115, + "losses/total": 0.6971405744552612, + "ref_logps/chosen": -33.87939453125, + "ref_logps/rejected": -47.52716827392578, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.027122847735881805, + "rewards/margins": -0.003840741002932191, + "rewards/rejected": -0.023282108828425407, + "step": 117 + }, + { + "epoch": 0.11, + "grad_norm": 13.412751197814941, + "learning_rate": 1.8553459119496853e-07, + "logps/chosen": -34.454444885253906, + "logps/rejected": -44.52323532104492, + "loss": 0.6962, + "losses/dpo": 0.6933097839355469, + "losses/sft": 1.301590919494629, + "losses/total": 0.6933097839355469, + "ref_logps/chosen": -34.16710662841797, + "ref_logps/rejected": -44.295814514160156, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.028733599931001663, + "rewards/margins": -0.00599172106012702, + "rewards/rejected": -0.022741876542568207, + "step": 118 + }, + { + "epoch": 0.11, + "grad_norm": 15.68087387084961, + "learning_rate": 1.8710691823899368e-07, + "logps/chosen": -47.611175537109375, + "logps/rejected": -46.28931427001953, + "loss": 0.6876, + "losses/dpo": 0.6986205577850342, + "losses/sft": 1.2177549600601196, + "losses/total": 0.6986205577850342, + "ref_logps/chosen": -47.382904052734375, + "ref_logps/rejected": -45.94834518432617, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.022827334702014923, + "rewards/margins": 0.011269832029938698, + "rewards/rejected": -0.03409716486930847, + "step": 119 + }, + { + "epoch": 0.11, + "grad_norm": 12.786795616149902, + "learning_rate": 1.8867924528301886e-07, + "logps/chosen": -30.233591079711914, + "logps/rejected": -37.38566970825195, + "loss": 0.6963, + "losses/dpo": 0.7035396099090576, + "losses/sft": 0.7588335871696472, + "losses/total": 0.7035396099090576, + "ref_logps/chosen": -29.987346649169922, + "ref_logps/rejected": -37.2010498046875, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.024624522775411606, + "rewards/margins": -0.006162205711007118, + "rewards/rejected": -0.018462317064404488, + "step": 120 + }, + { + "epoch": 0.11, + "grad_norm": 14.822002410888672, + "learning_rate": 1.9025157232704404e-07, + "logps/chosen": -42.0538330078125, + "logps/rejected": -43.76673889160156, + "loss": 0.6878, + "losses/dpo": 0.690367579460144, + "losses/sft": 1.4329962730407715, + "losses/total": 0.690367579460144, + "ref_logps/chosen": -41.80472183227539, + "ref_logps/rejected": -43.407867431640625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.024911170825362206, + "rewards/margins": 0.010975952260196209, + "rewards/rejected": -0.03588712215423584, + "step": 121 + }, + { + "epoch": 0.12, + "grad_norm": 13.923846244812012, + "learning_rate": 1.918238993710692e-07, + "logps/chosen": -33.28826904296875, + "logps/rejected": -38.56500244140625, + "loss": 0.692, + "losses/dpo": 0.6966415643692017, + "losses/sft": 1.658522129058838, + "losses/total": 0.6966415643692017, + "ref_logps/chosen": -32.97311019897461, + "ref_logps/rejected": -38.22522735595703, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0315156951546669, + "rewards/margins": 0.0024614857975393534, + "rewards/rejected": -0.03397718071937561, + "step": 122 + }, + { + "epoch": 0.12, + "grad_norm": 15.002264976501465, + "learning_rate": 1.9339622641509434e-07, + "logps/chosen": -34.423828125, + "logps/rejected": -52.01573944091797, + "loss": 0.6885, + "losses/dpo": 0.6879132986068726, + "losses/sft": 1.094735860824585, + "losses/total": 0.6879132986068726, + "ref_logps/chosen": -34.19406509399414, + "ref_logps/rejected": -51.68952941894531, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.022976290434598923, + "rewards/margins": 0.00964451115578413, + "rewards/rejected": -0.03262079879641533, + "step": 123 + }, + { + "epoch": 0.12, + "grad_norm": 19.52489471435547, + "learning_rate": 1.949685534591195e-07, + "logps/chosen": -32.851112365722656, + "logps/rejected": -39.58111572265625, + "loss": 0.6905, + "losses/dpo": 0.685728907585144, + "losses/sft": 1.1572200059890747, + "losses/total": 0.685728907585144, + "ref_logps/chosen": -32.63593292236328, + "ref_logps/rejected": -39.31063461303711, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021518178284168243, + "rewards/margins": 0.005529931280761957, + "rewards/rejected": -0.027048110961914062, + "step": 124 + }, + { + "epoch": 0.12, + "grad_norm": 15.122300148010254, + "learning_rate": 1.9654088050314467e-07, + "logps/chosen": -36.94127655029297, + "logps/rejected": -39.3349609375, + "loss": 0.6945, + "losses/dpo": 0.6853357553482056, + "losses/sft": 1.2431758642196655, + "losses/total": 0.6853357553482056, + "ref_logps/chosen": -36.56884002685547, + "ref_logps/rejected": -38.9855842590332, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03724343329668045, + "rewards/margins": -0.0023057940416038036, + "rewards/rejected": -0.03493763878941536, + "step": 125 + }, + { + "epoch": 0.12, + "grad_norm": 13.548483848571777, + "learning_rate": 1.9811320754716982e-07, + "logps/chosen": -31.962743759155273, + "logps/rejected": -35.55200958251953, + "loss": 0.6924, + "losses/dpo": 0.7094372510910034, + "losses/sft": 1.4011971950531006, + "losses/total": 0.7094372510910034, + "ref_logps/chosen": -31.544071197509766, + "ref_logps/rejected": -35.11526107788086, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.041867226362228394, + "rewards/margins": 0.0018077259883284569, + "rewards/rejected": -0.043674953281879425, + "step": 126 + }, + { + "epoch": 0.12, + "grad_norm": 15.32440185546875, + "learning_rate": 1.9968553459119497e-07, + "logps/chosen": -47.51824951171875, + "logps/rejected": -49.325355529785156, + "loss": 0.6896, + "losses/dpo": 0.6987741589546204, + "losses/sft": 1.8628894090652466, + "losses/total": 0.6987741589546204, + "ref_logps/chosen": -47.120445251464844, + "ref_logps/rejected": -48.85082244873047, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03978060558438301, + "rewards/margins": 0.007672748528420925, + "rewards/rejected": -0.04745335131883621, + "step": 127 + }, + { + "epoch": 0.12, + "grad_norm": 14.225200653076172, + "learning_rate": 2.0125786163522012e-07, + "logps/chosen": -36.54164505004883, + "logps/rejected": -35.82334899902344, + "loss": 0.6879, + "losses/dpo": 0.6846950650215149, + "losses/sft": 1.3227465152740479, + "losses/total": 0.6846950650215149, + "ref_logps/chosen": -36.33439636230469, + "ref_logps/rejected": -35.509422302246094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.020724685862660408, + "rewards/margins": 0.010668057017028332, + "rewards/rejected": -0.031392741948366165, + "step": 128 + }, + { + "epoch": 0.12, + "grad_norm": 17.420419692993164, + "learning_rate": 2.028301886792453e-07, + "logps/chosen": -39.37260055541992, + "logps/rejected": -57.38317108154297, + "loss": 0.6847, + "losses/dpo": 0.692078173160553, + "losses/sft": 1.5656405687332153, + "losses/total": 0.692078173160553, + "ref_logps/chosen": -39.03068161010742, + "ref_logps/rejected": -56.86867141723633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03419186919927597, + "rewards/margins": 0.01725781336426735, + "rewards/rejected": -0.05144967883825302, + "step": 129 + }, + { + "epoch": 0.12, + "grad_norm": 14.964306831359863, + "learning_rate": 2.0440251572327044e-07, + "logps/chosen": -43.3221321105957, + "logps/rejected": -41.251625061035156, + "loss": 0.6868, + "losses/dpo": 0.6975349187850952, + "losses/sft": 1.6676081418991089, + "losses/total": 0.6975349187850952, + "ref_logps/chosen": -43.071380615234375, + "ref_logps/rejected": -40.869747161865234, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.025075269863009453, + "rewards/margins": 0.01311260461807251, + "rewards/rejected": -0.03818787261843681, + "step": 130 + }, + { + "epoch": 0.12, + "grad_norm": 14.855781555175781, + "learning_rate": 2.059748427672956e-07, + "logps/chosen": -32.41206359863281, + "logps/rejected": -54.820892333984375, + "loss": 0.6897, + "losses/dpo": 0.694727897644043, + "losses/sft": 1.23732328414917, + "losses/total": 0.694727897644043, + "ref_logps/chosen": -32.06626892089844, + "ref_logps/rejected": -54.403839111328125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03457948565483093, + "rewards/margins": 0.007125252857804298, + "rewards/rejected": -0.04170474037528038, + "step": 131 + }, + { + "epoch": 0.12, + "grad_norm": 14.342564582824707, + "learning_rate": 2.0754716981132074e-07, + "logps/chosen": -37.349639892578125, + "logps/rejected": -44.60934829711914, + "loss": 0.6907, + "losses/dpo": 0.6890225410461426, + "losses/sft": 1.7710987329483032, + "losses/total": 0.6890225410461426, + "ref_logps/chosen": -37.06349563598633, + "ref_logps/rejected": -44.271053314208984, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.028614195063710213, + "rewards/margins": 0.005215257406234741, + "rewards/rejected": -0.033829450607299805, + "step": 132 + }, + { + "epoch": 0.13, + "grad_norm": 14.29720401763916, + "learning_rate": 2.0911949685534592e-07, + "logps/chosen": -36.476844787597656, + "logps/rejected": -41.18645477294922, + "loss": 0.687, + "losses/dpo": 0.672049880027771, + "losses/sft": 1.412061095237732, + "losses/total": 0.672049880027771, + "ref_logps/chosen": -36.14765930175781, + "ref_logps/rejected": -40.729766845703125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03291814774274826, + "rewards/margins": 0.01275054644793272, + "rewards/rejected": -0.04566869139671326, + "step": 133 + }, + { + "epoch": 0.13, + "grad_norm": 14.889425277709961, + "learning_rate": 2.1069182389937107e-07, + "logps/chosen": -32.16847610473633, + "logps/rejected": -41.37443923950195, + "loss": 0.6867, + "losses/dpo": 0.6890193819999695, + "losses/sft": 1.1969457864761353, + "losses/total": 0.6890193819999695, + "ref_logps/chosen": -31.79351043701172, + "ref_logps/rejected": -40.86709213256836, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.037496283650398254, + "rewards/margins": 0.01323840394616127, + "rewards/rejected": -0.050734687596559525, + "step": 134 + }, + { + "epoch": 0.13, + "grad_norm": 14.49722957611084, + "learning_rate": 2.1226415094339622e-07, + "logps/chosen": -28.457765579223633, + "logps/rejected": -45.70714569091797, + "loss": 0.6841, + "losses/dpo": 0.6867278814315796, + "losses/sft": 1.2185068130493164, + "losses/total": 0.6867278814315796, + "ref_logps/chosen": -28.06502914428711, + "ref_logps/rejected": -45.12978744506836, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.039273701608181, + "rewards/margins": 0.01846211403608322, + "rewards/rejected": -0.05773581564426422, + "step": 135 + }, + { + "epoch": 0.13, + "grad_norm": 14.271917343139648, + "learning_rate": 2.1383647798742137e-07, + "logps/chosen": -38.469852447509766, + "logps/rejected": -39.8077392578125, + "loss": 0.6937, + "losses/dpo": 0.687812328338623, + "losses/sft": 1.3850359916687012, + "losses/total": 0.687812328338623, + "ref_logps/chosen": -38.16452407836914, + "ref_logps/rejected": -39.51031494140625, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.030532920733094215, + "rewards/margins": -0.0007908491534180939, + "rewards/rejected": -0.02974206954240799, + "step": 136 + }, + { + "epoch": 0.13, + "grad_norm": 15.589990615844727, + "learning_rate": 2.1540880503144655e-07, + "logps/chosen": -39.6287956237793, + "logps/rejected": -44.67558288574219, + "loss": 0.6898, + "losses/dpo": 0.6974602341651917, + "losses/sft": 1.5750739574432373, + "losses/total": 0.6974602341651917, + "ref_logps/chosen": -39.27399826049805, + "ref_logps/rejected": -44.25121307373047, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03547965735197067, + "rewards/margins": 0.00695746298879385, + "rewards/rejected": -0.0424371175467968, + "step": 137 + }, + { + "epoch": 0.13, + "grad_norm": 15.114520072937012, + "learning_rate": 2.169811320754717e-07, + "logps/chosen": -44.047691345214844, + "logps/rejected": -40.24201202392578, + "loss": 0.6913, + "losses/dpo": 0.6949135065078735, + "losses/sft": 2.0087459087371826, + "losses/total": 0.6949135065078735, + "ref_logps/chosen": -43.595733642578125, + "ref_logps/rejected": -39.75062561035156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04519592598080635, + "rewards/margins": 0.0039425380527973175, + "rewards/rejected": -0.04913846030831337, + "step": 138 + }, + { + "epoch": 0.13, + "grad_norm": 12.832149505615234, + "learning_rate": 2.1855345911949685e-07, + "logps/chosen": -32.86917495727539, + "logps/rejected": -33.458431243896484, + "loss": 0.6965, + "losses/dpo": 0.699682354927063, + "losses/sft": 0.9208551645278931, + "losses/total": 0.699682354927063, + "ref_logps/chosen": -32.45710754394531, + "ref_logps/rejected": -33.11075210571289, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.041206683963537216, + "rewards/margins": -0.006438740529119968, + "rewards/rejected": -0.03476794809103012, + "step": 139 + }, + { + "epoch": 0.13, + "grad_norm": 16.490097045898438, + "learning_rate": 2.20125786163522e-07, + "logps/chosen": -41.00999450683594, + "logps/rejected": -51.05350875854492, + "loss": 0.6794, + "losses/dpo": 0.6706057786941528, + "losses/sft": 1.1546810865402222, + "losses/total": 0.6706057786941528, + "ref_logps/chosen": -40.791709899902344, + "ref_logps/rejected": -50.55547332763672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.02182871848344803, + "rewards/margins": 0.02797437272965908, + "rewards/rejected": -0.04980308935046196, + "step": 140 + }, + { + "epoch": 0.13, + "grad_norm": 15.765511512756348, + "learning_rate": 2.2169811320754718e-07, + "logps/chosen": -45.249046325683594, + "logps/rejected": -52.27911376953125, + "loss": 0.6951, + "losses/dpo": 0.6822272539138794, + "losses/sft": 1.4385101795196533, + "losses/total": 0.6822272539138794, + "ref_logps/chosen": -44.595176696777344, + "ref_logps/rejected": -51.66139221191406, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06538671255111694, + "rewards/margins": -0.0036148373037576675, + "rewards/rejected": -0.061771877110004425, + "step": 141 + }, + { + "epoch": 0.13, + "grad_norm": 16.23243522644043, + "learning_rate": 2.2327044025157233e-07, + "logps/chosen": -38.73735046386719, + "logps/rejected": -48.82923126220703, + "loss": 0.6944, + "losses/dpo": 0.7174341678619385, + "losses/sft": 1.6384347677230835, + "losses/total": 0.7174341678619385, + "ref_logps/chosen": -37.99863815307617, + "ref_logps/rejected": -48.11299133300781, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.07387126982212067, + "rewards/margins": -0.0022472080308943987, + "rewards/rejected": -0.07162405550479889, + "step": 142 + }, + { + "epoch": 0.14, + "grad_norm": 17.4960994720459, + "learning_rate": 2.2484276729559748e-07, + "logps/chosen": -48.301883697509766, + "logps/rejected": -50.832313537597656, + "loss": 0.6949, + "losses/dpo": 0.6918096542358398, + "losses/sft": 1.2325555086135864, + "losses/total": 0.6918096542358398, + "ref_logps/chosen": -47.70981979370117, + "ref_logps/rejected": -50.27460479736328, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05920600891113281, + "rewards/margins": -0.003434740472584963, + "rewards/rejected": -0.05577126890420914, + "step": 143 + }, + { + "epoch": 0.14, + "grad_norm": 15.367589950561523, + "learning_rate": 2.2641509433962263e-07, + "logps/chosen": -33.164772033691406, + "logps/rejected": -40.99664306640625, + "loss": 0.6876, + "losses/dpo": 0.6855679154396057, + "losses/sft": 1.2458962202072144, + "losses/total": 0.6855679154396057, + "ref_logps/chosen": -32.71955108642578, + "ref_logps/rejected": -40.43425750732422, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.044522032141685486, + "rewards/margins": 0.011716343462467194, + "rewards/rejected": -0.05623837187886238, + "step": 144 + }, + { + "epoch": 0.14, + "grad_norm": 14.423078536987305, + "learning_rate": 2.279874213836478e-07, + "logps/chosen": -38.37190628051758, + "logps/rejected": -43.500038146972656, + "loss": 0.6819, + "losses/dpo": 0.6875548362731934, + "losses/sft": 1.3889192342758179, + "losses/total": 0.6875548362731934, + "ref_logps/chosen": -37.958709716796875, + "ref_logps/rejected": -42.858619689941406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0413198322057724, + "rewards/margins": 0.022821854799985886, + "rewards/rejected": -0.06414168328046799, + "step": 145 + }, + { + "epoch": 0.14, + "grad_norm": 15.48110294342041, + "learning_rate": 2.2955974842767295e-07, + "logps/chosen": -34.765846252441406, + "logps/rejected": -41.104759216308594, + "loss": 0.6814, + "losses/dpo": 0.669724702835083, + "losses/sft": 1.7875714302062988, + "losses/total": 0.669724702835083, + "ref_logps/chosen": -34.194915771484375, + "ref_logps/rejected": -40.29185485839844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05709293484687805, + "rewards/margins": 0.024197574704885483, + "rewards/rejected": -0.08129051327705383, + "step": 146 + }, + { + "epoch": 0.14, + "grad_norm": 16.652830123901367, + "learning_rate": 2.311320754716981e-07, + "logps/chosen": -36.488487243652344, + "logps/rejected": -48.65363311767578, + "loss": 0.6813, + "losses/dpo": 0.6807320713996887, + "losses/sft": 1.1820979118347168, + "losses/total": 0.6807320713996887, + "ref_logps/chosen": -36.11376953125, + "ref_logps/rejected": -48.032691955566406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0374714694917202, + "rewards/margins": 0.02462271973490715, + "rewards/rejected": -0.06209418922662735, + "step": 147 + }, + { + "epoch": 0.14, + "grad_norm": 15.308725357055664, + "learning_rate": 2.3270440251572326e-07, + "logps/chosen": -40.74299240112305, + "logps/rejected": -44.69352722167969, + "loss": 0.6965, + "losses/dpo": 0.709496021270752, + "losses/sft": 1.0641168355941772, + "losses/total": 0.709496021270752, + "ref_logps/chosen": -40.02014923095703, + "ref_logps/rejected": -44.02622604370117, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07228401303291321, + "rewards/margins": -0.005553926341235638, + "rewards/rejected": -0.0667300820350647, + "step": 148 + }, + { + "epoch": 0.14, + "grad_norm": 13.749775886535645, + "learning_rate": 2.3427672955974843e-07, + "logps/chosen": -31.36304473876953, + "logps/rejected": -37.66726303100586, + "loss": 0.6768, + "losses/dpo": 0.6756744384765625, + "losses/sft": 1.764005422592163, + "losses/total": 0.6756744384765625, + "ref_logps/chosen": -30.908706665039062, + "ref_logps/rejected": -36.87897491455078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.045433878898620605, + "rewards/margins": 0.033394597470760345, + "rewards/rejected": -0.07882846891880035, + "step": 149 + }, + { + "epoch": 0.14, + "grad_norm": 13.469657897949219, + "learning_rate": 2.3584905660377358e-07, + "logps/chosen": -35.03850555419922, + "logps/rejected": -41.29573059082031, + "loss": 0.6887, + "losses/dpo": 0.6853890419006348, + "losses/sft": 1.3369121551513672, + "losses/total": 0.6853890419006348, + "ref_logps/chosen": -34.43486022949219, + "ref_logps/rejected": -40.60087966918945, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.06036418676376343, + "rewards/margins": 0.009120948612689972, + "rewards/rejected": -0.0694851353764534, + "step": 150 + }, + { + "epoch": 0.14, + "grad_norm": 14.294488906860352, + "learning_rate": 2.3742138364779873e-07, + "logps/chosen": -37.178802490234375, + "logps/rejected": -44.86241149902344, + "loss": 0.6993, + "losses/dpo": 0.7003527879714966, + "losses/sft": 1.4542131423950195, + "losses/total": 0.7003527879714966, + "ref_logps/chosen": -36.377647399902344, + "ref_logps/rejected": -44.17764663696289, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08011550456285477, + "rewards/margins": -0.011639060452580452, + "rewards/rejected": -0.06847643852233887, + "step": 151 + }, + { + "epoch": 0.14, + "grad_norm": 16.434444427490234, + "learning_rate": 2.389937106918239e-07, + "logps/chosen": -43.262386322021484, + "logps/rejected": -50.735694885253906, + "loss": 0.6843, + "losses/dpo": 0.6821097135543823, + "losses/sft": 1.3412997722625732, + "losses/total": 0.6821097135543823, + "ref_logps/chosen": -42.7374153137207, + "ref_logps/rejected": -50.02814483642578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05249696224927902, + "rewards/margins": 0.018258104100823402, + "rewards/rejected": -0.07075506448745728, + "step": 152 + }, + { + "epoch": 0.14, + "grad_norm": 17.17867660522461, + "learning_rate": 2.4056603773584903e-07, + "logps/chosen": -45.7003173828125, + "logps/rejected": -48.128639221191406, + "loss": 0.6866, + "losses/dpo": 0.6841785311698914, + "losses/sft": 1.4632898569107056, + "losses/total": 0.6841785311698914, + "ref_logps/chosen": -44.91077423095703, + "ref_logps/rejected": -47.199195861816406, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.07895435392856598, + "rewards/margins": 0.013990305364131927, + "rewards/rejected": -0.09294465184211731, + "step": 153 + }, + { + "epoch": 0.15, + "grad_norm": 15.123295783996582, + "learning_rate": 2.421383647798742e-07, + "logps/chosen": -41.161293029785156, + "logps/rejected": -43.288719177246094, + "loss": 0.6898, + "losses/dpo": 0.6806986331939697, + "losses/sft": 1.2774852514266968, + "losses/total": 0.6806986331939697, + "ref_logps/chosen": -40.295387268066406, + "ref_logps/rejected": -42.34626007080078, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08659013360738754, + "rewards/margins": 0.007655314169824123, + "rewards/rejected": -0.09424544870853424, + "step": 154 + }, + { + "epoch": 0.15, + "grad_norm": 15.345965385437012, + "learning_rate": 2.437106918238994e-07, + "logps/chosen": -36.4453010559082, + "logps/rejected": -43.52362060546875, + "loss": 0.6933, + "losses/dpo": 0.69317626953125, + "losses/sft": 1.4723988771438599, + "losses/total": 0.69317626953125, + "ref_logps/chosen": -35.49375915527344, + "ref_logps/rejected": -42.57204055786133, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09515412151813507, + "rewards/margins": 3.3443793654441833e-06, + "rewards/rejected": -0.09515747427940369, + "step": 155 + }, + { + "epoch": 0.15, + "grad_norm": 15.515606880187988, + "learning_rate": 2.452830188679245e-07, + "logps/chosen": -41.18972396850586, + "logps/rejected": -46.37871170043945, + "loss": 0.6899, + "losses/dpo": 0.6924268007278442, + "losses/sft": 1.5120503902435303, + "losses/total": 0.6924268007278442, + "ref_logps/chosen": -40.387245178222656, + "ref_logps/rejected": -45.50495529174805, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08024749904870987, + "rewards/margins": 0.007128346711397171, + "rewards/rejected": -0.08737584948539734, + "step": 156 + }, + { + "epoch": 0.15, + "grad_norm": 15.6599702835083, + "learning_rate": 2.468553459119497e-07, + "logps/chosen": -37.020111083984375, + "logps/rejected": -43.15111541748047, + "loss": 0.6947, + "losses/dpo": 0.7032310962677002, + "losses/sft": 1.466179609298706, + "losses/total": 0.7032310962677002, + "ref_logps/chosen": -36.081417083740234, + "ref_logps/rejected": -42.236480712890625, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09386956691741943, + "rewards/margins": -0.0024061878211796284, + "rewards/rejected": -0.0914633721113205, + "step": 157 + }, + { + "epoch": 0.15, + "grad_norm": 15.054862022399902, + "learning_rate": 2.484276729559748e-07, + "logps/chosen": -43.759910583496094, + "logps/rejected": -43.61105728149414, + "loss": 0.6849, + "losses/dpo": 0.6913665533065796, + "losses/sft": 1.5183358192443848, + "losses/total": 0.6913665533065796, + "ref_logps/chosen": -42.80980682373047, + "ref_logps/rejected": -42.48993682861328, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09501078724861145, + "rewards/margins": 0.017101164907217026, + "rewards/rejected": -0.11211195588111877, + "step": 158 + }, + { + "epoch": 0.15, + "grad_norm": 17.373764038085938, + "learning_rate": 2.5e-07, + "logps/chosen": -48.36299133300781, + "logps/rejected": -61.741703033447266, + "loss": 0.691, + "losses/dpo": 0.6970544457435608, + "losses/sft": 1.3365906476974487, + "losses/total": 0.6970544457435608, + "ref_logps/chosen": -47.605804443359375, + "ref_logps/rejected": -60.936859130859375, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0757184773683548, + "rewards/margins": 0.004765537567436695, + "rewards/rejected": -0.08048401772975922, + "step": 159 + }, + { + "epoch": 0.15, + "grad_norm": 16.456748962402344, + "learning_rate": 2.5157232704402517e-07, + "logps/chosen": -42.988807678222656, + "logps/rejected": -48.046600341796875, + "loss": 0.6972, + "losses/dpo": 0.6922672986984253, + "losses/sft": 1.4732236862182617, + "losses/total": 0.6922672986984253, + "ref_logps/chosen": -41.93547058105469, + "ref_logps/rejected": -47.06855010986328, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1053338348865509, + "rewards/margins": -0.0075288740918040276, + "rewards/rejected": -0.097804956138134, + "step": 160 + }, + { + "epoch": 0.15, + "grad_norm": 13.012798309326172, + "learning_rate": 2.531446540880503e-07, + "logps/chosen": -25.559913635253906, + "logps/rejected": -38.070777893066406, + "loss": 0.6787, + "losses/dpo": 0.6694801449775696, + "losses/sft": 1.0408848524093628, + "losses/total": 0.6694801449775696, + "ref_logps/chosen": -24.90559196472168, + "ref_logps/rejected": -37.12018585205078, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06543232500553131, + "rewards/margins": 0.029626626521348953, + "rewards/rejected": -0.09505894780158997, + "step": 161 + }, + { + "epoch": 0.15, + "grad_norm": 16.118104934692383, + "learning_rate": 2.5471698113207547e-07, + "logps/chosen": -44.049354553222656, + "logps/rejected": -47.15470886230469, + "loss": 0.6829, + "losses/dpo": 0.7058506608009338, + "losses/sft": 1.5558570623397827, + "losses/total": 0.7058506608009338, + "ref_logps/chosen": -43.05775833129883, + "ref_logps/rejected": -45.945899963378906, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09915965050458908, + "rewards/margins": 0.021722018718719482, + "rewards/rejected": -0.12088166177272797, + "step": 162 + }, + { + "epoch": 0.15, + "grad_norm": 14.646360397338867, + "learning_rate": 2.562893081761006e-07, + "logps/chosen": -37.91931915283203, + "logps/rejected": -35.62671661376953, + "loss": 0.6849, + "losses/dpo": 0.6865489482879639, + "losses/sft": 1.2324689626693726, + "losses/total": 0.6865489482879639, + "ref_logps/chosen": -37.11323547363281, + "ref_logps/rejected": -34.651885986328125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08060857653617859, + "rewards/margins": 0.016874689608812332, + "rewards/rejected": -0.09748326241970062, + "step": 163 + }, + { + "epoch": 0.15, + "grad_norm": 14.869216918945312, + "learning_rate": 2.5786163522012577e-07, + "logps/chosen": -40.927040100097656, + "logps/rejected": -42.58702087402344, + "loss": 0.6836, + "losses/dpo": 0.6857635378837585, + "losses/sft": 1.0527235269546509, + "losses/total": 0.6857635378837585, + "ref_logps/chosen": -39.78255081176758, + "ref_logps/rejected": -41.23955535888672, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11444923281669617, + "rewards/margins": 0.020297367125749588, + "rewards/rejected": -0.13474659621715546, + "step": 164 + }, + { + "epoch": 0.16, + "grad_norm": 11.951457977294922, + "learning_rate": 2.5943396226415094e-07, + "logps/chosen": -27.83062744140625, + "logps/rejected": -28.856121063232422, + "loss": 0.6832, + "losses/dpo": 0.6878472566604614, + "losses/sft": 1.077296495437622, + "losses/total": 0.6878472566604614, + "ref_logps/chosen": -27.205949783325195, + "ref_logps/rejected": -28.023914337158203, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06246752291917801, + "rewards/margins": 0.02075308747589588, + "rewards/rejected": -0.08322061598300934, + "step": 165 + }, + { + "epoch": 0.16, + "grad_norm": 14.284201622009277, + "learning_rate": 2.6100628930817607e-07, + "logps/chosen": -38.03638458251953, + "logps/rejected": -38.48820495605469, + "loss": 0.6943, + "losses/dpo": 0.7134230136871338, + "losses/sft": 1.8575987815856934, + "losses/total": 0.7134230136871338, + "ref_logps/chosen": -37.123382568359375, + "ref_logps/rejected": -37.59544372558594, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09130043536424637, + "rewards/margins": -0.002024741843342781, + "rewards/rejected": -0.08927569538354874, + "step": 166 + }, + { + "epoch": 0.16, + "grad_norm": 17.11756134033203, + "learning_rate": 2.6257861635220124e-07, + "logps/chosen": -42.22746276855469, + "logps/rejected": -45.72160720825195, + "loss": 0.6843, + "losses/dpo": 0.6774294972419739, + "losses/sft": 1.6261186599731445, + "losses/total": 0.6774294972419739, + "ref_logps/chosen": -41.04143524169922, + "ref_logps/rejected": -44.35469055175781, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11860252916812897, + "rewards/margins": 0.018089205026626587, + "rewards/rejected": -0.13669173419475555, + "step": 167 + }, + { + "epoch": 0.16, + "grad_norm": 15.555020332336426, + "learning_rate": 2.641509433962264e-07, + "logps/chosen": -38.277732849121094, + "logps/rejected": -47.20541763305664, + "loss": 0.6879, + "losses/dpo": 0.6990188360214233, + "losses/sft": 1.5325950384140015, + "losses/total": 0.6990188360214233, + "ref_logps/chosen": -37.04336929321289, + "ref_logps/rejected": -45.858497619628906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12343662977218628, + "rewards/margins": 0.01125560887157917, + "rewards/rejected": -0.1346922367811203, + "step": 168 + }, + { + "epoch": 0.16, + "grad_norm": 16.52756118774414, + "learning_rate": 2.6572327044025154e-07, + "logps/chosen": -41.617828369140625, + "logps/rejected": -41.552860260009766, + "loss": 0.6899, + "losses/dpo": 0.6829442977905273, + "losses/sft": 1.0690327882766724, + "losses/total": 0.6829442977905273, + "ref_logps/chosen": -40.609249114990234, + "ref_logps/rejected": -40.46549987792969, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10085811465978622, + "rewards/margins": 0.007877673022449017, + "rewards/rejected": -0.10873579233884811, + "step": 169 + }, + { + "epoch": 0.16, + "grad_norm": 14.34033489227295, + "learning_rate": 2.672955974842767e-07, + "logps/chosen": -36.80204391479492, + "logps/rejected": -47.43988800048828, + "loss": 0.6904, + "losses/dpo": 0.7027833461761475, + "losses/sft": 1.1950372457504272, + "losses/total": 0.7027833461761475, + "ref_logps/chosen": -35.8398551940918, + "ref_logps/rejected": -46.41313171386719, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.09621897339820862, + "rewards/margins": 0.006456691771745682, + "rewards/rejected": -0.1026756763458252, + "step": 170 + }, + { + "epoch": 0.16, + "grad_norm": 14.940962791442871, + "learning_rate": 2.6886792452830185e-07, + "logps/chosen": -43.187530517578125, + "logps/rejected": -49.84062194824219, + "loss": 0.69, + "losses/dpo": 0.6816960573196411, + "losses/sft": 1.5521328449249268, + "losses/total": 0.6816960573196411, + "ref_logps/chosen": -41.991539001464844, + "ref_logps/rejected": -48.575469970703125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11959922313690186, + "rewards/margins": 0.006915335543453693, + "rewards/rejected": -0.12651455402374268, + "step": 171 + }, + { + "epoch": 0.16, + "grad_norm": 15.463927268981934, + "learning_rate": 2.70440251572327e-07, + "logps/chosen": -32.57449722290039, + "logps/rejected": -46.29249572753906, + "loss": 0.6773, + "losses/dpo": 0.687432587146759, + "losses/sft": 1.2540098428726196, + "losses/total": 0.687432587146759, + "ref_logps/chosen": -31.729053497314453, + "ref_logps/rejected": -45.11859130859375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08454425632953644, + "rewards/margins": 0.03284570947289467, + "rewards/rejected": -0.11738996207714081, + "step": 172 + }, + { + "epoch": 0.16, + "grad_norm": 16.008506774902344, + "learning_rate": 2.720125786163522e-07, + "logps/chosen": -44.71931457519531, + "logps/rejected": -49.086971282958984, + "loss": 0.6803, + "losses/dpo": 0.6806377172470093, + "losses/sft": 1.4626554250717163, + "losses/total": 0.6806377172470093, + "ref_logps/chosen": -43.528839111328125, + "ref_logps/rejected": -47.61699676513672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11904732137918472, + "rewards/margins": 0.027950255200266838, + "rewards/rejected": -0.1469975709915161, + "step": 173 + }, + { + "epoch": 0.16, + "grad_norm": 16.596031188964844, + "learning_rate": 2.735849056603773e-07, + "logps/chosen": -41.14557647705078, + "logps/rejected": -51.88591003417969, + "loss": 0.6778, + "losses/dpo": 0.6837175488471985, + "losses/sft": 1.5990148782730103, + "losses/total": 0.6837175488471985, + "ref_logps/chosen": -40.182373046875, + "ref_logps/rejected": -50.60072708129883, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09632086008787155, + "rewards/margins": 0.032197415828704834, + "rewards/rejected": -0.12851828336715698, + "step": 174 + }, + { + "epoch": 0.17, + "grad_norm": 14.481203079223633, + "learning_rate": 2.751572327044025e-07, + "logps/chosen": -32.59597396850586, + "logps/rejected": -39.09141540527344, + "loss": 0.6745, + "losses/dpo": 0.6526998281478882, + "losses/sft": 1.1885079145431519, + "losses/total": 0.6526998281478882, + "ref_logps/chosen": -31.67422103881836, + "ref_logps/rejected": -37.783836364746094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0921751856803894, + "rewards/margins": 0.03858298808336258, + "rewards/rejected": -0.1307581663131714, + "step": 175 + }, + { + "epoch": 0.17, + "grad_norm": 13.180888175964355, + "learning_rate": 2.767295597484277e-07, + "logps/chosen": -30.781482696533203, + "logps/rejected": -33.86185836791992, + "loss": 0.7028, + "losses/dpo": 0.702089786529541, + "losses/sft": 1.3457748889923096, + "losses/total": 0.702089786529541, + "ref_logps/chosen": -29.678794860839844, + "ref_logps/rejected": -32.9378662109375, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.11026884615421295, + "rewards/margins": -0.017869500443339348, + "rewards/rejected": -0.09239935129880905, + "step": 176 + }, + { + "epoch": 0.17, + "grad_norm": 16.37763214111328, + "learning_rate": 2.783018867924528e-07, + "logps/chosen": -39.206024169921875, + "logps/rejected": -49.604888916015625, + "loss": 0.6964, + "losses/dpo": 0.68863445520401, + "losses/sft": 1.305910587310791, + "losses/total": 0.68863445520401, + "ref_logps/chosen": -38.11943817138672, + "ref_logps/rejected": -48.57234573364258, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10865846276283264, + "rewards/margins": -0.005403862800449133, + "rewards/rejected": -0.10325458645820618, + "step": 177 + }, + { + "epoch": 0.17, + "grad_norm": 17.375797271728516, + "learning_rate": 2.79874213836478e-07, + "logps/chosen": -45.335411071777344, + "logps/rejected": -59.22425842285156, + "loss": 0.6907, + "losses/dpo": 0.6877986192703247, + "losses/sft": 1.5931527614593506, + "losses/total": 0.6877986192703247, + "ref_logps/chosen": -43.8665885925293, + "ref_logps/rejected": -57.69758224487305, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14688171446323395, + "rewards/margins": 0.005785716697573662, + "rewards/rejected": -0.15266743302345276, + "step": 178 + }, + { + "epoch": 0.17, + "grad_norm": 13.111441612243652, + "learning_rate": 2.814465408805031e-07, + "logps/chosen": -29.578392028808594, + "logps/rejected": -31.886646270751953, + "loss": 0.7002, + "losses/dpo": 0.7050355672836304, + "losses/sft": 1.4341094493865967, + "losses/total": 0.7050355672836304, + "ref_logps/chosen": -28.493379592895508, + "ref_logps/rejected": -30.934284210205078, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1085013598203659, + "rewards/margins": -0.013265163637697697, + "rewards/rejected": -0.09523618966341019, + "step": 179 + }, + { + "epoch": 0.17, + "grad_norm": 16.71872901916504, + "learning_rate": 2.830188679245283e-07, + "logps/chosen": -46.59730911254883, + "logps/rejected": -50.84010314941406, + "loss": 0.6928, + "losses/dpo": 0.6997765302658081, + "losses/sft": 1.3833726644515991, + "losses/total": 0.6997765302658081, + "ref_logps/chosen": -45.3701171875, + "ref_logps/rejected": -49.58867263793945, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1227194219827652, + "rewards/margins": 0.0024234289303421974, + "rewards/rejected": -0.12514284253120422, + "step": 180 + }, + { + "epoch": 0.17, + "grad_norm": 13.005847930908203, + "learning_rate": 2.8459119496855345e-07, + "logps/chosen": -28.849990844726562, + "logps/rejected": -40.01360321044922, + "loss": 0.6859, + "losses/dpo": 0.6663920283317566, + "losses/sft": 1.5015794038772583, + "losses/total": 0.6663920283317566, + "ref_logps/chosen": -27.75012969970703, + "ref_logps/rejected": -38.7610969543457, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1099858433008194, + "rewards/margins": 0.015264769084751606, + "rewards/rejected": -0.12525062263011932, + "step": 181 + }, + { + "epoch": 0.17, + "grad_norm": 16.202726364135742, + "learning_rate": 2.861635220125786e-07, + "logps/chosen": -47.63661193847656, + "logps/rejected": -43.00746536254883, + "loss": 0.7008, + "losses/dpo": 0.7257519364356995, + "losses/sft": 1.6272774934768677, + "losses/total": 0.7257519364356995, + "ref_logps/chosen": -46.10846710205078, + "ref_logps/rejected": -41.60276412963867, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15281397104263306, + "rewards/margins": -0.012343762442469597, + "rewards/rejected": -0.1404702067375183, + "step": 182 + }, + { + "epoch": 0.17, + "grad_norm": 15.507378578186035, + "learning_rate": 2.8773584905660376e-07, + "logps/chosen": -34.826133728027344, + "logps/rejected": -35.629150390625, + "loss": 0.6983, + "losses/dpo": 0.7034717202186584, + "losses/sft": 1.0290791988372803, + "losses/total": 0.7034717202186584, + "ref_logps/chosen": -33.809837341308594, + "ref_logps/rejected": -34.70735549926758, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.10162919014692307, + "rewards/margins": -0.00944942981004715, + "rewards/rejected": -0.09217976033687592, + "step": 183 + }, + { + "epoch": 0.17, + "grad_norm": 16.532779693603516, + "learning_rate": 2.8930817610062893e-07, + "logps/chosen": -39.020416259765625, + "logps/rejected": -49.396759033203125, + "loss": 0.6692, + "losses/dpo": 0.6778005957603455, + "losses/sft": 1.1891576051712036, + "losses/total": 0.6778005957603455, + "ref_logps/chosen": -37.64043426513672, + "ref_logps/rejected": -47.520320892333984, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13799801468849182, + "rewards/margins": 0.04964599758386612, + "rewards/rejected": -0.18764400482177734, + "step": 184 + }, + { + "epoch": 0.17, + "grad_norm": 17.674861907958984, + "learning_rate": 2.9088050314465406e-07, + "logps/chosen": -44.64285659790039, + "logps/rejected": -58.697853088378906, + "loss": 0.6659, + "losses/dpo": 0.6724292039871216, + "losses/sft": 1.3990806341171265, + "losses/total": 0.6724292039871216, + "ref_logps/chosen": -43.65705871582031, + "ref_logps/rejected": -57.150028228759766, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09857955574989319, + "rewards/margins": 0.056202761828899384, + "rewards/rejected": -0.15478231012821198, + "step": 185 + }, + { + "epoch": 0.18, + "grad_norm": 16.24513053894043, + "learning_rate": 2.9245283018867923e-07, + "logps/chosen": -43.595157623291016, + "logps/rejected": -50.3731575012207, + "loss": 0.6705, + "losses/dpo": 0.6837365627288818, + "losses/sft": 1.6158891916275024, + "losses/total": 0.6837365627288818, + "ref_logps/chosen": -42.28122329711914, + "ref_logps/rejected": -48.581905364990234, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13139350712299347, + "rewards/margins": 0.047731611877679825, + "rewards/rejected": -0.1791251301765442, + "step": 186 + }, + { + "epoch": 0.18, + "grad_norm": 14.867253303527832, + "learning_rate": 2.9402515723270436e-07, + "logps/chosen": -38.93950653076172, + "logps/rejected": -38.7735595703125, + "loss": 0.6962, + "losses/dpo": 0.7279604077339172, + "losses/sft": 1.3980575799942017, + "losses/total": 0.7279604077339172, + "ref_logps/chosen": -37.841529846191406, + "ref_logps/rejected": -37.7196044921875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1097974181175232, + "rewards/margins": -0.004402121528983116, + "rewards/rejected": -0.10539529472589493, + "step": 187 + }, + { + "epoch": 0.18, + "grad_norm": 16.037687301635742, + "learning_rate": 2.9559748427672953e-07, + "logps/chosen": -39.63153839111328, + "logps/rejected": -51.349578857421875, + "loss": 0.6738, + "losses/dpo": 0.6700311899185181, + "losses/sft": 1.102718472480774, + "losses/total": 0.6700311899185181, + "ref_logps/chosen": -38.3155517578125, + "ref_logps/rejected": -49.63243865966797, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13159829378128052, + "rewards/margins": 0.040115997195243835, + "rewards/rejected": -0.17171429097652435, + "step": 188 + }, + { + "epoch": 0.18, + "grad_norm": 14.470324516296387, + "learning_rate": 2.971698113207547e-07, + "logps/chosen": -31.057300567626953, + "logps/rejected": -39.35655212402344, + "loss": 0.6921, + "losses/dpo": 0.6860572099685669, + "losses/sft": 1.5276051759719849, + "losses/total": 0.6860572099685669, + "ref_logps/chosen": -29.975032806396484, + "ref_logps/rejected": -38.23912811279297, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.10822691768407822, + "rewards/margins": 0.003515336662530899, + "rewards/rejected": -0.11174225062131882, + "step": 189 + }, + { + "epoch": 0.18, + "grad_norm": 15.651127815246582, + "learning_rate": 2.9874213836477983e-07, + "logps/chosen": -34.63383483886719, + "logps/rejected": -50.765357971191406, + "loss": 0.685, + "losses/dpo": 0.6855325698852539, + "losses/sft": 1.1889837980270386, + "losses/total": 0.6855325698852539, + "ref_logps/chosen": -33.375396728515625, + "ref_logps/rejected": -49.328575134277344, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12584339082241058, + "rewards/margins": 0.017834901809692383, + "rewards/rejected": -0.14367829263210297, + "step": 190 + }, + { + "epoch": 0.18, + "grad_norm": 15.785958290100098, + "learning_rate": 3.00314465408805e-07, + "logps/chosen": -45.23821258544922, + "logps/rejected": -42.64419174194336, + "loss": 0.6782, + "losses/dpo": 0.7013839483261108, + "losses/sft": 1.2799322605133057, + "losses/total": 0.7013839483261108, + "ref_logps/chosen": -43.82655334472656, + "ref_logps/rejected": -40.90449523925781, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1411656141281128, + "rewards/margins": 0.03280426189303398, + "rewards/rejected": -0.17396987974643707, + "step": 191 + }, + { + "epoch": 0.18, + "grad_norm": 15.278894424438477, + "learning_rate": 3.018867924528302e-07, + "logps/chosen": -48.549888610839844, + "logps/rejected": -48.774085998535156, + "loss": 0.6952, + "losses/dpo": 0.7167978286743164, + "losses/sft": 1.6310853958129883, + "losses/total": 0.7167978286743164, + "ref_logps/chosen": -47.09099197387695, + "ref_logps/rejected": -47.336204528808594, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1458895206451416, + "rewards/margins": -0.002101265825331211, + "rewards/rejected": -0.14378824830055237, + "step": 192 + }, + { + "epoch": 0.18, + "grad_norm": 15.971733093261719, + "learning_rate": 3.034591194968553e-07, + "logps/chosen": -40.196651458740234, + "logps/rejected": -38.4633674621582, + "loss": 0.6789, + "losses/dpo": 0.6922624111175537, + "losses/sft": 1.6729313135147095, + "losses/total": 0.6922624111175537, + "ref_logps/chosen": -38.757503509521484, + "ref_logps/rejected": -36.720375061035156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.14391487836837769, + "rewards/margins": 0.030384279787540436, + "rewards/rejected": -0.17429916560649872, + "step": 193 + }, + { + "epoch": 0.18, + "grad_norm": 14.058929443359375, + "learning_rate": 3.050314465408805e-07, + "logps/chosen": -31.338830947875977, + "logps/rejected": -37.29695129394531, + "loss": 0.6764, + "losses/dpo": 0.6699336767196655, + "losses/sft": 1.2760751247406006, + "losses/total": 0.6699336767196655, + "ref_logps/chosen": -30.212322235107422, + "ref_logps/rejected": -35.824440002441406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.11265072226524353, + "rewards/margins": 0.03460048511624336, + "rewards/rejected": -0.1472511887550354, + "step": 194 + }, + { + "epoch": 0.18, + "grad_norm": 15.570208549499512, + "learning_rate": 3.066037735849056e-07, + "logps/chosen": -31.415109634399414, + "logps/rejected": -47.4658203125, + "loss": 0.6769, + "losses/dpo": 0.6840342879295349, + "losses/sft": 1.202999234199524, + "losses/total": 0.6840342879295349, + "ref_logps/chosen": -30.24490737915039, + "ref_logps/rejected": -45.958744049072266, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1170201227068901, + "rewards/margins": 0.03368773311376572, + "rewards/rejected": -0.15070785582065582, + "step": 195 + }, + { + "epoch": 0.19, + "grad_norm": 16.667709350585938, + "learning_rate": 3.081761006289308e-07, + "logps/chosen": -43.68022918701172, + "logps/rejected": -56.54926300048828, + "loss": 0.6836, + "losses/dpo": 0.6808040142059326, + "losses/sft": 1.133742094039917, + "losses/total": 0.6808040142059326, + "ref_logps/chosen": -41.85504150390625, + "ref_logps/rejected": -54.516883850097656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18251898884773254, + "rewards/margins": 0.02071884088218212, + "rewards/rejected": -0.20323783159255981, + "step": 196 + }, + { + "epoch": 0.19, + "grad_norm": 15.58508586883545, + "learning_rate": 3.0974842767295597e-07, + "logps/chosen": -39.37928009033203, + "logps/rejected": -50.64099884033203, + "loss": 0.682, + "losses/dpo": 0.6437791585922241, + "losses/sft": 1.2103878259658813, + "losses/total": 0.6437791585922241, + "ref_logps/chosen": -37.771575927734375, + "ref_logps/rejected": -48.79432678222656, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.16077077388763428, + "rewards/margins": 0.023896407335996628, + "rewards/rejected": -0.1846671849489212, + "step": 197 + }, + { + "epoch": 0.19, + "grad_norm": 16.314849853515625, + "learning_rate": 3.113207547169811e-07, + "logps/chosen": -50.09778594970703, + "logps/rejected": -46.592864990234375, + "loss": 0.6833, + "losses/dpo": 0.6780909299850464, + "losses/sft": 1.2175432443618774, + "losses/total": 0.6780909299850464, + "ref_logps/chosen": -48.372596740722656, + "ref_logps/rejected": -44.658721923828125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17251861095428467, + "rewards/margins": 0.02089569717645645, + "rewards/rejected": -0.19341430068016052, + "step": 198 + }, + { + "epoch": 0.19, + "grad_norm": 15.102590560913086, + "learning_rate": 3.1289308176100627e-07, + "logps/chosen": -36.87873840332031, + "logps/rejected": -47.39272689819336, + "loss": 0.6813, + "losses/dpo": 0.7059723138809204, + "losses/sft": 1.406247615814209, + "losses/total": 0.7059723138809204, + "ref_logps/chosen": -35.17682647705078, + "ref_logps/rejected": -45.424652099609375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1701907515525818, + "rewards/margins": 0.02661687135696411, + "rewards/rejected": -0.1968076229095459, + "step": 199 + }, + { + "epoch": 0.19, + "grad_norm": 14.590982437133789, + "learning_rate": 3.1446540880503144e-07, + "logps/chosen": -35.96052551269531, + "logps/rejected": -39.116111755371094, + "loss": 0.7049, + "losses/dpo": 0.7237061262130737, + "losses/sft": 1.5074063539505005, + "losses/total": 0.7237061262130737, + "ref_logps/chosen": -33.97023010253906, + "ref_logps/rejected": -37.34395217895508, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19902947545051575, + "rewards/margins": -0.021813856437802315, + "rewards/rejected": -0.1772156059741974, + "step": 200 + }, + { + "epoch": 0.19, + "grad_norm": 17.552453994750977, + "learning_rate": 3.1603773584905657e-07, + "logps/chosen": -50.09906768798828, + "logps/rejected": -51.57411193847656, + "loss": 0.6844, + "losses/dpo": 0.6874098181724548, + "losses/sft": 1.8443374633789062, + "losses/total": 0.6874098181724548, + "ref_logps/chosen": -48.258975982666016, + "ref_logps/rejected": -49.54026794433594, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18400876224040985, + "rewards/margins": 0.01937580667436123, + "rewards/rejected": -0.20338457822799683, + "step": 201 + }, + { + "epoch": 0.19, + "grad_norm": 15.17917537689209, + "learning_rate": 3.1761006289308174e-07, + "logps/chosen": -41.997772216796875, + "logps/rejected": -42.95301055908203, + "loss": 0.6651, + "losses/dpo": 0.6975255012512207, + "losses/sft": 1.6166423559188843, + "losses/total": 0.6975255012512207, + "ref_logps/chosen": -40.2783203125, + "ref_logps/rejected": -40.63972473144531, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17194506525993347, + "rewards/margins": 0.059383779764175415, + "rewards/rejected": -0.23132885992527008, + "step": 202 + }, + { + "epoch": 0.19, + "grad_norm": 13.554970741271973, + "learning_rate": 3.1918238993710687e-07, + "logps/chosen": -32.481422424316406, + "logps/rejected": -41.664222717285156, + "loss": 0.6747, + "losses/dpo": 0.6949838399887085, + "losses/sft": 1.0620595216751099, + "losses/total": 0.6949838399887085, + "ref_logps/chosen": -31.102386474609375, + "ref_logps/rejected": -39.886993408203125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.13790348172187805, + "rewards/margins": 0.03981912508606911, + "rewards/rejected": -0.17772261798381805, + "step": 203 + }, + { + "epoch": 0.19, + "grad_norm": 15.56489372253418, + "learning_rate": 3.2075471698113204e-07, + "logps/chosen": -37.09054946899414, + "logps/rejected": -53.35906219482422, + "loss": 0.692, + "losses/dpo": 0.6879791021347046, + "losses/sft": 1.4524227380752563, + "losses/total": 0.6879791021347046, + "ref_logps/chosen": -35.559696197509766, + "ref_logps/rejected": -51.78603744506836, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.15308555960655212, + "rewards/margins": 0.004216962028294802, + "rewards/rejected": -0.15730252861976624, + "step": 204 + }, + { + "epoch": 0.19, + "grad_norm": 16.473398208618164, + "learning_rate": 3.223270440251572e-07, + "logps/chosen": -47.96939468383789, + "logps/rejected": -42.30652618408203, + "loss": 0.6717, + "losses/dpo": 0.6861964464187622, + "losses/sft": 1.3401845693588257, + "losses/total": 0.6861964464187622, + "ref_logps/chosen": -46.041831970214844, + "ref_logps/rejected": -39.90606689453125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19275638461112976, + "rewards/margins": 0.04728955030441284, + "rewards/rejected": -0.2400459349155426, + "step": 205 + }, + { + "epoch": 0.19, + "grad_norm": 15.597003936767578, + "learning_rate": 3.2389937106918235e-07, + "logps/chosen": -41.01193618774414, + "logps/rejected": -51.83057403564453, + "loss": 0.6682, + "losses/dpo": 0.6791011095046997, + "losses/sft": 1.6842013597488403, + "losses/total": 0.6791011095046997, + "ref_logps/chosen": -39.343719482421875, + "ref_logps/rejected": -49.630767822265625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16682153940200806, + "rewards/margins": 0.05315908417105675, + "rewards/rejected": -0.2199806123971939, + "step": 206 + }, + { + "epoch": 0.2, + "grad_norm": 16.27007484436035, + "learning_rate": 3.254716981132075e-07, + "logps/chosen": -32.553504943847656, + "logps/rejected": -45.2066650390625, + "loss": 0.6929, + "losses/dpo": 0.690405011177063, + "losses/sft": 1.352427363395691, + "losses/total": 0.690405011177063, + "ref_logps/chosen": -30.952503204345703, + "ref_logps/rejected": -43.589378356933594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1601003110408783, + "rewards/margins": 0.0016281567513942719, + "rewards/rejected": -0.16172844171524048, + "step": 207 + }, + { + "epoch": 0.2, + "grad_norm": 17.231746673583984, + "learning_rate": 3.2704402515723265e-07, + "logps/chosen": -43.998146057128906, + "logps/rejected": -41.52692794799805, + "loss": 0.7, + "losses/dpo": 0.6533277630805969, + "losses/sft": 1.257144570350647, + "losses/total": 0.6533277630805969, + "ref_logps/chosen": -42.212215423583984, + "ref_logps/rejected": -39.830039978027344, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.17859303951263428, + "rewards/margins": -0.008904272690415382, + "rewards/rejected": -0.16968876123428345, + "step": 208 + }, + { + "epoch": 0.2, + "grad_norm": 16.129486083984375, + "learning_rate": 3.286163522012578e-07, + "logps/chosen": -41.640602111816406, + "logps/rejected": -46.48876190185547, + "loss": 0.6814, + "losses/dpo": 0.6967303156852722, + "losses/sft": 0.8777698874473572, + "losses/total": 0.6967303156852722, + "ref_logps/chosen": -39.685237884521484, + "ref_logps/rejected": -44.27562713623047, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19553638994693756, + "rewards/margins": 0.025777162984013557, + "rewards/rejected": -0.22131355106830597, + "step": 209 + }, + { + "epoch": 0.2, + "grad_norm": 14.743343353271484, + "learning_rate": 3.30188679245283e-07, + "logps/chosen": -32.78865432739258, + "logps/rejected": -42.84953689575195, + "loss": 0.6634, + "losses/dpo": 0.6717252135276794, + "losses/sft": 1.4664636850357056, + "losses/total": 0.6717252135276794, + "ref_logps/chosen": -31.449596405029297, + "ref_logps/rejected": -40.8954963684082, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13390573859214783, + "rewards/margins": 0.06149820238351822, + "rewards/rejected": -0.19540394842624664, + "step": 210 + }, + { + "epoch": 0.2, + "grad_norm": 13.537386894226074, + "learning_rate": 3.317610062893081e-07, + "logps/chosen": -33.905540466308594, + "logps/rejected": -35.38450622558594, + "loss": 0.6864, + "losses/dpo": 0.6876325011253357, + "losses/sft": 1.122238039970398, + "losses/total": 0.6876325011253357, + "ref_logps/chosen": -32.07237243652344, + "ref_logps/rejected": -33.40351486206055, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1833166927099228, + "rewards/margins": 0.014782454818487167, + "rewards/rejected": -0.19809913635253906, + "step": 211 + }, + { + "epoch": 0.2, + "grad_norm": 17.885358810424805, + "learning_rate": 3.333333333333333e-07, + "logps/chosen": -50.72590637207031, + "logps/rejected": -57.39551544189453, + "loss": 0.6716, + "losses/dpo": 0.6695274710655212, + "losses/sft": 1.7289897203445435, + "losses/total": 0.6695274710655212, + "ref_logps/chosen": -48.49372100830078, + "ref_logps/rejected": -54.711021423339844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22321867942810059, + "rewards/margins": 0.04523022100329399, + "rewards/rejected": -0.2684488892555237, + "step": 212 + }, + { + "epoch": 0.2, + "grad_norm": 13.853524208068848, + "learning_rate": 3.349056603773585e-07, + "logps/chosen": -29.290006637573242, + "logps/rejected": -40.776973724365234, + "loss": 0.6819, + "losses/dpo": 0.6762322783470154, + "losses/sft": 1.0710937976837158, + "losses/total": 0.6762322783470154, + "ref_logps/chosen": -27.652446746826172, + "ref_logps/rejected": -38.88838195800781, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16375604271888733, + "rewards/margins": 0.025102993473410606, + "rewards/rejected": -0.18885904550552368, + "step": 213 + }, + { + "epoch": 0.2, + "grad_norm": 14.594182014465332, + "learning_rate": 3.364779874213836e-07, + "logps/chosen": -31.246246337890625, + "logps/rejected": -41.12478256225586, + "loss": 0.6688, + "losses/dpo": 0.6594405174255371, + "losses/sft": 1.2172948122024536, + "losses/total": 0.6594405174255371, + "ref_logps/chosen": -29.770009994506836, + "ref_logps/rejected": -39.10675048828125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.14762377738952637, + "rewards/margins": 0.05417914688587189, + "rewards/rejected": -0.20180290937423706, + "step": 214 + }, + { + "epoch": 0.2, + "grad_norm": 16.192296981811523, + "learning_rate": 3.380503144654088e-07, + "logps/chosen": -44.14506530761719, + "logps/rejected": -55.825077056884766, + "loss": 0.6631, + "losses/dpo": 0.656355082988739, + "losses/sft": 1.545645833015442, + "losses/total": 0.656355082988739, + "ref_logps/chosen": -41.701148986816406, + "ref_logps/rejected": -52.75156021118164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.24439160525798798, + "rewards/margins": 0.06296008080244064, + "rewards/rejected": -0.307351678609848, + "step": 215 + }, + { + "epoch": 0.2, + "grad_norm": 14.246395111083984, + "learning_rate": 3.396226415094339e-07, + "logps/chosen": -35.00162887573242, + "logps/rejected": -43.11711120605469, + "loss": 0.6689, + "losses/dpo": 0.6910299062728882, + "losses/sft": 1.5011862516403198, + "losses/total": 0.6910299062728882, + "ref_logps/chosen": -33.433834075927734, + "ref_logps/rejected": -41.03712463378906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1567794233560562, + "rewards/margins": 0.05121920257806778, + "rewards/rejected": -0.2079986333847046, + "step": 216 + }, + { + "epoch": 0.2, + "grad_norm": 15.941094398498535, + "learning_rate": 3.411949685534591e-07, + "logps/chosen": -38.74461364746094, + "logps/rejected": -53.71509552001953, + "loss": 0.6728, + "losses/dpo": 0.708710789680481, + "losses/sft": 1.4716625213623047, + "losses/total": 0.708710789680481, + "ref_logps/chosen": -36.182579040527344, + "ref_logps/rejected": -50.68135070800781, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25620341300964355, + "rewards/margins": 0.04717123508453369, + "rewards/rejected": -0.30337464809417725, + "step": 217 + }, + { + "epoch": 0.21, + "grad_norm": 15.452842712402344, + "learning_rate": 3.4276729559748426e-07, + "logps/chosen": -46.325130462646484, + "logps/rejected": -41.82459259033203, + "loss": 0.6763, + "losses/dpo": 0.7202698588371277, + "losses/sft": 1.8678845167160034, + "losses/total": 0.7202698588371277, + "ref_logps/chosen": -44.23219299316406, + "ref_logps/rejected": -39.33218002319336, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20929360389709473, + "rewards/margins": 0.039947494864463806, + "rewards/rejected": -0.24924111366271973, + "step": 218 + }, + { + "epoch": 0.21, + "grad_norm": 14.239889144897461, + "learning_rate": 3.443396226415094e-07, + "logps/chosen": -35.42744445800781, + "logps/rejected": -39.29603576660156, + "loss": 0.6799, + "losses/dpo": 0.6879767179489136, + "losses/sft": 1.5547114610671997, + "losses/total": 0.6879767179489136, + "ref_logps/chosen": -33.429893493652344, + "ref_logps/rejected": -36.999755859375, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19975495338439941, + "rewards/margins": 0.029872652143239975, + "rewards/rejected": -0.2296276092529297, + "step": 219 + }, + { + "epoch": 0.21, + "grad_norm": 15.082650184631348, + "learning_rate": 3.4591194968553456e-07, + "logps/chosen": -34.29314422607422, + "logps/rejected": -47.68400573730469, + "loss": 0.6497, + "losses/dpo": 0.6668439507484436, + "losses/sft": 1.0790891647338867, + "losses/total": 0.6668439507484436, + "ref_logps/chosen": -32.624237060546875, + "ref_logps/rejected": -45.09401321411133, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1668909788131714, + "rewards/margins": 0.09210819005966187, + "rewards/rejected": -0.25899916887283325, + "step": 220 + }, + { + "epoch": 0.21, + "grad_norm": 15.518473625183105, + "learning_rate": 3.4748427672955973e-07, + "logps/chosen": -40.32875442504883, + "logps/rejected": -52.40559005737305, + "loss": 0.6599, + "losses/dpo": 0.6081426739692688, + "losses/sft": 1.261610507965088, + "losses/total": 0.6081426739692688, + "ref_logps/chosen": -38.299949645996094, + "ref_logps/rejected": -49.6461181640625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20288079977035522, + "rewards/margins": 0.07306638360023499, + "rewards/rejected": -0.2759471833705902, + "step": 221 + }, + { + "epoch": 0.21, + "grad_norm": 14.866722106933594, + "learning_rate": 3.4905660377358486e-07, + "logps/chosen": -38.81988525390625, + "logps/rejected": -49.87083435058594, + "loss": 0.6693, + "losses/dpo": 0.6657773852348328, + "losses/sft": 1.343111276626587, + "losses/total": 0.6657773852348328, + "ref_logps/chosen": -36.71820068359375, + "ref_logps/rejected": -47.21161651611328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21016836166381836, + "rewards/margins": 0.05575356259942055, + "rewards/rejected": -0.265921950340271, + "step": 222 + }, + { + "epoch": 0.21, + "grad_norm": 15.532499313354492, + "learning_rate": 3.5062893081761003e-07, + "logps/chosen": -45.165679931640625, + "logps/rejected": -42.318572998046875, + "loss": 0.6893, + "losses/dpo": 0.6813037395477295, + "losses/sft": 1.8235492706298828, + "losses/total": 0.6813037395477295, + "ref_logps/chosen": -42.794921875, + "ref_logps/rejected": -39.800296783447266, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.23707585036754608, + "rewards/margins": 0.01475165132433176, + "rewards/rejected": -0.2518274784088135, + "step": 223 + }, + { + "epoch": 0.21, + "grad_norm": 16.2032527923584, + "learning_rate": 3.5220125786163516e-07, + "logps/chosen": -43.01076889038086, + "logps/rejected": -43.869937896728516, + "loss": 0.6603, + "losses/dpo": 0.6653252840042114, + "losses/sft": 1.4910627603530884, + "losses/total": 0.6653252840042114, + "ref_logps/chosen": -41.44512176513672, + "ref_logps/rejected": -41.61030197143555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1565646529197693, + "rewards/margins": 0.06939879059791565, + "rewards/rejected": -0.22596344351768494, + "step": 224 + }, + { + "epoch": 0.21, + "grad_norm": 13.993906021118164, + "learning_rate": 3.5377358490566033e-07, + "logps/chosen": -37.451271057128906, + "logps/rejected": -41.676597595214844, + "loss": 0.6771, + "losses/dpo": 0.6879667043685913, + "losses/sft": 1.3644269704818726, + "losses/total": 0.6879667043685913, + "ref_logps/chosen": -35.446128845214844, + "ref_logps/rejected": -39.31643295288086, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2005145400762558, + "rewards/margins": 0.03550197184085846, + "rewards/rejected": -0.23601651191711426, + "step": 225 + }, + { + "epoch": 0.21, + "grad_norm": 15.624147415161133, + "learning_rate": 3.553459119496855e-07, + "logps/chosen": -38.95832061767578, + "logps/rejected": -47.245208740234375, + "loss": 0.6808, + "losses/dpo": 0.6604455709457397, + "losses/sft": 1.2016445398330688, + "losses/total": 0.6604455709457397, + "ref_logps/chosen": -36.592918395996094, + "ref_logps/rejected": -44.59624481201172, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.23654010891914368, + "rewards/margins": 0.028356090188026428, + "rewards/rejected": -0.2648961842060089, + "step": 226 + }, + { + "epoch": 0.21, + "grad_norm": 15.331037521362305, + "learning_rate": 3.5691823899371064e-07, + "logps/chosen": -33.339542388916016, + "logps/rejected": -47.19365692138672, + "loss": 0.694, + "losses/dpo": 0.696473240852356, + "losses/sft": 1.533835530281067, + "losses/total": 0.696473240852356, + "ref_logps/chosen": -31.02646255493164, + "ref_logps/rejected": -44.7705078125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2313082218170166, + "rewards/margins": 0.0110064297914505, + "rewards/rejected": -0.2423146367073059, + "step": 227 + }, + { + "epoch": 0.22, + "grad_norm": 14.999101638793945, + "learning_rate": 3.584905660377358e-07, + "logps/chosen": -34.735633850097656, + "logps/rejected": -40.994468688964844, + "loss": 0.674, + "losses/dpo": 0.6969943046569824, + "losses/sft": 1.3561644554138184, + "losses/total": 0.6969943046569824, + "ref_logps/chosen": -33.20204162597656, + "ref_logps/rejected": -39.01994323730469, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1533590853214264, + "rewards/margins": 0.04409363120794296, + "rewards/rejected": -0.19745272397994995, + "step": 228 + }, + { + "epoch": 0.22, + "grad_norm": 14.841408729553223, + "learning_rate": 3.60062893081761e-07, + "logps/chosen": -37.207210540771484, + "logps/rejected": -39.67007827758789, + "loss": 0.6915, + "losses/dpo": 0.7072970867156982, + "losses/sft": 1.3001089096069336, + "losses/total": 0.7072970867156982, + "ref_logps/chosen": -34.878719329833984, + "ref_logps/rejected": -37.27781677246094, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.23284918069839478, + "rewards/margins": 0.006377032026648521, + "rewards/rejected": -0.23922622203826904, + "step": 229 + }, + { + "epoch": 0.22, + "grad_norm": 15.088621139526367, + "learning_rate": 3.616352201257861e-07, + "logps/chosen": -37.07647705078125, + "logps/rejected": -43.55615234375, + "loss": 0.6722, + "losses/dpo": 0.7194198369979858, + "losses/sft": 1.6237705945968628, + "losses/total": 0.7194198369979858, + "ref_logps/chosen": -34.825340270996094, + "ref_logps/rejected": -40.844993591308594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22511407732963562, + "rewards/margins": 0.0460018664598465, + "rewards/rejected": -0.2711159586906433, + "step": 230 + }, + { + "epoch": 0.22, + "grad_norm": 15.284753799438477, + "learning_rate": 3.632075471698113e-07, + "logps/chosen": -39.28997039794922, + "logps/rejected": -43.157981872558594, + "loss": 0.6755, + "losses/dpo": 0.6688922643661499, + "losses/sft": 1.4212239980697632, + "losses/total": 0.6688922643661499, + "ref_logps/chosen": -37.02705383300781, + "ref_logps/rejected": -40.469093322753906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22629138827323914, + "rewards/margins": 0.042597681283950806, + "rewards/rejected": -0.26888906955718994, + "step": 231 + }, + { + "epoch": 0.22, + "grad_norm": 14.206757545471191, + "learning_rate": 3.647798742138364e-07, + "logps/chosen": -34.42321014404297, + "logps/rejected": -43.79088592529297, + "loss": 0.6681, + "losses/dpo": 0.664331316947937, + "losses/sft": 1.4147409200668335, + "losses/total": 0.664331316947937, + "ref_logps/chosen": -32.078857421875, + "ref_logps/rejected": -40.853614807128906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23443545401096344, + "rewards/margins": 0.05929189175367355, + "rewards/rejected": -0.2937273383140564, + "step": 232 + }, + { + "epoch": 0.22, + "grad_norm": 15.06775951385498, + "learning_rate": 3.663522012578616e-07, + "logps/chosen": -43.315086364746094, + "logps/rejected": -45.40140914916992, + "loss": 0.661, + "losses/dpo": 0.6380653977394104, + "losses/sft": 1.9001001119613647, + "losses/total": 0.6380653977394104, + "ref_logps/chosen": -40.40079116821289, + "ref_logps/rejected": -41.77799987792969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2914295196533203, + "rewards/margins": 0.0709112212061882, + "rewards/rejected": -0.3623407483100891, + "step": 233 + }, + { + "epoch": 0.22, + "grad_norm": 15.058982849121094, + "learning_rate": 3.6792452830188677e-07, + "logps/chosen": -41.931304931640625, + "logps/rejected": -43.587646484375, + "loss": 0.6634, + "losses/dpo": 0.6785554885864258, + "losses/sft": 1.5310620069503784, + "losses/total": 0.6785554885864258, + "ref_logps/chosen": -39.47883605957031, + "ref_logps/rejected": -40.498313903808594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2452470362186432, + "rewards/margins": 0.06368651241064072, + "rewards/rejected": -0.3089335560798645, + "step": 234 + }, + { + "epoch": 0.22, + "grad_norm": 15.68648910522461, + "learning_rate": 3.694968553459119e-07, + "logps/chosen": -45.422271728515625, + "logps/rejected": -33.526824951171875, + "loss": 0.7061, + "losses/dpo": 0.717628002166748, + "losses/sft": 1.4144009351730347, + "losses/total": 0.717628002166748, + "ref_logps/chosen": -42.57256317138672, + "ref_logps/rejected": -30.87489128112793, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2849707007408142, + "rewards/margins": -0.019777163863182068, + "rewards/rejected": -0.26519352197647095, + "step": 235 + }, + { + "epoch": 0.22, + "grad_norm": 16.10345458984375, + "learning_rate": 3.7106918238993707e-07, + "logps/chosen": -44.631004333496094, + "logps/rejected": -47.2867546081543, + "loss": 0.6631, + "losses/dpo": 0.6312600374221802, + "losses/sft": 1.4290958642959595, + "losses/total": 0.6312600374221802, + "ref_logps/chosen": -42.252628326416016, + "ref_logps/rejected": -44.24822235107422, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23783786594867706, + "rewards/margins": 0.06601482629776001, + "rewards/rejected": -0.30385270714759827, + "step": 236 + }, + { + "epoch": 0.22, + "grad_norm": 15.954336166381836, + "learning_rate": 3.7264150943396224e-07, + "logps/chosen": -39.63313293457031, + "logps/rejected": -52.807403564453125, + "loss": 0.6736, + "losses/dpo": 0.6659075617790222, + "losses/sft": 1.698724627494812, + "losses/total": 0.6659075617790222, + "ref_logps/chosen": -36.858253479003906, + "ref_logps/rejected": -49.60414123535156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2774878144264221, + "rewards/margins": 0.04283905774354935, + "rewards/rejected": -0.32032686471939087, + "step": 237 + }, + { + "epoch": 0.22, + "grad_norm": 14.88613510131836, + "learning_rate": 3.7421383647798737e-07, + "logps/chosen": -39.489967346191406, + "logps/rejected": -52.311187744140625, + "loss": 0.6459, + "losses/dpo": 0.6612695455551147, + "losses/sft": 1.4787285327911377, + "losses/total": 0.6612695455551147, + "ref_logps/chosen": -37.394012451171875, + "ref_logps/rejected": -49.207279205322266, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.20959532260894775, + "rewards/margins": 0.10079538822174072, + "rewards/rejected": -0.3103907108306885, + "step": 238 + }, + { + "epoch": 0.23, + "grad_norm": 17.082839965820312, + "learning_rate": 3.757861635220126e-07, + "logps/chosen": -47.36906051635742, + "logps/rejected": -52.26832962036133, + "loss": 0.6787, + "losses/dpo": 0.6704140305519104, + "losses/sft": 1.844097375869751, + "losses/total": 0.6704140305519104, + "ref_logps/chosen": -44.38483428955078, + "ref_logps/rejected": -48.93090057373047, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2984226942062378, + "rewards/margins": 0.03532020002603531, + "rewards/rejected": -0.3337429165840149, + "step": 239 + }, + { + "epoch": 0.23, + "grad_norm": 15.276880264282227, + "learning_rate": 3.773584905660377e-07, + "logps/chosen": -41.31379699707031, + "logps/rejected": -45.109554290771484, + "loss": 0.6721, + "losses/dpo": 0.679213285446167, + "losses/sft": 1.7050467729568481, + "losses/total": 0.679213285446167, + "ref_logps/chosen": -38.43292236328125, + "ref_logps/rejected": -41.68553924560547, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2880876064300537, + "rewards/margins": 0.05431407690048218, + "rewards/rejected": -0.3424016833305359, + "step": 240 + }, + { + "epoch": 0.23, + "grad_norm": 15.54065227508545, + "learning_rate": 3.789308176100629e-07, + "logps/chosen": -42.323673248291016, + "logps/rejected": -44.42234802246094, + "loss": 0.6537, + "losses/dpo": 0.6742671728134155, + "losses/sft": 1.3836464881896973, + "losses/total": 0.6742671728134155, + "ref_logps/chosen": -39.27201461791992, + "ref_logps/rejected": -40.444923400878906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30516576766967773, + "rewards/margins": 0.09257683157920837, + "rewards/rejected": -0.3977426290512085, + "step": 241 + }, + { + "epoch": 0.23, + "grad_norm": 15.982912063598633, + "learning_rate": 3.805031446540881e-07, + "logps/chosen": -45.311492919921875, + "logps/rejected": -44.36945724487305, + "loss": 0.666, + "losses/dpo": 0.6028597354888916, + "losses/sft": 1.141249656677246, + "losses/total": 0.6028597354888916, + "ref_logps/chosen": -42.80768585205078, + "ref_logps/rejected": -41.26680374145508, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2503806948661804, + "rewards/margins": 0.05988464504480362, + "rewards/rejected": -0.31026530265808105, + "step": 242 + }, + { + "epoch": 0.23, + "grad_norm": 15.576713562011719, + "learning_rate": 3.820754716981132e-07, + "logps/chosen": -40.11058807373047, + "logps/rejected": -49.348941802978516, + "loss": 0.6546, + "losses/dpo": 0.6521026492118835, + "losses/sft": 1.044120192527771, + "losses/total": 0.6521026492118835, + "ref_logps/chosen": -37.294281005859375, + "ref_logps/rejected": -45.683921813964844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.281631201505661, + "rewards/margins": 0.08487124741077423, + "rewards/rejected": -0.36650246381759644, + "step": 243 + }, + { + "epoch": 0.23, + "grad_norm": 16.844026565551758, + "learning_rate": 3.836477987421384e-07, + "logps/chosen": -45.600990295410156, + "logps/rejected": -52.28462219238281, + "loss": 0.6774, + "losses/dpo": 0.7089597582817078, + "losses/sft": 1.6474016904830933, + "losses/total": 0.7089597582817078, + "ref_logps/chosen": -42.518218994140625, + "ref_logps/rejected": -48.75550079345703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.30827704071998596, + "rewards/margins": 0.044635359197854996, + "rewards/rejected": -0.35291242599487305, + "step": 244 + }, + { + "epoch": 0.23, + "grad_norm": 15.172090530395508, + "learning_rate": 3.8522012578616355e-07, + "logps/chosen": -36.581722259521484, + "logps/rejected": -45.127784729003906, + "loss": 0.7012, + "losses/dpo": 0.7287442088127136, + "losses/sft": 1.337358832359314, + "losses/total": 0.7287442088127136, + "ref_logps/chosen": -33.399070739746094, + "ref_logps/rejected": -42.060569763183594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.31826525926589966, + "rewards/margins": -0.011543730273842812, + "rewards/rejected": -0.3067215085029602, + "step": 245 + }, + { + "epoch": 0.23, + "grad_norm": 14.574108123779297, + "learning_rate": 3.867924528301887e-07, + "logps/chosen": -36.471771240234375, + "logps/rejected": -40.091285705566406, + "loss": 0.6782, + "losses/dpo": 0.6828131079673767, + "losses/sft": 1.4105702638626099, + "losses/total": 0.6828131079673767, + "ref_logps/chosen": -34.01863098144531, + "ref_logps/rejected": -37.2685432434082, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24531395733356476, + "rewards/margins": 0.036960236728191376, + "rewards/rejected": -0.28227418661117554, + "step": 246 + }, + { + "epoch": 0.23, + "grad_norm": 15.863611221313477, + "learning_rate": 3.8836477987421385e-07, + "logps/chosen": -36.288108825683594, + "logps/rejected": -38.02286148071289, + "loss": 0.6863, + "losses/dpo": 0.7561929821968079, + "losses/sft": 1.3613510131835938, + "losses/total": 0.7561929821968079, + "ref_logps/chosen": -33.26061248779297, + "ref_logps/rejected": -34.768577575683594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3027498424053192, + "rewards/margins": 0.022678393870592117, + "rewards/rejected": -0.3254282474517822, + "step": 247 + }, + { + "epoch": 0.23, + "grad_norm": 13.649224281311035, + "learning_rate": 3.89937106918239e-07, + "logps/chosen": -33.580169677734375, + "logps/rejected": -30.676929473876953, + "loss": 0.7073, + "losses/dpo": 0.6862790584564209, + "losses/sft": 1.3503104448318481, + "losses/total": 0.6862790584564209, + "ref_logps/chosen": -30.99078941345215, + "ref_logps/rejected": -28.324487686157227, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2589379549026489, + "rewards/margins": -0.023694012314081192, + "rewards/rejected": -0.23524394631385803, + "step": 248 + }, + { + "epoch": 0.24, + "grad_norm": 14.472437858581543, + "learning_rate": 3.9150943396226415e-07, + "logps/chosen": -35.0002555847168, + "logps/rejected": -43.43840026855469, + "loss": 0.6854, + "losses/dpo": 0.7300747632980347, + "losses/sft": 1.217039704322815, + "losses/total": 0.7300747632980347, + "ref_logps/chosen": -31.94352912902832, + "ref_logps/rejected": -40.115211486816406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3056725859642029, + "rewards/margins": 0.0266465712338686, + "rewards/rejected": -0.33231914043426514, + "step": 249 + }, + { + "epoch": 0.24, + "grad_norm": 18.260669708251953, + "learning_rate": 3.9308176100628933e-07, + "logps/chosen": -48.50001525878906, + "logps/rejected": -54.786956787109375, + "loss": 0.6785, + "losses/dpo": 0.7375392317771912, + "losses/sft": 1.5034102201461792, + "losses/total": 0.7375392317771912, + "ref_logps/chosen": -45.462913513183594, + "ref_logps/rejected": -51.376548767089844, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.30370983481407166, + "rewards/margins": 0.037330590188503265, + "rewards/rejected": -0.3410404324531555, + "step": 250 + }, + { + "epoch": 0.24, + "grad_norm": 17.827125549316406, + "learning_rate": 3.9465408805031445e-07, + "logps/chosen": -43.956756591796875, + "logps/rejected": -53.212440490722656, + "loss": 0.7021, + "losses/dpo": 0.728506326675415, + "losses/sft": 1.50521719455719, + "losses/total": 0.728506326675415, + "ref_logps/chosen": -40.286964416503906, + "ref_logps/rejected": -49.678077697753906, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3669794201850891, + "rewards/margins": -0.01354297250509262, + "rewards/rejected": -0.3534364700317383, + "step": 251 + }, + { + "epoch": 0.24, + "grad_norm": 18.261459350585938, + "learning_rate": 3.9622641509433963e-07, + "logps/chosen": -48.157283782958984, + "logps/rejected": -55.378719329833984, + "loss": 0.6956, + "losses/dpo": 0.6914754509925842, + "losses/sft": 1.2750729322433472, + "losses/total": 0.6914754509925842, + "ref_logps/chosen": -44.9898567199707, + "ref_logps/rejected": -52.20886993408203, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.31674253940582275, + "rewards/margins": 0.00024258531630039215, + "rewards/rejected": -0.3169851005077362, + "step": 252 + }, + { + "epoch": 0.24, + "grad_norm": 17.766359329223633, + "learning_rate": 3.977987421383648e-07, + "logps/chosen": -52.382164001464844, + "logps/rejected": -55.79834747314453, + "loss": 0.6847, + "losses/dpo": 0.70313560962677, + "losses/sft": 1.7460511922836304, + "losses/total": 0.70313560962677, + "ref_logps/chosen": -48.68946075439453, + "ref_logps/rejected": -51.823795318603516, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3692704439163208, + "rewards/margins": 0.02818494290113449, + "rewards/rejected": -0.3974553644657135, + "step": 253 + }, + { + "epoch": 0.24, + "grad_norm": 13.266200065612793, + "learning_rate": 3.9937106918238993e-07, + "logps/chosen": -24.98733901977539, + "logps/rejected": -41.37847900390625, + "loss": 0.6617, + "losses/dpo": 0.6861698031425476, + "losses/sft": 1.0828763246536255, + "losses/total": 0.6861698031425476, + "ref_logps/chosen": -22.569034576416016, + "ref_logps/rejected": -38.25971984863281, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24183058738708496, + "rewards/margins": 0.07004562020301819, + "rewards/rejected": -0.31187620759010315, + "step": 254 + }, + { + "epoch": 0.24, + "grad_norm": 16.188703536987305, + "learning_rate": 4.009433962264151e-07, + "logps/chosen": -42.302978515625, + "logps/rejected": -50.74462890625, + "loss": 0.6724, + "losses/dpo": 0.7926273941993713, + "losses/sft": 1.7149397134780884, + "losses/total": 0.7926273941993713, + "ref_logps/chosen": -38.7346076965332, + "ref_logps/rejected": -46.588783264160156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3568369746208191, + "rewards/margins": 0.0587477833032608, + "rewards/rejected": -0.4155848026275635, + "step": 255 + }, + { + "epoch": 0.24, + "grad_norm": 15.104988098144531, + "learning_rate": 4.0251572327044023e-07, + "logps/chosen": -41.74165344238281, + "logps/rejected": -51.38977813720703, + "loss": 0.6435, + "losses/dpo": 0.6382113695144653, + "losses/sft": 1.4787259101867676, + "losses/total": 0.6382113695144653, + "ref_logps/chosen": -38.84921646118164, + "ref_logps/rejected": -47.40968322753906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28924357891082764, + "rewards/margins": 0.10876599699258804, + "rewards/rejected": -0.3980095684528351, + "step": 256 + }, + { + "epoch": 0.24, + "grad_norm": 14.39992618560791, + "learning_rate": 4.040880503144654e-07, + "logps/chosen": -34.94757080078125, + "logps/rejected": -42.57470703125, + "loss": 0.6857, + "losses/dpo": 0.7048845291137695, + "losses/sft": 1.443289041519165, + "losses/total": 0.7048845291137695, + "ref_logps/chosen": -31.906789779663086, + "ref_logps/rejected": -39.276458740234375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.304077684879303, + "rewards/margins": 0.025747492909431458, + "rewards/rejected": -0.32982516288757324, + "step": 257 + }, + { + "epoch": 0.24, + "grad_norm": 17.034503936767578, + "learning_rate": 4.056603773584906e-07, + "logps/chosen": -45.911746978759766, + "logps/rejected": -57.7626953125, + "loss": 0.6624, + "losses/dpo": 0.7345125675201416, + "losses/sft": 1.5799616575241089, + "losses/total": 0.7345125675201416, + "ref_logps/chosen": -42.90734100341797, + "ref_logps/rejected": -54.03882598876953, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3004402220249176, + "rewards/margins": 0.07194683700799942, + "rewards/rejected": -0.3723870813846588, + "step": 258 + }, + { + "epoch": 0.24, + "grad_norm": 14.38132381439209, + "learning_rate": 4.072327044025157e-07, + "logps/chosen": -30.72813606262207, + "logps/rejected": -42.7314567565918, + "loss": 0.6639, + "losses/dpo": 0.6390981078147888, + "losses/sft": 1.2096134424209595, + "losses/total": 0.6390981078147888, + "ref_logps/chosen": -28.13254165649414, + "ref_logps/rejected": -39.48625183105469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25955939292907715, + "rewards/margins": 0.06496097147464752, + "rewards/rejected": -0.3245203495025635, + "step": 259 + }, + { + "epoch": 0.25, + "grad_norm": 13.774007797241211, + "learning_rate": 4.088050314465409e-07, + "logps/chosen": -34.30670166015625, + "logps/rejected": -39.893516540527344, + "loss": 0.6894, + "losses/dpo": 0.6727636456489563, + "losses/sft": 1.287360429763794, + "losses/total": 0.6727636456489563, + "ref_logps/chosen": -31.98028564453125, + "ref_logps/rejected": -37.41706085205078, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23264184594154358, + "rewards/margins": 0.015003605745732784, + "rewards/rejected": -0.24764545261859894, + "step": 260 + }, + { + "epoch": 0.25, + "grad_norm": 15.514729499816895, + "learning_rate": 4.1037735849056606e-07, + "logps/chosen": -40.19240188598633, + "logps/rejected": -44.82511901855469, + "loss": 0.6775, + "losses/dpo": 0.6544908285140991, + "losses/sft": 1.601394534111023, + "losses/total": 0.6544908285140991, + "ref_logps/chosen": -37.0391845703125, + "ref_logps/rejected": -41.2575798034668, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31532156467437744, + "rewards/margins": 0.04143207520246506, + "rewards/rejected": -0.3567536473274231, + "step": 261 + }, + { + "epoch": 0.25, + "grad_norm": 17.19778060913086, + "learning_rate": 4.119496855345912e-07, + "logps/chosen": -40.04396057128906, + "logps/rejected": -54.078495025634766, + "loss": 0.6426, + "losses/dpo": 0.7065551280975342, + "losses/sft": 1.3404619693756104, + "losses/total": 0.7065551280975342, + "ref_logps/chosen": -37.421043395996094, + "ref_logps/rejected": -50.26447677612305, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2622915506362915, + "rewards/margins": 0.11911033093929291, + "rewards/rejected": -0.3814018964767456, + "step": 262 + }, + { + "epoch": 0.25, + "grad_norm": 14.092816352844238, + "learning_rate": 4.1352201257861636e-07, + "logps/chosen": -39.34889221191406, + "logps/rejected": -41.527000427246094, + "loss": 0.6621, + "losses/dpo": 0.6810022592544556, + "losses/sft": 1.7040151357650757, + "losses/total": 0.6810022592544556, + "ref_logps/chosen": -37.01990509033203, + "ref_logps/rejected": -38.464027404785156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23289905488491058, + "rewards/margins": 0.07339858263731003, + "rewards/rejected": -0.3062976598739624, + "step": 263 + }, + { + "epoch": 0.25, + "grad_norm": 17.33518409729004, + "learning_rate": 4.150943396226415e-07, + "logps/chosen": -50.4212532043457, + "logps/rejected": -47.210941314697266, + "loss": 0.7211, + "losses/dpo": 0.7034216523170471, + "losses/sft": 1.845947027206421, + "losses/total": 0.7034216523170471, + "ref_logps/chosen": -45.92155456542969, + "ref_logps/rejected": -43.09968566894531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4499703645706177, + "rewards/margins": -0.03884504735469818, + "rewards/rejected": -0.4111253023147583, + "step": 264 + }, + { + "epoch": 0.25, + "grad_norm": 15.276748657226562, + "learning_rate": 4.1666666666666667e-07, + "logps/chosen": -35.351654052734375, + "logps/rejected": -52.00558090209961, + "loss": 0.6312, + "losses/dpo": 0.7081926465034485, + "losses/sft": 1.2190580368041992, + "losses/total": 0.7081926465034485, + "ref_logps/chosen": -32.42749786376953, + "ref_logps/rejected": -47.676856994628906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2924157679080963, + "rewards/margins": 0.1404569447040558, + "rewards/rejected": -0.4328727126121521, + "step": 265 + }, + { + "epoch": 0.25, + "grad_norm": 17.348905563354492, + "learning_rate": 4.1823899371069184e-07, + "logps/chosen": -39.68585968017578, + "logps/rejected": -52.619834899902344, + "loss": 0.6579, + "losses/dpo": 0.614298939704895, + "losses/sft": 1.1146241426467896, + "losses/total": 0.614298939704895, + "ref_logps/chosen": -37.08367919921875, + "ref_logps/rejected": -49.21800231933594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26021838188171387, + "rewards/margins": 0.07996508479118347, + "rewards/rejected": -0.3401834964752197, + "step": 266 + }, + { + "epoch": 0.25, + "grad_norm": 15.063830375671387, + "learning_rate": 4.1981132075471697e-07, + "logps/chosen": -40.76624298095703, + "logps/rejected": -52.1614990234375, + "loss": 0.632, + "losses/dpo": 0.6369799375534058, + "losses/sft": 1.5126986503601074, + "losses/total": 0.6369799375534058, + "ref_logps/chosen": -38.486122131347656, + "ref_logps/rejected": -48.50868606567383, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22801154851913452, + "rewards/margins": 0.13726995885372162, + "rewards/rejected": -0.36528149247169495, + "step": 267 + }, + { + "epoch": 0.25, + "grad_norm": 15.28031063079834, + "learning_rate": 4.2138364779874214e-07, + "logps/chosen": -31.515867233276367, + "logps/rejected": -45.040252685546875, + "loss": 0.6305, + "losses/dpo": 0.623494029045105, + "losses/sft": 1.4976038932800293, + "losses/total": 0.623494029045105, + "ref_logps/chosen": -29.37891387939453, + "ref_logps/rejected": -41.54468536376953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21369558572769165, + "rewards/margins": 0.13586114346981049, + "rewards/rejected": -0.34955674409866333, + "step": 268 + }, + { + "epoch": 0.25, + "grad_norm": 19.609994888305664, + "learning_rate": 4.229559748427673e-07, + "logps/chosen": -46.337158203125, + "logps/rejected": -61.737632751464844, + "loss": 0.6453, + "losses/dpo": 0.7226425409317017, + "losses/sft": 1.4349713325500488, + "losses/total": 0.7226425409317017, + "ref_logps/chosen": -43.272056579589844, + "ref_logps/rejected": -57.550846099853516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3065102696418762, + "rewards/margins": 0.11216841638088226, + "rewards/rejected": -0.4186787009239197, + "step": 269 + }, + { + "epoch": 0.25, + "grad_norm": 15.402117729187012, + "learning_rate": 4.2452830188679244e-07, + "logps/chosen": -38.63148880004883, + "logps/rejected": -51.09016418457031, + "loss": 0.6697, + "losses/dpo": 0.668036699295044, + "losses/sft": 1.8736562728881836, + "losses/total": 0.668036699295044, + "ref_logps/chosen": -35.629493713378906, + "ref_logps/rejected": -47.55656814575195, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3001996576786041, + "rewards/margins": 0.05315982550382614, + "rewards/rejected": -0.35335949063301086, + "step": 270 + }, + { + "epoch": 0.26, + "grad_norm": 16.25444984436035, + "learning_rate": 4.261006289308176e-07, + "logps/chosen": -45.08386993408203, + "logps/rejected": -40.419898986816406, + "loss": 0.7001, + "losses/dpo": 0.6437594890594482, + "losses/sft": 1.251446008682251, + "losses/total": 0.6437594890594482, + "ref_logps/chosen": -41.53697967529297, + "ref_logps/rejected": -36.89950942993164, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35468873381614685, + "rewards/margins": -0.0026499181985855103, + "rewards/rejected": -0.35203880071640015, + "step": 271 + }, + { + "epoch": 0.26, + "grad_norm": 14.902688980102539, + "learning_rate": 4.2767295597484274e-07, + "logps/chosen": -28.657167434692383, + "logps/rejected": -48.08135223388672, + "loss": 0.6213, + "losses/dpo": 0.5799668431282043, + "losses/sft": 1.2232944965362549, + "losses/total": 0.5799668431282043, + "ref_logps/chosen": -26.796960830688477, + "ref_logps/rejected": -44.65399169921875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18602056801319122, + "rewards/margins": 0.1567152738571167, + "rewards/rejected": -0.3427358567714691, + "step": 272 + }, + { + "epoch": 0.26, + "grad_norm": 15.471696853637695, + "learning_rate": 4.292452830188679e-07, + "logps/chosen": -42.2933235168457, + "logps/rejected": -45.91864013671875, + "loss": 0.6648, + "losses/dpo": 0.6357539892196655, + "losses/sft": 1.821222186088562, + "losses/total": 0.6357539892196655, + "ref_logps/chosen": -39.12578582763672, + "ref_logps/rejected": -42.10855484008789, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3167540431022644, + "rewards/margins": 0.06425458937883377, + "rewards/rejected": -0.3810086250305176, + "step": 273 + }, + { + "epoch": 0.26, + "grad_norm": 15.276728630065918, + "learning_rate": 4.308176100628931e-07, + "logps/chosen": -34.20111846923828, + "logps/rejected": -56.20793151855469, + "loss": 0.5948, + "losses/dpo": 0.6406265497207642, + "losses/sft": 1.4846235513687134, + "losses/total": 0.6406265497207642, + "ref_logps/chosen": -31.65646743774414, + "ref_logps/rejected": -51.374366760253906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2544652223587036, + "rewards/margins": 0.22889135777950287, + "rewards/rejected": -0.4833565652370453, + "step": 274 + }, + { + "epoch": 0.26, + "grad_norm": 16.227323532104492, + "learning_rate": 4.323899371069182e-07, + "logps/chosen": -44.5732421875, + "logps/rejected": -54.36494445800781, + "loss": 0.6407, + "losses/dpo": 0.6271330118179321, + "losses/sft": 1.2677981853485107, + "losses/total": 0.6271330118179321, + "ref_logps/chosen": -41.26808547973633, + "ref_logps/rejected": -49.88677978515625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33051571249961853, + "rewards/margins": 0.11730123311281204, + "rewards/rejected": -0.44781696796417236, + "step": 275 + }, + { + "epoch": 0.26, + "grad_norm": 14.46500015258789, + "learning_rate": 4.339622641509434e-07, + "logps/chosen": -32.21154022216797, + "logps/rejected": -40.5195198059082, + "loss": 0.6569, + "losses/dpo": 0.657302737236023, + "losses/sft": 1.18999183177948, + "losses/total": 0.657302737236023, + "ref_logps/chosen": -29.70412826538086, + "ref_logps/rejected": -37.229637145996094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2507411539554596, + "rewards/margins": 0.07824680209159851, + "rewards/rejected": -0.3289879560470581, + "step": 276 + }, + { + "epoch": 0.26, + "grad_norm": 15.547643661499023, + "learning_rate": 4.355345911949685e-07, + "logps/chosen": -38.443050384521484, + "logps/rejected": -50.91439437866211, + "loss": 0.608, + "losses/dpo": 0.5969237685203552, + "losses/sft": 1.4887139797210693, + "losses/total": 0.5969237685203552, + "ref_logps/chosen": -35.83464431762695, + "ref_logps/rejected": -46.121028900146484, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2608405649662018, + "rewards/margins": 0.21849608421325684, + "rewards/rejected": -0.4793366491794586, + "step": 277 + }, + { + "epoch": 0.26, + "grad_norm": 15.859171867370605, + "learning_rate": 4.371069182389937e-07, + "logps/chosen": -45.779476165771484, + "logps/rejected": -41.80155944824219, + "loss": 0.6846, + "losses/dpo": 0.6578966379165649, + "losses/sft": 1.501111388206482, + "losses/total": 0.6578966379165649, + "ref_logps/chosen": -43.15447235107422, + "ref_logps/rejected": -38.86341857910156, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2625007927417755, + "rewards/margins": 0.031313199549913406, + "rewards/rejected": -0.2938140034675598, + "step": 278 + }, + { + "epoch": 0.26, + "grad_norm": 16.175655364990234, + "learning_rate": 4.386792452830189e-07, + "logps/chosen": -42.040733337402344, + "logps/rejected": -45.3043212890625, + "loss": 0.6454, + "losses/dpo": 0.6217872500419617, + "losses/sft": 1.1767728328704834, + "losses/total": 0.6217872500419617, + "ref_logps/chosen": -39.03452682495117, + "ref_logps/rejected": -41.25267791748047, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3006206452846527, + "rewards/margins": 0.10454416275024414, + "rewards/rejected": -0.40516480803489685, + "step": 279 + }, + { + "epoch": 0.26, + "grad_norm": 15.101045608520508, + "learning_rate": 4.40251572327044e-07, + "logps/chosen": -39.660789489746094, + "logps/rejected": -51.88922882080078, + "loss": 0.6431, + "losses/dpo": 0.673517107963562, + "losses/sft": 1.537069320678711, + "losses/total": 0.673517107963562, + "ref_logps/chosen": -37.26958465576172, + "ref_logps/rejected": -48.36371612548828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2391205132007599, + "rewards/margins": 0.11343112587928772, + "rewards/rejected": -0.3525516390800476, + "step": 280 + }, + { + "epoch": 0.27, + "grad_norm": 18.64141273498535, + "learning_rate": 4.418238993710692e-07, + "logps/chosen": -49.59341812133789, + "logps/rejected": -55.329978942871094, + "loss": 0.7113, + "losses/dpo": 0.7187132239341736, + "losses/sft": 1.7550328969955444, + "losses/total": 0.7187132239341736, + "ref_logps/chosen": -45.04663848876953, + "ref_logps/rejected": -51.04995346069336, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4546777307987213, + "rewards/margins": -0.026675017550587654, + "rewards/rejected": -0.4280027151107788, + "step": 281 + }, + { + "epoch": 0.27, + "grad_norm": 15.597990036010742, + "learning_rate": 4.4339622641509435e-07, + "logps/chosen": -39.32229995727539, + "logps/rejected": -57.20562744140625, + "loss": 0.5939, + "losses/dpo": 0.5364837646484375, + "losses/sft": 0.9851723313331604, + "losses/total": 0.5364837646484375, + "ref_logps/chosen": -35.8394775390625, + "ref_logps/rejected": -51.483863830566406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.348282128572464, + "rewards/margins": 0.22389397025108337, + "rewards/rejected": -0.5721760988235474, + "step": 282 + }, + { + "epoch": 0.27, + "grad_norm": 14.092550277709961, + "learning_rate": 4.449685534591195e-07, + "logps/chosen": -40.98548889160156, + "logps/rejected": -47.132606506347656, + "loss": 0.6047, + "losses/dpo": 0.6200289726257324, + "losses/sft": 1.5759824514389038, + "losses/total": 0.6200289726257324, + "ref_logps/chosen": -37.910160064697266, + "ref_logps/rejected": -41.932498931884766, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3075331151485443, + "rewards/margins": 0.21247738599777222, + "rewards/rejected": -0.5200105309486389, + "step": 283 + }, + { + "epoch": 0.27, + "grad_norm": 17.39341926574707, + "learning_rate": 4.4654088050314465e-07, + "logps/chosen": -41.997745513916016, + "logps/rejected": -46.48918151855469, + "loss": 0.6652, + "losses/dpo": 0.6076216697692871, + "losses/sft": 1.3608059883117676, + "losses/total": 0.6076216697692871, + "ref_logps/chosen": -39.23186492919922, + "ref_logps/rejected": -42.779693603515625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.276588499546051, + "rewards/margins": 0.09436051547527313, + "rewards/rejected": -0.37094900012016296, + "step": 284 + }, + { + "epoch": 0.27, + "grad_norm": 16.304155349731445, + "learning_rate": 4.481132075471698e-07, + "logps/chosen": -40.0236930847168, + "logps/rejected": -44.648338317871094, + "loss": 0.674, + "losses/dpo": 0.6296372413635254, + "losses/sft": 1.485594391822815, + "losses/total": 0.6296372413635254, + "ref_logps/chosen": -36.0826530456543, + "ref_logps/rejected": -40.076412200927734, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39410385489463806, + "rewards/margins": 0.06308881938457489, + "rewards/rejected": -0.45719265937805176, + "step": 285 + }, + { + "epoch": 0.27, + "grad_norm": 15.726293563842773, + "learning_rate": 4.4968553459119495e-07, + "logps/chosen": -39.88897705078125, + "logps/rejected": -43.01799392700195, + "loss": 0.684, + "losses/dpo": 0.6568284630775452, + "losses/sft": 1.9258939027786255, + "losses/total": 0.6568284630775452, + "ref_logps/chosen": -36.286231994628906, + "ref_logps/rejected": -39.07062911987305, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36027443408966064, + "rewards/margins": 0.03446207940578461, + "rewards/rejected": -0.39473646879196167, + "step": 286 + }, + { + "epoch": 0.27, + "grad_norm": 15.3701753616333, + "learning_rate": 4.5125786163522013e-07, + "logps/chosen": -35.64639663696289, + "logps/rejected": -52.39430236816406, + "loss": 0.6388, + "losses/dpo": 0.6017330288887024, + "losses/sft": 1.4482841491699219, + "losses/total": 0.6017330288887024, + "ref_logps/chosen": -32.51789093017578, + "ref_logps/rejected": -48.03721237182617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31285086274147034, + "rewards/margins": 0.12285785377025604, + "rewards/rejected": -0.4357087016105652, + "step": 287 + }, + { + "epoch": 0.27, + "grad_norm": 16.409990310668945, + "learning_rate": 4.5283018867924526e-07, + "logps/chosen": -40.90259552001953, + "logps/rejected": -34.40892791748047, + "loss": 0.7107, + "losses/dpo": 0.658980667591095, + "losses/sft": 1.2520112991333008, + "losses/total": 0.658980667591095, + "ref_logps/chosen": -37.784332275390625, + "ref_logps/rejected": -31.530216217041016, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31182605028152466, + "rewards/margins": -0.023954953998327255, + "rewards/rejected": -0.2878710925579071, + "step": 288 + }, + { + "epoch": 0.27, + "grad_norm": 16.037878036499023, + "learning_rate": 4.5440251572327043e-07, + "logps/chosen": -46.21577453613281, + "logps/rejected": -39.91151428222656, + "loss": 0.6883, + "losses/dpo": 0.6805859804153442, + "losses/sft": 1.5793442726135254, + "losses/total": 0.6805859804153442, + "ref_logps/chosen": -42.846458435058594, + "ref_logps/rejected": -36.31890106201172, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33693137764930725, + "rewards/margins": 0.02232995815575123, + "rewards/rejected": -0.35926133394241333, + "step": 289 + }, + { + "epoch": 0.27, + "grad_norm": 17.23445701599121, + "learning_rate": 4.559748427672956e-07, + "logps/chosen": -44.772178649902344, + "logps/rejected": -57.92347717285156, + "loss": 0.6689, + "losses/dpo": 0.5138028264045715, + "losses/sft": 1.6963366270065308, + "losses/total": 0.5138028264045715, + "ref_logps/chosen": -40.34746551513672, + "ref_logps/rejected": -52.718589782714844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.44247108697891235, + "rewards/margins": 0.07801740616559982, + "rewards/rejected": -0.5204885005950928, + "step": 290 + }, + { + "epoch": 0.27, + "grad_norm": 16.649431228637695, + "learning_rate": 4.5754716981132073e-07, + "logps/chosen": -43.871299743652344, + "logps/rejected": -52.043487548828125, + "loss": 0.6531, + "losses/dpo": 0.6663308143615723, + "losses/sft": 1.8109893798828125, + "losses/total": 0.6663308143615723, + "ref_logps/chosen": -39.87586975097656, + "ref_logps/rejected": -47.08435821533203, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39954280853271484, + "rewards/margins": 0.09637051820755005, + "rewards/rejected": -0.4959132671356201, + "step": 291 + }, + { + "epoch": 0.28, + "grad_norm": 15.538848876953125, + "learning_rate": 4.591194968553459e-07, + "logps/chosen": -40.258453369140625, + "logps/rejected": -43.68415069580078, + "loss": 0.6393, + "losses/dpo": 0.7158225774765015, + "losses/sft": 1.4569058418273926, + "losses/total": 0.7158225774765015, + "ref_logps/chosen": -37.33110427856445, + "ref_logps/rejected": -39.56146240234375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.29273444414138794, + "rewards/margins": 0.11953487992286682, + "rewards/rejected": -0.41226935386657715, + "step": 292 + }, + { + "epoch": 0.28, + "grad_norm": 14.913241386413574, + "learning_rate": 4.6069182389937103e-07, + "logps/chosen": -36.74955749511719, + "logps/rejected": -49.41448211669922, + "loss": 0.656, + "losses/dpo": 0.5618067979812622, + "losses/sft": 0.859252393245697, + "losses/total": 0.5618067979812622, + "ref_logps/chosen": -33.257293701171875, + "ref_logps/rejected": -44.97230911254883, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.34922653436660767, + "rewards/margins": 0.09499108791351318, + "rewards/rejected": -0.44421762228012085, + "step": 293 + }, + { + "epoch": 0.28, + "grad_norm": 13.554713249206543, + "learning_rate": 4.622641509433962e-07, + "logps/chosen": -37.023826599121094, + "logps/rejected": -33.654483795166016, + "loss": 0.6445, + "losses/dpo": 0.6806570887565613, + "losses/sft": 1.4043481349945068, + "losses/total": 0.6806570887565613, + "ref_logps/chosen": -34.33789825439453, + "ref_logps/rejected": -29.86209487915039, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2685929536819458, + "rewards/margins": 0.11064592003822327, + "rewards/rejected": -0.37923890352249146, + "step": 294 + }, + { + "epoch": 0.28, + "grad_norm": 17.03995704650879, + "learning_rate": 4.638364779874214e-07, + "logps/chosen": -43.4785041809082, + "logps/rejected": -44.47311782836914, + "loss": 0.6641, + "losses/dpo": 0.6484063267707825, + "losses/sft": 1.5728685855865479, + "losses/total": 0.6484063267707825, + "ref_logps/chosen": -39.81079864501953, + "ref_logps/rejected": -40.02204513549805, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3667706549167633, + "rewards/margins": 0.07833677530288696, + "rewards/rejected": -0.4451074004173279, + "step": 295 + }, + { + "epoch": 0.28, + "grad_norm": 15.064468383789062, + "learning_rate": 4.654088050314465e-07, + "logps/chosen": -35.53520202636719, + "logps/rejected": -44.49199676513672, + "loss": 0.6791, + "losses/dpo": 0.7307789325714111, + "losses/sft": 1.2095739841461182, + "losses/total": 0.7307789325714111, + "ref_logps/chosen": -31.74903678894043, + "ref_logps/rejected": -40.314491271972656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3786165714263916, + "rewards/margins": 0.039134155958890915, + "rewards/rejected": -0.4177507162094116, + "step": 296 + }, + { + "epoch": 0.28, + "grad_norm": 19.06511116027832, + "learning_rate": 4.669811320754717e-07, + "logps/chosen": -47.629703521728516, + "logps/rejected": -59.77693176269531, + "loss": 0.6795, + "losses/dpo": 0.6098648309707642, + "losses/sft": 0.9918664693832397, + "losses/total": 0.6098648309707642, + "ref_logps/chosen": -42.36842346191406, + "ref_logps/rejected": -53.96903610229492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5261276960372925, + "rewards/margins": 0.05466217175126076, + "rewards/rejected": -0.5807898640632629, + "step": 297 + }, + { + "epoch": 0.28, + "grad_norm": 17.147062301635742, + "learning_rate": 4.6855345911949686e-07, + "logps/chosen": -47.28531265258789, + "logps/rejected": -46.57255554199219, + "loss": 0.663, + "losses/dpo": 0.7391225695610046, + "losses/sft": 1.6611230373382568, + "losses/total": 0.7391225695610046, + "ref_logps/chosen": -43.058223724365234, + "ref_logps/rejected": -41.360260009765625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4227089285850525, + "rewards/margins": 0.09852064400911331, + "rewards/rejected": -0.5212295651435852, + "step": 298 + }, + { + "epoch": 0.28, + "grad_norm": 13.381155967712402, + "learning_rate": 4.70125786163522e-07, + "logps/chosen": -29.36941146850586, + "logps/rejected": -36.3071403503418, + "loss": 0.6522, + "losses/dpo": 0.5655578374862671, + "losses/sft": 1.265622854232788, + "losses/total": 0.5655578374862671, + "ref_logps/chosen": -25.40846824645996, + "ref_logps/rejected": -31.21912956237793, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39609450101852417, + "rewards/margins": 0.1127067506313324, + "rewards/rejected": -0.5088012218475342, + "step": 299 + }, + { + "epoch": 0.28, + "grad_norm": 14.62690544128418, + "learning_rate": 4.7169811320754717e-07, + "logps/chosen": -37.77018737792969, + "logps/rejected": -48.2723388671875, + "loss": 0.6441, + "losses/dpo": 0.6590859889984131, + "losses/sft": 1.4876482486724854, + "losses/total": 0.6590859889984131, + "ref_logps/chosen": -34.07875061035156, + "ref_logps/rejected": -43.16485595703125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.369143545627594, + "rewards/margins": 0.1416051685810089, + "rewards/rejected": -0.5107487440109253, + "step": 300 + }, + { + "epoch": 0.28, + "grad_norm": 18.411983489990234, + "learning_rate": 4.732704402515723e-07, + "logps/chosen": -46.09126281738281, + "logps/rejected": -44.139404296875, + "loss": 0.6791, + "losses/dpo": 0.6460752487182617, + "losses/sft": 1.3547742366790771, + "losses/total": 0.6460752487182617, + "ref_logps/chosen": -41.491634368896484, + "ref_logps/rejected": -38.95182418823242, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.45996329188346863, + "rewards/margins": 0.058794502168893814, + "rewards/rejected": -0.5187578201293945, + "step": 301 + }, + { + "epoch": 0.29, + "grad_norm": 18.769363403320312, + "learning_rate": 4.7484276729559747e-07, + "logps/chosen": -44.56507873535156, + "logps/rejected": -48.06944274902344, + "loss": 0.7326, + "losses/dpo": 0.6647197008132935, + "losses/sft": 1.8934367895126343, + "losses/total": 0.6647197008132935, + "ref_logps/chosen": -39.720436096191406, + "ref_logps/rejected": -43.856685638427734, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4844643771648407, + "rewards/margins": -0.0631885826587677, + "rewards/rejected": -0.4212757647037506, + "step": 302 + }, + { + "epoch": 0.29, + "grad_norm": 13.678750038146973, + "learning_rate": 4.7641509433962264e-07, + "logps/chosen": -29.137710571289062, + "logps/rejected": -34.092384338378906, + "loss": 0.6465, + "losses/dpo": 0.7419373989105225, + "losses/sft": 1.431159496307373, + "losses/total": 0.7419373989105225, + "ref_logps/chosen": -26.39360809326172, + "ref_logps/rejected": -30.094730377197266, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2744103968143463, + "rewards/margins": 0.12535496056079865, + "rewards/rejected": -0.39976537227630615, + "step": 303 + }, + { + "epoch": 0.29, + "grad_norm": 16.425840377807617, + "learning_rate": 4.779874213836478e-07, + "logps/chosen": -38.671836853027344, + "logps/rejected": -46.90407943725586, + "loss": 0.6697, + "losses/dpo": 0.6034525036811829, + "losses/sft": 1.153171420097351, + "losses/total": 0.6034525036811829, + "ref_logps/chosen": -35.337196350097656, + "ref_logps/rejected": -42.98271942138672, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3334636688232422, + "rewards/margins": 0.058672383427619934, + "rewards/rejected": -0.3921360373497009, + "step": 304 + }, + { + "epoch": 0.29, + "grad_norm": 15.527347564697266, + "learning_rate": 4.795597484276729e-07, + "logps/chosen": -43.831520080566406, + "logps/rejected": -53.383304595947266, + "loss": 0.5983, + "losses/dpo": 0.6423917412757874, + "losses/sft": 1.5524685382843018, + "losses/total": 0.6423917412757874, + "ref_logps/chosen": -40.01023864746094, + "ref_logps/rejected": -47.300376892089844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38212764263153076, + "rewards/margins": 0.22616487741470337, + "rewards/rejected": -0.6082925200462341, + "step": 305 + }, + { + "epoch": 0.29, + "grad_norm": 15.913620948791504, + "learning_rate": 4.811320754716981e-07, + "logps/chosen": -36.82072448730469, + "logps/rejected": -43.5571174621582, + "loss": 0.6416, + "losses/dpo": 0.5956966280937195, + "losses/sft": 1.20928156375885, + "losses/total": 0.5956966280937195, + "ref_logps/chosen": -34.06861877441406, + "ref_logps/rejected": -39.47810363769531, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2752102315425873, + "rewards/margins": 0.1326913982629776, + "rewards/rejected": -0.4079016447067261, + "step": 306 + }, + { + "epoch": 0.29, + "grad_norm": 15.387386322021484, + "learning_rate": 4.827044025157232e-07, + "logps/chosen": -35.70310974121094, + "logps/rejected": -40.58681106567383, + "loss": 0.6603, + "losses/dpo": 0.5975790619850159, + "losses/sft": 1.0019665956497192, + "losses/total": 0.5975790619850159, + "ref_logps/chosen": -32.418556213378906, + "ref_logps/rejected": -36.39390182495117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.32845550775527954, + "rewards/margins": 0.09083528816699982, + "rewards/rejected": -0.41929078102111816, + "step": 307 + }, + { + "epoch": 0.29, + "grad_norm": 17.168498992919922, + "learning_rate": 4.842767295597484e-07, + "logps/chosen": -37.426513671875, + "logps/rejected": -38.88938903808594, + "loss": 0.7197, + "losses/dpo": 0.7147113084793091, + "losses/sft": 1.8871713876724243, + "losses/total": 0.7147113084793091, + "ref_logps/chosen": -33.53988265991211, + "ref_logps/rejected": -35.286476135253906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3886632025241852, + "rewards/margins": -0.028372403234243393, + "rewards/rejected": -0.3602908253669739, + "step": 308 + }, + { + "epoch": 0.29, + "grad_norm": 15.583175659179688, + "learning_rate": 4.858490566037736e-07, + "logps/chosen": -38.89422607421875, + "logps/rejected": -37.618263244628906, + "loss": 0.7198, + "losses/dpo": 0.7783017158508301, + "losses/sft": 1.1059563159942627, + "losses/total": 0.7783017158508301, + "ref_logps/chosen": -33.88743591308594, + "ref_logps/rejected": -33.0002326965332, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5006794333457947, + "rewards/margins": -0.03887690603733063, + "rewards/rejected": -0.46180254220962524, + "step": 309 + }, + { + "epoch": 0.29, + "grad_norm": 18.440391540527344, + "learning_rate": 4.874213836477988e-07, + "logps/chosen": -42.746482849121094, + "logps/rejected": -54.17639923095703, + "loss": 0.7278, + "losses/dpo": 0.7887463569641113, + "losses/sft": 1.51253342628479, + "losses/total": 0.7887463569641113, + "ref_logps/chosen": -37.78423309326172, + "ref_logps/rejected": -49.63832092285156, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.49622464179992676, + "rewards/margins": -0.04241668060421944, + "rewards/rejected": -0.4538079500198364, + "step": 310 + }, + { + "epoch": 0.29, + "grad_norm": 13.605622291564941, + "learning_rate": 4.889937106918238e-07, + "logps/chosen": -31.56808090209961, + "logps/rejected": -46.14800262451172, + "loss": 0.5595, + "losses/dpo": 0.5758316516876221, + "losses/sft": 1.1419419050216675, + "losses/total": 0.5758316516876221, + "ref_logps/chosen": -28.66701889038086, + "ref_logps/rejected": -40.081214904785156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29010629653930664, + "rewards/margins": 0.316572904586792, + "rewards/rejected": -0.6066792011260986, + "step": 311 + }, + { + "epoch": 0.29, + "grad_norm": 13.29858112335205, + "learning_rate": 4.90566037735849e-07, + "logps/chosen": -33.10694885253906, + "logps/rejected": -40.835933685302734, + "loss": 0.6434, + "losses/dpo": 0.6015459299087524, + "losses/sft": 1.8418209552764893, + "losses/total": 0.6015459299087524, + "ref_logps/chosen": -30.316028594970703, + "ref_logps/rejected": -36.822509765625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2790922522544861, + "rewards/margins": 0.12225019186735153, + "rewards/rejected": -0.4013424217700958, + "step": 312 + }, + { + "epoch": 0.3, + "grad_norm": 15.46154499053955, + "learning_rate": 4.921383647798742e-07, + "logps/chosen": -45.18656539916992, + "logps/rejected": -46.45176696777344, + "loss": 0.641, + "losses/dpo": 0.6185716986656189, + "losses/sft": 1.4683407545089722, + "losses/total": 0.6185716986656189, + "ref_logps/chosen": -40.97571563720703, + "ref_logps/rejected": -41.00298309326172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4210848808288574, + "rewards/margins": 0.1237938180565834, + "rewards/rejected": -0.5448787212371826, + "step": 313 + }, + { + "epoch": 0.3, + "grad_norm": 19.87638282775879, + "learning_rate": 4.937106918238994e-07, + "logps/chosen": -50.0240478515625, + "logps/rejected": -55.17597198486328, + "loss": 0.7361, + "losses/dpo": 0.7950786352157593, + "losses/sft": 1.617371916770935, + "losses/total": 0.7950786352157593, + "ref_logps/chosen": -44.95348358154297, + "ref_logps/rejected": -50.52076721191406, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.507056713104248, + "rewards/margins": -0.041536275297403336, + "rewards/rejected": -0.465520441532135, + "step": 314 + }, + { + "epoch": 0.3, + "grad_norm": 15.329681396484375, + "learning_rate": 4.952830188679246e-07, + "logps/chosen": -42.27714920043945, + "logps/rejected": -46.185882568359375, + "loss": 0.6733, + "losses/dpo": 0.6167937517166138, + "losses/sft": 1.1877564191818237, + "losses/total": 0.6167937517166138, + "ref_logps/chosen": -38.369590759277344, + "ref_logps/rejected": -41.6512336730957, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.39075565338134766, + "rewards/margins": 0.06270917505025864, + "rewards/rejected": -0.4534648358821869, + "step": 315 + }, + { + "epoch": 0.3, + "grad_norm": 13.328225135803223, + "learning_rate": 4.968553459119496e-07, + "logps/chosen": -29.752016067504883, + "logps/rejected": -41.991233825683594, + "loss": 0.6141, + "losses/dpo": 0.6650824546813965, + "losses/sft": 1.4727708101272583, + "losses/total": 0.6650824546813965, + "ref_logps/chosen": -27.185718536376953, + "ref_logps/rejected": -37.60988235473633, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2566297650337219, + "rewards/margins": 0.18150556087493896, + "rewards/rejected": -0.4381353557109833, + "step": 316 + }, + { + "epoch": 0.3, + "grad_norm": 13.903385162353516, + "learning_rate": 4.984276729559748e-07, + "logps/chosen": -28.24636459350586, + "logps/rejected": -53.85102081298828, + "loss": 0.5736, + "losses/dpo": 0.544606626033783, + "losses/sft": 1.4416906833648682, + "losses/total": 0.544606626033783, + "ref_logps/chosen": -25.089126586914062, + "ref_logps/rejected": -47.75431823730469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.31572383642196655, + "rewards/margins": 0.29394596815109253, + "rewards/rejected": -0.6096698045730591, + "step": 317 + }, + { + "epoch": 0.3, + "grad_norm": 16.531278610229492, + "learning_rate": 5e-07, + "logps/chosen": -46.63761520385742, + "logps/rejected": -52.220245361328125, + "loss": 0.6263, + "losses/dpo": 0.5461878180503845, + "losses/sft": 1.6149392127990723, + "losses/total": 0.5461878180503845, + "ref_logps/chosen": -42.58404541015625, + "ref_logps/rejected": -46.41576385498047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.40535682439804077, + "rewards/margins": 0.17509107291698456, + "rewards/rejected": -0.5804479122161865, + "step": 318 + }, + { + "epoch": 0.3, + "grad_norm": 16.40178871154785, + "learning_rate": 4.998251136761105e-07, + "logps/chosen": -35.51566696166992, + "logps/rejected": -49.572017669677734, + "loss": 0.6223, + "losses/dpo": 0.6416157484054565, + "losses/sft": 1.3145924806594849, + "losses/total": 0.6416157484054565, + "ref_logps/chosen": -32.12255859375, + "ref_logps/rejected": -44.47797393798828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3393109142780304, + "rewards/margins": 0.1700935810804367, + "rewards/rejected": -0.5094045400619507, + "step": 319 + }, + { + "epoch": 0.3, + "grad_norm": 19.217369079589844, + "learning_rate": 4.99650227352221e-07, + "logps/chosen": -43.56651306152344, + "logps/rejected": -43.700965881347656, + "loss": 0.7009, + "losses/dpo": 0.7487107515335083, + "losses/sft": 1.6572529077529907, + "losses/total": 0.7487107515335083, + "ref_logps/chosen": -39.02032470703125, + "ref_logps/rejected": -38.94599151611328, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.45461931824684143, + "rewards/margins": 0.020878277719020844, + "rewards/rejected": -0.4754975736141205, + "step": 320 + }, + { + "epoch": 0.3, + "grad_norm": 16.932418823242188, + "learning_rate": 4.994753410283315e-07, + "logps/chosen": -45.408233642578125, + "logps/rejected": -43.65406799316406, + "loss": 0.618, + "losses/dpo": 0.6029354333877563, + "losses/sft": 1.5400285720825195, + "losses/total": 0.6029354333877563, + "ref_logps/chosen": -41.826934814453125, + "ref_logps/rejected": -38.315242767333984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3581298291683197, + "rewards/margins": 0.1757524013519287, + "rewards/rejected": -0.5338822603225708, + "step": 321 + }, + { + "epoch": 0.3, + "grad_norm": 18.416934967041016, + "learning_rate": 4.993004547044421e-07, + "logps/chosen": -39.20286560058594, + "logps/rejected": -55.60207748413086, + "loss": 0.673, + "losses/dpo": 0.6724597215652466, + "losses/sft": 1.8351119756698608, + "losses/total": 0.6724597215652466, + "ref_logps/chosen": -34.607269287109375, + "ref_logps/rejected": -50.36186218261719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.45955926179885864, + "rewards/margins": 0.06446242332458496, + "rewards/rejected": -0.5240216851234436, + "step": 322 + }, + { + "epoch": 0.31, + "grad_norm": 14.532320022583008, + "learning_rate": 4.991255683805526e-07, + "logps/chosen": -38.145042419433594, + "logps/rejected": -46.594970703125, + "loss": 0.6046, + "losses/dpo": 0.5145504474639893, + "losses/sft": 1.2098132371902466, + "losses/total": 0.5145504474639893, + "ref_logps/chosen": -34.41341781616211, + "ref_logps/rejected": -40.75916290283203, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.37316226959228516, + "rewards/margins": 0.21041880548000336, + "rewards/rejected": -0.5835810899734497, + "step": 323 + }, + { + "epoch": 0.31, + "grad_norm": 18.384719848632812, + "learning_rate": 4.989506820566632e-07, + "logps/chosen": -53.385643005371094, + "logps/rejected": -59.70347595214844, + "loss": 0.6883, + "losses/dpo": 0.7504025101661682, + "losses/sft": 1.5278271436691284, + "losses/total": 0.7504025101661682, + "ref_logps/chosen": -47.14939880371094, + "ref_logps/rejected": -52.856712341308594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6236246824264526, + "rewards/margins": 0.061051853001117706, + "rewards/rejected": -0.6846765279769897, + "step": 324 + }, + { + "epoch": 0.31, + "grad_norm": 19.458702087402344, + "learning_rate": 4.987757957327737e-07, + "logps/chosen": -51.701927185058594, + "logps/rejected": -49.56433868408203, + "loss": 0.6573, + "losses/dpo": 0.6533963680267334, + "losses/sft": 1.3683087825775146, + "losses/total": 0.6533963680267334, + "ref_logps/chosen": -47.254798889160156, + "ref_logps/rejected": -44.163543701171875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.444713294506073, + "rewards/margins": 0.09536644071340561, + "rewards/rejected": -0.5400797128677368, + "step": 325 + }, + { + "epoch": 0.31, + "grad_norm": 16.315893173217773, + "learning_rate": 4.986009094088842e-07, + "logps/chosen": -44.640541076660156, + "logps/rejected": -46.98735427856445, + "loss": 0.6286, + "losses/dpo": 0.5390735268592834, + "losses/sft": 1.1266926527023315, + "losses/total": 0.5390735268592834, + "ref_logps/chosen": -40.5927619934082, + "ref_logps/rejected": -41.14649963378906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4047783613204956, + "rewards/margins": 0.1793069839477539, + "rewards/rejected": -0.5840853452682495, + "step": 326 + }, + { + "epoch": 0.31, + "grad_norm": 15.500914573669434, + "learning_rate": 4.984260230849947e-07, + "logps/chosen": -45.44721984863281, + "logps/rejected": -47.130775451660156, + "loss": 0.6625, + "losses/dpo": 0.6081020832061768, + "losses/sft": 1.2721569538116455, + "losses/total": 0.6081020832061768, + "ref_logps/chosen": -40.59376907348633, + "ref_logps/rejected": -41.537742614746094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4853447675704956, + "rewards/margins": 0.07395829260349274, + "rewards/rejected": -0.5593030452728271, + "step": 327 + }, + { + "epoch": 0.31, + "grad_norm": 17.627470016479492, + "learning_rate": 4.982511367611052e-07, + "logps/chosen": -47.881874084472656, + "logps/rejected": -57.78369140625, + "loss": 0.6523, + "losses/dpo": 0.5345410108566284, + "losses/sft": 1.894019365310669, + "losses/total": 0.5345410108566284, + "ref_logps/chosen": -42.40503692626953, + "ref_logps/rejected": -51.038658142089844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5476834774017334, + "rewards/margins": 0.12682020664215088, + "rewards/rejected": -0.6745036840438843, + "step": 328 + }, + { + "epoch": 0.31, + "grad_norm": 16.869220733642578, + "learning_rate": 4.980762504372158e-07, + "logps/chosen": -46.1259765625, + "logps/rejected": -49.09648132324219, + "loss": 0.6417, + "losses/dpo": 0.6740195751190186, + "losses/sft": 1.8484153747558594, + "losses/total": 0.6740195751190186, + "ref_logps/chosen": -41.715606689453125, + "ref_logps/rejected": -43.21916580200195, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.441037118434906, + "rewards/margins": 0.14669471979141235, + "rewards/rejected": -0.5877318382263184, + "step": 329 + }, + { + "epoch": 0.31, + "grad_norm": 16.697179794311523, + "learning_rate": 4.979013641133263e-07, + "logps/chosen": -44.72831344604492, + "logps/rejected": -45.09574508666992, + "loss": 0.6805, + "losses/dpo": 0.6937955617904663, + "losses/sft": 1.5024570226669312, + "losses/total": 0.6937955617904663, + "ref_logps/chosen": -39.872230529785156, + "ref_logps/rejected": -39.67607879638672, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.48560816049575806, + "rewards/margins": 0.05635836347937584, + "rewards/rejected": -0.5419665575027466, + "step": 330 + }, + { + "epoch": 0.31, + "grad_norm": 14.972700119018555, + "learning_rate": 4.977264777894369e-07, + "logps/chosen": -37.015037536621094, + "logps/rejected": -52.2890625, + "loss": 0.6358, + "losses/dpo": 0.597869873046875, + "losses/sft": 1.2581983804702759, + "losses/total": 0.597869873046875, + "ref_logps/chosen": -32.54080581665039, + "ref_logps/rejected": -46.212852478027344, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4474233090877533, + "rewards/margins": 0.1601974070072174, + "rewards/rejected": -0.6076207160949707, + "step": 331 + }, + { + "epoch": 0.31, + "grad_norm": 18.237545013427734, + "learning_rate": 4.975515914655474e-07, + "logps/chosen": -42.3123779296875, + "logps/rejected": -54.46826171875, + "loss": 0.6737, + "losses/dpo": 0.7524250745773315, + "losses/sft": 1.1088095903396606, + "losses/total": 0.7524250745773315, + "ref_logps/chosen": -36.97213363647461, + "ref_logps/rejected": -48.25359344482422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5340246558189392, + "rewards/margins": 0.08744189143180847, + "rewards/rejected": -0.6214665174484253, + "step": 332 + }, + { + "epoch": 0.31, + "grad_norm": 15.017657279968262, + "learning_rate": 4.973767051416579e-07, + "logps/chosen": -42.169715881347656, + "logps/rejected": -40.91410827636719, + "loss": 0.6542, + "losses/dpo": 0.7065522074699402, + "losses/sft": 1.538163423538208, + "losses/total": 0.7065522074699402, + "ref_logps/chosen": -37.17097854614258, + "ref_logps/rejected": -35.0113525390625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49987372756004333, + "rewards/margins": 0.09040142595767975, + "rewards/rejected": -0.5902751684188843, + "step": 333 + }, + { + "epoch": 0.32, + "grad_norm": 15.769179344177246, + "learning_rate": 4.972018188177684e-07, + "logps/chosen": -37.79359817504883, + "logps/rejected": -43.023658752441406, + "loss": 0.6463, + "losses/dpo": 0.6829018592834473, + "losses/sft": 1.1957991123199463, + "losses/total": 0.6829018592834473, + "ref_logps/chosen": -34.46954345703125, + "ref_logps/rejected": -38.55085754394531, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.33240556716918945, + "rewards/margins": 0.11487418413162231, + "rewards/rejected": -0.44727975130081177, + "step": 334 + }, + { + "epoch": 0.32, + "grad_norm": 19.166316986083984, + "learning_rate": 4.970269324938789e-07, + "logps/chosen": -43.17356491088867, + "logps/rejected": -43.072242736816406, + "loss": 0.6281, + "losses/dpo": 0.45466917753219604, + "losses/sft": 1.2970844507217407, + "losses/total": 0.45466917753219604, + "ref_logps/chosen": -39.01023864746094, + "ref_logps/rejected": -37.182247161865234, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.41633301973342896, + "rewards/margins": 0.1726670116186142, + "rewards/rejected": -0.5889999866485596, + "step": 335 + }, + { + "epoch": 0.32, + "grad_norm": 17.412860870361328, + "learning_rate": 4.968520461699895e-07, + "logps/chosen": -37.92523193359375, + "logps/rejected": -52.40209197998047, + "loss": 0.5764, + "losses/dpo": 0.5267322063446045, + "losses/sft": 1.3428879976272583, + "losses/total": 0.5267322063446045, + "ref_logps/chosen": -34.152950286865234, + "ref_logps/rejected": -45.85675048828125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3772285580635071, + "rewards/margins": 0.27730560302734375, + "rewards/rejected": -0.654534101486206, + "step": 336 + }, + { + "epoch": 0.32, + "grad_norm": 16.409772872924805, + "learning_rate": 4.966771598461e-07, + "logps/chosen": -48.05484390258789, + "logps/rejected": -48.352081298828125, + "loss": 0.583, + "losses/dpo": 0.5335006713867188, + "losses/sft": 1.2728737592697144, + "losses/total": 0.5335006713867188, + "ref_logps/chosen": -43.75884246826172, + "ref_logps/rejected": -41.4486083984375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4296002984046936, + "rewards/margins": 0.2607470452785492, + "rewards/rejected": -0.6903473138809204, + "step": 337 + }, + { + "epoch": 0.32, + "grad_norm": 17.714736938476562, + "learning_rate": 4.965022735222105e-07, + "logps/chosen": -43.32847213745117, + "logps/rejected": -53.19118881225586, + "loss": 0.6944, + "losses/dpo": 0.68752521276474, + "losses/sft": 1.2535369396209717, + "losses/total": 0.68752521276474, + "ref_logps/chosen": -37.737205505371094, + "ref_logps/rejected": -47.28385925292969, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5591270923614502, + "rewards/margins": 0.031605955213308334, + "rewards/rejected": -0.5907330513000488, + "step": 338 + }, + { + "epoch": 0.32, + "grad_norm": 18.90288543701172, + "learning_rate": 4.963273871983211e-07, + "logps/chosen": -43.99187469482422, + "logps/rejected": -59.82645034790039, + "loss": 0.6313, + "losses/dpo": 0.6904228329658508, + "losses/sft": 1.516309380531311, + "losses/total": 0.6904228329658508, + "ref_logps/chosen": -38.480098724365234, + "ref_logps/rejected": -52.933963775634766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5511775016784668, + "rewards/margins": 0.1380712389945984, + "rewards/rejected": -0.6892487406730652, + "step": 339 + }, + { + "epoch": 0.32, + "grad_norm": 14.638052940368652, + "learning_rate": 4.961525008744316e-07, + "logps/chosen": -32.529903411865234, + "logps/rejected": -44.50151443481445, + "loss": 0.6092, + "losses/dpo": 0.4741971492767334, + "losses/sft": 1.4373074769973755, + "losses/total": 0.4741971492767334, + "ref_logps/chosen": -28.500930786132812, + "ref_logps/rejected": -38.405235290527344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4028974771499634, + "rewards/margins": 0.20673055946826935, + "rewards/rejected": -0.6096280813217163, + "step": 340 + }, + { + "epoch": 0.32, + "grad_norm": 17.94290542602539, + "learning_rate": 4.959776145505421e-07, + "logps/chosen": -54.526126861572266, + "logps/rejected": -56.8326530456543, + "loss": 0.6461, + "losses/dpo": 0.7271311283111572, + "losses/sft": 1.667887568473816, + "losses/total": 0.7271311283111572, + "ref_logps/chosen": -48.099334716796875, + "ref_logps/rejected": -48.829864501953125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6426790952682495, + "rewards/margins": 0.15759959816932678, + "rewards/rejected": -0.8002787232398987, + "step": 341 + }, + { + "epoch": 0.32, + "grad_norm": 16.220985412597656, + "learning_rate": 4.958027282266526e-07, + "logps/chosen": -46.70875549316406, + "logps/rejected": -44.699867248535156, + "loss": 0.6325, + "losses/dpo": 0.6219876408576965, + "losses/sft": 1.7357341051101685, + "losses/total": 0.6219876408576965, + "ref_logps/chosen": -41.75714874267578, + "ref_logps/rejected": -38.12370300292969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.49516016244888306, + "rewards/margins": 0.16245661675930023, + "rewards/rejected": -0.6576167345046997, + "step": 342 + }, + { + "epoch": 0.32, + "grad_norm": 20.682252883911133, + "learning_rate": 4.956278419027632e-07, + "logps/chosen": -49.13526916503906, + "logps/rejected": -47.93041229248047, + "loss": 0.7831, + "losses/dpo": 0.764042854309082, + "losses/sft": 1.4871896505355835, + "losses/total": 0.764042854309082, + "ref_logps/chosen": -42.53851318359375, + "ref_logps/rejected": -42.67013168334961, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6596754789352417, + "rewards/margins": -0.1336475908756256, + "rewards/rejected": -0.5260279178619385, + "step": 343 + }, + { + "epoch": 0.32, + "grad_norm": 17.956283569335938, + "learning_rate": 4.954529555788737e-07, + "logps/chosen": -48.28112030029297, + "logps/rejected": -59.63841247558594, + "loss": 0.6415, + "losses/dpo": 0.6288716793060303, + "losses/sft": 1.1391419172286987, + "losses/total": 0.6288716793060303, + "ref_logps/chosen": -43.74011993408203, + "ref_logps/rejected": -53.666351318359375, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.45410019159317017, + "rewards/margins": 0.14310592412948608, + "rewards/rejected": -0.5972061157226562, + "step": 344 + }, + { + "epoch": 0.33, + "grad_norm": 14.91905403137207, + "learning_rate": 4.952780692549842e-07, + "logps/chosen": -42.038238525390625, + "logps/rejected": -54.24596405029297, + "loss": 0.5755, + "losses/dpo": 0.759334921836853, + "losses/sft": 1.369367003440857, + "losses/total": 0.759334921836853, + "ref_logps/chosen": -38.41636276245117, + "ref_logps/rejected": -47.42622375488281, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.36218756437301636, + "rewards/margins": 0.3197862505912781, + "rewards/rejected": -0.6819738149642944, + "step": 345 + }, + { + "epoch": 0.33, + "grad_norm": 15.398578643798828, + "learning_rate": 4.951031829310948e-07, + "logps/chosen": -40.799774169921875, + "logps/rejected": -41.05747985839844, + "loss": 0.6483, + "losses/dpo": 0.6527115702629089, + "losses/sft": 1.4417846202850342, + "losses/total": 0.6527115702629089, + "ref_logps/chosen": -35.516326904296875, + "ref_logps/rejected": -34.40104675292969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5283448696136475, + "rewards/margins": 0.13729813694953918, + "rewards/rejected": -0.665643036365509, + "step": 346 + }, + { + "epoch": 0.33, + "grad_norm": 22.32353973388672, + "learning_rate": 4.949282966072053e-07, + "logps/chosen": -62.92329406738281, + "logps/rejected": -56.66679382324219, + "loss": 0.704, + "losses/dpo": 0.982194185256958, + "losses/sft": 1.8895689249038696, + "losses/total": 0.982194185256958, + "ref_logps/chosen": -56.06339645385742, + "ref_logps/rejected": -49.39789581298828, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6859898567199707, + "rewards/margins": 0.04089978337287903, + "rewards/rejected": -0.7268896102905273, + "step": 347 + }, + { + "epoch": 0.33, + "grad_norm": 15.723599433898926, + "learning_rate": 4.947534102833158e-07, + "logps/chosen": -35.60626220703125, + "logps/rejected": -35.96453857421875, + "loss": 0.6729, + "losses/dpo": 0.6972857713699341, + "losses/sft": 1.229810357093811, + "losses/total": 0.6972857713699341, + "ref_logps/chosen": -31.716310501098633, + "ref_logps/rejected": -31.524150848388672, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38899534940719604, + "rewards/margins": 0.055043164640665054, + "rewards/rejected": -0.4440385103225708, + "step": 348 + }, + { + "epoch": 0.33, + "grad_norm": 17.474153518676758, + "learning_rate": 4.945785239594263e-07, + "logps/chosen": -42.06867980957031, + "logps/rejected": -32.13497543334961, + "loss": 0.749, + "losses/dpo": 0.7319649457931519, + "losses/sft": 1.6740304231643677, + "losses/total": 0.7319649457931519, + "ref_logps/chosen": -36.58716583251953, + "ref_logps/rejected": -27.30141830444336, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.548151433467865, + "rewards/margins": -0.06479551643133163, + "rewards/rejected": -0.48335593938827515, + "step": 349 + }, + { + "epoch": 0.33, + "grad_norm": 16.306217193603516, + "learning_rate": 4.944036376355369e-07, + "logps/chosen": -36.958343505859375, + "logps/rejected": -53.405094146728516, + "loss": 0.6313, + "losses/dpo": 0.593835175037384, + "losses/sft": 1.8788634538650513, + "losses/total": 0.593835175037384, + "ref_logps/chosen": -32.361385345458984, + "ref_logps/rejected": -47.33667755126953, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4596959352493286, + "rewards/margins": 0.14714574813842773, + "rewards/rejected": -0.6068416833877563, + "step": 350 + }, + { + "epoch": 0.33, + "grad_norm": 16.265230178833008, + "learning_rate": 4.942287513116474e-07, + "logps/chosen": -43.46842575073242, + "logps/rejected": -49.64971160888672, + "loss": 0.6093, + "losses/dpo": 0.5821921229362488, + "losses/sft": 1.5536878108978271, + "losses/total": 0.5821921229362488, + "ref_logps/chosen": -39.623558044433594, + "ref_logps/rejected": -43.57435607910156, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3844863176345825, + "rewards/margins": 0.22304952144622803, + "rewards/rejected": -0.6075358390808105, + "step": 351 + }, + { + "epoch": 0.33, + "grad_norm": 20.127544403076172, + "learning_rate": 4.940538649877579e-07, + "logps/chosen": -46.149234771728516, + "logps/rejected": -53.44261932373047, + "loss": 0.7208, + "losses/dpo": 0.4625735580921173, + "losses/sft": 1.6604089736938477, + "losses/total": 0.4625735580921173, + "ref_logps/chosen": -39.44435501098633, + "ref_logps/rejected": -46.366180419921875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6704879999160767, + "rewards/margins": 0.03715598210692406, + "rewards/rejected": -0.707643985748291, + "step": 352 + }, + { + "epoch": 0.33, + "grad_norm": 15.963071823120117, + "learning_rate": 4.938789786638684e-07, + "logps/chosen": -36.14772033691406, + "logps/rejected": -47.00101089477539, + "loss": 0.6377, + "losses/dpo": 0.5668531656265259, + "losses/sft": 1.6079418659210205, + "losses/total": 0.5668531656265259, + "ref_logps/chosen": -31.208240509033203, + "ref_logps/rejected": -40.485137939453125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49394795298576355, + "rewards/margins": 0.1576395481824875, + "rewards/rejected": -0.6515874862670898, + "step": 353 + }, + { + "epoch": 0.33, + "grad_norm": 16.185941696166992, + "learning_rate": 4.93704092339979e-07, + "logps/chosen": -43.763179779052734, + "logps/rejected": -52.58757019042969, + "loss": 0.6172, + "losses/dpo": 0.5382827520370483, + "losses/sft": 1.4193828105926514, + "losses/total": 0.5382827520370483, + "ref_logps/chosen": -37.45954132080078, + "ref_logps/rejected": -44.3360595703125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6303641200065613, + "rewards/margins": 0.19478727877140045, + "rewards/rejected": -0.8251514434814453, + "step": 354 + }, + { + "epoch": 0.34, + "grad_norm": 16.672767639160156, + "learning_rate": 4.935292060160895e-07, + "logps/chosen": -41.033485412597656, + "logps/rejected": -47.182586669921875, + "loss": 0.7167, + "losses/dpo": 0.8603551387786865, + "losses/sft": 1.820441484451294, + "losses/total": 0.8603551387786865, + "ref_logps/chosen": -35.289642333984375, + "ref_logps/rejected": -41.65888214111328, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5743842124938965, + "rewards/margins": -0.022013980895280838, + "rewards/rejected": -0.5523703098297119, + "step": 355 + }, + { + "epoch": 0.34, + "grad_norm": 18.10701560974121, + "learning_rate": 4.933543196922e-07, + "logps/chosen": -43.38030242919922, + "logps/rejected": -50.14466857910156, + "loss": 0.6917, + "losses/dpo": 0.5257072448730469, + "losses/sft": 1.6851675510406494, + "losses/total": 0.5257072448730469, + "ref_logps/chosen": -37.954803466796875, + "ref_logps/rejected": -44.21272277832031, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5425497889518738, + "rewards/margins": 0.050644807517528534, + "rewards/rejected": -0.5931944847106934, + "step": 356 + }, + { + "epoch": 0.34, + "grad_norm": 19.319822311401367, + "learning_rate": 4.931794333683106e-07, + "logps/chosen": -52.50119400024414, + "logps/rejected": -62.10511016845703, + "loss": 0.6744, + "losses/dpo": 0.7592508792877197, + "losses/sft": 1.8160649538040161, + "losses/total": 0.7592508792877197, + "ref_logps/chosen": -45.70870590209961, + "ref_logps/rejected": -54.46797180175781, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6792489886283875, + "rewards/margins": 0.08446483314037323, + "rewards/rejected": -0.7637137770652771, + "step": 357 + }, + { + "epoch": 0.34, + "grad_norm": 19.426149368286133, + "learning_rate": 4.930045470444211e-07, + "logps/chosen": -57.17057418823242, + "logps/rejected": -61.41523742675781, + "loss": 0.6438, + "losses/dpo": 0.7625062465667725, + "losses/sft": 1.9373992681503296, + "losses/total": 0.7625062465667725, + "ref_logps/chosen": -49.13054656982422, + "ref_logps/rejected": -51.94956588745117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8040027022361755, + "rewards/margins": 0.14256440103054047, + "rewards/rejected": -0.9465670585632324, + "step": 358 + }, + { + "epoch": 0.34, + "grad_norm": 18.105295181274414, + "learning_rate": 4.928296607205316e-07, + "logps/chosen": -52.00062561035156, + "logps/rejected": -52.21385192871094, + "loss": 0.5934, + "losses/dpo": 0.5580828189849854, + "losses/sft": 1.7423279285430908, + "losses/total": 0.5580828189849854, + "ref_logps/chosen": -46.851287841796875, + "ref_logps/rejected": -44.67858123779297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5149338245391846, + "rewards/margins": 0.23859325051307678, + "rewards/rejected": -0.7535271048545837, + "step": 359 + }, + { + "epoch": 0.34, + "grad_norm": 18.62017059326172, + "learning_rate": 4.926547743966421e-07, + "logps/chosen": -55.59669494628906, + "logps/rejected": -46.74395751953125, + "loss": 0.6991, + "losses/dpo": 0.6631979942321777, + "losses/sft": 1.880706548690796, + "losses/total": 0.6631979942321777, + "ref_logps/chosen": -49.36800003051758, + "ref_logps/rejected": -40.283843994140625, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6228693127632141, + "rewards/margins": 0.02314191684126854, + "rewards/rejected": -0.646011233329773, + "step": 360 + }, + { + "epoch": 0.34, + "grad_norm": 17.774625778198242, + "learning_rate": 4.924798880727527e-07, + "logps/chosen": -46.10017395019531, + "logps/rejected": -45.3880729675293, + "loss": 0.6913, + "losses/dpo": 0.7544475197792053, + "losses/sft": 1.6384199857711792, + "losses/total": 0.7544475197792053, + "ref_logps/chosen": -41.199554443359375, + "ref_logps/rejected": -39.884803771972656, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.49006187915802, + "rewards/margins": 0.0602647066116333, + "rewards/rejected": -0.5503265857696533, + "step": 361 + }, + { + "epoch": 0.34, + "grad_norm": 15.296215057373047, + "learning_rate": 4.923050017488632e-07, + "logps/chosen": -42.792118072509766, + "logps/rejected": -65.90992736816406, + "loss": 0.5411, + "losses/dpo": 0.4318642318248749, + "losses/sft": 2.0659215450286865, + "losses/total": 0.4318642318248749, + "ref_logps/chosen": -36.07505416870117, + "ref_logps/rejected": -54.86952209472656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6717066764831543, + "rewards/margins": 0.4323332905769348, + "rewards/rejected": -1.1040399074554443, + "step": 362 + }, + { + "epoch": 0.34, + "grad_norm": 15.614508628845215, + "learning_rate": 4.921301154249738e-07, + "logps/chosen": -39.033714294433594, + "logps/rejected": -50.13581085205078, + "loss": 0.6104, + "losses/dpo": 0.558932900428772, + "losses/sft": 1.314379096031189, + "losses/total": 0.558932900428772, + "ref_logps/chosen": -35.31792449951172, + "ref_logps/rejected": -44.301700592041016, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.37157878279685974, + "rewards/margins": 0.21183189749717712, + "rewards/rejected": -0.5834106206893921, + "step": 363 + }, + { + "epoch": 0.34, + "grad_norm": 14.529922485351562, + "learning_rate": 4.919552291010843e-07, + "logps/chosen": -31.802534103393555, + "logps/rejected": -45.50495910644531, + "loss": 0.6177, + "losses/dpo": 0.7741323709487915, + "losses/sft": 1.8115882873535156, + "losses/total": 0.7741323709487915, + "ref_logps/chosen": -29.133575439453125, + "ref_logps/rejected": -40.823123931884766, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.26689612865448, + "rewards/margins": 0.2012873888015747, + "rewards/rejected": -0.46818357706069946, + "step": 364 + }, + { + "epoch": 0.34, + "grad_norm": 14.948827743530273, + "learning_rate": 4.917803427771948e-07, + "logps/chosen": -35.900238037109375, + "logps/rejected": -44.39055633544922, + "loss": 0.6441, + "losses/dpo": 0.645287811756134, + "losses/sft": 1.2219266891479492, + "losses/total": 0.645287811756134, + "ref_logps/chosen": -31.56234359741211, + "ref_logps/rejected": -38.54907989501953, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.43378946185112, + "rewards/margins": 0.15035782754421234, + "rewards/rejected": -0.5841472744941711, + "step": 365 + }, + { + "epoch": 0.35, + "grad_norm": 17.084739685058594, + "learning_rate": 4.916054564533053e-07, + "logps/chosen": -46.07752990722656, + "logps/rejected": -55.04396057128906, + "loss": 0.663, + "losses/dpo": 0.6777514219284058, + "losses/sft": 1.5951437950134277, + "losses/total": 0.6777514219284058, + "ref_logps/chosen": -40.62417984008789, + "ref_logps/rejected": -48.55836486816406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5453349947929382, + "rewards/margins": 0.1032242476940155, + "rewards/rejected": -0.6485592722892761, + "step": 366 + }, + { + "epoch": 0.35, + "grad_norm": 18.58281707763672, + "learning_rate": 4.914305701294158e-07, + "logps/chosen": -40.653499603271484, + "logps/rejected": -46.78325653076172, + "loss": 0.682, + "losses/dpo": 0.6739465594291687, + "losses/sft": 1.5690735578536987, + "losses/total": 0.6739465594291687, + "ref_logps/chosen": -35.23616027832031, + "ref_logps/rejected": -40.62438201904297, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5417338013648987, + "rewards/margins": 0.0741538554430008, + "rewards/rejected": -0.6158876419067383, + "step": 367 + }, + { + "epoch": 0.35, + "grad_norm": 18.19317626953125, + "learning_rate": 4.912556838055264e-07, + "logps/chosen": -46.61870193481445, + "logps/rejected": -34.94660186767578, + "loss": 0.7033, + "losses/dpo": 0.9036322236061096, + "losses/sft": 1.325303554534912, + "losses/total": 0.9036322236061096, + "ref_logps/chosen": -41.60350036621094, + "ref_logps/rejected": -29.104267120361328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5015201568603516, + "rewards/margins": 0.08271355926990509, + "rewards/rejected": -0.5842337608337402, + "step": 368 + }, + { + "epoch": 0.35, + "grad_norm": 15.949390411376953, + "learning_rate": 4.910807974816369e-07, + "logps/chosen": -44.95087432861328, + "logps/rejected": -38.99454879760742, + "loss": 0.6674, + "losses/dpo": 0.7117958068847656, + "losses/sft": 1.5104433298110962, + "losses/total": 0.7117958068847656, + "ref_logps/chosen": -39.73406219482422, + "ref_logps/rejected": -32.87041473388672, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5216814279556274, + "rewards/margins": 0.09073203057050705, + "rewards/rejected": -0.6124134659767151, + "step": 369 + }, + { + "epoch": 0.35, + "grad_norm": 17.29485321044922, + "learning_rate": 4.909059111577475e-07, + "logps/chosen": -35.46530532836914, + "logps/rejected": -46.79058074951172, + "loss": 0.6979, + "losses/dpo": 0.7862976789474487, + "losses/sft": 1.4087018966674805, + "losses/total": 0.7862976789474487, + "ref_logps/chosen": -28.886497497558594, + "ref_logps/rejected": -39.95635223388672, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.657880961894989, + "rewards/margins": 0.025541625916957855, + "rewards/rejected": -0.6834225654602051, + "step": 370 + }, + { + "epoch": 0.35, + "grad_norm": 18.99604034423828, + "learning_rate": 4.90731024833858e-07, + "logps/chosen": -45.85717010498047, + "logps/rejected": -47.745880126953125, + "loss": 0.7043, + "losses/dpo": 0.773084282875061, + "losses/sft": 2.0893893241882324, + "losses/total": 0.773084282875061, + "ref_logps/chosen": -39.34242248535156, + "ref_logps/rejected": -40.57661819458008, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6514745950698853, + "rewards/margins": 0.06545138359069824, + "rewards/rejected": -0.7169260382652283, + "step": 371 + }, + { + "epoch": 0.35, + "grad_norm": 17.263776779174805, + "learning_rate": 4.905561385099685e-07, + "logps/chosen": -31.50356674194336, + "logps/rejected": -49.4587287902832, + "loss": 0.6661, + "losses/dpo": 0.7274957895278931, + "losses/sft": 1.4706470966339111, + "losses/total": 0.7274957895278931, + "ref_logps/chosen": -27.50067138671875, + "ref_logps/rejected": -44.39161682128906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.40028929710388184, + "rewards/margins": 0.1064220741391182, + "rewards/rejected": -0.5067113637924194, + "step": 372 + }, + { + "epoch": 0.35, + "grad_norm": 13.453235626220703, + "learning_rate": 4.90381252186079e-07, + "logps/chosen": -30.153182983398438, + "logps/rejected": -39.06922149658203, + "loss": 0.6136, + "losses/dpo": 0.6532871723175049, + "losses/sft": 1.5000978708267212, + "losses/total": 0.6532871723175049, + "ref_logps/chosen": -26.363821029663086, + "ref_logps/rejected": -33.32054138183594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3789364695549011, + "rewards/margins": 0.19593185186386108, + "rewards/rejected": -0.5748683214187622, + "step": 373 + }, + { + "epoch": 0.35, + "grad_norm": 15.723969459533691, + "learning_rate": 4.902063658621895e-07, + "logps/chosen": -42.195716857910156, + "logps/rejected": -45.46604919433594, + "loss": 0.5879, + "losses/dpo": 0.6335541605949402, + "losses/sft": 1.487048864364624, + "losses/total": 0.6335541605949402, + "ref_logps/chosen": -38.526390075683594, + "ref_logps/rejected": -39.2433967590332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3669326603412628, + "rewards/margins": 0.2553326189517975, + "rewards/rejected": -0.6222652792930603, + "step": 374 + }, + { + "epoch": 0.35, + "grad_norm": 14.40339183807373, + "learning_rate": 4.900314795383e-07, + "logps/chosen": -31.525100708007812, + "logps/rejected": -45.984596252441406, + "loss": 0.5898, + "losses/dpo": 0.6913972496986389, + "losses/sft": 1.099683403968811, + "losses/total": 0.6913972496986389, + "ref_logps/chosen": -27.74881935119629, + "ref_logps/rejected": -39.44319152832031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.37762802839279175, + "rewards/margins": 0.2765120565891266, + "rewards/rejected": -0.654140055179596, + "step": 375 + }, + { + "epoch": 0.36, + "grad_norm": 16.11861228942871, + "learning_rate": 4.898565932144106e-07, + "logps/chosen": -36.8918571472168, + "logps/rejected": -50.09154510498047, + "loss": 0.5972, + "losses/dpo": 0.7275481224060059, + "losses/sft": 1.7201942205429077, + "losses/total": 0.7275481224060059, + "ref_logps/chosen": -32.64515686035156, + "ref_logps/rejected": -43.260528564453125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4246698021888733, + "rewards/margins": 0.2584313750267029, + "rewards/rejected": -0.6831011772155762, + "step": 376 + }, + { + "epoch": 0.36, + "grad_norm": 19.990903854370117, + "learning_rate": 4.896817068905212e-07, + "logps/chosen": -53.96989822387695, + "logps/rejected": -46.38031005859375, + "loss": 0.7189, + "losses/dpo": 0.7240732908248901, + "losses/sft": 1.6825931072235107, + "losses/total": 0.7240732908248901, + "ref_logps/chosen": -47.78759002685547, + "ref_logps/rejected": -40.2942008972168, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6182307600975037, + "rewards/margins": -0.009619642049074173, + "rewards/rejected": -0.6086111068725586, + "step": 377 + }, + { + "epoch": 0.36, + "grad_norm": 19.15579605102539, + "learning_rate": 4.895068205666317e-07, + "logps/chosen": -48.938873291015625, + "logps/rejected": -59.844417572021484, + "loss": 0.6221, + "losses/dpo": 0.600677490234375, + "losses/sft": 1.450546383857727, + "losses/total": 0.600677490234375, + "ref_logps/chosen": -42.355220794677734, + "ref_logps/rejected": -51.14586639404297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6583653688430786, + "rewards/margins": 0.21148985624313354, + "rewards/rejected": -0.8698551654815674, + "step": 378 + }, + { + "epoch": 0.36, + "grad_norm": 17.557723999023438, + "learning_rate": 4.893319342427422e-07, + "logps/chosen": -45.916561126708984, + "logps/rejected": -57.7324333190918, + "loss": 0.6019, + "losses/dpo": 0.6280567646026611, + "losses/sft": 1.4994337558746338, + "losses/total": 0.6280567646026611, + "ref_logps/chosen": -41.315452575683594, + "ref_logps/rejected": -50.54290771484375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4601111114025116, + "rewards/margins": 0.25884193181991577, + "rewards/rejected": -0.718953013420105, + "step": 379 + }, + { + "epoch": 0.36, + "grad_norm": 18.44418716430664, + "learning_rate": 4.891570479188527e-07, + "logps/chosen": -51.42871856689453, + "logps/rejected": -57.7188720703125, + "loss": 0.5943, + "losses/dpo": 0.6480692028999329, + "losses/sft": 1.7022343873977661, + "losses/total": 0.6480692028999329, + "ref_logps/chosen": -46.33269500732422, + "ref_logps/rejected": -49.920387268066406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5096026659011841, + "rewards/margins": 0.27024561166763306, + "rewards/rejected": -0.7798483371734619, + "step": 380 + }, + { + "epoch": 0.36, + "grad_norm": 16.115467071533203, + "learning_rate": 4.889821615949632e-07, + "logps/chosen": -43.42619323730469, + "logps/rejected": -46.72793960571289, + "loss": 0.5855, + "losses/dpo": 0.4910088777542114, + "losses/sft": 1.433046579360962, + "losses/total": 0.4910088777542114, + "ref_logps/chosen": -39.071250915527344, + "ref_logps/rejected": -39.33418273925781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4354940950870514, + "rewards/margins": 0.30388131737709045, + "rewards/rejected": -0.7393754124641418, + "step": 381 + }, + { + "epoch": 0.36, + "grad_norm": 17.226011276245117, + "learning_rate": 4.888072752710738e-07, + "logps/chosen": -37.94596481323242, + "logps/rejected": -48.29046630859375, + "loss": 0.589, + "losses/dpo": 0.4702170789241791, + "losses/sft": 0.9514347314834595, + "losses/total": 0.4702170789241791, + "ref_logps/chosen": -32.395713806152344, + "ref_logps/rejected": -39.96636962890625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5550253987312317, + "rewards/margins": 0.2773841619491577, + "rewards/rejected": -0.8324095010757446, + "step": 382 + }, + { + "epoch": 0.36, + "grad_norm": 15.372961044311523, + "learning_rate": 4.886323889471843e-07, + "logps/chosen": -39.563194274902344, + "logps/rejected": -51.458526611328125, + "loss": 0.553, + "losses/dpo": 0.5169616341590881, + "losses/sft": 1.6448532342910767, + "losses/total": 0.5169616341590881, + "ref_logps/chosen": -34.72569274902344, + "ref_logps/rejected": -42.97579574584961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.483749657869339, + "rewards/margins": 0.3645235002040863, + "rewards/rejected": -0.8482731580734253, + "step": 383 + }, + { + "epoch": 0.36, + "grad_norm": 16.232027053833008, + "learning_rate": 4.884575026232949e-07, + "logps/chosen": -44.31641387939453, + "logps/rejected": -61.96830368041992, + "loss": 0.5499, + "losses/dpo": 0.5958530902862549, + "losses/sft": 1.2350316047668457, + "losses/total": 0.5958530902862549, + "ref_logps/chosen": -39.33867645263672, + "ref_logps/rejected": -52.458457946777344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4977739155292511, + "rewards/margins": 0.45321106910705566, + "rewards/rejected": -0.9509849548339844, + "step": 384 + }, + { + "epoch": 0.36, + "grad_norm": 17.623497009277344, + "learning_rate": 4.882826162994054e-07, + "logps/chosen": -41.76802062988281, + "logps/rejected": -51.88357925415039, + "loss": 0.6588, + "losses/dpo": 0.6860288977622986, + "losses/sft": 1.4751417636871338, + "losses/total": 0.6860288977622986, + "ref_logps/chosen": -37.24629211425781, + "ref_logps/rejected": -46.162330627441406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45217248797416687, + "rewards/margins": 0.11995206773281097, + "rewards/rejected": -0.5721245408058167, + "step": 385 + }, + { + "epoch": 0.36, + "grad_norm": 17.865766525268555, + "learning_rate": 4.881077299755159e-07, + "logps/chosen": -48.415687561035156, + "logps/rejected": -59.801910400390625, + "loss": 0.6429, + "losses/dpo": 0.576919436454773, + "losses/sft": 1.5451867580413818, + "losses/total": 0.576919436454773, + "ref_logps/chosen": -42.79845428466797, + "ref_logps/rejected": -52.516029357910156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5617234706878662, + "rewards/margins": 0.16686515510082245, + "rewards/rejected": -0.7285885810852051, + "step": 386 + }, + { + "epoch": 0.37, + "grad_norm": 15.3607759475708, + "learning_rate": 4.879328436516264e-07, + "logps/chosen": -37.83244323730469, + "logps/rejected": -51.96109390258789, + "loss": 0.5846, + "losses/dpo": 0.6534067392349243, + "losses/sft": 1.4005818367004395, + "losses/total": 0.6534067392349243, + "ref_logps/chosen": -33.5261344909668, + "ref_logps/rejected": -44.703426361083984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.43063056468963623, + "rewards/margins": 0.29513639211654663, + "rewards/rejected": -0.7257668972015381, + "step": 387 + }, + { + "epoch": 0.37, + "grad_norm": 18.870868682861328, + "learning_rate": 4.877579573277369e-07, + "logps/chosen": -52.28619384765625, + "logps/rejected": -58.13663101196289, + "loss": 0.6719, + "losses/dpo": 0.6589078903198242, + "losses/sft": 1.8530817031860352, + "losses/total": 0.6589078903198242, + "ref_logps/chosen": -46.614654541015625, + "ref_logps/rejected": -51.617279052734375, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.5671533346176147, + "rewards/margins": 0.08478206396102905, + "rewards/rejected": -0.6519354581832886, + "step": 388 + }, + { + "epoch": 0.37, + "grad_norm": 17.403518676757812, + "learning_rate": 4.875830710038475e-07, + "logps/chosen": -44.04420852661133, + "logps/rejected": -41.051490783691406, + "loss": 0.622, + "losses/dpo": 0.7141726016998291, + "losses/sft": 1.9304901361465454, + "losses/total": 0.7141726016998291, + "ref_logps/chosen": -39.160709381103516, + "ref_logps/rejected": -33.75501251220703, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4883498549461365, + "rewards/margins": 0.241298109292984, + "rewards/rejected": -0.7296479940414429, + "step": 389 + }, + { + "epoch": 0.37, + "grad_norm": 15.864351272583008, + "learning_rate": 4.874081846799579e-07, + "logps/chosen": -38.41652297973633, + "logps/rejected": -52.740264892578125, + "loss": 0.5355, + "losses/dpo": 0.5309996604919434, + "losses/sft": 1.4430912733078003, + "losses/total": 0.5309996604919434, + "ref_logps/chosen": -34.61224365234375, + "ref_logps/rejected": -45.08299255371094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.38042739033699036, + "rewards/margins": 0.3852997124195099, + "rewards/rejected": -0.7657271027565002, + "step": 390 + }, + { + "epoch": 0.37, + "grad_norm": 17.13785171508789, + "learning_rate": 4.872332983560686e-07, + "logps/chosen": -37.88313293457031, + "logps/rejected": -39.423465728759766, + "loss": 0.6859, + "losses/dpo": 0.690115213394165, + "losses/sft": 1.644648551940918, + "losses/total": 0.690115213394165, + "ref_logps/chosen": -33.61713409423828, + "ref_logps/rejected": -34.702396392822266, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.42660024762153625, + "rewards/margins": 0.04550663381814957, + "rewards/rejected": -0.4721068739891052, + "step": 391 + }, + { + "epoch": 0.37, + "grad_norm": 14.988529205322266, + "learning_rate": 4.870584120321791e-07, + "logps/chosen": -41.488441467285156, + "logps/rejected": -37.75962829589844, + "loss": 0.6294, + "losses/dpo": 0.6864627599716187, + "losses/sft": 1.48981773853302, + "losses/total": 0.6864627599716187, + "ref_logps/chosen": -37.671180725097656, + "ref_logps/rejected": -32.284706115722656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38172605633735657, + "rewards/margins": 0.16576623916625977, + "rewards/rejected": -0.547492265701294, + "step": 392 + }, + { + "epoch": 0.37, + "grad_norm": 17.260526657104492, + "learning_rate": 4.868835257082896e-07, + "logps/chosen": -37.648902893066406, + "logps/rejected": -47.992645263671875, + "loss": 0.6936, + "losses/dpo": 0.5706920027732849, + "losses/sft": 1.1341158151626587, + "losses/total": 0.5706920027732849, + "ref_logps/chosen": -33.21958541870117, + "ref_logps/rejected": -43.15337371826172, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.44293180108070374, + "rewards/margins": 0.04099569469690323, + "rewards/rejected": -0.48392748832702637, + "step": 393 + }, + { + "epoch": 0.37, + "grad_norm": 16.488170623779297, + "learning_rate": 4.867086393844001e-07, + "logps/chosen": -45.87257385253906, + "logps/rejected": -48.47898864746094, + "loss": 0.6302, + "losses/dpo": 0.49739784002304077, + "losses/sft": 1.7258594036102295, + "losses/total": 0.49739784002304077, + "ref_logps/chosen": -41.582820892333984, + "ref_logps/rejected": -42.0963249206543, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.42897525429725647, + "rewards/margins": 0.20929084718227386, + "rewards/rejected": -0.6382660865783691, + "step": 394 + }, + { + "epoch": 0.37, + "grad_norm": 15.787715911865234, + "learning_rate": 4.865337530605106e-07, + "logps/chosen": -38.56987380981445, + "logps/rejected": -57.91104507446289, + "loss": 0.6043, + "losses/dpo": 0.49941176176071167, + "losses/sft": 1.359432339668274, + "losses/total": 0.49941176176071167, + "ref_logps/chosen": -34.32741165161133, + "ref_logps/rejected": -51.366233825683594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4242461621761322, + "rewards/margins": 0.2302348017692566, + "rewards/rejected": -0.6544809341430664, + "step": 395 + }, + { + "epoch": 0.37, + "grad_norm": 15.207798957824707, + "learning_rate": 4.863588667366212e-07, + "logps/chosen": -30.377071380615234, + "logps/rejected": -36.83739471435547, + "loss": 0.5979, + "losses/dpo": 0.543952465057373, + "losses/sft": 1.4746674299240112, + "losses/total": 0.543952465057373, + "ref_logps/chosen": -27.675073623657227, + "ref_logps/rejected": -31.649330139160156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2701996862888336, + "rewards/margins": 0.24860700964927673, + "rewards/rejected": -0.5188066959381104, + "step": 396 + }, + { + "epoch": 0.37, + "grad_norm": 17.54619598388672, + "learning_rate": 4.861839804127316e-07, + "logps/chosen": -47.32435989379883, + "logps/rejected": -58.10388946533203, + "loss": 0.6519, + "losses/dpo": 0.5839875936508179, + "losses/sft": 1.435477375984192, + "losses/total": 0.5839875936508179, + "ref_logps/chosen": -41.70479202270508, + "ref_logps/rejected": -50.81410217285156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.561956524848938, + "rewards/margins": 0.1670229434967041, + "rewards/rejected": -0.7289795279502869, + "step": 397 + }, + { + "epoch": 0.38, + "grad_norm": 15.378534317016602, + "learning_rate": 4.860090940888423e-07, + "logps/chosen": -35.75560760498047, + "logps/rejected": -41.20283508300781, + "loss": 0.6299, + "losses/dpo": 0.6845287084579468, + "losses/sft": 1.1644686460494995, + "losses/total": 0.6845287084579468, + "ref_logps/chosen": -31.902124404907227, + "ref_logps/rejected": -35.534156799316406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.38534846901893616, + "rewards/margins": 0.18151944875717163, + "rewards/rejected": -0.5668679475784302, + "step": 398 + }, + { + "epoch": 0.38, + "grad_norm": 16.789806365966797, + "learning_rate": 4.858342077649528e-07, + "logps/chosen": -33.036827087402344, + "logps/rejected": -40.6373291015625, + "loss": 0.6909, + "losses/dpo": 0.5921821594238281, + "losses/sft": 1.5299408435821533, + "losses/total": 0.5921821594238281, + "ref_logps/chosen": -30.053743362426758, + "ref_logps/rejected": -37.085121154785156, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2983085513114929, + "rewards/margins": 0.0569118931889534, + "rewards/rejected": -0.3552204370498657, + "step": 399 + }, + { + "epoch": 0.38, + "grad_norm": 16.89141082763672, + "learning_rate": 4.856593214410633e-07, + "logps/chosen": -47.697715759277344, + "logps/rejected": -61.035888671875, + "loss": 0.6096, + "losses/dpo": 0.5185248851776123, + "losses/sft": 1.6526906490325928, + "losses/total": 0.5185248851776123, + "ref_logps/chosen": -42.63914489746094, + "ref_logps/rejected": -53.86378860473633, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5058566331863403, + "rewards/margins": 0.21135367453098297, + "rewards/rejected": -0.7172103524208069, + "step": 400 + }, + { + "epoch": 0.38, + "grad_norm": 14.587214469909668, + "learning_rate": 4.854844351171738e-07, + "logps/chosen": -34.169090270996094, + "logps/rejected": -39.14319610595703, + "loss": 0.6158, + "losses/dpo": 0.5357730388641357, + "losses/sft": 1.3998308181762695, + "losses/total": 0.5357730388641357, + "ref_logps/chosen": -30.625774383544922, + "ref_logps/rejected": -33.343929290771484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.354331374168396, + "rewards/margins": 0.2255951315164566, + "rewards/rejected": -0.5799264907836914, + "step": 401 + }, + { + "epoch": 0.38, + "grad_norm": 16.121726989746094, + "learning_rate": 4.853095487932843e-07, + "logps/chosen": -32.05437469482422, + "logps/rejected": -60.37568664550781, + "loss": 0.4951, + "losses/dpo": 0.48003166913986206, + "losses/sft": 1.5877383947372437, + "losses/total": 0.48003166913986206, + "ref_logps/chosen": -28.26840591430664, + "ref_logps/rejected": -51.92275619506836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3785969614982605, + "rewards/margins": 0.46669596433639526, + "rewards/rejected": -0.8452929258346558, + "step": 402 + }, + { + "epoch": 0.38, + "grad_norm": 16.64834213256836, + "learning_rate": 4.851346624693949e-07, + "logps/chosen": -38.816585540771484, + "logps/rejected": -53.78671646118164, + "loss": 0.5811, + "losses/dpo": 0.5250800848007202, + "losses/sft": 1.79682457447052, + "losses/total": 0.5250800848007202, + "ref_logps/chosen": -34.659847259521484, + "ref_logps/rejected": -46.36677551269531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.415674090385437, + "rewards/margins": 0.32631999254226685, + "rewards/rejected": -0.7419940233230591, + "step": 403 + }, + { + "epoch": 0.38, + "grad_norm": 17.353267669677734, + "learning_rate": 4.849597761455053e-07, + "logps/chosen": -43.05952835083008, + "logps/rejected": -48.76091384887695, + "loss": 0.6148, + "losses/dpo": 0.4252875745296478, + "losses/sft": 1.267093539237976, + "losses/total": 0.4252875745296478, + "ref_logps/chosen": -38.705326080322266, + "ref_logps/rejected": -42.27796173095703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4354203939437866, + "rewards/margins": 0.21287499368190765, + "rewards/rejected": -0.6482954025268555, + "step": 404 + }, + { + "epoch": 0.38, + "grad_norm": 14.371841430664062, + "learning_rate": 4.84784889821616e-07, + "logps/chosen": -26.627094268798828, + "logps/rejected": -35.829750061035156, + "loss": 0.6266, + "losses/dpo": 0.5315425395965576, + "losses/sft": 1.3527321815490723, + "losses/total": 0.5315425395965576, + "ref_logps/chosen": -23.816089630126953, + "ref_logps/rejected": -31.18358039855957, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2811005413532257, + "rewards/margins": 0.1835164874792099, + "rewards/rejected": -0.4646170735359192, + "step": 405 + }, + { + "epoch": 0.38, + "grad_norm": 17.5135440826416, + "learning_rate": 4.846100034977265e-07, + "logps/chosen": -40.98727798461914, + "logps/rejected": -60.82196044921875, + "loss": 0.5499, + "losses/dpo": 0.6684725880622864, + "losses/sft": 1.9783414602279663, + "losses/total": 0.6684725880622864, + "ref_logps/chosen": -36.28944778442383, + "ref_logps/rejected": -52.15388870239258, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.46978288888931274, + "rewards/margins": 0.39702415466308594, + "rewards/rejected": -0.8668070435523987, + "step": 406 + }, + { + "epoch": 0.38, + "grad_norm": 15.416147232055664, + "learning_rate": 4.84435117173837e-07, + "logps/chosen": -32.149574279785156, + "logps/rejected": -41.04954528808594, + "loss": 0.674, + "losses/dpo": 0.6583499312400818, + "losses/sft": 1.5759145021438599, + "losses/total": 0.6583499312400818, + "ref_logps/chosen": -28.559553146362305, + "ref_logps/rejected": -36.13202667236328, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.35900166630744934, + "rewards/margins": 0.13275033235549927, + "rewards/rejected": -0.4917519986629486, + "step": 407 + }, + { + "epoch": 0.39, + "grad_norm": 16.129085540771484, + "learning_rate": 4.842602308499475e-07, + "logps/chosen": -40.663230895996094, + "logps/rejected": -56.94084167480469, + "loss": 0.5149, + "losses/dpo": 0.45456379652023315, + "losses/sft": 1.2015033960342407, + "losses/total": 0.45456379652023315, + "ref_logps/chosen": -37.173484802246094, + "ref_logps/rejected": -47.41035461425781, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3489745259284973, + "rewards/margins": 0.6040740013122559, + "rewards/rejected": -0.953048586845398, + "step": 408 + }, + { + "epoch": 0.39, + "grad_norm": 19.411956787109375, + "learning_rate": 4.84085344526058e-07, + "logps/chosen": -42.17294692993164, + "logps/rejected": -50.168800354003906, + "loss": 0.6934, + "losses/dpo": 0.5932608842849731, + "losses/sft": 1.406678557395935, + "losses/total": 0.5932608842849731, + "ref_logps/chosen": -35.548667907714844, + "ref_logps/rejected": -42.467041015625, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6624277830123901, + "rewards/margins": 0.10774798691272736, + "rewards/rejected": -0.7701758146286011, + "step": 409 + }, + { + "epoch": 0.39, + "grad_norm": 16.154325485229492, + "learning_rate": 4.839104582021686e-07, + "logps/chosen": -42.76899719238281, + "logps/rejected": -50.167564392089844, + "loss": 0.6151, + "losses/dpo": 0.8264023065567017, + "losses/sft": 1.948232650756836, + "losses/total": 0.8264023065567017, + "ref_logps/chosen": -36.968727111816406, + "ref_logps/rejected": -41.93342590332031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5800269842147827, + "rewards/margins": 0.24338719248771667, + "rewards/rejected": -0.823414146900177, + "step": 410 + }, + { + "epoch": 0.39, + "grad_norm": 18.531009674072266, + "learning_rate": 4.83735571878279e-07, + "logps/chosen": -48.12474822998047, + "logps/rejected": -54.27491760253906, + "loss": 0.6067, + "losses/dpo": 0.46124640107154846, + "losses/sft": 1.0872917175292969, + "losses/total": 0.46124640107154846, + "ref_logps/chosen": -42.32415008544922, + "ref_logps/rejected": -45.653709411621094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5800597667694092, + "rewards/margins": 0.2820609509944916, + "rewards/rejected": -0.8621206879615784, + "step": 411 + }, + { + "epoch": 0.39, + "grad_norm": 16.040817260742188, + "learning_rate": 4.835606855543896e-07, + "logps/chosen": -41.7850456237793, + "logps/rejected": -44.34109115600586, + "loss": 0.5943, + "losses/dpo": 0.5554689168930054, + "losses/sft": 1.4853107929229736, + "losses/total": 0.5554689168930054, + "ref_logps/chosen": -38.30476760864258, + "ref_logps/rejected": -38.22357940673828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3480280041694641, + "rewards/margins": 0.26372280716896057, + "rewards/rejected": -0.6117507815361023, + "step": 412 + }, + { + "epoch": 0.39, + "grad_norm": 15.917330741882324, + "learning_rate": 4.833857992305002e-07, + "logps/chosen": -41.395469665527344, + "logps/rejected": -49.830352783203125, + "loss": 0.6196, + "losses/dpo": 0.5293267965316772, + "losses/sft": 1.0707151889801025, + "losses/total": 0.5293267965316772, + "ref_logps/chosen": -35.795894622802734, + "ref_logps/rejected": -41.86262130737305, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5599576234817505, + "rewards/margins": 0.2368154525756836, + "rewards/rejected": -0.7967730760574341, + "step": 413 + }, + { + "epoch": 0.39, + "grad_norm": 16.770217895507812, + "learning_rate": 4.832109129066107e-07, + "logps/chosen": -52.91405487060547, + "logps/rejected": -59.947940826416016, + "loss": 0.5564, + "losses/dpo": 0.5155785083770752, + "losses/sft": 1.9012634754180908, + "losses/total": 0.5155785083770752, + "ref_logps/chosen": -46.752349853515625, + "ref_logps/rejected": -49.959800720214844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6161705255508423, + "rewards/margins": 0.38264378905296326, + "rewards/rejected": -0.9988143444061279, + "step": 414 + }, + { + "epoch": 0.39, + "grad_norm": 17.09412956237793, + "learning_rate": 4.830360265827212e-07, + "logps/chosen": -41.7603874206543, + "logps/rejected": -53.3480339050293, + "loss": 0.5886, + "losses/dpo": 0.5072081685066223, + "losses/sft": 1.2812321186065674, + "losses/total": 0.5072081685066223, + "ref_logps/chosen": -35.59248733520508, + "ref_logps/rejected": -44.63600158691406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.616789698600769, + "rewards/margins": 0.25441354513168335, + "rewards/rejected": -0.8712032437324524, + "step": 415 + }, + { + "epoch": 0.39, + "grad_norm": 21.589744567871094, + "learning_rate": 4.828611402588317e-07, + "logps/chosen": -55.79002380371094, + "logps/rejected": -65.64686584472656, + "loss": 0.6915, + "losses/dpo": 0.5672193169593811, + "losses/sft": 1.7146742343902588, + "losses/total": 0.5672193169593811, + "ref_logps/chosen": -47.1961784362793, + "ref_logps/rejected": -55.97956085205078, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8593848943710327, + "rewards/margins": 0.10734502971172333, + "rewards/rejected": -0.9667298793792725, + "step": 416 + }, + { + "epoch": 0.39, + "grad_norm": 17.905376434326172, + "learning_rate": 4.826862539349423e-07, + "logps/chosen": -41.799400329589844, + "logps/rejected": -51.18357849121094, + "loss": 0.5399, + "losses/dpo": 0.5003094673156738, + "losses/sft": 1.5253819227218628, + "losses/total": 0.5003094673156738, + "ref_logps/chosen": -38.23586654663086, + "ref_logps/rejected": -43.49443054199219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3563530445098877, + "rewards/margins": 0.41256183385849, + "rewards/rejected": -0.7689148783683777, + "step": 417 + }, + { + "epoch": 0.39, + "grad_norm": 16.941661834716797, + "learning_rate": 4.825113676110527e-07, + "logps/chosen": -37.555171966552734, + "logps/rejected": -42.37037658691406, + "loss": 0.6635, + "losses/dpo": 0.9041906595230103, + "losses/sft": 1.7020913362503052, + "losses/total": 0.9041906595230103, + "ref_logps/chosen": -32.227359771728516, + "ref_logps/rejected": -35.716827392578125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5327807664871216, + "rewards/margins": 0.1325741708278656, + "rewards/rejected": -0.6653549671173096, + "step": 418 + }, + { + "epoch": 0.4, + "grad_norm": 18.61013412475586, + "learning_rate": 4.823364812871633e-07, + "logps/chosen": -43.78700256347656, + "logps/rejected": -71.27808380126953, + "loss": 0.5131, + "losses/dpo": 0.4810955822467804, + "losses/sft": 1.5377532243728638, + "losses/total": 0.4810955822467804, + "ref_logps/chosen": -39.11919403076172, + "ref_logps/rejected": -59.33916091918945, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4667811989784241, + "rewards/margins": 0.7271108627319336, + "rewards/rejected": -1.193892240524292, + "step": 419 + }, + { + "epoch": 0.4, + "grad_norm": 22.28360939025879, + "learning_rate": 4.821615949632739e-07, + "logps/chosen": -49.553157806396484, + "logps/rejected": -48.26500701904297, + "loss": 0.7759, + "losses/dpo": 0.7567892074584961, + "losses/sft": 1.5547449588775635, + "losses/total": 0.7567892074584961, + "ref_logps/chosen": -41.395145416259766, + "ref_logps/rejected": -40.95943069458008, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.8158011436462402, + "rewards/margins": -0.08524350821971893, + "rewards/rejected": -0.7305576801300049, + "step": 420 + }, + { + "epoch": 0.4, + "grad_norm": 18.713472366333008, + "learning_rate": 4.819867086393844e-07, + "logps/chosen": -40.23215103149414, + "logps/rejected": -52.624839782714844, + "loss": 0.6301, + "losses/dpo": 0.5053095817565918, + "losses/sft": 1.4159964323043823, + "losses/total": 0.5053095817565918, + "ref_logps/chosen": -33.68476104736328, + "ref_logps/rejected": -43.69927978515625, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6547389030456543, + "rewards/margins": 0.23781681060791016, + "rewards/rejected": -0.8925557136535645, + "step": 421 + }, + { + "epoch": 0.4, + "grad_norm": 14.797178268432617, + "learning_rate": 4.818118223154949e-07, + "logps/chosen": -47.91529846191406, + "logps/rejected": -60.493133544921875, + "loss": 0.4971, + "losses/dpo": 0.531101405620575, + "losses/sft": 1.6579855680465698, + "losses/total": 0.531101405620575, + "ref_logps/chosen": -43.0427360534668, + "ref_logps/rejected": -50.19813537597656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4872565269470215, + "rewards/margins": 0.5422431826591492, + "rewards/rejected": -1.0294997692108154, + "step": 422 + }, + { + "epoch": 0.4, + "grad_norm": 17.529640197753906, + "learning_rate": 4.816369359916054e-07, + "logps/chosen": -46.716304779052734, + "logps/rejected": -57.9948616027832, + "loss": 0.6419, + "losses/dpo": 0.7770130634307861, + "losses/sft": 1.5371500253677368, + "losses/total": 0.7770130634307861, + "ref_logps/chosen": -39.979270935058594, + "ref_logps/rejected": -48.78118133544922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6737032532691956, + "rewards/margins": 0.24766482412815094, + "rewards/rejected": -0.9213681221008301, + "step": 423 + }, + { + "epoch": 0.4, + "grad_norm": 16.264522552490234, + "learning_rate": 4.81462049667716e-07, + "logps/chosen": -44.645259857177734, + "logps/rejected": -61.48662185668945, + "loss": 0.5151, + "losses/dpo": 0.4992883801460266, + "losses/sft": 1.765952229499817, + "losses/total": 0.4992883801460266, + "ref_logps/chosen": -38.91535949707031, + "ref_logps/rejected": -50.420440673828125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5729899406433105, + "rewards/margins": 0.5336276292800903, + "rewards/rejected": -1.1066175699234009, + "step": 424 + }, + { + "epoch": 0.4, + "grad_norm": 17.553157806396484, + "learning_rate": 4.812871633438264e-07, + "logps/chosen": -43.50630569458008, + "logps/rejected": -56.305152893066406, + "loss": 0.5832, + "losses/dpo": 0.7116658687591553, + "losses/sft": 1.435996651649475, + "losses/total": 0.7116658687591553, + "ref_logps/chosen": -37.015892028808594, + "ref_logps/rejected": -46.49329376220703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6490410566329956, + "rewards/margins": 0.33214494585990906, + "rewards/rejected": -0.981186032295227, + "step": 425 + }, + { + "epoch": 0.4, + "grad_norm": 15.7957124710083, + "learning_rate": 4.81112277019937e-07, + "logps/chosen": -43.427886962890625, + "logps/rejected": -48.535614013671875, + "loss": 0.5, + "losses/dpo": 0.4772017300128937, + "losses/sft": 1.3629473447799683, + "losses/total": 0.4772017300128937, + "ref_logps/chosen": -39.44947814941406, + "ref_logps/rejected": -39.24213409423828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3978408873081207, + "rewards/margins": 0.531507134437561, + "rewards/rejected": -0.9293479919433594, + "step": 426 + }, + { + "epoch": 0.4, + "grad_norm": 23.079870223999023, + "learning_rate": 4.809373906960476e-07, + "logps/chosen": -55.994503021240234, + "logps/rejected": -48.09328079223633, + "loss": 0.7933, + "losses/dpo": 0.8312777280807495, + "losses/sft": 1.7263953685760498, + "losses/total": 0.8312777280807495, + "ref_logps/chosen": -47.46648025512695, + "ref_logps/rejected": -40.24591827392578, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8528022766113281, + "rewards/margins": -0.06806611269712448, + "rewards/rejected": -0.784736156463623, + "step": 427 + }, + { + "epoch": 0.4, + "grad_norm": 14.267626762390137, + "learning_rate": 4.807625043721581e-07, + "logps/chosen": -37.362342834472656, + "logps/rejected": -52.87950134277344, + "loss": 0.4932, + "losses/dpo": 0.3793169856071472, + "losses/sft": 1.7072707414627075, + "losses/total": 0.3793169856071472, + "ref_logps/chosen": -31.649919509887695, + "ref_logps/rejected": -41.25990295410156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5712425112724304, + "rewards/margins": 0.5907171964645386, + "rewards/rejected": -1.1619596481323242, + "step": 428 + }, + { + "epoch": 0.41, + "grad_norm": 24.212587356567383, + "learning_rate": 4.805876180482686e-07, + "logps/chosen": -62.959625244140625, + "logps/rejected": -58.49671173095703, + "loss": 0.6562, + "losses/dpo": 0.7208305597305298, + "losses/sft": 1.8471555709838867, + "losses/total": 0.7208305597305298, + "ref_logps/chosen": -55.51588439941406, + "ref_logps/rejected": -49.259849548339844, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7443743944168091, + "rewards/margins": 0.17931221425533295, + "rewards/rejected": -0.9236866235733032, + "step": 429 + }, + { + "epoch": 0.41, + "grad_norm": 16.95990562438965, + "learning_rate": 4.804127317243791e-07, + "logps/chosen": -45.03351974487305, + "logps/rejected": -51.542999267578125, + "loss": 0.6483, + "losses/dpo": 0.6551743745803833, + "losses/sft": 1.555639624595642, + "losses/total": 0.6551743745803833, + "ref_logps/chosen": -37.09907531738281, + "ref_logps/rejected": -41.29612350463867, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7934445142745972, + "rewards/margins": 0.2312435805797577, + "rewards/rejected": -1.0246880054473877, + "step": 430 + }, + { + "epoch": 0.41, + "grad_norm": 19.56223487854004, + "learning_rate": 4.802378454004897e-07, + "logps/chosen": -51.68017578125, + "logps/rejected": -60.513671875, + "loss": 0.5672, + "losses/dpo": 0.49754729866981506, + "losses/sft": 1.8137422800064087, + "losses/total": 0.49754729866981506, + "ref_logps/chosen": -44.28087615966797, + "ref_logps/rejected": -48.87751388549805, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7399299144744873, + "rewards/margins": 0.4236864149570465, + "rewards/rejected": -1.1636161804199219, + "step": 431 + }, + { + "epoch": 0.41, + "grad_norm": 19.554107666015625, + "learning_rate": 4.800629590766002e-07, + "logps/chosen": -38.52039337158203, + "logps/rejected": -38.24866485595703, + "loss": 0.7718, + "losses/dpo": 0.7348898649215698, + "losses/sft": 2.0776398181915283, + "losses/total": 0.7348898649215698, + "ref_logps/chosen": -32.058921813964844, + "ref_logps/rejected": -32.68750762939453, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.6461471319198608, + "rewards/margins": -0.09003150463104248, + "rewards/rejected": -0.5561156868934631, + "step": 432 + }, + { + "epoch": 0.41, + "grad_norm": 15.3414888381958, + "learning_rate": 4.798880727527107e-07, + "logps/chosen": -33.04411315917969, + "logps/rejected": -49.62543487548828, + "loss": 0.5423, + "losses/dpo": 0.36758744716644287, + "losses/sft": 1.2707054615020752, + "losses/total": 0.36758744716644287, + "ref_logps/chosen": -26.833080291748047, + "ref_logps/rejected": -38.800392150878906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6211034059524536, + "rewards/margins": 0.46140095591545105, + "rewards/rejected": -1.0825042724609375, + "step": 433 + }, + { + "epoch": 0.41, + "grad_norm": 17.68614959716797, + "learning_rate": 4.797131864288212e-07, + "logps/chosen": -42.37980270385742, + "logps/rejected": -47.62135314941406, + "loss": 0.602, + "losses/dpo": 0.713486909866333, + "losses/sft": 1.7730909585952759, + "losses/total": 0.713486909866333, + "ref_logps/chosen": -35.75381851196289, + "ref_logps/rejected": -38.149776458740234, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6625986099243164, + "rewards/margins": 0.28455933928489685, + "rewards/rejected": -0.9471579194068909, + "step": 434 + }, + { + "epoch": 0.41, + "grad_norm": 17.736204147338867, + "learning_rate": 4.795383001049318e-07, + "logps/chosen": -43.941314697265625, + "logps/rejected": -56.237457275390625, + "loss": 0.589, + "losses/dpo": 0.7260459065437317, + "losses/sft": 1.7363533973693848, + "losses/total": 0.7260459065437317, + "ref_logps/chosen": -36.130069732666016, + "ref_logps/rejected": -43.96063995361328, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7811243534088135, + "rewards/margins": 0.4465577006340027, + "rewards/rejected": -1.227682113647461, + "step": 435 + }, + { + "epoch": 0.41, + "grad_norm": 20.706064224243164, + "learning_rate": 4.793634137810423e-07, + "logps/chosen": -52.48896026611328, + "logps/rejected": -58.83586120605469, + "loss": 0.6204, + "losses/dpo": 0.5095518827438354, + "losses/sft": 1.1213394403457642, + "losses/total": 0.5095518827438354, + "ref_logps/chosen": -42.20005416870117, + "ref_logps/rejected": -46.325347900390625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0288909673690796, + "rewards/margins": 0.22216051816940308, + "rewards/rejected": -1.251051425933838, + "step": 436 + }, + { + "epoch": 0.41, + "grad_norm": 20.205711364746094, + "learning_rate": 4.791885274571528e-07, + "logps/chosen": -41.39351272583008, + "logps/rejected": -49.85081481933594, + "loss": 0.6644, + "losses/dpo": 0.48256945610046387, + "losses/sft": 1.8185701370239258, + "losses/total": 0.48256945610046387, + "ref_logps/chosen": -33.18410110473633, + "ref_logps/rejected": -40.282981872558594, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8209414482116699, + "rewards/margins": 0.13584190607070923, + "rewards/rejected": -0.9567833542823792, + "step": 437 + }, + { + "epoch": 0.41, + "grad_norm": 19.14922332763672, + "learning_rate": 4.790136411332634e-07, + "logps/chosen": -42.25700378417969, + "logps/rejected": -54.612281799316406, + "loss": 0.6119, + "losses/dpo": 0.45903971791267395, + "losses/sft": 1.8760770559310913, + "losses/total": 0.45903971791267395, + "ref_logps/chosen": -33.964603424072266, + "ref_logps/rejected": -43.75865173339844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8292405605316162, + "rewards/margins": 0.2561222314834595, + "rewards/rejected": -1.0853627920150757, + "step": 438 + }, + { + "epoch": 0.41, + "grad_norm": 16.795570373535156, + "learning_rate": 4.788387548093739e-07, + "logps/chosen": -47.49351501464844, + "logps/rejected": -68.00299072265625, + "loss": 0.5174, + "losses/dpo": 0.5840404033660889, + "losses/sft": 1.597842812538147, + "losses/total": 0.5840404033660889, + "ref_logps/chosen": -41.466434478759766, + "ref_logps/rejected": -56.268890380859375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6027083396911621, + "rewards/margins": 0.5707011222839355, + "rewards/rejected": -1.1734094619750977, + "step": 439 + }, + { + "epoch": 0.42, + "grad_norm": 18.438379287719727, + "learning_rate": 4.786638684854844e-07, + "logps/chosen": -37.86119842529297, + "logps/rejected": -55.4619140625, + "loss": 0.5908, + "losses/dpo": 0.5164816379547119, + "losses/sft": 1.6231499910354614, + "losses/total": 0.5164816379547119, + "ref_logps/chosen": -31.267627716064453, + "ref_logps/rejected": -45.998756408691406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6593573093414307, + "rewards/margins": 0.286958247423172, + "rewards/rejected": -0.946315586566925, + "step": 440 + }, + { + "epoch": 0.42, + "grad_norm": 22.680397033691406, + "learning_rate": 4.784889821615949e-07, + "logps/chosen": -47.43533706665039, + "logps/rejected": -46.34278869628906, + "loss": 0.7527, + "losses/dpo": 0.5242143869400024, + "losses/sft": 1.235299825668335, + "losses/total": 0.5242143869400024, + "ref_logps/chosen": -37.58125305175781, + "ref_logps/rejected": -37.05174255371094, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9854083061218262, + "rewards/margins": -0.05630387365818024, + "rewards/rejected": -0.9291044473648071, + "step": 441 + }, + { + "epoch": 0.42, + "grad_norm": 15.34579086303711, + "learning_rate": 4.783140958377055e-07, + "logps/chosen": -38.032371520996094, + "logps/rejected": -58.148345947265625, + "loss": 0.5269, + "losses/dpo": 0.3750840127468109, + "losses/sft": 1.2935688495635986, + "losses/total": 0.3750840127468109, + "ref_logps/chosen": -31.074722290039062, + "ref_logps/rejected": -46.435943603515625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6957647800445557, + "rewards/margins": 0.4754754602909088, + "rewards/rejected": -1.171240210533142, + "step": 442 + }, + { + "epoch": 0.42, + "grad_norm": 14.68557357788086, + "learning_rate": 4.78139209513816e-07, + "logps/chosen": -40.66063690185547, + "logps/rejected": -54.25035858154297, + "loss": 0.4711, + "losses/dpo": 0.5740063786506653, + "losses/sft": 1.3817611932754517, + "losses/total": 0.5740063786506653, + "ref_logps/chosen": -35.359954833984375, + "ref_logps/rejected": -42.793907165527344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5300679802894592, + "rewards/margins": 0.6155773401260376, + "rewards/rejected": -1.1456453800201416, + "step": 443 + }, + { + "epoch": 0.42, + "grad_norm": 18.490785598754883, + "learning_rate": 4.779643231899265e-07, + "logps/chosen": -41.23979949951172, + "logps/rejected": -56.92577362060547, + "loss": 0.5916, + "losses/dpo": 0.555675745010376, + "losses/sft": 1.7466236352920532, + "losses/total": 0.555675745010376, + "ref_logps/chosen": -32.979679107666016, + "ref_logps/rejected": -45.45464324951172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8260120153427124, + "rewards/margins": 0.3211010694503784, + "rewards/rejected": -1.1471130847930908, + "step": 444 + }, + { + "epoch": 0.42, + "grad_norm": 16.052154541015625, + "learning_rate": 4.777894368660371e-07, + "logps/chosen": -38.793434143066406, + "logps/rejected": -45.60943603515625, + "loss": 0.6371, + "losses/dpo": 0.7017737627029419, + "losses/sft": 1.5881775617599487, + "losses/total": 0.7017737627029419, + "ref_logps/chosen": -30.47211265563965, + "ref_logps/rejected": -34.91651916503906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8321321606636047, + "rewards/margins": 0.23715932667255402, + "rewards/rejected": -1.069291591644287, + "step": 445 + }, + { + "epoch": 0.42, + "grad_norm": 19.183164596557617, + "learning_rate": 4.776145505421476e-07, + "logps/chosen": -47.78739929199219, + "logps/rejected": -57.933128356933594, + "loss": 0.5936, + "losses/dpo": 0.41501307487487793, + "losses/sft": 1.1630043983459473, + "losses/total": 0.41501307487487793, + "ref_logps/chosen": -40.3790397644043, + "ref_logps/rejected": -47.480621337890625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7408355474472046, + "rewards/margins": 0.30441516637802124, + "rewards/rejected": -1.045250654220581, + "step": 446 + }, + { + "epoch": 0.42, + "grad_norm": 19.889230728149414, + "learning_rate": 4.774396642182581e-07, + "logps/chosen": -49.03253936767578, + "logps/rejected": -60.4730339050293, + "loss": 0.6154, + "losses/dpo": 0.5732200145721436, + "losses/sft": 1.8470089435577393, + "losses/total": 0.5732200145721436, + "ref_logps/chosen": -38.5589714050293, + "ref_logps/rejected": -46.730350494384766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0473569631576538, + "rewards/margins": 0.32691168785095215, + "rewards/rejected": -1.374268651008606, + "step": 447 + }, + { + "epoch": 0.42, + "grad_norm": 22.205015182495117, + "learning_rate": 4.772647778943686e-07, + "logps/chosen": -52.47529983520508, + "logps/rejected": -57.048583984375, + "loss": 0.6867, + "losses/dpo": 0.43346765637397766, + "losses/sft": 1.9497685432434082, + "losses/total": 0.43346765637397766, + "ref_logps/chosen": -43.684791564941406, + "ref_logps/rejected": -46.37071990966797, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8790508508682251, + "rewards/margins": 0.18873567879199982, + "rewards/rejected": -1.067786455154419, + "step": 448 + }, + { + "epoch": 0.42, + "grad_norm": 21.74654197692871, + "learning_rate": 4.770898915704791e-07, + "logps/chosen": -55.387115478515625, + "logps/rejected": -53.995975494384766, + "loss": 0.6333, + "losses/dpo": 0.7112340927124023, + "losses/sft": 2.041130781173706, + "losses/total": 0.7112340927124023, + "ref_logps/chosen": -46.39142608642578, + "ref_logps/rejected": -42.75465774536133, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8995682597160339, + "rewards/margins": 0.22456341981887817, + "rewards/rejected": -1.124131679534912, + "step": 449 + }, + { + "epoch": 0.42, + "grad_norm": 18.93586540222168, + "learning_rate": 4.769150052465897e-07, + "logps/chosen": -51.54730224609375, + "logps/rejected": -55.797611236572266, + "loss": 0.6069, + "losses/dpo": 0.6966808438301086, + "losses/sft": 1.768096923828125, + "losses/total": 0.6966808438301086, + "ref_logps/chosen": -42.923240661621094, + "ref_logps/rejected": -44.025184631347656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8624064922332764, + "rewards/margins": 0.31483587622642517, + "rewards/rejected": -1.1772422790527344, + "step": 450 + }, + { + "epoch": 0.43, + "grad_norm": 20.891456604003906, + "learning_rate": 4.767401189227002e-07, + "logps/chosen": -51.11573028564453, + "logps/rejected": -53.578433990478516, + "loss": 0.6487, + "losses/dpo": 0.5291703939437866, + "losses/sft": 1.383335828781128, + "losses/total": 0.5291703939437866, + "ref_logps/chosen": -42.64189147949219, + "ref_logps/rejected": -42.661312103271484, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8473836779594421, + "rewards/margins": 0.2443285584449768, + "rewards/rejected": -1.091712236404419, + "step": 451 + }, + { + "epoch": 0.43, + "grad_norm": 14.952369689941406, + "learning_rate": 4.7656523259881074e-07, + "logps/chosen": -45.62141418457031, + "logps/rejected": -57.92384338378906, + "loss": 0.4864, + "losses/dpo": 0.56844162940979, + "losses/sft": 1.3780332803726196, + "losses/total": 0.56844162940979, + "ref_logps/chosen": -40.546173095703125, + "ref_logps/rejected": -47.22905731201172, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5075246095657349, + "rewards/margins": 0.5619542598724365, + "rewards/rejected": -1.0694787502288818, + "step": 452 + }, + { + "epoch": 0.43, + "grad_norm": 16.49131965637207, + "learning_rate": 4.763903462749213e-07, + "logps/chosen": -35.94932556152344, + "logps/rejected": -43.545989990234375, + "loss": 0.5882, + "losses/dpo": 0.7098357677459717, + "losses/sft": 1.9436204433441162, + "losses/total": 0.7098357677459717, + "ref_logps/chosen": -30.23165512084961, + "ref_logps/rejected": -34.85575866699219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5717674493789673, + "rewards/margins": 0.29725557565689087, + "rewards/rejected": -0.8690229654312134, + "step": 453 + }, + { + "epoch": 0.43, + "grad_norm": 18.1254825592041, + "learning_rate": 4.762154599510318e-07, + "logps/chosen": -56.57024383544922, + "logps/rejected": -73.067138671875, + "loss": 0.5017, + "losses/dpo": 0.29225221276283264, + "losses/sft": 1.9481486082077026, + "losses/total": 0.29225221276283264, + "ref_logps/chosen": -46.44720458984375, + "ref_logps/rejected": -57.4094123840332, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0123039484024048, + "rewards/margins": 0.5534693598747253, + "rewards/rejected": -1.5657734870910645, + "step": 454 + }, + { + "epoch": 0.43, + "grad_norm": 17.663000106811523, + "learning_rate": 4.7604057362714233e-07, + "logps/chosen": -43.21162033081055, + "logps/rejected": -50.22489547729492, + "loss": 0.564, + "losses/dpo": 0.36530470848083496, + "losses/sft": 1.479834794998169, + "losses/total": 0.36530470848083496, + "ref_logps/chosen": -35.02986526489258, + "ref_logps/rejected": -38.19993591308594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8181754946708679, + "rewards/margins": 0.3843207061290741, + "rewards/rejected": -1.2024962902069092, + "step": 455 + }, + { + "epoch": 0.43, + "grad_norm": 21.488021850585938, + "learning_rate": 4.758656873032529e-07, + "logps/chosen": -48.68894577026367, + "logps/rejected": -58.52639389038086, + "loss": 0.6352, + "losses/dpo": 0.5732907056808472, + "losses/sft": 1.8608347177505493, + "losses/total": 0.5732907056808472, + "ref_logps/chosen": -41.12689971923828, + "ref_logps/rejected": -47.08283615112305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7562047243118286, + "rewards/margins": 0.38815122842788696, + "rewards/rejected": -1.1443560123443604, + "step": 456 + }, + { + "epoch": 0.43, + "grad_norm": 16.04104995727539, + "learning_rate": 4.7569080097936335e-07, + "logps/chosen": -35.751747131347656, + "logps/rejected": -48.58073806762695, + "loss": 0.5742, + "losses/dpo": 0.4595525860786438, + "losses/sft": 1.1189483404159546, + "losses/total": 0.4595525860786438, + "ref_logps/chosen": -28.997360229492188, + "ref_logps/rejected": -38.64642333984375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6754388213157654, + "rewards/margins": 0.3179926872253418, + "rewards/rejected": -0.9934315085411072, + "step": 457 + }, + { + "epoch": 0.43, + "grad_norm": 22.913747787475586, + "learning_rate": 4.755159146554739e-07, + "logps/chosen": -51.341373443603516, + "logps/rejected": -56.24919891357422, + "loss": 0.6936, + "losses/dpo": 0.670680582523346, + "losses/sft": 1.5938969850540161, + "losses/total": 0.670680582523346, + "ref_logps/chosen": -42.247589111328125, + "ref_logps/rejected": -45.67637252807617, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9093787670135498, + "rewards/margins": 0.14790397882461548, + "rewards/rejected": -1.0572826862335205, + "step": 458 + }, + { + "epoch": 0.43, + "grad_norm": 21.591320037841797, + "learning_rate": 4.753410283315845e-07, + "logps/chosen": -48.552101135253906, + "logps/rejected": -71.198974609375, + "loss": 0.5838, + "losses/dpo": 0.5087445974349976, + "losses/sft": 1.36105477809906, + "losses/total": 0.5087445974349976, + "ref_logps/chosen": -38.676673889160156, + "ref_logps/rejected": -58.15402603149414, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9875422716140747, + "rewards/margins": 0.31695204973220825, + "rewards/rejected": -1.3044943809509277, + "step": 459 + }, + { + "epoch": 0.43, + "grad_norm": 18.097824096679688, + "learning_rate": 4.75166142007695e-07, + "logps/chosen": -49.69443893432617, + "logps/rejected": -55.27891159057617, + "loss": 0.618, + "losses/dpo": 0.33629435300827026, + "losses/sft": 1.4625208377838135, + "losses/total": 0.33629435300827026, + "ref_logps/chosen": -42.104835510253906, + "ref_logps/rejected": -44.32172393798828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7589606046676636, + "rewards/margins": 0.3367578387260437, + "rewards/rejected": -1.0957183837890625, + "step": 460 + }, + { + "epoch": 0.44, + "grad_norm": 17.464462280273438, + "learning_rate": 4.749912556838055e-07, + "logps/chosen": -33.44009780883789, + "logps/rejected": -50.5752067565918, + "loss": 0.6021, + "losses/dpo": 0.4094173312187195, + "losses/sft": 1.7006267309188843, + "losses/total": 0.4094173312187195, + "ref_logps/chosen": -24.201457977294922, + "ref_logps/rejected": -38.730926513671875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9238638877868652, + "rewards/margins": 0.2605639100074768, + "rewards/rejected": -1.1844277381896973, + "step": 461 + }, + { + "epoch": 0.44, + "grad_norm": 22.261863708496094, + "learning_rate": 4.74816369359916e-07, + "logps/chosen": -44.44083023071289, + "logps/rejected": -42.3175048828125, + "loss": 0.7428, + "losses/dpo": 0.6097182035446167, + "losses/sft": 1.3318395614624023, + "losses/total": 0.6097182035446167, + "ref_logps/chosen": -36.43798065185547, + "ref_logps/rejected": -33.59527587890625, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.8002848625183105, + "rewards/margins": 0.07193787395954132, + "rewards/rejected": -0.8722227811813354, + "step": 462 + }, + { + "epoch": 0.44, + "grad_norm": 17.269041061401367, + "learning_rate": 4.746414830360266e-07, + "logps/chosen": -42.34362030029297, + "logps/rejected": -50.94217300415039, + "loss": 0.6089, + "losses/dpo": 0.5902761220932007, + "losses/sft": 1.8555960655212402, + "losses/total": 0.5902761220932007, + "ref_logps/chosen": -34.671905517578125, + "ref_logps/rejected": -40.2083740234375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7671712636947632, + "rewards/margins": 0.30620861053466797, + "rewards/rejected": -1.0733798742294312, + "step": 463 + }, + { + "epoch": 0.44, + "grad_norm": 17.21843719482422, + "learning_rate": 4.7446659671213705e-07, + "logps/chosen": -40.25098419189453, + "logps/rejected": -46.70378112792969, + "loss": 0.6523, + "losses/dpo": 0.7448264360427856, + "losses/sft": 1.8023455142974854, + "losses/total": 0.7448264360427856, + "ref_logps/chosen": -31.76789665222168, + "ref_logps/rejected": -35.90791320800781, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8483086824417114, + "rewards/margins": 0.2312774658203125, + "rewards/rejected": -1.0795860290527344, + "step": 464 + }, + { + "epoch": 0.44, + "grad_norm": 21.60810089111328, + "learning_rate": 4.742917103882476e-07, + "logps/chosen": -45.472679138183594, + "logps/rejected": -46.75816345214844, + "loss": 0.7712, + "losses/dpo": 0.5438826084136963, + "losses/sft": 1.6787500381469727, + "losses/total": 0.5438826084136963, + "ref_logps/chosen": -34.86170196533203, + "ref_logps/rejected": -36.738441467285156, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0610976219177246, + "rewards/margins": -0.0591253936290741, + "rewards/rejected": -1.0019723176956177, + "step": 465 + }, + { + "epoch": 0.44, + "grad_norm": 16.170509338378906, + "learning_rate": 4.741168240643582e-07, + "logps/chosen": -39.71556854248047, + "logps/rejected": -46.69499969482422, + "loss": 0.5679, + "losses/dpo": 0.4159000515937805, + "losses/sft": 0.8769636750221252, + "losses/total": 0.4159000515937805, + "ref_logps/chosen": -33.49738311767578, + "ref_logps/rejected": -36.21461486816406, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6218188405036926, + "rewards/margins": 0.4262195825576782, + "rewards/rejected": -1.0480384826660156, + "step": 466 + }, + { + "epoch": 0.44, + "grad_norm": 19.162927627563477, + "learning_rate": 4.739419377404687e-07, + "logps/chosen": -41.015533447265625, + "logps/rejected": -48.816219329833984, + "loss": 0.661, + "losses/dpo": 0.8000115156173706, + "losses/sft": 1.8277498483657837, + "losses/total": 0.8000115156173706, + "ref_logps/chosen": -32.41111373901367, + "ref_logps/rejected": -37.9894905090332, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.860442042350769, + "rewards/margins": 0.2222307026386261, + "rewards/rejected": -1.0826727151870728, + "step": 467 + }, + { + "epoch": 0.44, + "grad_norm": 20.709863662719727, + "learning_rate": 4.737670514165792e-07, + "logps/chosen": -52.0721321105957, + "logps/rejected": -68.4088134765625, + "loss": 0.5521, + "losses/dpo": 0.37309297919273376, + "losses/sft": 1.539156198501587, + "losses/total": 0.37309297919273376, + "ref_logps/chosen": -44.31230163574219, + "ref_logps/rejected": -55.56413269042969, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7759832143783569, + "rewards/margins": 0.5084854960441589, + "rewards/rejected": -1.284468650817871, + "step": 468 + }, + { + "epoch": 0.44, + "grad_norm": 23.58916473388672, + "learning_rate": 4.735921650926897e-07, + "logps/chosen": -48.31652069091797, + "logps/rejected": -53.9385871887207, + "loss": 0.7458, + "losses/dpo": 0.7906002998352051, + "losses/sft": 1.6727759838104248, + "losses/total": 0.7906002998352051, + "ref_logps/chosen": -36.828189849853516, + "ref_logps/rejected": -42.351768493652344, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.1488333940505981, + "rewards/margins": 0.009848490357398987, + "rewards/rejected": -1.1586819887161255, + "step": 469 + }, + { + "epoch": 0.44, + "grad_norm": 19.42133903503418, + "learning_rate": 4.734172787688003e-07, + "logps/chosen": -45.268280029296875, + "logps/rejected": -64.12366485595703, + "loss": 0.5553, + "losses/dpo": 0.5998967289924622, + "losses/sft": 1.2374427318572998, + "losses/total": 0.5998967289924622, + "ref_logps/chosen": -36.237701416015625, + "ref_logps/rejected": -50.12152862548828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.903057873249054, + "rewards/margins": 0.49715644121170044, + "rewards/rejected": -1.4002143144607544, + "step": 470 + }, + { + "epoch": 0.44, + "grad_norm": 26.29847526550293, + "learning_rate": 4.7324239244491074e-07, + "logps/chosen": -60.17868423461914, + "logps/rejected": -65.33236694335938, + "loss": 0.8158, + "losses/dpo": 0.7402694225311279, + "losses/sft": 1.5119843482971191, + "losses/total": 0.7402694225311279, + "ref_logps/chosen": -45.775604248046875, + "ref_logps/rejected": -51.926429748535156, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4403083324432373, + "rewards/margins": -0.0997144877910614, + "rewards/rejected": -1.3405938148498535, + "step": 471 + }, + { + "epoch": 0.45, + "grad_norm": 19.726667404174805, + "learning_rate": 4.730675061210213e-07, + "logps/chosen": -40.36116027832031, + "logps/rejected": -48.045719146728516, + "loss": 0.6722, + "losses/dpo": 0.5413148999214172, + "losses/sft": 1.5484027862548828, + "losses/total": 0.5413148999214172, + "ref_logps/chosen": -30.498477935791016, + "ref_logps/rejected": -35.477359771728516, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9862680435180664, + "rewards/margins": 0.2705676257610321, + "rewards/rejected": -1.256835699081421, + "step": 472 + }, + { + "epoch": 0.45, + "grad_norm": 19.40871238708496, + "learning_rate": 4.7289261979713187e-07, + "logps/chosen": -49.310821533203125, + "logps/rejected": -47.04783630371094, + "loss": 0.6942, + "losses/dpo": 0.9769923686981201, + "losses/sft": 2.333613872528076, + "losses/total": 0.9769923686981201, + "ref_logps/chosen": -40.01148986816406, + "ref_logps/rejected": -36.74407958984375, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9299333095550537, + "rewards/margins": 0.10044249892234802, + "rewards/rejected": -1.0303759574890137, + "step": 473 + }, + { + "epoch": 0.45, + "grad_norm": 18.19770050048828, + "learning_rate": 4.727177334732424e-07, + "logps/chosen": -42.30438232421875, + "logps/rejected": -55.242313385009766, + "loss": 0.4734, + "losses/dpo": 0.7860375642776489, + "losses/sft": 1.4256479740142822, + "losses/total": 0.7860375642776489, + "ref_logps/chosen": -34.156639099121094, + "ref_logps/rejected": -39.7231559753418, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8147742748260498, + "rewards/margins": 0.7371412515640259, + "rewards/rejected": -1.5519155263900757, + "step": 474 + }, + { + "epoch": 0.45, + "grad_norm": 18.775230407714844, + "learning_rate": 4.725428471493529e-07, + "logps/chosen": -46.815521240234375, + "logps/rejected": -63.159793853759766, + "loss": 0.6024, + "losses/dpo": 0.598051905632019, + "losses/sft": 1.55747389793396, + "losses/total": 0.598051905632019, + "ref_logps/chosen": -37.30512237548828, + "ref_logps/rejected": -50.9106330871582, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.951040506362915, + "rewards/margins": 0.27387550473213196, + "rewards/rejected": -1.2249159812927246, + "step": 475 + }, + { + "epoch": 0.45, + "grad_norm": 15.7463960647583, + "learning_rate": 4.723679608254634e-07, + "logps/chosen": -35.400634765625, + "logps/rejected": -49.15324401855469, + "loss": 0.5159, + "losses/dpo": 0.8908594846725464, + "losses/sft": 1.7306084632873535, + "losses/total": 0.8908594846725464, + "ref_logps/chosen": -28.968732833862305, + "ref_logps/rejected": -36.71923065185547, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6431903839111328, + "rewards/margins": 0.6002112627029419, + "rewards/rejected": -1.2434017658233643, + "step": 476 + }, + { + "epoch": 0.45, + "grad_norm": 20.361690521240234, + "learning_rate": 4.72193074501574e-07, + "logps/chosen": -49.52460479736328, + "logps/rejected": -58.65748977661133, + "loss": 0.6242, + "losses/dpo": 0.871837854385376, + "losses/sft": 1.6254528760910034, + "losses/total": 0.871837854385376, + "ref_logps/chosen": -41.26051330566406, + "ref_logps/rejected": -46.249664306640625, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8264090418815613, + "rewards/margins": 0.4143737554550171, + "rewards/rejected": -1.2407827377319336, + "step": 477 + }, + { + "epoch": 0.45, + "grad_norm": 17.02433204650879, + "learning_rate": 4.7201818817768444e-07, + "logps/chosen": -33.82848358154297, + "logps/rejected": -38.8887939453125, + "loss": 0.6757, + "losses/dpo": 0.5560075044631958, + "losses/sft": 1.0706725120544434, + "losses/total": 0.5560075044631958, + "ref_logps/chosen": -26.161861419677734, + "ref_logps/rejected": -30.305509567260742, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7666621208190918, + "rewards/margins": 0.09166643023490906, + "rewards/rejected": -0.858328640460968, + "step": 478 + }, + { + "epoch": 0.45, + "grad_norm": 18.766489028930664, + "learning_rate": 4.71843301853795e-07, + "logps/chosen": -45.95814514160156, + "logps/rejected": -52.7591552734375, + "loss": 0.5795, + "losses/dpo": 0.5951334834098816, + "losses/sft": 1.0161874294281006, + "losses/total": 0.5951334834098816, + "ref_logps/chosen": -38.287296295166016, + "ref_logps/rejected": -42.2863883972168, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.767085075378418, + "rewards/margins": 0.2801916301250458, + "rewards/rejected": -1.0472767353057861, + "step": 479 + }, + { + "epoch": 0.45, + "grad_norm": 17.539127349853516, + "learning_rate": 4.7166841552990557e-07, + "logps/chosen": -36.53566360473633, + "logps/rejected": -54.15878677368164, + "loss": 0.6424, + "losses/dpo": 0.7673262357711792, + "losses/sft": 1.7970187664031982, + "losses/total": 0.7673262357711792, + "ref_logps/chosen": -28.678905487060547, + "ref_logps/rejected": -43.651466369628906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.785676121711731, + "rewards/margins": 0.2650558054447174, + "rewards/rejected": -1.050731897354126, + "step": 480 + }, + { + "epoch": 0.45, + "grad_norm": 19.44285011291504, + "learning_rate": 4.714935292060161e-07, + "logps/chosen": -41.32199478149414, + "logps/rejected": -59.14490509033203, + "loss": 0.5627, + "losses/dpo": 0.45880869030952454, + "losses/sft": 1.7741310596466064, + "losses/total": 0.45880869030952454, + "ref_logps/chosen": -33.38575744628906, + "ref_logps/rejected": -47.139739990234375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.793623685836792, + "rewards/margins": 0.40689271688461304, + "rewards/rejected": -1.2005164623260498, + "step": 481 + }, + { + "epoch": 0.46, + "grad_norm": 15.83872127532959, + "learning_rate": 4.713186428821266e-07, + "logps/chosen": -45.21366882324219, + "logps/rejected": -60.75635528564453, + "loss": 0.5274, + "losses/dpo": 0.4298006296157837, + "losses/sft": 1.198513388633728, + "losses/total": 0.4298006296157837, + "ref_logps/chosen": -37.90647506713867, + "ref_logps/rejected": -47.69110870361328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7307194471359253, + "rewards/margins": 0.5758055448532104, + "rewards/rejected": -1.3065249919891357, + "step": 482 + }, + { + "epoch": 0.46, + "grad_norm": 14.492478370666504, + "learning_rate": 4.711437565582371e-07, + "logps/chosen": -38.78242492675781, + "logps/rejected": -44.837562561035156, + "loss": 0.5355, + "losses/dpo": 0.5942108631134033, + "losses/sft": 1.7184935808181763, + "losses/total": 0.5942108631134033, + "ref_logps/chosen": -32.85670471191406, + "ref_logps/rejected": -34.80342102050781, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5925720930099487, + "rewards/margins": 0.41084229946136475, + "rewards/rejected": -1.0034143924713135, + "step": 483 + }, + { + "epoch": 0.46, + "grad_norm": 18.78915786743164, + "learning_rate": 4.7096887023434767e-07, + "logps/chosen": -48.04331970214844, + "logps/rejected": -68.16919708251953, + "loss": 0.5494, + "losses/dpo": 0.6037707328796387, + "losses/sft": 1.6506601572036743, + "losses/total": 0.6037707328796387, + "ref_logps/chosen": -38.50733184814453, + "ref_logps/rejected": -54.90313720703125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9535988569259644, + "rewards/margins": 0.3730068802833557, + "rewards/rejected": -1.3266057968139648, + "step": 484 + }, + { + "epoch": 0.46, + "grad_norm": 19.55449104309082, + "learning_rate": 4.707939839104582e-07, + "logps/chosen": -54.59739685058594, + "logps/rejected": -56.72189712524414, + "loss": 0.6101, + "losses/dpo": 0.4871029853820801, + "losses/sft": 1.7814446687698364, + "losses/total": 0.4871029853820801, + "ref_logps/chosen": -46.157718658447266, + "ref_logps/rejected": -45.65517044067383, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8439677953720093, + "rewards/margins": 0.2627049684524536, + "rewards/rejected": -1.106672763824463, + "step": 485 + }, + { + "epoch": 0.46, + "grad_norm": 16.17681312561035, + "learning_rate": 4.706190975865687e-07, + "logps/chosen": -46.72882080078125, + "logps/rejected": -73.10804748535156, + "loss": 0.4612, + "losses/dpo": 0.6411159038543701, + "losses/sft": 1.7147176265716553, + "losses/total": 0.6411159038543701, + "ref_logps/chosen": -38.388999938964844, + "ref_logps/rejected": -57.14522933959961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8339822888374329, + "rewards/margins": 0.7622995376586914, + "rewards/rejected": -1.5962817668914795, + "step": 486 + }, + { + "epoch": 0.46, + "grad_norm": 23.703916549682617, + "learning_rate": 4.7044421126267926e-07, + "logps/chosen": -45.02565002441406, + "logps/rejected": -43.16627502441406, + "loss": 0.7874, + "losses/dpo": 0.5986388921737671, + "losses/sft": 1.2237434387207031, + "losses/total": 0.5986388921737671, + "ref_logps/chosen": -36.05559539794922, + "ref_logps/rejected": -34.63477325439453, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8970053195953369, + "rewards/margins": -0.0438552126288414, + "rewards/rejected": -0.8531500697135925, + "step": 487 + }, + { + "epoch": 0.46, + "grad_norm": 18.107013702392578, + "learning_rate": 4.702693249387898e-07, + "logps/chosen": -44.655616760253906, + "logps/rejected": -49.370018005371094, + "loss": 0.6378, + "losses/dpo": 0.6428918838500977, + "losses/sft": 1.8399006128311157, + "losses/total": 0.6428918838500977, + "ref_logps/chosen": -35.976295471191406, + "ref_logps/rejected": -38.152950286865234, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8679324984550476, + "rewards/margins": 0.2537745237350464, + "rewards/rejected": -1.1217069625854492, + "step": 488 + }, + { + "epoch": 0.46, + "grad_norm": 17.24370002746582, + "learning_rate": 4.700944386149003e-07, + "logps/chosen": -42.770179748535156, + "logps/rejected": -60.08184051513672, + "loss": 0.5301, + "losses/dpo": 0.5165985822677612, + "losses/sft": 1.7652552127838135, + "losses/total": 0.5165985822677612, + "ref_logps/chosen": -36.12382888793945, + "ref_logps/rejected": -48.13255310058594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.664635181427002, + "rewards/margins": 0.5302936434745789, + "rewards/rejected": -1.194928765296936, + "step": 489 + }, + { + "epoch": 0.46, + "grad_norm": 18.994253158569336, + "learning_rate": 4.699195522910108e-07, + "logps/chosen": -46.275970458984375, + "logps/rejected": -57.52937316894531, + "loss": 0.5453, + "losses/dpo": 0.6960052251815796, + "losses/sft": 1.6999146938323975, + "losses/total": 0.6960052251815796, + "ref_logps/chosen": -38.666622161865234, + "ref_logps/rejected": -43.64849853515625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.760934591293335, + "rewards/margins": 0.6271531581878662, + "rewards/rejected": -1.3880877494812012, + "step": 490 + }, + { + "epoch": 0.46, + "grad_norm": 22.115245819091797, + "learning_rate": 4.6974466596712137e-07, + "logps/chosen": -58.12640380859375, + "logps/rejected": -62.0530891418457, + "loss": 0.6813, + "losses/dpo": 0.5207221508026123, + "losses/sft": 1.4057503938674927, + "losses/total": 0.5207221508026123, + "ref_logps/chosen": -46.12263488769531, + "ref_logps/rejected": -48.4195442199707, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2003765106201172, + "rewards/margins": 0.16297802329063416, + "rewards/rejected": -1.3633546829223633, + "step": 491 + }, + { + "epoch": 0.46, + "grad_norm": 17.894325256347656, + "learning_rate": 4.695697796432319e-07, + "logps/chosen": -42.253726959228516, + "logps/rejected": -54.4864501953125, + "loss": 0.5657, + "losses/dpo": 0.5236856937408447, + "losses/sft": 1.9671494960784912, + "losses/total": 0.5236856937408447, + "ref_logps/chosen": -34.381656646728516, + "ref_logps/rejected": -42.45591354370117, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7872068881988525, + "rewards/margins": 0.415846586227417, + "rewards/rejected": -1.2030534744262695, + "step": 492 + }, + { + "epoch": 0.47, + "grad_norm": 17.1875, + "learning_rate": 4.693948933193424e-07, + "logps/chosen": -42.840702056884766, + "logps/rejected": -54.49610900878906, + "loss": 0.6294, + "losses/dpo": 0.6529833078384399, + "losses/sft": 1.8150737285614014, + "losses/total": 0.6529833078384399, + "ref_logps/chosen": -33.348411560058594, + "ref_logps/rejected": -42.27693176269531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9492291212081909, + "rewards/margins": 0.2726883888244629, + "rewards/rejected": -1.2219175100326538, + "step": 493 + }, + { + "epoch": 0.47, + "grad_norm": 15.577768325805664, + "learning_rate": 4.6922000699545296e-07, + "logps/chosen": -42.49756622314453, + "logps/rejected": -56.66531753540039, + "loss": 0.5734, + "losses/dpo": 0.44411545991897583, + "losses/sft": 1.9318451881408691, + "losses/total": 0.44411545991897583, + "ref_logps/chosen": -34.50550842285156, + "ref_logps/rejected": -44.91295623779297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7992058396339417, + "rewards/margins": 0.3760302662849426, + "rewards/rejected": -1.1752361059188843, + "step": 494 + }, + { + "epoch": 0.47, + "grad_norm": 22.485267639160156, + "learning_rate": 4.6904512067156347e-07, + "logps/chosen": -53.93192672729492, + "logps/rejected": -56.58464431762695, + "loss": 0.739, + "losses/dpo": 0.5776136517524719, + "losses/sft": 1.7007924318313599, + "losses/total": 0.5776136517524719, + "ref_logps/chosen": -44.054908752441406, + "ref_logps/rejected": -45.867652893066406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9877016544342041, + "rewards/margins": 0.0839976891875267, + "rewards/rejected": -1.0716992616653442, + "step": 495 + }, + { + "epoch": 0.47, + "grad_norm": 17.948152542114258, + "learning_rate": 4.68870234347674e-07, + "logps/chosen": -41.03163146972656, + "logps/rejected": -45.37687301635742, + "loss": 0.6257, + "losses/dpo": 0.5661336183547974, + "losses/sft": 2.003648281097412, + "losses/total": 0.5661336183547974, + "ref_logps/chosen": -34.10818099975586, + "ref_logps/rejected": -35.574745178222656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.692345380783081, + "rewards/margins": 0.28786757588386536, + "rewards/rejected": -0.9802129864692688, + "step": 496 + }, + { + "epoch": 0.47, + "grad_norm": 17.81679916381836, + "learning_rate": 4.686953480237845e-07, + "logps/chosen": -40.114295959472656, + "logps/rejected": -68.06855773925781, + "loss": 0.4938, + "losses/dpo": 0.3779289722442627, + "losses/sft": 1.258560061454773, + "losses/total": 0.3779289722442627, + "ref_logps/chosen": -34.208866119384766, + "ref_logps/rejected": -54.778663635253906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5905427932739258, + "rewards/margins": 0.7384467124938965, + "rewards/rejected": -1.3289895057678223, + "step": 497 + }, + { + "epoch": 0.47, + "grad_norm": 17.091392517089844, + "learning_rate": 4.6852046169989506e-07, + "logps/chosen": -48.91603469848633, + "logps/rejected": -78.88955688476562, + "loss": 0.494, + "losses/dpo": 0.3647302985191345, + "losses/sft": 2.0997064113616943, + "losses/total": 0.3647302985191345, + "ref_logps/chosen": -42.14565658569336, + "ref_logps/rejected": -65.46869659423828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6770377159118652, + "rewards/margins": 0.6650483012199402, + "rewards/rejected": -1.3420860767364502, + "step": 498 + }, + { + "epoch": 0.47, + "grad_norm": 19.506563186645508, + "learning_rate": 4.6834557537600557e-07, + "logps/chosen": -38.25553894042969, + "logps/rejected": -47.6988410949707, + "loss": 0.6258, + "losses/dpo": 0.4752151370048523, + "losses/sft": 1.5231549739837646, + "losses/total": 0.4752151370048523, + "ref_logps/chosen": -31.041873931884766, + "ref_logps/rejected": -37.1468391418457, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7213667035102844, + "rewards/margins": 0.33383363485336304, + "rewards/rejected": -1.0552003383636475, + "step": 499 + }, + { + "epoch": 0.47, + "grad_norm": 18.562971115112305, + "learning_rate": 4.681706890521161e-07, + "logps/chosen": -38.205718994140625, + "logps/rejected": -44.971038818359375, + "loss": 0.6131, + "losses/dpo": 0.7807100415229797, + "losses/sft": 1.3306481838226318, + "losses/total": 0.7807100415229797, + "ref_logps/chosen": -31.795337677001953, + "ref_logps/rejected": -36.20762252807617, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6410377025604248, + "rewards/margins": 0.23530375957489014, + "rewards/rejected": -0.8763414025306702, + "step": 500 + }, + { + "epoch": 0.47, + "grad_norm": 22.99134635925293, + "learning_rate": 4.6799580272822665e-07, + "logps/chosen": -55.033180236816406, + "logps/rejected": -56.21544647216797, + "loss": 0.7198, + "losses/dpo": 0.7807220816612244, + "losses/sft": 1.5503177642822266, + "losses/total": 0.7807220816612244, + "ref_logps/chosen": -43.038021087646484, + "ref_logps/rejected": -42.46408462524414, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.199515461921692, + "rewards/margins": 0.17562085390090942, + "rewards/rejected": -1.375136375427246, + "step": 501 + }, + { + "epoch": 0.47, + "grad_norm": 16.69556999206543, + "learning_rate": 4.6782091640433716e-07, + "logps/chosen": -46.3759651184082, + "logps/rejected": -60.395477294921875, + "loss": 0.5059, + "losses/dpo": 0.37844979763031006, + "losses/sft": 1.8363982439041138, + "losses/total": 0.37844979763031006, + "ref_logps/chosen": -39.96241760253906, + "ref_logps/rejected": -47.32520294189453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6413544416427612, + "rewards/margins": 0.6656726598739624, + "rewards/rejected": -1.3070271015167236, + "step": 502 + }, + { + "epoch": 0.47, + "grad_norm": 18.059375762939453, + "learning_rate": 4.676460300804477e-07, + "logps/chosen": -40.86088180541992, + "logps/rejected": -45.95768356323242, + "loss": 0.6248, + "losses/dpo": 0.8307574391365051, + "losses/sft": 1.4206961393356323, + "losses/total": 0.8307574391365051, + "ref_logps/chosen": -33.901485443115234, + "ref_logps/rejected": -36.60893249511719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6959394216537476, + "rewards/margins": 0.23893572390079498, + "rewards/rejected": -0.9348750710487366, + "step": 503 + }, + { + "epoch": 0.48, + "grad_norm": 22.766759872436523, + "learning_rate": 4.6747114375655824e-07, + "logps/chosen": -43.040191650390625, + "logps/rejected": -54.45762252807617, + "loss": 0.67, + "losses/dpo": 0.8299270868301392, + "losses/sft": 1.9540852308273315, + "losses/total": 0.8299270868301392, + "ref_logps/chosen": -35.957183837890625, + "ref_logps/rejected": -44.16240692138672, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7083003520965576, + "rewards/margins": 0.32122132182121277, + "rewards/rejected": -1.0295215845108032, + "step": 504 + }, + { + "epoch": 0.48, + "grad_norm": 17.236427307128906, + "learning_rate": 4.6729625743266875e-07, + "logps/chosen": -40.47539138793945, + "logps/rejected": -50.07622528076172, + "loss": 0.58, + "losses/dpo": 0.6723325848579407, + "losses/sft": 1.2062137126922607, + "losses/total": 0.6723325848579407, + "ref_logps/chosen": -32.4854736328125, + "ref_logps/rejected": -38.40970993041992, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7989921569824219, + "rewards/margins": 0.3676595985889435, + "rewards/rejected": -1.166651725769043, + "step": 505 + }, + { + "epoch": 0.48, + "grad_norm": 18.677513122558594, + "learning_rate": 4.6712137110877927e-07, + "logps/chosen": -50.442169189453125, + "logps/rejected": -69.31327819824219, + "loss": 0.5662, + "losses/dpo": 0.9488786458969116, + "losses/sft": 1.9949392080307007, + "losses/total": 0.9488786458969116, + "ref_logps/chosen": -42.01020812988281, + "ref_logps/rejected": -54.56141662597656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8431962132453918, + "rewards/margins": 0.631990373134613, + "rewards/rejected": -1.4751865863800049, + "step": 506 + }, + { + "epoch": 0.48, + "grad_norm": 14.27094554901123, + "learning_rate": 4.669464847848898e-07, + "logps/chosen": -41.08687973022461, + "logps/rejected": -65.14675903320312, + "loss": 0.4831, + "losses/dpo": 0.552592396736145, + "losses/sft": 1.467944860458374, + "losses/total": 0.552592396736145, + "ref_logps/chosen": -34.62055969238281, + "ref_logps/rejected": -52.06278991699219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6466319561004639, + "rewards/margins": 0.6617651581764221, + "rewards/rejected": -1.3083970546722412, + "step": 507 + }, + { + "epoch": 0.48, + "grad_norm": 20.737091064453125, + "learning_rate": 4.6677159846100035e-07, + "logps/chosen": -39.053672790527344, + "logps/rejected": -49.83109664916992, + "loss": 0.6259, + "losses/dpo": 0.44908422231674194, + "losses/sft": 1.63162362575531, + "losses/total": 0.44908422231674194, + "ref_logps/chosen": -31.550031661987305, + "ref_logps/rejected": -39.70254898071289, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7503641843795776, + "rewards/margins": 0.26249057054519653, + "rewards/rejected": -1.0128546953201294, + "step": 508 + }, + { + "epoch": 0.48, + "grad_norm": 19.570249557495117, + "learning_rate": 4.6659671213711086e-07, + "logps/chosen": -37.74221420288086, + "logps/rejected": -38.164546966552734, + "loss": 0.7403, + "losses/dpo": 0.7543376684188843, + "losses/sft": 1.7525604963302612, + "losses/total": 0.7543376684188843, + "ref_logps/chosen": -30.180740356445312, + "ref_logps/rejected": -30.425790786743164, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7561473250389099, + "rewards/margins": 0.017728038132190704, + "rewards/rejected": -0.7738754153251648, + "step": 509 + }, + { + "epoch": 0.48, + "grad_norm": 16.213655471801758, + "learning_rate": 4.6642182581322137e-07, + "logps/chosen": -43.07244110107422, + "logps/rejected": -58.89888000488281, + "loss": 0.4809, + "losses/dpo": 0.3350576162338257, + "losses/sft": 1.128604531288147, + "losses/total": 0.3350576162338257, + "ref_logps/chosen": -37.63832473754883, + "ref_logps/rejected": -47.659271240234375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5434116721153259, + "rewards/margins": 0.5805493593215942, + "rewards/rejected": -1.123961091041565, + "step": 510 + }, + { + "epoch": 0.48, + "grad_norm": 20.865148544311523, + "learning_rate": 4.6624693948933194e-07, + "logps/chosen": -49.687129974365234, + "logps/rejected": -51.68817901611328, + "loss": 0.6943, + "losses/dpo": 0.8964338302612305, + "losses/sft": 1.6238555908203125, + "losses/total": 0.8964338302612305, + "ref_logps/chosen": -40.38511657714844, + "ref_logps/rejected": -40.2294807434082, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9302010536193848, + "rewards/margins": 0.2156689167022705, + "rewards/rejected": -1.1458700895309448, + "step": 511 + }, + { + "epoch": 0.48, + "grad_norm": 17.870445251464844, + "learning_rate": 4.6607205316544245e-07, + "logps/chosen": -37.93321228027344, + "logps/rejected": -62.82341766357422, + "loss": 0.4961, + "losses/dpo": 0.618889570236206, + "losses/sft": 1.3829941749572754, + "losses/total": 0.618889570236206, + "ref_logps/chosen": -32.22320556640625, + "ref_logps/rejected": -50.822689056396484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5710002183914185, + "rewards/margins": 0.6290723085403442, + "rewards/rejected": -1.2000726461410522, + "step": 512 + }, + { + "epoch": 0.48, + "grad_norm": 17.249675750732422, + "learning_rate": 4.6589716684155296e-07, + "logps/chosen": -46.194698333740234, + "logps/rejected": -56.13873291015625, + "loss": 0.5549, + "losses/dpo": 0.5435903072357178, + "losses/sft": 1.60612952709198, + "losses/total": 0.5435903072357178, + "ref_logps/chosen": -37.873985290527344, + "ref_logps/rejected": -43.54327392578125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.83207106590271, + "rewards/margins": 0.42747408151626587, + "rewards/rejected": -1.259545087814331, + "step": 513 + }, + { + "epoch": 0.49, + "grad_norm": 19.374313354492188, + "learning_rate": 4.657222805176635e-07, + "logps/chosen": -37.62091064453125, + "logps/rejected": -51.132179260253906, + "loss": 0.6659, + "losses/dpo": 0.6505405902862549, + "losses/sft": 1.8584665060043335, + "losses/total": 0.6505405902862549, + "ref_logps/chosen": -31.067005157470703, + "ref_logps/rejected": -43.022987365722656, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6553905010223389, + "rewards/margins": 0.15552854537963867, + "rewards/rejected": -0.8109190464019775, + "step": 514 + }, + { + "epoch": 0.49, + "grad_norm": 20.405542373657227, + "learning_rate": 4.6554739419377404e-07, + "logps/chosen": -41.0093994140625, + "logps/rejected": -54.108421325683594, + "loss": 0.6408, + "losses/dpo": 0.5008844137191772, + "losses/sft": 1.376187801361084, + "losses/total": 0.5008844137191772, + "ref_logps/chosen": -34.1032600402832, + "ref_logps/rejected": -44.66871643066406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6906140446662903, + "rewards/margins": 0.253356397151947, + "rewards/rejected": -0.9439705014228821, + "step": 515 + }, + { + "epoch": 0.49, + "grad_norm": 25.345590591430664, + "learning_rate": 4.6537250786988455e-07, + "logps/chosen": -54.34903335571289, + "logps/rejected": -50.94935607910156, + "loss": 0.7937, + "losses/dpo": 0.9341498613357544, + "losses/sft": 1.5544166564941406, + "losses/total": 0.9341498613357544, + "ref_logps/chosen": -43.75737762451172, + "ref_logps/rejected": -38.691261291503906, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0591654777526855, + "rewards/margins": 0.16664430499076843, + "rewards/rejected": -1.2258098125457764, + "step": 516 + }, + { + "epoch": 0.49, + "grad_norm": 16.542613983154297, + "learning_rate": 4.6519762154599506e-07, + "logps/chosen": -41.26185607910156, + "logps/rejected": -51.66979217529297, + "loss": 0.5381, + "losses/dpo": 0.49808430671691895, + "losses/sft": 1.7250854969024658, + "losses/total": 0.49808430671691895, + "ref_logps/chosen": -35.4698600769043, + "ref_logps/rejected": -40.898475646972656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5791990160942078, + "rewards/margins": 0.4979328513145447, + "rewards/rejected": -1.0771318674087524, + "step": 517 + }, + { + "epoch": 0.49, + "grad_norm": 19.7836856842041, + "learning_rate": 4.6502273522210563e-07, + "logps/chosen": -37.76109313964844, + "logps/rejected": -51.98636245727539, + "loss": 0.6677, + "losses/dpo": 0.6548014879226685, + "losses/sft": 1.4091333150863647, + "losses/total": 0.6548014879226685, + "ref_logps/chosen": -31.213499069213867, + "ref_logps/rejected": -43.97237014770508, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6547592878341675, + "rewards/margins": 0.14663998782634735, + "rewards/rejected": -0.801399290561676, + "step": 518 + }, + { + "epoch": 0.49, + "grad_norm": 20.166194915771484, + "learning_rate": 4.6484784889821614e-07, + "logps/chosen": -50.248191833496094, + "logps/rejected": -54.06584167480469, + "loss": 0.6071, + "losses/dpo": 0.5363638997077942, + "losses/sft": 1.1510367393493652, + "losses/total": 0.5363638997077942, + "ref_logps/chosen": -42.62671661376953, + "ref_logps/rejected": -41.8621826171875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7621476650238037, + "rewards/margins": 0.4582183361053467, + "rewards/rejected": -1.2203660011291504, + "step": 519 + }, + { + "epoch": 0.49, + "grad_norm": 18.74920082092285, + "learning_rate": 4.6467296257432666e-07, + "logps/chosen": -48.60076141357422, + "logps/rejected": -69.72369384765625, + "loss": 0.4872, + "losses/dpo": 0.40776166319847107, + "losses/sft": 2.0724453926086426, + "losses/total": 0.40776166319847107, + "ref_logps/chosen": -40.401268005371094, + "ref_logps/rejected": -55.63837432861328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.819949746131897, + "rewards/margins": 0.58858323097229, + "rewards/rejected": -1.4085328578948975, + "step": 520 + }, + { + "epoch": 0.49, + "grad_norm": 15.707258224487305, + "learning_rate": 4.6449807625043717e-07, + "logps/chosen": -39.754249572753906, + "logps/rejected": -60.420013427734375, + "loss": 0.4779, + "losses/dpo": 0.3915397822856903, + "losses/sft": 1.417066216468811, + "losses/total": 0.3915397822856903, + "ref_logps/chosen": -33.275630950927734, + "ref_logps/rejected": -46.234169006347656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6478619575500488, + "rewards/margins": 0.7707231044769287, + "rewards/rejected": -1.4185853004455566, + "step": 521 + }, + { + "epoch": 0.49, + "grad_norm": 16.339000701904297, + "learning_rate": 4.6432318992654773e-07, + "logps/chosen": -48.527687072753906, + "logps/rejected": -62.224937438964844, + "loss": 0.4821, + "losses/dpo": 0.3421018421649933, + "losses/sft": 1.3392013311386108, + "losses/total": 0.3421018421649933, + "ref_logps/chosen": -40.360755920410156, + "ref_logps/rejected": -47.55398941040039, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.816693127155304, + "rewards/margins": 0.6504019498825073, + "rewards/rejected": -1.467095136642456, + "step": 522 + }, + { + "epoch": 0.49, + "grad_norm": 17.996753692626953, + "learning_rate": 4.641483036026583e-07, + "logps/chosen": -51.03308868408203, + "logps/rejected": -67.96345520019531, + "loss": 0.5056, + "losses/dpo": 0.41352635622024536, + "losses/sft": 1.479400873184204, + "losses/total": 0.41352635622024536, + "ref_logps/chosen": -41.82776641845703, + "ref_logps/rejected": -52.19953155517578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9205319881439209, + "rewards/margins": 0.6558610200881958, + "rewards/rejected": -1.5763931274414062, + "step": 523 + }, + { + "epoch": 0.49, + "grad_norm": 16.52777099609375, + "learning_rate": 4.6397341727876876e-07, + "logps/chosen": -39.58159637451172, + "logps/rejected": -51.186668395996094, + "loss": 0.542, + "losses/dpo": 0.3972024917602539, + "losses/sft": 1.2694875001907349, + "losses/total": 0.3972024917602539, + "ref_logps/chosen": -33.00318908691406, + "ref_logps/rejected": -39.64402770996094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6578408479690552, + "rewards/margins": 0.49642282724380493, + "rewards/rejected": -1.1542637348175049, + "step": 524 + }, + { + "epoch": 0.5, + "grad_norm": 17.888750076293945, + "learning_rate": 4.637985309548793e-07, + "logps/chosen": -44.566497802734375, + "logps/rejected": -52.6598014831543, + "loss": 0.5552, + "losses/dpo": 0.6064153909683228, + "losses/sft": 1.5405288934707642, + "losses/total": 0.6064153909683228, + "ref_logps/chosen": -37.5517463684082, + "ref_logps/rejected": -40.256202697753906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7014751434326172, + "rewards/margins": 0.5388847589492798, + "rewards/rejected": -1.2403597831726074, + "step": 525 + }, + { + "epoch": 0.5, + "grad_norm": 18.140491485595703, + "learning_rate": 4.6362364463098984e-07, + "logps/chosen": -41.16547393798828, + "logps/rejected": -55.885684967041016, + "loss": 0.5694, + "losses/dpo": 0.46671921014785767, + "losses/sft": 1.6265698671340942, + "losses/total": 0.46671921014785767, + "ref_logps/chosen": -32.732261657714844, + "ref_logps/rejected": -43.497371673583984, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8433213829994202, + "rewards/margins": 0.39550989866256714, + "rewards/rejected": -1.2388312816619873, + "step": 526 + }, + { + "epoch": 0.5, + "grad_norm": 20.340742111206055, + "learning_rate": 4.6344875830710035e-07, + "logps/chosen": -47.97841262817383, + "logps/rejected": -49.0335807800293, + "loss": 0.6108, + "losses/dpo": 0.7322754859924316, + "losses/sft": 1.867557406425476, + "losses/total": 0.7322754859924316, + "ref_logps/chosen": -41.427330017089844, + "ref_logps/rejected": -40.12873840332031, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6551084518432617, + "rewards/margins": 0.23537567257881165, + "rewards/rejected": -0.8904841542243958, + "step": 527 + }, + { + "epoch": 0.5, + "grad_norm": 16.094282150268555, + "learning_rate": 4.6327387198321086e-07, + "logps/chosen": -51.06538009643555, + "logps/rejected": -60.69963073730469, + "loss": 0.4921, + "losses/dpo": 0.5082216262817383, + "losses/sft": 1.6114482879638672, + "losses/total": 0.5082216262817383, + "ref_logps/chosen": -44.52245330810547, + "ref_logps/rejected": -48.39900207519531, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6542928218841553, + "rewards/margins": 0.5757702589035034, + "rewards/rejected": -1.2300630807876587, + "step": 528 + }, + { + "epoch": 0.5, + "grad_norm": 20.200944900512695, + "learning_rate": 4.6309898565932143e-07, + "logps/chosen": -43.65022277832031, + "logps/rejected": -45.25249481201172, + "loss": 0.6692, + "losses/dpo": 0.586146354675293, + "losses/sft": 1.792749285697937, + "losses/total": 0.586146354675293, + "ref_logps/chosen": -35.28498458862305, + "ref_logps/rejected": -35.23607635498047, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8365240097045898, + "rewards/margins": 0.16511788964271545, + "rewards/rejected": -1.0016417503356934, + "step": 529 + }, + { + "epoch": 0.5, + "grad_norm": 18.585430145263672, + "learning_rate": 4.62924099335432e-07, + "logps/chosen": -41.75214767456055, + "logps/rejected": -54.3496208190918, + "loss": 0.5819, + "losses/dpo": 0.6976767182350159, + "losses/sft": 1.224543571472168, + "losses/total": 0.6976767182350159, + "ref_logps/chosen": -34.77529525756836, + "ref_logps/rejected": -44.052207946777344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6976852416992188, + "rewards/margins": 0.3320556581020355, + "rewards/rejected": -1.0297410488128662, + "step": 530 + }, + { + "epoch": 0.5, + "grad_norm": 20.460107803344727, + "learning_rate": 4.6274921301154245e-07, + "logps/chosen": -46.226558685302734, + "logps/rejected": -46.63634490966797, + "loss": 0.706, + "losses/dpo": 0.5846570730209351, + "losses/sft": 1.621983289718628, + "losses/total": 0.5846570730209351, + "ref_logps/chosen": -39.506229400634766, + "ref_logps/rejected": -39.551780700683594, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.6720329523086548, + "rewards/margins": 0.03642330318689346, + "rewards/rejected": -0.70845627784729, + "step": 531 + }, + { + "epoch": 0.5, + "grad_norm": 17.80394172668457, + "learning_rate": 4.62574326687653e-07, + "logps/chosen": -40.451961517333984, + "logps/rejected": -53.589881896972656, + "loss": 0.6174, + "losses/dpo": 0.5291751027107239, + "losses/sft": 1.1986802816390991, + "losses/total": 0.5291751027107239, + "ref_logps/chosen": -33.04058837890625, + "ref_logps/rejected": -42.0926513671875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7411374449729919, + "rewards/margins": 0.4085853397846222, + "rewards/rejected": -1.149722695350647, + "step": 532 + }, + { + "epoch": 0.5, + "grad_norm": 19.09368133544922, + "learning_rate": 4.6239944036376353e-07, + "logps/chosen": -48.723331451416016, + "logps/rejected": -54.167301177978516, + "loss": 0.547, + "losses/dpo": 0.5327960252761841, + "losses/sft": 1.3891955614089966, + "losses/total": 0.5327960252761841, + "ref_logps/chosen": -41.82886505126953, + "ref_logps/rejected": -42.401832580566406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6894465684890747, + "rewards/margins": 0.48709988594055176, + "rewards/rejected": -1.176546335220337, + "step": 533 + }, + { + "epoch": 0.5, + "grad_norm": 18.902814865112305, + "learning_rate": 4.6222455403987404e-07, + "logps/chosen": -41.274105072021484, + "logps/rejected": -46.66707229614258, + "loss": 0.6539, + "losses/dpo": 0.678492546081543, + "losses/sft": 1.7505420446395874, + "losses/total": 0.678492546081543, + "ref_logps/chosen": -33.76953125, + "ref_logps/rejected": -37.19273376464844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7504576444625854, + "rewards/margins": 0.19697652757167816, + "rewards/rejected": -0.9474341869354248, + "step": 534 + }, + { + "epoch": 0.51, + "grad_norm": 16.190881729125977, + "learning_rate": 4.6204966771598456e-07, + "logps/chosen": -41.202980041503906, + "logps/rejected": -59.25920486450195, + "loss": 0.4814, + "losses/dpo": 0.5541568994522095, + "losses/sft": 1.6433279514312744, + "losses/total": 0.5541568994522095, + "ref_logps/chosen": -36.29848098754883, + "ref_logps/rejected": -48.277801513671875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4904496371746063, + "rewards/margins": 0.6076907515525818, + "rewards/rejected": -1.0981404781341553, + "step": 535 + }, + { + "epoch": 0.51, + "grad_norm": 14.119658470153809, + "learning_rate": 4.618747813920951e-07, + "logps/chosen": -36.84916687011719, + "logps/rejected": -50.92412567138672, + "loss": 0.5039, + "losses/dpo": 0.5139963626861572, + "losses/sft": 1.0659281015396118, + "losses/total": 0.5139963626861572, + "ref_logps/chosen": -32.00421142578125, + "ref_logps/rejected": -39.78274154663086, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4844956398010254, + "rewards/margins": 0.6296426057815552, + "rewards/rejected": -1.1141383647918701, + "step": 536 + }, + { + "epoch": 0.51, + "grad_norm": 17.82438087463379, + "learning_rate": 4.616998950682057e-07, + "logps/chosen": -43.560691833496094, + "logps/rejected": -48.5607795715332, + "loss": 0.6292, + "losses/dpo": 0.6040087938308716, + "losses/sft": 1.223031997680664, + "losses/total": 0.6040087938308716, + "ref_logps/chosen": -34.930362701416016, + "ref_logps/rejected": -37.556705474853516, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8630329370498657, + "rewards/margins": 0.23737439513206482, + "rewards/rejected": -1.100407361984253, + "step": 537 + }, + { + "epoch": 0.51, + "grad_norm": 18.799345016479492, + "learning_rate": 4.6152500874431615e-07, + "logps/chosen": -43.603477478027344, + "logps/rejected": -54.45125198364258, + "loss": 0.5935, + "losses/dpo": 0.5168572664260864, + "losses/sft": 1.2226108312606812, + "losses/total": 0.5168572664260864, + "ref_logps/chosen": -35.670448303222656, + "ref_logps/rejected": -43.59748077392578, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7933031916618347, + "rewards/margins": 0.29207438230514526, + "rewards/rejected": -1.08537757396698, + "step": 538 + }, + { + "epoch": 0.51, + "grad_norm": 19.961963653564453, + "learning_rate": 4.613501224204267e-07, + "logps/chosen": -39.98153305053711, + "logps/rejected": -48.5761833190918, + "loss": 0.6535, + "losses/dpo": 1.0349931716918945, + "losses/sft": 1.9321941137313843, + "losses/total": 1.0349931716918945, + "ref_logps/chosen": -33.00051498413086, + "ref_logps/rejected": -38.60032653808594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6981016397476196, + "rewards/margins": 0.2994837760925293, + "rewards/rejected": -0.9975854158401489, + "step": 539 + }, + { + "epoch": 0.51, + "grad_norm": 18.267555236816406, + "learning_rate": 4.611752360965372e-07, + "logps/chosen": -49.56752014160156, + "logps/rejected": -64.00669860839844, + "loss": 0.5189, + "losses/dpo": 0.503933846950531, + "losses/sft": 1.3660101890563965, + "losses/total": 0.503933846950531, + "ref_logps/chosen": -44.633785247802734, + "ref_logps/rejected": -53.22724151611328, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4933737516403198, + "rewards/margins": 0.584572434425354, + "rewards/rejected": -1.0779461860656738, + "step": 540 + }, + { + "epoch": 0.51, + "grad_norm": 19.289949417114258, + "learning_rate": 4.6100034977264774e-07, + "logps/chosen": -55.475101470947266, + "logps/rejected": -60.352447509765625, + "loss": 0.5869, + "losses/dpo": 0.6577602624893188, + "losses/sft": 1.8504594564437866, + "losses/total": 0.6577602624893188, + "ref_logps/chosen": -47.22643280029297, + "ref_logps/rejected": -48.38414764404297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8248670101165771, + "rewards/margins": 0.3719630837440491, + "rewards/rejected": -1.1968300342559814, + "step": 541 + }, + { + "epoch": 0.51, + "grad_norm": 20.45795440673828, + "learning_rate": 4.608254634487583e-07, + "logps/chosen": -51.12990188598633, + "logps/rejected": -53.9380989074707, + "loss": 0.5951, + "losses/dpo": 0.8269652724266052, + "losses/sft": 1.7129558324813843, + "losses/total": 0.8269652724266052, + "ref_logps/chosen": -41.75885009765625, + "ref_logps/rejected": -40.59892272949219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9371054172515869, + "rewards/margins": 0.3968122899532318, + "rewards/rejected": -1.3339176177978516, + "step": 542 + }, + { + "epoch": 0.51, + "grad_norm": 20.200122833251953, + "learning_rate": 4.606505771248688e-07, + "logps/chosen": -50.2019157409668, + "logps/rejected": -49.549659729003906, + "loss": 0.5585, + "losses/dpo": 0.6339994668960571, + "losses/sft": 2.619389772415161, + "losses/total": 0.6339994668960571, + "ref_logps/chosen": -42.71995544433594, + "ref_logps/rejected": -38.63445281982422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7481957674026489, + "rewards/margins": 0.343325138092041, + "rewards/rejected": -1.09152090549469, + "step": 543 + }, + { + "epoch": 0.51, + "grad_norm": 20.325183868408203, + "learning_rate": 4.604756908009794e-07, + "logps/chosen": -37.36647033691406, + "logps/rejected": -42.93746566772461, + "loss": 0.661, + "losses/dpo": 0.7532327175140381, + "losses/sft": 2.116211414337158, + "losses/total": 0.7532327175140381, + "ref_logps/chosen": -31.26726531982422, + "ref_logps/rejected": -34.85523223876953, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6099203824996948, + "rewards/margins": 0.19830316305160522, + "rewards/rejected": -0.8082234859466553, + "step": 544 + }, + { + "epoch": 0.51, + "grad_norm": 22.209781646728516, + "learning_rate": 4.6030080447708984e-07, + "logps/chosen": -57.72157669067383, + "logps/rejected": -63.55116271972656, + "loss": 0.5993, + "losses/dpo": 0.4275997281074524, + "losses/sft": 2.05446720123291, + "losses/total": 0.4275997281074524, + "ref_logps/chosen": -47.218875885009766, + "ref_logps/rejected": -48.9195556640625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0502698421478271, + "rewards/margins": 0.41289082169532776, + "rewards/rejected": -1.463160753250122, + "step": 545 + }, + { + "epoch": 0.52, + "grad_norm": 17.983835220336914, + "learning_rate": 4.601259181532004e-07, + "logps/chosen": -40.64090347290039, + "logps/rejected": -47.25122833251953, + "loss": 0.5603, + "losses/dpo": 0.722617506980896, + "losses/sft": 1.5367704629898071, + "losses/total": 0.722617506980896, + "ref_logps/chosen": -35.09404754638672, + "ref_logps/rejected": -37.44237518310547, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5546858310699463, + "rewards/margins": 0.4261992573738098, + "rewards/rejected": -0.9808850288391113, + "step": 546 + }, + { + "epoch": 0.52, + "grad_norm": 26.236013412475586, + "learning_rate": 4.599510318293109e-07, + "logps/chosen": -57.17491912841797, + "logps/rejected": -62.179683685302734, + "loss": 0.7702, + "losses/dpo": 0.9172338247299194, + "losses/sft": 2.0179269313812256, + "losses/total": 0.9172338247299194, + "ref_logps/chosen": -46.224395751953125, + "ref_logps/rejected": -51.54985427856445, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0950521230697632, + "rewards/margins": -0.03206905350089073, + "rewards/rejected": -1.0629830360412598, + "step": 547 + }, + { + "epoch": 0.52, + "grad_norm": 22.186540603637695, + "learning_rate": 4.5977614550542143e-07, + "logps/chosen": -44.1175651550293, + "logps/rejected": -49.125244140625, + "loss": 0.7027, + "losses/dpo": 0.675849437713623, + "losses/sft": 1.7110408544540405, + "losses/total": 0.675849437713623, + "ref_logps/chosen": -35.94295120239258, + "ref_logps/rejected": -39.02360534667969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.817461371421814, + "rewards/margins": 0.19270196557044983, + "rewards/rejected": -1.0101633071899414, + "step": 548 + }, + { + "epoch": 0.52, + "grad_norm": 15.947452545166016, + "learning_rate": 4.59601259181532e-07, + "logps/chosen": -43.255401611328125, + "logps/rejected": -68.79779052734375, + "loss": 0.414, + "losses/dpo": 0.3710100054740906, + "losses/sft": 1.706789255142212, + "losses/total": 0.3710100054740906, + "ref_logps/chosen": -36.15275573730469, + "ref_logps/rejected": -52.01953125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7102651000022888, + "rewards/margins": 0.9675604104995728, + "rewards/rejected": -1.6778254508972168, + "step": 549 + }, + { + "epoch": 0.52, + "grad_norm": 20.477737426757812, + "learning_rate": 4.594263728576425e-07, + "logps/chosen": -42.64044189453125, + "logps/rejected": -48.93083953857422, + "loss": 0.6011, + "losses/dpo": 0.6613829135894775, + "losses/sft": 1.7269814014434814, + "losses/total": 0.6613829135894775, + "ref_logps/chosen": -33.133811950683594, + "ref_logps/rejected": -35.95905685424805, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9506632089614868, + "rewards/margins": 0.3465149998664856, + "rewards/rejected": -1.2971782684326172, + "step": 550 + }, + { + "epoch": 0.52, + "grad_norm": 16.479202270507812, + "learning_rate": 4.592514865337531e-07, + "logps/chosen": -50.13810729980469, + "logps/rejected": -62.10587692260742, + "loss": 0.5175, + "losses/dpo": 0.6193148493766785, + "losses/sft": 2.2699668407440186, + "losses/total": 0.6193148493766785, + "ref_logps/chosen": -40.48619079589844, + "ref_logps/rejected": -46.197547912597656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9651917815208435, + "rewards/margins": 0.6256414651870728, + "rewards/rejected": -1.5908331871032715, + "step": 551 + }, + { + "epoch": 0.52, + "grad_norm": 17.107406616210938, + "learning_rate": 4.5907660020986354e-07, + "logps/chosen": -38.52226638793945, + "logps/rejected": -59.176597595214844, + "loss": 0.5397, + "losses/dpo": 0.48231440782546997, + "losses/sft": 1.854367733001709, + "losses/total": 0.48231440782546997, + "ref_logps/chosen": -28.64743423461914, + "ref_logps/rejected": -44.32575607299805, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9874832630157471, + "rewards/margins": 0.4976009130477905, + "rewards/rejected": -1.4850842952728271, + "step": 552 + }, + { + "epoch": 0.52, + "grad_norm": 22.987104415893555, + "learning_rate": 4.589017138859741e-07, + "logps/chosen": -50.133785247802734, + "logps/rejected": -48.91804885864258, + "loss": 0.6585, + "losses/dpo": 0.6219667196273804, + "losses/sft": 1.4681081771850586, + "losses/total": 0.6219667196273804, + "ref_logps/chosen": -39.487632751464844, + "ref_logps/rejected": -35.85594177246094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0646154880523682, + "rewards/margins": 0.24159535765647888, + "rewards/rejected": -1.3062108755111694, + "step": 553 + }, + { + "epoch": 0.52, + "grad_norm": 22.33740234375, + "learning_rate": 4.587268275620846e-07, + "logps/chosen": -53.852081298828125, + "logps/rejected": -60.23436737060547, + "loss": 0.6664, + "losses/dpo": 0.9895302057266235, + "losses/sft": 1.8812607526779175, + "losses/total": 0.9895302057266235, + "ref_logps/chosen": -41.266414642333984, + "ref_logps/rejected": -45.49755096435547, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2585668563842773, + "rewards/margins": 0.21511486172676086, + "rewards/rejected": -1.4736816883087158, + "step": 554 + }, + { + "epoch": 0.52, + "grad_norm": 24.779027938842773, + "learning_rate": 4.5855194123819513e-07, + "logps/chosen": -58.739349365234375, + "logps/rejected": -66.29627990722656, + "loss": 0.6482, + "losses/dpo": 0.1946183741092682, + "losses/sft": 1.8658313751220703, + "losses/total": 0.1946183741092682, + "ref_logps/chosen": -46.45478439331055, + "ref_logps/rejected": -49.61613845825195, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2284562587738037, + "rewards/margins": 0.43955832719802856, + "rewards/rejected": -1.6680145263671875, + "step": 555 + }, + { + "epoch": 0.53, + "grad_norm": 19.251026153564453, + "learning_rate": 4.583770549143057e-07, + "logps/chosen": -51.381412506103516, + "logps/rejected": -61.252723693847656, + "loss": 0.5333, + "losses/dpo": 0.675920307636261, + "losses/sft": 1.8289772272109985, + "losses/total": 0.675920307636261, + "ref_logps/chosen": -43.014129638671875, + "ref_logps/rejected": -47.88116455078125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8367282748222351, + "rewards/margins": 0.5004271864891052, + "rewards/rejected": -1.3371554613113403, + "step": 556 + }, + { + "epoch": 0.53, + "grad_norm": 19.037635803222656, + "learning_rate": 4.582021685904162e-07, + "logps/chosen": -42.47540283203125, + "logps/rejected": -47.087257385253906, + "loss": 0.6694, + "losses/dpo": 0.6714488863945007, + "losses/sft": 1.8966470956802368, + "losses/total": 0.6714488863945007, + "ref_logps/chosen": -33.46836853027344, + "ref_logps/rejected": -36.699928283691406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9007034301757812, + "rewards/margins": 0.13802987337112427, + "rewards/rejected": -1.0387332439422607, + "step": 557 + }, + { + "epoch": 0.53, + "grad_norm": 16.663394927978516, + "learning_rate": 4.5802728226652677e-07, + "logps/chosen": -49.5675048828125, + "logps/rejected": -66.3643569946289, + "loss": 0.4732, + "losses/dpo": 0.3888000249862671, + "losses/sft": 1.814093828201294, + "losses/total": 0.3888000249862671, + "ref_logps/chosen": -38.756683349609375, + "ref_logps/rejected": -47.82711410522461, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0810821056365967, + "rewards/margins": 0.772642195224762, + "rewards/rejected": -1.8537243604660034, + "step": 558 + }, + { + "epoch": 0.53, + "grad_norm": 14.421659469604492, + "learning_rate": 4.5785239594263723e-07, + "logps/chosen": -45.214996337890625, + "logps/rejected": -66.09407043457031, + "loss": 0.3689, + "losses/dpo": 0.4478214681148529, + "losses/sft": 1.4946962594985962, + "losses/total": 0.4478214681148529, + "ref_logps/chosen": -37.52229309082031, + "ref_logps/rejected": -46.532798767089844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.769270658493042, + "rewards/margins": 1.1868572235107422, + "rewards/rejected": -1.9561278820037842, + "step": 559 + }, + { + "epoch": 0.53, + "grad_norm": 23.122665405273438, + "learning_rate": 4.576775096187478e-07, + "logps/chosen": -56.175209045410156, + "logps/rejected": -67.87620544433594, + "loss": 0.768, + "losses/dpo": 0.6455317735671997, + "losses/sft": 1.3315482139587402, + "losses/total": 0.6455317735671997, + "ref_logps/chosen": -44.463558197021484, + "ref_logps/rejected": -54.495849609375, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1711652278900146, + "rewards/margins": 0.1668698787689209, + "rewards/rejected": -1.338035225868225, + "step": 560 + }, + { + "epoch": 0.53, + "grad_norm": 17.649150848388672, + "learning_rate": 4.5750262329485836e-07, + "logps/chosen": -41.43122100830078, + "logps/rejected": -69.98138427734375, + "loss": 0.4408, + "losses/dpo": 0.4343390464782715, + "losses/sft": 1.6604691743850708, + "losses/total": 0.4343390464782715, + "ref_logps/chosen": -32.734798431396484, + "ref_logps/rejected": -53.289634704589844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8696419596672058, + "rewards/margins": 0.799533486366272, + "rewards/rejected": -1.669175386428833, + "step": 561 + }, + { + "epoch": 0.53, + "grad_norm": 17.988561630249023, + "learning_rate": 4.573277369709688e-07, + "logps/chosen": -39.51237869262695, + "logps/rejected": -46.915992736816406, + "loss": 0.6035, + "losses/dpo": 0.6121284365653992, + "losses/sft": 1.4484940767288208, + "losses/total": 0.6121284365653992, + "ref_logps/chosen": -32.31346130371094, + "ref_logps/rejected": -36.85161590576172, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7198917865753174, + "rewards/margins": 0.28654569387435913, + "rewards/rejected": -1.0064374208450317, + "step": 562 + }, + { + "epoch": 0.53, + "grad_norm": 18.029293060302734, + "learning_rate": 4.571528506470794e-07, + "logps/chosen": -47.67314147949219, + "logps/rejected": -67.33277130126953, + "loss": 0.5051, + "losses/dpo": 0.45294126868247986, + "losses/sft": 1.9086031913757324, + "losses/total": 0.45294126868247986, + "ref_logps/chosen": -39.116432189941406, + "ref_logps/rejected": -52.71623992919922, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.855671226978302, + "rewards/margins": 0.6059819459915161, + "rewards/rejected": -1.461653232574463, + "step": 563 + }, + { + "epoch": 0.53, + "grad_norm": 14.394112586975098, + "learning_rate": 4.569779643231899e-07, + "logps/chosen": -42.31031799316406, + "logps/rejected": -54.99332046508789, + "loss": 0.4608, + "losses/dpo": 0.47964540123939514, + "losses/sft": 1.343507170677185, + "losses/total": 0.47964540123939514, + "ref_logps/chosen": -32.94147491455078, + "ref_logps/rejected": -37.29802703857422, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9368841648101807, + "rewards/margins": 0.8326447606086731, + "rewards/rejected": -1.7695289850234985, + "step": 564 + }, + { + "epoch": 0.53, + "grad_norm": 17.398134231567383, + "learning_rate": 4.5680307799930047e-07, + "logps/chosen": -41.23756408691406, + "logps/rejected": -67.37899780273438, + "loss": 0.4651, + "losses/dpo": 0.369045227766037, + "losses/sft": 1.6824448108673096, + "losses/total": 0.369045227766037, + "ref_logps/chosen": -33.456703186035156, + "ref_logps/rejected": -51.3763427734375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7780860662460327, + "rewards/margins": 0.8221798539161682, + "rewards/rejected": -1.6002659797668457, + "step": 565 + }, + { + "epoch": 0.53, + "grad_norm": 28.186016082763672, + "learning_rate": 4.566281916754109e-07, + "logps/chosen": -47.245662689208984, + "logps/rejected": -49.52615737915039, + "loss": 0.781, + "losses/dpo": 0.6183753609657288, + "losses/sft": 1.8384548425674438, + "losses/total": 0.6183753609657288, + "ref_logps/chosen": -34.558082580566406, + "ref_logps/rejected": -36.763221740722656, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.2687578201293945, + "rewards/margins": 0.007535912096500397, + "rewards/rejected": -1.2762937545776367, + "step": 566 + }, + { + "epoch": 0.54, + "grad_norm": 25.419403076171875, + "learning_rate": 4.564533053515215e-07, + "logps/chosen": -64.38424682617188, + "logps/rejected": -77.08365631103516, + "loss": 0.791, + "losses/dpo": 0.6494208574295044, + "losses/sft": 2.211927652359009, + "losses/total": 0.6494208574295044, + "ref_logps/chosen": -49.56736755371094, + "ref_logps/rejected": -62.12215042114258, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4816880226135254, + "rewards/margins": 0.014462143182754517, + "rewards/rejected": -1.496150255203247, + "step": 567 + }, + { + "epoch": 0.54, + "grad_norm": 23.816476821899414, + "learning_rate": 4.5627841902763206e-07, + "logps/chosen": -51.96767807006836, + "logps/rejected": -59.69984817504883, + "loss": 0.6734, + "losses/dpo": 0.4443957209587097, + "losses/sft": 1.374029278755188, + "losses/total": 0.4443957209587097, + "ref_logps/chosen": -42.19104766845703, + "ref_logps/rejected": -48.227783203125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9776627421379089, + "rewards/margins": 0.1695437729358673, + "rewards/rejected": -1.1472065448760986, + "step": 568 + }, + { + "epoch": 0.54, + "grad_norm": 24.870258331298828, + "learning_rate": 4.561035327037425e-07, + "logps/chosen": -45.30424499511719, + "logps/rejected": -46.708221435546875, + "loss": 0.7158, + "losses/dpo": 1.1105490922927856, + "losses/sft": 1.3865281343460083, + "losses/total": 1.1105490922927856, + "ref_logps/chosen": -37.21491622924805, + "ref_logps/rejected": -37.429046630859375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8089326620101929, + "rewards/margins": 0.11898540705442429, + "rewards/rejected": -0.9279180765151978, + "step": 569 + }, + { + "epoch": 0.54, + "grad_norm": 14.619791030883789, + "learning_rate": 4.559286463798531e-07, + "logps/chosen": -43.014503479003906, + "logps/rejected": -67.5992431640625, + "loss": 0.3671, + "losses/dpo": 0.294307142496109, + "losses/sft": 2.140934944152832, + "losses/total": 0.294307142496109, + "ref_logps/chosen": -35.34181213378906, + "ref_logps/rejected": -49.05023956298828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7672694325447083, + "rewards/margins": 1.0876307487487793, + "rewards/rejected": -1.8549003601074219, + "step": 570 + }, + { + "epoch": 0.54, + "grad_norm": 25.318273544311523, + "learning_rate": 4.557537600559636e-07, + "logps/chosen": -51.31770324707031, + "logps/rejected": -45.34050369262695, + "loss": 0.7361, + "losses/dpo": 0.935380220413208, + "losses/sft": 1.6511880159378052, + "losses/total": 0.935380220413208, + "ref_logps/chosen": -40.02067565917969, + "ref_logps/rejected": -33.30419158935547, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.1297024488449097, + "rewards/margins": 0.07392872869968414, + "rewards/rejected": -1.2036311626434326, + "step": 571 + }, + { + "epoch": 0.54, + "grad_norm": 22.432546615600586, + "learning_rate": 4.5557887373207416e-07, + "logps/chosen": -47.067108154296875, + "logps/rejected": -48.23573303222656, + "loss": 0.6426, + "losses/dpo": 0.3802192807197571, + "losses/sft": 1.8040047883987427, + "losses/total": 0.3802192807197571, + "ref_logps/chosen": -36.93397521972656, + "ref_logps/rejected": -35.979698181152344, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0133131742477417, + "rewards/margins": 0.21229061484336853, + "rewards/rejected": -1.2256038188934326, + "step": 572 + }, + { + "epoch": 0.54, + "grad_norm": 23.75169563293457, + "learning_rate": 4.554039874081846e-07, + "logps/chosen": -47.19709777832031, + "logps/rejected": -42.59159851074219, + "loss": 0.7487, + "losses/dpo": 0.5805943012237549, + "losses/sft": 1.385767936706543, + "losses/total": 0.5805943012237549, + "ref_logps/chosen": -37.008399963378906, + "ref_logps/rejected": -32.48085403442383, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0188690423965454, + "rewards/margins": -0.007794246077537537, + "rewards/rejected": -1.0110747814178467, + "step": 573 + }, + { + "epoch": 0.54, + "grad_norm": 21.6710205078125, + "learning_rate": 4.552291010842952e-07, + "logps/chosen": -50.27872085571289, + "logps/rejected": -53.5006217956543, + "loss": 0.5499, + "losses/dpo": 0.5672124624252319, + "losses/sft": 1.839987874031067, + "losses/total": 0.5672124624252319, + "ref_logps/chosen": -39.56987762451172, + "ref_logps/rejected": -37.309051513671875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0708839893341064, + "rewards/margins": 0.5482732057571411, + "rewards/rejected": -1.619157314300537, + "step": 574 + }, + { + "epoch": 0.54, + "grad_norm": 20.268836975097656, + "learning_rate": 4.5505421476040575e-07, + "logps/chosen": -51.502044677734375, + "logps/rejected": -58.27291488647461, + "loss": 0.6278, + "losses/dpo": 0.6022443771362305, + "losses/sft": 2.057661294937134, + "losses/total": 0.6022443771362305, + "ref_logps/chosen": -41.06641387939453, + "ref_logps/rejected": -44.71141815185547, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0435630083084106, + "rewards/margins": 0.3125864863395691, + "rewards/rejected": -1.356149435043335, + "step": 575 + }, + { + "epoch": 0.54, + "grad_norm": 20.673593521118164, + "learning_rate": 4.548793284365162e-07, + "logps/chosen": -39.24158477783203, + "logps/rejected": -51.94776916503906, + "loss": 0.5591, + "losses/dpo": 0.9681692123413086, + "losses/sft": 1.5640063285827637, + "losses/total": 0.9681692123413086, + "ref_logps/chosen": -32.29969024658203, + "ref_logps/rejected": -39.803688049316406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6941896080970764, + "rewards/margins": 0.5202183723449707, + "rewards/rejected": -1.2144079208374023, + "step": 576 + }, + { + "epoch": 0.54, + "grad_norm": 25.663684844970703, + "learning_rate": 4.547044421126268e-07, + "logps/chosen": -53.03947448730469, + "logps/rejected": -58.420494079589844, + "loss": 0.6881, + "losses/dpo": 0.876889169216156, + "losses/sft": 2.2121920585632324, + "losses/total": 0.876889169216156, + "ref_logps/chosen": -41.90216827392578, + "ref_logps/rejected": -44.720306396484375, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1137303113937378, + "rewards/margins": 0.2562887668609619, + "rewards/rejected": -1.3700189590454102, + "step": 577 + }, + { + "epoch": 0.55, + "grad_norm": 21.71703338623047, + "learning_rate": 4.545295557887373e-07, + "logps/chosen": -62.46804428100586, + "logps/rejected": -72.32112121582031, + "loss": 0.5451, + "losses/dpo": 0.5514512062072754, + "losses/sft": 1.7490044832229614, + "losses/total": 0.5514512062072754, + "ref_logps/chosen": -48.041316986083984, + "ref_logps/rejected": -53.017478942871094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4426724910736084, + "rewards/margins": 0.48769164085388184, + "rewards/rejected": -1.9303641319274902, + "step": 578 + }, + { + "epoch": 0.55, + "grad_norm": 21.563751220703125, + "learning_rate": 4.5435466946484785e-07, + "logps/chosen": -47.78504943847656, + "logps/rejected": -54.838382720947266, + "loss": 0.572, + "losses/dpo": 0.5821988582611084, + "losses/sft": 1.9250428676605225, + "losses/total": 0.5821988582611084, + "ref_logps/chosen": -37.25121307373047, + "ref_logps/rejected": -39.90326690673828, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0533840656280518, + "rewards/margins": 0.44012749195098877, + "rewards/rejected": -1.49351167678833, + "step": 579 + }, + { + "epoch": 0.55, + "grad_norm": 14.213398933410645, + "learning_rate": 4.541797831409583e-07, + "logps/chosen": -38.6250114440918, + "logps/rejected": -59.392425537109375, + "loss": 0.3845, + "losses/dpo": 0.41140130162239075, + "losses/sft": 1.2234476804733276, + "losses/total": 0.41140130162239075, + "ref_logps/chosen": -33.129390716552734, + "ref_logps/rejected": -43.63618469238281, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.549561619758606, + "rewards/margins": 1.0260627269744873, + "rewards/rejected": -1.5756242275238037, + "step": 580 + }, + { + "epoch": 0.55, + "grad_norm": 17.11842155456543, + "learning_rate": 4.540048968170689e-07, + "logps/chosen": -50.15816116333008, + "logps/rejected": -63.31660461425781, + "loss": 0.4982, + "losses/dpo": 0.40851694345474243, + "losses/sft": 1.7249274253845215, + "losses/total": 0.40851694345474243, + "ref_logps/chosen": -38.71712875366211, + "ref_logps/rejected": -45.193084716796875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1441032886505127, + "rewards/margins": 0.6682482361793518, + "rewards/rejected": -1.8123514652252197, + "step": 581 + }, + { + "epoch": 0.55, + "grad_norm": 20.58220672607422, + "learning_rate": 4.5383001049317945e-07, + "logps/chosen": -46.187110900878906, + "logps/rejected": -48.68061828613281, + "loss": 0.6063, + "losses/dpo": 0.7693386077880859, + "losses/sft": 1.2192364931106567, + "losses/total": 0.7693386077880859, + "ref_logps/chosen": -36.772674560546875, + "ref_logps/rejected": -35.26833724975586, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.941443920135498, + "rewards/margins": 0.3997841477394104, + "rewards/rejected": -1.3412280082702637, + "step": 582 + }, + { + "epoch": 0.55, + "grad_norm": 19.872751235961914, + "learning_rate": 4.536551241692899e-07, + "logps/chosen": -54.05327224731445, + "logps/rejected": -72.29106140136719, + "loss": 0.577, + "losses/dpo": 0.3492737114429474, + "losses/sft": 1.7046188116073608, + "losses/total": 0.3492737114429474, + "ref_logps/chosen": -43.72734451293945, + "ref_logps/rejected": -57.298274993896484, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0325927734375, + "rewards/margins": 0.46668627858161926, + "rewards/rejected": -1.4992791414260864, + "step": 583 + }, + { + "epoch": 0.55, + "grad_norm": 22.728727340698242, + "learning_rate": 4.5348023784540047e-07, + "logps/chosen": -54.06036376953125, + "logps/rejected": -69.52227783203125, + "loss": 0.6209, + "losses/dpo": 0.5739734172821045, + "losses/sft": 1.489017128944397, + "losses/total": 0.5739734172821045, + "ref_logps/chosen": -41.19548797607422, + "ref_logps/rejected": -52.632259368896484, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2864876985549927, + "rewards/margins": 0.4025137722492218, + "rewards/rejected": -1.6890015602111816, + "step": 584 + }, + { + "epoch": 0.55, + "grad_norm": 23.140422821044922, + "learning_rate": 4.53305351521511e-07, + "logps/chosen": -57.32766342163086, + "logps/rejected": -58.91395568847656, + "loss": 0.6863, + "losses/dpo": 0.44302016496658325, + "losses/sft": 1.8704756498336792, + "losses/total": 0.44302016496658325, + "ref_logps/chosen": -43.63447570800781, + "ref_logps/rejected": -43.35402297973633, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3693188428878784, + "rewards/margins": 0.18667477369308472, + "rewards/rejected": -1.555993676185608, + "step": 585 + }, + { + "epoch": 0.55, + "grad_norm": 21.6605224609375, + "learning_rate": 4.5313046519762155e-07, + "logps/chosen": -49.245025634765625, + "logps/rejected": -52.45628356933594, + "loss": 0.6422, + "losses/dpo": 0.5428541898727417, + "losses/sft": 1.4998265504837036, + "losses/total": 0.5428541898727417, + "ref_logps/chosen": -37.843074798583984, + "ref_logps/rejected": -38.00508117675781, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.140195369720459, + "rewards/margins": 0.3049251437187195, + "rewards/rejected": -1.4451203346252441, + "step": 586 + }, + { + "epoch": 0.55, + "grad_norm": 25.053163528442383, + "learning_rate": 4.5295557887373206e-07, + "logps/chosen": -61.2729377746582, + "logps/rejected": -55.074424743652344, + "loss": 0.7245, + "losses/dpo": 0.558214008808136, + "losses/sft": 1.6333088874816895, + "losses/total": 0.558214008808136, + "ref_logps/chosen": -47.624298095703125, + "ref_logps/rejected": -39.871944427490234, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.364863634109497, + "rewards/margins": 0.155384361743927, + "rewards/rejected": -1.5202481746673584, + "step": 587 + }, + { + "epoch": 0.56, + "grad_norm": 19.314786911010742, + "learning_rate": 4.527806925498426e-07, + "logps/chosen": -59.835540771484375, + "logps/rejected": -83.142822265625, + "loss": 0.4663, + "losses/dpo": 0.4594837427139282, + "losses/sft": 2.0525152683258057, + "losses/total": 0.4594837427139282, + "ref_logps/chosen": -46.653953552246094, + "ref_logps/rejected": -61.53036880493164, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3181588649749756, + "rewards/margins": 0.8430860042572021, + "rewards/rejected": -2.1612448692321777, + "step": 588 + }, + { + "epoch": 0.56, + "grad_norm": 32.04518508911133, + "learning_rate": 4.5260580622595314e-07, + "logps/chosen": -59.97466278076172, + "logps/rejected": -55.113014221191406, + "loss": 0.9089, + "losses/dpo": 1.2490079402923584, + "losses/sft": 2.043095350265503, + "losses/total": 1.2490079402923584, + "ref_logps/chosen": -43.02207946777344, + "ref_logps/rejected": -38.07715606689453, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.695258617401123, + "rewards/margins": 0.008327044546604156, + "rewards/rejected": -1.7035856246948242, + "step": 589 + }, + { + "epoch": 0.56, + "grad_norm": 19.12995719909668, + "learning_rate": 4.524309199020636e-07, + "logps/chosen": -43.498085021972656, + "logps/rejected": -48.67451858520508, + "loss": 0.5929, + "losses/dpo": 0.7954567670822144, + "losses/sft": 1.5061460733413696, + "losses/total": 0.7954567670822144, + "ref_logps/chosen": -35.334983825683594, + "ref_logps/rejected": -36.11871337890625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8163097500801086, + "rewards/margins": 0.43927085399627686, + "rewards/rejected": -1.2555806636810303, + "step": 590 + }, + { + "epoch": 0.56, + "grad_norm": 22.31923484802246, + "learning_rate": 4.5225603357817417e-07, + "logps/chosen": -49.24600601196289, + "logps/rejected": -58.38401412963867, + "loss": 0.598, + "losses/dpo": 0.5679134130477905, + "losses/sft": 2.1319870948791504, + "losses/total": 0.5679134130477905, + "ref_logps/chosen": -35.836273193359375, + "ref_logps/rejected": -41.80289840698242, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3409734964370728, + "rewards/margins": 0.3171381950378418, + "rewards/rejected": -1.6581116914749146, + "step": 591 + }, + { + "epoch": 0.56, + "grad_norm": 24.918611526489258, + "learning_rate": 4.520811472542847e-07, + "logps/chosen": -49.44950866699219, + "logps/rejected": -57.95235824584961, + "loss": 0.6847, + "losses/dpo": 0.6354164481163025, + "losses/sft": 1.8745847940444946, + "losses/total": 0.6354164481163025, + "ref_logps/chosen": -38.98527908325195, + "ref_logps/rejected": -46.05634307861328, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0464226007461548, + "rewards/margins": 0.14317886531352997, + "rewards/rejected": -1.1896014213562012, + "step": 592 + }, + { + "epoch": 0.56, + "grad_norm": 23.120941162109375, + "learning_rate": 4.5190626093039524e-07, + "logps/chosen": -44.30820846557617, + "logps/rejected": -51.69993591308594, + "loss": 0.6163, + "losses/dpo": 0.6547796130180359, + "losses/sft": 2.007625102996826, + "losses/total": 0.6547796130180359, + "ref_logps/chosen": -33.460899353027344, + "ref_logps/rejected": -37.08675003051758, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.084730863571167, + "rewards/margins": 0.37658780813217163, + "rewards/rejected": -1.4613187313079834, + "step": 593 + }, + { + "epoch": 0.56, + "grad_norm": 27.32744026184082, + "learning_rate": 4.5173137460650576e-07, + "logps/chosen": -53.49062728881836, + "logps/rejected": -66.09136962890625, + "loss": 0.769, + "losses/dpo": 0.8161667585372925, + "losses/sft": 1.976332426071167, + "losses/total": 0.8161667585372925, + "ref_logps/chosen": -38.28842544555664, + "ref_logps/rejected": -49.76551818847656, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.520220160484314, + "rewards/margins": 0.11236533522605896, + "rewards/rejected": -1.6325855255126953, + "step": 594 + }, + { + "epoch": 0.56, + "grad_norm": 22.235774993896484, + "learning_rate": 4.5155648828261627e-07, + "logps/chosen": -48.553611755371094, + "logps/rejected": -53.690345764160156, + "loss": 0.6468, + "losses/dpo": 0.7293063998222351, + "losses/sft": 1.7181318998336792, + "losses/total": 0.7293063998222351, + "ref_logps/chosen": -36.56941223144531, + "ref_logps/rejected": -37.150569915771484, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1984202861785889, + "rewards/margins": 0.45555734634399414, + "rewards/rejected": -1.653977632522583, + "step": 595 + }, + { + "epoch": 0.56, + "grad_norm": 16.202119827270508, + "learning_rate": 4.5138160195872683e-07, + "logps/chosen": -34.22391128540039, + "logps/rejected": -50.515464782714844, + "loss": 0.5549, + "losses/dpo": 0.553246259689331, + "losses/sft": 1.7854032516479492, + "losses/total": 0.553246259689331, + "ref_logps/chosen": -27.792156219482422, + "ref_logps/rejected": -38.98239517211914, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6431754231452942, + "rewards/margins": 0.5101317167282104, + "rewards/rejected": -1.1533071994781494, + "step": 596 + }, + { + "epoch": 0.56, + "grad_norm": 28.784940719604492, + "learning_rate": 4.512067156348373e-07, + "logps/chosen": -64.26377868652344, + "logps/rejected": -59.91178894042969, + "loss": 0.8101, + "losses/dpo": 0.5749263763427734, + "losses/sft": 1.6070976257324219, + "losses/total": 0.5749263763427734, + "ref_logps/chosen": -49.78087615966797, + "ref_logps/rejected": -45.48026657104492, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4482907056808472, + "rewards/margins": -0.005138680338859558, + "rewards/rejected": -1.4431519508361816, + "step": 597 + }, + { + "epoch": 0.56, + "grad_norm": 25.374406814575195, + "learning_rate": 4.5103182931094786e-07, + "logps/chosen": -57.253143310546875, + "logps/rejected": -60.46489715576172, + "loss": 0.7086, + "losses/dpo": 0.6373536586761475, + "losses/sft": 2.185122013092041, + "losses/total": 0.6373536586761475, + "ref_logps/chosen": -45.47064208984375, + "ref_logps/rejected": -48.51488494873047, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1782500743865967, + "rewards/margins": 0.01675093173980713, + "rewards/rejected": -1.1950010061264038, + "step": 598 + }, + { + "epoch": 0.57, + "grad_norm": 17.572267532348633, + "learning_rate": 4.5085694298705837e-07, + "logps/chosen": -45.471588134765625, + "logps/rejected": -52.342628479003906, + "loss": 0.5378, + "losses/dpo": 0.644324004650116, + "losses/sft": 1.1434687376022339, + "losses/total": 0.644324004650116, + "ref_logps/chosen": -34.827049255371094, + "ref_logps/rejected": -37.0857048034668, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0644538402557373, + "rewards/margins": 0.46123889088630676, + "rewards/rejected": -1.5256928205490112, + "step": 599 + }, + { + "epoch": 0.57, + "grad_norm": 17.340742111206055, + "learning_rate": 4.5068205666316894e-07, + "logps/chosen": -38.48366928100586, + "logps/rejected": -67.47239685058594, + "loss": 0.4542, + "losses/dpo": 0.39069920778274536, + "losses/sft": 1.951080560684204, + "losses/total": 0.39069920778274536, + "ref_logps/chosen": -31.885093688964844, + "ref_logps/rejected": -54.0456657409668, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6598575115203857, + "rewards/margins": 0.6828150749206543, + "rewards/rejected": -1.34267258644104, + "step": 600 + }, + { + "epoch": 0.57, + "grad_norm": 16.781187057495117, + "learning_rate": 4.5050717033927945e-07, + "logps/chosen": -36.43646240234375, + "logps/rejected": -46.403053283691406, + "loss": 0.5473, + "losses/dpo": 0.43342477083206177, + "losses/sft": 1.403662919998169, + "losses/total": 0.43342477083206177, + "ref_logps/chosen": -28.485918045043945, + "ref_logps/rejected": -32.811485290527344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7950546741485596, + "rewards/margins": 0.5641021728515625, + "rewards/rejected": -1.359156847000122, + "step": 601 + }, + { + "epoch": 0.57, + "grad_norm": 20.688121795654297, + "learning_rate": 4.5033228401538996e-07, + "logps/chosen": -47.729129791259766, + "logps/rejected": -53.09663391113281, + "loss": 0.6226, + "losses/dpo": 0.6209818124771118, + "losses/sft": 1.6103700399398804, + "losses/total": 0.6209818124771118, + "ref_logps/chosen": -36.89863967895508, + "ref_logps/rejected": -39.051124572753906, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.083048939704895, + "rewards/margins": 0.3215019106864929, + "rewards/rejected": -1.4045507907867432, + "step": 602 + }, + { + "epoch": 0.57, + "grad_norm": 17.99138832092285, + "learning_rate": 4.5015739769150053e-07, + "logps/chosen": -46.57192611694336, + "logps/rejected": -67.58000183105469, + "loss": 0.4871, + "losses/dpo": 0.41051098704338074, + "losses/sft": 1.6824710369110107, + "losses/total": 0.41051098704338074, + "ref_logps/chosen": -36.0202522277832, + "ref_logps/rejected": -50.3137092590332, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0551674365997314, + "rewards/margins": 0.671461820602417, + "rewards/rejected": -1.7266292572021484, + "step": 603 + }, + { + "epoch": 0.57, + "grad_norm": 17.019672393798828, + "learning_rate": 4.49982511367611e-07, + "logps/chosen": -36.29152297973633, + "logps/rejected": -54.796356201171875, + "loss": 0.5166, + "losses/dpo": 0.5871896147727966, + "losses/sft": 1.7426890134811401, + "losses/total": 0.5871896147727966, + "ref_logps/chosen": -28.109407424926758, + "ref_logps/rejected": -41.816139221191406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8182113766670227, + "rewards/margins": 0.47981035709381104, + "rewards/rejected": -1.2980217933654785, + "step": 604 + }, + { + "epoch": 0.57, + "grad_norm": 15.666430473327637, + "learning_rate": 4.4980762504372155e-07, + "logps/chosen": -46.628196716308594, + "logps/rejected": -48.52845001220703, + "loss": 0.4873, + "losses/dpo": 0.45817893743515015, + "losses/sft": 1.7076882123947144, + "losses/total": 0.45817893743515015, + "ref_logps/chosen": -40.01538848876953, + "ref_logps/rejected": -36.313812255859375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6612809300422668, + "rewards/margins": 0.5601828098297119, + "rewards/rejected": -1.221463680267334, + "step": 605 + }, + { + "epoch": 0.57, + "grad_norm": 17.331449508666992, + "learning_rate": 4.496327387198321e-07, + "logps/chosen": -48.081321716308594, + "logps/rejected": -68.89327239990234, + "loss": 0.4777, + "losses/dpo": 0.2920644283294678, + "losses/sft": 1.9619967937469482, + "losses/total": 0.2920644283294678, + "ref_logps/chosen": -39.36968231201172, + "ref_logps/rejected": -52.335227966308594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8711642026901245, + "rewards/margins": 0.7846402525901794, + "rewards/rejected": -1.6558043956756592, + "step": 606 + }, + { + "epoch": 0.57, + "grad_norm": 21.904211044311523, + "learning_rate": 4.4945785239594263e-07, + "logps/chosen": -46.43212127685547, + "logps/rejected": -62.09995651245117, + "loss": 0.5793, + "losses/dpo": 0.43012261390686035, + "losses/sft": 1.255206823348999, + "losses/total": 0.43012261390686035, + "ref_logps/chosen": -36.983028411865234, + "ref_logps/rejected": -46.1136474609375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9449092149734497, + "rewards/margins": 0.6537215709686279, + "rewards/rejected": -1.5986307859420776, + "step": 607 + }, + { + "epoch": 0.57, + "grad_norm": 25.75718879699707, + "learning_rate": 4.4928296607205315e-07, + "logps/chosen": -49.889095306396484, + "logps/rejected": -50.12107849121094, + "loss": 0.7068, + "losses/dpo": 0.6289348602294922, + "losses/sft": 1.6405929327011108, + "losses/total": 0.6289348602294922, + "ref_logps/chosen": -39.404510498046875, + "ref_logps/rejected": -38.11316680908203, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0484580993652344, + "rewards/margins": 0.1523330807685852, + "rewards/rejected": -1.2007912397384644, + "step": 608 + }, + { + "epoch": 0.58, + "grad_norm": 16.910297393798828, + "learning_rate": 4.4910807974816366e-07, + "logps/chosen": -54.687355041503906, + "logps/rejected": -74.47000122070312, + "loss": 0.4881, + "losses/dpo": 0.5970364809036255, + "losses/sft": 1.9302432537078857, + "losses/total": 0.5970364809036255, + "ref_logps/chosen": -45.57235336303711, + "ref_logps/rejected": -55.18684387207031, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9115001559257507, + "rewards/margins": 1.0168161392211914, + "rewards/rejected": -1.9283162355422974, + "step": 609 + }, + { + "epoch": 0.58, + "grad_norm": 19.139265060424805, + "learning_rate": 4.489331934242742e-07, + "logps/chosen": -45.38105773925781, + "logps/rejected": -52.451560974121094, + "loss": 0.5949, + "losses/dpo": 0.519945502281189, + "losses/sft": 1.6535450220108032, + "losses/total": 0.519945502281189, + "ref_logps/chosen": -37.386199951171875, + "ref_logps/rejected": -39.85076141357422, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7994853854179382, + "rewards/margins": 0.4605950117111206, + "rewards/rejected": -1.260080337524414, + "step": 610 + }, + { + "epoch": 0.58, + "grad_norm": 21.23914909362793, + "learning_rate": 4.487583071003847e-07, + "logps/chosen": -41.983001708984375, + "logps/rejected": -48.5654296875, + "loss": 0.6184, + "losses/dpo": 0.4955393373966217, + "losses/sft": 1.6477437019348145, + "losses/total": 0.4955393373966217, + "ref_logps/chosen": -33.260704040527344, + "ref_logps/rejected": -34.713722229003906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8722302913665771, + "rewards/margins": 0.5129398703575134, + "rewards/rejected": -1.3851702213287354, + "step": 611 + }, + { + "epoch": 0.58, + "grad_norm": 15.444493293762207, + "learning_rate": 4.4858342077649525e-07, + "logps/chosen": -38.10261535644531, + "logps/rejected": -67.46133422851562, + "loss": 0.4426, + "losses/dpo": 0.28211790323257446, + "losses/sft": 1.3990199565887451, + "losses/total": 0.28211790323257446, + "ref_logps/chosen": -29.05875015258789, + "ref_logps/rejected": -49.289119720458984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9043869972229004, + "rewards/margins": 0.9128339290618896, + "rewards/rejected": -1.8172208070755005, + "step": 612 + }, + { + "epoch": 0.58, + "grad_norm": 22.545305252075195, + "learning_rate": 4.484085344526058e-07, + "logps/chosen": -53.714263916015625, + "logps/rejected": -64.93685913085938, + "loss": 0.5222, + "losses/dpo": 0.7485692501068115, + "losses/sft": 1.8144086599349976, + "losses/total": 0.7485692501068115, + "ref_logps/chosen": -43.9848747253418, + "ref_logps/rejected": -50.34032440185547, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9729388952255249, + "rewards/margins": 0.4867154359817505, + "rewards/rejected": -1.4596543312072754, + "step": 613 + }, + { + "epoch": 0.58, + "grad_norm": 28.44540786743164, + "learning_rate": 4.4823364812871633e-07, + "logps/chosen": -67.01869201660156, + "logps/rejected": -65.50695037841797, + "loss": 0.8167, + "losses/dpo": 0.9889156818389893, + "losses/sft": 2.019357681274414, + "losses/total": 0.9889156818389893, + "ref_logps/chosen": -51.936622619628906, + "ref_logps/rejected": -51.06785583496094, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5082073211669922, + "rewards/margins": -0.0642981082201004, + "rewards/rejected": -1.4439091682434082, + "step": 614 + }, + { + "epoch": 0.58, + "grad_norm": 18.57333755493164, + "learning_rate": 4.4805876180482684e-07, + "logps/chosen": -39.33977508544922, + "logps/rejected": -52.94608688354492, + "loss": 0.6512, + "losses/dpo": 0.3669934868812561, + "losses/sft": 1.4202157258987427, + "losses/total": 0.3669934868812561, + "ref_logps/chosen": -32.420963287353516, + "ref_logps/rejected": -42.54301834106445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6918811798095703, + "rewards/margins": 0.3484255373477936, + "rewards/rejected": -1.0403066873550415, + "step": 615 + }, + { + "epoch": 0.58, + "grad_norm": 20.795682907104492, + "learning_rate": 4.4788387548093735e-07, + "logps/chosen": -43.713478088378906, + "logps/rejected": -47.93873977661133, + "loss": 0.6897, + "losses/dpo": 0.574022650718689, + "losses/sft": 1.5561370849609375, + "losses/total": 0.574022650718689, + "ref_logps/chosen": -34.589534759521484, + "ref_logps/rejected": -35.650272369384766, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9123944044113159, + "rewards/margins": 0.3164524435997009, + "rewards/rejected": -1.228846788406372, + "step": 616 + }, + { + "epoch": 0.58, + "grad_norm": 18.406551361083984, + "learning_rate": 4.477089891570479e-07, + "logps/chosen": -50.79158020019531, + "logps/rejected": -59.605995178222656, + "loss": 0.5404, + "losses/dpo": 0.49215713143348694, + "losses/sft": 1.7804259061813354, + "losses/total": 0.49215713143348694, + "ref_logps/chosen": -44.591209411621094, + "ref_logps/rejected": -47.803138732910156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6200370788574219, + "rewards/margins": 0.5602489113807678, + "rewards/rejected": -1.180285930633545, + "step": 617 + }, + { + "epoch": 0.58, + "grad_norm": 19.99086570739746, + "learning_rate": 4.475341028331584e-07, + "logps/chosen": -41.745399475097656, + "logps/rejected": -66.42382049560547, + "loss": 0.4712, + "losses/dpo": 0.2765766382217407, + "losses/sft": 1.3556243181228638, + "losses/total": 0.2765766382217407, + "ref_logps/chosen": -33.391448974609375, + "ref_logps/rejected": -51.502716064453125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8353952169418335, + "rewards/margins": 0.6567153930664062, + "rewards/rejected": -1.4921106100082397, + "step": 618 + }, + { + "epoch": 0.58, + "grad_norm": 20.219806671142578, + "learning_rate": 4.4735921650926894e-07, + "logps/chosen": -46.19069290161133, + "logps/rejected": -57.514678955078125, + "loss": 0.5436, + "losses/dpo": 0.7588982582092285, + "losses/sft": 1.919982671737671, + "losses/total": 0.7588982582092285, + "ref_logps/chosen": -38.15788650512695, + "ref_logps/rejected": -43.16008377075195, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8032807111740112, + "rewards/margins": 0.6321789026260376, + "rewards/rejected": -1.4354596138000488, + "step": 619 + }, + { + "epoch": 0.59, + "grad_norm": 17.956785202026367, + "learning_rate": 4.471843301853795e-07, + "logps/chosen": -43.92296600341797, + "logps/rejected": -52.783172607421875, + "loss": 0.5365, + "losses/dpo": 0.7335257530212402, + "losses/sft": 1.7678004503250122, + "losses/total": 0.7335257530212402, + "ref_logps/chosen": -35.4367561340332, + "ref_logps/rejected": -39.01974105834961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8486208915710449, + "rewards/margins": 0.5277221202850342, + "rewards/rejected": -1.376343011856079, + "step": 620 + }, + { + "epoch": 0.59, + "grad_norm": 23.003658294677734, + "learning_rate": 4.4700944386149e-07, + "logps/chosen": -57.13471984863281, + "logps/rejected": -54.509063720703125, + "loss": 0.6757, + "losses/dpo": 0.6394138932228088, + "losses/sft": 1.6702107191085815, + "losses/total": 0.6394138932228088, + "ref_logps/chosen": -45.88554382324219, + "ref_logps/rejected": -41.745384216308594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1249175071716309, + "rewards/margins": 0.15145081281661987, + "rewards/rejected": -1.2763683795928955, + "step": 621 + }, + { + "epoch": 0.59, + "grad_norm": 18.1394100189209, + "learning_rate": 4.4683455753760053e-07, + "logps/chosen": -41.69705581665039, + "logps/rejected": -66.72216796875, + "loss": 0.5115, + "losses/dpo": 0.47494760155677795, + "losses/sft": 2.0901641845703125, + "losses/total": 0.47494760155677795, + "ref_logps/chosen": -34.35966873168945, + "ref_logps/rejected": -53.75463104248047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7337387204170227, + "rewards/margins": 0.5630146265029907, + "rewards/rejected": -1.2967534065246582, + "step": 622 + }, + { + "epoch": 0.59, + "grad_norm": 22.04412841796875, + "learning_rate": 4.4665967121371105e-07, + "logps/chosen": -36.097251892089844, + "logps/rejected": -61.0118408203125, + "loss": 0.5032, + "losses/dpo": 0.40838390588760376, + "losses/sft": 1.519967794418335, + "losses/total": 0.40838390588760376, + "ref_logps/chosen": -31.266021728515625, + "ref_logps/rejected": -49.03853988647461, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.48312294483184814, + "rewards/margins": 0.7142065763473511, + "rewards/rejected": -1.1973296403884888, + "step": 623 + }, + { + "epoch": 0.59, + "grad_norm": 19.78840446472168, + "learning_rate": 4.464847848898216e-07, + "logps/chosen": -56.3748893737793, + "logps/rejected": -62.040428161621094, + "loss": 0.5451, + "losses/dpo": 0.5086503028869629, + "losses/sft": 2.1892902851104736, + "losses/total": 0.5086503028869629, + "ref_logps/chosen": -47.25972366333008, + "ref_logps/rejected": -48.04988098144531, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9115162491798401, + "rewards/margins": 0.4875383675098419, + "rewards/rejected": -1.3990545272827148, + "step": 624 + }, + { + "epoch": 0.59, + "grad_norm": 23.76604652404785, + "learning_rate": 4.4630989856593207e-07, + "logps/chosen": -46.96368408203125, + "logps/rejected": -56.02411651611328, + "loss": 0.686, + "losses/dpo": 0.8924371600151062, + "losses/sft": 1.450379490852356, + "losses/total": 0.8924371600151062, + "ref_logps/chosen": -35.780189514160156, + "ref_logps/rejected": -41.658897399902344, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1183497905731201, + "rewards/margins": 0.3181724548339844, + "rewards/rejected": -1.4365222454071045, + "step": 625 + }, + { + "epoch": 0.59, + "grad_norm": 18.979887008666992, + "learning_rate": 4.4613501224204264e-07, + "logps/chosen": -48.69410705566406, + "logps/rejected": -62.87856674194336, + "loss": 0.5634, + "losses/dpo": 0.8508898615837097, + "losses/sft": 1.8799076080322266, + "losses/total": 0.8508898615837097, + "ref_logps/chosen": -40.932003021240234, + "ref_logps/rejected": -49.844390869140625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7762105464935303, + "rewards/margins": 0.5272067189216614, + "rewards/rejected": -1.3034173250198364, + "step": 626 + }, + { + "epoch": 0.59, + "grad_norm": 19.02654457092285, + "learning_rate": 4.459601259181532e-07, + "logps/chosen": -52.498817443847656, + "logps/rejected": -51.22602462768555, + "loss": 0.556, + "losses/dpo": 0.5481646656990051, + "losses/sft": 1.5784449577331543, + "losses/total": 0.5481646656990051, + "ref_logps/chosen": -44.086944580078125, + "ref_logps/rejected": -37.77796936035156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8411874771118164, + "rewards/margins": 0.5036177039146423, + "rewards/rejected": -1.3448052406311035, + "step": 627 + }, + { + "epoch": 0.59, + "grad_norm": 20.742963790893555, + "learning_rate": 4.457852395942637e-07, + "logps/chosen": -50.570465087890625, + "logps/rejected": -60.769622802734375, + "loss": 0.5196, + "losses/dpo": 0.5619484186172485, + "losses/sft": 2.273573875427246, + "losses/total": 0.5619484186172485, + "ref_logps/chosen": -41.500213623046875, + "ref_logps/rejected": -46.17858123779297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9070254564285278, + "rewards/margins": 0.552078902721405, + "rewards/rejected": -1.459104299545288, + "step": 628 + }, + { + "epoch": 0.59, + "grad_norm": 16.53354263305664, + "learning_rate": 4.4561035327037423e-07, + "logps/chosen": -46.26103973388672, + "logps/rejected": -61.017906188964844, + "loss": 0.5165, + "losses/dpo": 0.3043246865272522, + "losses/sft": 2.1364951133728027, + "losses/total": 0.3043246865272522, + "ref_logps/chosen": -37.735923767089844, + "ref_logps/rejected": -47.00932693481445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8525112867355347, + "rewards/margins": 0.5483470559120178, + "rewards/rejected": -1.4008582830429077, + "step": 629 + }, + { + "epoch": 0.59, + "grad_norm": 17.88193130493164, + "learning_rate": 4.4543546694648474e-07, + "logps/chosen": -44.97325134277344, + "logps/rejected": -53.60231018066406, + "loss": 0.5447, + "losses/dpo": 0.6644160747528076, + "losses/sft": 1.4925224781036377, + "losses/total": 0.6644160747528076, + "ref_logps/chosen": -38.09996795654297, + "ref_logps/rejected": -41.851593017578125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6873289346694946, + "rewards/margins": 0.4877431392669678, + "rewards/rejected": -1.1750720739364624, + "step": 630 + }, + { + "epoch": 0.6, + "grad_norm": 17.928112030029297, + "learning_rate": 4.452605806225953e-07, + "logps/chosen": -50.4836540222168, + "logps/rejected": -52.671653747558594, + "loss": 0.5926, + "losses/dpo": 0.4583841562271118, + "losses/sft": 1.5576122999191284, + "losses/total": 0.4583841562271118, + "ref_logps/chosen": -39.99382019042969, + "ref_logps/rejected": -37.47768020629883, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0489832162857056, + "rewards/margins": 0.47041383385658264, + "rewards/rejected": -1.5193971395492554, + "step": 631 + }, + { + "epoch": 0.6, + "grad_norm": 17.331432342529297, + "learning_rate": 4.450856942987058e-07, + "logps/chosen": -41.436256408691406, + "logps/rejected": -42.20221710205078, + "loss": 0.5962, + "losses/dpo": 0.4479900896549225, + "losses/sft": 1.6780507564544678, + "losses/total": 0.4479900896549225, + "ref_logps/chosen": -34.681339263916016, + "ref_logps/rejected": -31.729753494262695, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.675491452217102, + "rewards/margins": 0.37175506353378296, + "rewards/rejected": -1.0472465753555298, + "step": 632 + }, + { + "epoch": 0.6, + "grad_norm": 19.252670288085938, + "learning_rate": 4.4491080797481633e-07, + "logps/chosen": -41.41585922241211, + "logps/rejected": -59.671630859375, + "loss": 0.6025, + "losses/dpo": 0.4925847053527832, + "losses/sft": 1.540802240371704, + "losses/total": 0.4925847053527832, + "ref_logps/chosen": -34.36231994628906, + "ref_logps/rejected": -49.26945495605469, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7053536176681519, + "rewards/margins": 0.33486396074295044, + "rewards/rejected": -1.040217638015747, + "step": 633 + }, + { + "epoch": 0.6, + "grad_norm": 15.870306968688965, + "learning_rate": 4.447359216509269e-07, + "logps/chosen": -36.664276123046875, + "logps/rejected": -61.64225769042969, + "loss": 0.5396, + "losses/dpo": 0.4136042296886444, + "losses/sft": 1.9071153402328491, + "losses/total": 0.4136042296886444, + "ref_logps/chosen": -28.928457260131836, + "ref_logps/rejected": -47.71489715576172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7735821604728699, + "rewards/margins": 0.6191542148590088, + "rewards/rejected": -1.3927364349365234, + "step": 634 + }, + { + "epoch": 0.6, + "grad_norm": 21.3395938873291, + "learning_rate": 4.445610353270374e-07, + "logps/chosen": -46.159767150878906, + "logps/rejected": -44.8878173828125, + "loss": 0.6575, + "losses/dpo": 0.9645848274230957, + "losses/sft": 1.457635521888733, + "losses/total": 0.9645848274230957, + "ref_logps/chosen": -38.900474548339844, + "ref_logps/rejected": -35.76042175292969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7259290218353271, + "rewards/margins": 0.18681050837039948, + "rewards/rejected": -0.9127395749092102, + "step": 635 + }, + { + "epoch": 0.6, + "grad_norm": 18.366098403930664, + "learning_rate": 4.443861490031479e-07, + "logps/chosen": -45.634368896484375, + "logps/rejected": -57.911338806152344, + "loss": 0.5407, + "losses/dpo": 0.8006632328033447, + "losses/sft": 1.6804780960083008, + "losses/total": 0.8006632328033447, + "ref_logps/chosen": -37.43037414550781, + "ref_logps/rejected": -44.61968231201172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8203996419906616, + "rewards/margins": 0.5087662935256958, + "rewards/rejected": -1.3291659355163574, + "step": 636 + }, + { + "epoch": 0.6, + "grad_norm": 20.716598510742188, + "learning_rate": 4.4421126267925844e-07, + "logps/chosen": -41.47325134277344, + "logps/rejected": -61.56958770751953, + "loss": 0.6046, + "losses/dpo": 0.5198478698730469, + "losses/sft": 0.8199011087417603, + "losses/total": 0.5198478698730469, + "ref_logps/chosen": -33.44092559814453, + "ref_logps/rejected": -50.522911071777344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8032324314117432, + "rewards/margins": 0.30143505334854126, + "rewards/rejected": -1.1046674251556396, + "step": 637 + }, + { + "epoch": 0.6, + "grad_norm": 20.82286262512207, + "learning_rate": 4.44036376355369e-07, + "logps/chosen": -39.74163055419922, + "logps/rejected": -51.326393127441406, + "loss": 0.693, + "losses/dpo": 0.9570564031600952, + "losses/sft": 1.4647154808044434, + "losses/total": 0.9570564031600952, + "ref_logps/chosen": -31.393878936767578, + "ref_logps/rejected": -40.26043701171875, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8347752690315247, + "rewards/margins": 0.2718203663825989, + "rewards/rejected": -1.106595516204834, + "step": 638 + }, + { + "epoch": 0.6, + "grad_norm": 19.03815269470215, + "learning_rate": 4.438614900314795e-07, + "logps/chosen": -43.423133850097656, + "logps/rejected": -52.632022857666016, + "loss": 0.6168, + "losses/dpo": 0.5782968997955322, + "losses/sft": 1.9146307706832886, + "losses/total": 0.5782968997955322, + "ref_logps/chosen": -35.6546516418457, + "ref_logps/rejected": -41.304603576660156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7768484950065613, + "rewards/margins": 0.35589343309402466, + "rewards/rejected": -1.132741928100586, + "step": 639 + }, + { + "epoch": 0.6, + "grad_norm": 16.551855087280273, + "learning_rate": 4.4368660370759e-07, + "logps/chosen": -37.846988677978516, + "logps/rejected": -68.63668060302734, + "loss": 0.4806, + "losses/dpo": 0.3581385016441345, + "losses/sft": 1.4599511623382568, + "losses/total": 0.3581385016441345, + "ref_logps/chosen": -29.67315673828125, + "ref_logps/rejected": -53.69397735595703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8173829913139343, + "rewards/margins": 0.6768878102302551, + "rewards/rejected": -1.4942708015441895, + "step": 640 + }, + { + "epoch": 0.61, + "grad_norm": 21.649520874023438, + "learning_rate": 4.435117173837006e-07, + "logps/chosen": -51.88352966308594, + "logps/rejected": -53.07874298095703, + "loss": 0.6776, + "losses/dpo": 0.7447686791419983, + "losses/sft": 1.8003766536712646, + "losses/total": 0.7447686791419983, + "ref_logps/chosen": -41.13995361328125, + "ref_logps/rejected": -40.63636016845703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0743573904037476, + "rewards/margins": 0.16988088190555573, + "rewards/rejected": -1.244238257408142, + "step": 641 + }, + { + "epoch": 0.61, + "grad_norm": 19.520015716552734, + "learning_rate": 4.433368310598111e-07, + "logps/chosen": -39.8344841003418, + "logps/rejected": -54.55594253540039, + "loss": 0.5592, + "losses/dpo": 0.3267616927623749, + "losses/sft": 1.6068446636199951, + "losses/total": 0.3267616927623749, + "ref_logps/chosen": -34.02265930175781, + "ref_logps/rejected": -43.448482513427734, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5811823010444641, + "rewards/margins": 0.5295640230178833, + "rewards/rejected": -1.1107463836669922, + "step": 642 + }, + { + "epoch": 0.61, + "grad_norm": 17.208520889282227, + "learning_rate": 4.431619447359216e-07, + "logps/chosen": -50.041473388671875, + "logps/rejected": -56.7177619934082, + "loss": 0.5225, + "losses/dpo": 0.8120496273040771, + "losses/sft": 1.8595517873764038, + "losses/total": 0.8120496273040771, + "ref_logps/chosen": -42.69335174560547, + "ref_logps/rejected": -43.57000732421875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7348119020462036, + "rewards/margins": 0.5799638032913208, + "rewards/rejected": -1.3147757053375244, + "step": 643 + }, + { + "epoch": 0.61, + "grad_norm": 20.3192138671875, + "learning_rate": 4.4298705841203213e-07, + "logps/chosen": -37.84234619140625, + "logps/rejected": -53.206207275390625, + "loss": 0.5602, + "losses/dpo": 0.7065169811248779, + "losses/sft": 1.7062314748764038, + "losses/total": 0.7065169811248779, + "ref_logps/chosen": -29.939010620117188, + "ref_logps/rejected": -40.785850524902344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7903335094451904, + "rewards/margins": 0.4517020285129547, + "rewards/rejected": -1.2420353889465332, + "step": 644 + }, + { + "epoch": 0.61, + "grad_norm": 18.488365173339844, + "learning_rate": 4.428121720881427e-07, + "logps/chosen": -42.64060974121094, + "logps/rejected": -59.79262161254883, + "loss": 0.6055, + "losses/dpo": 0.6880443096160889, + "losses/sft": 1.4448903799057007, + "losses/total": 0.6880443096160889, + "ref_logps/chosen": -34.36505126953125, + "ref_logps/rejected": -47.38486099243164, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8275556564331055, + "rewards/margins": 0.4132207930088043, + "rewards/rejected": -1.2407763004302979, + "step": 645 + }, + { + "epoch": 0.61, + "grad_norm": 23.138898849487305, + "learning_rate": 4.426372857642532e-07, + "logps/chosen": -49.89390563964844, + "logps/rejected": -53.07732391357422, + "loss": 0.7357, + "losses/dpo": 0.6794490218162537, + "losses/sft": 1.3935376405715942, + "losses/total": 0.6794490218162537, + "ref_logps/chosen": -39.706748962402344, + "ref_logps/rejected": -42.40960693359375, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.018715739250183, + "rewards/margins": 0.048056092113256454, + "rewards/rejected": -1.0667717456817627, + "step": 646 + }, + { + "epoch": 0.61, + "grad_norm": 16.54840660095215, + "learning_rate": 4.424623994403637e-07, + "logps/chosen": -42.495155334472656, + "logps/rejected": -52.528602600097656, + "loss": 0.4561, + "losses/dpo": 0.3666003942489624, + "losses/sft": 1.7475947141647339, + "losses/total": 0.3666003942489624, + "ref_logps/chosen": -33.528594970703125, + "ref_logps/rejected": -36.515892028808594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8966560363769531, + "rewards/margins": 0.7046149373054504, + "rewards/rejected": -1.6012709140777588, + "step": 647 + }, + { + "epoch": 0.61, + "grad_norm": 25.164112091064453, + "learning_rate": 4.422875131164743e-07, + "logps/chosen": -62.31259536743164, + "logps/rejected": -66.88058471679688, + "loss": 0.631, + "losses/dpo": 0.634787917137146, + "losses/sft": 1.5533806085586548, + "losses/total": 0.634787917137146, + "ref_logps/chosen": -51.22900390625, + "ref_logps/rejected": -51.84467315673828, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1083593368530273, + "rewards/margins": 0.3952321410179138, + "rewards/rejected": -1.503591537475586, + "step": 648 + }, + { + "epoch": 0.61, + "grad_norm": 25.253978729248047, + "learning_rate": 4.421126267925848e-07, + "logps/chosen": -50.226009368896484, + "logps/rejected": -44.45460891723633, + "loss": 0.7328, + "losses/dpo": 1.0524861812591553, + "losses/sft": 2.099217414855957, + "losses/total": 1.0524861812591553, + "ref_logps/chosen": -39.9755859375, + "ref_logps/rejected": -33.808143615722656, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0250425338745117, + "rewards/margins": 0.039604201912879944, + "rewards/rejected": -1.0646467208862305, + "step": 649 + }, + { + "epoch": 0.61, + "grad_norm": 18.047706604003906, + "learning_rate": 4.419377404686953e-07, + "logps/chosen": -45.84514617919922, + "logps/rejected": -64.69760131835938, + "loss": 0.4586, + "losses/dpo": 0.6318464279174805, + "losses/sft": 1.4103286266326904, + "losses/total": 0.6318464279174805, + "ref_logps/chosen": -36.6884880065918, + "ref_logps/rejected": -48.65917205810547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9156655073165894, + "rewards/margins": 0.6881779432296753, + "rewards/rejected": -1.603843331336975, + "step": 650 + }, + { + "epoch": 0.61, + "grad_norm": 20.930429458618164, + "learning_rate": 4.417628541448059e-07, + "logps/chosen": -50.62479019165039, + "logps/rejected": -47.59782028198242, + "loss": 0.6128, + "losses/dpo": 0.5327115058898926, + "losses/sft": 1.6167140007019043, + "losses/total": 0.5327115058898926, + "ref_logps/chosen": -40.95143127441406, + "ref_logps/rejected": -34.05638885498047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9673357605934143, + "rewards/margins": 0.3868076205253601, + "rewards/rejected": -1.3541433811187744, + "step": 651 + }, + { + "epoch": 0.62, + "grad_norm": 20.108413696289062, + "learning_rate": 4.415879678209164e-07, + "logps/chosen": -50.80863952636719, + "logps/rejected": -78.14813232421875, + "loss": 0.4678, + "losses/dpo": 0.5177967548370361, + "losses/sft": 2.116820812225342, + "losses/total": 0.5177967548370361, + "ref_logps/chosen": -38.272247314453125, + "ref_logps/rejected": -59.01039505004883, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2536389827728271, + "rewards/margins": 0.6601352691650391, + "rewards/rejected": -1.9137742519378662, + "step": 652 + }, + { + "epoch": 0.62, + "grad_norm": 16.709762573242188, + "learning_rate": 4.4141308149702696e-07, + "logps/chosen": -46.55236053466797, + "logps/rejected": -79.44471740722656, + "loss": 0.4021, + "losses/dpo": 0.32214534282684326, + "losses/sft": 2.058011531829834, + "losses/total": 0.32214534282684326, + "ref_logps/chosen": -38.24629211425781, + "ref_logps/rejected": -60.538787841796875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8306071162223816, + "rewards/margins": 1.0599861145019531, + "rewards/rejected": -1.8905932903289795, + "step": 653 + }, + { + "epoch": 0.62, + "grad_norm": 16.43296241760254, + "learning_rate": 4.412381951731374e-07, + "logps/chosen": -53.85081100463867, + "logps/rejected": -69.29866027832031, + "loss": 0.4691, + "losses/dpo": 0.4782261848449707, + "losses/sft": 2.026470899581909, + "losses/total": 0.4782261848449707, + "ref_logps/chosen": -44.15160369873047, + "ref_logps/rejected": -51.61612319946289, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9699205160140991, + "rewards/margins": 0.7983332872390747, + "rewards/rejected": -1.7682539224624634, + "step": 654 + }, + { + "epoch": 0.62, + "grad_norm": 25.55121421813965, + "learning_rate": 4.41063308849248e-07, + "logps/chosen": -64.85223388671875, + "logps/rejected": -68.76423645019531, + "loss": 0.5282, + "losses/dpo": 0.8686087131500244, + "losses/sft": 1.912293553352356, + "losses/total": 0.8686087131500244, + "ref_logps/chosen": -53.41896057128906, + "ref_logps/rejected": -49.61004638671875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1433278322219849, + "rewards/margins": 0.772091269493103, + "rewards/rejected": -1.9154192209243774, + "step": 655 + }, + { + "epoch": 0.62, + "grad_norm": 21.51396369934082, + "learning_rate": 4.408884225253585e-07, + "logps/chosen": -46.20930099487305, + "logps/rejected": -54.008995056152344, + "loss": 0.6033, + "losses/dpo": 0.5834099054336548, + "losses/sft": 1.929543375968933, + "losses/total": 0.5834099054336548, + "ref_logps/chosen": -38.36368942260742, + "ref_logps/rejected": -41.02717590332031, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.784561038017273, + "rewards/margins": 0.513620913028717, + "rewards/rejected": -1.2981820106506348, + "step": 656 + }, + { + "epoch": 0.62, + "grad_norm": 21.454519271850586, + "learning_rate": 4.40713536201469e-07, + "logps/chosen": -49.7601203918457, + "logps/rejected": -56.79192352294922, + "loss": 0.5375, + "losses/dpo": 0.715327799320221, + "losses/sft": 1.7063201665878296, + "losses/total": 0.715327799320221, + "ref_logps/chosen": -40.21389389038086, + "ref_logps/rejected": -42.62712097167969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.954622745513916, + "rewards/margins": 0.46185749769210815, + "rewards/rejected": -1.416480302810669, + "step": 657 + }, + { + "epoch": 0.62, + "grad_norm": 22.692399978637695, + "learning_rate": 4.4053864987757957e-07, + "logps/chosen": -59.7557373046875, + "logps/rejected": -68.03887176513672, + "loss": 0.5193, + "losses/dpo": 0.6416804790496826, + "losses/sft": 1.9612102508544922, + "losses/total": 0.6416804790496826, + "ref_logps/chosen": -45.95321273803711, + "ref_logps/rejected": -48.96763610839844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3802525997161865, + "rewards/margins": 0.5268707275390625, + "rewards/rejected": -1.9071234464645386, + "step": 658 + }, + { + "epoch": 0.62, + "grad_norm": 19.45255470275879, + "learning_rate": 4.403637635536901e-07, + "logps/chosen": -43.349327087402344, + "logps/rejected": -55.58039093017578, + "loss": 0.5812, + "losses/dpo": 0.9194954633712769, + "losses/sft": 2.0749683380126953, + "losses/total": 0.9194954633712769, + "ref_logps/chosen": -32.08791732788086, + "ref_logps/rejected": -38.647396087646484, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.126140832901001, + "rewards/margins": 0.567158579826355, + "rewards/rejected": -1.6932992935180664, + "step": 659 + }, + { + "epoch": 0.62, + "grad_norm": 17.813053131103516, + "learning_rate": 4.4018887722980065e-07, + "logps/chosen": -45.078216552734375, + "logps/rejected": -54.474239349365234, + "loss": 0.4964, + "losses/dpo": 0.4895384907722473, + "losses/sft": 1.8483972549438477, + "losses/total": 0.4895384907722473, + "ref_logps/chosen": -36.561851501464844, + "ref_logps/rejected": -40.54484176635742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8516362905502319, + "rewards/margins": 0.5413033962249756, + "rewards/rejected": -1.3929396867752075, + "step": 660 + }, + { + "epoch": 0.62, + "grad_norm": 17.709857940673828, + "learning_rate": 4.400139909059111e-07, + "logps/chosen": -44.531532287597656, + "logps/rejected": -61.72312545776367, + "loss": 0.5045, + "losses/dpo": 0.6141805648803711, + "losses/sft": 1.736121654510498, + "losses/total": 0.6141805648803711, + "ref_logps/chosen": -35.72633361816406, + "ref_logps/rejected": -44.452491760253906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.880520224571228, + "rewards/margins": 0.8465432524681091, + "rewards/rejected": -1.7270634174346924, + "step": 661 + }, + { + "epoch": 0.63, + "grad_norm": 25.278169631958008, + "learning_rate": 4.398391045820217e-07, + "logps/chosen": -46.960514068603516, + "logps/rejected": -52.05955505371094, + "loss": 0.7564, + "losses/dpo": 0.8389472365379333, + "losses/sft": 1.385761022567749, + "losses/total": 0.8389472365379333, + "ref_logps/chosen": -36.994712829589844, + "ref_logps/rejected": -41.69624710083008, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9965801239013672, + "rewards/margins": 0.03975026682019234, + "rewards/rejected": -1.0363304615020752, + "step": 662 + }, + { + "epoch": 0.63, + "grad_norm": 24.060327529907227, + "learning_rate": 4.396642182581322e-07, + "logps/chosen": -51.66042709350586, + "logps/rejected": -70.60588073730469, + "loss": 0.6368, + "losses/dpo": 0.8159767389297485, + "losses/sft": 1.8975802659988403, + "losses/total": 0.8159767389297485, + "ref_logps/chosen": -41.02614212036133, + "ref_logps/rejected": -53.883201599121094, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0634284019470215, + "rewards/margins": 0.6088401079177856, + "rewards/rejected": -1.6722686290740967, + "step": 663 + }, + { + "epoch": 0.63, + "grad_norm": 25.352890014648438, + "learning_rate": 4.394893319342427e-07, + "logps/chosen": -53.84954071044922, + "logps/rejected": -71.38723754882812, + "loss": 0.5687, + "losses/dpo": 0.8507640361785889, + "losses/sft": 2.5388991832733154, + "losses/total": 0.8507640361785889, + "ref_logps/chosen": -41.39336395263672, + "ref_logps/rejected": -52.69435119628906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.245617389678955, + "rewards/margins": 0.6236708164215088, + "rewards/rejected": -1.8692882061004639, + "step": 664 + }, + { + "epoch": 0.63, + "grad_norm": 26.868606567382812, + "learning_rate": 4.3931444561035327e-07, + "logps/chosen": -56.91460418701172, + "logps/rejected": -64.68113708496094, + "loss": 0.6458, + "losses/dpo": 0.9483543634414673, + "losses/sft": 2.0880446434020996, + "losses/total": 0.9483543634414673, + "ref_logps/chosen": -47.140235900878906, + "ref_logps/rejected": -49.95226287841797, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.977436900138855, + "rewards/margins": 0.49545052647590637, + "rewards/rejected": -1.472887396812439, + "step": 665 + }, + { + "epoch": 0.63, + "grad_norm": 25.787260055541992, + "learning_rate": 4.391395592864638e-07, + "logps/chosen": -45.327796936035156, + "logps/rejected": -53.5628776550293, + "loss": 0.6558, + "losses/dpo": 1.2123247385025024, + "losses/sft": 2.302277088165283, + "losses/total": 1.2123247385025024, + "ref_logps/chosen": -34.81057357788086, + "ref_logps/rejected": -39.50253677368164, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0517226457595825, + "rewards/margins": 0.3543117642402649, + "rewards/rejected": -1.4060344696044922, + "step": 666 + }, + { + "epoch": 0.63, + "grad_norm": 20.70470428466797, + "learning_rate": 4.3896467296257434e-07, + "logps/chosen": -43.803653717041016, + "logps/rejected": -57.881004333496094, + "loss": 0.632, + "losses/dpo": 0.8098856806755066, + "losses/sft": 2.082270622253418, + "losses/total": 0.8098856806755066, + "ref_logps/chosen": -34.23711395263672, + "ref_logps/rejected": -43.208396911621094, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9566540718078613, + "rewards/margins": 0.5106068849563599, + "rewards/rejected": -1.4672608375549316, + "step": 667 + }, + { + "epoch": 0.63, + "grad_norm": 27.2071533203125, + "learning_rate": 4.387897866386848e-07, + "logps/chosen": -53.50001525878906, + "logps/rejected": -57.1082878112793, + "loss": 0.7066, + "losses/dpo": 0.8632234334945679, + "losses/sft": 1.7941436767578125, + "losses/total": 0.8632234334945679, + "ref_logps/chosen": -41.003883361816406, + "ref_logps/rejected": -43.136844635009766, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2496129274368286, + "rewards/margins": 0.1475316286087036, + "rewards/rejected": -1.3971445560455322, + "step": 668 + }, + { + "epoch": 0.63, + "grad_norm": 17.104589462280273, + "learning_rate": 4.3861490031479537e-07, + "logps/chosen": -41.32619094848633, + "logps/rejected": -60.7089729309082, + "loss": 0.4586, + "losses/dpo": 0.5647233724594116, + "losses/sft": 1.8316291570663452, + "losses/total": 0.5647233724594116, + "ref_logps/chosen": -32.7796630859375, + "ref_logps/rejected": -43.463077545166016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8546529412269592, + "rewards/margins": 0.8699364066123962, + "rewards/rejected": -1.7245893478393555, + "step": 669 + }, + { + "epoch": 0.63, + "grad_norm": 16.925846099853516, + "learning_rate": 4.3844001399090594e-07, + "logps/chosen": -47.019798278808594, + "logps/rejected": -64.30384063720703, + "loss": 0.5102, + "losses/dpo": 0.8314761519432068, + "losses/sft": 1.7040923833847046, + "losses/total": 0.8314761519432068, + "ref_logps/chosen": -40.31928253173828, + "ref_logps/rejected": -49.528221130371094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6700518131256104, + "rewards/margins": 0.8075105547904968, + "rewards/rejected": -1.477562427520752, + "step": 670 + }, + { + "epoch": 0.63, + "grad_norm": 22.852285385131836, + "learning_rate": 4.382651276670164e-07, + "logps/chosen": -48.86735534667969, + "logps/rejected": -60.63975524902344, + "loss": 0.5387, + "losses/dpo": 0.2187681794166565, + "losses/sft": 1.4940122365951538, + "losses/total": 0.2187681794166565, + "ref_logps/chosen": -39.97687530517578, + "ref_logps/rejected": -44.219886779785156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8890483975410461, + "rewards/margins": 0.7529385089874268, + "rewards/rejected": -1.6419868469238281, + "step": 671 + }, + { + "epoch": 0.63, + "grad_norm": 23.92821502685547, + "learning_rate": 4.3809024134312696e-07, + "logps/chosen": -47.636817932128906, + "logps/rejected": -54.7767333984375, + "loss": 0.6583, + "losses/dpo": 0.7265498042106628, + "losses/sft": 1.5916630029678345, + "losses/total": 0.7265498042106628, + "ref_logps/chosen": -38.55921173095703, + "ref_logps/rejected": -42.94745635986328, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.907760500907898, + "rewards/margins": 0.2751672863960266, + "rewards/rejected": -1.1829278469085693, + "step": 672 + }, + { + "epoch": 0.64, + "grad_norm": 18.440404891967773, + "learning_rate": 4.3791535501923747e-07, + "logps/chosen": -48.635337829589844, + "logps/rejected": -51.012855529785156, + "loss": 0.5051, + "losses/dpo": 0.7917296290397644, + "losses/sft": 1.6104592084884644, + "losses/total": 0.7917296290397644, + "ref_logps/chosen": -40.228145599365234, + "ref_logps/rejected": -36.581321716308594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8407193422317505, + "rewards/margins": 0.602434515953064, + "rewards/rejected": -1.4431538581848145, + "step": 673 + }, + { + "epoch": 0.64, + "grad_norm": 21.293899536132812, + "learning_rate": 4.3774046869534804e-07, + "logps/chosen": -35.00872802734375, + "logps/rejected": -61.432472229003906, + "loss": 0.4489, + "losses/dpo": 0.5239160656929016, + "losses/sft": 1.4297856092453003, + "losses/total": 0.5239160656929016, + "ref_logps/chosen": -29.145401000976562, + "ref_logps/rejected": -48.458168029785156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5863324403762817, + "rewards/margins": 0.7110979557037354, + "rewards/rejected": -1.2974302768707275, + "step": 674 + }, + { + "epoch": 0.64, + "grad_norm": 19.253389358520508, + "learning_rate": 4.375655823714585e-07, + "logps/chosen": -43.62630844116211, + "logps/rejected": -57.79328918457031, + "loss": 0.4773, + "losses/dpo": 0.6137611269950867, + "losses/sft": 1.7614896297454834, + "losses/total": 0.6137611269950867, + "ref_logps/chosen": -36.42924880981445, + "ref_logps/rejected": -42.72636413574219, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7197059392929077, + "rewards/margins": 0.7869865894317627, + "rewards/rejected": -1.5066924095153809, + "step": 675 + }, + { + "epoch": 0.64, + "grad_norm": 22.75641632080078, + "learning_rate": 4.3739069604756906e-07, + "logps/chosen": -55.21026611328125, + "logps/rejected": -53.981781005859375, + "loss": 0.7281, + "losses/dpo": 0.7902309894561768, + "losses/sft": 1.4334278106689453, + "losses/total": 0.7902309894561768, + "ref_logps/chosen": -43.870574951171875, + "ref_logps/rejected": -38.35118865966797, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1339691877365112, + "rewards/margins": 0.42909008264541626, + "rewards/rejected": -1.5630592107772827, + "step": 676 + }, + { + "epoch": 0.64, + "grad_norm": 22.33275032043457, + "learning_rate": 4.3721580972367963e-07, + "logps/chosen": -41.71989440917969, + "logps/rejected": -59.97500991821289, + "loss": 0.5978, + "losses/dpo": 0.517135739326477, + "losses/sft": 1.2349271774291992, + "losses/total": 0.517135739326477, + "ref_logps/chosen": -35.97785949707031, + "ref_logps/rejected": -49.64389419555664, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5742034316062927, + "rewards/margins": 0.4589080214500427, + "rewards/rejected": -1.0331114530563354, + "step": 677 + }, + { + "epoch": 0.64, + "grad_norm": 22.815269470214844, + "learning_rate": 4.370409233997901e-07, + "logps/chosen": -39.422367095947266, + "logps/rejected": -77.874755859375, + "loss": 0.5313, + "losses/dpo": 0.6275076270103455, + "losses/sft": 1.2638832330703735, + "losses/total": 0.6275076270103455, + "ref_logps/chosen": -33.01189041137695, + "ref_logps/rejected": -65.9725341796875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6410475373268127, + "rewards/margins": 0.5491743087768555, + "rewards/rejected": -1.1902217864990234, + "step": 678 + }, + { + "epoch": 0.64, + "grad_norm": 17.992204666137695, + "learning_rate": 4.3686603707590065e-07, + "logps/chosen": -40.244049072265625, + "logps/rejected": -64.80023193359375, + "loss": 0.4144, + "losses/dpo": 0.4164362847805023, + "losses/sft": 1.7152419090270996, + "losses/total": 0.4164362847805023, + "ref_logps/chosen": -33.803955078125, + "ref_logps/rejected": -50.13800811767578, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6440093517303467, + "rewards/margins": 0.8222131133079529, + "rewards/rejected": -1.4662225246429443, + "step": 679 + }, + { + "epoch": 0.64, + "grad_norm": 16.815841674804688, + "learning_rate": 4.3669115075201117e-07, + "logps/chosen": -42.92987060546875, + "logps/rejected": -66.77555847167969, + "loss": 0.3951, + "losses/dpo": 0.3958371877670288, + "losses/sft": 2.2657477855682373, + "losses/total": 0.3958371877670288, + "ref_logps/chosen": -36.862754821777344, + "ref_logps/rejected": -48.75736618041992, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.606711208820343, + "rewards/margins": 1.1951079368591309, + "rewards/rejected": -1.801819086074829, + "step": 680 + }, + { + "epoch": 0.64, + "grad_norm": 25.332477569580078, + "learning_rate": 4.3651626442812173e-07, + "logps/chosen": -48.87312316894531, + "logps/rejected": -47.05182647705078, + "loss": 0.9043, + "losses/dpo": 0.7493389844894409, + "losses/sft": 1.770767092704773, + "losses/total": 0.7493389844894409, + "ref_logps/chosen": -37.08683776855469, + "ref_logps/rejected": -36.57822799682617, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.17862868309021, + "rewards/margins": -0.13126879930496216, + "rewards/rejected": -1.047359824180603, + "step": 681 + }, + { + "epoch": 0.64, + "grad_norm": 20.06817626953125, + "learning_rate": 4.363413781042322e-07, + "logps/chosen": -39.16017150878906, + "logps/rejected": -45.40052032470703, + "loss": 0.7039, + "losses/dpo": 1.1117303371429443, + "losses/sft": 1.7223199605941772, + "losses/total": 1.1117303371429443, + "ref_logps/chosen": -29.940467834472656, + "ref_logps/rejected": -32.448307037353516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9219707250595093, + "rewards/margins": 0.37325066328048706, + "rewards/rejected": -1.2952213287353516, + "step": 682 + }, + { + "epoch": 0.64, + "grad_norm": 17.64261817932129, + "learning_rate": 4.3616649178034276e-07, + "logps/chosen": -42.20592498779297, + "logps/rejected": -60.50687789916992, + "loss": 0.4485, + "losses/dpo": 0.4617617428302765, + "losses/sft": 1.6748210191726685, + "losses/total": 0.4617617428302765, + "ref_logps/chosen": -35.366451263427734, + "ref_logps/rejected": -44.39671325683594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6839475631713867, + "rewards/margins": 0.9270689487457275, + "rewards/rejected": -1.6110165119171143, + "step": 683 + }, + { + "epoch": 0.65, + "grad_norm": 21.765056610107422, + "learning_rate": 4.359916054564533e-07, + "logps/chosen": -51.79177474975586, + "logps/rejected": -49.704383850097656, + "loss": 0.6447, + "losses/dpo": 0.38130292296409607, + "losses/sft": 1.7255353927612305, + "losses/total": 0.38130292296409607, + "ref_logps/chosen": -41.98659896850586, + "ref_logps/rejected": -36.198974609375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9805173873901367, + "rewards/margins": 0.37002331018447876, + "rewards/rejected": -1.3505406379699707, + "step": 684 + }, + { + "epoch": 0.65, + "grad_norm": 24.796009063720703, + "learning_rate": 4.358167191325638e-07, + "logps/chosen": -62.32807922363281, + "logps/rejected": -58.585182189941406, + "loss": 0.7332, + "losses/dpo": 0.9550775289535522, + "losses/sft": 2.147249221801758, + "losses/total": 0.9550775289535522, + "ref_logps/chosen": -49.45254135131836, + "ref_logps/rejected": -43.134979248046875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2875540256500244, + "rewards/margins": 0.2574663460254669, + "rewards/rejected": -1.545020341873169, + "step": 685 + }, + { + "epoch": 0.65, + "grad_norm": 20.360877990722656, + "learning_rate": 4.3564183280867435e-07, + "logps/chosen": -48.447593688964844, + "logps/rejected": -61.438575744628906, + "loss": 0.4941, + "losses/dpo": 0.6078076362609863, + "losses/sft": 1.8867300748825073, + "losses/total": 0.6078076362609863, + "ref_logps/chosen": -39.92787170410156, + "ref_logps/rejected": -45.262115478515625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.851972222328186, + "rewards/margins": 0.7656738758087158, + "rewards/rejected": -1.6176462173461914, + "step": 686 + }, + { + "epoch": 0.65, + "grad_norm": 23.67317008972168, + "learning_rate": 4.3546694648478486e-07, + "logps/chosen": -47.97858810424805, + "logps/rejected": -73.85092163085938, + "loss": 0.6241, + "losses/dpo": 0.40187883377075195, + "losses/sft": 1.4718133211135864, + "losses/total": 0.40187883377075195, + "ref_logps/chosen": -35.94654846191406, + "ref_logps/rejected": -54.152320861816406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2032039165496826, + "rewards/margins": 0.7666558027267456, + "rewards/rejected": -1.9698598384857178, + "step": 687 + }, + { + "epoch": 0.65, + "grad_norm": 19.96158790588379, + "learning_rate": 4.3529206016089543e-07, + "logps/chosen": -41.90129470825195, + "logps/rejected": -55.30469512939453, + "loss": 0.6205, + "losses/dpo": 0.6631141901016235, + "losses/sft": 1.4098172187805176, + "losses/total": 0.6631141901016235, + "ref_logps/chosen": -33.053680419921875, + "ref_logps/rejected": -42.67448043823242, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8847612142562866, + "rewards/margins": 0.37826040387153625, + "rewards/rejected": -1.2630215883255005, + "step": 688 + }, + { + "epoch": 0.65, + "grad_norm": 27.99469757080078, + "learning_rate": 4.351171738370059e-07, + "logps/chosen": -51.00370788574219, + "logps/rejected": -55.13591003417969, + "loss": 0.8336, + "losses/dpo": 1.0738999843597412, + "losses/sft": 1.9919081926345825, + "losses/total": 1.0738999843597412, + "ref_logps/chosen": -39.451507568359375, + "ref_logps/rejected": -44.67042541503906, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1552196741104126, + "rewards/margins": -0.10867124050855637, + "rewards/rejected": -1.0465483665466309, + "step": 689 + }, + { + "epoch": 0.65, + "grad_norm": 28.0861873626709, + "learning_rate": 4.3494228751311645e-07, + "logps/chosen": -44.46216583251953, + "logps/rejected": -49.33219528198242, + "loss": 0.8589, + "losses/dpo": 1.1967506408691406, + "losses/sft": 1.7002787590026855, + "losses/total": 1.1967506408691406, + "ref_logps/chosen": -32.268699645996094, + "ref_logps/rejected": -38.94769287109375, + "rewards/accuracies": 0.25, + "rewards/chosen": -1.2193467617034912, + "rewards/margins": -0.18089626729488373, + "rewards/rejected": -1.0384504795074463, + "step": 690 + }, + { + "epoch": 0.65, + "grad_norm": 20.783323287963867, + "learning_rate": 4.34767401189227e-07, + "logps/chosen": -51.34596252441406, + "logps/rejected": -57.26661682128906, + "loss": 0.6163, + "losses/dpo": 0.7968945503234863, + "losses/sft": 1.4953840970993042, + "losses/total": 0.7968945503234863, + "ref_logps/chosen": -42.618202209472656, + "ref_logps/rejected": -44.93904495239258, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8727757930755615, + "rewards/margins": 0.35998106002807617, + "rewards/rejected": -1.2327568531036377, + "step": 691 + }, + { + "epoch": 0.65, + "grad_norm": 22.957321166992188, + "learning_rate": 4.345925148653375e-07, + "logps/chosen": -48.171058654785156, + "logps/rejected": -51.01253128051758, + "loss": 0.871, + "losses/dpo": 0.7004855871200562, + "losses/sft": 1.5449622869491577, + "losses/total": 0.7004855871200562, + "ref_logps/chosen": -37.79178237915039, + "ref_logps/rejected": -39.620338439941406, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0379281044006348, + "rewards/margins": 0.10129156708717346, + "rewards/rejected": -1.1392196416854858, + "step": 692 + }, + { + "epoch": 0.65, + "grad_norm": 22.219606399536133, + "learning_rate": 4.3441762854144804e-07, + "logps/chosen": -50.45658874511719, + "logps/rejected": -68.93952178955078, + "loss": 0.5351, + "losses/dpo": 0.6337833404541016, + "losses/sft": 1.8166413307189941, + "losses/total": 0.6337833404541016, + "ref_logps/chosen": -41.516822814941406, + "ref_logps/rejected": -53.788997650146484, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.893976628780365, + "rewards/margins": 0.6210753917694092, + "rewards/rejected": -1.515052080154419, + "step": 693 + }, + { + "epoch": 0.66, + "grad_norm": 16.8718204498291, + "learning_rate": 4.3424274221755856e-07, + "logps/chosen": -44.262779235839844, + "logps/rejected": -65.33273315429688, + "loss": 0.4547, + "losses/dpo": 0.5813052654266357, + "losses/sft": 1.791420817375183, + "losses/total": 0.5813052654266357, + "ref_logps/chosen": -33.40455627441406, + "ref_logps/rejected": -46.93719482421875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0858221054077148, + "rewards/margins": 0.7537323832511902, + "rewards/rejected": -1.8395545482635498, + "step": 694 + }, + { + "epoch": 0.66, + "grad_norm": 18.257749557495117, + "learning_rate": 4.340678558936691e-07, + "logps/chosen": -33.0558967590332, + "logps/rejected": -49.74082946777344, + "loss": 0.5253, + "losses/dpo": 0.583354115486145, + "losses/sft": 1.278853178024292, + "losses/total": 0.583354115486145, + "ref_logps/chosen": -25.848865509033203, + "ref_logps/rejected": -36.68687438964844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7207031846046448, + "rewards/margins": 0.5846923589706421, + "rewards/rejected": -1.3053956031799316, + "step": 695 + }, + { + "epoch": 0.66, + "grad_norm": 24.73470115661621, + "learning_rate": 4.3389296956977963e-07, + "logps/chosen": -49.288055419921875, + "logps/rejected": -59.290550231933594, + "loss": 0.6199, + "losses/dpo": 0.5122953653335571, + "losses/sft": 1.709577202796936, + "losses/total": 0.5122953653335571, + "ref_logps/chosen": -39.45866394042969, + "ref_logps/rejected": -44.76314163208008, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9829391241073608, + "rewards/margins": 0.4698018431663513, + "rewards/rejected": -1.4527409076690674, + "step": 696 + }, + { + "epoch": 0.66, + "grad_norm": 17.934207916259766, + "learning_rate": 4.3371808324589015e-07, + "logps/chosen": -37.41145324707031, + "logps/rejected": -51.14091873168945, + "loss": 0.6256, + "losses/dpo": 0.9151169657707214, + "losses/sft": 1.8010318279266357, + "losses/total": 0.9151169657707214, + "ref_logps/chosen": -29.690553665161133, + "ref_logps/rejected": -37.15107727050781, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7720898985862732, + "rewards/margins": 0.6268945336341858, + "rewards/rejected": -1.398984432220459, + "step": 697 + }, + { + "epoch": 0.66, + "grad_norm": 15.5745849609375, + "learning_rate": 4.335431969220007e-07, + "logps/chosen": -36.08177185058594, + "logps/rejected": -47.9534797668457, + "loss": 0.5665, + "losses/dpo": 0.5802960395812988, + "losses/sft": 1.8154224157333374, + "losses/total": 0.5802960395812988, + "ref_logps/chosen": -27.115325927734375, + "ref_logps/rejected": -34.380088806152344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8966445326805115, + "rewards/margins": 0.4606950879096985, + "rewards/rejected": -1.35733962059021, + "step": 698 + }, + { + "epoch": 0.66, + "grad_norm": 16.618440628051758, + "learning_rate": 4.3336831059811117e-07, + "logps/chosen": -44.092445373535156, + "logps/rejected": -50.84418487548828, + "loss": 0.5017, + "losses/dpo": 0.5852111577987671, + "losses/sft": 1.4667259454727173, + "losses/total": 0.5852111577987671, + "ref_logps/chosen": -35.301414489746094, + "ref_logps/rejected": -36.55914306640625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8791036605834961, + "rewards/margins": 0.5494003891944885, + "rewards/rejected": -1.4285039901733398, + "step": 699 + }, + { + "epoch": 0.66, + "grad_norm": 17.073543548583984, + "learning_rate": 4.3319342427422174e-07, + "logps/chosen": -46.26211166381836, + "logps/rejected": -67.36860656738281, + "loss": 0.3976, + "losses/dpo": 0.7678629159927368, + "losses/sft": 1.8686453104019165, + "losses/total": 0.7678629159927368, + "ref_logps/chosen": -39.48162841796875, + "ref_logps/rejected": -50.303245544433594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6780481934547424, + "rewards/margins": 1.0284874439239502, + "rewards/rejected": -1.7065356969833374, + "step": 700 + }, + { + "epoch": 0.66, + "grad_norm": 19.845348358154297, + "learning_rate": 4.3301853795033225e-07, + "logps/chosen": -45.529266357421875, + "logps/rejected": -47.47041320800781, + "loss": 0.713, + "losses/dpo": 0.8201539516448975, + "losses/sft": 1.7441099882125854, + "losses/total": 0.8201539516448975, + "ref_logps/chosen": -35.924644470214844, + "ref_logps/rejected": -35.18165588378906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.960462749004364, + "rewards/margins": 0.26841315627098083, + "rewards/rejected": -1.2288758754730225, + "step": 701 + }, + { + "epoch": 0.66, + "grad_norm": 27.96426773071289, + "learning_rate": 4.328436516264428e-07, + "logps/chosen": -56.74497985839844, + "logps/rejected": -58.99053192138672, + "loss": 0.7965, + "losses/dpo": 1.0811152458190918, + "losses/sft": 2.0527267456054688, + "losses/total": 1.0811152458190918, + "ref_logps/chosen": -46.56687927246094, + "ref_logps/rejected": -48.54755401611328, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0178104639053345, + "rewards/margins": 0.02648720145225525, + "rewards/rejected": -1.0442975759506226, + "step": 702 + }, + { + "epoch": 0.66, + "grad_norm": 23.73536491394043, + "learning_rate": 4.3266876530255333e-07, + "logps/chosen": -53.9489631652832, + "logps/rejected": -67.30574035644531, + "loss": 0.6054, + "losses/dpo": 0.24673229455947876, + "losses/sft": 1.821505069732666, + "losses/total": 0.24673229455947876, + "ref_logps/chosen": -43.61452865600586, + "ref_logps/rejected": -52.201171875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0334436893463135, + "rewards/margins": 0.47701317071914673, + "rewards/rejected": -1.5104568004608154, + "step": 703 + }, + { + "epoch": 0.66, + "grad_norm": 16.407470703125, + "learning_rate": 4.3249387897866384e-07, + "logps/chosen": -27.682621002197266, + "logps/rejected": -50.503944396972656, + "loss": 0.5236, + "losses/dpo": 0.49027788639068604, + "losses/sft": 1.0848276615142822, + "losses/total": 0.49027788639068604, + "ref_logps/chosen": -22.123821258544922, + "ref_logps/rejected": -38.551002502441406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.555880069732666, + "rewards/margins": 0.6394139528274536, + "rewards/rejected": -1.19529390335083, + "step": 704 + }, + { + "epoch": 0.67, + "grad_norm": 21.02859878540039, + "learning_rate": 4.323189926547744e-07, + "logps/chosen": -46.8189697265625, + "logps/rejected": -59.234222412109375, + "loss": 0.5939, + "losses/dpo": 0.631971001625061, + "losses/sft": 1.6848599910736084, + "losses/total": 0.631971001625061, + "ref_logps/chosen": -38.235374450683594, + "ref_logps/rejected": -45.48722457885742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8583594560623169, + "rewards/margins": 0.5163403153419495, + "rewards/rejected": -1.3746997117996216, + "step": 705 + }, + { + "epoch": 0.67, + "grad_norm": 23.449262619018555, + "learning_rate": 4.3214410633088487e-07, + "logps/chosen": -48.120262145996094, + "logps/rejected": -59.29852294921875, + "loss": 0.6985, + "losses/dpo": 0.525850772857666, + "losses/sft": 1.7156158685684204, + "losses/total": 0.525850772857666, + "ref_logps/chosen": -39.4202995300293, + "ref_logps/rejected": -49.22132110595703, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8699963688850403, + "rewards/margins": 0.1377238780260086, + "rewards/rejected": -1.0077202320098877, + "step": 706 + }, + { + "epoch": 0.67, + "grad_norm": 18.30802345275879, + "learning_rate": 4.3196922000699543e-07, + "logps/chosen": -44.236473083496094, + "logps/rejected": -48.68854522705078, + "loss": 0.6107, + "losses/dpo": 0.5337749123573303, + "losses/sft": 1.376046895980835, + "losses/total": 0.5337749123573303, + "ref_logps/chosen": -36.16105651855469, + "ref_logps/rejected": -37.10627746582031, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8075416684150696, + "rewards/margins": 0.3506850600242615, + "rewards/rejected": -1.158226728439331, + "step": 707 + }, + { + "epoch": 0.67, + "grad_norm": 14.369156837463379, + "learning_rate": 4.3179433368310595e-07, + "logps/chosen": -36.92255401611328, + "logps/rejected": -51.405059814453125, + "loss": 0.4603, + "losses/dpo": 0.4892197251319885, + "losses/sft": 1.0883471965789795, + "losses/total": 0.4892197251319885, + "ref_logps/chosen": -32.85285186767578, + "ref_logps/rejected": -39.484031677246094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40697014331817627, + "rewards/margins": 0.7851329445838928, + "rewards/rejected": -1.1921030282974243, + "step": 708 + }, + { + "epoch": 0.67, + "grad_norm": 19.65743064880371, + "learning_rate": 4.316194473592165e-07, + "logps/chosen": -34.99608612060547, + "logps/rejected": -43.15175247192383, + "loss": 0.6552, + "losses/dpo": 0.4912104606628418, + "losses/sft": 1.3308316469192505, + "losses/total": 0.4912104606628418, + "ref_logps/chosen": -30.362171173095703, + "ref_logps/rejected": -36.70307922363281, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4633915424346924, + "rewards/margins": 0.1814756691455841, + "rewards/rejected": -0.6448671817779541, + "step": 709 + }, + { + "epoch": 0.67, + "grad_norm": 19.184167861938477, + "learning_rate": 4.31444561035327e-07, + "logps/chosen": -41.29024887084961, + "logps/rejected": -45.53723907470703, + "loss": 0.5721, + "losses/dpo": 0.3396609425544739, + "losses/sft": 1.4803591966629028, + "losses/total": 0.3396609425544739, + "ref_logps/chosen": -36.41984939575195, + "ref_logps/rejected": -35.92561340332031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.48704004287719727, + "rewards/margins": 0.4741221070289612, + "rewards/rejected": -0.9611622095108032, + "step": 710 + }, + { + "epoch": 0.67, + "grad_norm": 17.922670364379883, + "learning_rate": 4.3126967471143754e-07, + "logps/chosen": -38.75063705444336, + "logps/rejected": -49.31770324707031, + "loss": 0.5491, + "losses/dpo": 0.5774656534194946, + "losses/sft": 1.4425379037857056, + "losses/total": 0.5774656534194946, + "ref_logps/chosen": -33.69990921020508, + "ref_logps/rejected": -39.866554260253906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5050727725028992, + "rewards/margins": 0.4400426149368286, + "rewards/rejected": -0.945115327835083, + "step": 711 + }, + { + "epoch": 0.67, + "grad_norm": 12.123807907104492, + "learning_rate": 4.310947883875481e-07, + "logps/chosen": -27.21611213684082, + "logps/rejected": -45.500240325927734, + "loss": 0.4194, + "losses/dpo": 0.45889779925346375, + "losses/sft": 2.3296761512756348, + "losses/total": 0.45889779925346375, + "ref_logps/chosen": -23.09859848022461, + "ref_logps/rejected": -32.90995788574219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.41175127029418945, + "rewards/margins": 0.8472768068313599, + "rewards/rejected": -1.2590280771255493, + "step": 712 + }, + { + "epoch": 0.67, + "grad_norm": 16.523786544799805, + "learning_rate": 4.3091990206365856e-07, + "logps/chosen": -43.260677337646484, + "logps/rejected": -72.78237915039062, + "loss": 0.4426, + "losses/dpo": 0.1860278695821762, + "losses/sft": 1.7062638998031616, + "losses/total": 0.1860278695821762, + "ref_logps/chosen": -36.98164749145508, + "ref_logps/rejected": -56.80029296875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6279028654098511, + "rewards/margins": 0.9703060388565063, + "rewards/rejected": -1.5982087850570679, + "step": 713 + }, + { + "epoch": 0.67, + "grad_norm": 12.94727611541748, + "learning_rate": 4.3074501573976913e-07, + "logps/chosen": -31.61503028869629, + "logps/rejected": -45.780914306640625, + "loss": 0.3611, + "losses/dpo": 0.3345387876033783, + "losses/sft": 1.019202709197998, + "losses/total": 0.3345387876033783, + "ref_logps/chosen": -28.76348876953125, + "ref_logps/rejected": -33.738136291503906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2851543426513672, + "rewards/margins": 0.9191235303878784, + "rewards/rejected": -1.204277753829956, + "step": 714 + }, + { + "epoch": 0.68, + "grad_norm": 15.002126693725586, + "learning_rate": 4.305701294158797e-07, + "logps/chosen": -34.27792739868164, + "logps/rejected": -49.252220153808594, + "loss": 0.4541, + "losses/dpo": 0.5120267868041992, + "losses/sft": 1.7566144466400146, + "losses/total": 0.5120267868041992, + "ref_logps/chosen": -29.74795150756836, + "ref_logps/rejected": -36.67368698120117, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4529974162578583, + "rewards/margins": 0.8048561811447144, + "rewards/rejected": -1.2578535079956055, + "step": 715 + }, + { + "epoch": 0.68, + "grad_norm": 23.409090042114258, + "learning_rate": 4.303952430919902e-07, + "logps/chosen": -57.65196228027344, + "logps/rejected": -58.61241912841797, + "loss": 0.6757, + "losses/dpo": 0.37955763936042786, + "losses/sft": 1.9902818202972412, + "losses/total": 0.37955763936042786, + "ref_logps/chosen": -48.691795349121094, + "ref_logps/rejected": -45.262969970703125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8960169553756714, + "rewards/margins": 0.43892765045166016, + "rewards/rejected": -1.334944486618042, + "step": 716 + }, + { + "epoch": 0.68, + "grad_norm": 20.81814956665039, + "learning_rate": 4.302203567681007e-07, + "logps/chosen": -52.60441207885742, + "logps/rejected": -54.37532043457031, + "loss": 0.6278, + "losses/dpo": 0.6902827024459839, + "losses/sft": 1.4632818698883057, + "losses/total": 0.6902827024459839, + "ref_logps/chosen": -45.165748596191406, + "ref_logps/rejected": -43.0654296875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7438666224479675, + "rewards/margins": 0.3871225118637085, + "rewards/rejected": -1.1309890747070312, + "step": 717 + }, + { + "epoch": 0.68, + "grad_norm": 18.56036949157715, + "learning_rate": 4.3004547044421123e-07, + "logps/chosen": -48.89460754394531, + "logps/rejected": -73.34172058105469, + "loss": 0.4418, + "losses/dpo": 0.3784174621105194, + "losses/sft": 1.4442063570022583, + "losses/total": 0.3784174621105194, + "ref_logps/chosen": -38.736427307128906, + "ref_logps/rejected": -52.50584411621094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.015817642211914, + "rewards/margins": 1.0677695274353027, + "rewards/rejected": -2.083587169647217, + "step": 718 + }, + { + "epoch": 0.68, + "grad_norm": 18.991525650024414, + "learning_rate": 4.298705841203218e-07, + "logps/chosen": -46.43339157104492, + "logps/rejected": -58.17090606689453, + "loss": 0.6159, + "losses/dpo": 0.5434184074401855, + "losses/sft": 1.0356076955795288, + "losses/total": 0.5434184074401855, + "ref_logps/chosen": -36.94187927246094, + "ref_logps/rejected": -45.233558654785156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9491511583328247, + "rewards/margins": 0.3445841372013092, + "rewards/rejected": -1.2937352657318115, + "step": 719 + }, + { + "epoch": 0.68, + "grad_norm": 14.975947380065918, + "learning_rate": 4.2969569779643226e-07, + "logps/chosen": -42.52625274658203, + "logps/rejected": -54.581756591796875, + "loss": 0.4927, + "losses/dpo": 0.4607549011707306, + "losses/sft": 1.430629014968872, + "losses/total": 0.4607549011707306, + "ref_logps/chosen": -34.63993835449219, + "ref_logps/rejected": -40.030967712402344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7886309027671814, + "rewards/margins": 0.6664482355117798, + "rewards/rejected": -1.4550790786743164, + "step": 720 + }, + { + "epoch": 0.68, + "grad_norm": 16.38325309753418, + "learning_rate": 4.295208114725428e-07, + "logps/chosen": -41.7340087890625, + "logps/rejected": -52.14056396484375, + "loss": 0.5441, + "losses/dpo": 0.5851291418075562, + "losses/sft": 1.5170259475708008, + "losses/total": 0.5851291418075562, + "ref_logps/chosen": -33.08143615722656, + "ref_logps/rejected": -38.7519645690918, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8652572631835938, + "rewards/margins": 0.47360312938690186, + "rewards/rejected": -1.3388605117797852, + "step": 721 + }, + { + "epoch": 0.68, + "grad_norm": 21.832866668701172, + "learning_rate": 4.293459251486534e-07, + "logps/chosen": -43.31774139404297, + "logps/rejected": -62.20421600341797, + "loss": 0.6606, + "losses/dpo": 0.7681498527526855, + "losses/sft": 1.543290376663208, + "losses/total": 0.7681498527526855, + "ref_logps/chosen": -34.86967849731445, + "ref_logps/rejected": -47.756134033203125, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8448069095611572, + "rewards/margins": 0.6000010967254639, + "rewards/rejected": -1.444808006286621, + "step": 722 + }, + { + "epoch": 0.68, + "grad_norm": 18.4277400970459, + "learning_rate": 4.291710388247639e-07, + "logps/chosen": -51.22413635253906, + "logps/rejected": -60.75074005126953, + "loss": 0.5454, + "losses/dpo": 0.44816651940345764, + "losses/sft": 1.9517427682876587, + "losses/total": 0.44816651940345764, + "ref_logps/chosen": -43.811912536621094, + "ref_logps/rejected": -47.564964294433594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7412219047546387, + "rewards/margins": 0.5773557424545288, + "rewards/rejected": -1.3185776472091675, + "step": 723 + }, + { + "epoch": 0.68, + "grad_norm": 20.5158748626709, + "learning_rate": 4.289961525008744e-07, + "logps/chosen": -45.1531867980957, + "logps/rejected": -66.62257385253906, + "loss": 0.5519, + "losses/dpo": 0.5188019871711731, + "losses/sft": 1.0994102954864502, + "losses/total": 0.5188019871711731, + "ref_logps/chosen": -34.660316467285156, + "ref_logps/rejected": -49.06583786010742, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0492873191833496, + "rewards/margins": 0.7063854336738586, + "rewards/rejected": -1.7556726932525635, + "step": 724 + }, + { + "epoch": 0.68, + "grad_norm": 21.37078857421875, + "learning_rate": 4.288212661769849e-07, + "logps/chosen": -50.27540588378906, + "logps/rejected": -67.41368103027344, + "loss": 0.6139, + "losses/dpo": 0.3498527407646179, + "losses/sft": 1.631231427192688, + "losses/total": 0.3498527407646179, + "ref_logps/chosen": -41.15169906616211, + "ref_logps/rejected": -52.33612823486328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9123706817626953, + "rewards/margins": 0.5953848958015442, + "rewards/rejected": -1.5077555179595947, + "step": 725 + }, + { + "epoch": 0.69, + "grad_norm": 21.714824676513672, + "learning_rate": 4.286463798530955e-07, + "logps/chosen": -50.311920166015625, + "logps/rejected": -50.24692153930664, + "loss": 0.6893, + "losses/dpo": 0.8099931478500366, + "losses/sft": 1.538763165473938, + "losses/total": 0.8099931478500366, + "ref_logps/chosen": -42.77843475341797, + "ref_logps/rejected": -41.30024337768555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7533482313156128, + "rewards/margins": 0.14131946861743927, + "rewards/rejected": -0.8946677446365356, + "step": 726 + }, + { + "epoch": 0.69, + "grad_norm": 18.687244415283203, + "learning_rate": 4.2847149352920595e-07, + "logps/chosen": -48.927268981933594, + "logps/rejected": -56.550621032714844, + "loss": 0.6577, + "losses/dpo": 0.7903240919113159, + "losses/sft": 1.6181057691574097, + "losses/total": 0.7903240919113159, + "ref_logps/chosen": -40.08326721191406, + "ref_logps/rejected": -43.57933044433594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8843997716903687, + "rewards/margins": 0.41272953152656555, + "rewards/rejected": -1.2971292734146118, + "step": 727 + }, + { + "epoch": 0.69, + "grad_norm": 18.43297004699707, + "learning_rate": 4.282966072053165e-07, + "logps/chosen": -45.604248046875, + "logps/rejected": -67.88539123535156, + "loss": 0.4841, + "losses/dpo": 0.281689316034317, + "losses/sft": 1.3659275770187378, + "losses/total": 0.281689316034317, + "ref_logps/chosen": -36.35345458984375, + "ref_logps/rejected": -51.47601318359375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9250791072845459, + "rewards/margins": 0.7158592343330383, + "rewards/rejected": -1.6409382820129395, + "step": 728 + }, + { + "epoch": 0.69, + "grad_norm": 18.301931381225586, + "learning_rate": 4.281217208814271e-07, + "logps/chosen": -46.586639404296875, + "logps/rejected": -55.49821472167969, + "loss": 0.513, + "losses/dpo": 0.4403191804885864, + "losses/sft": 1.4714233875274658, + "losses/total": 0.4403191804885864, + "ref_logps/chosen": -38.97897720336914, + "ref_logps/rejected": -41.933082580566406, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7607663869857788, + "rewards/margins": 0.5957470536231995, + "rewards/rejected": -1.356513500213623, + "step": 729 + }, + { + "epoch": 0.69, + "grad_norm": 16.465715408325195, + "learning_rate": 4.279468345575376e-07, + "logps/chosen": -33.934410095214844, + "logps/rejected": -49.73075866699219, + "loss": 0.6093, + "losses/dpo": 0.7005358934402466, + "losses/sft": 1.7313860654830933, + "losses/total": 0.7005358934402466, + "ref_logps/chosen": -27.16747283935547, + "ref_logps/rejected": -39.068580627441406, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6766937375068665, + "rewards/margins": 0.3895241618156433, + "rewards/rejected": -1.0662178993225098, + "step": 730 + }, + { + "epoch": 0.69, + "grad_norm": 20.65587043762207, + "learning_rate": 4.277719482336481e-07, + "logps/chosen": -44.32313537597656, + "logps/rejected": -36.8135871887207, + "loss": 0.6444, + "losses/dpo": 0.5208037495613098, + "losses/sft": 1.4644453525543213, + "losses/total": 0.5208037495613098, + "ref_logps/chosen": -37.35309600830078, + "ref_logps/rejected": -26.82744789123535, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6970037221908569, + "rewards/margins": 0.3016101121902466, + "rewards/rejected": -0.9986138343811035, + "step": 731 + }, + { + "epoch": 0.69, + "grad_norm": 30.098657608032227, + "learning_rate": 4.275970619097586e-07, + "logps/chosen": -51.476871490478516, + "logps/rejected": -47.054290771484375, + "loss": 0.8557, + "losses/dpo": 1.655027151107788, + "losses/sft": 2.198712110519409, + "losses/total": 1.655027151107788, + "ref_logps/chosen": -40.543113708496094, + "ref_logps/rejected": -36.189697265625, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0933759212493896, + "rewards/margins": -0.006916806101799011, + "rewards/rejected": -1.0864590406417847, + "step": 732 + }, + { + "epoch": 0.69, + "grad_norm": 18.317766189575195, + "learning_rate": 4.274221755858692e-07, + "logps/chosen": -50.106510162353516, + "logps/rejected": -66.85099792480469, + "loss": 0.5015, + "losses/dpo": 0.6240159273147583, + "losses/sft": 1.5468125343322754, + "losses/total": 0.6240159273147583, + "ref_logps/chosen": -41.568607330322266, + "ref_logps/rejected": -51.991661071777344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.853790283203125, + "rewards/margins": 0.6321431398391724, + "rewards/rejected": -1.4859334230422974, + "step": 733 + }, + { + "epoch": 0.69, + "grad_norm": 20.739360809326172, + "learning_rate": 4.272472892619797e-07, + "logps/chosen": -43.271549224853516, + "logps/rejected": -48.183555603027344, + "loss": 0.6991, + "losses/dpo": 0.6731064915657043, + "losses/sft": 1.7340065240859985, + "losses/total": 0.6731064915657043, + "ref_logps/chosen": -33.46013641357422, + "ref_logps/rejected": -35.56556701660156, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.981141209602356, + "rewards/margins": 0.2806575298309326, + "rewards/rejected": -1.261798620223999, + "step": 734 + }, + { + "epoch": 0.69, + "grad_norm": 22.32449722290039, + "learning_rate": 4.270724029380902e-07, + "logps/chosen": -43.6602783203125, + "logps/rejected": -42.747215270996094, + "loss": 0.711, + "losses/dpo": 0.34072479605674744, + "losses/sft": 1.4519860744476318, + "losses/total": 0.34072479605674744, + "ref_logps/chosen": -35.907073974609375, + "ref_logps/rejected": -32.58649444580078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7753205299377441, + "rewards/margins": 0.2407512068748474, + "rewards/rejected": -1.0160717964172363, + "step": 735 + }, + { + "epoch": 0.69, + "grad_norm": 17.203248977661133, + "learning_rate": 4.268975166142008e-07, + "logps/chosen": -38.31851577758789, + "logps/rejected": -52.45063018798828, + "loss": 0.5077, + "losses/dpo": 0.7386688590049744, + "losses/sft": 1.5262503623962402, + "losses/total": 0.7386688590049744, + "ref_logps/chosen": -30.972883224487305, + "ref_logps/rejected": -39.24861526489258, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7345629930496216, + "rewards/margins": 0.5856384038925171, + "rewards/rejected": -1.3202013969421387, + "step": 736 + }, + { + "epoch": 0.7, + "grad_norm": 20.836872100830078, + "learning_rate": 4.267226302903113e-07, + "logps/chosen": -40.628726959228516, + "logps/rejected": -53.397701263427734, + "loss": 0.6443, + "losses/dpo": 0.9054415225982666, + "losses/sft": 1.6336662769317627, + "losses/total": 0.9054415225982666, + "ref_logps/chosen": -31.981647491455078, + "ref_logps/rejected": -41.35036087036133, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8647080659866333, + "rewards/margins": 0.3400261402130127, + "rewards/rejected": -1.204734206199646, + "step": 737 + }, + { + "epoch": 0.7, + "grad_norm": 18.280460357666016, + "learning_rate": 4.265477439664218e-07, + "logps/chosen": -43.923126220703125, + "logps/rejected": -53.445716857910156, + "loss": 0.5706, + "losses/dpo": 0.5863304138183594, + "losses/sft": 0.8763689398765564, + "losses/total": 0.5863304138183594, + "ref_logps/chosen": -36.32550811767578, + "ref_logps/rejected": -42.005767822265625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7597619295120239, + "rewards/margins": 0.3842333257198334, + "rewards/rejected": -1.1439952850341797, + "step": 738 + }, + { + "epoch": 0.7, + "grad_norm": 20.237655639648438, + "learning_rate": 4.263728576425323e-07, + "logps/chosen": -50.909202575683594, + "logps/rejected": -49.70928192138672, + "loss": 0.6183, + "losses/dpo": 0.46446093916893005, + "losses/sft": 1.925357699394226, + "losses/total": 0.46446093916893005, + "ref_logps/chosen": -42.42932891845703, + "ref_logps/rejected": -37.705322265625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8479871153831482, + "rewards/margins": 0.352408766746521, + "rewards/rejected": -1.200395941734314, + "step": 739 + }, + { + "epoch": 0.7, + "grad_norm": 22.662113189697266, + "learning_rate": 4.261979713186429e-07, + "logps/chosen": -47.26714324951172, + "logps/rejected": -73.65606689453125, + "loss": 0.5853, + "losses/dpo": 0.528915286064148, + "losses/sft": 1.3249739408493042, + "losses/total": 0.528915286064148, + "ref_logps/chosen": -38.37602996826172, + "ref_logps/rejected": -60.28350830078125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8891115188598633, + "rewards/margins": 0.4481440782546997, + "rewards/rejected": -1.3372557163238525, + "step": 740 + }, + { + "epoch": 0.7, + "grad_norm": 24.209074020385742, + "learning_rate": 4.260230849947534e-07, + "logps/chosen": -45.871097564697266, + "logps/rejected": -60.66228103637695, + "loss": 0.7298, + "losses/dpo": 1.0251754522323608, + "losses/sft": 2.0724728107452393, + "losses/total": 1.0251754522323608, + "ref_logps/chosen": -37.298606872558594, + "ref_logps/rejected": -50.359161376953125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.857248842716217, + "rewards/margins": 0.17306344211101532, + "rewards/rejected": -1.0303122997283936, + "step": 741 + }, + { + "epoch": 0.7, + "grad_norm": 21.85954475402832, + "learning_rate": 4.258481986708639e-07, + "logps/chosen": -46.614173889160156, + "logps/rejected": -60.85839080810547, + "loss": 0.6027, + "losses/dpo": 0.5607704520225525, + "losses/sft": 1.6466827392578125, + "losses/total": 0.5607704520225525, + "ref_logps/chosen": -39.28166580200195, + "ref_logps/rejected": -48.896339416503906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7332503795623779, + "rewards/margins": 0.46295464038848877, + "rewards/rejected": -1.1962049007415771, + "step": 742 + }, + { + "epoch": 0.7, + "grad_norm": 22.585887908935547, + "learning_rate": 4.2567331234697447e-07, + "logps/chosen": -46.457496643066406, + "logps/rejected": -58.06674575805664, + "loss": 0.6909, + "losses/dpo": 0.5679949522018433, + "losses/sft": 1.2433321475982666, + "losses/total": 0.5679949522018433, + "ref_logps/chosen": -35.988277435302734, + "ref_logps/rejected": -45.05171203613281, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.046921968460083, + "rewards/margins": 0.2545810043811798, + "rewards/rejected": -1.3015029430389404, + "step": 743 + }, + { + "epoch": 0.7, + "grad_norm": 16.260671615600586, + "learning_rate": 4.25498426023085e-07, + "logps/chosen": -40.18232727050781, + "logps/rejected": -54.373538970947266, + "loss": 0.456, + "losses/dpo": 0.4967708885669708, + "losses/sft": 1.9124349355697632, + "losses/total": 0.4967708885669708, + "ref_logps/chosen": -33.05384063720703, + "ref_logps/rejected": -39.20069122314453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7128486037254333, + "rewards/margins": 0.8044363260269165, + "rewards/rejected": -1.517284870147705, + "step": 744 + }, + { + "epoch": 0.7, + "grad_norm": 19.383041381835938, + "learning_rate": 4.253235396991955e-07, + "logps/chosen": -40.77610778808594, + "logps/rejected": -45.19227600097656, + "loss": 0.5829, + "losses/dpo": 0.581690788269043, + "losses/sft": 1.2575770616531372, + "losses/total": 0.581690788269043, + "ref_logps/chosen": -34.16371536254883, + "ref_logps/rejected": -35.671607971191406, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6612394452095032, + "rewards/margins": 0.290827214717865, + "rewards/rejected": -0.9520666599273682, + "step": 745 + }, + { + "epoch": 0.7, + "grad_norm": 19.118419647216797, + "learning_rate": 4.25148653375306e-07, + "logps/chosen": -53.51045608520508, + "logps/rejected": -64.26415252685547, + "loss": 0.4855, + "losses/dpo": 0.5189401507377625, + "losses/sft": 1.8860023021697998, + "losses/total": 0.5189401507377625, + "ref_logps/chosen": -45.97468566894531, + "ref_logps/rejected": -49.77253723144531, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7535770535469055, + "rewards/margins": 0.6955845952033997, + "rewards/rejected": -1.4491616487503052, + "step": 746 + }, + { + "epoch": 0.71, + "grad_norm": 19.952722549438477, + "learning_rate": 4.249737670514166e-07, + "logps/chosen": -42.86393356323242, + "logps/rejected": -48.983543395996094, + "loss": 0.6327, + "losses/dpo": 0.7953437566757202, + "losses/sft": 1.6539349555969238, + "losses/total": 0.7953437566757202, + "ref_logps/chosen": -35.93853759765625, + "ref_logps/rejected": -39.69743728637695, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6925398111343384, + "rewards/margins": 0.23607105016708374, + "rewards/rejected": -0.9286109209060669, + "step": 747 + }, + { + "epoch": 0.71, + "grad_norm": 16.43267059326172, + "learning_rate": 4.247988807275271e-07, + "logps/chosen": -38.21826171875, + "logps/rejected": -61.831634521484375, + "loss": 0.4886, + "losses/dpo": 0.5306552648544312, + "losses/sft": 1.4700191020965576, + "losses/total": 0.5306552648544312, + "ref_logps/chosen": -31.346384048461914, + "ref_logps/rejected": -47.10458755493164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6871877908706665, + "rewards/margins": 0.785516619682312, + "rewards/rejected": -1.4727044105529785, + "step": 748 + }, + { + "epoch": 0.71, + "grad_norm": 18.212879180908203, + "learning_rate": 4.246239944036376e-07, + "logps/chosen": -46.521263122558594, + "logps/rejected": -62.309139251708984, + "loss": 0.4978, + "losses/dpo": 0.4854315221309662, + "losses/sft": 2.568373918533325, + "losses/total": 0.4854315221309662, + "ref_logps/chosen": -37.42906188964844, + "ref_logps/rejected": -47.113624572753906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9092199206352234, + "rewards/margins": 0.6103312969207764, + "rewards/rejected": -1.5195512771606445, + "step": 749 + }, + { + "epoch": 0.71, + "grad_norm": 20.696321487426758, + "learning_rate": 4.2444910807974816e-07, + "logps/chosen": -41.21089172363281, + "logps/rejected": -45.72018051147461, + "loss": 0.6021, + "losses/dpo": 0.6415588855743408, + "losses/sft": 1.5518860816955566, + "losses/total": 0.6415588855743408, + "ref_logps/chosen": -34.7630615234375, + "ref_logps/rejected": -35.60466766357422, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6447831392288208, + "rewards/margins": 0.36676809191703796, + "rewards/rejected": -1.0115511417388916, + "step": 750 + }, + { + "epoch": 0.71, + "grad_norm": 19.7783260345459, + "learning_rate": 4.242742217558587e-07, + "logps/chosen": -45.709781646728516, + "logps/rejected": -51.80164337158203, + "loss": 0.5186, + "losses/dpo": 0.3710705637931824, + "losses/sft": 1.8136144876480103, + "losses/total": 0.3710705637931824, + "ref_logps/chosen": -36.29664993286133, + "ref_logps/rejected": -35.79341506958008, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9413133263587952, + "rewards/margins": 0.6595095992088318, + "rewards/rejected": -1.600822925567627, + "step": 751 + }, + { + "epoch": 0.71, + "grad_norm": 18.367704391479492, + "learning_rate": 4.240993354319692e-07, + "logps/chosen": -47.46202087402344, + "logps/rejected": -72.6573715209961, + "loss": 0.3996, + "losses/dpo": 0.2256067395210266, + "losses/sft": 1.6897969245910645, + "losses/total": 0.2256067395210266, + "ref_logps/chosen": -39.82744216918945, + "ref_logps/rejected": -55.34682083129883, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7634580135345459, + "rewards/margins": 0.9675969481468201, + "rewards/rejected": -1.7310549020767212, + "step": 752 + }, + { + "epoch": 0.71, + "grad_norm": 19.741113662719727, + "learning_rate": 4.2392444910807976e-07, + "logps/chosen": -36.69940948486328, + "logps/rejected": -46.498069763183594, + "loss": 0.5511, + "losses/dpo": 0.2733551859855652, + "losses/sft": 0.9542773365974426, + "losses/total": 0.2733551859855652, + "ref_logps/chosen": -28.31500244140625, + "ref_logps/rejected": -32.472503662109375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8384406566619873, + "rewards/margins": 0.5641160011291504, + "rewards/rejected": -1.4025567770004272, + "step": 753 + }, + { + "epoch": 0.71, + "grad_norm": 19.360973358154297, + "learning_rate": 4.2374956278419027e-07, + "logps/chosen": -50.95995330810547, + "logps/rejected": -62.476322174072266, + "loss": 0.5458, + "losses/dpo": 0.3861052095890045, + "losses/sft": 1.8141791820526123, + "losses/total": 0.3861052095890045, + "ref_logps/chosen": -42.563636779785156, + "ref_logps/rejected": -48.6370849609375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8396315574645996, + "rewards/margins": 0.5442919731140137, + "rewards/rejected": -1.3839235305786133, + "step": 754 + }, + { + "epoch": 0.71, + "grad_norm": 23.597888946533203, + "learning_rate": 4.235746764603008e-07, + "logps/chosen": -54.756996154785156, + "logps/rejected": -68.79257202148438, + "loss": 0.7102, + "losses/dpo": 0.8296176195144653, + "losses/sft": 1.999776005744934, + "losses/total": 0.8296176195144653, + "ref_logps/chosen": -44.24373245239258, + "ref_logps/rejected": -54.635887145996094, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0513266324996948, + "rewards/margins": 0.36434170603752136, + "rewards/rejected": -1.415668249130249, + "step": 755 + }, + { + "epoch": 0.71, + "grad_norm": 22.130552291870117, + "learning_rate": 4.233997901364113e-07, + "logps/chosen": -42.543399810791016, + "logps/rejected": -53.249122619628906, + "loss": 0.6306, + "losses/dpo": 0.9714921712875366, + "losses/sft": 2.300448179244995, + "losses/total": 0.9714921712875366, + "ref_logps/chosen": -34.651344299316406, + "ref_logps/rejected": -42.10847473144531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7892056703567505, + "rewards/margins": 0.3248593211174011, + "rewards/rejected": -1.1140649318695068, + "step": 756 + }, + { + "epoch": 0.71, + "grad_norm": 20.522016525268555, + "learning_rate": 4.2322490381252186e-07, + "logps/chosen": -51.109954833984375, + "logps/rejected": -72.20364379882812, + "loss": 0.5195, + "losses/dpo": 0.4485291540622711, + "losses/sft": 1.6937845945358276, + "losses/total": 0.4485291540622711, + "ref_logps/chosen": -41.51011276245117, + "ref_logps/rejected": -55.568756103515625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9599847197532654, + "rewards/margins": 0.7035040855407715, + "rewards/rejected": -1.6634888648986816, + "step": 757 + }, + { + "epoch": 0.72, + "grad_norm": 20.583127975463867, + "learning_rate": 4.2305001748863237e-07, + "logps/chosen": -42.421775817871094, + "logps/rejected": -63.52996826171875, + "loss": 0.5248, + "losses/dpo": 0.7386205196380615, + "losses/sft": 1.3855749368667603, + "losses/total": 0.7386205196380615, + "ref_logps/chosen": -34.836395263671875, + "ref_logps/rejected": -48.709537506103516, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.758538007736206, + "rewards/margins": 0.723504900932312, + "rewards/rejected": -1.4820430278778076, + "step": 758 + }, + { + "epoch": 0.72, + "grad_norm": 20.102645874023438, + "learning_rate": 4.228751311647429e-07, + "logps/chosen": -49.17989730834961, + "logps/rejected": -57.92820358276367, + "loss": 0.6536, + "losses/dpo": 0.5786886811256409, + "losses/sft": 1.9223612546920776, + "losses/total": 0.5786886811256409, + "ref_logps/chosen": -37.882286071777344, + "ref_logps/rejected": -44.04322814941406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.129760980606079, + "rewards/margins": 0.2587363123893738, + "rewards/rejected": -1.3884973526000977, + "step": 759 + }, + { + "epoch": 0.72, + "grad_norm": 21.211217880249023, + "learning_rate": 4.2270024484085345e-07, + "logps/chosen": -54.49882507324219, + "logps/rejected": -66.64996337890625, + "loss": 0.5075, + "losses/dpo": 0.6029857397079468, + "losses/sft": 1.9275867938995361, + "losses/total": 0.6029857397079468, + "ref_logps/chosen": -43.71864318847656, + "ref_logps/rejected": -48.92106628417969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0780181884765625, + "rewards/margins": 0.6948716640472412, + "rewards/rejected": -1.7728897333145142, + "step": 760 + }, + { + "epoch": 0.72, + "grad_norm": 21.209583282470703, + "learning_rate": 4.2252535851696396e-07, + "logps/chosen": -43.100486755371094, + "logps/rejected": -46.91571807861328, + "loss": 0.6642, + "losses/dpo": 0.7757740616798401, + "losses/sft": 1.8444371223449707, + "losses/total": 0.7757740616798401, + "ref_logps/chosen": -32.9022102355957, + "ref_logps/rejected": -34.38334655761719, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0198278427124023, + "rewards/margins": 0.23340973258018494, + "rewards/rejected": -1.2532374858856201, + "step": 761 + }, + { + "epoch": 0.72, + "grad_norm": 24.357664108276367, + "learning_rate": 4.223504721930745e-07, + "logps/chosen": -45.410179138183594, + "logps/rejected": -46.113075256347656, + "loss": 0.8068, + "losses/dpo": 0.6233912706375122, + "losses/sft": 1.0533487796783447, + "losses/total": 0.6233912706375122, + "ref_logps/chosen": -36.20232009887695, + "ref_logps/rejected": -34.531959533691406, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.920785665512085, + "rewards/margins": 0.2373262494802475, + "rewards/rejected": -1.158111810684204, + "step": 762 + }, + { + "epoch": 0.72, + "grad_norm": 15.830036163330078, + "learning_rate": 4.22175585869185e-07, + "logps/chosen": -42.101783752441406, + "logps/rejected": -67.05045318603516, + "loss": 0.3771, + "losses/dpo": 0.4679235816001892, + "losses/sft": 1.8049534559249878, + "losses/total": 0.4679235816001892, + "ref_logps/chosen": -36.04362487792969, + "ref_logps/rejected": -50.57368469238281, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6058157682418823, + "rewards/margins": 1.0418604612350464, + "rewards/rejected": -1.6476762294769287, + "step": 763 + }, + { + "epoch": 0.72, + "grad_norm": 20.92279052734375, + "learning_rate": 4.2200069954529555e-07, + "logps/chosen": -47.54430389404297, + "logps/rejected": -56.36003875732422, + "loss": 0.5923, + "losses/dpo": 0.5802595615386963, + "losses/sft": 1.6230095624923706, + "losses/total": 0.5802595615386963, + "ref_logps/chosen": -37.64332580566406, + "ref_logps/rejected": -41.81571960449219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9900981187820435, + "rewards/margins": 0.464333713054657, + "rewards/rejected": -1.4544317722320557, + "step": 764 + }, + { + "epoch": 0.72, + "grad_norm": 23.231075286865234, + "learning_rate": 4.2182581322140607e-07, + "logps/chosen": -62.5670280456543, + "logps/rejected": -71.05088806152344, + "loss": 0.5894, + "losses/dpo": 0.4304642975330353, + "losses/sft": 1.9444787502288818, + "losses/total": 0.4304642975330353, + "ref_logps/chosen": -52.799781799316406, + "ref_logps/rejected": -54.82190704345703, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9767248630523682, + "rewards/margins": 0.6461727619171143, + "rewards/rejected": -1.6228976249694824, + "step": 765 + }, + { + "epoch": 0.72, + "grad_norm": 19.713966369628906, + "learning_rate": 4.216509268975166e-07, + "logps/chosen": -54.23445510864258, + "logps/rejected": -56.89936447143555, + "loss": 0.6581, + "losses/dpo": 0.9080683588981628, + "losses/sft": 1.9636963605880737, + "losses/total": 0.9080683588981628, + "ref_logps/chosen": -43.35801696777344, + "ref_logps/rejected": -42.41692352294922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0876442193984985, + "rewards/margins": 0.3606003522872925, + "rewards/rejected": -1.448244571685791, + "step": 766 + }, + { + "epoch": 0.72, + "grad_norm": 17.78038215637207, + "learning_rate": 4.2147604057362714e-07, + "logps/chosen": -49.073158264160156, + "logps/rejected": -52.753692626953125, + "loss": 0.641, + "losses/dpo": 0.5669728517532349, + "losses/sft": 2.2225940227508545, + "losses/total": 0.5669728517532349, + "ref_logps/chosen": -38.89840316772461, + "ref_logps/rejected": -40.61378479003906, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0174754858016968, + "rewards/margins": 0.19651541113853455, + "rewards/rejected": -1.2139909267425537, + "step": 767 + }, + { + "epoch": 0.73, + "grad_norm": 20.39261245727539, + "learning_rate": 4.2130115424973766e-07, + "logps/chosen": -39.9666633605957, + "logps/rejected": -42.861839294433594, + "loss": 0.7395, + "losses/dpo": 0.5248615145683289, + "losses/sft": 1.6448071002960205, + "losses/total": 0.5248615145683289, + "ref_logps/chosen": -32.12987518310547, + "ref_logps/rejected": -33.25669479370117, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7836792469024658, + "rewards/margins": 0.17683541774749756, + "rewards/rejected": -0.9605146646499634, + "step": 768 + }, + { + "epoch": 0.73, + "grad_norm": 17.96187400817871, + "learning_rate": 4.2112626792584817e-07, + "logps/chosen": -47.99996566772461, + "logps/rejected": -54.542686462402344, + "loss": 0.5555, + "losses/dpo": 0.438449889421463, + "losses/sft": 1.5923808813095093, + "losses/total": 0.438449889421463, + "ref_logps/chosen": -40.29195022583008, + "ref_logps/rejected": -42.82405090332031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7708016633987427, + "rewards/margins": 0.4010623097419739, + "rewards/rejected": -1.1718639135360718, + "step": 769 + }, + { + "epoch": 0.73, + "grad_norm": 25.057157516479492, + "learning_rate": 4.209513816019587e-07, + "logps/chosen": -61.90786361694336, + "logps/rejected": -66.06634521484375, + "loss": 0.6793, + "losses/dpo": 0.339076966047287, + "losses/sft": 2.2863316535949707, + "losses/total": 0.339076966047287, + "ref_logps/chosen": -49.08837127685547, + "ref_logps/rejected": -48.471160888671875, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2819492816925049, + "rewards/margins": 0.4775692820549011, + "rewards/rejected": -1.7595185041427612, + "step": 770 + }, + { + "epoch": 0.73, + "grad_norm": 20.30738067626953, + "learning_rate": 4.2077649527806925e-07, + "logps/chosen": -43.031578063964844, + "logps/rejected": -49.231285095214844, + "loss": 0.6316, + "losses/dpo": 0.485808789730072, + "losses/sft": 1.4857324361801147, + "losses/total": 0.485808789730072, + "ref_logps/chosen": -33.978431701660156, + "ref_logps/rejected": -36.98942565917969, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9053147435188293, + "rewards/margins": 0.31887149810791016, + "rewards/rejected": -1.2241861820220947, + "step": 771 + }, + { + "epoch": 0.73, + "grad_norm": 18.805618286132812, + "learning_rate": 4.206016089541798e-07, + "logps/chosen": -50.753421783447266, + "logps/rejected": -58.30438995361328, + "loss": 0.4742, + "losses/dpo": 0.5090314149856567, + "losses/sft": 1.7893503904342651, + "losses/total": 0.5090314149856567, + "ref_logps/chosen": -41.46656799316406, + "ref_logps/rejected": -40.94102478027344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9286853671073914, + "rewards/margins": 0.8076509237289429, + "rewards/rejected": -1.736336350440979, + "step": 772 + }, + { + "epoch": 0.73, + "grad_norm": 17.45510482788086, + "learning_rate": 4.2042672263029027e-07, + "logps/chosen": -43.150169372558594, + "logps/rejected": -58.464111328125, + "loss": 0.5258, + "losses/dpo": 0.4015842080116272, + "losses/sft": 1.6756699085235596, + "losses/total": 0.4015842080116272, + "ref_logps/chosen": -35.26593780517578, + "ref_logps/rejected": -43.1739387512207, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7884229421615601, + "rewards/margins": 0.7405939102172852, + "rewards/rejected": -1.5290168523788452, + "step": 773 + }, + { + "epoch": 0.73, + "grad_norm": 22.22146987915039, + "learning_rate": 4.2025183630640084e-07, + "logps/chosen": -50.417083740234375, + "logps/rejected": -69.91157531738281, + "loss": 0.6617, + "losses/dpo": 0.40283524990081787, + "losses/sft": 1.894343614578247, + "losses/total": 0.40283524990081787, + "ref_logps/chosen": -39.54027557373047, + "ref_logps/rejected": -55.56966781616211, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0876809358596802, + "rewards/margins": 0.3465101718902588, + "rewards/rejected": -1.4341912269592285, + "step": 774 + }, + { + "epoch": 0.73, + "grad_norm": 22.246706008911133, + "learning_rate": 4.2007694998251135e-07, + "logps/chosen": -53.201717376708984, + "logps/rejected": -61.34033966064453, + "loss": 0.5247, + "losses/dpo": 0.20747162401676178, + "losses/sft": 1.3337289094924927, + "losses/total": 0.20747162401676178, + "ref_logps/chosen": -42.53508377075195, + "ref_logps/rejected": -44.6602783203125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0666635036468506, + "rewards/margins": 0.6013425588607788, + "rewards/rejected": -1.668006181716919, + "step": 775 + }, + { + "epoch": 0.73, + "grad_norm": 24.267658233642578, + "learning_rate": 4.1990206365862186e-07, + "logps/chosen": -58.03358459472656, + "logps/rejected": -55.07177734375, + "loss": 0.6796, + "losses/dpo": 0.9036049842834473, + "losses/sft": 2.1022844314575195, + "losses/total": 0.9036049842834473, + "ref_logps/chosen": -47.973228454589844, + "ref_logps/rejected": -42.999908447265625, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0060359239578247, + "rewards/margins": 0.20115122199058533, + "rewards/rejected": -1.2071871757507324, + "step": 776 + }, + { + "epoch": 0.73, + "grad_norm": 21.436649322509766, + "learning_rate": 4.197271773347324e-07, + "logps/chosen": -53.396942138671875, + "logps/rejected": -56.93098449707031, + "loss": 0.5242, + "losses/dpo": 0.43892043828964233, + "losses/sft": 1.739390254020691, + "losses/total": 0.43892043828964233, + "ref_logps/chosen": -45.15161895751953, + "ref_logps/rejected": -42.66443634033203, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8245325088500977, + "rewards/margins": 0.6021220684051514, + "rewards/rejected": -1.426654577255249, + "step": 777 + }, + { + "epoch": 0.73, + "grad_norm": 19.981088638305664, + "learning_rate": 4.1955229101084294e-07, + "logps/chosen": -48.69720458984375, + "logps/rejected": -52.137699127197266, + "loss": 0.5863, + "losses/dpo": 0.7162208557128906, + "losses/sft": 1.709145426750183, + "losses/total": 0.7162208557128906, + "ref_logps/chosen": -36.80187225341797, + "ref_logps/rejected": -36.985755920410156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.18953275680542, + "rewards/margins": 0.3256617784500122, + "rewards/rejected": -1.5151946544647217, + "step": 778 + }, + { + "epoch": 0.74, + "grad_norm": 20.368045806884766, + "learning_rate": 4.193774046869535e-07, + "logps/chosen": -45.996665954589844, + "logps/rejected": -59.24977493286133, + "loss": 0.525, + "losses/dpo": 0.7196766138076782, + "losses/sft": 2.0520544052124023, + "losses/total": 0.7196766138076782, + "ref_logps/chosen": -36.62474060058594, + "ref_logps/rejected": -44.03826141357422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9371929168701172, + "rewards/margins": 0.5839586853981018, + "rewards/rejected": -1.5211515426635742, + "step": 779 + }, + { + "epoch": 0.74, + "grad_norm": 17.651996612548828, + "learning_rate": 4.1920251836306397e-07, + "logps/chosen": -37.72838592529297, + "logps/rejected": -61.68711853027344, + "loss": 0.4566, + "losses/dpo": 0.3346171975135803, + "losses/sft": 1.868023157119751, + "losses/total": 0.3346171975135803, + "ref_logps/chosen": -30.137733459472656, + "ref_logps/rejected": -46.17749786376953, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7590653896331787, + "rewards/margins": 0.7918962240219116, + "rewards/rejected": -1.5509614944458008, + "step": 780 + }, + { + "epoch": 0.74, + "grad_norm": 25.09738540649414, + "learning_rate": 4.1902763203917453e-07, + "logps/chosen": -57.371253967285156, + "logps/rejected": -47.14936828613281, + "loss": 0.8164, + "losses/dpo": 0.9405972957611084, + "losses/sft": 1.6142300367355347, + "losses/total": 0.9405972957611084, + "ref_logps/chosen": -46.65610885620117, + "ref_logps/rejected": -37.34865188598633, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.0715144872665405, + "rewards/margins": -0.0914430171251297, + "rewards/rejected": -0.980071485042572, + "step": 781 + }, + { + "epoch": 0.74, + "grad_norm": 17.679214477539062, + "learning_rate": 4.1885274571528505e-07, + "logps/chosen": -33.392459869384766, + "logps/rejected": -55.045135498046875, + "loss": 0.6448, + "losses/dpo": 0.8382935523986816, + "losses/sft": 1.943509578704834, + "losses/total": 0.8382935523986816, + "ref_logps/chosen": -24.722564697265625, + "ref_logps/rejected": -42.780574798583984, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8669896125793457, + "rewards/margins": 0.3594665229320526, + "rewards/rejected": -1.2264561653137207, + "step": 782 + }, + { + "epoch": 0.74, + "grad_norm": 17.070148468017578, + "learning_rate": 4.1867785939139556e-07, + "logps/chosen": -47.054222106933594, + "logps/rejected": -59.38862991333008, + "loss": 0.4827, + "losses/dpo": 0.4504500925540924, + "losses/sft": 1.7148276567459106, + "losses/total": 0.4504500925540924, + "ref_logps/chosen": -40.27749252319336, + "ref_logps/rejected": -46.283416748046875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6776725053787231, + "rewards/margins": 0.6328486204147339, + "rewards/rejected": -1.310521125793457, + "step": 783 + }, + { + "epoch": 0.74, + "grad_norm": 23.34598731994629, + "learning_rate": 4.1850297306750607e-07, + "logps/chosen": -51.39959716796875, + "logps/rejected": -54.16435241699219, + "loss": 0.6386, + "losses/dpo": 0.7482547760009766, + "losses/sft": 1.876220703125, + "losses/total": 0.7482547760009766, + "ref_logps/chosen": -39.918174743652344, + "ref_logps/rejected": -39.992759704589844, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1481423377990723, + "rewards/margins": 0.2690170705318451, + "rewards/rejected": -1.4171594381332397, + "step": 784 + }, + { + "epoch": 0.74, + "grad_norm": 27.0095272064209, + "learning_rate": 4.1832808674361664e-07, + "logps/chosen": -59.550010681152344, + "logps/rejected": -67.86109924316406, + "loss": 0.7146, + "losses/dpo": 0.5503778457641602, + "losses/sft": 1.526914119720459, + "losses/total": 0.5503778457641602, + "ref_logps/chosen": -47.91358184814453, + "ref_logps/rejected": -52.84379577636719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1636431217193604, + "rewards/margins": 0.33808720111846924, + "rewards/rejected": -1.5017304420471191, + "step": 785 + }, + { + "epoch": 0.74, + "grad_norm": 23.250091552734375, + "learning_rate": 4.181532004197272e-07, + "logps/chosen": -61.68149185180664, + "logps/rejected": -84.31434631347656, + "loss": 0.6161, + "losses/dpo": 0.48296403884887695, + "losses/sft": 2.0142619609832764, + "losses/total": 0.48296403884887695, + "ref_logps/chosen": -49.42213821411133, + "ref_logps/rejected": -66.10447692871094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2259352207183838, + "rewards/margins": 0.5950525403022766, + "rewards/rejected": -1.8209878206253052, + "step": 786 + }, + { + "epoch": 0.74, + "grad_norm": 20.944580078125, + "learning_rate": 4.1797831409583766e-07, + "logps/chosen": -47.476417541503906, + "logps/rejected": -53.08404541015625, + "loss": 0.5704, + "losses/dpo": 0.9001350402832031, + "losses/sft": 1.9318976402282715, + "losses/total": 0.9001350402832031, + "ref_logps/chosen": -39.116355895996094, + "ref_logps/rejected": -39.44659423828125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8360065221786499, + "rewards/margins": 0.5277382135391235, + "rewards/rejected": -1.3637447357177734, + "step": 787 + }, + { + "epoch": 0.74, + "grad_norm": 16.427867889404297, + "learning_rate": 4.1780342777194823e-07, + "logps/chosen": -49.147247314453125, + "logps/rejected": -68.76141357421875, + "loss": 0.4217, + "losses/dpo": 0.43051594495773315, + "losses/sft": 1.7895997762680054, + "losses/total": 0.43051594495773315, + "ref_logps/chosen": -40.01856231689453, + "ref_logps/rejected": -49.69984436035156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.912868082523346, + "rewards/margins": 0.9932891130447388, + "rewards/rejected": -1.9061572551727295, + "step": 788 + }, + { + "epoch": 0.75, + "grad_norm": 19.30293083190918, + "learning_rate": 4.1762854144805874e-07, + "logps/chosen": -38.080177307128906, + "logps/rejected": -54.69189453125, + "loss": 0.5678, + "losses/dpo": 0.9089207649230957, + "losses/sft": 2.139173746109009, + "losses/total": 0.9089207649230957, + "ref_logps/chosen": -30.37895965576172, + "ref_logps/rejected": -39.110130310058594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7701219320297241, + "rewards/margins": 0.7880542278289795, + "rewards/rejected": -1.5581762790679932, + "step": 789 + }, + { + "epoch": 0.75, + "grad_norm": 19.495140075683594, + "learning_rate": 4.1745365512416925e-07, + "logps/chosen": -46.094276428222656, + "logps/rejected": -57.03180694580078, + "loss": 0.579, + "losses/dpo": 0.9135038256645203, + "losses/sft": 2.1158804893493652, + "losses/total": 0.9135038256645203, + "ref_logps/chosen": -36.45563507080078, + "ref_logps/rejected": -41.92792510986328, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9638640284538269, + "rewards/margins": 0.5465238690376282, + "rewards/rejected": -1.510387897491455, + "step": 790 + }, + { + "epoch": 0.75, + "grad_norm": 21.350933074951172, + "learning_rate": 4.1727876880027977e-07, + "logps/chosen": -46.796260833740234, + "logps/rejected": -60.494384765625, + "loss": 0.6413, + "losses/dpo": 0.6952491998672485, + "losses/sft": 2.5563645362854004, + "losses/total": 0.6952491998672485, + "ref_logps/chosen": -33.72479248046875, + "ref_logps/rejected": -45.16609573364258, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.307146430015564, + "rewards/margins": 0.22568242251873016, + "rewards/rejected": -1.5328289270401, + "step": 791 + }, + { + "epoch": 0.75, + "grad_norm": 20.2003173828125, + "learning_rate": 4.1710388247639033e-07, + "logps/chosen": -40.17908477783203, + "logps/rejected": -55.57908630371094, + "loss": 0.5794, + "losses/dpo": 0.5370991230010986, + "losses/sft": 1.3463118076324463, + "losses/total": 0.5370991230010986, + "ref_logps/chosen": -31.964265823364258, + "ref_logps/rejected": -42.87556838989258, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.821482241153717, + "rewards/margins": 0.4488694667816162, + "rewards/rejected": -1.2703516483306885, + "step": 792 + }, + { + "epoch": 0.75, + "grad_norm": 27.169261932373047, + "learning_rate": 4.169289961525009e-07, + "logps/chosen": -55.7948112487793, + "logps/rejected": -55.48624038696289, + "loss": 0.7654, + "losses/dpo": 0.9784973859786987, + "losses/sft": 2.267885208129883, + "losses/total": 0.9784973859786987, + "ref_logps/chosen": -42.779937744140625, + "ref_logps/rejected": -40.891395568847656, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.301487684249878, + "rewards/margins": 0.1579965054988861, + "rewards/rejected": -1.4594842195510864, + "step": 793 + }, + { + "epoch": 0.75, + "grad_norm": 15.89783763885498, + "learning_rate": 4.1675410982861136e-07, + "logps/chosen": -37.41274642944336, + "logps/rejected": -58.109989166259766, + "loss": 0.488, + "losses/dpo": 0.4054686427116394, + "losses/sft": 1.6878503561019897, + "losses/total": 0.4054686427116394, + "ref_logps/chosen": -28.857276916503906, + "ref_logps/rejected": -44.28254699707031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8555470108985901, + "rewards/margins": 0.5271970629692078, + "rewards/rejected": -1.3827440738677979, + "step": 794 + }, + { + "epoch": 0.75, + "grad_norm": 20.05536651611328, + "learning_rate": 4.165792235047219e-07, + "logps/chosen": -38.05184555053711, + "logps/rejected": -51.57506561279297, + "loss": 0.5269, + "losses/dpo": 0.6790121793746948, + "losses/sft": 1.5501598119735718, + "losses/total": 0.6790121793746948, + "ref_logps/chosen": -27.786029815673828, + "ref_logps/rejected": -36.00325012207031, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0265814065933228, + "rewards/margins": 0.5306002497673035, + "rewards/rejected": -1.557181715965271, + "step": 795 + }, + { + "epoch": 0.75, + "grad_norm": 22.130945205688477, + "learning_rate": 4.1640433718083243e-07, + "logps/chosen": -54.87089538574219, + "logps/rejected": -56.082725524902344, + "loss": 0.6803, + "losses/dpo": 0.7415988445281982, + "losses/sft": 1.8232758045196533, + "losses/total": 0.7415988445281982, + "ref_logps/chosen": -42.646095275878906, + "ref_logps/rejected": -41.11537170410156, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2224805355072021, + "rewards/margins": 0.27425479888916016, + "rewards/rejected": -1.4967353343963623, + "step": 796 + }, + { + "epoch": 0.75, + "grad_norm": 22.653902053833008, + "learning_rate": 4.1622945085694295e-07, + "logps/chosen": -44.57743835449219, + "logps/rejected": -57.335716247558594, + "loss": 0.731, + "losses/dpo": 0.5241410732269287, + "losses/sft": 1.8190603256225586, + "losses/total": 0.5241410732269287, + "ref_logps/chosen": -33.62150192260742, + "ref_logps/rejected": -45.14116668701172, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0955939292907715, + "rewards/margins": 0.12386080622673035, + "rewards/rejected": -1.2194546461105347, + "step": 797 + }, + { + "epoch": 0.75, + "grad_norm": 17.54418182373047, + "learning_rate": 4.160545645330535e-07, + "logps/chosen": -45.42450714111328, + "logps/rejected": -48.62165832519531, + "loss": 0.5393, + "losses/dpo": 0.43921250104904175, + "losses/sft": 1.1786720752716064, + "losses/total": 0.43921250104904175, + "ref_logps/chosen": -37.135379791259766, + "ref_logps/rejected": -35.45717239379883, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8289127349853516, + "rewards/margins": 0.4875360131263733, + "rewards/rejected": -1.31644868850708, + "step": 798 + }, + { + "epoch": 0.75, + "grad_norm": 17.97772979736328, + "learning_rate": 4.15879678209164e-07, + "logps/chosen": -44.88686752319336, + "logps/rejected": -54.74813461303711, + "loss": 0.5674, + "losses/dpo": 0.5229305624961853, + "losses/sft": 1.2484173774719238, + "losses/total": 0.5229305624961853, + "ref_logps/chosen": -35.25, + "ref_logps/rejected": -40.10423278808594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9636870622634888, + "rewards/margins": 0.5007030367851257, + "rewards/rejected": -1.4643900394439697, + "step": 799 + }, + { + "epoch": 0.76, + "grad_norm": 13.845191955566406, + "learning_rate": 4.157047918852746e-07, + "logps/chosen": -39.998382568359375, + "logps/rejected": -43.76007080078125, + "loss": 0.4603, + "losses/dpo": 0.4537975788116455, + "losses/sft": 1.456099510192871, + "losses/total": 0.4537975788116455, + "ref_logps/chosen": -33.46385955810547, + "ref_logps/rejected": -30.600872039794922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6534522771835327, + "rewards/margins": 0.6624678373336792, + "rewards/rejected": -1.315920114517212, + "step": 800 + }, + { + "epoch": 0.76, + "grad_norm": 24.161046981811523, + "learning_rate": 4.1552990556138505e-07, + "logps/chosen": -56.78883361816406, + "logps/rejected": -69.69316101074219, + "loss": 0.7032, + "losses/dpo": 0.7013365030288696, + "losses/sft": 1.7608017921447754, + "losses/total": 0.7013365030288696, + "ref_logps/chosen": -44.61452102661133, + "ref_logps/rejected": -54.76441192626953, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2174313068389893, + "rewards/margins": 0.2754439413547516, + "rewards/rejected": -1.4928752183914185, + "step": 801 + }, + { + "epoch": 0.76, + "grad_norm": 16.207292556762695, + "learning_rate": 4.153550192374956e-07, + "logps/chosen": -43.64310836791992, + "logps/rejected": -55.11307907104492, + "loss": 0.5325, + "losses/dpo": 0.5881010293960571, + "losses/sft": 1.5619310140609741, + "losses/total": 0.5881010293960571, + "ref_logps/chosen": -34.51484680175781, + "ref_logps/rejected": -40.832275390625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9128257036209106, + "rewards/margins": 0.5152546167373657, + "rewards/rejected": -1.4280803203582764, + "step": 802 + }, + { + "epoch": 0.76, + "grad_norm": 21.621896743774414, + "learning_rate": 4.1518013291360613e-07, + "logps/chosen": -49.106651306152344, + "logps/rejected": -57.2791633605957, + "loss": 0.6013, + "losses/dpo": 0.4520745575428009, + "losses/sft": 2.139284133911133, + "losses/total": 0.4520745575428009, + "ref_logps/chosen": -38.272666931152344, + "ref_logps/rejected": -43.19129180908203, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0833981037139893, + "rewards/margins": 0.3253892660140991, + "rewards/rejected": -1.408787488937378, + "step": 803 + }, + { + "epoch": 0.76, + "grad_norm": 19.893678665161133, + "learning_rate": 4.1500524658971664e-07, + "logps/chosen": -44.95887756347656, + "logps/rejected": -49.64577102661133, + "loss": 0.6321, + "losses/dpo": 0.5761253833770752, + "losses/sft": 1.3944107294082642, + "losses/total": 0.5761253833770752, + "ref_logps/chosen": -34.86906433105469, + "ref_logps/rejected": -36.0997314453125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.008981466293335, + "rewards/margins": 0.3456220030784607, + "rewards/rejected": -1.3546034097671509, + "step": 804 + }, + { + "epoch": 0.76, + "grad_norm": 20.27569007873535, + "learning_rate": 4.148303602658272e-07, + "logps/chosen": -47.399662017822266, + "logps/rejected": -50.15160369873047, + "loss": 0.5316, + "losses/dpo": 0.30595090985298157, + "losses/sft": 1.7690634727478027, + "losses/total": 0.30595090985298157, + "ref_logps/chosen": -36.06136703491211, + "ref_logps/rejected": -31.95119857788086, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1338295936584473, + "rewards/margins": 0.6862108707427979, + "rewards/rejected": -1.8200404644012451, + "step": 805 + }, + { + "epoch": 0.76, + "grad_norm": 17.346487045288086, + "learning_rate": 4.146554739419377e-07, + "logps/chosen": -46.592437744140625, + "logps/rejected": -59.75151443481445, + "loss": 0.5321, + "losses/dpo": 0.8430801630020142, + "losses/sft": 2.247488021850586, + "losses/total": 0.8430801630020142, + "ref_logps/chosen": -33.222312927246094, + "ref_logps/rejected": -38.73173904418945, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3370126485824585, + "rewards/margins": 0.7649650573730469, + "rewards/rejected": -2.1019773483276367, + "step": 806 + }, + { + "epoch": 0.76, + "grad_norm": 19.774389266967773, + "learning_rate": 4.144805876180483e-07, + "logps/chosen": -46.54816436767578, + "logps/rejected": -63.87855529785156, + "loss": 0.462, + "losses/dpo": 0.3038465976715088, + "losses/sft": 1.6905699968338013, + "losses/total": 0.3038465976715088, + "ref_logps/chosen": -36.38085174560547, + "ref_logps/rejected": -45.94116973876953, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0167306661605835, + "rewards/margins": 0.7770076394081116, + "rewards/rejected": -1.7937383651733398, + "step": 807 + }, + { + "epoch": 0.76, + "grad_norm": 16.16718292236328, + "learning_rate": 4.1430570129415875e-07, + "logps/chosen": -45.99098587036133, + "logps/rejected": -57.12336730957031, + "loss": 0.4629, + "losses/dpo": 0.32424142956733704, + "losses/sft": 1.6986743211746216, + "losses/total": 0.32424142956733704, + "ref_logps/chosen": -38.268798828125, + "ref_logps/rejected": -42.035465240478516, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7722187042236328, + "rewards/margins": 0.7365716695785522, + "rewards/rejected": -1.5087902545928955, + "step": 808 + }, + { + "epoch": 0.76, + "grad_norm": 13.154738426208496, + "learning_rate": 4.141308149702693e-07, + "logps/chosen": -46.79335403442383, + "logps/rejected": -64.21115112304688, + "loss": 0.3422, + "losses/dpo": 0.4130054712295532, + "losses/sft": 1.7548103332519531, + "losses/total": 0.4130054712295532, + "ref_logps/chosen": -39.620872497558594, + "ref_logps/rejected": -44.596710205078125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7172482013702393, + "rewards/margins": 1.2441960573196411, + "rewards/rejected": -1.96144437789917, + "step": 809 + }, + { + "epoch": 0.76, + "grad_norm": 16.363344192504883, + "learning_rate": 4.139559286463798e-07, + "logps/chosen": -56.21271514892578, + "logps/rejected": -84.21590423583984, + "loss": 0.3861, + "losses/dpo": 0.09063832461833954, + "losses/sft": 1.6268547773361206, + "losses/total": 0.09063832461833954, + "ref_logps/chosen": -45.975486755371094, + "ref_logps/rejected": -62.186920166015625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.023722767829895, + "rewards/margins": 1.1791757345199585, + "rewards/rejected": -2.2028985023498535, + "step": 810 + }, + { + "epoch": 0.77, + "grad_norm": 18.855722427368164, + "learning_rate": 4.1378104232249034e-07, + "logps/chosen": -46.534881591796875, + "logps/rejected": -64.19956970214844, + "loss": 0.5512, + "losses/dpo": 0.8763961791992188, + "losses/sft": 2.28117036819458, + "losses/total": 0.8763961791992188, + "ref_logps/chosen": -36.1524658203125, + "ref_logps/rejected": -48.884525299072266, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0382417440414429, + "rewards/margins": 0.4932631850242615, + "rewards/rejected": -1.5315048694610596, + "step": 811 + }, + { + "epoch": 0.77, + "grad_norm": 17.897693634033203, + "learning_rate": 4.136061559986009e-07, + "logps/chosen": -51.246246337890625, + "logps/rejected": -59.971458435058594, + "loss": 0.4539, + "losses/dpo": 0.4118199646472931, + "losses/sft": 1.7263908386230469, + "losses/total": 0.4118199646472931, + "ref_logps/chosen": -41.09989929199219, + "ref_logps/rejected": -41.491268157958984, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0146347284317017, + "rewards/margins": 0.8333843946456909, + "rewards/rejected": -1.8480191230773926, + "step": 812 + }, + { + "epoch": 0.77, + "grad_norm": 21.614953994750977, + "learning_rate": 4.134312696747114e-07, + "logps/chosen": -60.36675262451172, + "logps/rejected": -73.6515884399414, + "loss": 0.5616, + "losses/dpo": 0.37293195724487305, + "losses/sft": 1.845620036125183, + "losses/total": 0.37293195724487305, + "ref_logps/chosen": -48.35881042480469, + "ref_logps/rejected": -56.498138427734375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2007945775985718, + "rewards/margins": 0.5145505666732788, + "rewards/rejected": -1.7153451442718506, + "step": 813 + }, + { + "epoch": 0.77, + "grad_norm": 22.284120559692383, + "learning_rate": 4.13256383350822e-07, + "logps/chosen": -70.5172119140625, + "logps/rejected": -73.63374328613281, + "loss": 0.5859, + "losses/dpo": 0.40943634510040283, + "losses/sft": 2.5171494483947754, + "losses/total": 0.40943634510040283, + "ref_logps/chosen": -57.142147064208984, + "ref_logps/rejected": -54.95692443847656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3375064134597778, + "rewards/margins": 0.5301756858825684, + "rewards/rejected": -1.8676820993423462, + "step": 814 + }, + { + "epoch": 0.77, + "grad_norm": 18.59164047241211, + "learning_rate": 4.1308149702693244e-07, + "logps/chosen": -51.64142990112305, + "logps/rejected": -65.4079360961914, + "loss": 0.4503, + "losses/dpo": 0.3758418560028076, + "losses/sft": 1.860427975654602, + "losses/total": 0.3758418560028076, + "ref_logps/chosen": -40.368900299072266, + "ref_logps/rejected": -45.71522521972656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1272530555725098, + "rewards/margins": 0.8420186042785645, + "rewards/rejected": -1.9692716598510742, + "step": 815 + }, + { + "epoch": 0.77, + "grad_norm": 17.908594131469727, + "learning_rate": 4.12906610703043e-07, + "logps/chosen": -47.54798126220703, + "logps/rejected": -67.79107666015625, + "loss": 0.4823, + "losses/dpo": 0.24897406995296478, + "losses/sft": 1.7283592224121094, + "losses/total": 0.24897406995296478, + "ref_logps/chosen": -37.467628479003906, + "ref_logps/rejected": -49.21507263183594, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0080349445343018, + "rewards/margins": 0.8495657444000244, + "rewards/rejected": -1.8576006889343262, + "step": 816 + }, + { + "epoch": 0.77, + "grad_norm": 21.439563751220703, + "learning_rate": 4.1273172437915357e-07, + "logps/chosen": -48.15118408203125, + "logps/rejected": -61.09138870239258, + "loss": 0.5663, + "losses/dpo": 0.2909359931945801, + "losses/sft": 1.461403250694275, + "losses/total": 0.2909359931945801, + "ref_logps/chosen": -35.859962463378906, + "ref_logps/rejected": -42.76637268066406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2291221618652344, + "rewards/margins": 0.6033795475959778, + "rewards/rejected": -1.8325018882751465, + "step": 817 + }, + { + "epoch": 0.77, + "grad_norm": 15.60698413848877, + "learning_rate": 4.1255683805526403e-07, + "logps/chosen": -51.07274627685547, + "logps/rejected": -77.69608306884766, + "loss": 0.3989, + "losses/dpo": 0.5728371739387512, + "losses/sft": 1.7782914638519287, + "losses/total": 0.5728371739387512, + "ref_logps/chosen": -39.94403076171875, + "ref_logps/rejected": -56.00139617919922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1128716468811035, + "rewards/margins": 1.056597113609314, + "rewards/rejected": -2.169468641281128, + "step": 818 + }, + { + "epoch": 0.77, + "grad_norm": 16.548442840576172, + "learning_rate": 4.123819517313746e-07, + "logps/chosen": -40.735740661621094, + "logps/rejected": -45.109466552734375, + "loss": 0.5646, + "losses/dpo": 0.7372438907623291, + "losses/sft": 2.486968517303467, + "losses/total": 0.7372438907623291, + "ref_logps/chosen": -30.94430923461914, + "ref_logps/rejected": -31.13395118713379, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9791431427001953, + "rewards/margins": 0.4184086322784424, + "rewards/rejected": -1.3975516557693481, + "step": 819 + }, + { + "epoch": 0.77, + "grad_norm": 25.764150619506836, + "learning_rate": 4.122070654074851e-07, + "logps/chosen": -49.48291015625, + "logps/rejected": -61.048072814941406, + "loss": 0.5621, + "losses/dpo": 0.9668756127357483, + "losses/sft": 1.7554867267608643, + "losses/total": 0.9668756127357483, + "ref_logps/chosen": -38.410850524902344, + "ref_logps/rejected": -41.62816619873047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.107205867767334, + "rewards/margins": 0.834784746170044, + "rewards/rejected": -1.941990613937378, + "step": 820 + }, + { + "epoch": 0.78, + "grad_norm": 18.528274536132812, + "learning_rate": 4.120321790835957e-07, + "logps/chosen": -39.327728271484375, + "logps/rejected": -70.03535461425781, + "loss": 0.4533, + "losses/dpo": 0.3814285397529602, + "losses/sft": 1.7690520286560059, + "losses/total": 0.3814285397529602, + "ref_logps/chosen": -31.195537567138672, + "ref_logps/rejected": -53.608245849609375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8132188320159912, + "rewards/margins": 0.8294921517372131, + "rewards/rejected": -1.6427109241485596, + "step": 821 + }, + { + "epoch": 0.78, + "grad_norm": 21.447460174560547, + "learning_rate": 4.1185729275970613e-07, + "logps/chosen": -43.9781379699707, + "logps/rejected": -56.883888244628906, + "loss": 0.6067, + "losses/dpo": 1.2185615301132202, + "losses/sft": 1.8627616167068481, + "losses/total": 1.2185615301132202, + "ref_logps/chosen": -34.30780029296875, + "ref_logps/rejected": -41.61172866821289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9670339822769165, + "rewards/margins": 0.5601822137832642, + "rewards/rejected": -1.5272161960601807, + "step": 822 + }, + { + "epoch": 0.78, + "grad_norm": 23.782699584960938, + "learning_rate": 4.116824064358167e-07, + "logps/chosen": -55.485130310058594, + "logps/rejected": -69.16844177246094, + "loss": 0.6105, + "losses/dpo": 0.6353790163993835, + "losses/sft": 1.612541913986206, + "losses/total": 0.6353790163993835, + "ref_logps/chosen": -42.11946487426758, + "ref_logps/rejected": -52.072021484375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.33656644821167, + "rewards/margins": 0.37307578325271606, + "rewards/rejected": -1.7096424102783203, + "step": 823 + }, + { + "epoch": 0.78, + "grad_norm": 24.362964630126953, + "learning_rate": 4.1150752011192727e-07, + "logps/chosen": -49.45055389404297, + "logps/rejected": -52.51344299316406, + "loss": 0.5772, + "losses/dpo": 0.7817726135253906, + "losses/sft": 1.7117810249328613, + "losses/total": 0.7817726135253906, + "ref_logps/chosen": -39.750152587890625, + "ref_logps/rejected": -37.19168472290039, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9700397253036499, + "rewards/margins": 0.562136173248291, + "rewards/rejected": -1.532175898551941, + "step": 824 + }, + { + "epoch": 0.78, + "grad_norm": 17.661327362060547, + "learning_rate": 4.113326337880377e-07, + "logps/chosen": -39.17385482788086, + "logps/rejected": -55.61888122558594, + "loss": 0.5506, + "losses/dpo": 0.430070161819458, + "losses/sft": 1.939976453781128, + "losses/total": 0.430070161819458, + "ref_logps/chosen": -28.86272621154785, + "ref_logps/rejected": -41.25264358520508, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0311129093170166, + "rewards/margins": 0.4055110812187195, + "rewards/rejected": -1.4366239309310913, + "step": 825 + }, + { + "epoch": 0.78, + "grad_norm": 19.97083282470703, + "learning_rate": 4.111577474641483e-07, + "logps/chosen": -47.04802703857422, + "logps/rejected": -46.20124435424805, + "loss": 0.5741, + "losses/dpo": 0.508751630783081, + "losses/sft": 1.8096230030059814, + "losses/total": 0.508751630783081, + "ref_logps/chosen": -37.95310974121094, + "ref_logps/rejected": -33.11418151855469, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9094916582107544, + "rewards/margins": 0.39921486377716064, + "rewards/rejected": -1.308706521987915, + "step": 826 + }, + { + "epoch": 0.78, + "grad_norm": 24.062883377075195, + "learning_rate": 4.109828611402588e-07, + "logps/chosen": -56.68595886230469, + "logps/rejected": -70.44078826904297, + "loss": 0.5813, + "losses/dpo": 0.7843765020370483, + "losses/sft": 2.074343204498291, + "losses/total": 0.7843765020370483, + "ref_logps/chosen": -44.316707611083984, + "ref_logps/rejected": -50.27688980102539, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.236924648284912, + "rewards/margins": 0.779464840888977, + "rewards/rejected": -2.0163896083831787, + "step": 827 + }, + { + "epoch": 0.78, + "grad_norm": 18.22933578491211, + "learning_rate": 4.1080797481636937e-07, + "logps/chosen": -41.456085205078125, + "logps/rejected": -47.322505950927734, + "loss": 0.6461, + "losses/dpo": 0.6511818170547485, + "losses/sft": 1.433754801750183, + "losses/total": 0.6511818170547485, + "ref_logps/chosen": -30.937042236328125, + "ref_logps/rejected": -34.29116439819336, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0519046783447266, + "rewards/margins": 0.25122973322868347, + "rewards/rejected": -1.3031344413757324, + "step": 828 + }, + { + "epoch": 0.78, + "grad_norm": 23.702861785888672, + "learning_rate": 4.1063308849247983e-07, + "logps/chosen": -48.07335662841797, + "logps/rejected": -52.76336669921875, + "loss": 0.6673, + "losses/dpo": 0.6375836133956909, + "losses/sft": 1.4201908111572266, + "losses/total": 0.6375836133956909, + "ref_logps/chosen": -37.506710052490234, + "ref_logps/rejected": -38.000221252441406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0566641092300415, + "rewards/margins": 0.41965004801750183, + "rewards/rejected": -1.4763140678405762, + "step": 829 + }, + { + "epoch": 0.78, + "grad_norm": 25.07790184020996, + "learning_rate": 4.104582021685904e-07, + "logps/chosen": -54.89522171020508, + "logps/rejected": -59.961612701416016, + "loss": 0.6327, + "losses/dpo": 0.7537351250648499, + "losses/sft": 1.9888134002685547, + "losses/total": 0.7537351250648499, + "ref_logps/chosen": -42.2194938659668, + "ref_logps/rejected": -43.52435302734375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2675732374191284, + "rewards/margins": 0.3761524558067322, + "rewards/rejected": -1.6437256336212158, + "step": 830 + }, + { + "epoch": 0.78, + "grad_norm": 24.729991912841797, + "learning_rate": 4.1028331584470096e-07, + "logps/chosen": -51.150794982910156, + "logps/rejected": -61.62116241455078, + "loss": 0.6719, + "losses/dpo": 0.5452122688293457, + "losses/sft": 1.8438490629196167, + "losses/total": 0.5452122688293457, + "ref_logps/chosen": -38.29194641113281, + "ref_logps/rejected": -45.78569030761719, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2858850955963135, + "rewards/margins": 0.2976619601249695, + "rewards/rejected": -1.5835471153259277, + "step": 831 + }, + { + "epoch": 0.79, + "grad_norm": 21.44819450378418, + "learning_rate": 4.101084295208114e-07, + "logps/chosen": -46.09823989868164, + "logps/rejected": -52.49946594238281, + "loss": 0.6399, + "losses/dpo": 0.5194214582443237, + "losses/sft": 1.5613081455230713, + "losses/total": 0.5194214582443237, + "ref_logps/chosen": -36.419857025146484, + "ref_logps/rejected": -38.36824035644531, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9678384065628052, + "rewards/margins": 0.4452841579914093, + "rewards/rejected": -1.4131226539611816, + "step": 832 + }, + { + "epoch": 0.79, + "grad_norm": 19.220415115356445, + "learning_rate": 4.09933543196922e-07, + "logps/chosen": -42.4419059753418, + "logps/rejected": -61.82816696166992, + "loss": 0.4571, + "losses/dpo": 0.4315023720264435, + "losses/sft": 1.821016788482666, + "losses/total": 0.4315023720264435, + "ref_logps/chosen": -34.52996826171875, + "ref_logps/rejected": -46.77499771118164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7911937236785889, + "rewards/margins": 0.7141229510307312, + "rewards/rejected": -1.5053167343139648, + "step": 833 + }, + { + "epoch": 0.79, + "grad_norm": 19.52722930908203, + "learning_rate": 4.097586568730325e-07, + "logps/chosen": -46.842567443847656, + "logps/rejected": -65.82450866699219, + "loss": 0.5215, + "losses/dpo": 0.9241123795509338, + "losses/sft": 1.4499462842941284, + "losses/total": 0.9241123795509338, + "ref_logps/chosen": -35.38399887084961, + "ref_logps/rejected": -46.04804229736328, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1458566188812256, + "rewards/margins": 0.8317896127700806, + "rewards/rejected": -1.9776462316513062, + "step": 834 + }, + { + "epoch": 0.79, + "grad_norm": 16.686166763305664, + "learning_rate": 4.0958377054914306e-07, + "logps/chosen": -48.39841842651367, + "logps/rejected": -58.906097412109375, + "loss": 0.482, + "losses/dpo": 0.3820689916610718, + "losses/sft": 2.4467155933380127, + "losses/total": 0.3820689916610718, + "ref_logps/chosen": -37.098514556884766, + "ref_logps/rejected": -40.42327117919922, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1299902200698853, + "rewards/margins": 0.7182927131652832, + "rewards/rejected": -1.8482829332351685, + "step": 835 + }, + { + "epoch": 0.79, + "grad_norm": 17.65949249267578, + "learning_rate": 4.094088842252535e-07, + "logps/chosen": -37.25263977050781, + "logps/rejected": -55.28199005126953, + "loss": 0.5436, + "losses/dpo": 0.45034536719322205, + "losses/sft": 1.5987011194229126, + "losses/total": 0.45034536719322205, + "ref_logps/chosen": -26.89162826538086, + "ref_logps/rejected": -39.481117248535156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0361013412475586, + "rewards/margins": 0.5439859628677368, + "rewards/rejected": -1.5800873041152954, + "step": 836 + }, + { + "epoch": 0.79, + "grad_norm": 21.985748291015625, + "learning_rate": 4.092339979013641e-07, + "logps/chosen": -40.997798919677734, + "logps/rejected": -67.35408020019531, + "loss": 0.642, + "losses/dpo": 0.8488292098045349, + "losses/sft": 1.6512669324874878, + "losses/total": 0.8488292098045349, + "ref_logps/chosen": -28.734466552734375, + "ref_logps/rejected": -50.182098388671875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2263331413269043, + "rewards/margins": 0.49086517095565796, + "rewards/rejected": -1.717198371887207, + "step": 837 + }, + { + "epoch": 0.79, + "grad_norm": 21.788148880004883, + "learning_rate": 4.0905911157747465e-07, + "logps/chosen": -46.52219772338867, + "logps/rejected": -68.809326171875, + "loss": 0.5504, + "losses/dpo": 0.6097391843795776, + "losses/sft": 1.4508566856384277, + "losses/total": 0.6097391843795776, + "ref_logps/chosen": -37.70521545410156, + "ref_logps/rejected": -53.499549865722656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8816981315612793, + "rewards/margins": 0.6492795348167419, + "rewards/rejected": -1.5309778451919556, + "step": 838 + }, + { + "epoch": 0.79, + "grad_norm": 27.178285598754883, + "learning_rate": 4.088842252535851e-07, + "logps/chosen": -54.411441802978516, + "logps/rejected": -67.19982147216797, + "loss": 0.7348, + "losses/dpo": 1.534578561782837, + "losses/sft": 2.0376055240631104, + "losses/total": 1.534578561782837, + "ref_logps/chosen": -41.59713363647461, + "ref_logps/rejected": -49.88591003417969, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2814306020736694, + "rewards/margins": 0.44996070861816406, + "rewards/rejected": -1.7313913106918335, + "step": 839 + }, + { + "epoch": 0.79, + "grad_norm": 20.112735748291016, + "learning_rate": 4.087093389296957e-07, + "logps/chosen": -50.57804489135742, + "logps/rejected": -76.22239685058594, + "loss": 0.4434, + "losses/dpo": 0.42248982191085815, + "losses/sft": 1.702013373374939, + "losses/total": 0.42248982191085815, + "ref_logps/chosen": -40.00060272216797, + "ref_logps/rejected": -54.03583908081055, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0577441453933716, + "rewards/margins": 1.1609119176864624, + "rewards/rejected": -2.218656063079834, + "step": 840 + }, + { + "epoch": 0.79, + "grad_norm": 18.04446029663086, + "learning_rate": 4.085344526058062e-07, + "logps/chosen": -46.6158447265625, + "logps/rejected": -63.661006927490234, + "loss": 0.4268, + "losses/dpo": 0.6124235391616821, + "losses/sft": 1.5183452367782593, + "losses/total": 0.6124235391616821, + "ref_logps/chosen": -37.38063430786133, + "ref_logps/rejected": -45.835533142089844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9235209226608276, + "rewards/margins": 0.8590266704559326, + "rewards/rejected": -1.7825474739074707, + "step": 841 + }, + { + "epoch": 0.8, + "grad_norm": 21.226863861083984, + "learning_rate": 4.0835956628191676e-07, + "logps/chosen": -43.76727294921875, + "logps/rejected": -68.08877563476562, + "loss": 0.5675, + "losses/dpo": 0.5349567532539368, + "losses/sft": 1.2213505506515503, + "losses/total": 0.5349567532539368, + "ref_logps/chosen": -32.184818267822266, + "ref_logps/rejected": -50.73428726196289, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1582458019256592, + "rewards/margins": 0.5772035121917725, + "rewards/rejected": -1.7354493141174316, + "step": 842 + }, + { + "epoch": 0.8, + "grad_norm": 22.93755531311035, + "learning_rate": 4.0818467995802727e-07, + "logps/chosen": -46.00981903076172, + "logps/rejected": -56.951881408691406, + "loss": 0.6753, + "losses/dpo": 0.8745790123939514, + "losses/sft": 1.7632869482040405, + "losses/total": 0.8745790123939514, + "ref_logps/chosen": -32.30010986328125, + "ref_logps/rejected": -41.57134246826172, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.370970606803894, + "rewards/margins": 0.1670835018157959, + "rewards/rejected": -1.5380539894104004, + "step": 843 + }, + { + "epoch": 0.8, + "grad_norm": 25.24207305908203, + "learning_rate": 4.080097936341378e-07, + "logps/chosen": -42.13519287109375, + "logps/rejected": -58.031620025634766, + "loss": 0.6824, + "losses/dpo": 0.8411730527877808, + "losses/sft": 1.5144634246826172, + "losses/total": 0.8411730527877808, + "ref_logps/chosen": -31.530597686767578, + "ref_logps/rejected": -43.723594665527344, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0604591369628906, + "rewards/margins": 0.3703431487083435, + "rewards/rejected": -1.430802345275879, + "step": 844 + }, + { + "epoch": 0.8, + "grad_norm": 19.652835845947266, + "learning_rate": 4.0783490731024835e-07, + "logps/chosen": -40.53631591796875, + "logps/rejected": -54.869380950927734, + "loss": 0.5725, + "losses/dpo": 0.7487130165100098, + "losses/sft": 1.791237473487854, + "losses/total": 0.7487130165100098, + "ref_logps/chosen": -32.66266632080078, + "ref_logps/rejected": -42.31922912597656, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7873647212982178, + "rewards/margins": 0.4676506519317627, + "rewards/rejected": -1.2550153732299805, + "step": 845 + }, + { + "epoch": 0.8, + "grad_norm": 22.695154190063477, + "learning_rate": 4.076600209863588e-07, + "logps/chosen": -48.605037689208984, + "logps/rejected": -67.17694091796875, + "loss": 0.5637, + "losses/dpo": 0.6091452836990356, + "losses/sft": 1.739264726638794, + "losses/total": 0.6091452836990356, + "ref_logps/chosen": -35.46781921386719, + "ref_logps/rejected": -48.201053619384766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3137223720550537, + "rewards/margins": 0.5838658213615417, + "rewards/rejected": -1.8975882530212402, + "step": 846 + }, + { + "epoch": 0.8, + "grad_norm": 27.827070236206055, + "learning_rate": 4.074851346624694e-07, + "logps/chosen": -59.363311767578125, + "logps/rejected": -66.76353454589844, + "loss": 0.6976, + "losses/dpo": 0.26183071732521057, + "losses/sft": 1.4789923429489136, + "losses/total": 0.26183071732521057, + "ref_logps/chosen": -45.20199966430664, + "ref_logps/rejected": -49.7233772277832, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4161311388015747, + "rewards/margins": 0.2878847122192383, + "rewards/rejected": -1.7040159702301025, + "step": 847 + }, + { + "epoch": 0.8, + "grad_norm": 16.140514373779297, + "learning_rate": 4.073102483385799e-07, + "logps/chosen": -42.16569519042969, + "logps/rejected": -58.39725112915039, + "loss": 0.4775, + "losses/dpo": 0.43545204401016235, + "losses/sft": 1.5353697538375854, + "losses/total": 0.43545204401016235, + "ref_logps/chosen": -32.512306213378906, + "ref_logps/rejected": -40.56866455078125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9653388261795044, + "rewards/margins": 0.8175197839736938, + "rewards/rejected": -1.7828586101531982, + "step": 848 + }, + { + "epoch": 0.8, + "grad_norm": 22.41170883178711, + "learning_rate": 4.0713536201469045e-07, + "logps/chosen": -47.756431579589844, + "logps/rejected": -60.93867111206055, + "loss": 0.6327, + "losses/dpo": 0.3982849419116974, + "losses/sft": 1.5695722103118896, + "losses/total": 0.3982849419116974, + "ref_logps/chosen": -37.33623504638672, + "ref_logps/rejected": -45.953495025634766, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0420196056365967, + "rewards/margins": 0.4564979076385498, + "rewards/rejected": -1.4985175132751465, + "step": 849 + }, + { + "epoch": 0.8, + "grad_norm": 22.37420082092285, + "learning_rate": 4.0696047569080096e-07, + "logps/chosen": -50.984737396240234, + "logps/rejected": -59.799072265625, + "loss": 0.547, + "losses/dpo": 0.2710026502609253, + "losses/sft": 1.1395957469940186, + "losses/total": 0.2710026502609253, + "ref_logps/chosen": -39.94063949584961, + "ref_logps/rejected": -43.56227111816406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1044096946716309, + "rewards/margins": 0.5192702412605286, + "rewards/rejected": -1.6236801147460938, + "step": 850 + }, + { + "epoch": 0.8, + "grad_norm": 22.754549026489258, + "learning_rate": 4.067855893669115e-07, + "logps/chosen": -51.86137008666992, + "logps/rejected": -49.23633575439453, + "loss": 0.6634, + "losses/dpo": 0.9052378535270691, + "losses/sft": 1.8138394355773926, + "losses/total": 0.9052378535270691, + "ref_logps/chosen": -41.40962600708008, + "ref_logps/rejected": -34.698333740234375, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0451745986938477, + "rewards/margins": 0.40862545371055603, + "rewards/rejected": -1.453800082206726, + "step": 851 + }, + { + "epoch": 0.8, + "grad_norm": 16.493871688842773, + "learning_rate": 4.0661070304302204e-07, + "logps/chosen": -38.2988395690918, + "logps/rejected": -64.69878387451172, + "loss": 0.4974, + "losses/dpo": 0.38129743933677673, + "losses/sft": 1.8565140962600708, + "losses/total": 0.38129743933677673, + "ref_logps/chosen": -27.315887451171875, + "ref_logps/rejected": -44.236541748046875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.098294973373413, + "rewards/margins": 0.9479295015335083, + "rewards/rejected": -2.046224355697632, + "step": 852 + }, + { + "epoch": 0.81, + "grad_norm": 19.36992835998535, + "learning_rate": 4.064358167191325e-07, + "logps/chosen": -48.493099212646484, + "logps/rejected": -57.895172119140625, + "loss": 0.5137, + "losses/dpo": 0.3286309242248535, + "losses/sft": 1.859135389328003, + "losses/total": 0.3286309242248535, + "ref_logps/chosen": -36.25791931152344, + "ref_logps/rejected": -39.356407165527344, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.223517894744873, + "rewards/margins": 0.6303588151931763, + "rewards/rejected": -1.8538768291473389, + "step": 853 + }, + { + "epoch": 0.81, + "grad_norm": 19.615402221679688, + "learning_rate": 4.0626093039524307e-07, + "logps/chosen": -57.19934844970703, + "logps/rejected": -71.09638977050781, + "loss": 0.5387, + "losses/dpo": 0.7195569276809692, + "losses/sft": 2.2594923973083496, + "losses/total": 0.7195569276809692, + "ref_logps/chosen": -47.57176971435547, + "ref_logps/rejected": -55.40192413330078, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9627577066421509, + "rewards/margins": 0.6066886782646179, + "rewards/rejected": -1.5694464445114136, + "step": 854 + }, + { + "epoch": 0.81, + "grad_norm": 19.97063636779785, + "learning_rate": 4.060860440713536e-07, + "logps/chosen": -45.506553649902344, + "logps/rejected": -49.71862030029297, + "loss": 0.5772, + "losses/dpo": 0.4405631721019745, + "losses/sft": 2.0534377098083496, + "losses/total": 0.4405631721019745, + "ref_logps/chosen": -35.478851318359375, + "ref_logps/rejected": -35.66107940673828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0027706623077393, + "rewards/margins": 0.4029836058616638, + "rewards/rejected": -1.4057542085647583, + "step": 855 + }, + { + "epoch": 0.81, + "grad_norm": 22.616846084594727, + "learning_rate": 4.0591115774746415e-07, + "logps/chosen": -60.477508544921875, + "logps/rejected": -68.45823669433594, + "loss": 0.6113, + "losses/dpo": 0.5044362545013428, + "losses/sft": 1.688843011856079, + "losses/total": 0.5044362545013428, + "ref_logps/chosen": -47.33610916137695, + "ref_logps/rejected": -50.687744140625, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3141398429870605, + "rewards/margins": 0.4629090130329132, + "rewards/rejected": -1.7770488262176514, + "step": 856 + }, + { + "epoch": 0.81, + "grad_norm": 12.60323715209961, + "learning_rate": 4.0573627142357466e-07, + "logps/chosen": -38.97118377685547, + "logps/rejected": -58.85884094238281, + "loss": 0.3425, + "losses/dpo": 0.33693045377731323, + "losses/sft": 1.9000900983810425, + "losses/total": 0.33693045377731323, + "ref_logps/chosen": -31.284116744995117, + "ref_logps/rejected": -40.602928161621094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7687070965766907, + "rewards/margins": 1.056883692741394, + "rewards/rejected": -1.8255908489227295, + "step": 857 + }, + { + "epoch": 0.81, + "grad_norm": 16.767677307128906, + "learning_rate": 4.0556138509968517e-07, + "logps/chosen": -50.87198257446289, + "logps/rejected": -63.775840759277344, + "loss": 0.4464, + "losses/dpo": 0.17628905177116394, + "losses/sft": 1.7027733325958252, + "losses/total": 0.17628905177116394, + "ref_logps/chosen": -39.1417350769043, + "ref_logps/rejected": -40.746944427490234, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1730247735977173, + "rewards/margins": 1.1298646926879883, + "rewards/rejected": -2.302889347076416, + "step": 858 + }, + { + "epoch": 0.81, + "grad_norm": 22.397029876708984, + "learning_rate": 4.0538649877579574e-07, + "logps/chosen": -58.40730285644531, + "logps/rejected": -50.577537536621094, + "loss": 0.6917, + "losses/dpo": 0.7703583240509033, + "losses/sft": 1.8212858438491821, + "losses/total": 0.7703583240509033, + "ref_logps/chosen": -47.790496826171875, + "ref_logps/rejected": -37.80146789550781, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.061680555343628, + "rewards/margins": 0.21592707931995392, + "rewards/rejected": -1.2776075601577759, + "step": 859 + }, + { + "epoch": 0.81, + "grad_norm": 25.436233520507812, + "learning_rate": 4.052116124519062e-07, + "logps/chosen": -57.184967041015625, + "logps/rejected": -69.12425231933594, + "loss": 0.6217, + "losses/dpo": 0.4821227192878723, + "losses/sft": 1.9631327390670776, + "losses/total": 0.4821227192878723, + "ref_logps/chosen": -43.59064483642578, + "ref_logps/rejected": -48.96746063232422, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3594326972961426, + "rewards/margins": 0.6562467813491821, + "rewards/rejected": -2.015679359436035, + "step": 860 + }, + { + "epoch": 0.81, + "grad_norm": 18.395221710205078, + "learning_rate": 4.0503672612801676e-07, + "logps/chosen": -44.1633186340332, + "logps/rejected": -53.20210266113281, + "loss": 0.5284, + "losses/dpo": 0.6680523157119751, + "losses/sft": 1.4781405925750732, + "losses/total": 0.6680523157119751, + "ref_logps/chosen": -35.332000732421875, + "ref_logps/rejected": -38.637901306152344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8831318616867065, + "rewards/margins": 0.5732882022857666, + "rewards/rejected": -1.4564200639724731, + "step": 861 + }, + { + "epoch": 0.81, + "grad_norm": 23.400135040283203, + "learning_rate": 4.0486183980412733e-07, + "logps/chosen": -48.28411102294922, + "logps/rejected": -57.12320327758789, + "loss": 0.6643, + "losses/dpo": 0.505574643611908, + "losses/sft": 1.6830575466156006, + "losses/total": 0.505574643611908, + "ref_logps/chosen": -35.49404525756836, + "ref_logps/rejected": -40.492889404296875, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2790067195892334, + "rewards/margins": 0.38402464985847473, + "rewards/rejected": -1.6630313396453857, + "step": 862 + }, + { + "epoch": 0.81, + "grad_norm": 19.01360511779785, + "learning_rate": 4.0468695348023784e-07, + "logps/chosen": -38.048397064208984, + "logps/rejected": -53.74854278564453, + "loss": 0.5528, + "losses/dpo": 0.6405391693115234, + "losses/sft": 2.077653408050537, + "losses/total": 0.6405391693115234, + "ref_logps/chosen": -27.689138412475586, + "ref_logps/rejected": -38.26967239379883, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0359257459640503, + "rewards/margins": 0.5119612216949463, + "rewards/rejected": -1.5478869676589966, + "step": 863 + }, + { + "epoch": 0.82, + "grad_norm": 23.446998596191406, + "learning_rate": 4.0451206715634835e-07, + "logps/chosen": -40.65340805053711, + "logps/rejected": -60.793148040771484, + "loss": 0.5443, + "losses/dpo": 0.7148911356925964, + "losses/sft": 1.7339686155319214, + "losses/total": 0.7148911356925964, + "ref_logps/chosen": -31.940067291259766, + "ref_logps/rejected": -46.303550720214844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8713340759277344, + "rewards/margins": 0.5776263475418091, + "rewards/rejected": -1.4489604234695435, + "step": 864 + }, + { + "epoch": 0.82, + "grad_norm": 27.00196075439453, + "learning_rate": 4.0433718083245887e-07, + "logps/chosen": -53.50159454345703, + "logps/rejected": -57.526405334472656, + "loss": 0.7365, + "losses/dpo": 1.0049927234649658, + "losses/sft": 2.0289671421051025, + "losses/total": 1.0049927234649658, + "ref_logps/chosen": -38.87628936767578, + "ref_logps/rejected": -39.68628692626953, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4625301361083984, + "rewards/margins": 0.3214820623397827, + "rewards/rejected": -1.7840123176574707, + "step": 865 + }, + { + "epoch": 0.82, + "grad_norm": 25.118370056152344, + "learning_rate": 4.0416229450856943e-07, + "logps/chosen": -61.04706954956055, + "logps/rejected": -61.71116638183594, + "loss": 0.5993, + "losses/dpo": 0.7177576422691345, + "losses/sft": 1.7501208782196045, + "losses/total": 0.7177576422691345, + "ref_logps/chosen": -45.91304016113281, + "ref_logps/rejected": -41.8324089050293, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5134029388427734, + "rewards/margins": 0.4744730293750763, + "rewards/rejected": -1.9878759384155273, + "step": 866 + }, + { + "epoch": 0.82, + "grad_norm": 21.401714324951172, + "learning_rate": 4.039874081846799e-07, + "logps/chosen": -59.45185470581055, + "logps/rejected": -59.17026138305664, + "loss": 0.5522, + "losses/dpo": 0.5085623264312744, + "losses/sft": 2.0390872955322266, + "losses/total": 0.5085623264312744, + "ref_logps/chosen": -49.54937744140625, + "ref_logps/rejected": -44.580406188964844, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9902480840682983, + "rewards/margins": 0.46873754262924194, + "rewards/rejected": -1.458985686302185, + "step": 867 + }, + { + "epoch": 0.82, + "grad_norm": 21.441139221191406, + "learning_rate": 4.0381252186079046e-07, + "logps/chosen": -46.783607482910156, + "logps/rejected": -56.933860778808594, + "loss": 0.569, + "losses/dpo": 0.7660975456237793, + "losses/sft": 1.5038211345672607, + "losses/total": 0.7660975456237793, + "ref_logps/chosen": -36.38450622558594, + "ref_logps/rejected": -42.272499084472656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0399103164672852, + "rewards/margins": 0.4262256920337677, + "rewards/rejected": -1.4661359786987305, + "step": 868 + }, + { + "epoch": 0.82, + "grad_norm": 24.64531707763672, + "learning_rate": 4.03637635536901e-07, + "logps/chosen": -54.77189636230469, + "logps/rejected": -60.194175720214844, + "loss": 0.6953, + "losses/dpo": 0.5726479291915894, + "losses/sft": 1.5797961950302124, + "losses/total": 0.5726479291915894, + "ref_logps/chosen": -41.2158088684082, + "ref_logps/rejected": -44.439491271972656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3556084632873535, + "rewards/margins": 0.2198602259159088, + "rewards/rejected": -1.5754687786102295, + "step": 869 + }, + { + "epoch": 0.82, + "grad_norm": 15.96130657196045, + "learning_rate": 4.0346274921301154e-07, + "logps/chosen": -46.201385498046875, + "logps/rejected": -59.735992431640625, + "loss": 0.4572, + "losses/dpo": 0.42531144618988037, + "losses/sft": 1.75662362575531, + "losses/total": 0.42531144618988037, + "ref_logps/chosen": -34.47917175292969, + "ref_logps/rejected": -40.12859344482422, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1722216606140137, + "rewards/margins": 0.7885184288024902, + "rewards/rejected": -1.9607398509979248, + "step": 870 + }, + { + "epoch": 0.82, + "grad_norm": 16.508520126342773, + "learning_rate": 4.0328786288912205e-07, + "logps/chosen": -41.911895751953125, + "logps/rejected": -64.87440490722656, + "loss": 0.4431, + "losses/dpo": 0.5562084913253784, + "losses/sft": 1.542809247970581, + "losses/total": 0.5562084913253784, + "ref_logps/chosen": -32.55203628540039, + "ref_logps/rejected": -46.92086410522461, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.935985803604126, + "rewards/margins": 0.8593685626983643, + "rewards/rejected": -1.7953543663024902, + "step": 871 + }, + { + "epoch": 0.82, + "grad_norm": 17.955007553100586, + "learning_rate": 4.0311297656523256e-07, + "logps/chosen": -45.63519287109375, + "logps/rejected": -59.40199279785156, + "loss": 0.5459, + "losses/dpo": 0.5789581537246704, + "losses/sft": 1.654513955116272, + "losses/total": 0.5789581537246704, + "ref_logps/chosen": -33.384796142578125, + "ref_logps/rejected": -41.051612854003906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2250394821166992, + "rewards/margins": 0.6099984645843506, + "rewards/rejected": -1.8350379467010498, + "step": 872 + }, + { + "epoch": 0.82, + "grad_norm": 18.90740394592285, + "learning_rate": 4.029380902413431e-07, + "logps/chosen": -53.078369140625, + "logps/rejected": -66.50953674316406, + "loss": 0.4955, + "losses/dpo": 0.2812321186065674, + "losses/sft": 1.4978892803192139, + "losses/total": 0.2812321186065674, + "ref_logps/chosen": -41.2359619140625, + "ref_logps/rejected": -48.61559295654297, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1842408180236816, + "rewards/margins": 0.6051534414291382, + "rewards/rejected": -1.7893941402435303, + "step": 873 + }, + { + "epoch": 0.83, + "grad_norm": 18.567195892333984, + "learning_rate": 4.0276320391745364e-07, + "logps/chosen": -34.209442138671875, + "logps/rejected": -44.80946350097656, + "loss": 0.5305, + "losses/dpo": 0.8185225129127502, + "losses/sft": 2.060314178466797, + "losses/total": 0.8185225129127502, + "ref_logps/chosen": -24.25729751586914, + "ref_logps/rejected": -27.58606719970703, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9952144622802734, + "rewards/margins": 0.7271252870559692, + "rewards/rejected": -1.7223396301269531, + "step": 874 + }, + { + "epoch": 0.83, + "grad_norm": 20.771114349365234, + "learning_rate": 4.0258831759356415e-07, + "logps/chosen": -53.101951599121094, + "logps/rejected": -56.18994140625, + "loss": 0.5429, + "losses/dpo": 0.3754958510398865, + "losses/sft": 1.6476372480392456, + "losses/total": 0.3754958510398865, + "ref_logps/chosen": -40.65750503540039, + "ref_logps/rejected": -38.36329650878906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2444448471069336, + "rewards/margins": 0.5382197499275208, + "rewards/rejected": -1.7826645374298096, + "step": 875 + }, + { + "epoch": 0.83, + "grad_norm": 24.008037567138672, + "learning_rate": 4.024134312696747e-07, + "logps/chosen": -54.33305740356445, + "logps/rejected": -63.0389404296875, + "loss": 0.5769, + "losses/dpo": 0.32310086488723755, + "losses/sft": 1.8714921474456787, + "losses/total": 0.32310086488723755, + "ref_logps/chosen": -40.23691940307617, + "ref_logps/rejected": -43.30393981933594, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.409613847732544, + "rewards/margins": 0.5638861656188965, + "rewards/rejected": -1.9734998941421509, + "step": 876 + }, + { + "epoch": 0.83, + "grad_norm": 22.259435653686523, + "learning_rate": 4.0223854494578523e-07, + "logps/chosen": -38.70526885986328, + "logps/rejected": -46.02214431762695, + "loss": 0.6679, + "losses/dpo": 0.7911670804023743, + "losses/sft": 1.2371726036071777, + "losses/total": 0.7911670804023743, + "ref_logps/chosen": -29.42253875732422, + "ref_logps/rejected": -34.9599723815918, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9282734394073486, + "rewards/margins": 0.17794373631477356, + "rewards/rejected": -1.1062171459197998, + "step": 877 + }, + { + "epoch": 0.83, + "grad_norm": 17.41746711730957, + "learning_rate": 4.0206365862189574e-07, + "logps/chosen": -47.60137176513672, + "logps/rejected": -64.38847351074219, + "loss": 0.4599, + "losses/dpo": 0.4213147461414337, + "losses/sft": 1.8955029249191284, + "losses/total": 0.4213147461414337, + "ref_logps/chosen": -33.97480773925781, + "ref_logps/rejected": -41.838401794433594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3626561164855957, + "rewards/margins": 0.8923516273498535, + "rewards/rejected": -2.255007743835449, + "step": 878 + }, + { + "epoch": 0.83, + "grad_norm": 19.2557315826416, + "learning_rate": 4.0188877229800625e-07, + "logps/chosen": -47.31446838378906, + "logps/rejected": -62.8548698425293, + "loss": 0.5233, + "losses/dpo": 0.5200448036193848, + "losses/sft": 1.705970048904419, + "losses/total": 0.5200448036193848, + "ref_logps/chosen": -35.320770263671875, + "ref_logps/rejected": -43.62351989746094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1993695497512817, + "rewards/margins": 0.7237653136253357, + "rewards/rejected": -1.9231348037719727, + "step": 879 + }, + { + "epoch": 0.83, + "grad_norm": 21.179445266723633, + "learning_rate": 4.017138859741168e-07, + "logps/chosen": -45.69499969482422, + "logps/rejected": -57.72776794433594, + "loss": 0.5355, + "losses/dpo": 0.3703647255897522, + "losses/sft": 1.6293296813964844, + "losses/total": 0.3703647255897522, + "ref_logps/chosen": -34.82201385498047, + "ref_logps/rejected": -41.07172393798828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.08729887008667, + "rewards/margins": 0.5783048868179321, + "rewards/rejected": -1.6656038761138916, + "step": 880 + }, + { + "epoch": 0.83, + "grad_norm": 20.20770835876465, + "learning_rate": 4.015389996502274e-07, + "logps/chosen": -47.463958740234375, + "logps/rejected": -71.09518432617188, + "loss": 0.5154, + "losses/dpo": 0.6252133250236511, + "losses/sft": 1.6834959983825684, + "losses/total": 0.6252133250236511, + "ref_logps/chosen": -35.67316436767578, + "ref_logps/rejected": -52.39569854736328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1790798902511597, + "rewards/margins": 0.6908687949180603, + "rewards/rejected": -1.8699486255645752, + "step": 881 + }, + { + "epoch": 0.83, + "grad_norm": 22.328908920288086, + "learning_rate": 4.0136411332633785e-07, + "logps/chosen": -58.70216369628906, + "logps/rejected": -63.54753494262695, + "loss": 0.6014, + "losses/dpo": 0.9843833446502686, + "losses/sft": 2.578153610229492, + "losses/total": 0.9843833446502686, + "ref_logps/chosen": -43.267913818359375, + "ref_logps/rejected": -42.69639205932617, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.54342520236969, + "rewards/margins": 0.5416891574859619, + "rewards/rejected": -2.0851144790649414, + "step": 882 + }, + { + "epoch": 0.83, + "grad_norm": 22.3562068939209, + "learning_rate": 4.011892270024484e-07, + "logps/chosen": -59.6101188659668, + "logps/rejected": -64.79010009765625, + "loss": 0.6366, + "losses/dpo": 0.7219882607460022, + "losses/sft": 2.1527886390686035, + "losses/total": 0.7219882607460022, + "ref_logps/chosen": -44.10911560058594, + "ref_logps/rejected": -45.55890655517578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5501008033752441, + "rewards/margins": 0.37301838397979736, + "rewards/rejected": -1.923119306564331, + "step": 883 + }, + { + "epoch": 0.83, + "grad_norm": 19.44447135925293, + "learning_rate": 4.010143406785589e-07, + "logps/chosen": -39.67781066894531, + "logps/rejected": -46.88531494140625, + "loss": 0.6339, + "losses/dpo": 0.546170711517334, + "losses/sft": 1.4967249631881714, + "losses/total": 0.546170711517334, + "ref_logps/chosen": -30.153152465820312, + "ref_logps/rejected": -34.239295959472656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9524660110473633, + "rewards/margins": 0.31213611364364624, + "rewards/rejected": -1.2646021842956543, + "step": 884 + }, + { + "epoch": 0.84, + "grad_norm": 18.314401626586914, + "learning_rate": 4.0083945435466944e-07, + "logps/chosen": -51.6224365234375, + "logps/rejected": -61.485015869140625, + "loss": 0.5356, + "losses/dpo": 0.30389899015426636, + "losses/sft": 1.3712421655654907, + "losses/total": 0.30389899015426636, + "ref_logps/chosen": -38.87030792236328, + "ref_logps/rejected": -42.986480712890625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2752134799957275, + "rewards/margins": 0.5746399164199829, + "rewards/rejected": -1.849853277206421, + "step": 885 + }, + { + "epoch": 0.84, + "grad_norm": 17.25050926208496, + "learning_rate": 4.0066456803077995e-07, + "logps/chosen": -47.176368713378906, + "logps/rejected": -58.17384719848633, + "loss": 0.4748, + "losses/dpo": 0.417095422744751, + "losses/sft": 1.666614294052124, + "losses/total": 0.417095422744751, + "ref_logps/chosen": -34.78626251220703, + "ref_logps/rejected": -38.810752868652344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2390108108520508, + "rewards/margins": 0.6972986459732056, + "rewards/rejected": -1.936309576034546, + "step": 886 + }, + { + "epoch": 0.84, + "grad_norm": 21.474668502807617, + "learning_rate": 4.004896817068905e-07, + "logps/chosen": -57.298316955566406, + "logps/rejected": -70.33256530761719, + "loss": 0.4984, + "losses/dpo": 0.5374317169189453, + "losses/sft": 1.7164843082427979, + "losses/total": 0.5374317169189453, + "ref_logps/chosen": -43.94514083862305, + "ref_logps/rejected": -50.434391021728516, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.335317611694336, + "rewards/margins": 0.6544992327690125, + "rewards/rejected": -1.9898170232772827, + "step": 887 + }, + { + "epoch": 0.84, + "grad_norm": 25.858394622802734, + "learning_rate": 4.003147953830011e-07, + "logps/chosen": -63.030738830566406, + "logps/rejected": -69.27155303955078, + "loss": 0.7204, + "losses/dpo": 0.9102529287338257, + "losses/sft": 2.259722948074341, + "losses/total": 0.9102529287338257, + "ref_logps/chosen": -46.33272171020508, + "ref_logps/rejected": -51.422882080078125, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6698018312454224, + "rewards/margins": 0.11506487429141998, + "rewards/rejected": -1.7848666906356812, + "step": 888 + }, + { + "epoch": 0.84, + "grad_norm": 25.910511016845703, + "learning_rate": 4.0013990905911154e-07, + "logps/chosen": -63.817386627197266, + "logps/rejected": -73.01847839355469, + "loss": 0.6373, + "losses/dpo": 0.5446485280990601, + "losses/sft": 1.8927452564239502, + "losses/total": 0.5446485280990601, + "ref_logps/chosen": -46.975746154785156, + "ref_logps/rejected": -53.32268142700195, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6841641664505005, + "rewards/margins": 0.2854151129722595, + "rewards/rejected": -1.9695793390274048, + "step": 889 + }, + { + "epoch": 0.84, + "grad_norm": 23.929601669311523, + "learning_rate": 3.999650227352221e-07, + "logps/chosen": -49.52193069458008, + "logps/rejected": -48.13573455810547, + "loss": 0.7912, + "losses/dpo": 0.3822995722293854, + "losses/sft": 1.1467831134796143, + "losses/total": 0.3822995722293854, + "ref_logps/chosen": -37.495567321777344, + "ref_logps/rejected": -35.206275939941406, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2026360034942627, + "rewards/margins": 0.09031036496162415, + "rewards/rejected": -1.2929463386535645, + "step": 890 + }, + { + "epoch": 0.84, + "grad_norm": 17.41577911376953, + "learning_rate": 3.997901364113326e-07, + "logps/chosen": -41.08697509765625, + "logps/rejected": -48.04826354980469, + "loss": 0.5444, + "losses/dpo": 0.44084930419921875, + "losses/sft": 2.1039211750030518, + "losses/total": 0.44084930419921875, + "ref_logps/chosen": -30.391429901123047, + "ref_logps/rejected": -31.823579788208008, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0695546865463257, + "rewards/margins": 0.5529135465621948, + "rewards/rejected": -1.6224682331085205, + "step": 891 + }, + { + "epoch": 0.84, + "grad_norm": 18.696300506591797, + "learning_rate": 3.9961525008744313e-07, + "logps/chosen": -51.2611198425293, + "logps/rejected": -69.18788146972656, + "loss": 0.5201, + "losses/dpo": 0.7692731618881226, + "losses/sft": 1.5752068758010864, + "losses/total": 0.7692731618881226, + "ref_logps/chosen": -39.96553039550781, + "ref_logps/rejected": -50.13174057006836, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1295592784881592, + "rewards/margins": 0.7760545611381531, + "rewards/rejected": -1.905613899230957, + "step": 892 + }, + { + "epoch": 0.84, + "grad_norm": 19.385847091674805, + "learning_rate": 3.9944036376355364e-07, + "logps/chosen": -69.12168884277344, + "logps/rejected": -68.44438171386719, + "loss": 0.4876, + "losses/dpo": 0.4618051052093506, + "losses/sft": 1.9552251100540161, + "losses/total": 0.4618051052093506, + "ref_logps/chosen": -53.12922286987305, + "ref_logps/rejected": -45.817264556884766, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5992467403411865, + "rewards/margins": 0.6634646654129028, + "rewards/rejected": -2.262711524963379, + "step": 893 + }, + { + "epoch": 0.84, + "grad_norm": 16.251096725463867, + "learning_rate": 3.992654774396642e-07, + "logps/chosen": -52.292945861816406, + "logps/rejected": -77.89862823486328, + "loss": 0.3504, + "losses/dpo": 0.3794952630996704, + "losses/sft": 2.046430826187134, + "losses/total": 0.3794952630996704, + "ref_logps/chosen": -40.439971923828125, + "ref_logps/rejected": -53.88254165649414, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1852972507476807, + "rewards/margins": 1.216310977935791, + "rewards/rejected": -2.4016079902648926, + "step": 894 + }, + { + "epoch": 0.85, + "grad_norm": 21.71742820739746, + "learning_rate": 3.990905911157748e-07, + "logps/chosen": -40.72724151611328, + "logps/rejected": -52.5378303527832, + "loss": 0.7007, + "losses/dpo": 0.9905320405960083, + "losses/sft": 1.6795307397842407, + "losses/total": 0.9905320405960083, + "ref_logps/chosen": -28.036211013793945, + "ref_logps/rejected": -37.149574279785156, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2691028118133545, + "rewards/margins": 0.26972299814224243, + "rewards/rejected": -1.5388257503509521, + "step": 895 + }, + { + "epoch": 0.85, + "grad_norm": 18.174007415771484, + "learning_rate": 3.9891570479188523e-07, + "logps/chosen": -44.134765625, + "logps/rejected": -59.41065216064453, + "loss": 0.536, + "losses/dpo": 0.5511208772659302, + "losses/sft": 1.4953030347824097, + "losses/total": 0.5511208772659302, + "ref_logps/chosen": -30.967546463012695, + "ref_logps/rejected": -40.59535217285156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.31672203540802, + "rewards/margins": 0.5648081302642822, + "rewards/rejected": -1.8815300464630127, + "step": 896 + }, + { + "epoch": 0.85, + "grad_norm": 17.954891204833984, + "learning_rate": 3.987408184679958e-07, + "logps/chosen": -41.72795486450195, + "logps/rejected": -62.03364944458008, + "loss": 0.4385, + "losses/dpo": 0.34971052408218384, + "losses/sft": 1.5322344303131104, + "losses/total": 0.34971052408218384, + "ref_logps/chosen": -30.000877380371094, + "ref_logps/rejected": -41.904273986816406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1727076768875122, + "rewards/margins": 0.840229868888855, + "rewards/rejected": -2.012937545776367, + "step": 897 + }, + { + "epoch": 0.85, + "grad_norm": 22.424640655517578, + "learning_rate": 3.985659321441063e-07, + "logps/chosen": -58.33543395996094, + "logps/rejected": -56.343326568603516, + "loss": 0.674, + "losses/dpo": 0.6729226112365723, + "losses/sft": 1.7288624048233032, + "losses/total": 0.6729226112365723, + "ref_logps/chosen": -45.78932571411133, + "ref_logps/rejected": -40.792091369628906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2546104192733765, + "rewards/margins": 0.3005130887031555, + "rewards/rejected": -1.5551234483718872, + "step": 898 + }, + { + "epoch": 0.85, + "grad_norm": 23.461875915527344, + "learning_rate": 3.983910458202168e-07, + "logps/chosen": -57.271263122558594, + "logps/rejected": -62.131683349609375, + "loss": 0.77, + "losses/dpo": 1.2522003650665283, + "losses/sft": 1.9639540910720825, + "losses/total": 1.2522003650665283, + "ref_logps/chosen": -43.60448455810547, + "ref_logps/rejected": -45.90069580078125, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3666772842407227, + "rewards/margins": 0.2564208507537842, + "rewards/rejected": -1.623098373413086, + "step": 899 + }, + { + "epoch": 0.85, + "grad_norm": 15.951128959655762, + "learning_rate": 3.9821615949632734e-07, + "logps/chosen": -46.20152282714844, + "logps/rejected": -68.09762573242188, + "loss": 0.4218, + "losses/dpo": 0.5306084156036377, + "losses/sft": 1.9610953330993652, + "losses/total": 0.5306084156036377, + "ref_logps/chosen": -33.677608489990234, + "ref_logps/rejected": -47.117156982421875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2523912191390991, + "rewards/margins": 0.8456557393074036, + "rewards/rejected": -2.0980470180511475, + "step": 900 + }, + { + "epoch": 0.85, + "grad_norm": 25.189109802246094, + "learning_rate": 3.980412731724379e-07, + "logps/chosen": -56.62052917480469, + "logps/rejected": -50.940757751464844, + "loss": 0.6881, + "losses/dpo": 0.4470304250717163, + "losses/sft": 1.9205844402313232, + "losses/total": 0.4470304250717163, + "ref_logps/chosen": -43.021026611328125, + "ref_logps/rejected": -35.41661071777344, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3599497079849243, + "rewards/margins": 0.19246482849121094, + "rewards/rejected": -1.5524145364761353, + "step": 901 + }, + { + "epoch": 0.85, + "grad_norm": 20.101633071899414, + "learning_rate": 3.9786638684854847e-07, + "logps/chosen": -48.788063049316406, + "logps/rejected": -61.658782958984375, + "loss": 0.6498, + "losses/dpo": 0.6262995004653931, + "losses/sft": 1.9294172525405884, + "losses/total": 0.6262995004653931, + "ref_logps/chosen": -35.97642135620117, + "ref_logps/rejected": -46.43687057495117, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2811641693115234, + "rewards/margins": 0.24102668464183807, + "rewards/rejected": -1.522190809249878, + "step": 902 + }, + { + "epoch": 0.85, + "grad_norm": 20.468608856201172, + "learning_rate": 3.9769150052465893e-07, + "logps/chosen": -51.07587814331055, + "logps/rejected": -60.33686065673828, + "loss": 0.5267, + "losses/dpo": 0.39729684591293335, + "losses/sft": 1.6105037927627563, + "losses/total": 0.39729684591293335, + "ref_logps/chosen": -40.25775909423828, + "ref_logps/rejected": -42.918670654296875, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0818116664886475, + "rewards/margins": 0.6600068807601929, + "rewards/rejected": -1.7418184280395508, + "step": 903 + }, + { + "epoch": 0.85, + "grad_norm": 21.925613403320312, + "learning_rate": 3.975166142007695e-07, + "logps/chosen": -57.73406219482422, + "logps/rejected": -67.70793151855469, + "loss": 0.5483, + "losses/dpo": 0.4529944360256195, + "losses/sft": 1.730518102645874, + "losses/total": 0.4529944360256195, + "ref_logps/chosen": -46.38959884643555, + "ref_logps/rejected": -50.24241638183594, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.134446144104004, + "rewards/margins": 0.6121060848236084, + "rewards/rejected": -1.7465522289276123, + "step": 904 + }, + { + "epoch": 0.85, + "grad_norm": 27.211524963378906, + "learning_rate": 3.9734172787688e-07, + "logps/chosen": -55.615142822265625, + "logps/rejected": -52.123146057128906, + "loss": 0.793, + "losses/dpo": 0.9936805963516235, + "losses/sft": 1.4861667156219482, + "losses/total": 0.9936805963516235, + "ref_logps/chosen": -42.5081787109375, + "ref_logps/rejected": -39.06757354736328, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3106964826583862, + "rewards/margins": -0.005139090120792389, + "rewards/rejected": -1.305557370185852, + "step": 905 + }, + { + "epoch": 0.86, + "grad_norm": 25.954444885253906, + "learning_rate": 3.971668415529905e-07, + "logps/chosen": -49.63874053955078, + "logps/rejected": -64.95807647705078, + "loss": 0.6194, + "losses/dpo": 0.8540397882461548, + "losses/sft": 1.63751220703125, + "losses/total": 0.8540397882461548, + "ref_logps/chosen": -34.46154022216797, + "ref_logps/rejected": -45.160213470458984, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5177199840545654, + "rewards/margins": 0.46206629276275635, + "rewards/rejected": -1.9797862768173218, + "step": 906 + }, + { + "epoch": 0.86, + "grad_norm": 22.458322525024414, + "learning_rate": 3.969919552291011e-07, + "logps/chosen": -55.3083610534668, + "logps/rejected": -77.94244384765625, + "loss": 0.6459, + "losses/dpo": 0.540063202381134, + "losses/sft": 2.054966449737549, + "losses/total": 0.540063202381134, + "ref_logps/chosen": -38.51998519897461, + "ref_logps/rejected": -57.277915954589844, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.678837537765503, + "rewards/margins": 0.387615442276001, + "rewards/rejected": -2.066452980041504, + "step": 907 + }, + { + "epoch": 0.86, + "grad_norm": 24.14090919494629, + "learning_rate": 3.968170689052116e-07, + "logps/chosen": -58.927574157714844, + "logps/rejected": -65.96034240722656, + "loss": 0.6056, + "losses/dpo": 0.9082849621772766, + "losses/sft": 2.2859010696411133, + "losses/total": 0.9082849621772766, + "ref_logps/chosen": -46.313568115234375, + "ref_logps/rejected": -48.550933837890625, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2614003419876099, + "rewards/margins": 0.4795409142971039, + "rewards/rejected": -1.7409412860870361, + "step": 908 + }, + { + "epoch": 0.86, + "grad_norm": 14.95252799987793, + "learning_rate": 3.9664218258132216e-07, + "logps/chosen": -43.511505126953125, + "logps/rejected": -58.01011276245117, + "loss": 0.4388, + "losses/dpo": 0.43878018856048584, + "losses/sft": 1.9378046989440918, + "losses/total": 0.43878018856048584, + "ref_logps/chosen": -31.892057418823242, + "ref_logps/rejected": -38.38436508178711, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.16194486618042, + "rewards/margins": 0.8006298542022705, + "rewards/rejected": -1.9625747203826904, + "step": 909 + }, + { + "epoch": 0.86, + "grad_norm": 20.923730850219727, + "learning_rate": 3.964672962574326e-07, + "logps/chosen": -52.31150436401367, + "logps/rejected": -64.87698364257812, + "loss": 0.535, + "losses/dpo": 0.854328989982605, + "losses/sft": 2.2774758338928223, + "losses/total": 0.854328989982605, + "ref_logps/chosen": -43.05564880371094, + "ref_logps/rejected": -49.296844482421875, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9255850315093994, + "rewards/margins": 0.6324282288551331, + "rewards/rejected": -1.5580133199691772, + "step": 910 + }, + { + "epoch": 0.86, + "grad_norm": 20.61965560913086, + "learning_rate": 3.962924099335432e-07, + "logps/chosen": -48.34567642211914, + "logps/rejected": -71.5721664428711, + "loss": 0.5809, + "losses/dpo": 0.21209962666034698, + "losses/sft": 1.9302908182144165, + "losses/total": 0.21209962666034698, + "ref_logps/chosen": -33.894256591796875, + "ref_logps/rejected": -51.49092102050781, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4451422691345215, + "rewards/margins": 0.5629817247390747, + "rewards/rejected": -2.0081241130828857, + "step": 911 + }, + { + "epoch": 0.86, + "grad_norm": 14.768598556518555, + "learning_rate": 3.961175236096537e-07, + "logps/chosen": -52.110084533691406, + "logps/rejected": -69.14460754394531, + "loss": 0.2961, + "losses/dpo": 0.40846747159957886, + "losses/sft": 1.9573677778244019, + "losses/total": 0.40846747159957886, + "ref_logps/chosen": -41.841468811035156, + "ref_logps/rejected": -46.12614440917969, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0268621444702148, + "rewards/margins": 1.2749836444854736, + "rewards/rejected": -2.3018460273742676, + "step": 912 + }, + { + "epoch": 0.86, + "grad_norm": 20.606040954589844, + "learning_rate": 3.959426372857642e-07, + "logps/chosen": -56.27659225463867, + "logps/rejected": -56.47953796386719, + "loss": 0.6488, + "losses/dpo": 0.5586219429969788, + "losses/sft": 1.7982591390609741, + "losses/total": 0.5586219429969788, + "ref_logps/chosen": -44.78296661376953, + "ref_logps/rejected": -39.60145568847656, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.149362564086914, + "rewards/margins": 0.538446307182312, + "rewards/rejected": -1.6878087520599365, + "step": 913 + }, + { + "epoch": 0.86, + "grad_norm": 18.362518310546875, + "learning_rate": 3.957677509618748e-07, + "logps/chosen": -40.653472900390625, + "logps/rejected": -60.30322265625, + "loss": 0.4946, + "losses/dpo": 0.6199674606323242, + "losses/sft": 1.369446873664856, + "losses/total": 0.6199674606323242, + "ref_logps/chosen": -32.407623291015625, + "ref_logps/rejected": -46.03407669067383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8245849609375, + "rewards/margins": 0.6023292541503906, + "rewards/rejected": -1.4269142150878906, + "step": 914 + }, + { + "epoch": 0.86, + "grad_norm": 17.280731201171875, + "learning_rate": 3.955928646379853e-07, + "logps/chosen": -48.192405700683594, + "logps/rejected": -68.83238220214844, + "loss": 0.418, + "losses/dpo": 0.2858433723449707, + "losses/sft": 1.3345253467559814, + "losses/total": 0.2858433723449707, + "ref_logps/chosen": -39.491615295410156, + "ref_logps/rejected": -49.456016540527344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8700789213180542, + "rewards/margins": 1.0675580501556396, + "rewards/rejected": -1.9376369714736938, + "step": 915 + }, + { + "epoch": 0.86, + "grad_norm": 22.392044067382812, + "learning_rate": 3.9541797831409586e-07, + "logps/chosen": -45.73032760620117, + "logps/rejected": -56.69221878051758, + "loss": 0.6681, + "losses/dpo": 0.5759015083312988, + "losses/sft": 1.6123839616775513, + "losses/total": 0.5759015083312988, + "ref_logps/chosen": -32.025428771972656, + "ref_logps/rejected": -39.44956588745117, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3704901933670044, + "rewards/margins": 0.3537752032279968, + "rewards/rejected": -1.7242653369903564, + "step": 916 + }, + { + "epoch": 0.87, + "grad_norm": 29.403060913085938, + "learning_rate": 3.952430919902063e-07, + "logps/chosen": -69.3777847290039, + "logps/rejected": -77.7389144897461, + "loss": 0.7287, + "losses/dpo": 0.9813758134841919, + "losses/sft": 2.0115604400634766, + "losses/total": 0.9813758134841919, + "ref_logps/chosen": -53.83844757080078, + "ref_logps/rejected": -58.71717834472656, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5539339780807495, + "rewards/margins": 0.348239928483963, + "rewards/rejected": -1.9021738767623901, + "step": 917 + }, + { + "epoch": 0.87, + "grad_norm": 16.38727378845215, + "learning_rate": 3.950682056663169e-07, + "logps/chosen": -49.663917541503906, + "logps/rejected": -66.61436462402344, + "loss": 0.4548, + "losses/dpo": 0.32091787457466125, + "losses/sft": 1.9862086772918701, + "losses/total": 0.32091787457466125, + "ref_logps/chosen": -35.33073043823242, + "ref_logps/rejected": -44.718589782714844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4333181381225586, + "rewards/margins": 0.7562595009803772, + "rewards/rejected": -2.18957781791687, + "step": 918 + }, + { + "epoch": 0.87, + "grad_norm": 14.43567180633545, + "learning_rate": 3.948933193424274e-07, + "logps/chosen": -47.30598449707031, + "logps/rejected": -78.9987564086914, + "loss": 0.3346, + "losses/dpo": 0.41614753007888794, + "losses/sft": 1.6911240816116333, + "losses/total": 0.41614753007888794, + "ref_logps/chosen": -35.60873031616211, + "ref_logps/rejected": -55.5838508605957, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1697256565093994, + "rewards/margins": 1.1717644929885864, + "rewards/rejected": -2.3414902687072754, + "step": 919 + }, + { + "epoch": 0.87, + "grad_norm": 17.731252670288086, + "learning_rate": 3.947184330185379e-07, + "logps/chosen": -46.636131286621094, + "logps/rejected": -52.21043395996094, + "loss": 0.558, + "losses/dpo": 0.4975965917110443, + "losses/sft": 1.7086056470870972, + "losses/total": 0.4975965917110443, + "ref_logps/chosen": -34.643272399902344, + "ref_logps/rejected": -36.218162536621094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1992857456207275, + "rewards/margins": 0.3999415934085846, + "rewards/rejected": -1.5992274284362793, + "step": 920 + }, + { + "epoch": 0.87, + "grad_norm": 21.331438064575195, + "learning_rate": 3.945435466946485e-07, + "logps/chosen": -54.239837646484375, + "logps/rejected": -65.28260803222656, + "loss": 0.5102, + "losses/dpo": 0.35546794533729553, + "losses/sft": 1.9883983135223389, + "losses/total": 0.35546794533729553, + "ref_logps/chosen": -39.257781982421875, + "ref_logps/rejected": -44.58476257324219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4982060194015503, + "rewards/margins": 0.5715781450271606, + "rewards/rejected": -2.069784164428711, + "step": 921 + }, + { + "epoch": 0.87, + "grad_norm": 23.890710830688477, + "learning_rate": 3.94368660370759e-07, + "logps/chosen": -52.246673583984375, + "logps/rejected": -57.574310302734375, + "loss": 0.6025, + "losses/dpo": 0.4584146738052368, + "losses/sft": 1.7509024143218994, + "losses/total": 0.4584146738052368, + "ref_logps/chosen": -39.73377227783203, + "ref_logps/rejected": -40.20069885253906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2512900829315186, + "rewards/margins": 0.48607081174850464, + "rewards/rejected": -1.7373608350753784, + "step": 922 + }, + { + "epoch": 0.87, + "grad_norm": 24.46379852294922, + "learning_rate": 3.9419377404686955e-07, + "logps/chosen": -54.13951110839844, + "logps/rejected": -62.03705596923828, + "loss": 0.7075, + "losses/dpo": 0.628227949142456, + "losses/sft": 1.8977872133255005, + "losses/total": 0.628227949142456, + "ref_logps/chosen": -40.45734405517578, + "ref_logps/rejected": -42.91888427734375, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.368216633796692, + "rewards/margins": 0.5436007380485535, + "rewards/rejected": -1.9118174314498901, + "step": 923 + }, + { + "epoch": 0.87, + "grad_norm": 16.505361557006836, + "learning_rate": 3.9401888772298e-07, + "logps/chosen": -43.91877746582031, + "logps/rejected": -57.58205795288086, + "loss": 0.415, + "losses/dpo": 0.5588536858558655, + "losses/sft": 1.7603886127471924, + "losses/total": 0.5588536858558655, + "ref_logps/chosen": -33.67098617553711, + "ref_logps/rejected": -37.86896514892578, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0247793197631836, + "rewards/margins": 0.9465299844741821, + "rewards/rejected": -1.9713091850280762, + "step": 924 + }, + { + "epoch": 0.87, + "grad_norm": 19.35439109802246, + "learning_rate": 3.938440013990906e-07, + "logps/chosen": -46.69012451171875, + "logps/rejected": -71.21768951416016, + "loss": 0.4487, + "losses/dpo": 0.17130029201507568, + "losses/sft": 1.5461541414260864, + "losses/total": 0.17130029201507568, + "ref_logps/chosen": -36.7366943359375, + "ref_logps/rejected": -50.868370056152344, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9953426718711853, + "rewards/margins": 1.0395894050598145, + "rewards/rejected": -2.0349321365356445, + "step": 925 + }, + { + "epoch": 0.87, + "grad_norm": 19.393033981323242, + "learning_rate": 3.9366911507520114e-07, + "logps/chosen": -46.82405090332031, + "logps/rejected": -58.96955871582031, + "loss": 0.5752, + "losses/dpo": 0.4874016046524048, + "losses/sft": 2.263362407684326, + "losses/total": 0.4874016046524048, + "ref_logps/chosen": -33.57951354980469, + "ref_logps/rejected": -39.69049835205078, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3244540691375732, + "rewards/margins": 0.6034518480300903, + "rewards/rejected": -1.9279060363769531, + "step": 926 + }, + { + "epoch": 0.88, + "grad_norm": 19.086570739746094, + "learning_rate": 3.934942287513116e-07, + "logps/chosen": -55.59938049316406, + "logps/rejected": -62.348541259765625, + "loss": 0.4738, + "losses/dpo": 0.43744832277297974, + "losses/sft": 2.0748870372772217, + "losses/total": 0.43744832277297974, + "ref_logps/chosen": -42.695716857910156, + "ref_logps/rejected": -43.56956481933594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2903660535812378, + "rewards/margins": 0.5875317454338074, + "rewards/rejected": -1.8778977394104004, + "step": 927 + }, + { + "epoch": 0.88, + "grad_norm": 30.810121536254883, + "learning_rate": 3.9331934242742217e-07, + "logps/chosen": -55.43946075439453, + "logps/rejected": -57.36415100097656, + "loss": 0.9686, + "losses/dpo": 0.5155337452888489, + "losses/sft": 1.5941705703735352, + "losses/total": 0.5155337452888489, + "ref_logps/chosen": -38.125701904296875, + "ref_logps/rejected": -43.50593566894531, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.7313754558563232, + "rewards/margins": -0.34555378556251526, + "rewards/rejected": -1.3858217000961304, + "step": 928 + }, + { + "epoch": 0.88, + "grad_norm": 19.359027862548828, + "learning_rate": 3.931444561035327e-07, + "logps/chosen": -48.7660026550293, + "logps/rejected": -57.48979949951172, + "loss": 0.6006, + "losses/dpo": 0.5694717764854431, + "losses/sft": 1.886942982673645, + "losses/total": 0.5694717764854431, + "ref_logps/chosen": -37.55628967285156, + "ref_logps/rejected": -41.28240966796875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1209712028503418, + "rewards/margins": 0.49976757168769836, + "rewards/rejected": -1.6207387447357178, + "step": 929 + }, + { + "epoch": 0.88, + "grad_norm": 25.701581954956055, + "learning_rate": 3.9296956977964325e-07, + "logps/chosen": -53.83027648925781, + "logps/rejected": -59.25616455078125, + "loss": 0.7557, + "losses/dpo": 0.9149075746536255, + "losses/sft": 1.8033795356750488, + "losses/total": 0.9149075746536255, + "ref_logps/chosen": -40.902042388916016, + "ref_logps/rejected": -44.212833404541016, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2928234338760376, + "rewards/margins": 0.21150939166545868, + "rewards/rejected": -1.5043327808380127, + "step": 930 + }, + { + "epoch": 0.88, + "grad_norm": 23.654983520507812, + "learning_rate": 3.927946834557537e-07, + "logps/chosen": -52.84912109375, + "logps/rejected": -58.94500732421875, + "loss": 0.6085, + "losses/dpo": 0.6101242303848267, + "losses/sft": 1.5752911567687988, + "losses/total": 0.6101242303848267, + "ref_logps/chosen": -38.9780158996582, + "ref_logps/rejected": -41.359989166259766, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3871103525161743, + "rewards/margins": 0.3713914155960083, + "rewards/rejected": -1.7585017681121826, + "step": 931 + }, + { + "epoch": 0.88, + "grad_norm": 18.34036636352539, + "learning_rate": 3.9261979713186427e-07, + "logps/chosen": -41.510948181152344, + "logps/rejected": -50.07872772216797, + "loss": 0.5924, + "losses/dpo": 0.41716867685317993, + "losses/sft": 1.6984672546386719, + "losses/total": 0.41716867685317993, + "ref_logps/chosen": -29.266643524169922, + "ref_logps/rejected": -34.5325927734375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2244302034378052, + "rewards/margins": 0.33018285036087036, + "rewards/rejected": -1.5546129941940308, + "step": 932 + }, + { + "epoch": 0.88, + "grad_norm": 19.952167510986328, + "learning_rate": 3.9244491080797484e-07, + "logps/chosen": -47.16590881347656, + "logps/rejected": -59.26890563964844, + "loss": 0.5872, + "losses/dpo": 0.6848894357681274, + "losses/sft": 1.7096810340881348, + "losses/total": 0.6848894357681274, + "ref_logps/chosen": -39.62025833129883, + "ref_logps/rejected": -45.42413330078125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7545651793479919, + "rewards/margins": 0.6299120783805847, + "rewards/rejected": -1.384477138519287, + "step": 933 + }, + { + "epoch": 0.88, + "grad_norm": 17.6008243560791, + "learning_rate": 3.922700244840853e-07, + "logps/chosen": -55.84395980834961, + "logps/rejected": -68.52776336669922, + "loss": 0.4359, + "losses/dpo": 0.44452258944511414, + "losses/sft": 1.6741211414337158, + "losses/total": 0.44452258944511414, + "ref_logps/chosen": -43.75857162475586, + "ref_logps/rejected": -47.53761291503906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2085391283035278, + "rewards/margins": 0.8904761075973511, + "rewards/rejected": -2.099015235900879, + "step": 934 + }, + { + "epoch": 0.88, + "grad_norm": 24.569114685058594, + "learning_rate": 3.9209513816019586e-07, + "logps/chosen": -39.32587432861328, + "logps/rejected": -51.08375549316406, + "loss": 0.6675, + "losses/dpo": 0.6287034749984741, + "losses/sft": 0.9051434993743896, + "losses/total": 0.6287034749984741, + "ref_logps/chosen": -29.49465560913086, + "ref_logps/rejected": -38.91350555419922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9831218719482422, + "rewards/margins": 0.23390275239944458, + "rewards/rejected": -1.217024564743042, + "step": 935 + }, + { + "epoch": 0.88, + "grad_norm": 22.57014274597168, + "learning_rate": 3.919202518363064e-07, + "logps/chosen": -47.48615264892578, + "logps/rejected": -52.418453216552734, + "loss": 0.6616, + "losses/dpo": 0.6661105155944824, + "losses/sft": 1.788139820098877, + "losses/total": 0.6661105155944824, + "ref_logps/chosen": -38.539764404296875, + "ref_logps/rejected": -40.90968322753906, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8946388959884644, + "rewards/margins": 0.25623804330825806, + "rewards/rejected": -1.1508769989013672, + "step": 936 + }, + { + "epoch": 0.88, + "grad_norm": 31.024436950683594, + "learning_rate": 3.9174536551241694e-07, + "logps/chosen": -69.43479919433594, + "logps/rejected": -67.86477661132812, + "loss": 0.8196, + "losses/dpo": 0.806796133518219, + "losses/sft": 2.2336392402648926, + "losses/total": 0.806796133518219, + "ref_logps/chosen": -52.997554779052734, + "ref_logps/rejected": -51.07181167602539, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6437244415283203, + "rewards/margins": 0.03557199239730835, + "rewards/rejected": -1.6792964935302734, + "step": 937 + }, + { + "epoch": 0.89, + "grad_norm": 18.274883270263672, + "learning_rate": 3.915704791885274e-07, + "logps/chosen": -33.531494140625, + "logps/rejected": -50.1838493347168, + "loss": 0.5802, + "losses/dpo": 0.5538694262504578, + "losses/sft": 1.8189845085144043, + "losses/total": 0.5538694262504578, + "ref_logps/chosen": -22.51466178894043, + "ref_logps/rejected": -34.66609191894531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1016830205917358, + "rewards/margins": 0.45009273290634155, + "rewards/rejected": -1.5517758131027222, + "step": 938 + }, + { + "epoch": 0.89, + "grad_norm": 18.459444046020508, + "learning_rate": 3.9139559286463797e-07, + "logps/chosen": -45.984439849853516, + "logps/rejected": -63.44975280761719, + "loss": 0.4937, + "losses/dpo": 0.4282464385032654, + "losses/sft": 1.753808617591858, + "losses/total": 0.4282464385032654, + "ref_logps/chosen": -34.085784912109375, + "ref_logps/rejected": -45.0334358215332, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1898653507232666, + "rewards/margins": 0.651766300201416, + "rewards/rejected": -1.8416316509246826, + "step": 939 + }, + { + "epoch": 0.89, + "grad_norm": 22.217809677124023, + "learning_rate": 3.9122070654074853e-07, + "logps/chosen": -36.222145080566406, + "logps/rejected": -54.83385467529297, + "loss": 0.6612, + "losses/dpo": 0.46967020630836487, + "losses/sft": 1.7441169023513794, + "losses/total": 0.46967020630836487, + "ref_logps/chosen": -27.031352996826172, + "ref_logps/rejected": -42.950531005859375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9190797805786133, + "rewards/margins": 0.26925280690193176, + "rewards/rejected": -1.1883325576782227, + "step": 940 + }, + { + "epoch": 0.89, + "grad_norm": 13.500921249389648, + "learning_rate": 3.91045820216859e-07, + "logps/chosen": -44.33881759643555, + "logps/rejected": -52.58893585205078, + "loss": 0.4083, + "losses/dpo": 0.3898429274559021, + "losses/sft": 1.3914735317230225, + "losses/total": 0.3898429274559021, + "ref_logps/chosen": -35.23923110961914, + "ref_logps/rejected": -34.500614166259766, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9099588990211487, + "rewards/margins": 0.8988734483718872, + "rewards/rejected": -1.8088324069976807, + "step": 941 + }, + { + "epoch": 0.89, + "grad_norm": 19.755144119262695, + "learning_rate": 3.9087093389296956e-07, + "logps/chosen": -42.63121795654297, + "logps/rejected": -56.15619659423828, + "loss": 0.5452, + "losses/dpo": 0.38568371534347534, + "losses/sft": 1.2724207639694214, + "losses/total": 0.38568371534347534, + "ref_logps/chosen": -34.555423736572266, + "ref_logps/rejected": -41.963417053222656, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8075796365737915, + "rewards/margins": 0.6116986274719238, + "rewards/rejected": -1.4192781448364258, + "step": 942 + }, + { + "epoch": 0.89, + "grad_norm": 21.889968872070312, + "learning_rate": 3.9069604756908007e-07, + "logps/chosen": -49.28250503540039, + "logps/rejected": -55.694854736328125, + "loss": 0.6037, + "losses/dpo": 0.5128437280654907, + "losses/sft": 1.5409718751907349, + "losses/total": 0.5128437280654907, + "ref_logps/chosen": -38.92012023925781, + "ref_logps/rejected": -40.67806625366211, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.036238193511963, + "rewards/margins": 0.4654407501220703, + "rewards/rejected": -1.5016790628433228, + "step": 943 + }, + { + "epoch": 0.89, + "grad_norm": 23.30292510986328, + "learning_rate": 3.9052116124519064e-07, + "logps/chosen": -56.98743438720703, + "logps/rejected": -84.62454223632812, + "loss": 0.569, + "losses/dpo": 0.30215567350387573, + "losses/sft": 1.9144946336746216, + "losses/total": 0.30215567350387573, + "ref_logps/chosen": -41.867645263671875, + "ref_logps/rejected": -64.49409484863281, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.511979103088379, + "rewards/margins": 0.501066267490387, + "rewards/rejected": -2.013045310974121, + "step": 944 + }, + { + "epoch": 0.89, + "grad_norm": 21.492578506469727, + "learning_rate": 3.9034627492130115e-07, + "logps/chosen": -52.53728485107422, + "logps/rejected": -56.453453063964844, + "loss": 0.6, + "losses/dpo": 0.6856805086135864, + "losses/sft": 1.5601537227630615, + "losses/total": 0.6856805086135864, + "ref_logps/chosen": -43.88945007324219, + "ref_logps/rejected": -42.156883239746094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8647836446762085, + "rewards/margins": 0.5648730993270874, + "rewards/rejected": -1.4296568632125854, + "step": 945 + }, + { + "epoch": 0.89, + "grad_norm": 22.729167938232422, + "learning_rate": 3.9017138859741166e-07, + "logps/chosen": -61.96066665649414, + "logps/rejected": -62.0217170715332, + "loss": 0.5765, + "losses/dpo": 0.34722572565078735, + "losses/sft": 1.393270492553711, + "losses/total": 0.34722572565078735, + "ref_logps/chosen": -50.60178756713867, + "ref_logps/rejected": -45.35589599609375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.135887622833252, + "rewards/margins": 0.5306945443153381, + "rewards/rejected": -1.6665821075439453, + "step": 946 + }, + { + "epoch": 0.89, + "grad_norm": 18.95105743408203, + "learning_rate": 3.8999650227352223e-07, + "logps/chosen": -40.4675178527832, + "logps/rejected": -50.58290100097656, + "loss": 0.5595, + "losses/dpo": 0.8041644096374512, + "losses/sft": 1.6902856826782227, + "losses/total": 0.8041644096374512, + "ref_logps/chosen": -31.34231185913086, + "ref_logps/rejected": -34.31023406982422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9125205278396606, + "rewards/margins": 0.7147461771965027, + "rewards/rejected": -1.627266764640808, + "step": 947 + }, + { + "epoch": 0.9, + "grad_norm": 19.459688186645508, + "learning_rate": 3.898216159496327e-07, + "logps/chosen": -31.382644653320312, + "logps/rejected": -36.57513427734375, + "loss": 0.6887, + "losses/dpo": 0.915522038936615, + "losses/sft": 1.5414443016052246, + "losses/total": 0.915522038936615, + "ref_logps/chosen": -25.55060577392578, + "ref_logps/rejected": -29.547283172607422, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5832041501998901, + "rewards/margins": 0.11958077549934387, + "rewards/rejected": -0.7027848958969116, + "step": 948 + }, + { + "epoch": 0.9, + "grad_norm": 15.459049224853516, + "learning_rate": 3.8964672962574325e-07, + "logps/chosen": -35.39344024658203, + "logps/rejected": -43.9502067565918, + "loss": 0.4462, + "losses/dpo": 0.3306221067905426, + "losses/sft": 1.1747256517410278, + "losses/total": 0.3306221067905426, + "ref_logps/chosen": -30.467212677001953, + "ref_logps/rejected": -30.87660026550293, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.49262285232543945, + "rewards/margins": 0.8147377967834473, + "rewards/rejected": -1.3073606491088867, + "step": 949 + }, + { + "epoch": 0.9, + "grad_norm": 15.679216384887695, + "learning_rate": 3.8947184330185376e-07, + "logps/chosen": -43.178802490234375, + "logps/rejected": -63.8748893737793, + "loss": 0.392, + "losses/dpo": 0.3804413378238678, + "losses/sft": 2.044457197189331, + "losses/total": 0.3804413378238678, + "ref_logps/chosen": -33.71409225463867, + "ref_logps/rejected": -42.914451599121094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9464713931083679, + "rewards/margins": 1.1495721340179443, + "rewards/rejected": -2.096043586730957, + "step": 950 + }, + { + "epoch": 0.9, + "grad_norm": 24.586549758911133, + "learning_rate": 3.8929695697796433e-07, + "logps/chosen": -57.81559753417969, + "logps/rejected": -66.35832214355469, + "loss": 0.7206, + "losses/dpo": 1.0126290321350098, + "losses/sft": 2.1405298709869385, + "losses/total": 1.0126290321350098, + "ref_logps/chosen": -43.024879455566406, + "ref_logps/rejected": -49.4676513671875, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4790716171264648, + "rewards/margins": 0.20999547839164734, + "rewards/rejected": -1.6890668869018555, + "step": 951 + }, + { + "epoch": 0.9, + "grad_norm": 17.131330490112305, + "learning_rate": 3.8912207065407484e-07, + "logps/chosen": -37.56055450439453, + "logps/rejected": -64.56658172607422, + "loss": 0.4258, + "losses/dpo": 0.3325684666633606, + "losses/sft": 1.6636767387390137, + "losses/total": 0.3325684666633606, + "ref_logps/chosen": -28.85824203491211, + "ref_logps/rejected": -48.03290939331055, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8702311515808105, + "rewards/margins": 0.7831363677978516, + "rewards/rejected": -1.653367519378662, + "step": 952 + }, + { + "epoch": 0.9, + "grad_norm": 15.110618591308594, + "learning_rate": 3.8894718433018536e-07, + "logps/chosen": -53.59522247314453, + "logps/rejected": -67.45619201660156, + "loss": 0.4449, + "losses/dpo": 0.3363577425479889, + "losses/sft": 2.001035213470459, + "losses/total": 0.3363577425479889, + "ref_logps/chosen": -43.685890197753906, + "ref_logps/rejected": -47.81781005859375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9909330606460571, + "rewards/margins": 0.9729050397872925, + "rewards/rejected": -1.9638381004333496, + "step": 953 + }, + { + "epoch": 0.9, + "grad_norm": 20.63409996032715, + "learning_rate": 3.887722980062959e-07, + "logps/chosen": -45.954010009765625, + "logps/rejected": -55.219051361083984, + "loss": 0.5064, + "losses/dpo": 0.45236095786094666, + "losses/sft": 1.484810709953308, + "losses/total": 0.45236095786094666, + "ref_logps/chosen": -39.28350830078125, + "ref_logps/rejected": -41.63930892944336, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6670501828193665, + "rewards/margins": 0.6909241676330566, + "rewards/rejected": -1.3579744100570679, + "step": 954 + }, + { + "epoch": 0.9, + "grad_norm": 18.03986930847168, + "learning_rate": 3.885974116824064e-07, + "logps/chosen": -35.99790573120117, + "logps/rejected": -55.131629943847656, + "loss": 0.5489, + "losses/dpo": 0.3931209444999695, + "losses/sft": 1.5018911361694336, + "losses/total": 0.3931209444999695, + "ref_logps/chosen": -27.373233795166016, + "ref_logps/rejected": -41.234100341796875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8624672889709473, + "rewards/margins": 0.5272859334945679, + "rewards/rejected": -1.3897532224655151, + "step": 955 + }, + { + "epoch": 0.9, + "grad_norm": 16.459518432617188, + "learning_rate": 3.8842252535851695e-07, + "logps/chosen": -49.29063415527344, + "logps/rejected": -71.05470275878906, + "loss": 0.3506, + "losses/dpo": 0.25867295265197754, + "losses/sft": 1.7450703382492065, + "losses/total": 0.25867295265197754, + "ref_logps/chosen": -39.947410583496094, + "ref_logps/rejected": -51.3934326171875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9343230128288269, + "rewards/margins": 1.0318034887313843, + "rewards/rejected": -1.966126561164856, + "step": 956 + }, + { + "epoch": 0.9, + "grad_norm": 21.14252281188965, + "learning_rate": 3.8824763903462746e-07, + "logps/chosen": -41.167327880859375, + "logps/rejected": -56.70052719116211, + "loss": 0.6161, + "losses/dpo": 0.24370966851711273, + "losses/sft": 1.8089507818222046, + "losses/total": 0.24370966851711273, + "ref_logps/chosen": -29.911165237426758, + "ref_logps/rejected": -40.54180908203125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1256160736083984, + "rewards/margins": 0.49025583267211914, + "rewards/rejected": -1.6158719062805176, + "step": 957 + }, + { + "epoch": 0.9, + "grad_norm": 15.646565437316895, + "learning_rate": 3.88072752710738e-07, + "logps/chosen": -43.166893005371094, + "logps/rejected": -63.51106262207031, + "loss": 0.4196, + "losses/dpo": 0.31831222772598267, + "losses/sft": 2.056562900543213, + "losses/total": 0.31831222772598267, + "ref_logps/chosen": -34.48486328125, + "ref_logps/rejected": -44.45180130004883, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8682032823562622, + "rewards/margins": 1.0377230644226074, + "rewards/rejected": -1.9059263467788696, + "step": 958 + }, + { + "epoch": 0.91, + "grad_norm": 23.05276870727539, + "learning_rate": 3.8789786638684854e-07, + "logps/chosen": -46.186981201171875, + "logps/rejected": -56.274253845214844, + "loss": 0.7001, + "losses/dpo": 0.8616746664047241, + "losses/sft": 1.8922234773635864, + "losses/total": 0.8616746664047241, + "ref_logps/chosen": -35.46388244628906, + "ref_logps/rejected": -42.425453186035156, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.072309970855713, + "rewards/margins": 0.3125705420970917, + "rewards/rejected": -1.384880542755127, + "step": 959 + }, + { + "epoch": 0.91, + "grad_norm": 28.662580490112305, + "learning_rate": 3.8772298006295905e-07, + "logps/chosen": -76.91574096679688, + "logps/rejected": -69.48300170898438, + "loss": 0.6311, + "losses/dpo": 0.9258673787117004, + "losses/sft": 2.213568687438965, + "losses/total": 0.9258673787117004, + "ref_logps/chosen": -63.26716613769531, + "ref_logps/rejected": -49.8331413269043, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3648579120635986, + "rewards/margins": 0.6001282334327698, + "rewards/rejected": -1.9649860858917236, + "step": 960 + }, + { + "epoch": 0.91, + "grad_norm": 22.746126174926758, + "learning_rate": 3.875480937390696e-07, + "logps/chosen": -50.369815826416016, + "logps/rejected": -50.867897033691406, + "loss": 0.6593, + "losses/dpo": 0.4779524505138397, + "losses/sft": 1.7878148555755615, + "losses/total": 0.4779524505138397, + "ref_logps/chosen": -38.644439697265625, + "ref_logps/rejected": -36.70579147338867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1725378036499023, + "rewards/margins": 0.2436729520559311, + "rewards/rejected": -1.4162108898162842, + "step": 961 + }, + { + "epoch": 0.91, + "grad_norm": 19.56867218017578, + "learning_rate": 3.873732074151801e-07, + "logps/chosen": -42.459083557128906, + "logps/rejected": -56.513710021972656, + "loss": 0.6029, + "losses/dpo": 0.6437307596206665, + "losses/sft": 1.8656847476959229, + "losses/total": 0.6437307596206665, + "ref_logps/chosen": -31.495113372802734, + "ref_logps/rejected": -39.55369186401367, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.096397042274475, + "rewards/margins": 0.5996049046516418, + "rewards/rejected": -1.6960020065307617, + "step": 962 + }, + { + "epoch": 0.91, + "grad_norm": 19.292837142944336, + "learning_rate": 3.8719832109129064e-07, + "logps/chosen": -50.66706466674805, + "logps/rejected": -64.09747314453125, + "loss": 0.491, + "losses/dpo": 0.49953269958496094, + "losses/sft": 2.0225589275360107, + "losses/total": 0.49953269958496094, + "ref_logps/chosen": -39.68737030029297, + "ref_logps/rejected": -46.53889465332031, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0979692935943604, + "rewards/margins": 0.6578893661499023, + "rewards/rejected": -1.7558586597442627, + "step": 963 + }, + { + "epoch": 0.91, + "grad_norm": 26.163936614990234, + "learning_rate": 3.870234347674012e-07, + "logps/chosen": -56.905460357666016, + "logps/rejected": -64.78118896484375, + "loss": 0.6471, + "losses/dpo": 0.24652838706970215, + "losses/sft": 1.862913966178894, + "losses/total": 0.24652838706970215, + "ref_logps/chosen": -43.511051177978516, + "ref_logps/rejected": -47.04642868041992, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.339440941810608, + "rewards/margins": 0.43403521180152893, + "rewards/rejected": -1.7734761238098145, + "step": 964 + }, + { + "epoch": 0.91, + "grad_norm": 24.758644104003906, + "learning_rate": 3.868485484435117e-07, + "logps/chosen": -50.36905288696289, + "logps/rejected": -63.91368103027344, + "loss": 0.7482, + "losses/dpo": 0.9134958982467651, + "losses/sft": 1.8413722515106201, + "losses/total": 0.9134958982467651, + "ref_logps/chosen": -38.325660705566406, + "ref_logps/rejected": -49.209144592285156, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2043395042419434, + "rewards/margins": 0.26611417531967163, + "rewards/rejected": -1.4704537391662598, + "step": 965 + }, + { + "epoch": 0.91, + "grad_norm": 14.799829483032227, + "learning_rate": 3.8667366211962223e-07, + "logps/chosen": -45.358970642089844, + "logps/rejected": -62.32545471191406, + "loss": 0.3398, + "losses/dpo": 0.22617366909980774, + "losses/sft": 1.3016481399536133, + "losses/total": 0.22617366909980774, + "ref_logps/chosen": -38.82315444946289, + "ref_logps/rejected": -43.84934997558594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.653581440448761, + "rewards/margins": 1.1940287351608276, + "rewards/rejected": -1.8476101160049438, + "step": 966 + }, + { + "epoch": 0.91, + "grad_norm": 19.053054809570312, + "learning_rate": 3.8649877579573274e-07, + "logps/chosen": -41.64030456542969, + "logps/rejected": -52.60406494140625, + "loss": 0.5582, + "losses/dpo": 0.8065115213394165, + "losses/sft": 1.68719482421875, + "losses/total": 0.8065115213394165, + "ref_logps/chosen": -31.42445945739746, + "ref_logps/rejected": -36.04149627685547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0215842723846436, + "rewards/margins": 0.6346728801727295, + "rewards/rejected": -1.6562572717666626, + "step": 967 + }, + { + "epoch": 0.91, + "grad_norm": 19.083852767944336, + "learning_rate": 3.863238894718433e-07, + "logps/chosen": -52.436588287353516, + "logps/rejected": -74.2242431640625, + "loss": 0.4103, + "losses/dpo": 0.2037041187286377, + "losses/sft": 1.3081860542297363, + "losses/total": 0.2037041187286377, + "ref_logps/chosen": -43.161067962646484, + "ref_logps/rejected": -54.99870300292969, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9275518655776978, + "rewards/margins": 0.9950025677680969, + "rewards/rejected": -1.9225544929504395, + "step": 968 + }, + { + "epoch": 0.92, + "grad_norm": 22.65544891357422, + "learning_rate": 3.8614900314795377e-07, + "logps/chosen": -50.19179153442383, + "logps/rejected": -69.511474609375, + "loss": 0.5592, + "losses/dpo": 0.48498445749282837, + "losses/sft": 1.9186389446258545, + "losses/total": 0.48498445749282837, + "ref_logps/chosen": -40.42304229736328, + "ref_logps/rejected": -53.426124572753906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9768748879432678, + "rewards/margins": 0.6316596269607544, + "rewards/rejected": -1.608534574508667, + "step": 969 + }, + { + "epoch": 0.92, + "grad_norm": 24.638919830322266, + "learning_rate": 3.8597411682406434e-07, + "logps/chosen": -51.37481689453125, + "logps/rejected": -57.98303985595703, + "loss": 0.6773, + "losses/dpo": 0.9160447716712952, + "losses/sft": 1.5926841497421265, + "losses/total": 0.9160447716712952, + "ref_logps/chosen": -39.58479309082031, + "ref_logps/rejected": -40.19683837890625, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.179002285003662, + "rewards/margins": 0.5996181964874268, + "rewards/rejected": -1.7786204814910889, + "step": 970 + }, + { + "epoch": 0.92, + "grad_norm": 19.828022003173828, + "learning_rate": 3.857992305001749e-07, + "logps/chosen": -40.58400344848633, + "logps/rejected": -59.37562561035156, + "loss": 0.5543, + "losses/dpo": 0.495002806186676, + "losses/sft": 1.6763334274291992, + "losses/total": 0.495002806186676, + "ref_logps/chosen": -31.079570770263672, + "ref_logps/rejected": -43.05794906616211, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9504432678222656, + "rewards/margins": 0.6813246607780457, + "rewards/rejected": -1.631767988204956, + "step": 971 + }, + { + "epoch": 0.92, + "grad_norm": 23.73946189880371, + "learning_rate": 3.856243441762854e-07, + "logps/chosen": -54.873634338378906, + "logps/rejected": -51.76392364501953, + "loss": 0.7246, + "losses/dpo": 0.5085068941116333, + "losses/sft": 1.5580775737762451, + "losses/total": 0.5085068941116333, + "ref_logps/chosen": -45.79169845581055, + "ref_logps/rejected": -40.69203186035156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.908193826675415, + "rewards/margins": 0.19899529218673706, + "rewards/rejected": -1.1071891784667969, + "step": 972 + }, + { + "epoch": 0.92, + "grad_norm": 18.36198616027832, + "learning_rate": 3.854494578523959e-07, + "logps/chosen": -50.29176330566406, + "logps/rejected": -57.004920959472656, + "loss": 0.5245, + "losses/dpo": 0.6440709829330444, + "losses/sft": 1.832862377166748, + "losses/total": 0.6440709829330444, + "ref_logps/chosen": -40.84815979003906, + "ref_logps/rejected": -41.187034606933594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9443599581718445, + "rewards/margins": 0.6374289393424988, + "rewards/rejected": -1.5817888975143433, + "step": 973 + }, + { + "epoch": 0.92, + "grad_norm": 18.88251304626465, + "learning_rate": 3.8527457152850644e-07, + "logps/chosen": -43.22986602783203, + "logps/rejected": -67.54998779296875, + "loss": 0.5005, + "losses/dpo": 0.561363697052002, + "losses/sft": 1.8750559091567993, + "losses/total": 0.561363697052002, + "ref_logps/chosen": -33.15339279174805, + "ref_logps/rejected": -50.49925994873047, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0076472759246826, + "rewards/margins": 0.6974254250526428, + "rewards/rejected": -1.7050727605819702, + "step": 974 + }, + { + "epoch": 0.92, + "grad_norm": 18.624319076538086, + "learning_rate": 3.85099685204617e-07, + "logps/chosen": -45.34701919555664, + "logps/rejected": -69.48483276367188, + "loss": 0.4595, + "losses/dpo": 0.7396693229675293, + "losses/sft": 1.9366568326950073, + "losses/total": 0.7396693229675293, + "ref_logps/chosen": -36.88391876220703, + "ref_logps/rejected": -52.47506332397461, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8463100790977478, + "rewards/margins": 0.8546671867370605, + "rewards/rejected": -1.7009772062301636, + "step": 975 + }, + { + "epoch": 0.92, + "grad_norm": 23.694549560546875, + "learning_rate": 3.8492479888072746e-07, + "logps/chosen": -47.048587799072266, + "logps/rejected": -56.86940383911133, + "loss": 0.7133, + "losses/dpo": 0.9186302423477173, + "losses/sft": 1.9295350313186646, + "losses/total": 0.9186302423477173, + "ref_logps/chosen": -35.889808654785156, + "ref_logps/rejected": -39.932003021240234, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1158777475357056, + "rewards/margins": 0.5778622627258301, + "rewards/rejected": -1.6937401294708252, + "step": 976 + }, + { + "epoch": 0.92, + "grad_norm": 17.80436897277832, + "learning_rate": 3.8474991255683803e-07, + "logps/chosen": -42.967979431152344, + "logps/rejected": -70.74673461914062, + "loss": 0.4746, + "losses/dpo": 0.3472397029399872, + "losses/sft": 1.6480947732925415, + "losses/total": 0.3472397029399872, + "ref_logps/chosen": -35.133766174316406, + "ref_logps/rejected": -52.64344024658203, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7834211587905884, + "rewards/margins": 1.0269087553024292, + "rewards/rejected": -1.8103299140930176, + "step": 977 + }, + { + "epoch": 0.92, + "grad_norm": 20.923574447631836, + "learning_rate": 3.845750262329486e-07, + "logps/chosen": -52.09071350097656, + "logps/rejected": -57.20856475830078, + "loss": 0.48, + "losses/dpo": 0.2674854099750519, + "losses/sft": 1.99339759349823, + "losses/total": 0.2674854099750519, + "ref_logps/chosen": -42.38469696044922, + "ref_logps/rejected": -40.23178482055664, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9706017374992371, + "rewards/margins": 0.7270764708518982, + "rewards/rejected": -1.6976782083511353, + "step": 978 + }, + { + "epoch": 0.92, + "grad_norm": 23.62529182434082, + "learning_rate": 3.844001399090591e-07, + "logps/chosen": -56.328712463378906, + "logps/rejected": -58.30531692504883, + "loss": 0.6757, + "losses/dpo": 0.5385838150978088, + "losses/sft": 1.8027693033218384, + "losses/total": 0.5385838150978088, + "ref_logps/chosen": -42.6113395690918, + "ref_logps/rejected": -41.7963752746582, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3717374801635742, + "rewards/margins": 0.2791568636894226, + "rewards/rejected": -1.6508941650390625, + "step": 979 + }, + { + "epoch": 0.93, + "grad_norm": 18.919498443603516, + "learning_rate": 3.842252535851696e-07, + "logps/chosen": -48.087867736816406, + "logps/rejected": -70.01002502441406, + "loss": 0.5324, + "losses/dpo": 0.4499993920326233, + "losses/sft": 1.245666265487671, + "losses/total": 0.4499993920326233, + "ref_logps/chosen": -35.94928741455078, + "ref_logps/rejected": -51.554725646972656, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2138580083847046, + "rewards/margins": 0.6316715478897095, + "rewards/rejected": -1.845529556274414, + "step": 980 + }, + { + "epoch": 0.93, + "grad_norm": 19.970478057861328, + "learning_rate": 3.8405036726128013e-07, + "logps/chosen": -48.89336395263672, + "logps/rejected": -60.02153015136719, + "loss": 0.4721, + "losses/dpo": 0.6085375547409058, + "losses/sft": 1.633169412612915, + "losses/total": 0.6085375547409058, + "ref_logps/chosen": -40.027061462402344, + "ref_logps/rejected": -42.2012825012207, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8866301774978638, + "rewards/margins": 0.8953946828842163, + "rewards/rejected": -1.78202486038208, + "step": 981 + }, + { + "epoch": 0.93, + "grad_norm": 17.84539222717285, + "learning_rate": 3.838754809373907e-07, + "logps/chosen": -43.731719970703125, + "logps/rejected": -61.472137451171875, + "loss": 0.4654, + "losses/dpo": 0.5906164646148682, + "losses/sft": 1.684012532234192, + "losses/total": 0.5906164646148682, + "ref_logps/chosen": -33.648414611816406, + "ref_logps/rejected": -44.06370544433594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0083309412002563, + "rewards/margins": 0.7325129508972168, + "rewards/rejected": -1.7408437728881836, + "step": 982 + }, + { + "epoch": 0.93, + "grad_norm": 26.598716735839844, + "learning_rate": 3.8370059461350116e-07, + "logps/chosen": -51.013755798339844, + "logps/rejected": -52.03260040283203, + "loss": 0.8181, + "losses/dpo": 0.9581042528152466, + "losses/sft": 1.6930867433547974, + "losses/total": 0.9581042528152466, + "ref_logps/chosen": -37.647823333740234, + "ref_logps/rejected": -37.89427185058594, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3365932703018188, + "rewards/margins": 0.0772397592663765, + "rewards/rejected": -1.4138330221176147, + "step": 983 + }, + { + "epoch": 0.93, + "grad_norm": 21.65802001953125, + "learning_rate": 3.835257082896117e-07, + "logps/chosen": -51.42059326171875, + "logps/rejected": -71.58717346191406, + "loss": 0.5062, + "losses/dpo": 0.7946414947509766, + "losses/sft": 1.9316154718399048, + "losses/total": 0.7946414947509766, + "ref_logps/chosen": -40.99918746948242, + "ref_logps/rejected": -51.60511779785156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0421407222747803, + "rewards/margins": 0.9560640454292297, + "rewards/rejected": -1.9982047080993652, + "step": 984 + }, + { + "epoch": 0.93, + "grad_norm": 16.202625274658203, + "learning_rate": 3.833508219657223e-07, + "logps/chosen": -39.13245391845703, + "logps/rejected": -57.883689880371094, + "loss": 0.4267, + "losses/dpo": 0.38652050495147705, + "losses/sft": 1.6998968124389648, + "losses/total": 0.38652050495147705, + "ref_logps/chosen": -32.44765090942383, + "ref_logps/rejected": -41.958614349365234, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6684800386428833, + "rewards/margins": 0.9240274429321289, + "rewards/rejected": -1.5925076007843018, + "step": 985 + }, + { + "epoch": 0.93, + "grad_norm": 27.589420318603516, + "learning_rate": 3.831759356418328e-07, + "logps/chosen": -51.51816940307617, + "logps/rejected": -53.463623046875, + "loss": 0.6767, + "losses/dpo": 0.47124022245407104, + "losses/sft": 1.4325437545776367, + "losses/total": 0.47124022245407104, + "ref_logps/chosen": -39.39977264404297, + "ref_logps/rejected": -38.43071365356445, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2118401527404785, + "rewards/margins": 0.291450560092926, + "rewards/rejected": -1.5032906532287598, + "step": 986 + }, + { + "epoch": 0.93, + "grad_norm": 21.131973266601562, + "learning_rate": 3.830010493179433e-07, + "logps/chosen": -45.02501678466797, + "logps/rejected": -57.56442642211914, + "loss": 0.5345, + "losses/dpo": 0.3590014576911926, + "losses/sft": 1.4518725872039795, + "losses/total": 0.3590014576911926, + "ref_logps/chosen": -35.90077209472656, + "ref_logps/rejected": -41.12401580810547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9124248027801514, + "rewards/margins": 0.7316160202026367, + "rewards/rejected": -1.644040822982788, + "step": 987 + }, + { + "epoch": 0.93, + "grad_norm": 19.58783531188965, + "learning_rate": 3.8282616299405383e-07, + "logps/chosen": -48.63311004638672, + "logps/rejected": -65.70501708984375, + "loss": 0.5203, + "losses/dpo": 0.5570815801620483, + "losses/sft": 1.673298954963684, + "losses/total": 0.5570815801620483, + "ref_logps/chosen": -36.90353775024414, + "ref_logps/rejected": -45.29994201660156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.172957181930542, + "rewards/margins": 0.867550790309906, + "rewards/rejected": -2.0405077934265137, + "step": 988 + }, + { + "epoch": 0.93, + "grad_norm": 29.893646240234375, + "learning_rate": 3.826512766701644e-07, + "logps/chosen": -53.61981201171875, + "logps/rejected": -60.33222198486328, + "loss": 0.8667, + "losses/dpo": 0.7239810824394226, + "losses/sft": 1.856675624847412, + "losses/total": 0.7239810824394226, + "ref_logps/chosen": -37.53385925292969, + "ref_logps/rejected": -42.647804260253906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6085954904556274, + "rewards/margins": 0.1598462462425232, + "rewards/rejected": -1.7684417963027954, + "step": 989 + }, + { + "epoch": 0.93, + "grad_norm": 27.25286293029785, + "learning_rate": 3.824763903462749e-07, + "logps/chosen": -61.245811462402344, + "logps/rejected": -75.42939758300781, + "loss": 0.4865, + "losses/dpo": 0.39196962118148804, + "losses/sft": 1.2838695049285889, + "losses/total": 0.39196962118148804, + "ref_logps/chosen": -51.34972381591797, + "ref_logps/rejected": -55.9708251953125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9896084666252136, + "rewards/margins": 0.9562491178512573, + "rewards/rejected": -1.9458576440811157, + "step": 990 + }, + { + "epoch": 0.94, + "grad_norm": 20.264774322509766, + "learning_rate": 3.823015040223854e-07, + "logps/chosen": -48.33627700805664, + "logps/rejected": -54.53680419921875, + "loss": 0.6103, + "losses/dpo": 0.865106463432312, + "losses/sft": 2.1319375038146973, + "losses/total": 0.865106463432312, + "ref_logps/chosen": -37.25901794433594, + "ref_logps/rejected": -38.98707580566406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1077256202697754, + "rewards/margins": 0.44724732637405396, + "rewards/rejected": -1.5549730062484741, + "step": 991 + }, + { + "epoch": 0.94, + "grad_norm": 17.77625846862793, + "learning_rate": 3.82126617698496e-07, + "logps/chosen": -45.6514892578125, + "logps/rejected": -70.92652893066406, + "loss": 0.4292, + "losses/dpo": 0.348925918340683, + "losses/sft": 1.6102349758148193, + "losses/total": 0.348925918340683, + "ref_logps/chosen": -37.006778717041016, + "ref_logps/rejected": -54.52971267700195, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8644711375236511, + "rewards/margins": 0.7752112150192261, + "rewards/rejected": -1.6396822929382324, + "step": 992 + }, + { + "epoch": 0.94, + "grad_norm": 29.45354461669922, + "learning_rate": 3.819517313746065e-07, + "logps/chosen": -52.84046936035156, + "logps/rejected": -53.68536376953125, + "loss": 0.8718, + "losses/dpo": 0.8627600073814392, + "losses/sft": 1.4657849073410034, + "losses/total": 0.8627600073814392, + "ref_logps/chosen": -40.24302291870117, + "ref_logps/rejected": -42.56493377685547, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.2597452402114868, + "rewards/margins": -0.14770209789276123, + "rewards/rejected": -1.1120431423187256, + "step": 993 + }, + { + "epoch": 0.94, + "grad_norm": 30.172273635864258, + "learning_rate": 3.81776845050717e-07, + "logps/chosen": -46.63984680175781, + "logps/rejected": -48.94502639770508, + "loss": 0.8928, + "losses/dpo": 0.9043549299240112, + "losses/sft": 1.6924247741699219, + "losses/total": 0.9043549299240112, + "ref_logps/chosen": -36.92937469482422, + "ref_logps/rejected": -39.463645935058594, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9710469841957092, + "rewards/margins": -0.022908613085746765, + "rewards/rejected": -0.9481383562088013, + "step": 994 + }, + { + "epoch": 0.94, + "grad_norm": 26.310802459716797, + "learning_rate": 3.816019587268275e-07, + "logps/chosen": -64.32036590576172, + "logps/rejected": -60.466400146484375, + "loss": 0.5553, + "losses/dpo": 0.43956348299980164, + "losses/sft": 1.8819116353988647, + "losses/total": 0.43956348299980164, + "ref_logps/chosen": -51.93980407714844, + "ref_logps/rejected": -41.98312759399414, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.238055944442749, + "rewards/margins": 0.6102713346481323, + "rewards/rejected": -1.8483272790908813, + "step": 995 + }, + { + "epoch": 0.94, + "grad_norm": 22.787324905395508, + "learning_rate": 3.814270724029381e-07, + "logps/chosen": -55.270965576171875, + "logps/rejected": -53.940147399902344, + "loss": 0.6598, + "losses/dpo": 0.6217118501663208, + "losses/sft": 2.482112169265747, + "losses/total": 0.6217118501663208, + "ref_logps/chosen": -44.208213806152344, + "ref_logps/rejected": -39.184974670410156, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1062750816345215, + "rewards/margins": 0.36924174427986145, + "rewards/rejected": -1.47551691532135, + "step": 996 + }, + { + "epoch": 0.94, + "grad_norm": 22.396766662597656, + "learning_rate": 3.812521860790486e-07, + "logps/chosen": -53.92234802246094, + "logps/rejected": -63.89082336425781, + "loss": 0.5739, + "losses/dpo": 0.5646842122077942, + "losses/sft": 1.2429379224777222, + "losses/total": 0.5646842122077942, + "ref_logps/chosen": -43.771934509277344, + "ref_logps/rejected": -47.69895553588867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.015041470527649, + "rewards/margins": 0.6041450500488281, + "rewards/rejected": -1.619186520576477, + "step": 997 + }, + { + "epoch": 0.94, + "grad_norm": 17.27048683166504, + "learning_rate": 3.810772997551591e-07, + "logps/chosen": -50.35649871826172, + "logps/rejected": -60.818870544433594, + "loss": 0.4675, + "losses/dpo": 0.3894166350364685, + "losses/sft": 2.066436767578125, + "losses/total": 0.3894166350364685, + "ref_logps/chosen": -45.14312744140625, + "ref_logps/rejected": -47.95522689819336, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5213367938995361, + "rewards/margins": 0.7650270462036133, + "rewards/rejected": -1.2863638401031494, + "step": 998 + }, + { + "epoch": 0.94, + "grad_norm": 25.125530242919922, + "learning_rate": 3.809024134312697e-07, + "logps/chosen": -53.18193817138672, + "logps/rejected": -58.44088363647461, + "loss": 0.6611, + "losses/dpo": 0.9224898815155029, + "losses/sft": 1.8091856241226196, + "losses/total": 0.9224898815155029, + "ref_logps/chosen": -42.58659362792969, + "ref_logps/rejected": -45.03008270263672, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0595345497131348, + "rewards/margins": 0.28154563903808594, + "rewards/rejected": -1.3410801887512207, + "step": 999 + }, + { + "epoch": 0.94, + "grad_norm": 27.306087493896484, + "learning_rate": 3.807275271073802e-07, + "logps/chosen": -63.88386535644531, + "logps/rejected": -59.284210205078125, + "loss": 0.7064, + "losses/dpo": 0.8276010751724243, + "losses/sft": 2.3495540618896484, + "losses/total": 0.8276010751724243, + "ref_logps/chosen": -52.349456787109375, + "ref_logps/rejected": -45.523284912109375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.153441309928894, + "rewards/margins": 0.22265127301216125, + "rewards/rejected": -1.3760924339294434, + "step": 1000 + }, + { + "epoch": 0.95, + "grad_norm": 22.246232986450195, + "learning_rate": 3.805526407834907e-07, + "logps/chosen": -44.66143035888672, + "logps/rejected": -55.53178405761719, + "loss": 0.6809, + "losses/dpo": 1.2138080596923828, + "losses/sft": 2.2949931621551514, + "losses/total": 1.2138080596923828, + "ref_logps/chosen": -34.898231506347656, + "ref_logps/rejected": -42.61272430419922, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9763199090957642, + "rewards/margins": 0.3155861496925354, + "rewards/rejected": -1.2919061183929443, + "step": 1001 + }, + { + "epoch": 0.95, + "grad_norm": 11.103623390197754, + "learning_rate": 3.803777544596012e-07, + "logps/chosen": -46.28156661987305, + "logps/rejected": -66.62234497070312, + "loss": 0.3419, + "losses/dpo": 0.46704888343811035, + "losses/sft": 1.5207267999649048, + "losses/total": 0.46704888343811035, + "ref_logps/chosen": -39.715660095214844, + "ref_logps/rejected": -48.267433166503906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6565909385681152, + "rewards/margins": 1.1789004802703857, + "rewards/rejected": -1.835491418838501, + "step": 1002 + }, + { + "epoch": 0.95, + "grad_norm": 24.383560180664062, + "learning_rate": 3.802028681357118e-07, + "logps/chosen": -50.68195343017578, + "logps/rejected": -60.60013198852539, + "loss": 0.6051, + "losses/dpo": 0.4206312298774719, + "losses/sft": 1.2067759037017822, + "losses/total": 0.4206312298774719, + "ref_logps/chosen": -41.747169494628906, + "ref_logps/rejected": -46.576988220214844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8934780955314636, + "rewards/margins": 0.5088360905647278, + "rewards/rejected": -1.4023141860961914, + "step": 1003 + }, + { + "epoch": 0.95, + "grad_norm": 18.2020320892334, + "learning_rate": 3.800279818118223e-07, + "logps/chosen": -52.240333557128906, + "logps/rejected": -64.44216918945312, + "loss": 0.5414, + "losses/dpo": 0.6033343076705933, + "losses/sft": 1.76661217212677, + "losses/total": 0.6033343076705933, + "ref_logps/chosen": -41.72374725341797, + "ref_logps/rejected": -48.181156158447266, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0516587495803833, + "rewards/margins": 0.5744425058364868, + "rewards/rejected": -1.6261012554168701, + "step": 1004 + }, + { + "epoch": 0.95, + "grad_norm": 22.256027221679688, + "learning_rate": 3.798530954879328e-07, + "logps/chosen": -41.30514907836914, + "logps/rejected": -46.581298828125, + "loss": 0.6013, + "losses/dpo": 0.5583899021148682, + "losses/sft": 1.2865116596221924, + "losses/total": 0.5583899021148682, + "ref_logps/chosen": -34.4627571105957, + "ref_logps/rejected": -35.83324432373047, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6842390894889832, + "rewards/margins": 0.3905665874481201, + "rewards/rejected": -1.074805736541748, + "step": 1005 + }, + { + "epoch": 0.95, + "grad_norm": 25.01209259033203, + "learning_rate": 3.7967820916404337e-07, + "logps/chosen": -52.127925872802734, + "logps/rejected": -71.54975891113281, + "loss": 0.6054, + "losses/dpo": 0.6716561913490295, + "losses/sft": 1.758927822113037, + "losses/total": 0.6716561913490295, + "ref_logps/chosen": -39.68230438232422, + "ref_logps/rejected": -53.215599060058594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2445619106292725, + "rewards/margins": 0.5888535976409912, + "rewards/rejected": -1.8334155082702637, + "step": 1006 + }, + { + "epoch": 0.95, + "grad_norm": 18.98240089416504, + "learning_rate": 3.795033228401539e-07, + "logps/chosen": -49.99851608276367, + "logps/rejected": -73.98358917236328, + "loss": 0.4659, + "losses/dpo": 0.4777738153934479, + "losses/sft": 1.444373607635498, + "losses/total": 0.4777738153934479, + "ref_logps/chosen": -41.225772857666016, + "ref_logps/rejected": -57.0179443359375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8772743940353394, + "rewards/margins": 0.8192898035049438, + "rewards/rejected": -1.6965641975402832, + "step": 1007 + }, + { + "epoch": 0.95, + "grad_norm": 21.20924949645996, + "learning_rate": 3.793284365162644e-07, + "logps/chosen": -44.317840576171875, + "logps/rejected": -50.83393096923828, + "loss": 0.5868, + "losses/dpo": 0.6321349143981934, + "losses/sft": 1.7833943367004395, + "losses/total": 0.6321349143981934, + "ref_logps/chosen": -37.92741012573242, + "ref_logps/rejected": -40.65975570678711, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6390430927276611, + "rewards/margins": 0.37837448716163635, + "rewards/rejected": -1.0174176692962646, + "step": 1008 + }, + { + "epoch": 0.95, + "grad_norm": 17.293670654296875, + "learning_rate": 3.7915355019237496e-07, + "logps/chosen": -49.412513732910156, + "logps/rejected": -69.95012664794922, + "loss": 0.4331, + "losses/dpo": 0.3595442771911621, + "losses/sft": 1.9984853267669678, + "losses/total": 0.3595442771911621, + "ref_logps/chosen": -42.101043701171875, + "ref_logps/rejected": -54.27330017089844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7311465740203857, + "rewards/margins": 0.8365360498428345, + "rewards/rejected": -1.5676827430725098, + "step": 1009 + }, + { + "epoch": 0.95, + "grad_norm": 19.22477149963379, + "learning_rate": 3.789786638684855e-07, + "logps/chosen": -47.13673400878906, + "logps/rejected": -54.32954025268555, + "loss": 0.5807, + "losses/dpo": 0.6841297149658203, + "losses/sft": 1.992552399635315, + "losses/total": 0.6841297149658203, + "ref_logps/chosen": -38.553260803222656, + "ref_logps/rejected": -41.32449722290039, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8583472967147827, + "rewards/margins": 0.4421570301055908, + "rewards/rejected": -1.300504446029663, + "step": 1010 + }, + { + "epoch": 0.95, + "grad_norm": 24.354263305664062, + "learning_rate": 3.78803777544596e-07, + "logps/chosen": -47.76940155029297, + "logps/rejected": -51.45342254638672, + "loss": 0.6668, + "losses/dpo": 0.5718290209770203, + "losses/sft": 1.7609448432922363, + "losses/total": 0.5718290209770203, + "ref_logps/chosen": -37.937049865722656, + "ref_logps/rejected": -38.792938232421875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9832356572151184, + "rewards/margins": 0.2828129529953003, + "rewards/rejected": -1.266048550605774, + "step": 1011 + }, + { + "epoch": 0.96, + "grad_norm": 23.792476654052734, + "learning_rate": 3.786288912207065e-07, + "logps/chosen": -50.83463668823242, + "logps/rejected": -60.22372817993164, + "loss": 0.6897, + "losses/dpo": 0.6477515697479248, + "losses/sft": 1.191267967224121, + "losses/total": 0.6477515697479248, + "ref_logps/chosen": -41.08557891845703, + "ref_logps/rejected": -47.36979675292969, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9749058485031128, + "rewards/margins": 0.3104873597621918, + "rewards/rejected": -1.285393238067627, + "step": 1012 + }, + { + "epoch": 0.96, + "grad_norm": 21.003948211669922, + "learning_rate": 3.7845400489681707e-07, + "logps/chosen": -56.907066345214844, + "logps/rejected": -54.59217834472656, + "loss": 0.7273, + "losses/dpo": 0.49638450145721436, + "losses/sft": 1.5870550870895386, + "losses/total": 0.49638450145721436, + "ref_logps/chosen": -44.87751770019531, + "ref_logps/rejected": -39.635398864746094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2029545307159424, + "rewards/margins": 0.29272323846817017, + "rewards/rejected": -1.4956775903701782, + "step": 1013 + }, + { + "epoch": 0.96, + "grad_norm": 21.156476974487305, + "learning_rate": 3.782791185729276e-07, + "logps/chosen": -45.71857833862305, + "logps/rejected": -52.760101318359375, + "loss": 0.5529, + "losses/dpo": 0.516573965549469, + "losses/sft": 1.537147045135498, + "losses/total": 0.516573965549469, + "ref_logps/chosen": -36.65092468261719, + "ref_logps/rejected": -38.323673248291016, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9067651033401489, + "rewards/margins": 0.5368777513504028, + "rewards/rejected": -1.4436429738998413, + "step": 1014 + }, + { + "epoch": 0.96, + "grad_norm": 18.79005241394043, + "learning_rate": 3.781042322490381e-07, + "logps/chosen": -59.44023132324219, + "logps/rejected": -81.28251647949219, + "loss": 0.4426, + "losses/dpo": 0.44827088713645935, + "losses/sft": 1.653613805770874, + "losses/total": 0.44827088713645935, + "ref_logps/chosen": -49.774696350097656, + "ref_logps/rejected": -62.330841064453125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9665535688400269, + "rewards/margins": 0.9286140203475952, + "rewards/rejected": -1.895167589187622, + "step": 1015 + }, + { + "epoch": 0.96, + "grad_norm": 22.976865768432617, + "learning_rate": 3.7792934592514866e-07, + "logps/chosen": -45.89210510253906, + "logps/rejected": -41.85190200805664, + "loss": 0.7391, + "losses/dpo": 1.1187453269958496, + "losses/sft": 1.4958677291870117, + "losses/total": 1.1187453269958496, + "ref_logps/chosen": -37.236061096191406, + "ref_logps/rejected": -31.13835334777832, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8656048774719238, + "rewards/margins": 0.2057502567768097, + "rewards/rejected": -1.0713551044464111, + "step": 1016 + }, + { + "epoch": 0.96, + "grad_norm": 20.510265350341797, + "learning_rate": 3.7775445960125917e-07, + "logps/chosen": -45.14139938354492, + "logps/rejected": -54.66725540161133, + "loss": 0.6605, + "losses/dpo": 0.8151442408561707, + "losses/sft": 1.7279096841812134, + "losses/total": 0.8151442408561707, + "ref_logps/chosen": -36.190696716308594, + "ref_logps/rejected": -40.86396026611328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8950698971748352, + "rewards/margins": 0.4852599501609802, + "rewards/rejected": -1.3803297281265259, + "step": 1017 + }, + { + "epoch": 0.96, + "grad_norm": 23.57343864440918, + "learning_rate": 3.775795732773697e-07, + "logps/chosen": -48.56698989868164, + "logps/rejected": -50.53917694091797, + "loss": 0.6687, + "losses/dpo": 0.3203766644001007, + "losses/sft": 1.8190281391143799, + "losses/total": 0.3203766644001007, + "ref_logps/chosen": -39.001007080078125, + "ref_logps/rejected": -37.89703369140625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9565984010696411, + "rewards/margins": 0.30761587619781494, + "rewards/rejected": -1.264214277267456, + "step": 1018 + }, + { + "epoch": 0.96, + "grad_norm": 16.368715286254883, + "learning_rate": 3.774046869534802e-07, + "logps/chosen": -38.70006561279297, + "logps/rejected": -47.883331298828125, + "loss": 0.5139, + "losses/dpo": 0.36832404136657715, + "losses/sft": 1.65565824508667, + "losses/total": 0.36832404136657715, + "ref_logps/chosen": -35.24560546875, + "ref_logps/rejected": -38.418861389160156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3454464077949524, + "rewards/margins": 0.6010006666183472, + "rewards/rejected": -0.9464471340179443, + "step": 1019 + }, + { + "epoch": 0.96, + "grad_norm": 20.903846740722656, + "learning_rate": 3.7722980062959076e-07, + "logps/chosen": -51.29652404785156, + "logps/rejected": -65.01475524902344, + "loss": 0.5941, + "losses/dpo": 0.43693631887435913, + "losses/sft": 1.4224003553390503, + "losses/total": 0.43693631887435913, + "ref_logps/chosen": -38.19739532470703, + "ref_logps/rejected": -46.52694320678711, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3099132776260376, + "rewards/margins": 0.5388673543930054, + "rewards/rejected": -1.848780632019043, + "step": 1020 + }, + { + "epoch": 0.96, + "grad_norm": 32.36252975463867, + "learning_rate": 3.770549143057013e-07, + "logps/chosen": -57.22710418701172, + "logps/rejected": -72.98391723632812, + "loss": 0.7051, + "losses/dpo": 0.8327932357788086, + "losses/sft": 1.5569915771484375, + "losses/total": 0.8327932357788086, + "ref_logps/chosen": -46.58798599243164, + "ref_logps/rejected": -57.68727493286133, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0639116764068604, + "rewards/margins": 0.46575188636779785, + "rewards/rejected": -1.5296636819839478, + "step": 1021 + }, + { + "epoch": 0.97, + "grad_norm": 19.232439041137695, + "learning_rate": 3.768800279818118e-07, + "logps/chosen": -47.70030212402344, + "logps/rejected": -69.32796478271484, + "loss": 0.4997, + "losses/dpo": 0.3835393190383911, + "losses/sft": 1.9313428401947021, + "losses/total": 0.3835393190383911, + "ref_logps/chosen": -37.832122802734375, + "ref_logps/rejected": -53.7058219909668, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9868176579475403, + "rewards/margins": 0.5753968954086304, + "rewards/rejected": -1.5622146129608154, + "step": 1022 + }, + { + "epoch": 0.97, + "grad_norm": 20.81832504272461, + "learning_rate": 3.7670514165792235e-07, + "logps/chosen": -47.133567810058594, + "logps/rejected": -58.0272331237793, + "loss": 0.6315, + "losses/dpo": 0.39221709966659546, + "losses/sft": 1.6592286825180054, + "losses/total": 0.39221709966659546, + "ref_logps/chosen": -39.48822021484375, + "ref_logps/rejected": -46.671810150146484, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7645347714424133, + "rewards/margins": 0.37100735306739807, + "rewards/rejected": -1.1355421543121338, + "step": 1023 + }, + { + "epoch": 0.97, + "grad_norm": 23.17807388305664, + "learning_rate": 3.7653025533403287e-07, + "logps/chosen": -56.657379150390625, + "logps/rejected": -62.271324157714844, + "loss": 0.5657, + "losses/dpo": 1.0621778964996338, + "losses/sft": 1.998990535736084, + "losses/total": 1.0621778964996338, + "ref_logps/chosen": -47.528175354003906, + "ref_logps/rejected": -46.84623718261719, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9129210114479065, + "rewards/margins": 0.6295875310897827, + "rewards/rejected": -1.542508602142334, + "step": 1024 + }, + { + "epoch": 0.97, + "grad_norm": 21.51382064819336, + "learning_rate": 3.763553690101434e-07, + "logps/chosen": -46.72819137573242, + "logps/rejected": -69.21802520751953, + "loss": 0.5117, + "losses/dpo": 0.6582629680633545, + "losses/sft": 1.9512324333190918, + "losses/total": 0.6582629680633545, + "ref_logps/chosen": -37.18133544921875, + "ref_logps/rejected": -52.33158874511719, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9546855092048645, + "rewards/margins": 0.7339580059051514, + "rewards/rejected": -1.6886436939239502, + "step": 1025 + }, + { + "epoch": 0.97, + "grad_norm": 22.774349212646484, + "learning_rate": 3.761804826862539e-07, + "logps/chosen": -45.344322204589844, + "logps/rejected": -52.74687957763672, + "loss": 0.5741, + "losses/dpo": 0.49935778975486755, + "losses/sft": 1.4545105695724487, + "losses/total": 0.49935778975486755, + "ref_logps/chosen": -36.59073257446289, + "ref_logps/rejected": -39.85047912597656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8753587007522583, + "rewards/margins": 0.4142812490463257, + "rewards/rejected": -1.289639949798584, + "step": 1026 + }, + { + "epoch": 0.97, + "grad_norm": 24.60048484802246, + "learning_rate": 3.7600559636236446e-07, + "logps/chosen": -46.23433303833008, + "logps/rejected": -55.34899139404297, + "loss": 0.7812, + "losses/dpo": 0.6737127304077148, + "losses/sft": 1.1214194297790527, + "losses/total": 0.6737127304077148, + "ref_logps/chosen": -36.20796585083008, + "ref_logps/rejected": -44.365966796875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0026365518569946, + "rewards/margins": 0.09566552937030792, + "rewards/rejected": -1.0983021259307861, + "step": 1027 + }, + { + "epoch": 0.97, + "grad_norm": 21.08169937133789, + "learning_rate": 3.75830710038475e-07, + "logps/chosen": -50.23443603515625, + "logps/rejected": -52.77334213256836, + "loss": 0.6507, + "losses/dpo": 0.29170680046081543, + "losses/sft": 1.2326050996780396, + "losses/total": 0.29170680046081543, + "ref_logps/chosen": -44.33406066894531, + "ref_logps/rejected": -42.89617919921875, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5900372266769409, + "rewards/margins": 0.39767909049987793, + "rewards/rejected": -0.9877163171768188, + "step": 1028 + }, + { + "epoch": 0.97, + "grad_norm": 22.025733947753906, + "learning_rate": 3.756558237145855e-07, + "logps/chosen": -48.47467041015625, + "logps/rejected": -64.47343444824219, + "loss": 0.6189, + "losses/dpo": 0.5170712471008301, + "losses/sft": 1.1247472763061523, + "losses/total": 0.5170712471008301, + "ref_logps/chosen": -38.449867248535156, + "ref_logps/rejected": -51.1156005859375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0024805068969727, + "rewards/margins": 0.3333030939102173, + "rewards/rejected": -1.33578360080719, + "step": 1029 + }, + { + "epoch": 0.97, + "grad_norm": 14.860428810119629, + "learning_rate": 3.7548093739069605e-07, + "logps/chosen": -30.676557540893555, + "logps/rejected": -62.83124923706055, + "loss": 0.4224, + "losses/dpo": 0.37315595149993896, + "losses/sft": 1.9370832443237305, + "losses/total": 0.37315595149993896, + "ref_logps/chosen": -25.772823333740234, + "ref_logps/rejected": -50.06254959106445, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.49037328362464905, + "rewards/margins": 0.7864969968795776, + "rewards/rejected": -1.2768702507019043, + "step": 1030 + }, + { + "epoch": 0.97, + "grad_norm": 19.947961807250977, + "learning_rate": 3.7530605106680656e-07, + "logps/chosen": -47.666038513183594, + "logps/rejected": -51.949249267578125, + "loss": 0.6094, + "losses/dpo": 0.490215003490448, + "losses/sft": 2.1000752449035645, + "losses/total": 0.490215003490448, + "ref_logps/chosen": -39.435821533203125, + "ref_logps/rejected": -40.621337890625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8230217695236206, + "rewards/margins": 0.30976951122283936, + "rewards/rejected": -1.13279128074646, + "step": 1031 + }, + { + "epoch": 0.97, + "grad_norm": 18.579145431518555, + "learning_rate": 3.7513116474291707e-07, + "logps/chosen": -43.39051818847656, + "logps/rejected": -51.849422454833984, + "loss": 0.5608, + "losses/dpo": 0.6709878444671631, + "losses/sft": 2.0446085929870605, + "losses/total": 0.6709878444671631, + "ref_logps/chosen": -35.785308837890625, + "ref_logps/rejected": -39.19109344482422, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7605208158493042, + "rewards/margins": 0.5053122043609619, + "rewards/rejected": -1.2658330202102661, + "step": 1032 + }, + { + "epoch": 0.98, + "grad_norm": 21.474868774414062, + "learning_rate": 3.749562784190276e-07, + "logps/chosen": -53.84595489501953, + "logps/rejected": -66.29723358154297, + "loss": 0.6453, + "losses/dpo": 0.31144535541534424, + "losses/sft": 1.8267700672149658, + "losses/total": 0.31144535541534424, + "ref_logps/chosen": -43.61677932739258, + "ref_logps/rejected": -50.179439544677734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0229175090789795, + "rewards/margins": 0.5888623595237732, + "rewards/rejected": -1.6117799282073975, + "step": 1033 + }, + { + "epoch": 0.98, + "grad_norm": 21.029232025146484, + "learning_rate": 3.7478139209513815e-07, + "logps/chosen": -43.316226959228516, + "logps/rejected": -56.854164123535156, + "loss": 0.5448, + "losses/dpo": 0.6631639003753662, + "losses/sft": 1.811882495880127, + "losses/total": 0.6631639003753662, + "ref_logps/chosen": -35.18425750732422, + "ref_logps/rejected": -42.97540283203125, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8131965398788452, + "rewards/margins": 0.5746797919273376, + "rewards/rejected": -1.3878765106201172, + "step": 1034 + }, + { + "epoch": 0.98, + "grad_norm": 15.447344779968262, + "learning_rate": 3.746065057712487e-07, + "logps/chosen": -33.777618408203125, + "logps/rejected": -51.820091247558594, + "loss": 0.4812, + "losses/dpo": 0.514406681060791, + "losses/sft": 1.1542354822158813, + "losses/total": 0.514406681060791, + "ref_logps/chosen": -30.266817092895508, + "ref_logps/rejected": -40.79583740234375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.35108014941215515, + "rewards/margins": 0.751345157623291, + "rewards/rejected": -1.1024253368377686, + "step": 1035 + }, + { + "epoch": 0.98, + "grad_norm": 22.523555755615234, + "learning_rate": 3.744316194473592e-07, + "logps/chosen": -56.821998596191406, + "logps/rejected": -69.46830749511719, + "loss": 0.5878, + "losses/dpo": 0.28811097145080566, + "losses/sft": 1.9191207885742188, + "losses/total": 0.28811097145080566, + "ref_logps/chosen": -43.662689208984375, + "ref_logps/rejected": -50.66514205932617, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3159312009811401, + "rewards/margins": 0.5643856525421143, + "rewards/rejected": -1.8803167343139648, + "step": 1036 + }, + { + "epoch": 0.98, + "grad_norm": 21.70399284362793, + "learning_rate": 3.7425673312346974e-07, + "logps/chosen": -54.694068908691406, + "logps/rejected": -77.07733154296875, + "loss": 0.5499, + "losses/dpo": 0.372211754322052, + "losses/sft": 2.259852886199951, + "losses/total": 0.372211754322052, + "ref_logps/chosen": -43.89327621459961, + "ref_logps/rejected": -60.12117004394531, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0800797939300537, + "rewards/margins": 0.6155374050140381, + "rewards/rejected": -1.6956171989440918, + "step": 1037 + }, + { + "epoch": 0.98, + "grad_norm": 17.327770233154297, + "learning_rate": 3.7408184679958025e-07, + "logps/chosen": -51.10768127441406, + "logps/rejected": -67.70603942871094, + "loss": 0.4945, + "losses/dpo": 0.5754619240760803, + "losses/sft": 1.5056612491607666, + "losses/total": 0.5754619240760803, + "ref_logps/chosen": -43.47105407714844, + "ref_logps/rejected": -52.151615142822266, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7636622786521912, + "rewards/margins": 0.7917799949645996, + "rewards/rejected": -1.555442214012146, + "step": 1038 + }, + { + "epoch": 0.98, + "grad_norm": 18.521121978759766, + "learning_rate": 3.7390696047569077e-07, + "logps/chosen": -43.31429672241211, + "logps/rejected": -55.69745635986328, + "loss": 0.5285, + "losses/dpo": 0.37058568000793457, + "losses/sft": 1.3831158876419067, + "losses/total": 0.37058568000793457, + "ref_logps/chosen": -34.684295654296875, + "ref_logps/rejected": -40.088356018066406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8629997968673706, + "rewards/margins": 0.6979104280471802, + "rewards/rejected": -1.5609102249145508, + "step": 1039 + }, + { + "epoch": 0.98, + "grad_norm": 23.897001266479492, + "learning_rate": 3.737320741518013e-07, + "logps/chosen": -57.846435546875, + "logps/rejected": -62.12091827392578, + "loss": 0.5588, + "losses/dpo": 0.8358293175697327, + "losses/sft": 2.0152835845947266, + "losses/total": 0.8358293175697327, + "ref_logps/chosen": -45.81495666503906, + "ref_logps/rejected": -44.22117614746094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2031478881835938, + "rewards/margins": 0.5868260860443115, + "rewards/rejected": -1.7899739742279053, + "step": 1040 + }, + { + "epoch": 0.98, + "grad_norm": 23.08881950378418, + "learning_rate": 3.7355718782791184e-07, + "logps/chosen": -51.02873229980469, + "logps/rejected": -63.517189025878906, + "loss": 0.604, + "losses/dpo": 0.6009421348571777, + "losses/sft": 1.8231730461120605, + "losses/total": 0.6009421348571777, + "ref_logps/chosen": -39.89787292480469, + "ref_logps/rejected": -44.95099639892578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1130861043930054, + "rewards/margins": 0.7435336709022522, + "rewards/rejected": -1.8566197156906128, + "step": 1041 + }, + { + "epoch": 0.98, + "grad_norm": 25.16348648071289, + "learning_rate": 3.733823015040224e-07, + "logps/chosen": -47.1480598449707, + "logps/rejected": -42.78644943237305, + "loss": 0.8707, + "losses/dpo": 1.097036361694336, + "losses/sft": 1.966797947883606, + "losses/total": 1.097036361694336, + "ref_logps/chosen": -37.51509094238281, + "ref_logps/rejected": -34.06185531616211, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9632970094680786, + "rewards/margins": -0.09083791822195053, + "rewards/rejected": -0.8724589943885803, + "step": 1042 + }, + { + "epoch": 0.98, + "grad_norm": 16.07930564880371, + "learning_rate": 3.7320741518013287e-07, + "logps/chosen": -30.622501373291016, + "logps/rejected": -54.29332733154297, + "loss": 0.4737, + "losses/dpo": 0.312989205121994, + "losses/sft": 1.2640271186828613, + "losses/total": 0.312989205121994, + "ref_logps/chosen": -25.249773025512695, + "ref_logps/rejected": -41.58097839355469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5372729301452637, + "rewards/margins": 0.7339622974395752, + "rewards/rejected": -1.2712352275848389, + "step": 1043 + }, + { + "epoch": 0.99, + "grad_norm": 18.39409828186035, + "learning_rate": 3.7303252885624344e-07, + "logps/chosen": -50.152862548828125, + "logps/rejected": -62.194969177246094, + "loss": 0.5529, + "losses/dpo": 0.619890034198761, + "losses/sft": 2.1275525093078613, + "losses/total": 0.619890034198761, + "ref_logps/chosen": -39.10810089111328, + "ref_logps/rejected": -46.15287399291992, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.104475975036621, + "rewards/margins": 0.49973371624946594, + "rewards/rejected": -1.6042097806930542, + "step": 1044 + }, + { + "epoch": 0.99, + "grad_norm": 18.956058502197266, + "learning_rate": 3.7285764253235395e-07, + "logps/chosen": -36.96673583984375, + "logps/rejected": -51.40975570678711, + "loss": 0.6046, + "losses/dpo": 0.45018085837364197, + "losses/sft": 1.6682674884796143, + "losses/total": 0.45018085837364197, + "ref_logps/chosen": -28.295307159423828, + "ref_logps/rejected": -39.06208038330078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8671428561210632, + "rewards/margins": 0.36762452125549316, + "rewards/rejected": -1.2347674369812012, + "step": 1045 + }, + { + "epoch": 0.99, + "grad_norm": 20.456775665283203, + "learning_rate": 3.7268275620846446e-07, + "logps/chosen": -43.20927429199219, + "logps/rejected": -65.18101501464844, + "loss": 0.5981, + "losses/dpo": 0.17798872292041779, + "losses/sft": 1.6047101020812988, + "losses/total": 0.17798872292041779, + "ref_logps/chosen": -35.581031799316406, + "ref_logps/rejected": -50.5703125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7628246545791626, + "rewards/margins": 0.6982451677322388, + "rewards/rejected": -1.4610698223114014, + "step": 1046 + }, + { + "epoch": 0.99, + "grad_norm": 16.558841705322266, + "learning_rate": 3.72507869884575e-07, + "logps/chosen": -39.19849395751953, + "logps/rejected": -59.20799255371094, + "loss": 0.4089, + "losses/dpo": 0.5027921199798584, + "losses/sft": 1.5396077632904053, + "losses/total": 0.5027921199798584, + "ref_logps/chosen": -32.8937873840332, + "ref_logps/rejected": -45.09962844848633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6304706335067749, + "rewards/margins": 0.7803655862808228, + "rewards/rejected": -1.4108362197875977, + "step": 1047 + }, + { + "epoch": 0.99, + "grad_norm": 20.622018814086914, + "learning_rate": 3.7233298356068554e-07, + "logps/chosen": -40.00585174560547, + "logps/rejected": -58.10869216918945, + "loss": 0.5211, + "losses/dpo": 0.40653449296951294, + "losses/sft": 1.5392721891403198, + "losses/total": 0.40653449296951294, + "ref_logps/chosen": -32.68403625488281, + "ref_logps/rejected": -44.4426155090332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7321815490722656, + "rewards/margins": 0.6344262957572937, + "rewards/rejected": -1.366607904434204, + "step": 1048 + }, + { + "epoch": 0.99, + "grad_norm": 20.416519165039062, + "learning_rate": 3.721580972367961e-07, + "logps/chosen": -46.07855224609375, + "logps/rejected": -62.601341247558594, + "loss": 0.542, + "losses/dpo": 0.45975667238235474, + "losses/sft": 1.3041117191314697, + "losses/total": 0.45975667238235474, + "ref_logps/chosen": -36.73108673095703, + "ref_logps/rejected": -48.306671142578125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9347469210624695, + "rewards/margins": 0.4947202503681183, + "rewards/rejected": -1.4294672012329102, + "step": 1049 + }, + { + "epoch": 0.99, + "grad_norm": 18.21001625061035, + "learning_rate": 3.7198321091290656e-07, + "logps/chosen": -38.742042541503906, + "logps/rejected": -50.60753631591797, + "loss": 0.5939, + "losses/dpo": 0.3910270929336548, + "losses/sft": 1.2642822265625, + "losses/total": 0.3910270929336548, + "ref_logps/chosen": -32.67649459838867, + "ref_logps/rejected": -41.21440887451172, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6065546274185181, + "rewards/margins": 0.3327580690383911, + "rewards/rejected": -0.939312756061554, + "step": 1050 + }, + { + "epoch": 0.99, + "grad_norm": 17.968276977539062, + "learning_rate": 3.7180832458901713e-07, + "logps/chosen": -42.053462982177734, + "logps/rejected": -49.964454650878906, + "loss": 0.5479, + "losses/dpo": 0.6227540969848633, + "losses/sft": 1.782165288925171, + "losses/total": 0.6227540969848633, + "ref_logps/chosen": -34.993019104003906, + "ref_logps/rejected": -38.62141418457031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7060443162918091, + "rewards/margins": 0.4282597303390503, + "rewards/rejected": -1.1343040466308594, + "step": 1051 + }, + { + "epoch": 0.99, + "grad_norm": 15.051322937011719, + "learning_rate": 3.7163343826512764e-07, + "logps/chosen": -44.43914794921875, + "logps/rejected": -67.49777221679688, + "loss": 0.3392, + "losses/dpo": 0.23947615921497345, + "losses/sft": 1.1581668853759766, + "losses/total": 0.23947615921497345, + "ref_logps/chosen": -37.87828063964844, + "ref_logps/rejected": -48.68732452392578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6560868620872498, + "rewards/margins": 1.224957823753357, + "rewards/rejected": -1.881044626235962, + "step": 1052 + }, + { + "epoch": 0.99, + "grad_norm": 25.21403694152832, + "learning_rate": 3.7145855194123816e-07, + "logps/chosen": -54.13300323486328, + "logps/rejected": -50.41889190673828, + "loss": 0.7712, + "losses/dpo": 0.6819635629653931, + "losses/sft": 1.9272966384887695, + "losses/total": 0.6819635629653931, + "ref_logps/chosen": -43.46582794189453, + "ref_logps/rejected": -39.4797248840332, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.0667178630828857, + "rewards/margins": 0.02719871699810028, + "rewards/rejected": -1.093916654586792, + "step": 1053 + }, + { + "epoch": 1.0, + "grad_norm": 14.475399017333984, + "learning_rate": 3.712836656173487e-07, + "logps/chosen": -29.974699020385742, + "logps/rejected": -50.017608642578125, + "loss": 0.5052, + "losses/dpo": 0.7304818630218506, + "losses/sft": 1.7995580434799194, + "losses/total": 0.7304818630218506, + "ref_logps/chosen": -24.798912048339844, + "ref_logps/rejected": -38.5832405090332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5175787210464478, + "rewards/margins": 0.6258580684661865, + "rewards/rejected": -1.1434367895126343, + "step": 1054 + }, + { + "epoch": 1.0, + "grad_norm": 26.792694091796875, + "learning_rate": 3.7110877929345923e-07, + "logps/chosen": -55.168270111083984, + "logps/rejected": -64.51309967041016, + "loss": 0.637, + "losses/dpo": 1.0172982215881348, + "losses/sft": 1.9349207878112793, + "losses/total": 1.0172982215881348, + "ref_logps/chosen": -45.040283203125, + "ref_logps/rejected": -50.64051055908203, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0127986669540405, + "rewards/margins": 0.3744605779647827, + "rewards/rejected": -1.3872592449188232, + "step": 1055 + }, + { + "epoch": 1.0, + "grad_norm": 19.410390853881836, + "learning_rate": 3.709338929695698e-07, + "logps/chosen": -39.06595993041992, + "logps/rejected": -52.589717864990234, + "loss": 0.6574, + "losses/dpo": 0.7321785092353821, + "losses/sft": 1.5721993446350098, + "losses/total": 0.7321785092353821, + "ref_logps/chosen": -31.124237060546875, + "ref_logps/rejected": -40.465423583984375, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7941721677780151, + "rewards/margins": 0.4182566702365875, + "rewards/rejected": -1.2124288082122803, + "step": 1056 + }, + { + "epoch": 1.0, + "grad_norm": 24.434770584106445, + "learning_rate": 3.7075900664568026e-07, + "logps/chosen": -53.610530853271484, + "logps/rejected": -63.1193962097168, + "loss": 0.602, + "losses/dpo": 0.8597586154937744, + "losses/sft": 1.8838454484939575, + "losses/total": 0.8597586154937744, + "ref_logps/chosen": -45.25860595703125, + "ref_logps/rejected": -49.585853576660156, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8351923823356628, + "rewards/margins": 0.518161416053772, + "rewards/rejected": -1.3533538579940796, + "step": 1057 + }, + { + "epoch": 1.0, + "grad_norm": 23.938945770263672, + "learning_rate": 3.705841203217908e-07, + "logps/chosen": -51.38863754272461, + "logps/rejected": -56.15193176269531, + "loss": 0.5816, + "losses/dpo": 0.7956613898277283, + "losses/sft": 1.733151912689209, + "losses/total": 0.7956613898277283, + "ref_logps/chosen": -42.19108581542969, + "ref_logps/rejected": -42.952274322509766, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.919755220413208, + "rewards/margins": 0.4002106189727783, + "rewards/rejected": -1.3199658393859863, + "step": 1058 + }, + { + "epoch": 1.0, + "grad_norm": 16.61768913269043, + "learning_rate": 3.7040923399790134e-07, + "logps/chosen": -43.220340728759766, + "logps/rejected": -65.68209838867188, + "loss": 0.4328, + "losses/dpo": 0.3953036963939667, + "losses/sft": 1.71744966506958, + "losses/total": 0.3953036963939667, + "ref_logps/chosen": -35.62976837158203, + "ref_logps/rejected": -48.545066833496094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7590570449829102, + "rewards/margins": 0.9546458721160889, + "rewards/rejected": -1.713702917098999, + "step": 1059 + }, + { + "epoch": 1.0, + "grad_norm": 19.569744110107422, + "learning_rate": 3.7023434767401185e-07, + "logps/chosen": -48.77082824707031, + "logps/rejected": -68.16401672363281, + "loss": 0.4348, + "losses/dpo": 0.15455271303653717, + "losses/sft": 1.7090686559677124, + "losses/total": 0.15455271303653717, + "ref_logps/chosen": -40.929649353027344, + "ref_logps/rejected": -50.74110794067383, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7841184139251709, + "rewards/margins": 0.9581716060638428, + "rewards/rejected": -1.7422899007797241, + "step": 1060 + }, + { + "epoch": 1.0, + "grad_norm": 15.181184768676758, + "learning_rate": 3.700594613501224e-07, + "logps/chosen": -39.98779296875, + "logps/rejected": -55.82892990112305, + "loss": 0.4619, + "losses/dpo": 0.3095157742500305, + "losses/sft": 1.5837920904159546, + "losses/total": 0.3095157742500305, + "ref_logps/chosen": -32.98744583129883, + "ref_logps/rejected": -39.85797119140625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7000346779823303, + "rewards/margins": 0.897061288356781, + "rewards/rejected": -1.5970959663391113, + "step": 1061 + }, + { + "epoch": 1.0, + "grad_norm": 14.743623733520508, + "learning_rate": 3.6988457502623293e-07, + "logps/chosen": -39.40172576904297, + "logps/rejected": -63.34711837768555, + "loss": 0.3648, + "losses/dpo": 0.2464207261800766, + "losses/sft": 1.2853291034698486, + "losses/total": 0.2464207261800766, + "ref_logps/chosen": -34.49665832519531, + "ref_logps/rejected": -47.112762451171875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.49050676822662354, + "rewards/margins": 1.1329294443130493, + "rewards/rejected": -1.6234362125396729, + "step": 1062 + }, + { + "epoch": 1.0, + "grad_norm": 24.7276554107666, + "learning_rate": 3.697096887023435e-07, + "logps/chosen": -43.58515930175781, + "logps/rejected": -50.95394515991211, + "loss": 0.5362, + "losses/dpo": 0.7975062131881714, + "losses/sft": 1.7530547380447388, + "losses/total": 0.7975062131881714, + "ref_logps/chosen": -35.57408905029297, + "ref_logps/rejected": -37.36211395263672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8011068105697632, + "rewards/margins": 0.5580763816833496, + "rewards/rejected": -1.3591831922531128, + "step": 1063 + }, + { + "epoch": 1.0, + "grad_norm": 18.076126098632812, + "learning_rate": 3.6953480237845395e-07, + "logps/chosen": -43.305152893066406, + "logps/rejected": -54.1341667175293, + "loss": 0.4355, + "losses/dpo": 0.4449557662010193, + "losses/sft": 1.8413487672805786, + "losses/total": 0.4449557662010193, + "ref_logps/chosen": -33.95836639404297, + "ref_logps/rejected": -36.06989288330078, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9346787333488464, + "rewards/margins": 0.871748685836792, + "rewards/rejected": -1.8064274787902832, + "step": 1064 + }, + { + "epoch": 1.01, + "grad_norm": 17.21904754638672, + "learning_rate": 3.693599160545645e-07, + "logps/chosen": -42.98076629638672, + "logps/rejected": -60.32102584838867, + "loss": 0.4183, + "losses/dpo": 0.6023704409599304, + "losses/sft": 1.759673833847046, + "losses/total": 0.6023704409599304, + "ref_logps/chosen": -33.99935531616211, + "ref_logps/rejected": -40.51173400878906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8981411457061768, + "rewards/margins": 1.0827878713607788, + "rewards/rejected": -1.980928897857666, + "step": 1065 + }, + { + "epoch": 1.01, + "grad_norm": 16.149755477905273, + "learning_rate": 3.6918502973067503e-07, + "logps/chosen": -34.371150970458984, + "logps/rejected": -66.52780151367188, + "loss": 0.3845, + "losses/dpo": 0.28156861662864685, + "losses/sft": 1.2597063779830933, + "losses/total": 0.28156861662864685, + "ref_logps/chosen": -28.831809997558594, + "ref_logps/rejected": -47.38080978393555, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5539339780807495, + "rewards/margins": 1.360764980316162, + "rewards/rejected": -1.9146989583969116, + "step": 1066 + }, + { + "epoch": 1.01, + "grad_norm": 14.031977653503418, + "learning_rate": 3.6901014340678554e-07, + "logps/chosen": -37.38449478149414, + "logps/rejected": -46.552528381347656, + "loss": 0.3672, + "losses/dpo": 0.5438218116760254, + "losses/sft": 1.4593948125839233, + "losses/total": 0.5438218116760254, + "ref_logps/chosen": -30.516246795654297, + "ref_logps/rejected": -29.92164421081543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6868245601654053, + "rewards/margins": 0.9762636423110962, + "rewards/rejected": -1.6630882024765015, + "step": 1067 + }, + { + "epoch": 1.01, + "grad_norm": 16.827117919921875, + "learning_rate": 3.688352570828961e-07, + "logps/chosen": -37.271148681640625, + "logps/rejected": -60.42409896850586, + "loss": 0.4615, + "losses/dpo": 0.3220047354698181, + "losses/sft": 1.2956514358520508, + "losses/total": 0.3220047354698181, + "ref_logps/chosen": -29.47098731994629, + "ref_logps/rejected": -43.098026275634766, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7800158262252808, + "rewards/margins": 0.9525912404060364, + "rewards/rejected": -1.732607126235962, + "step": 1068 + }, + { + "epoch": 1.01, + "grad_norm": 13.062885284423828, + "learning_rate": 3.686603707590066e-07, + "logps/chosen": -46.08294677734375, + "logps/rejected": -61.942962646484375, + "loss": 0.3078, + "losses/dpo": 0.2849777340888977, + "losses/sft": 1.4762276411056519, + "losses/total": 0.2849777340888977, + "ref_logps/chosen": -39.95171356201172, + "ref_logps/rejected": -43.798824310302734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6131234169006348, + "rewards/margins": 1.2012903690338135, + "rewards/rejected": -1.8144135475158691, + "step": 1069 + }, + { + "epoch": 1.01, + "grad_norm": 18.53889274597168, + "learning_rate": 3.684854844351172e-07, + "logps/chosen": -49.929996490478516, + "logps/rejected": -64.5808334350586, + "loss": 0.4355, + "losses/dpo": 0.6379729509353638, + "losses/sft": 1.7307603359222412, + "losses/total": 0.6379729509353638, + "ref_logps/chosen": -40.72583770751953, + "ref_logps/rejected": -45.09645080566406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9204157590866089, + "rewards/margins": 1.0280221700668335, + "rewards/rejected": -1.9484379291534424, + "step": 1070 + }, + { + "epoch": 1.01, + "grad_norm": 17.705724716186523, + "learning_rate": 3.6831059811122765e-07, + "logps/chosen": -57.26272201538086, + "logps/rejected": -75.029296875, + "loss": 0.4108, + "losses/dpo": 0.5157050490379333, + "losses/sft": 2.2226946353912354, + "losses/total": 0.5157050490379333, + "ref_logps/chosen": -46.039466857910156, + "ref_logps/rejected": -53.679019927978516, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1223256587982178, + "rewards/margins": 1.0127019882202148, + "rewards/rejected": -2.1350276470184326, + "step": 1071 + }, + { + "epoch": 1.01, + "grad_norm": 17.380020141601562, + "learning_rate": 3.681357117873382e-07, + "logps/chosen": -30.642452239990234, + "logps/rejected": -39.821754455566406, + "loss": 0.5762, + "losses/dpo": 0.8206643462181091, + "losses/sft": 1.267864465713501, + "losses/total": 0.8206643462181091, + "ref_logps/chosen": -25.10495948791504, + "ref_logps/rejected": -29.695594787597656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5537492036819458, + "rewards/margins": 0.45886677503585815, + "rewards/rejected": -1.0126159191131592, + "step": 1072 + }, + { + "epoch": 1.01, + "grad_norm": 18.155853271484375, + "learning_rate": 3.679608254634488e-07, + "logps/chosen": -44.522071838378906, + "logps/rejected": -57.01021194458008, + "loss": 0.3812, + "losses/dpo": 0.49304908514022827, + "losses/sft": 1.2953386306762695, + "losses/total": 0.49304908514022827, + "ref_logps/chosen": -38.756256103515625, + "ref_logps/rejected": -40.62232208251953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5765818357467651, + "rewards/margins": 1.0622072219848633, + "rewards/rejected": -1.6387890577316284, + "step": 1073 + }, + { + "epoch": 1.01, + "grad_norm": 18.12737464904785, + "learning_rate": 3.6778593913955924e-07, + "logps/chosen": -46.933799743652344, + "logps/rejected": -50.66566848754883, + "loss": 0.4962, + "losses/dpo": 0.2647829055786133, + "losses/sft": 1.135524868965149, + "losses/total": 0.2647829055786133, + "ref_logps/chosen": -40.021026611328125, + "ref_logps/rejected": -36.490535736083984, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6912773251533508, + "rewards/margins": 0.7262359857559204, + "rewards/rejected": -1.417513370513916, + "step": 1074 + }, + { + "epoch": 1.02, + "grad_norm": 26.857921600341797, + "learning_rate": 3.676110528156698e-07, + "logps/chosen": -54.28265380859375, + "logps/rejected": -64.04670715332031, + "loss": 0.72, + "losses/dpo": 0.4676631689071655, + "losses/sft": 1.7014347314834595, + "losses/total": 0.4676631689071655, + "ref_logps/chosen": -42.96564483642578, + "ref_logps/rejected": -51.200721740722656, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.1317007541656494, + "rewards/margins": 0.15289808809757233, + "rewards/rejected": -1.2845988273620605, + "step": 1075 + }, + { + "epoch": 1.02, + "grad_norm": 15.14867115020752, + "learning_rate": 3.674361664917803e-07, + "logps/chosen": -39.258968353271484, + "logps/rejected": -63.0911979675293, + "loss": 0.3872, + "losses/dpo": 0.20462581515312195, + "losses/sft": 1.478392481803894, + "losses/total": 0.20462581515312195, + "ref_logps/chosen": -32.907691955566406, + "ref_logps/rejected": -43.177433013916016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6351274251937866, + "rewards/margins": 1.3562489748001099, + "rewards/rejected": -1.9913763999938965, + "step": 1076 + }, + { + "epoch": 1.02, + "grad_norm": 17.26877212524414, + "learning_rate": 3.672612801678909e-07, + "logps/chosen": -49.37654113769531, + "logps/rejected": -69.18081665039062, + "loss": 0.3712, + "losses/dpo": 0.25452861189842224, + "losses/sft": 2.0616605281829834, + "losses/total": 0.25452861189842224, + "ref_logps/chosen": -39.10771942138672, + "ref_logps/rejected": -46.87623596191406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0268824100494385, + "rewards/margins": 1.203575611114502, + "rewards/rejected": -2.2304580211639404, + "step": 1077 + }, + { + "epoch": 1.02, + "grad_norm": 19.198070526123047, + "learning_rate": 3.6708639384400134e-07, + "logps/chosen": -42.97004318237305, + "logps/rejected": -58.670021057128906, + "loss": 0.5011, + "losses/dpo": 0.4471086263656616, + "losses/sft": 1.2140623331069946, + "losses/total": 0.4471086263656616, + "ref_logps/chosen": -31.619060516357422, + "ref_logps/rejected": -38.09598159790039, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1350982189178467, + "rewards/margins": 0.9223060607910156, + "rewards/rejected": -2.0574045181274414, + "step": 1078 + }, + { + "epoch": 1.02, + "grad_norm": 15.979565620422363, + "learning_rate": 3.669115075201119e-07, + "logps/chosen": -43.28668975830078, + "logps/rejected": -55.396141052246094, + "loss": 0.3708, + "losses/dpo": 0.25019770860671997, + "losses/sft": 1.5838481187820435, + "losses/total": 0.25019770860671997, + "ref_logps/chosen": -36.61464309692383, + "ref_logps/rejected": -38.42881774902344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6672050356864929, + "rewards/margins": 1.029528021812439, + "rewards/rejected": -1.696732997894287, + "step": 1079 + }, + { + "epoch": 1.02, + "grad_norm": 12.003375053405762, + "learning_rate": 3.6673662119622247e-07, + "logps/chosen": -29.29328727722168, + "logps/rejected": -61.710323333740234, + "loss": 0.3018, + "losses/dpo": 0.3685706555843353, + "losses/sft": 1.3761706352233887, + "losses/total": 0.3685706555843353, + "ref_logps/chosen": -23.90287208557129, + "ref_logps/rejected": -43.84029769897461, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5390416383743286, + "rewards/margins": 1.2479610443115234, + "rewards/rejected": -1.7870025634765625, + "step": 1080 + }, + { + "epoch": 1.02, + "grad_norm": 19.205904006958008, + "learning_rate": 3.6656173487233293e-07, + "logps/chosen": -52.96255111694336, + "logps/rejected": -71.59742736816406, + "loss": 0.4664, + "losses/dpo": 0.27110928297042847, + "losses/sft": 1.3454355001449585, + "losses/total": 0.27110928297042847, + "ref_logps/chosen": -42.1851806640625, + "ref_logps/rejected": -51.919124603271484, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0777368545532227, + "rewards/margins": 0.8900934457778931, + "rewards/rejected": -1.9678301811218262, + "step": 1081 + }, + { + "epoch": 1.02, + "grad_norm": 20.57588768005371, + "learning_rate": 3.663868485484435e-07, + "logps/chosen": -46.54554748535156, + "logps/rejected": -69.75184631347656, + "loss": 0.5054, + "losses/dpo": 0.5467911958694458, + "losses/sft": 1.9871207475662231, + "losses/total": 0.5467911958694458, + "ref_logps/chosen": -35.361541748046875, + "ref_logps/rejected": -49.73594665527344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1184006929397583, + "rewards/margins": 0.8831894397735596, + "rewards/rejected": -2.0015902519226074, + "step": 1082 + }, + { + "epoch": 1.02, + "grad_norm": 20.42867660522461, + "learning_rate": 3.66211962224554e-07, + "logps/chosen": -50.04201889038086, + "logps/rejected": -57.776390075683594, + "loss": 0.495, + "losses/dpo": 0.2337605059146881, + "losses/sft": 1.7641470432281494, + "losses/total": 0.2337605059146881, + "ref_logps/chosen": -41.76304244995117, + "ref_logps/rejected": -42.703399658203125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8278974890708923, + "rewards/margins": 0.6794015169143677, + "rewards/rejected": -1.5072989463806152, + "step": 1083 + }, + { + "epoch": 1.02, + "grad_norm": 24.451725006103516, + "learning_rate": 3.660370759006646e-07, + "logps/chosen": -50.32181167602539, + "logps/rejected": -53.35579299926758, + "loss": 0.6113, + "losses/dpo": 0.5341934561729431, + "losses/sft": 1.7669661045074463, + "losses/total": 0.5341934561729431, + "ref_logps/chosen": -38.576175689697266, + "ref_logps/rejected": -36.4826774597168, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1745634078979492, + "rewards/margins": 0.5127479434013367, + "rewards/rejected": -1.6873114109039307, + "step": 1084 + }, + { + "epoch": 1.02, + "grad_norm": 18.57804298400879, + "learning_rate": 3.6586218957677504e-07, + "logps/chosen": -51.10692596435547, + "logps/rejected": -69.60911560058594, + "loss": 0.4381, + "losses/dpo": 0.21541637182235718, + "losses/sft": 2.0709481239318848, + "losses/total": 0.21541637182235718, + "ref_logps/chosen": -38.682254791259766, + "ref_logps/rejected": -46.849151611328125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2424674034118652, + "rewards/margins": 1.0335289239883423, + "rewards/rejected": -2.275996208190918, + "step": 1085 + }, + { + "epoch": 1.03, + "grad_norm": 14.761630058288574, + "learning_rate": 3.656873032528856e-07, + "logps/chosen": -42.015716552734375, + "logps/rejected": -60.31280517578125, + "loss": 0.3823, + "losses/dpo": 0.38250288367271423, + "losses/sft": 1.6169302463531494, + "losses/total": 0.38250288367271423, + "ref_logps/chosen": -33.85796356201172, + "ref_logps/rejected": -39.86603927612305, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8157749176025391, + "rewards/margins": 1.2289013862609863, + "rewards/rejected": -2.0446763038635254, + "step": 1086 + }, + { + "epoch": 1.03, + "grad_norm": 16.132125854492188, + "learning_rate": 3.6551241692899617e-07, + "logps/chosen": -43.13732147216797, + "logps/rejected": -61.28578186035156, + "loss": 0.4222, + "losses/dpo": 0.6710204482078552, + "losses/sft": 1.5309427976608276, + "losses/total": 0.6710204482078552, + "ref_logps/chosen": -34.049827575683594, + "ref_logps/rejected": -41.381378173828125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9087490439414978, + "rewards/margins": 1.0816915035247803, + "rewards/rejected": -1.9904407262802124, + "step": 1087 + }, + { + "epoch": 1.03, + "grad_norm": 17.13888931274414, + "learning_rate": 3.653375306051067e-07, + "logps/chosen": -50.40296173095703, + "logps/rejected": -51.6271858215332, + "loss": 0.4137, + "losses/dpo": 0.48693373799324036, + "losses/sft": 2.0377941131591797, + "losses/total": 0.48693373799324036, + "ref_logps/chosen": -42.096466064453125, + "ref_logps/rejected": -34.91358947753906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8306496739387512, + "rewards/margins": 0.8407102823257446, + "rewards/rejected": -1.671359896659851, + "step": 1088 + }, + { + "epoch": 1.03, + "grad_norm": 14.04586410522461, + "learning_rate": 3.651626442812172e-07, + "logps/chosen": -60.53893280029297, + "logps/rejected": -75.64192199707031, + "loss": 0.2733, + "losses/dpo": 0.2743315100669861, + "losses/sft": 2.013023853302002, + "losses/total": 0.2743315100669861, + "ref_logps/chosen": -51.14175796508789, + "ref_logps/rejected": -52.965911865234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9397180080413818, + "rewards/margins": 1.3278836011886597, + "rewards/rejected": -2.267601728439331, + "step": 1089 + }, + { + "epoch": 1.03, + "grad_norm": 19.8659725189209, + "learning_rate": 3.649877579573277e-07, + "logps/chosen": -46.49089050292969, + "logps/rejected": -73.6563491821289, + "loss": 0.5047, + "losses/dpo": 0.48692750930786133, + "losses/sft": 1.5766716003417969, + "losses/total": 0.48692750930786133, + "ref_logps/chosen": -36.45451354980469, + "ref_logps/rejected": -55.79833984375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0036377906799316, + "rewards/margins": 0.7821632027626038, + "rewards/rejected": -1.7858009338378906, + "step": 1090 + }, + { + "epoch": 1.03, + "grad_norm": 21.191913604736328, + "learning_rate": 3.6481287163343827e-07, + "logps/chosen": -54.523948669433594, + "logps/rejected": -64.21807861328125, + "loss": 0.5332, + "losses/dpo": 0.3113740384578705, + "losses/sft": 1.6445286273956299, + "losses/total": 0.3113740384578705, + "ref_logps/chosen": -43.71592712402344, + "ref_logps/rejected": -44.88184356689453, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0808024406433105, + "rewards/margins": 0.8528213500976562, + "rewards/rejected": -1.9336237907409668, + "step": 1091 + }, + { + "epoch": 1.03, + "grad_norm": 15.382048606872559, + "learning_rate": 3.6463798530954873e-07, + "logps/chosen": -47.13607406616211, + "logps/rejected": -55.18809509277344, + "loss": 0.3878, + "losses/dpo": 0.408435583114624, + "losses/sft": 1.995726227760315, + "losses/total": 0.408435583114624, + "ref_logps/chosen": -37.132164001464844, + "ref_logps/rejected": -36.19349670410156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0003913640975952, + "rewards/margins": 0.8990681767463684, + "rewards/rejected": -1.8994596004486084, + "step": 1092 + }, + { + "epoch": 1.03, + "grad_norm": 16.46430778503418, + "learning_rate": 3.644630989856593e-07, + "logps/chosen": -37.23838424682617, + "logps/rejected": -55.96215057373047, + "loss": 0.3795, + "losses/dpo": 0.49285826086997986, + "losses/sft": 1.6897306442260742, + "losses/total": 0.49285826086997986, + "ref_logps/chosen": -28.73897361755371, + "ref_logps/rejected": -37.051513671875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.849941074848175, + "rewards/margins": 1.0411226749420166, + "rewards/rejected": -1.8910638093948364, + "step": 1093 + }, + { + "epoch": 1.03, + "grad_norm": 21.956340789794922, + "learning_rate": 3.6428821266176986e-07, + "logps/chosen": -57.616729736328125, + "logps/rejected": -68.11526489257812, + "loss": 0.5237, + "losses/dpo": 0.5827259421348572, + "losses/sft": 1.3818707466125488, + "losses/total": 0.5827259421348572, + "ref_logps/chosen": -45.73115539550781, + "ref_logps/rejected": -49.68520736694336, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1885576248168945, + "rewards/margins": 0.6544477939605713, + "rewards/rejected": -1.8430054187774658, + "step": 1094 + }, + { + "epoch": 1.03, + "grad_norm": 18.08264923095703, + "learning_rate": 3.641133263378804e-07, + "logps/chosen": -54.895057678222656, + "logps/rejected": -75.75250244140625, + "loss": 0.429, + "losses/dpo": 0.2474673092365265, + "losses/sft": 2.306041955947876, + "losses/total": 0.2474673092365265, + "ref_logps/chosen": -41.94483947753906, + "ref_logps/rejected": -50.75457763671875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2950215339660645, + "rewards/margins": 1.204770803451538, + "rewards/rejected": -2.4997923374176025, + "step": 1095 + }, + { + "epoch": 1.03, + "grad_norm": 18.44590187072754, + "learning_rate": 3.639384400139909e-07, + "logps/chosen": -41.84230422973633, + "logps/rejected": -60.22125244140625, + "loss": 0.3716, + "losses/dpo": 0.5278711318969727, + "losses/sft": 1.1868032217025757, + "losses/total": 0.5278711318969727, + "ref_logps/chosen": -33.745643615722656, + "ref_logps/rejected": -39.61204528808594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8096659183502197, + "rewards/margins": 1.251254677772522, + "rewards/rejected": -2.060920476913452, + "step": 1096 + }, + { + "epoch": 1.04, + "grad_norm": 26.01068687438965, + "learning_rate": 3.637635536901014e-07, + "logps/chosen": -42.99476623535156, + "logps/rejected": -51.496116638183594, + "loss": 0.6381, + "losses/dpo": 0.7461168169975281, + "losses/sft": 1.5090261697769165, + "losses/total": 0.7461168169975281, + "ref_logps/chosen": -32.50988006591797, + "ref_logps/rejected": -36.6175537109375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0484886169433594, + "rewards/margins": 0.43936800956726074, + "rewards/rejected": -1.4878566265106201, + "step": 1097 + }, + { + "epoch": 1.04, + "grad_norm": 15.729644775390625, + "learning_rate": 3.6358866736621197e-07, + "logps/chosen": -36.038970947265625, + "logps/rejected": -47.00421142578125, + "loss": 0.3699, + "losses/dpo": 0.26515817642211914, + "losses/sft": 1.4794013500213623, + "losses/total": 0.26515817642211914, + "ref_logps/chosen": -31.03093719482422, + "ref_logps/rejected": -31.28116226196289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5008033514022827, + "rewards/margins": 1.0715014934539795, + "rewards/rejected": -1.5723048448562622, + "step": 1098 + }, + { + "epoch": 1.04, + "grad_norm": 19.49972915649414, + "learning_rate": 3.634137810423225e-07, + "logps/chosen": -46.45841979980469, + "logps/rejected": -57.73186492919922, + "loss": 0.4877, + "losses/dpo": 0.48669862747192383, + "losses/sft": 1.3054991960525513, + "losses/total": 0.48669862747192383, + "ref_logps/chosen": -37.5338134765625, + "ref_logps/rejected": -40.144432067871094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8924607038497925, + "rewards/margins": 0.8662827014923096, + "rewards/rejected": -1.7587432861328125, + "step": 1099 + }, + { + "epoch": 1.04, + "grad_norm": 16.852275848388672, + "learning_rate": 3.63238894718433e-07, + "logps/chosen": -43.074493408203125, + "logps/rejected": -51.77428436279297, + "loss": 0.4386, + "losses/dpo": 0.4206892251968384, + "losses/sft": 1.713990330696106, + "losses/total": 0.4206892251968384, + "ref_logps/chosen": -33.638893127441406, + "ref_logps/rejected": -33.846519470214844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9435595273971558, + "rewards/margins": 0.8492172956466675, + "rewards/rejected": -1.7927768230438232, + "step": 1100 + }, + { + "epoch": 1.04, + "grad_norm": 19.013151168823242, + "learning_rate": 3.6306400839454356e-07, + "logps/chosen": -50.996768951416016, + "logps/rejected": -59.741737365722656, + "loss": 0.4349, + "losses/dpo": 0.4508028030395508, + "losses/sft": 1.7069379091262817, + "losses/total": 0.4508028030395508, + "ref_logps/chosen": -40.384803771972656, + "ref_logps/rejected": -39.33921432495117, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0611966848373413, + "rewards/margins": 0.9790558218955994, + "rewards/rejected": -2.040252447128296, + "step": 1101 + }, + { + "epoch": 1.04, + "grad_norm": 16.264345169067383, + "learning_rate": 3.6288912207065407e-07, + "logps/chosen": -51.23906707763672, + "logps/rejected": -59.40775680541992, + "loss": 0.4368, + "losses/dpo": 0.6014887094497681, + "losses/sft": 2.029402256011963, + "losses/total": 0.6014887094497681, + "ref_logps/chosen": -42.39584732055664, + "ref_logps/rejected": -41.84788513183594, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8843222260475159, + "rewards/margins": 0.8716649413108826, + "rewards/rejected": -1.7559871673583984, + "step": 1102 + }, + { + "epoch": 1.04, + "grad_norm": 17.19400405883789, + "learning_rate": 3.627142357467646e-07, + "logps/chosen": -41.24311065673828, + "logps/rejected": -64.39938354492188, + "loss": 0.3703, + "losses/dpo": 0.3022533655166626, + "losses/sft": 1.4316505193710327, + "losses/total": 0.3022533655166626, + "ref_logps/chosen": -30.809202194213867, + "ref_logps/rejected": -43.1936149597168, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0433909893035889, + "rewards/margins": 1.0771856307983398, + "rewards/rejected": -2.1205766201019287, + "step": 1103 + }, + { + "epoch": 1.04, + "grad_norm": 14.232845306396484, + "learning_rate": 3.625393494228751e-07, + "logps/chosen": -53.44538116455078, + "logps/rejected": -81.70079803466797, + "loss": 0.3057, + "losses/dpo": 0.35619908571243286, + "losses/sft": 1.759982943534851, + "losses/total": 0.35619908571243286, + "ref_logps/chosen": -42.34745788574219, + "ref_logps/rejected": -55.37383270263672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1097925901412964, + "rewards/margins": 1.5229041576385498, + "rewards/rejected": -2.6326968669891357, + "step": 1104 + }, + { + "epoch": 1.04, + "grad_norm": 16.24256706237793, + "learning_rate": 3.6236446309898566e-07, + "logps/chosen": -59.25031280517578, + "logps/rejected": -76.15831756591797, + "loss": 0.3057, + "losses/dpo": 0.5977211594581604, + "losses/sft": 1.8647884130477905, + "losses/total": 0.5977211594581604, + "ref_logps/chosen": -49.02546691894531, + "ref_logps/rejected": -48.99300765991211, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0224847793579102, + "rewards/margins": 1.6940460205078125, + "rewards/rejected": -2.7165307998657227, + "step": 1105 + }, + { + "epoch": 1.04, + "grad_norm": 14.55997371673584, + "learning_rate": 3.6218957677509617e-07, + "logps/chosen": -40.7120361328125, + "logps/rejected": -59.31071472167969, + "loss": 0.3353, + "losses/dpo": 0.2439900040626526, + "losses/sft": 1.3137151002883911, + "losses/total": 0.2439900040626526, + "ref_logps/chosen": -34.27995300292969, + "ref_logps/rejected": -40.9670295715332, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6432080268859863, + "rewards/margins": 1.1911606788635254, + "rewards/rejected": -1.8343685865402222, + "step": 1106 + }, + { + "epoch": 1.05, + "grad_norm": 16.666099548339844, + "learning_rate": 3.620146904512067e-07, + "logps/chosen": -47.12230682373047, + "logps/rejected": -61.94658660888672, + "loss": 0.4388, + "losses/dpo": 0.33863022923469543, + "losses/sft": 1.6335513591766357, + "losses/total": 0.33863022923469543, + "ref_logps/chosen": -38.68480682373047, + "ref_logps/rejected": -42.507816314697266, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8437499403953552, + "rewards/margins": 1.1001267433166504, + "rewards/rejected": -1.9438767433166504, + "step": 1107 + }, + { + "epoch": 1.05, + "grad_norm": 17.748262405395508, + "learning_rate": 3.6183980412731725e-07, + "logps/chosen": -51.22157287597656, + "logps/rejected": -69.38434600830078, + "loss": 0.423, + "losses/dpo": 0.8381637930870056, + "losses/sft": 2.3169407844543457, + "losses/total": 0.8381637930870056, + "ref_logps/chosen": -37.958824157714844, + "ref_logps/rejected": -44.11109161376953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3262743949890137, + "rewards/margins": 1.2010509967803955, + "rewards/rejected": -2.5273256301879883, + "step": 1108 + }, + { + "epoch": 1.05, + "grad_norm": 22.701948165893555, + "learning_rate": 3.6166491780342776e-07, + "logps/chosen": -54.894561767578125, + "logps/rejected": -66.18856811523438, + "loss": 0.52, + "losses/dpo": 0.6727304458618164, + "losses/sft": 1.6012508869171143, + "losses/total": 0.6727304458618164, + "ref_logps/chosen": -41.45005798339844, + "ref_logps/rejected": -45.44903564453125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3444503545761108, + "rewards/margins": 0.7295031547546387, + "rewards/rejected": -2.073953628540039, + "step": 1109 + }, + { + "epoch": 1.05, + "grad_norm": 18.720478057861328, + "learning_rate": 3.614900314795383e-07, + "logps/chosen": -59.015777587890625, + "logps/rejected": -59.62892532348633, + "loss": 0.4505, + "losses/dpo": 0.53801429271698, + "losses/sft": 1.892388105392456, + "losses/total": 0.53801429271698, + "ref_logps/chosen": -46.950958251953125, + "ref_logps/rejected": -40.51457214355469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.20648193359375, + "rewards/margins": 0.7049534320831299, + "rewards/rejected": -1.9114353656768799, + "step": 1110 + }, + { + "epoch": 1.05, + "grad_norm": 17.969141006469727, + "learning_rate": 3.613151451556488e-07, + "logps/chosen": -49.31396484375, + "logps/rejected": -55.11629867553711, + "loss": 0.447, + "losses/dpo": 0.32817405462265015, + "losses/sft": 1.3181028366088867, + "losses/total": 0.32817405462265015, + "ref_logps/chosen": -40.17245101928711, + "ref_logps/rejected": -37.67129898071289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9141513109207153, + "rewards/margins": 0.8303487300872803, + "rewards/rejected": -1.7445001602172852, + "step": 1111 + }, + { + "epoch": 1.05, + "grad_norm": 25.278850555419922, + "learning_rate": 3.6114025883175935e-07, + "logps/chosen": -69.34782409667969, + "logps/rejected": -71.66302490234375, + "loss": 0.643, + "losses/dpo": 0.6143825054168701, + "losses/sft": 1.6051511764526367, + "losses/total": 0.6143825054168701, + "ref_logps/chosen": -49.629981994628906, + "ref_logps/rejected": -47.354042053222656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.971783995628357, + "rewards/margins": 0.4591141939163208, + "rewards/rejected": -2.4308981895446777, + "step": 1112 + }, + { + "epoch": 1.05, + "grad_norm": 22.47568130493164, + "learning_rate": 3.6096537250786987e-07, + "logps/chosen": -37.21412658691406, + "logps/rejected": -58.294795989990234, + "loss": 0.5597, + "losses/dpo": 0.17180033028125763, + "losses/sft": 1.4381208419799805, + "losses/total": 0.17180033028125763, + "ref_logps/chosen": -27.49213409423828, + "ref_logps/rejected": -40.21863555908203, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9721994996070862, + "rewards/margins": 0.8354163765907288, + "rewards/rejected": -1.8076159954071045, + "step": 1113 + }, + { + "epoch": 1.05, + "grad_norm": 15.010885238647461, + "learning_rate": 3.607904861839804e-07, + "logps/chosen": -46.075538635253906, + "logps/rejected": -66.32315063476562, + "loss": 0.3255, + "losses/dpo": 0.40915241837501526, + "losses/sft": 1.4643089771270752, + "losses/total": 0.40915241837501526, + "ref_logps/chosen": -36.338890075683594, + "ref_logps/rejected": -44.26709747314453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9736647009849548, + "rewards/margins": 1.2319402694702148, + "rewards/rejected": -2.2056050300598145, + "step": 1114 + }, + { + "epoch": 1.05, + "grad_norm": 19.576242446899414, + "learning_rate": 3.6061559986009095e-07, + "logps/chosen": -44.7883186340332, + "logps/rejected": -59.95389175415039, + "loss": 0.4244, + "losses/dpo": 0.35913681983947754, + "losses/sft": 1.27180814743042, + "losses/total": 0.35913681983947754, + "ref_logps/chosen": -35.445308685302734, + "ref_logps/rejected": -38.98551940917969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9343007206916809, + "rewards/margins": 1.16253662109375, + "rewards/rejected": -2.096837282180786, + "step": 1115 + }, + { + "epoch": 1.05, + "grad_norm": 18.308237075805664, + "learning_rate": 3.6044071353620146e-07, + "logps/chosen": -36.147193908691406, + "logps/rejected": -58.4028205871582, + "loss": 0.4321, + "losses/dpo": 0.4773769974708557, + "losses/sft": 1.2280681133270264, + "losses/total": 0.4773769974708557, + "ref_logps/chosen": -28.9426326751709, + "ref_logps/rejected": -40.52062225341797, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7204559445381165, + "rewards/margins": 1.0677636861801147, + "rewards/rejected": -1.788219690322876, + "step": 1116 + }, + { + "epoch": 1.05, + "grad_norm": 16.233169555664062, + "learning_rate": 3.6026582721231197e-07, + "logps/chosen": -39.595314025878906, + "logps/rejected": -53.48869705200195, + "loss": 0.4073, + "losses/dpo": 0.3475499749183655, + "losses/sft": 1.2709814310073853, + "losses/total": 0.3475499749183655, + "ref_logps/chosen": -30.529605865478516, + "ref_logps/rejected": -36.00749969482422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9065707325935364, + "rewards/margins": 0.8415488600730896, + "rewards/rejected": -1.748119592666626, + "step": 1117 + }, + { + "epoch": 1.06, + "grad_norm": 15.320701599121094, + "learning_rate": 3.6009094088842254e-07, + "logps/chosen": -44.485107421875, + "logps/rejected": -54.91472625732422, + "loss": 0.3597, + "losses/dpo": 0.48282861709594727, + "losses/sft": 1.5630582571029663, + "losses/total": 0.48282861709594727, + "ref_logps/chosen": -35.60723114013672, + "ref_logps/rejected": -35.52288055419922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8877873420715332, + "rewards/margins": 1.0513970851898193, + "rewards/rejected": -1.9391844272613525, + "step": 1118 + }, + { + "epoch": 1.06, + "grad_norm": 14.03819751739502, + "learning_rate": 3.5991605456453305e-07, + "logps/chosen": -47.40251159667969, + "logps/rejected": -75.83522033691406, + "loss": 0.2648, + "losses/dpo": 0.24401050806045532, + "losses/sft": 1.9604064226150513, + "losses/total": 0.24401050806045532, + "ref_logps/chosen": -38.702205657958984, + "ref_logps/rejected": -52.07846450805664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8700307011604309, + "rewards/margins": 1.5056447982788086, + "rewards/rejected": -2.375675678253174, + "step": 1119 + }, + { + "epoch": 1.06, + "grad_norm": 17.748851776123047, + "learning_rate": 3.5974116824064356e-07, + "logps/chosen": -47.6712532043457, + "logps/rejected": -79.5129623413086, + "loss": 0.3323, + "losses/dpo": 0.25378283858299255, + "losses/sft": 1.4068554639816284, + "losses/total": 0.25378283858299255, + "ref_logps/chosen": -37.408958435058594, + "ref_logps/rejected": -55.53813934326172, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0262292623519897, + "rewards/margins": 1.3712530136108398, + "rewards/rejected": -2.39748215675354, + "step": 1120 + }, + { + "epoch": 1.06, + "grad_norm": 19.006412506103516, + "learning_rate": 3.595662819167541e-07, + "logps/chosen": -49.565391540527344, + "logps/rejected": -76.67436218261719, + "loss": 0.3702, + "losses/dpo": 0.50902259349823, + "losses/sft": 2.1297295093536377, + "losses/total": 0.50902259349823, + "ref_logps/chosen": -38.21741485595703, + "ref_logps/rejected": -54.113346099853516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1347975730895996, + "rewards/margins": 1.1213042736053467, + "rewards/rejected": -2.2561018466949463, + "step": 1121 + }, + { + "epoch": 1.06, + "grad_norm": 13.391525268554688, + "learning_rate": 3.5939139559286464e-07, + "logps/chosen": -45.734527587890625, + "logps/rejected": -64.69873046875, + "loss": 0.2798, + "losses/dpo": 0.22471119463443756, + "losses/sft": 1.4187021255493164, + "losses/total": 0.22471119463443756, + "ref_logps/chosen": -36.93622589111328, + "ref_logps/rejected": -41.9991569519043, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8798298835754395, + "rewards/margins": 1.3901268243789673, + "rewards/rejected": -2.2699568271636963, + "step": 1122 + }, + { + "epoch": 1.06, + "grad_norm": 15.596683502197266, + "learning_rate": 3.5921650926897515e-07, + "logps/chosen": -41.36247634887695, + "logps/rejected": -71.56450653076172, + "loss": 0.3539, + "losses/dpo": 0.24520686268806458, + "losses/sft": 1.4111450910568237, + "losses/total": 0.24520686268806458, + "ref_logps/chosen": -34.94282531738281, + "ref_logps/rejected": -50.83561706542969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6419654488563538, + "rewards/margins": 1.4309229850769043, + "rewards/rejected": -2.0728883743286133, + "step": 1123 + }, + { + "epoch": 1.06, + "grad_norm": 19.83420753479004, + "learning_rate": 3.5904162294508567e-07, + "logps/chosen": -47.32707214355469, + "logps/rejected": -72.98094940185547, + "loss": 0.3454, + "losses/dpo": 0.40817296504974365, + "losses/sft": 1.5442166328430176, + "losses/total": 0.40817296504974365, + "ref_logps/chosen": -37.06970977783203, + "ref_logps/rejected": -48.767147064208984, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0257360935211182, + "rewards/margins": 1.395643949508667, + "rewards/rejected": -2.421380043029785, + "step": 1124 + }, + { + "epoch": 1.06, + "grad_norm": 21.95766830444336, + "learning_rate": 3.5886673662119623e-07, + "logps/chosen": -57.457759857177734, + "logps/rejected": -59.646793365478516, + "loss": 0.4503, + "losses/dpo": 0.3703533411026001, + "losses/sft": 1.6664035320281982, + "losses/total": 0.3703533411026001, + "ref_logps/chosen": -43.65606689453125, + "ref_logps/rejected": -37.202117919921875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3801689147949219, + "rewards/margins": 0.8642987012863159, + "rewards/rejected": -2.2444677352905273, + "step": 1125 + }, + { + "epoch": 1.06, + "grad_norm": 21.429195404052734, + "learning_rate": 3.5869185029730674e-07, + "logps/chosen": -58.990962982177734, + "logps/rejected": -64.17696380615234, + "loss": 0.4925, + "losses/dpo": 0.3632112741470337, + "losses/sft": 1.3681801557540894, + "losses/total": 0.3632112741470337, + "ref_logps/chosen": -44.11982727050781, + "ref_logps/rejected": -40.79416275024414, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4871137142181396, + "rewards/margins": 0.8511661887168884, + "rewards/rejected": -2.338279962539673, + "step": 1126 + }, + { + "epoch": 1.06, + "grad_norm": 19.995677947998047, + "learning_rate": 3.5851696397341726e-07, + "logps/chosen": -37.011383056640625, + "logps/rejected": -66.07659912109375, + "loss": 0.4604, + "losses/dpo": 0.21649906039237976, + "losses/sft": 1.7158467769622803, + "losses/total": 0.21649906039237976, + "ref_logps/chosen": -26.816097259521484, + "ref_logps/rejected": -43.77798080444336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0195286273956299, + "rewards/margins": 1.2103328704833984, + "rewards/rejected": -2.229861259460449, + "step": 1127 + }, + { + "epoch": 1.07, + "grad_norm": 23.656599044799805, + "learning_rate": 3.5834207764952777e-07, + "logps/chosen": -53.18360900878906, + "logps/rejected": -63.63755416870117, + "loss": 0.4436, + "losses/dpo": 0.5402354598045349, + "losses/sft": 1.850532054901123, + "losses/total": 0.5402354598045349, + "ref_logps/chosen": -39.15081787109375, + "ref_logps/rejected": -40.1318359375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4032790660858154, + "rewards/margins": 0.9472929239273071, + "rewards/rejected": -2.350571870803833, + "step": 1128 + }, + { + "epoch": 1.07, + "grad_norm": 15.228174209594727, + "learning_rate": 3.5816719132563833e-07, + "logps/chosen": -57.978492736816406, + "logps/rejected": -80.89125061035156, + "loss": 0.296, + "losses/dpo": 0.23466572165489197, + "losses/sft": 2.539710521697998, + "losses/total": 0.23466572165489197, + "ref_logps/chosen": -45.61967086791992, + "ref_logps/rejected": -53.44007873535156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2358821630477905, + "rewards/margins": 1.509235143661499, + "rewards/rejected": -2.7451171875, + "step": 1129 + }, + { + "epoch": 1.07, + "grad_norm": 27.180347442626953, + "learning_rate": 3.579923050017489e-07, + "logps/chosen": -50.36742401123047, + "logps/rejected": -55.680397033691406, + "loss": 0.6255, + "losses/dpo": 0.6598228216171265, + "losses/sft": 1.9887068271636963, + "losses/total": 0.6598228216171265, + "ref_logps/chosen": -33.17410659790039, + "ref_logps/rejected": -34.39702606201172, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7193315029144287, + "rewards/margins": 0.4090052843093872, + "rewards/rejected": -2.1283369064331055, + "step": 1130 + }, + { + "epoch": 1.07, + "grad_norm": 27.55199432373047, + "learning_rate": 3.5781741867785936e-07, + "logps/chosen": -60.86018753051758, + "logps/rejected": -68.66188049316406, + "loss": 0.6284, + "losses/dpo": 0.8763452172279358, + "losses/sft": 1.7240406274795532, + "losses/total": 0.8763452172279358, + "ref_logps/chosen": -45.129634857177734, + "ref_logps/rejected": -48.709686279296875, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5730552673339844, + "rewards/margins": 0.4221642017364502, + "rewards/rejected": -1.9952194690704346, + "step": 1131 + }, + { + "epoch": 1.07, + "grad_norm": 25.422346115112305, + "learning_rate": 3.576425323539699e-07, + "logps/chosen": -58.734981536865234, + "logps/rejected": -54.84503173828125, + "loss": 0.6635, + "losses/dpo": 0.43343645334243774, + "losses/sft": 1.9890453815460205, + "losses/total": 0.43343645334243774, + "ref_logps/chosen": -41.60216522216797, + "ref_logps/rejected": -33.43422317504883, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7132813930511475, + "rewards/margins": 0.4277995228767395, + "rewards/rejected": -2.141080856323242, + "step": 1132 + }, + { + "epoch": 1.07, + "grad_norm": 18.690250396728516, + "learning_rate": 3.5746764603008044e-07, + "logps/chosen": -58.3661994934082, + "logps/rejected": -69.38311767578125, + "loss": 0.3678, + "losses/dpo": 0.42590248584747314, + "losses/sft": 1.55850088596344, + "losses/total": 0.42590248584747314, + "ref_logps/chosen": -45.18629455566406, + "ref_logps/rejected": -44.711822509765625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3179905414581299, + "rewards/margins": 1.1491386890411377, + "rewards/rejected": -2.4671292304992676, + "step": 1133 + }, + { + "epoch": 1.07, + "grad_norm": 24.80685043334961, + "learning_rate": 3.5729275970619095e-07, + "logps/chosen": -48.22721481323242, + "logps/rejected": -63.99211120605469, + "loss": 0.4485, + "losses/dpo": 0.45494773983955383, + "losses/sft": 1.812914252281189, + "losses/total": 0.45494773983955383, + "ref_logps/chosen": -33.45372772216797, + "ref_logps/rejected": -39.96595001220703, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.477348804473877, + "rewards/margins": 0.9252671003341675, + "rewards/rejected": -2.402616024017334, + "step": 1134 + }, + { + "epoch": 1.07, + "grad_norm": 26.141324996948242, + "learning_rate": 3.5711787338230146e-07, + "logps/chosen": -50.24357604980469, + "logps/rejected": -70.65209197998047, + "loss": 0.4864, + "losses/dpo": 0.8688951730728149, + "losses/sft": 2.723207712173462, + "losses/total": 0.8688951730728149, + "ref_logps/chosen": -32.94593048095703, + "ref_logps/rejected": -43.611351013183594, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.729764699935913, + "rewards/margins": 0.9743098020553589, + "rewards/rejected": -2.7040746212005615, + "step": 1135 + }, + { + "epoch": 1.07, + "grad_norm": 18.747983932495117, + "learning_rate": 3.5694298705841203e-07, + "logps/chosen": -49.9259033203125, + "logps/rejected": -58.2346076965332, + "loss": 0.4268, + "losses/dpo": 0.30385822057724, + "losses/sft": 1.8258968591690063, + "losses/total": 0.30385822057724, + "ref_logps/chosen": -39.310333251953125, + "ref_logps/rejected": -37.35885238647461, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.061557412147522, + "rewards/margins": 1.0260180234909058, + "rewards/rejected": -2.0875754356384277, + "step": 1136 + }, + { + "epoch": 1.07, + "grad_norm": 19.496219635009766, + "learning_rate": 3.567681007345226e-07, + "logps/chosen": -53.89305877685547, + "logps/rejected": -78.14730072021484, + "loss": 0.3448, + "losses/dpo": 0.4381665289402008, + "losses/sft": 1.6898956298828125, + "losses/total": 0.4381665289402008, + "ref_logps/chosen": -38.962066650390625, + "ref_logps/rejected": -49.8504638671875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.493099331855774, + "rewards/margins": 1.3365838527679443, + "rewards/rejected": -2.8296830654144287, + "step": 1137 + }, + { + "epoch": 1.07, + "grad_norm": 28.223752975463867, + "learning_rate": 3.5659321441063305e-07, + "logps/chosen": -69.51285552978516, + "logps/rejected": -78.18511199951172, + "loss": 0.4396, + "losses/dpo": 0.8602421283721924, + "losses/sft": 2.231619358062744, + "losses/total": 0.8602421283721924, + "ref_logps/chosen": -51.995994567871094, + "ref_logps/rejected": -49.90565490722656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7516865730285645, + "rewards/margins": 1.0762584209442139, + "rewards/rejected": -2.8279449939727783, + "step": 1138 + }, + { + "epoch": 1.08, + "grad_norm": 22.851444244384766, + "learning_rate": 3.564183280867436e-07, + "logps/chosen": -44.15315246582031, + "logps/rejected": -51.366722106933594, + "loss": 0.4632, + "losses/dpo": 0.5887936353683472, + "losses/sft": 2.151808261871338, + "losses/total": 0.5887936353683472, + "ref_logps/chosen": -31.659286499023438, + "ref_logps/rejected": -28.490835189819336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2493866682052612, + "rewards/margins": 1.0382020473480225, + "rewards/rejected": -2.2875888347625732, + "step": 1139 + }, + { + "epoch": 1.08, + "grad_norm": 25.950254440307617, + "learning_rate": 3.5624344176285413e-07, + "logps/chosen": -57.27245330810547, + "logps/rejected": -66.78453063964844, + "loss": 0.5041, + "losses/dpo": 0.5709726810455322, + "losses/sft": 2.028679609298706, + "losses/total": 0.5709726810455322, + "ref_logps/chosen": -44.13425827026367, + "ref_logps/rejected": -45.715187072753906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3138198852539062, + "rewards/margins": 0.7931150794029236, + "rewards/rejected": -2.1069350242614746, + "step": 1140 + }, + { + "epoch": 1.08, + "grad_norm": 33.055450439453125, + "learning_rate": 3.5606855543896464e-07, + "logps/chosen": -60.55674743652344, + "logps/rejected": -58.6107292175293, + "loss": 0.7891, + "losses/dpo": 0.7282435894012451, + "losses/sft": 1.6608691215515137, + "losses/total": 0.7282435894012451, + "ref_logps/chosen": -41.75940704345703, + "ref_logps/rejected": -37.362300872802734, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.879733920097351, + "rewards/margins": 0.24510888755321503, + "rewards/rejected": -2.124842643737793, + "step": 1141 + }, + { + "epoch": 1.08, + "grad_norm": 18.956497192382812, + "learning_rate": 3.5589366911507516e-07, + "logps/chosen": -46.049354553222656, + "logps/rejected": -58.819793701171875, + "loss": 0.3508, + "losses/dpo": 0.2803114354610443, + "losses/sft": 1.8871058225631714, + "losses/total": 0.2803114354610443, + "ref_logps/chosen": -35.16623306274414, + "ref_logps/rejected": -36.46910095214844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0883126258850098, + "rewards/margins": 1.1467571258544922, + "rewards/rejected": -2.235069751739502, + "step": 1142 + }, + { + "epoch": 1.08, + "grad_norm": 17.948486328125, + "learning_rate": 3.557187827911857e-07, + "logps/chosen": -52.22205352783203, + "logps/rejected": -73.42619323730469, + "loss": 0.3478, + "losses/dpo": 0.4428257346153259, + "losses/sft": 1.4677045345306396, + "losses/total": 0.4428257346153259, + "ref_logps/chosen": -39.68170928955078, + "ref_logps/rejected": -49.47446060180664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2540345191955566, + "rewards/margins": 1.1411387920379639, + "rewards/rejected": -2.3951730728149414, + "step": 1143 + }, + { + "epoch": 1.08, + "grad_norm": 25.768634796142578, + "learning_rate": 3.555438964672963e-07, + "logps/chosen": -54.37225341796875, + "logps/rejected": -80.14595794677734, + "loss": 0.4138, + "losses/dpo": 0.23643368482589722, + "losses/sft": 1.6120222806930542, + "losses/total": 0.23643368482589722, + "ref_logps/chosen": -41.12909698486328, + "ref_logps/rejected": -52.570533752441406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3243160247802734, + "rewards/margins": 1.4332270622253418, + "rewards/rejected": -2.7575430870056152, + "step": 1144 + }, + { + "epoch": 1.08, + "grad_norm": 19.334613800048828, + "learning_rate": 3.5536901014340675e-07, + "logps/chosen": -44.830909729003906, + "logps/rejected": -62.91521072387695, + "loss": 0.4225, + "losses/dpo": 0.22556906938552856, + "losses/sft": 1.6811991930007935, + "losses/total": 0.22556906938552856, + "ref_logps/chosen": -32.09433364868164, + "ref_logps/rejected": -38.02540588378906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2736577987670898, + "rewards/margins": 1.2153228521347046, + "rewards/rejected": -2.488980770111084, + "step": 1145 + }, + { + "epoch": 1.08, + "grad_norm": 14.628767967224121, + "learning_rate": 3.551941238195173e-07, + "logps/chosen": -42.94926071166992, + "logps/rejected": -71.724853515625, + "loss": 0.2858, + "losses/dpo": 0.4847782254219055, + "losses/sft": 1.6568411588668823, + "losses/total": 0.4847782254219055, + "ref_logps/chosen": -33.102718353271484, + "ref_logps/rejected": -45.65483856201172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9846542477607727, + "rewards/margins": 1.6223468780517578, + "rewards/rejected": -2.6070010662078857, + "step": 1146 + }, + { + "epoch": 1.08, + "grad_norm": 18.073406219482422, + "learning_rate": 3.5501923749562783e-07, + "logps/chosen": -39.053016662597656, + "logps/rejected": -57.59465026855469, + "loss": 0.4177, + "losses/dpo": 0.8261393308639526, + "losses/sft": 1.3657069206237793, + "losses/total": 0.8261393308639526, + "ref_logps/chosen": -28.937376022338867, + "ref_logps/rejected": -34.375274658203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0115641355514526, + "rewards/margins": 1.310373306274414, + "rewards/rejected": -2.3219375610351562, + "step": 1147 + }, + { + "epoch": 1.08, + "grad_norm": 21.49103355407715, + "learning_rate": 3.5484435117173834e-07, + "logps/chosen": -45.497642517089844, + "logps/rejected": -65.38764190673828, + "loss": 0.4269, + "losses/dpo": 0.5173580646514893, + "losses/sft": 1.3202495574951172, + "losses/total": 0.5173580646514893, + "ref_logps/chosen": -33.979156494140625, + "ref_logps/rejected": -41.26577377319336, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1518484354019165, + "rewards/margins": 1.260338544845581, + "rewards/rejected": -2.412186861038208, + "step": 1148 + }, + { + "epoch": 1.08, + "grad_norm": 23.00715446472168, + "learning_rate": 3.5466946484784885e-07, + "logps/chosen": -49.389007568359375, + "logps/rejected": -73.2331771850586, + "loss": 0.3978, + "losses/dpo": 0.5119600296020508, + "losses/sft": 1.5053342580795288, + "losses/total": 0.5119600296020508, + "ref_logps/chosen": -38.111602783203125, + "ref_logps/rejected": -51.177330017089844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1277403831481934, + "rewards/margins": 1.0778441429138184, + "rewards/rejected": -2.2055845260620117, + "step": 1149 + }, + { + "epoch": 1.09, + "grad_norm": 15.713883399963379, + "learning_rate": 3.544945785239594e-07, + "logps/chosen": -41.73316955566406, + "logps/rejected": -59.24371337890625, + "loss": 0.4268, + "losses/dpo": 0.5106825828552246, + "losses/sft": 1.5899378061294556, + "losses/total": 0.5106825828552246, + "ref_logps/chosen": -32.25578308105469, + "ref_logps/rejected": -39.29499053955078, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9477388858795166, + "rewards/margins": 1.0471336841583252, + "rewards/rejected": -1.9948724508285522, + "step": 1150 + }, + { + "epoch": 1.09, + "grad_norm": 24.67866325378418, + "learning_rate": 3.5431969220007e-07, + "logps/chosen": -54.55917739868164, + "logps/rejected": -71.77665710449219, + "loss": 0.4008, + "losses/dpo": 0.2664833962917328, + "losses/sft": 1.702461838722229, + "losses/total": 0.2664833962917328, + "ref_logps/chosen": -41.261722564697266, + "ref_logps/rejected": -47.21800994873047, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3297452926635742, + "rewards/margins": 1.126119613647461, + "rewards/rejected": -2.455864906311035, + "step": 1151 + }, + { + "epoch": 1.09, + "grad_norm": 16.442657470703125, + "learning_rate": 3.5414480587618044e-07, + "logps/chosen": -49.97710037231445, + "logps/rejected": -66.29367065429688, + "loss": 0.3146, + "losses/dpo": 0.26360827684402466, + "losses/sft": 1.9229429960250854, + "losses/total": 0.26360827684402466, + "ref_logps/chosen": -39.35449981689453, + "ref_logps/rejected": -43.36257553100586, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0622599124908447, + "rewards/margins": 1.230850338935852, + "rewards/rejected": -2.2931101322174072, + "step": 1152 + }, + { + "epoch": 1.09, + "grad_norm": 17.925357818603516, + "learning_rate": 3.53969919552291e-07, + "logps/chosen": -54.834068298339844, + "logps/rejected": -59.09382629394531, + "loss": 0.3315, + "losses/dpo": 0.35731256008148193, + "losses/sft": 1.4937348365783691, + "losses/total": 0.35731256008148193, + "ref_logps/chosen": -47.17584991455078, + "ref_logps/rejected": -37.09530258178711, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7658220529556274, + "rewards/margins": 1.434030532836914, + "rewards/rejected": -2.199852466583252, + "step": 1153 + }, + { + "epoch": 1.09, + "grad_norm": 17.206207275390625, + "learning_rate": 3.537950332284015e-07, + "logps/chosen": -35.15504455566406, + "logps/rejected": -58.040550231933594, + "loss": 0.3286, + "losses/dpo": 0.6031808257102966, + "losses/sft": 1.793176293373108, + "losses/total": 0.6031808257102966, + "ref_logps/chosen": -27.431175231933594, + "ref_logps/rejected": -37.54655838012695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7723866701126099, + "rewards/margins": 1.2770124673843384, + "rewards/rejected": -2.0493993759155273, + "step": 1154 + }, + { + "epoch": 1.09, + "grad_norm": 23.088726043701172, + "learning_rate": 3.5362014690451203e-07, + "logps/chosen": -56.150489807128906, + "logps/rejected": -67.29031372070312, + "loss": 0.5781, + "losses/dpo": 0.9381304979324341, + "losses/sft": 2.3748488426208496, + "losses/total": 0.9381304979324341, + "ref_logps/chosen": -42.44438171386719, + "ref_logps/rejected": -46.41227340698242, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3706107139587402, + "rewards/margins": 0.7171932458877563, + "rewards/rejected": -2.087803840637207, + "step": 1155 + }, + { + "epoch": 1.09, + "grad_norm": 14.592273712158203, + "learning_rate": 3.534452605806226e-07, + "logps/chosen": -43.50091552734375, + "logps/rejected": -55.27341079711914, + "loss": 0.3501, + "losses/dpo": 0.4037971794605255, + "losses/sft": 1.7588260173797607, + "losses/total": 0.4037971794605255, + "ref_logps/chosen": -33.55754470825195, + "ref_logps/rejected": -34.06553649902344, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9943369030952454, + "rewards/margins": 1.126450538635254, + "rewards/rejected": -2.1207876205444336, + "step": 1156 + }, + { + "epoch": 1.09, + "grad_norm": 14.295937538146973, + "learning_rate": 3.532703742567331e-07, + "logps/chosen": -46.77973556518555, + "logps/rejected": -55.979454040527344, + "loss": 0.3319, + "losses/dpo": 0.25589895248413086, + "losses/sft": 1.7189871072769165, + "losses/total": 0.25589895248413086, + "ref_logps/chosen": -37.207427978515625, + "ref_logps/rejected": -33.743812561035156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9572307467460632, + "rewards/margins": 1.2663336992263794, + "rewards/rejected": -2.223564386367798, + "step": 1157 + }, + { + "epoch": 1.09, + "grad_norm": 19.403175354003906, + "learning_rate": 3.530954879328437e-07, + "logps/chosen": -38.16740417480469, + "logps/rejected": -66.7232666015625, + "loss": 0.41, + "losses/dpo": 0.34852665662765503, + "losses/sft": 1.9315986633300781, + "losses/total": 0.34852665662765503, + "ref_logps/chosen": -25.988985061645508, + "ref_logps/rejected": -42.32234573364258, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2178417444229126, + "rewards/margins": 1.22225022315979, + "rewards/rejected": -2.440091848373413, + "step": 1158 + }, + { + "epoch": 1.09, + "grad_norm": 22.430150985717773, + "learning_rate": 3.5292060160895414e-07, + "logps/chosen": -51.44413757324219, + "logps/rejected": -69.00386047363281, + "loss": 0.3955, + "losses/dpo": 0.5232186913490295, + "losses/sft": 1.9724373817443848, + "losses/total": 0.5232186913490295, + "ref_logps/chosen": -36.11259078979492, + "ref_logps/rejected": -43.00016403198242, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.533154845237732, + "rewards/margins": 1.067214012145996, + "rewards/rejected": -2.6003689765930176, + "step": 1159 + }, + { + "epoch": 1.1, + "grad_norm": 22.362287521362305, + "learning_rate": 3.527457152850647e-07, + "logps/chosen": -55.34199905395508, + "logps/rejected": -68.48287963867188, + "loss": 0.454, + "losses/dpo": 0.2427171915769577, + "losses/sft": 1.4085420370101929, + "losses/total": 0.2427171915769577, + "ref_logps/chosen": -41.06510925292969, + "ref_logps/rejected": -45.122982025146484, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4276885986328125, + "rewards/margins": 0.9083008766174316, + "rewards/rejected": -2.335989475250244, + "step": 1160 + }, + { + "epoch": 1.1, + "grad_norm": 24.605642318725586, + "learning_rate": 3.525708289611752e-07, + "logps/chosen": -58.87159729003906, + "logps/rejected": -63.40034484863281, + "loss": 0.581, + "losses/dpo": 0.33205026388168335, + "losses/sft": 2.30277156829834, + "losses/total": 0.33205026388168335, + "ref_logps/chosen": -43.74916076660156, + "ref_logps/rejected": -40.892364501953125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5122437477111816, + "rewards/margins": 0.7385542392730713, + "rewards/rejected": -2.250798225402832, + "step": 1161 + }, + { + "epoch": 1.1, + "grad_norm": 19.314685821533203, + "learning_rate": 3.5239594263728573e-07, + "logps/chosen": -42.61058807373047, + "logps/rejected": -50.25887680053711, + "loss": 0.4573, + "losses/dpo": 0.7957479357719421, + "losses/sft": 1.5312973260879517, + "losses/total": 0.7957479357719421, + "ref_logps/chosen": -36.30377960205078, + "ref_logps/rejected": -33.12445831298828, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6306807994842529, + "rewards/margins": 1.082761287689209, + "rewards/rejected": -1.7134422063827515, + "step": 1162 + }, + { + "epoch": 1.1, + "grad_norm": 24.99732208251953, + "learning_rate": 3.522210563133963e-07, + "logps/chosen": -53.78838348388672, + "logps/rejected": -63.896514892578125, + "loss": 0.544, + "losses/dpo": 0.30477678775787354, + "losses/sft": 1.4882583618164062, + "losses/total": 0.30477678775787354, + "ref_logps/chosen": -41.39977264404297, + "ref_logps/rejected": -43.97334289550781, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.238861083984375, + "rewards/margins": 0.7534563541412354, + "rewards/rejected": -1.9923176765441895, + "step": 1163 + }, + { + "epoch": 1.1, + "grad_norm": 19.90959930419922, + "learning_rate": 3.520461699895068e-07, + "logps/chosen": -50.8354606628418, + "logps/rejected": -75.72209167480469, + "loss": 0.3793, + "losses/dpo": 0.5388731360435486, + "losses/sft": 2.3876144886016846, + "losses/total": 0.5388731360435486, + "ref_logps/chosen": -38.40864562988281, + "ref_logps/rejected": -49.724021911621094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2426813840866089, + "rewards/margins": 1.3571256399154663, + "rewards/rejected": -2.599807024002075, + "step": 1164 + }, + { + "epoch": 1.1, + "grad_norm": 19.020309448242188, + "learning_rate": 3.5187128366561737e-07, + "logps/chosen": -49.30963897705078, + "logps/rejected": -77.75773620605469, + "loss": 0.332, + "losses/dpo": 0.39261364936828613, + "losses/sft": 1.989349126815796, + "losses/total": 0.39261364936828613, + "ref_logps/chosen": -36.979759216308594, + "ref_logps/rejected": -50.448089599609375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2329885959625244, + "rewards/margins": 1.497976303100586, + "rewards/rejected": -2.7309646606445312, + "step": 1165 + }, + { + "epoch": 1.1, + "grad_norm": 24.917984008789062, + "learning_rate": 3.5169639734172783e-07, + "logps/chosen": -51.5273323059082, + "logps/rejected": -80.72734832763672, + "loss": 0.5428, + "losses/dpo": 0.2605714797973633, + "losses/sft": 1.7832306623458862, + "losses/total": 0.2605714797973633, + "ref_logps/chosen": -35.4371337890625, + "ref_logps/rejected": -55.13147735595703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6090195178985596, + "rewards/margins": 0.9505681991577148, + "rewards/rejected": -2.5595874786376953, + "step": 1166 + }, + { + "epoch": 1.1, + "grad_norm": 24.68726921081543, + "learning_rate": 3.515215110178384e-07, + "logps/chosen": -53.63042068481445, + "logps/rejected": -64.61624145507812, + "loss": 0.4266, + "losses/dpo": 0.4296250343322754, + "losses/sft": 2.0445499420166016, + "losses/total": 0.4296250343322754, + "ref_logps/chosen": -37.47551727294922, + "ref_logps/rejected": -37.14946746826172, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6154899597167969, + "rewards/margins": 1.1311869621276855, + "rewards/rejected": -2.7466769218444824, + "step": 1167 + }, + { + "epoch": 1.1, + "grad_norm": 24.4298095703125, + "learning_rate": 3.513466246939489e-07, + "logps/chosen": -58.022926330566406, + "logps/rejected": -87.08427429199219, + "loss": 0.4136, + "losses/dpo": 0.5407834053039551, + "losses/sft": 1.6361950635910034, + "losses/total": 0.5407834053039551, + "ref_logps/chosen": -41.00047302246094, + "ref_logps/rejected": -55.60737991333008, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7022452354431152, + "rewards/margins": 1.445443868637085, + "rewards/rejected": -3.147688865661621, + "step": 1168 + }, + { + "epoch": 1.1, + "grad_norm": 21.377750396728516, + "learning_rate": 3.511717383700594e-07, + "logps/chosen": -36.05640411376953, + "logps/rejected": -51.381248474121094, + "loss": 0.4771, + "losses/dpo": 0.9195261001586914, + "losses/sft": 2.144254684448242, + "losses/total": 0.9195261001586914, + "ref_logps/chosen": -27.85747528076172, + "ref_logps/rejected": -35.07959747314453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8198926448822021, + "rewards/margins": 0.8102728128433228, + "rewards/rejected": -1.6301653385162354, + "step": 1169 + }, + { + "epoch": 1.1, + "grad_norm": 18.577001571655273, + "learning_rate": 3.5099685204617e-07, + "logps/chosen": -42.984107971191406, + "logps/rejected": -57.425537109375, + "loss": 0.427, + "losses/dpo": 0.5384687185287476, + "losses/sft": 1.9965769052505493, + "losses/total": 0.5384687185287476, + "ref_logps/chosen": -31.05020523071289, + "ref_logps/rejected": -35.98308563232422, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1933903694152832, + "rewards/margins": 0.9508547186851501, + "rewards/rejected": -2.144245147705078, + "step": 1170 + }, + { + "epoch": 1.11, + "grad_norm": 19.3896484375, + "learning_rate": 3.508219657222805e-07, + "logps/chosen": -55.75626754760742, + "logps/rejected": -64.54580688476562, + "loss": 0.4252, + "losses/dpo": 0.1418294906616211, + "losses/sft": 1.8937249183654785, + "losses/total": 0.1418294906616211, + "ref_logps/chosen": -43.520999908447266, + "ref_logps/rejected": -42.44101333618164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.223527193069458, + "rewards/margins": 0.9869518280029297, + "rewards/rejected": -2.2104790210723877, + "step": 1171 + }, + { + "epoch": 1.11, + "grad_norm": 20.17457389831543, + "learning_rate": 3.5064707939839107e-07, + "logps/chosen": -47.316165924072266, + "logps/rejected": -65.99116516113281, + "loss": 0.4152, + "losses/dpo": 0.4568825364112854, + "losses/sft": 1.6591540575027466, + "losses/total": 0.4568825364112854, + "ref_logps/chosen": -35.279884338378906, + "ref_logps/rejected": -43.724586486816406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2036280632019043, + "rewards/margins": 1.0230306386947632, + "rewards/rejected": -2.226658821105957, + "step": 1172 + }, + { + "epoch": 1.11, + "grad_norm": 17.68301010131836, + "learning_rate": 3.504721930745015e-07, + "logps/chosen": -50.62818145751953, + "logps/rejected": -87.6301498413086, + "loss": 0.2948, + "losses/dpo": 0.3418300151824951, + "losses/sft": 2.1226580142974854, + "losses/total": 0.3418300151824951, + "ref_logps/chosen": -36.131919860839844, + "ref_logps/rejected": -54.819026947021484, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4496264457702637, + "rewards/margins": 1.8314857482910156, + "rewards/rejected": -3.2811119556427, + "step": 1173 + }, + { + "epoch": 1.11, + "grad_norm": 27.2384090423584, + "learning_rate": 3.502973067506121e-07, + "logps/chosen": -53.364532470703125, + "logps/rejected": -63.85072326660156, + "loss": 0.6441, + "losses/dpo": 0.8595701456069946, + "losses/sft": 2.1518218517303467, + "losses/total": 0.8595701456069946, + "ref_logps/chosen": -38.69149398803711, + "ref_logps/rejected": -44.97615051269531, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4673038721084595, + "rewards/margins": 0.4201531410217285, + "rewards/rejected": -1.887457013130188, + "step": 1174 + }, + { + "epoch": 1.11, + "grad_norm": 22.04258155822754, + "learning_rate": 3.5012242042672266e-07, + "logps/chosen": -54.312103271484375, + "logps/rejected": -67.18894958496094, + "loss": 0.4164, + "losses/dpo": 0.718026876449585, + "losses/sft": 1.9256107807159424, + "losses/total": 0.718026876449585, + "ref_logps/chosen": -39.61787796020508, + "ref_logps/rejected": -42.2315673828125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4694225788116455, + "rewards/margins": 1.0263155698776245, + "rewards/rejected": -2.4957382678985596, + "step": 1175 + }, + { + "epoch": 1.11, + "grad_norm": 17.261892318725586, + "learning_rate": 3.499475341028331e-07, + "logps/chosen": -38.29346466064453, + "logps/rejected": -62.170143127441406, + "loss": 0.3702, + "losses/dpo": 0.1875734031200409, + "losses/sft": 1.3133978843688965, + "losses/total": 0.1875734031200409, + "ref_logps/chosen": -29.878101348876953, + "ref_logps/rejected": -39.332252502441406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8415360450744629, + "rewards/margins": 1.4422534704208374, + "rewards/rejected": -2.28378963470459, + "step": 1176 + }, + { + "epoch": 1.11, + "grad_norm": 21.05582046508789, + "learning_rate": 3.497726477789437e-07, + "logps/chosen": -44.98912048339844, + "logps/rejected": -67.68132781982422, + "loss": 0.4188, + "losses/dpo": 0.8851693272590637, + "losses/sft": 1.7933876514434814, + "losses/total": 0.8851693272590637, + "ref_logps/chosen": -36.263694763183594, + "ref_logps/rejected": -42.989280700683594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8725423216819763, + "rewards/margins": 1.596662163734436, + "rewards/rejected": -2.4692044258117676, + "step": 1177 + }, + { + "epoch": 1.11, + "grad_norm": 32.253196716308594, + "learning_rate": 3.495977614550542e-07, + "logps/chosen": -58.107582092285156, + "logps/rejected": -74.15830993652344, + "loss": 0.6214, + "losses/dpo": 0.49097663164138794, + "losses/sft": 1.653022289276123, + "losses/total": 0.49097663164138794, + "ref_logps/chosen": -43.99420928955078, + "ref_logps/rejected": -56.31801986694336, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.41133713722229, + "rewards/margins": 0.372691810131073, + "rewards/rejected": -1.7840288877487183, + "step": 1178 + }, + { + "epoch": 1.11, + "grad_norm": 26.46973419189453, + "learning_rate": 3.4942287513116476e-07, + "logps/chosen": -63.79710006713867, + "logps/rejected": -69.51112365722656, + "loss": 0.5685, + "losses/dpo": 0.6211001873016357, + "losses/sft": 1.787951111793518, + "losses/total": 0.6211001873016357, + "ref_logps/chosen": -49.32140350341797, + "ref_logps/rejected": -46.48867416381836, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4475698471069336, + "rewards/margins": 0.8546746373176575, + "rewards/rejected": -2.3022444248199463, + "step": 1179 + }, + { + "epoch": 1.11, + "grad_norm": 18.976913452148438, + "learning_rate": 3.492479888072752e-07, + "logps/chosen": -58.0233154296875, + "logps/rejected": -75.26605224609375, + "loss": 0.4405, + "losses/dpo": 0.3172830641269684, + "losses/sft": 1.5456390380859375, + "losses/total": 0.3172830641269684, + "ref_logps/chosen": -43.87802505493164, + "ref_logps/rejected": -49.81204605102539, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4145286083221436, + "rewards/margins": 1.130871295928955, + "rewards/rejected": -2.5453999042510986, + "step": 1180 + }, + { + "epoch": 1.12, + "grad_norm": 25.700510025024414, + "learning_rate": 3.490731024833858e-07, + "logps/chosen": -59.06761169433594, + "logps/rejected": -68.7720947265625, + "loss": 0.5264, + "losses/dpo": 0.9217410683631897, + "losses/sft": 1.8986196517944336, + "losses/total": 0.9217410683631897, + "ref_logps/chosen": -43.40380096435547, + "ref_logps/rejected": -45.43873596191406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5663809776306152, + "rewards/margins": 0.7669545412063599, + "rewards/rejected": -2.3333356380462646, + "step": 1181 + }, + { + "epoch": 1.12, + "grad_norm": 22.932247161865234, + "learning_rate": 3.4889821615949635e-07, + "logps/chosen": -54.19947052001953, + "logps/rejected": -69.83200073242188, + "loss": 0.4193, + "losses/dpo": 0.2667746841907501, + "losses/sft": 1.887205958366394, + "losses/total": 0.2667746841907501, + "ref_logps/chosen": -40.734580993652344, + "ref_logps/rejected": -46.85262680053711, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3464891910552979, + "rewards/margins": 0.9514486789703369, + "rewards/rejected": -2.2979378700256348, + "step": 1182 + }, + { + "epoch": 1.12, + "grad_norm": 17.1081600189209, + "learning_rate": 3.487233298356068e-07, + "logps/chosen": -48.15372848510742, + "logps/rejected": -75.6485595703125, + "loss": 0.3246, + "losses/dpo": 0.4840124845504761, + "losses/sft": 1.9550937414169312, + "losses/total": 0.4840124845504761, + "ref_logps/chosen": -36.77861785888672, + "ref_logps/rejected": -50.692771911621094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1375110149383545, + "rewards/margins": 1.3580682277679443, + "rewards/rejected": -2.495579242706299, + "step": 1183 + }, + { + "epoch": 1.12, + "grad_norm": 18.510454177856445, + "learning_rate": 3.485484435117174e-07, + "logps/chosen": -43.016136169433594, + "logps/rejected": -58.0286865234375, + "loss": 0.4385, + "losses/dpo": 0.7479536533355713, + "losses/sft": 2.54318904876709, + "losses/total": 0.7479536533355713, + "ref_logps/chosen": -31.054210662841797, + "ref_logps/rejected": -35.893402099609375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1961928606033325, + "rewards/margins": 1.0173360109329224, + "rewards/rejected": -2.213528871536255, + "step": 1184 + }, + { + "epoch": 1.12, + "grad_norm": 17.244873046875, + "learning_rate": 3.483735571878279e-07, + "logps/chosen": -43.629615783691406, + "logps/rejected": -54.11824035644531, + "loss": 0.4856, + "losses/dpo": 0.5764984488487244, + "losses/sft": 1.954749584197998, + "losses/total": 0.5764984488487244, + "ref_logps/chosen": -31.12030601501465, + "ref_logps/rejected": -32.72331619262695, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2509305477142334, + "rewards/margins": 0.888561487197876, + "rewards/rejected": -2.1394920349121094, + "step": 1185 + }, + { + "epoch": 1.12, + "grad_norm": 28.519880294799805, + "learning_rate": 3.4819867086393846e-07, + "logps/chosen": -51.87816619873047, + "logps/rejected": -62.20604705810547, + "loss": 0.5812, + "losses/dpo": 0.8336532711982727, + "losses/sft": 1.679840087890625, + "losses/total": 0.8336532711982727, + "ref_logps/chosen": -37.35664367675781, + "ref_logps/rejected": -42.5760498046875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.452152967453003, + "rewards/margins": 0.5108469724655151, + "rewards/rejected": -1.962999939918518, + "step": 1186 + }, + { + "epoch": 1.12, + "grad_norm": 21.679903030395508, + "learning_rate": 3.480237845400489e-07, + "logps/chosen": -49.95682144165039, + "logps/rejected": -61.141510009765625, + "loss": 0.4159, + "losses/dpo": 0.5916539430618286, + "losses/sft": 1.7467758655548096, + "losses/total": 0.5916539430618286, + "ref_logps/chosen": -37.57395553588867, + "ref_logps/rejected": -37.24937057495117, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2382864952087402, + "rewards/margins": 1.1509270668029785, + "rewards/rejected": -2.3892135620117188, + "step": 1187 + }, + { + "epoch": 1.12, + "grad_norm": 24.70008659362793, + "learning_rate": 3.478488982161595e-07, + "logps/chosen": -61.8380241394043, + "logps/rejected": -66.40875244140625, + "loss": 0.4498, + "losses/dpo": 0.3130904734134674, + "losses/sft": 1.7137500047683716, + "losses/total": 0.3130904734134674, + "ref_logps/chosen": -47.12581253051758, + "ref_logps/rejected": -42.88390350341797, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4712209701538086, + "rewards/margins": 0.8812636137008667, + "rewards/rejected": -2.352484703063965, + "step": 1188 + }, + { + "epoch": 1.12, + "grad_norm": 20.829978942871094, + "learning_rate": 3.4767401189227005e-07, + "logps/chosen": -39.790435791015625, + "logps/rejected": -62.06210708618164, + "loss": 0.4435, + "losses/dpo": 0.6889925599098206, + "losses/sft": 1.716752290725708, + "losses/total": 0.6889925599098206, + "ref_logps/chosen": -28.73815155029297, + "ref_logps/rejected": -40.22623825073242, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1052287817001343, + "rewards/margins": 1.0783581733703613, + "rewards/rejected": -2.183587074279785, + "step": 1189 + }, + { + "epoch": 1.12, + "grad_norm": 21.44162940979004, + "learning_rate": 3.474991255683805e-07, + "logps/chosen": -56.900238037109375, + "logps/rejected": -64.34751892089844, + "loss": 0.4279, + "losses/dpo": 0.6053601503372192, + "losses/sft": 1.8646621704101562, + "losses/total": 0.6053601503372192, + "ref_logps/chosen": -45.60890197753906, + "ref_logps/rejected": -44.466705322265625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.129133701324463, + "rewards/margins": 0.8589475154876709, + "rewards/rejected": -1.9880813360214233, + "step": 1190 + }, + { + "epoch": 1.12, + "grad_norm": 16.19450569152832, + "learning_rate": 3.4732423924449107e-07, + "logps/chosen": -39.03544616699219, + "logps/rejected": -61.84416961669922, + "loss": 0.3648, + "losses/dpo": 0.2910487651824951, + "losses/sft": 1.8084921836853027, + "losses/total": 0.2910487651824951, + "ref_logps/chosen": -29.14715576171875, + "ref_logps/rejected": -40.108909606933594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9888288974761963, + "rewards/margins": 1.184696912765503, + "rewards/rejected": -2.173525810241699, + "step": 1191 + }, + { + "epoch": 1.13, + "grad_norm": 24.440706253051758, + "learning_rate": 3.471493529206016e-07, + "logps/chosen": -63.605384826660156, + "logps/rejected": -75.43583679199219, + "loss": 0.4601, + "losses/dpo": 0.8292043209075928, + "losses/sft": 1.2350585460662842, + "losses/total": 0.8292043209075928, + "ref_logps/chosen": -48.958065032958984, + "ref_logps/rejected": -46.86841583251953, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4647314548492432, + "rewards/margins": 1.39201021194458, + "rewards/rejected": -2.8567416667938232, + "step": 1192 + }, + { + "epoch": 1.13, + "grad_norm": 27.790037155151367, + "learning_rate": 3.4697446659671215e-07, + "logps/chosen": -42.98646545410156, + "logps/rejected": -58.1151123046875, + "loss": 0.6586, + "losses/dpo": 0.873833417892456, + "losses/sft": 2.1007161140441895, + "losses/total": 0.873833417892456, + "ref_logps/chosen": -29.699018478393555, + "ref_logps/rejected": -37.354427337646484, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.328744888305664, + "rewards/margins": 0.7473236322402954, + "rewards/rejected": -2.07606840133667, + "step": 1193 + }, + { + "epoch": 1.13, + "grad_norm": 14.21298885345459, + "learning_rate": 3.467995802728226e-07, + "logps/chosen": -43.702049255371094, + "logps/rejected": -59.763389587402344, + "loss": 0.3356, + "losses/dpo": 0.15674805641174316, + "losses/sft": 1.9462584257125854, + "losses/total": 0.15674805641174316, + "ref_logps/chosen": -31.779237747192383, + "ref_logps/rejected": -35.758663177490234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1922811269760132, + "rewards/margins": 1.2081913948059082, + "rewards/rejected": -2.400472640991211, + "step": 1194 + }, + { + "epoch": 1.13, + "grad_norm": 20.321813583374023, + "learning_rate": 3.466246939489332e-07, + "logps/chosen": -63.832942962646484, + "logps/rejected": -88.71019744873047, + "loss": 0.3626, + "losses/dpo": 0.8511580228805542, + "losses/sft": 1.8943411111831665, + "losses/total": 0.8511580228805542, + "ref_logps/chosen": -47.3338623046875, + "ref_logps/rejected": -58.8153076171875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6499080657958984, + "rewards/margins": 1.339580774307251, + "rewards/rejected": -2.9894888401031494, + "step": 1195 + }, + { + "epoch": 1.13, + "grad_norm": 25.849332809448242, + "learning_rate": 3.4644980762504374e-07, + "logps/chosen": -64.0870361328125, + "logps/rejected": -87.64054870605469, + "loss": 0.3906, + "losses/dpo": 0.3396153748035431, + "losses/sft": 2.0073556900024414, + "losses/total": 0.3396153748035431, + "ref_logps/chosen": -50.96879577636719, + "ref_logps/rejected": -56.45702362060547, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3118246793746948, + "rewards/margins": 1.8065276145935059, + "rewards/rejected": -3.118352174758911, + "step": 1196 + }, + { + "epoch": 1.13, + "grad_norm": 21.948686599731445, + "learning_rate": 3.462749213011542e-07, + "logps/chosen": -55.41584014892578, + "logps/rejected": -85.0135498046875, + "loss": 0.4285, + "losses/dpo": 0.3888290524482727, + "losses/sft": 1.7734923362731934, + "losses/total": 0.3888290524482727, + "ref_logps/chosen": -42.59334945678711, + "ref_logps/rejected": -61.39726638793945, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2822489738464355, + "rewards/margins": 1.0793800354003906, + "rewards/rejected": -2.361629009246826, + "step": 1197 + }, + { + "epoch": 1.13, + "grad_norm": 21.668668746948242, + "learning_rate": 3.4610003497726477e-07, + "logps/chosen": -47.953975677490234, + "logps/rejected": -72.549072265625, + "loss": 0.3893, + "losses/dpo": 0.31027325987815857, + "losses/sft": 2.0131776332855225, + "losses/total": 0.31027325987815857, + "ref_logps/chosen": -33.96575927734375, + "ref_logps/rejected": -43.709205627441406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3988215923309326, + "rewards/margins": 1.4851644039154053, + "rewards/rejected": -2.883985996246338, + "step": 1198 + }, + { + "epoch": 1.13, + "grad_norm": 20.344911575317383, + "learning_rate": 3.459251486533753e-07, + "logps/chosen": -43.740814208984375, + "logps/rejected": -76.95494079589844, + "loss": 0.3314, + "losses/dpo": 0.11246241629123688, + "losses/sft": 1.1133618354797363, + "losses/total": 0.11246241629123688, + "ref_logps/chosen": -34.54724884033203, + "ref_logps/rejected": -52.08617401123047, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.919356107711792, + "rewards/margins": 1.5675208568572998, + "rewards/rejected": -2.486876964569092, + "step": 1199 + }, + { + "epoch": 1.13, + "grad_norm": 20.741167068481445, + "learning_rate": 3.4575026232948584e-07, + "logps/chosen": -39.625831604003906, + "logps/rejected": -62.57728576660156, + "loss": 0.3801, + "losses/dpo": 0.5188046097755432, + "losses/sft": 1.4814120531082153, + "losses/total": 0.5188046097755432, + "ref_logps/chosen": -29.104610443115234, + "ref_logps/rejected": -39.900455474853516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0521219968795776, + "rewards/margins": 1.2155613899230957, + "rewards/rejected": -2.267683267593384, + "step": 1200 + }, + { + "epoch": 1.13, + "grad_norm": 19.507539749145508, + "learning_rate": 3.4557537600559636e-07, + "logps/chosen": -48.30961227416992, + "logps/rejected": -72.48915100097656, + "loss": 0.4491, + "losses/dpo": 0.6221827268600464, + "losses/sft": 2.064242124557495, + "losses/total": 0.6221827268600464, + "ref_logps/chosen": -35.892147064208984, + "ref_logps/rejected": -43.38697814941406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2417469024658203, + "rewards/margins": 1.6684703826904297, + "rewards/rejected": -2.910217046737671, + "step": 1201 + }, + { + "epoch": 1.14, + "grad_norm": 16.92333221435547, + "learning_rate": 3.4540048968170687e-07, + "logps/chosen": -35.59849548339844, + "logps/rejected": -68.27458190917969, + "loss": 0.2701, + "losses/dpo": 0.5904721021652222, + "losses/sft": 2.1532437801361084, + "losses/total": 0.5904721021652222, + "ref_logps/chosen": -28.967607498168945, + "ref_logps/rejected": -40.41972351074219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6630889773368835, + "rewards/margins": 2.1223959922790527, + "rewards/rejected": -2.78548526763916, + "step": 1202 + }, + { + "epoch": 1.14, + "grad_norm": 17.5150146484375, + "learning_rate": 3.4522560335781743e-07, + "logps/chosen": -49.920013427734375, + "logps/rejected": -69.733154296875, + "loss": 0.2983, + "losses/dpo": 0.41659703850746155, + "losses/sft": 1.6118489503860474, + "losses/total": 0.41659703850746155, + "ref_logps/chosen": -41.028804779052734, + "ref_logps/rejected": -44.3741455078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8891209363937378, + "rewards/margins": 1.6467803716659546, + "rewards/rejected": -2.5359013080596924, + "step": 1203 + }, + { + "epoch": 1.14, + "grad_norm": 22.143356323242188, + "learning_rate": 3.450507170339279e-07, + "logps/chosen": -44.29490280151367, + "logps/rejected": -54.45267105102539, + "loss": 0.4696, + "losses/dpo": 0.5185851454734802, + "losses/sft": 1.8470827341079712, + "losses/total": 0.5185851454734802, + "ref_logps/chosen": -32.514862060546875, + "ref_logps/rejected": -35.87690734863281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1780040264129639, + "rewards/margins": 0.6795724630355835, + "rewards/rejected": -1.8575763702392578, + "step": 1204 + }, + { + "epoch": 1.14, + "grad_norm": 26.703330993652344, + "learning_rate": 3.4487583071003846e-07, + "logps/chosen": -66.18683624267578, + "logps/rejected": -79.56542205810547, + "loss": 0.4226, + "losses/dpo": 0.45258915424346924, + "losses/sft": 1.4578217267990112, + "losses/total": 0.45258915424346924, + "ref_logps/chosen": -52.280574798583984, + "ref_logps/rejected": -53.096160888671875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3906259536743164, + "rewards/margins": 1.2563002109527588, + "rewards/rejected": -2.646925926208496, + "step": 1205 + }, + { + "epoch": 1.14, + "grad_norm": 22.803064346313477, + "learning_rate": 3.4470094438614897e-07, + "logps/chosen": -54.265052795410156, + "logps/rejected": -62.260963439941406, + "loss": 0.3688, + "losses/dpo": 0.17984730005264282, + "losses/sft": 1.8301756381988525, + "losses/total": 0.17984730005264282, + "ref_logps/chosen": -44.24738311767578, + "ref_logps/rejected": -39.08616256713867, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0017666816711426, + "rewards/margins": 1.3157135248184204, + "rewards/rejected": -2.3174803256988525, + "step": 1206 + }, + { + "epoch": 1.14, + "grad_norm": 24.572607040405273, + "learning_rate": 3.4452605806225954e-07, + "logps/chosen": -43.329105377197266, + "logps/rejected": -63.29738998413086, + "loss": 0.5151, + "losses/dpo": 1.060744285583496, + "losses/sft": 2.0242159366607666, + "losses/total": 1.060744285583496, + "ref_logps/chosen": -30.796119689941406, + "ref_logps/rejected": -39.337188720703125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2532985210418701, + "rewards/margins": 1.1427216529846191, + "rewards/rejected": -2.3960201740264893, + "step": 1207 + }, + { + "epoch": 1.14, + "grad_norm": 19.429122924804688, + "learning_rate": 3.4435117173837005e-07, + "logps/chosen": -52.67515563964844, + "logps/rejected": -56.843177795410156, + "loss": 0.4331, + "losses/dpo": 0.380759060382843, + "losses/sft": 1.5727355480194092, + "losses/total": 0.380759060382843, + "ref_logps/chosen": -43.468894958496094, + "ref_logps/rejected": -35.63218688964844, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.920626163482666, + "rewards/margins": 1.200473427772522, + "rewards/rejected": -2.1210994720458984, + "step": 1208 + }, + { + "epoch": 1.14, + "grad_norm": 18.721139907836914, + "learning_rate": 3.4417628541448056e-07, + "logps/chosen": -43.63262176513672, + "logps/rejected": -62.44777297973633, + "loss": 0.3952, + "losses/dpo": 0.3956284523010254, + "losses/sft": 1.261940598487854, + "losses/total": 0.3956284523010254, + "ref_logps/chosen": -32.311004638671875, + "ref_logps/rejected": -41.452457427978516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1321617364883423, + "rewards/margins": 0.9673698544502258, + "rewards/rejected": -2.099531650543213, + "step": 1209 + }, + { + "epoch": 1.14, + "grad_norm": 20.891883850097656, + "learning_rate": 3.4400139909059113e-07, + "logps/chosen": -36.589542388916016, + "logps/rejected": -52.03498077392578, + "loss": 0.5049, + "losses/dpo": 0.7367061376571655, + "losses/sft": 1.6468583345413208, + "losses/total": 0.7367061376571655, + "ref_logps/chosen": -24.960180282592773, + "ref_logps/rejected": -32.505863189697266, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1629362106323242, + "rewards/margins": 0.7899754047393799, + "rewards/rejected": -1.952911615371704, + "step": 1210 + }, + { + "epoch": 1.14, + "grad_norm": 30.904935836791992, + "learning_rate": 3.438265127667016e-07, + "logps/chosen": -58.95918273925781, + "logps/rejected": -67.94422149658203, + "loss": 0.584, + "losses/dpo": 0.3868604302406311, + "losses/sft": 1.5404444932937622, + "losses/total": 0.3868604302406311, + "ref_logps/chosen": -47.20154571533203, + "ref_logps/rejected": -49.71845245361328, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1757640838623047, + "rewards/margins": 0.646812915802002, + "rewards/rejected": -1.8225769996643066, + "step": 1211 + }, + { + "epoch": 1.14, + "grad_norm": 23.595455169677734, + "learning_rate": 3.4365162644281215e-07, + "logps/chosen": -42.342552185058594, + "logps/rejected": -61.444252014160156, + "loss": 0.4705, + "losses/dpo": 0.2707156538963318, + "losses/sft": 1.5560718774795532, + "losses/total": 0.2707156538963318, + "ref_logps/chosen": -32.79653549194336, + "ref_logps/rejected": -41.464778900146484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9546019434928894, + "rewards/margins": 1.0433454513549805, + "rewards/rejected": -1.9979474544525146, + "step": 1212 + }, + { + "epoch": 1.15, + "grad_norm": 27.435976028442383, + "learning_rate": 3.4347674011892267e-07, + "logps/chosen": -47.908782958984375, + "logps/rejected": -46.43934631347656, + "loss": 0.6044, + "losses/dpo": 0.545130729675293, + "losses/sft": 1.9446781873703003, + "losses/total": 0.545130729675293, + "ref_logps/chosen": -35.36150360107422, + "ref_logps/rejected": -30.26004981994629, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.254727840423584, + "rewards/margins": 0.36320173740386963, + "rewards/rejected": -1.6179295778274536, + "step": 1213 + }, + { + "epoch": 1.15, + "grad_norm": 25.55695915222168, + "learning_rate": 3.4330185379503323e-07, + "logps/chosen": -60.84926986694336, + "logps/rejected": -63.439491271972656, + "loss": 0.5527, + "losses/dpo": 0.9513847827911377, + "losses/sft": 1.7977392673492432, + "losses/total": 0.9513847827911377, + "ref_logps/chosen": -48.891082763671875, + "ref_logps/rejected": -44.028541564941406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1958189010620117, + "rewards/margins": 0.7452763319015503, + "rewards/rejected": -1.9410953521728516, + "step": 1214 + }, + { + "epoch": 1.15, + "grad_norm": 21.73856544494629, + "learning_rate": 3.4312696747114375e-07, + "logps/chosen": -62.00055694580078, + "logps/rejected": -74.59437561035156, + "loss": 0.3918, + "losses/dpo": 0.24058350920677185, + "losses/sft": 1.6256749629974365, + "losses/total": 0.24058350920677185, + "ref_logps/chosen": -48.43283462524414, + "ref_logps/rejected": -49.34600067138672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3567726612091064, + "rewards/margins": 1.168065071105957, + "rewards/rejected": -2.5248374938964844, + "step": 1215 + }, + { + "epoch": 1.15, + "grad_norm": 24.474241256713867, + "learning_rate": 3.4295208114725426e-07, + "logps/chosen": -50.98002624511719, + "logps/rejected": -68.3804702758789, + "loss": 0.4446, + "losses/dpo": 0.1954561471939087, + "losses/sft": 1.62507963180542, + "losses/total": 0.1954561471939087, + "ref_logps/chosen": -37.444149017333984, + "ref_logps/rejected": -43.96714782714844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3535881042480469, + "rewards/margins": 1.0877442359924316, + "rewards/rejected": -2.4413321018218994, + "step": 1216 + }, + { + "epoch": 1.15, + "grad_norm": 19.39516258239746, + "learning_rate": 3.427771948233648e-07, + "logps/chosen": -49.55461120605469, + "logps/rejected": -75.17345428466797, + "loss": 0.3015, + "losses/dpo": 0.3323233425617218, + "losses/sft": 2.0071802139282227, + "losses/total": 0.3323233425617218, + "ref_logps/chosen": -40.910945892333984, + "ref_logps/rejected": -53.50979995727539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8643665313720703, + "rewards/margins": 1.3019989728927612, + "rewards/rejected": -2.166365623474121, + "step": 1217 + }, + { + "epoch": 1.15, + "grad_norm": 22.340932846069336, + "learning_rate": 3.426023084994753e-07, + "logps/chosen": -50.58102798461914, + "logps/rejected": -67.12346649169922, + "loss": 0.4652, + "losses/dpo": 0.7647037506103516, + "losses/sft": 1.7319880723953247, + "losses/total": 0.7647037506103516, + "ref_logps/chosen": -38.5652961730957, + "ref_logps/rejected": -46.59025573730469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.201573371887207, + "rewards/margins": 0.8517480492591858, + "rewards/rejected": -2.053321361541748, + "step": 1218 + }, + { + "epoch": 1.15, + "grad_norm": 25.574398040771484, + "learning_rate": 3.4242742217558585e-07, + "logps/chosen": -48.58213806152344, + "logps/rejected": -71.12791442871094, + "loss": 0.4387, + "losses/dpo": 0.19296759366989136, + "losses/sft": 1.4935076236724854, + "losses/total": 0.19296759366989136, + "ref_logps/chosen": -35.044158935546875, + "ref_logps/rejected": -46.30248260498047, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3537980318069458, + "rewards/margins": 1.1287453174591064, + "rewards/rejected": -2.482543468475342, + "step": 1219 + }, + { + "epoch": 1.15, + "grad_norm": 24.586471557617188, + "learning_rate": 3.422525358516964e-07, + "logps/chosen": -48.48291015625, + "logps/rejected": -77.8056640625, + "loss": 0.3786, + "losses/dpo": 0.10799476504325867, + "losses/sft": 1.7551960945129395, + "losses/total": 0.10799476504325867, + "ref_logps/chosen": -39.020484924316406, + "ref_logps/rejected": -53.53319549560547, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9462423324584961, + "rewards/margins": 1.4810049533843994, + "rewards/rejected": -2.4272472858428955, + "step": 1220 + }, + { + "epoch": 1.15, + "grad_norm": 22.86333465576172, + "learning_rate": 3.4207764952780693e-07, + "logps/chosen": -53.229713439941406, + "logps/rejected": -67.08908081054688, + "loss": 0.4879, + "losses/dpo": 0.41523340344429016, + "losses/sft": 2.0493810176849365, + "losses/total": 0.41523340344429016, + "ref_logps/chosen": -41.48245620727539, + "ref_logps/rejected": -41.98067855834961, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.174725890159607, + "rewards/margins": 1.3361148834228516, + "rewards/rejected": -2.510840654373169, + "step": 1221 + }, + { + "epoch": 1.15, + "grad_norm": 20.58643913269043, + "learning_rate": 3.4190276320391744e-07, + "logps/chosen": -41.96229553222656, + "logps/rejected": -65.61783599853516, + "loss": 0.3209, + "losses/dpo": 0.22559937834739685, + "losses/sft": 1.1402311325073242, + "losses/total": 0.22559937834739685, + "ref_logps/chosen": -33.453243255615234, + "ref_logps/rejected": -42.23716735839844, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.850905179977417, + "rewards/margins": 1.48716139793396, + "rewards/rejected": -2.338066339492798, + "step": 1222 + }, + { + "epoch": 1.15, + "grad_norm": 27.87337875366211, + "learning_rate": 3.4172787688002795e-07, + "logps/chosen": -48.875816345214844, + "logps/rejected": -60.657806396484375, + "loss": 0.5726, + "losses/dpo": 0.2870476245880127, + "losses/sft": 1.8599473237991333, + "losses/total": 0.2870476245880127, + "ref_logps/chosen": -36.14516067504883, + "ref_logps/rejected": -42.29712677001953, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2730658054351807, + "rewards/margins": 0.5630024671554565, + "rewards/rejected": -1.8360682725906372, + "step": 1223 + }, + { + "epoch": 1.16, + "grad_norm": 26.35036277770996, + "learning_rate": 3.415529905561385e-07, + "logps/chosen": -56.7800178527832, + "logps/rejected": -77.70372009277344, + "loss": 0.6803, + "losses/dpo": 0.25999119877815247, + "losses/sft": 1.5244412422180176, + "losses/total": 0.25999119877815247, + "ref_logps/chosen": -37.98125457763672, + "ref_logps/rejected": -49.28694152832031, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8798760175704956, + "rewards/margins": 0.9618018269538879, + "rewards/rejected": -2.8416779041290283, + "step": 1224 + }, + { + "epoch": 1.16, + "grad_norm": 24.638765335083008, + "learning_rate": 3.41378104232249e-07, + "logps/chosen": -53.67375946044922, + "logps/rejected": -63.890159606933594, + "loss": 0.5792, + "losses/dpo": 0.45446139574050903, + "losses/sft": 1.8416240215301514, + "losses/total": 0.45446139574050903, + "ref_logps/chosen": -35.34421157836914, + "ref_logps/rejected": -41.183658599853516, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8329551219940186, + "rewards/margins": 0.4376949965953827, + "rewards/rejected": -2.2706503868103027, + "step": 1225 + }, + { + "epoch": 1.16, + "grad_norm": 17.07170867919922, + "learning_rate": 3.4120321790835954e-07, + "logps/chosen": -44.629207611083984, + "logps/rejected": -64.49826049804688, + "loss": 0.3347, + "losses/dpo": 0.41418763995170593, + "losses/sft": 1.9355415105819702, + "losses/total": 0.41418763995170593, + "ref_logps/chosen": -35.71815490722656, + "ref_logps/rejected": -42.28117370605469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.891105055809021, + "rewards/margins": 1.3306039571762085, + "rewards/rejected": -2.2217092514038086, + "step": 1226 + }, + { + "epoch": 1.16, + "grad_norm": 21.43756866455078, + "learning_rate": 3.410283315844701e-07, + "logps/chosen": -49.661277770996094, + "logps/rejected": -54.75538635253906, + "loss": 0.4956, + "losses/dpo": 0.5396450757980347, + "losses/sft": 1.9451037645339966, + "losses/total": 0.5396450757980347, + "ref_logps/chosen": -39.7839469909668, + "ref_logps/rejected": -37.6505241394043, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9877331852912903, + "rewards/margins": 0.7227531671524048, + "rewards/rejected": -1.7104862928390503, + "step": 1227 + }, + { + "epoch": 1.16, + "grad_norm": 18.683822631835938, + "learning_rate": 3.408534452605806e-07, + "logps/chosen": -43.6932487487793, + "logps/rejected": -66.22261047363281, + "loss": 0.3186, + "losses/dpo": 0.4445999264717102, + "losses/sft": 1.171536922454834, + "losses/total": 0.4445999264717102, + "ref_logps/chosen": -35.19602966308594, + "ref_logps/rejected": -43.317169189453125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8497219085693359, + "rewards/margins": 1.4408226013183594, + "rewards/rejected": -2.290544271469116, + "step": 1228 + }, + { + "epoch": 1.16, + "grad_norm": 25.496213912963867, + "learning_rate": 3.4067855893669113e-07, + "logps/chosen": -55.74542999267578, + "logps/rejected": -69.71067810058594, + "loss": 0.4917, + "losses/dpo": 0.5883376598358154, + "losses/sft": 1.744455099105835, + "losses/total": 0.5883376598358154, + "ref_logps/chosen": -43.384056091308594, + "ref_logps/rejected": -48.68836975097656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2361371517181396, + "rewards/margins": 0.8660933971405029, + "rewards/rejected": -2.1022305488586426, + "step": 1229 + }, + { + "epoch": 1.16, + "grad_norm": 23.95579719543457, + "learning_rate": 3.4050367261280165e-07, + "logps/chosen": -42.41576385498047, + "logps/rejected": -55.842376708984375, + "loss": 0.4732, + "losses/dpo": 0.5958298444747925, + "losses/sft": 1.346472144126892, + "losses/total": 0.5958298444747925, + "ref_logps/chosen": -34.13622283935547, + "ref_logps/rejected": -39.37353515625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8279539942741394, + "rewards/margins": 0.8189303278923035, + "rewards/rejected": -1.6468843221664429, + "step": 1230 + }, + { + "epoch": 1.16, + "grad_norm": 23.343685150146484, + "learning_rate": 3.403287862889122e-07, + "logps/chosen": -55.46833038330078, + "logps/rejected": -67.71742248535156, + "loss": 0.4914, + "losses/dpo": 0.282474547624588, + "losses/sft": 2.198577642440796, + "losses/total": 0.282474547624588, + "ref_logps/chosen": -44.964542388916016, + "ref_logps/rejected": -47.52948760986328, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0503791570663452, + "rewards/margins": 0.9684140086174011, + "rewards/rejected": -2.0187931060791016, + "step": 1231 + }, + { + "epoch": 1.16, + "grad_norm": 23.989164352416992, + "learning_rate": 3.4015389996502267e-07, + "logps/chosen": -57.11771011352539, + "logps/rejected": -66.72931671142578, + "loss": 0.4929, + "losses/dpo": 0.4271523058414459, + "losses/sft": 2.0742557048797607, + "losses/total": 0.4271523058414459, + "ref_logps/chosen": -44.4672737121582, + "ref_logps/rejected": -45.18669509887695, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2650437355041504, + "rewards/margins": 0.8892186880111694, + "rewards/rejected": -2.1542623043060303, + "step": 1232 + }, + { + "epoch": 1.16, + "grad_norm": 16.49856948852539, + "learning_rate": 3.3997901364113324e-07, + "logps/chosen": -52.76203918457031, + "logps/rejected": -64.7552490234375, + "loss": 0.3319, + "losses/dpo": 0.1526704579591751, + "losses/sft": 1.746389389038086, + "losses/total": 0.1526704579591751, + "ref_logps/chosen": -43.64642333984375, + "ref_logps/rejected": -42.858543395996094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.911561906337738, + "rewards/margins": 1.2781083583831787, + "rewards/rejected": -2.1896703243255615, + "step": 1233 + }, + { + "epoch": 1.17, + "grad_norm": 26.359237670898438, + "learning_rate": 3.398041273172438e-07, + "logps/chosen": -42.836246490478516, + "logps/rejected": -44.02879333496094, + "loss": 0.596, + "losses/dpo": 0.45190829038619995, + "losses/sft": 1.4974873065948486, + "losses/total": 0.45190829038619995, + "ref_logps/chosen": -31.9793701171875, + "ref_logps/rejected": -28.331953048706055, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.0856877565383911, + "rewards/margins": 0.4839964807033539, + "rewards/rejected": -1.5696842670440674, + "step": 1234 + }, + { + "epoch": 1.17, + "grad_norm": 22.135757446289062, + "learning_rate": 3.396292409933543e-07, + "logps/chosen": -46.788543701171875, + "logps/rejected": -75.48374938964844, + "loss": 0.3362, + "losses/dpo": 0.43912792205810547, + "losses/sft": 1.712257742881775, + "losses/total": 0.43912792205810547, + "ref_logps/chosen": -37.78060531616211, + "ref_logps/rejected": -51.97429656982422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.900794267654419, + "rewards/margins": 1.450150728225708, + "rewards/rejected": -2.350944995880127, + "step": 1235 + }, + { + "epoch": 1.17, + "grad_norm": 21.167810440063477, + "learning_rate": 3.3945435466946483e-07, + "logps/chosen": -62.74198532104492, + "logps/rejected": -78.19017028808594, + "loss": 0.3671, + "losses/dpo": 0.26451680064201355, + "losses/sft": 1.9652423858642578, + "losses/total": 0.26451680064201355, + "ref_logps/chosen": -46.7498779296875, + "ref_logps/rejected": -49.8843994140625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.599210500717163, + "rewards/margins": 1.2313660383224487, + "rewards/rejected": -2.8305766582489014, + "step": 1236 + }, + { + "epoch": 1.17, + "grad_norm": 24.189664840698242, + "learning_rate": 3.3927946834557534e-07, + "logps/chosen": -61.61280059814453, + "logps/rejected": -63.16066360473633, + "loss": 0.52, + "losses/dpo": 1.0029711723327637, + "losses/sft": 1.8602609634399414, + "losses/total": 1.0029711723327637, + "ref_logps/chosen": -49.45880126953125, + "ref_logps/rejected": -41.41373062133789, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2154000997543335, + "rewards/margins": 0.9592931866645813, + "rewards/rejected": -2.1746931076049805, + "step": 1237 + }, + { + "epoch": 1.17, + "grad_norm": 18.63071632385254, + "learning_rate": 3.391045820216859e-07, + "logps/chosen": -51.032657623291016, + "logps/rejected": -67.61209106445312, + "loss": 0.4245, + "losses/dpo": 0.31612345576286316, + "losses/sft": 1.7192654609680176, + "losses/total": 0.31612345576286316, + "ref_logps/chosen": -39.01557159423828, + "ref_logps/rejected": -44.32916259765625, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2017087936401367, + "rewards/margins": 1.1265840530395508, + "rewards/rejected": -2.3282928466796875, + "step": 1238 + }, + { + "epoch": 1.17, + "grad_norm": 17.873682022094727, + "learning_rate": 3.3892969569779637e-07, + "logps/chosen": -51.54199981689453, + "logps/rejected": -69.31196594238281, + "loss": 0.3382, + "losses/dpo": 0.16271719336509705, + "losses/sft": 1.8073300123214722, + "losses/total": 0.16271719336509705, + "ref_logps/chosen": -41.933921813964844, + "ref_logps/rejected": -44.349037170410156, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9608081579208374, + "rewards/margins": 1.535484790802002, + "rewards/rejected": -2.49629282951355, + "step": 1239 + }, + { + "epoch": 1.17, + "grad_norm": 25.454572677612305, + "learning_rate": 3.3875480937390693e-07, + "logps/chosen": -46.22962951660156, + "logps/rejected": -56.698699951171875, + "loss": 0.544, + "losses/dpo": 0.482551634311676, + "losses/sft": 1.4616835117340088, + "losses/total": 0.482551634311676, + "ref_logps/chosen": -36.28697967529297, + "ref_logps/rejected": -39.95439910888672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9942651987075806, + "rewards/margins": 0.6801650524139404, + "rewards/rejected": -1.674430251121521, + "step": 1240 + }, + { + "epoch": 1.17, + "grad_norm": 15.92263126373291, + "learning_rate": 3.385799230500175e-07, + "logps/chosen": -41.26055145263672, + "logps/rejected": -71.25456237792969, + "loss": 0.2884, + "losses/dpo": 0.3872529864311218, + "losses/sft": 2.4158272743225098, + "losses/total": 0.3872529864311218, + "ref_logps/chosen": -32.057342529296875, + "ref_logps/rejected": -44.658355712890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9203210473060608, + "rewards/margins": 1.7392998933792114, + "rewards/rejected": -2.659621000289917, + "step": 1241 + }, + { + "epoch": 1.17, + "grad_norm": 18.061687469482422, + "learning_rate": 3.38405036726128e-07, + "logps/chosen": -41.618431091308594, + "logps/rejected": -57.597293853759766, + "loss": 0.4316, + "losses/dpo": 0.7112546563148499, + "losses/sft": 1.568984031677246, + "losses/total": 0.7112546563148499, + "ref_logps/chosen": -33.12033462524414, + "ref_logps/rejected": -38.881324768066406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8498092889785767, + "rewards/margins": 1.0217880010604858, + "rewards/rejected": -1.871597409248352, + "step": 1242 + }, + { + "epoch": 1.17, + "grad_norm": 33.961753845214844, + "learning_rate": 3.382301504022385e-07, + "logps/chosen": -58.7257080078125, + "logps/rejected": -63.61299133300781, + "loss": 0.6259, + "losses/dpo": 0.8247978687286377, + "losses/sft": 1.817539930343628, + "losses/total": 0.8247978687286377, + "ref_logps/chosen": -45.45909881591797, + "ref_logps/rejected": -42.36334228515625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3266608715057373, + "rewards/margins": 0.7983040809631348, + "rewards/rejected": -2.124964952468872, + "step": 1243 + }, + { + "epoch": 1.17, + "grad_norm": 19.390769958496094, + "learning_rate": 3.3805526407834904e-07, + "logps/chosen": -45.50224304199219, + "logps/rejected": -69.91033172607422, + "loss": 0.4912, + "losses/dpo": 0.5417351126670837, + "losses/sft": 1.3218685388565063, + "losses/total": 0.5417351126670837, + "ref_logps/chosen": -35.31776428222656, + "ref_logps/rejected": -50.800045013427734, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0184478759765625, + "rewards/margins": 0.8925809264183044, + "rewards/rejected": -1.9110288619995117, + "step": 1244 + }, + { + "epoch": 1.18, + "grad_norm": 26.61286163330078, + "learning_rate": 3.378803777544596e-07, + "logps/chosen": -49.75404357910156, + "logps/rejected": -57.819114685058594, + "loss": 0.5683, + "losses/dpo": 0.47424811124801636, + "losses/sft": 1.544967532157898, + "losses/total": 0.47424811124801636, + "ref_logps/chosen": -40.68010711669922, + "ref_logps/rejected": -42.18236541748047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9073936939239502, + "rewards/margins": 0.6562814712524414, + "rewards/rejected": -1.5636751651763916, + "step": 1245 + }, + { + "epoch": 1.18, + "grad_norm": 20.838897705078125, + "learning_rate": 3.377054914305701e-07, + "logps/chosen": -55.13612747192383, + "logps/rejected": -88.3150634765625, + "loss": 0.3553, + "losses/dpo": 0.16532008349895477, + "losses/sft": 1.6109546422958374, + "losses/total": 0.16532008349895477, + "ref_logps/chosen": -41.640995025634766, + "ref_logps/rejected": -63.374874114990234, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.349513053894043, + "rewards/margins": 1.1445056200027466, + "rewards/rejected": -2.494018793106079, + "step": 1246 + }, + { + "epoch": 1.18, + "grad_norm": 21.849136352539062, + "learning_rate": 3.3753060510668063e-07, + "logps/chosen": -37.76445770263672, + "logps/rejected": -56.318603515625, + "loss": 0.58, + "losses/dpo": 0.556659996509552, + "losses/sft": 1.354345679283142, + "losses/total": 0.556659996509552, + "ref_logps/chosen": -28.26213836669922, + "ref_logps/rejected": -38.53440856933594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.950231671333313, + "rewards/margins": 0.8281873464584351, + "rewards/rejected": -1.7784191370010376, + "step": 1247 + }, + { + "epoch": 1.18, + "grad_norm": 16.14711570739746, + "learning_rate": 3.373557187827912e-07, + "logps/chosen": -34.53186798095703, + "logps/rejected": -58.24977111816406, + "loss": 0.3753, + "losses/dpo": 0.19008731842041016, + "losses/sft": 1.8000578880310059, + "losses/total": 0.19008731842041016, + "ref_logps/chosen": -24.019250869750977, + "ref_logps/rejected": -37.30735397338867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0512619018554688, + "rewards/margins": 1.0429797172546387, + "rewards/rejected": -2.0942416191101074, + "step": 1248 + }, + { + "epoch": 1.18, + "grad_norm": 19.2435245513916, + "learning_rate": 3.371808324589017e-07, + "logps/chosen": -49.262107849121094, + "logps/rejected": -73.62611389160156, + "loss": 0.3579, + "losses/dpo": 0.49603015184402466, + "losses/sft": 1.5602797269821167, + "losses/total": 0.49603015184402466, + "ref_logps/chosen": -35.401824951171875, + "ref_logps/rejected": -48.3310661315918, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3860286474227905, + "rewards/margins": 1.1434768438339233, + "rewards/rejected": -2.529505491256714, + "step": 1249 + }, + { + "epoch": 1.18, + "grad_norm": 19.644094467163086, + "learning_rate": 3.370059461350122e-07, + "logps/chosen": -39.05671691894531, + "logps/rejected": -48.12267303466797, + "loss": 0.4691, + "losses/dpo": 0.4304007291793823, + "losses/sft": 2.091778039932251, + "losses/total": 0.4304007291793823, + "ref_logps/chosen": -31.152490615844727, + "ref_logps/rejected": -32.670867919921875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7904224395751953, + "rewards/margins": 0.7547580003738403, + "rewards/rejected": -1.545180320739746, + "step": 1250 + }, + { + "epoch": 1.18, + "grad_norm": 24.426603317260742, + "learning_rate": 3.3683105981112273e-07, + "logps/chosen": -49.237548828125, + "logps/rejected": -59.71967315673828, + "loss": 0.5087, + "losses/dpo": 0.8596978187561035, + "losses/sft": 2.3221054077148438, + "losses/total": 0.8596978187561035, + "ref_logps/chosen": -36.67880630493164, + "ref_logps/rejected": -39.929840087890625, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2558743953704834, + "rewards/margins": 0.7231088280677795, + "rewards/rejected": -1.9789831638336182, + "step": 1251 + }, + { + "epoch": 1.18, + "grad_norm": 18.18250274658203, + "learning_rate": 3.366561734872333e-07, + "logps/chosen": -44.187374114990234, + "logps/rejected": -61.64546203613281, + "loss": 0.3814, + "losses/dpo": 0.21781179308891296, + "losses/sft": 1.3680617809295654, + "losses/total": 0.21781179308891296, + "ref_logps/chosen": -35.716651916503906, + "ref_logps/rejected": -42.11402893066406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8470723628997803, + "rewards/margins": 1.1060707569122314, + "rewards/rejected": -1.9531432390213013, + "step": 1252 + }, + { + "epoch": 1.18, + "grad_norm": 21.00637435913086, + "learning_rate": 3.364812871633438e-07, + "logps/chosen": -41.177223205566406, + "logps/rejected": -61.0933837890625, + "loss": 0.4484, + "losses/dpo": 0.7239801287651062, + "losses/sft": 2.150071620941162, + "losses/total": 0.7239801287651062, + "ref_logps/chosen": -31.72400665283203, + "ref_logps/rejected": -39.310707092285156, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9453219771385193, + "rewards/margins": 1.2329456806182861, + "rewards/rejected": -2.178267478942871, + "step": 1253 + }, + { + "epoch": 1.18, + "grad_norm": 16.84493637084961, + "learning_rate": 3.363064008394543e-07, + "logps/chosen": -33.50115203857422, + "logps/rejected": -46.99577331542969, + "loss": 0.4482, + "losses/dpo": 0.5594444274902344, + "losses/sft": 1.4713568687438965, + "losses/total": 0.5594444274902344, + "ref_logps/chosen": -26.529706954956055, + "ref_logps/rejected": -29.561607360839844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6971445083618164, + "rewards/margins": 1.0462722778320312, + "rewards/rejected": -1.7434167861938477, + "step": 1254 + }, + { + "epoch": 1.19, + "grad_norm": 24.219676971435547, + "learning_rate": 3.361315145155649e-07, + "logps/chosen": -51.231842041015625, + "logps/rejected": -53.90184020996094, + "loss": 0.5889, + "losses/dpo": 0.892292857170105, + "losses/sft": 1.986249566078186, + "losses/total": 0.892292857170105, + "ref_logps/chosen": -38.029048919677734, + "ref_logps/rejected": -33.53123474121094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.320279598236084, + "rewards/margins": 0.7167806625366211, + "rewards/rejected": -2.037060260772705, + "step": 1255 + }, + { + "epoch": 1.19, + "grad_norm": 21.42949104309082, + "learning_rate": 3.359566281916754e-07, + "logps/chosen": -47.90787887573242, + "logps/rejected": -60.8084716796875, + "loss": 0.4334, + "losses/dpo": 0.3060031831264496, + "losses/sft": 1.823817491531372, + "losses/total": 0.3060031831264496, + "ref_logps/chosen": -38.683349609375, + "ref_logps/rejected": -40.10588073730469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9224532246589661, + "rewards/margins": 1.1478058099746704, + "rewards/rejected": -2.0702590942382812, + "step": 1256 + }, + { + "epoch": 1.19, + "grad_norm": 15.512833595275879, + "learning_rate": 3.357817418677859e-07, + "logps/chosen": -43.582130432128906, + "logps/rejected": -81.34471130371094, + "loss": 0.246, + "losses/dpo": 0.11325369775295258, + "losses/sft": 1.1968235969543457, + "losses/total": 0.11325369775295258, + "ref_logps/chosen": -32.442893981933594, + "ref_logps/rejected": -51.03504943847656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1139239072799683, + "rewards/margins": 1.9170420169830322, + "rewards/rejected": -3.030965805053711, + "step": 1257 + }, + { + "epoch": 1.19, + "grad_norm": 28.494823455810547, + "learning_rate": 3.356068555438964e-07, + "logps/chosen": -55.90460968017578, + "logps/rejected": -67.21074676513672, + "loss": 0.5795, + "losses/dpo": 0.6859239339828491, + "losses/sft": 1.5206575393676758, + "losses/total": 0.6859239339828491, + "ref_logps/chosen": -41.750732421875, + "ref_logps/rejected": -45.588375091552734, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4153876304626465, + "rewards/margins": 0.7468500137329102, + "rewards/rejected": -2.1622376441955566, + "step": 1258 + }, + { + "epoch": 1.19, + "grad_norm": 17.048093795776367, + "learning_rate": 3.35431969220007e-07, + "logps/chosen": -40.54545593261719, + "logps/rejected": -58.8145751953125, + "loss": 0.3684, + "losses/dpo": 0.15894848108291626, + "losses/sft": 1.6647748947143555, + "losses/total": 0.15894848108291626, + "ref_logps/chosen": -30.776290893554688, + "ref_logps/rejected": -39.05743408203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9769164323806763, + "rewards/margins": 0.9987971186637878, + "rewards/rejected": -1.9757134914398193, + "step": 1259 + }, + { + "epoch": 1.19, + "grad_norm": 19.45284080505371, + "learning_rate": 3.352570828961175e-07, + "logps/chosen": -37.42103958129883, + "logps/rejected": -52.38259506225586, + "loss": 0.4916, + "losses/dpo": 0.505672812461853, + "losses/sft": 1.741233468055725, + "losses/total": 0.505672812461853, + "ref_logps/chosen": -28.626670837402344, + "ref_logps/rejected": -35.35721969604492, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8794369697570801, + "rewards/margins": 0.8231004476547241, + "rewards/rejected": -1.7025374174118042, + "step": 1260 + }, + { + "epoch": 1.19, + "grad_norm": 28.87559700012207, + "learning_rate": 3.35082196572228e-07, + "logps/chosen": -51.134605407714844, + "logps/rejected": -61.78376770019531, + "loss": 0.5852, + "losses/dpo": 1.0553419589996338, + "losses/sft": 2.0093259811401367, + "losses/total": 1.0553419589996338, + "ref_logps/chosen": -37.38733673095703, + "ref_logps/rejected": -41.00181198120117, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3747273683547974, + "rewards/margins": 0.7034682035446167, + "rewards/rejected": -2.078195571899414, + "step": 1261 + }, + { + "epoch": 1.19, + "grad_norm": 25.339797973632812, + "learning_rate": 3.349073102483386e-07, + "logps/chosen": -46.318939208984375, + "logps/rejected": -49.26006317138672, + "loss": 0.6336, + "losses/dpo": 0.5517014265060425, + "losses/sft": 1.5738214254379272, + "losses/total": 0.5517014265060425, + "ref_logps/chosen": -35.05769348144531, + "ref_logps/rejected": -32.71645736694336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1261239051818848, + "rewards/margins": 0.5282371044158936, + "rewards/rejected": -1.6543610095977783, + "step": 1262 + }, + { + "epoch": 1.19, + "grad_norm": 11.409197807312012, + "learning_rate": 3.347324239244491e-07, + "logps/chosen": -44.18559265136719, + "logps/rejected": -73.9047622680664, + "loss": 0.2993, + "losses/dpo": 0.1533624827861786, + "losses/sft": 1.511976957321167, + "losses/total": 0.1533624827861786, + "ref_logps/chosen": -36.915496826171875, + "ref_logps/rejected": -49.74488067626953, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7270100116729736, + "rewards/margins": 1.6889781951904297, + "rewards/rejected": -2.4159882068634033, + "step": 1263 + }, + { + "epoch": 1.19, + "grad_norm": 16.06197166442871, + "learning_rate": 3.345575376005596e-07, + "logps/chosen": -39.298484802246094, + "logps/rejected": -72.40126037597656, + "loss": 0.3503, + "losses/dpo": 0.23409026861190796, + "losses/sft": 1.4718996286392212, + "losses/total": 0.23409026861190796, + "ref_logps/chosen": -29.222522735595703, + "ref_logps/rejected": -49.64337158203125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0075957775115967, + "rewards/margins": 1.2681934833526611, + "rewards/rejected": -2.275789260864258, + "step": 1264 + }, + { + "epoch": 1.19, + "grad_norm": 21.85924530029297, + "learning_rate": 3.3438265127667017e-07, + "logps/chosen": -52.99810028076172, + "logps/rejected": -59.741310119628906, + "loss": 0.4037, + "losses/dpo": 0.6012160778045654, + "losses/sft": 2.290933132171631, + "losses/total": 0.6012160778045654, + "ref_logps/chosen": -41.28215026855469, + "ref_logps/rejected": -36.79560089111328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1715948581695557, + "rewards/margins": 1.1229760646820068, + "rewards/rejected": -2.2945709228515625, + "step": 1265 + }, + { + "epoch": 1.2, + "grad_norm": 17.8614501953125, + "learning_rate": 3.342077649527807e-07, + "logps/chosen": -41.201026916503906, + "logps/rejected": -48.78249740600586, + "loss": 0.4047, + "losses/dpo": 0.5122185349464417, + "losses/sft": 1.697983741760254, + "losses/total": 0.5122185349464417, + "ref_logps/chosen": -35.62199401855469, + "ref_logps/rejected": -33.27435302734375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5579028725624084, + "rewards/margins": 0.992911696434021, + "rewards/rejected": -1.5508146286010742, + "step": 1266 + }, + { + "epoch": 1.2, + "grad_norm": 15.121383666992188, + "learning_rate": 3.340328786288912e-07, + "logps/chosen": -48.94519805908203, + "logps/rejected": -60.90454864501953, + "loss": 0.3843, + "losses/dpo": 0.3824542760848999, + "losses/sft": 1.8560597896575928, + "losses/total": 0.3824542760848999, + "ref_logps/chosen": -39.710853576660156, + "ref_logps/rejected": -41.079078674316406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9234342575073242, + "rewards/margins": 1.059112787246704, + "rewards/rejected": -1.9825471639633179, + "step": 1267 + }, + { + "epoch": 1.2, + "grad_norm": 14.699516296386719, + "learning_rate": 3.338579923050017e-07, + "logps/chosen": -47.368682861328125, + "logps/rejected": -55.56596374511719, + "loss": 0.4032, + "losses/dpo": 0.6939573287963867, + "losses/sft": 2.2385778427124023, + "losses/total": 0.6939573287963867, + "ref_logps/chosen": -37.04937744140625, + "ref_logps/rejected": -34.13915252685547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0319303274154663, + "rewards/margins": 1.110750436782837, + "rewards/rejected": -2.1426808834075928, + "step": 1268 + }, + { + "epoch": 1.2, + "grad_norm": 17.357484817504883, + "learning_rate": 3.336831059811123e-07, + "logps/chosen": -44.28623580932617, + "logps/rejected": -64.76651763916016, + "loss": 0.3356, + "losses/dpo": 0.3124982416629791, + "losses/sft": 1.9567397832870483, + "losses/total": 0.3124982416629791, + "ref_logps/chosen": -31.228628158569336, + "ref_logps/rejected": -39.99836730957031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3057608604431152, + "rewards/margins": 1.1710538864135742, + "rewards/rejected": -2.4768147468566895, + "step": 1269 + }, + { + "epoch": 1.2, + "grad_norm": 19.059419631958008, + "learning_rate": 3.335082196572228e-07, + "logps/chosen": -52.90300750732422, + "logps/rejected": -71.27776336669922, + "loss": 0.3608, + "losses/dpo": 0.24609430134296417, + "losses/sft": 1.595199704170227, + "losses/total": 0.24609430134296417, + "ref_logps/chosen": -37.32032012939453, + "ref_logps/rejected": -43.765377044677734, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.558268666267395, + "rewards/margins": 1.1929700374603271, + "rewards/rejected": -2.7512388229370117, + "step": 1270 + }, + { + "epoch": 1.2, + "grad_norm": 25.79266929626465, + "learning_rate": 3.333333333333333e-07, + "logps/chosen": -64.91693115234375, + "logps/rejected": -75.42364501953125, + "loss": 0.4968, + "losses/dpo": 0.34554123878479004, + "losses/sft": 1.9924956560134888, + "losses/total": 0.34554123878479004, + "ref_logps/chosen": -47.75408935546875, + "ref_logps/rejected": -50.0391731262207, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7162843942642212, + "rewards/margins": 0.822162926197052, + "rewards/rejected": -2.538447380065918, + "step": 1271 + }, + { + "epoch": 1.2, + "grad_norm": 21.458112716674805, + "learning_rate": 3.3315844700944387e-07, + "logps/chosen": -52.42706298828125, + "logps/rejected": -64.53559875488281, + "loss": 0.4788, + "losses/dpo": 0.2602001428604126, + "losses/sft": 1.9368573427200317, + "losses/total": 0.2602001428604126, + "ref_logps/chosen": -39.55405044555664, + "ref_logps/rejected": -43.710662841796875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2873010635375977, + "rewards/margins": 0.795192539691925, + "rewards/rejected": -2.082493782043457, + "step": 1272 + }, + { + "epoch": 1.2, + "grad_norm": 22.87200927734375, + "learning_rate": 3.329835606855544e-07, + "logps/chosen": -56.033592224121094, + "logps/rejected": -70.26383209228516, + "loss": 0.4821, + "losses/dpo": 0.4073163866996765, + "losses/sft": 1.952742576599121, + "losses/total": 0.4073163866996765, + "ref_logps/chosen": -40.43696212768555, + "ref_logps/rejected": -44.102195739746094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5596632957458496, + "rewards/margins": 1.0565005540847778, + "rewards/rejected": -2.616163730621338, + "step": 1273 + }, + { + "epoch": 1.2, + "grad_norm": 20.100311279296875, + "learning_rate": 3.328086743616649e-07, + "logps/chosen": -52.1953125, + "logps/rejected": -70.45608520507812, + "loss": 0.3996, + "losses/dpo": 0.4546908438205719, + "losses/sft": 1.8390356302261353, + "losses/total": 0.4546908438205719, + "ref_logps/chosen": -38.603729248046875, + "ref_logps/rejected": -45.85691833496094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3591587543487549, + "rewards/margins": 1.1007575988769531, + "rewards/rejected": -2.459916591644287, + "step": 1274 + }, + { + "epoch": 1.2, + "grad_norm": 25.579519271850586, + "learning_rate": 3.326337880377754e-07, + "logps/chosen": -54.416290283203125, + "logps/rejected": -63.55059814453125, + "loss": 0.4958, + "losses/dpo": 0.85321044921875, + "losses/sft": 1.9153268337249756, + "losses/total": 0.85321044921875, + "ref_logps/chosen": -41.786014556884766, + "ref_logps/rejected": -44.38050079345703, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2630276679992676, + "rewards/margins": 0.653982400894165, + "rewards/rejected": -1.9170100688934326, + "step": 1275 + }, + { + "epoch": 1.2, + "grad_norm": 17.778179168701172, + "learning_rate": 3.3245890171388597e-07, + "logps/chosen": -44.57392120361328, + "logps/rejected": -61.3828125, + "loss": 0.3441, + "losses/dpo": 0.3692394495010376, + "losses/sft": 1.4996060132980347, + "losses/total": 0.3692394495010376, + "ref_logps/chosen": -33.99726867675781, + "ref_logps/rejected": -38.345863342285156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0576651096343994, + "rewards/margins": 1.2460298538208008, + "rewards/rejected": -2.3036949634552, + "step": 1276 + }, + { + "epoch": 1.21, + "grad_norm": 15.322132110595703, + "learning_rate": 3.322840153899965e-07, + "logps/chosen": -48.347389221191406, + "logps/rejected": -68.76718139648438, + "loss": 0.2668, + "losses/dpo": 0.1739310324192047, + "losses/sft": 1.8910424709320068, + "losses/total": 0.1739310324192047, + "ref_logps/chosen": -38.66949462890625, + "ref_logps/rejected": -44.7320442199707, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9677895903587341, + "rewards/margins": 1.4357243776321411, + "rewards/rejected": -2.4035139083862305, + "step": 1277 + }, + { + "epoch": 1.21, + "grad_norm": 24.36154556274414, + "learning_rate": 3.32109129066107e-07, + "logps/chosen": -48.66584777832031, + "logps/rejected": -68.83869934082031, + "loss": 0.4297, + "losses/dpo": 0.3207012414932251, + "losses/sft": 1.997493863105774, + "losses/total": 0.3207012414932251, + "ref_logps/chosen": -36.260589599609375, + "ref_logps/rejected": -46.24098205566406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2405259609222412, + "rewards/margins": 1.0192451477050781, + "rewards/rejected": -2.2597711086273193, + "step": 1278 + }, + { + "epoch": 1.21, + "grad_norm": 14.60727596282959, + "learning_rate": 3.3193424274221756e-07, + "logps/chosen": -47.10565185546875, + "logps/rejected": -75.04266357421875, + "loss": 0.2979, + "losses/dpo": 0.30064719915390015, + "losses/sft": 1.6382322311401367, + "losses/total": 0.30064719915390015, + "ref_logps/chosen": -34.7554931640625, + "ref_logps/rejected": -46.60670471191406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.235015869140625, + "rewards/margins": 1.6085803508758545, + "rewards/rejected": -2.8435959815979004, + "step": 1279 + }, + { + "epoch": 1.21, + "grad_norm": 19.049394607543945, + "learning_rate": 3.3175935641832807e-07, + "logps/chosen": -42.42154312133789, + "logps/rejected": -57.276573181152344, + "loss": 0.3747, + "losses/dpo": 0.4905742406845093, + "losses/sft": 2.6760754585266113, + "losses/total": 0.4905742406845093, + "ref_logps/chosen": -31.650192260742188, + "ref_logps/rejected": -35.149505615234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0771350860595703, + "rewards/margins": 1.1355715990066528, + "rewards/rejected": -2.2127065658569336, + "step": 1280 + }, + { + "epoch": 1.21, + "grad_norm": 17.385385513305664, + "learning_rate": 3.315844700944386e-07, + "logps/chosen": -38.581573486328125, + "logps/rejected": -62.73283004760742, + "loss": 0.3966, + "losses/dpo": 0.558727502822876, + "losses/sft": 2.257667064666748, + "losses/total": 0.558727502822876, + "ref_logps/chosen": -28.25278091430664, + "ref_logps/rejected": -38.52047348022461, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0328795909881592, + "rewards/margins": 1.3883559703826904, + "rewards/rejected": -2.4212355613708496, + "step": 1281 + }, + { + "epoch": 1.21, + "grad_norm": 21.758419036865234, + "learning_rate": 3.314095837705491e-07, + "logps/chosen": -43.095306396484375, + "logps/rejected": -64.56269836425781, + "loss": 0.3912, + "losses/dpo": 0.4416448473930359, + "losses/sft": 1.790996789932251, + "losses/total": 0.4416448473930359, + "ref_logps/chosen": -29.484451293945312, + "ref_logps/rejected": -39.142303466796875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.361085295677185, + "rewards/margins": 1.1809539794921875, + "rewards/rejected": -2.542039394378662, + "step": 1282 + }, + { + "epoch": 1.21, + "grad_norm": 15.446070671081543, + "learning_rate": 3.3123469744665966e-07, + "logps/chosen": -54.28368377685547, + "logps/rejected": -66.2320556640625, + "loss": 0.3684, + "losses/dpo": 0.6027462482452393, + "losses/sft": 2.1526315212249756, + "losses/total": 0.6027462482452393, + "ref_logps/chosen": -41.75803756713867, + "ref_logps/rejected": -40.7514762878418, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2525646686553955, + "rewards/margins": 1.2954933643341064, + "rewards/rejected": -2.548058271408081, + "step": 1283 + }, + { + "epoch": 1.21, + "grad_norm": 29.56391143798828, + "learning_rate": 3.3105981112277023e-07, + "logps/chosen": -55.56463623046875, + "logps/rejected": -52.370445251464844, + "loss": 0.6312, + "losses/dpo": 0.940421462059021, + "losses/sft": 2.232243537902832, + "losses/total": 0.940421462059021, + "ref_logps/chosen": -40.911827087402344, + "ref_logps/rejected": -34.261634826660156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4652807712554932, + "rewards/margins": 0.34560006856918335, + "rewards/rejected": -1.8108808994293213, + "step": 1284 + }, + { + "epoch": 1.21, + "grad_norm": 17.717260360717773, + "learning_rate": 3.308849247988807e-07, + "logps/chosen": -48.872928619384766, + "logps/rejected": -74.94236755371094, + "loss": 0.2708, + "losses/dpo": 0.27558350563049316, + "losses/sft": 1.7259505987167358, + "losses/total": 0.27558350563049316, + "ref_logps/chosen": -36.185546875, + "ref_logps/rejected": -46.036842346191406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2687382698059082, + "rewards/margins": 1.6218140125274658, + "rewards/rejected": -2.890552282333374, + "step": 1285 + }, + { + "epoch": 1.21, + "grad_norm": 22.396329879760742, + "learning_rate": 3.3071003847499126e-07, + "logps/chosen": -51.96961975097656, + "logps/rejected": -74.1020278930664, + "loss": 0.5146, + "losses/dpo": 0.5004276633262634, + "losses/sft": 2.29833984375, + "losses/total": 0.5004276633262634, + "ref_logps/chosen": -34.832855224609375, + "ref_logps/rejected": -49.5932731628418, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7136763334274292, + "rewards/margins": 0.7371993064880371, + "rewards/rejected": -2.450875759124756, + "step": 1286 + }, + { + "epoch": 1.22, + "grad_norm": 15.472954750061035, + "learning_rate": 3.3053515215110177e-07, + "logps/chosen": -47.909706115722656, + "logps/rejected": -91.96464538574219, + "loss": 0.2595, + "losses/dpo": 0.10487969219684601, + "losses/sft": 1.825188159942627, + "losses/total": 0.10487969219684601, + "ref_logps/chosen": -34.9535026550293, + "ref_logps/rejected": -61.30604553222656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.295620083808899, + "rewards/margins": 1.770240068435669, + "rewards/rejected": -3.0658605098724365, + "step": 1287 + }, + { + "epoch": 1.22, + "grad_norm": 22.184104919433594, + "learning_rate": 3.303602658272123e-07, + "logps/chosen": -50.46155548095703, + "logps/rejected": -80.95024108886719, + "loss": 0.3384, + "losses/dpo": 0.18355077505111694, + "losses/sft": 1.5707978010177612, + "losses/total": 0.18355077505111694, + "ref_logps/chosen": -39.31085968017578, + "ref_logps/rejected": -54.36072540283203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1150697469711304, + "rewards/margins": 1.5438816547393799, + "rewards/rejected": -2.6589515209198, + "step": 1288 + }, + { + "epoch": 1.22, + "grad_norm": 21.306243896484375, + "learning_rate": 3.301853795033228e-07, + "logps/chosen": -43.325958251953125, + "logps/rejected": -68.84542083740234, + "loss": 0.3303, + "losses/dpo": 0.24994681775569916, + "losses/sft": 1.3414628505706787, + "losses/total": 0.24994681775569916, + "ref_logps/chosen": -30.669391632080078, + "ref_logps/rejected": -40.66756820678711, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2656567096710205, + "rewards/margins": 1.5521284341812134, + "rewards/rejected": -2.8177850246429443, + "step": 1289 + }, + { + "epoch": 1.22, + "grad_norm": 21.933183670043945, + "learning_rate": 3.3001049317943336e-07, + "logps/chosen": -49.80470275878906, + "logps/rejected": -60.630489349365234, + "loss": 0.5128, + "losses/dpo": 0.545912504196167, + "losses/sft": 1.558157205581665, + "losses/total": 0.545912504196167, + "ref_logps/chosen": -34.777557373046875, + "ref_logps/rejected": -37.527740478515625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.50271475315094, + "rewards/margins": 0.8075599074363708, + "rewards/rejected": -2.310274600982666, + "step": 1290 + }, + { + "epoch": 1.22, + "grad_norm": 23.52597999572754, + "learning_rate": 3.298356068555439e-07, + "logps/chosen": -51.77589416503906, + "logps/rejected": -69.64151763916016, + "loss": 0.4809, + "losses/dpo": 0.8141963481903076, + "losses/sft": 1.747296690940857, + "losses/total": 0.8141963481903076, + "ref_logps/chosen": -33.86619186401367, + "ref_logps/rejected": -42.238868713378906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.790969967842102, + "rewards/margins": 0.949295163154602, + "rewards/rejected": -2.740265369415283, + "step": 1291 + }, + { + "epoch": 1.22, + "grad_norm": 30.15660285949707, + "learning_rate": 3.296607205316544e-07, + "logps/chosen": -51.819515228271484, + "logps/rejected": -74.5934066772461, + "loss": 0.4627, + "losses/dpo": 0.306501179933548, + "losses/sft": 1.456073522567749, + "losses/total": 0.306501179933548, + "ref_logps/chosen": -37.83601760864258, + "ref_logps/rejected": -48.12083053588867, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3983497619628906, + "rewards/margins": 1.2489078044891357, + "rewards/rejected": -2.6472578048706055, + "step": 1292 + }, + { + "epoch": 1.22, + "grad_norm": 30.59558868408203, + "learning_rate": 3.2948583420776495e-07, + "logps/chosen": -53.75914001464844, + "logps/rejected": -62.064453125, + "loss": 0.6725, + "losses/dpo": 0.6996077299118042, + "losses/sft": 1.8936095237731934, + "losses/total": 0.6996077299118042, + "ref_logps/chosen": -38.6866569519043, + "ref_logps/rejected": -39.625144958496094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5072486400604248, + "rewards/margins": 0.7366818785667419, + "rewards/rejected": -2.2439303398132324, + "step": 1293 + }, + { + "epoch": 1.22, + "grad_norm": 19.79813575744629, + "learning_rate": 3.2931094788387546e-07, + "logps/chosen": -48.740421295166016, + "logps/rejected": -67.98152160644531, + "loss": 0.3642, + "losses/dpo": 0.4135620594024658, + "losses/sft": 2.6681344509124756, + "losses/total": 0.4135620594024658, + "ref_logps/chosen": -33.50664138793945, + "ref_logps/rejected": -41.82233810424805, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5233776569366455, + "rewards/margins": 1.092540979385376, + "rewards/rejected": -2.6159186363220215, + "step": 1294 + }, + { + "epoch": 1.22, + "grad_norm": 32.642818450927734, + "learning_rate": 3.29136061559986e-07, + "logps/chosen": -58.44609451293945, + "logps/rejected": -64.89666748046875, + "loss": 0.6469, + "losses/dpo": 0.526237428188324, + "losses/sft": 1.6568933725357056, + "losses/total": 0.526237428188324, + "ref_logps/chosen": -41.75444030761719, + "ref_logps/rejected": -41.766197204589844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6691652536392212, + "rewards/margins": 0.6438819766044617, + "rewards/rejected": -2.313047170639038, + "step": 1295 + }, + { + "epoch": 1.22, + "grad_norm": 22.285186767578125, + "learning_rate": 3.289611752360965e-07, + "logps/chosen": -45.24211883544922, + "logps/rejected": -68.41378021240234, + "loss": 0.4107, + "losses/dpo": 0.24445749819278717, + "losses/sft": 1.6323164701461792, + "losses/total": 0.24445749819278717, + "ref_logps/chosen": -33.608001708984375, + "ref_logps/rejected": -44.66828918457031, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.163412094116211, + "rewards/margins": 1.211136817932129, + "rewards/rejected": -2.37454891204834, + "step": 1296 + }, + { + "epoch": 1.22, + "grad_norm": 19.355915069580078, + "learning_rate": 3.2878628891220705e-07, + "logps/chosen": -56.947296142578125, + "logps/rejected": -81.64164733886719, + "loss": 0.3288, + "losses/dpo": 0.19625043869018555, + "losses/sft": 1.7529845237731934, + "losses/total": 0.19625043869018555, + "ref_logps/chosen": -42.715492248535156, + "ref_logps/rejected": -51.96495819091797, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4231808185577393, + "rewards/margins": 1.544487714767456, + "rewards/rejected": -2.9676685333251953, + "step": 1297 + }, + { + "epoch": 1.23, + "grad_norm": 15.184709548950195, + "learning_rate": 3.286114025883176e-07, + "logps/chosen": -43.13945388793945, + "logps/rejected": -76.0487289428711, + "loss": 0.2396, + "losses/dpo": 0.22111205756664276, + "losses/sft": 1.25008225440979, + "losses/total": 0.22111205756664276, + "ref_logps/chosen": -33.227134704589844, + "ref_logps/rejected": -49.567604064941406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9912323951721191, + "rewards/margins": 1.6568796634674072, + "rewards/rejected": -2.6481118202209473, + "step": 1298 + }, + { + "epoch": 1.23, + "grad_norm": 22.35556411743164, + "learning_rate": 3.284365162644281e-07, + "logps/chosen": -48.418701171875, + "logps/rejected": -66.66232299804688, + "loss": 0.4478, + "losses/dpo": 0.5131890773773193, + "losses/sft": 1.5699821710586548, + "losses/total": 0.5131890773773193, + "ref_logps/chosen": -36.57819747924805, + "ref_logps/rejected": -43.835689544677734, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1840505599975586, + "rewards/margins": 1.0986123085021973, + "rewards/rejected": -2.282662868499756, + "step": 1299 + }, + { + "epoch": 1.23, + "grad_norm": 18.5380916595459, + "learning_rate": 3.2826162994053864e-07, + "logps/chosen": -61.87382507324219, + "logps/rejected": -77.09158325195312, + "loss": 0.325, + "losses/dpo": 0.24416860938072205, + "losses/sft": 1.715246319770813, + "losses/total": 0.24416860938072205, + "ref_logps/chosen": -47.20370864868164, + "ref_logps/rejected": -48.25052261352539, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4670116901397705, + "rewards/margins": 1.4170947074890137, + "rewards/rejected": -2.884106397628784, + "step": 1300 + }, + { + "epoch": 1.23, + "grad_norm": 11.219528198242188, + "learning_rate": 3.2808674361664916e-07, + "logps/chosen": -37.850894927978516, + "logps/rejected": -75.34614562988281, + "loss": 0.2226, + "losses/dpo": 0.19624850153923035, + "losses/sft": 1.3619885444641113, + "losses/total": 0.19624850153923035, + "ref_logps/chosen": -29.011240005493164, + "ref_logps/rejected": -44.524391174316406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.883965253829956, + "rewards/margins": 2.1982107162475586, + "rewards/rejected": -3.0821757316589355, + "step": 1301 + }, + { + "epoch": 1.23, + "grad_norm": 18.190052032470703, + "learning_rate": 3.2791185729275967e-07, + "logps/chosen": -54.94478225708008, + "logps/rejected": -64.18688201904297, + "loss": 0.3208, + "losses/dpo": 0.2329922318458557, + "losses/sft": 2.344893455505371, + "losses/total": 0.2329922318458557, + "ref_logps/chosen": -42.08929443359375, + "ref_logps/rejected": -39.92123031616211, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2855491638183594, + "rewards/margins": 1.1410161256790161, + "rewards/rejected": -2.426565170288086, + "step": 1302 + }, + { + "epoch": 1.23, + "grad_norm": 23.517995834350586, + "learning_rate": 3.277369709688702e-07, + "logps/chosen": -46.17790985107422, + "logps/rejected": -67.5840072631836, + "loss": 0.471, + "losses/dpo": 0.3811895549297333, + "losses/sft": 1.5601413249969482, + "losses/total": 0.3811895549297333, + "ref_logps/chosen": -32.570960998535156, + "ref_logps/rejected": -43.74147033691406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3606951236724854, + "rewards/margins": 1.0235587358474731, + "rewards/rejected": -2.384253978729248, + "step": 1303 + }, + { + "epoch": 1.23, + "grad_norm": 28.837244033813477, + "learning_rate": 3.2756208464498075e-07, + "logps/chosen": -51.181602478027344, + "logps/rejected": -68.76593017578125, + "loss": 0.5232, + "losses/dpo": 0.3507500886917114, + "losses/sft": 1.9810177087783813, + "losses/total": 0.3507500886917114, + "ref_logps/chosen": -36.52156066894531, + "ref_logps/rejected": -45.390869140625, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4660041332244873, + "rewards/margins": 0.8715018033981323, + "rewards/rejected": -2.33750581741333, + "step": 1304 + }, + { + "epoch": 1.23, + "grad_norm": 18.90814971923828, + "learning_rate": 3.273871983210913e-07, + "logps/chosen": -43.88724899291992, + "logps/rejected": -69.35987854003906, + "loss": 0.3235, + "losses/dpo": 0.17113202810287476, + "losses/sft": 1.858979344367981, + "losses/total": 0.17113202810287476, + "ref_logps/chosen": -32.038177490234375, + "ref_logps/rejected": -43.49333190917969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1849069595336914, + "rewards/margins": 1.4017481803894043, + "rewards/rejected": -2.5866551399230957, + "step": 1305 + }, + { + "epoch": 1.23, + "grad_norm": 22.561180114746094, + "learning_rate": 3.2721231199720177e-07, + "logps/chosen": -53.358001708984375, + "logps/rejected": -76.11177825927734, + "loss": 0.4161, + "losses/dpo": 0.31508854031562805, + "losses/sft": 1.9650845527648926, + "losses/total": 0.31508854031562805, + "ref_logps/chosen": -40.74730682373047, + "ref_logps/rejected": -48.347511291503906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2610701322555542, + "rewards/margins": 1.5153571367263794, + "rewards/rejected": -2.7764272689819336, + "step": 1306 + }, + { + "epoch": 1.23, + "grad_norm": 19.188674926757812, + "learning_rate": 3.2703742567331234e-07, + "logps/chosen": -48.32026672363281, + "logps/rejected": -57.26163864135742, + "loss": 0.444, + "losses/dpo": 0.276664137840271, + "losses/sft": 1.4580862522125244, + "losses/total": 0.276664137840271, + "ref_logps/chosen": -37.229698181152344, + "ref_logps/rejected": -35.70116424560547, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1090574264526367, + "rewards/margins": 1.0469906330108643, + "rewards/rejected": -2.156047821044922, + "step": 1307 + }, + { + "epoch": 1.24, + "grad_norm": 21.35573959350586, + "learning_rate": 3.2686253934942285e-07, + "logps/chosen": -50.68305206298828, + "logps/rejected": -65.30364990234375, + "loss": 0.486, + "losses/dpo": 0.5735315084457397, + "losses/sft": 1.9887821674346924, + "losses/total": 0.5735315084457397, + "ref_logps/chosen": -36.80727005004883, + "ref_logps/rejected": -42.483665466308594, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3875783681869507, + "rewards/margins": 0.8944202661514282, + "rewards/rejected": -2.281998634338379, + "step": 1308 + }, + { + "epoch": 1.24, + "grad_norm": 21.293556213378906, + "learning_rate": 3.266876530255334e-07, + "logps/chosen": -50.35670852661133, + "logps/rejected": -81.25212097167969, + "loss": 0.3172, + "losses/dpo": 0.2058955579996109, + "losses/sft": 1.4526883363723755, + "losses/total": 0.2058955579996109, + "ref_logps/chosen": -36.02672576904297, + "ref_logps/rejected": -49.118896484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4329981803894043, + "rewards/margins": 1.7803233861923218, + "rewards/rejected": -3.2133214473724365, + "step": 1309 + }, + { + "epoch": 1.24, + "grad_norm": 32.47032165527344, + "learning_rate": 3.2651276670164393e-07, + "logps/chosen": -55.274818420410156, + "logps/rejected": -68.23551940917969, + "loss": 0.5644, + "losses/dpo": 0.3510875105857849, + "losses/sft": 1.2595924139022827, + "losses/total": 0.3510875105857849, + "ref_logps/chosen": -40.185157775878906, + "ref_logps/rejected": -45.297996520996094, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5089654922485352, + "rewards/margins": 0.784786581993103, + "rewards/rejected": -2.2937521934509277, + "step": 1310 + }, + { + "epoch": 1.24, + "grad_norm": 30.444149017333984, + "learning_rate": 3.2633788037775444e-07, + "logps/chosen": -55.1437873840332, + "logps/rejected": -63.98257827758789, + "loss": 0.6349, + "losses/dpo": 0.4569612741470337, + "losses/sft": 1.7529760599136353, + "losses/total": 0.4569612741470337, + "ref_logps/chosen": -41.296539306640625, + "ref_logps/rejected": -43.07489013671875, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3847248554229736, + "rewards/margins": 0.7060439586639404, + "rewards/rejected": -2.090768814086914, + "step": 1311 + }, + { + "epoch": 1.24, + "grad_norm": 15.60333251953125, + "learning_rate": 3.26162994053865e-07, + "logps/chosen": -44.36737823486328, + "logps/rejected": -68.90261840820312, + "loss": 0.3255, + "losses/dpo": 0.36724573373794556, + "losses/sft": 2.181623697280884, + "losses/total": 0.36724573373794556, + "ref_logps/chosen": -34.942996978759766, + "ref_logps/rejected": -44.84746551513672, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9424382448196411, + "rewards/margins": 1.4630770683288574, + "rewards/rejected": -2.405515193939209, + "step": 1312 + }, + { + "epoch": 1.24, + "grad_norm": 12.743868827819824, + "learning_rate": 3.2598810772997547e-07, + "logps/chosen": -48.645076751708984, + "logps/rejected": -77.03884887695312, + "loss": 0.234, + "losses/dpo": 0.27354711294174194, + "losses/sft": 1.7931358814239502, + "losses/total": 0.27354711294174194, + "ref_logps/chosen": -37.277740478515625, + "ref_logps/rejected": -48.298004150390625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1367335319519043, + "rewards/margins": 1.737350344657898, + "rewards/rejected": -2.874083995819092, + "step": 1313 + }, + { + "epoch": 1.24, + "grad_norm": 19.99807357788086, + "learning_rate": 3.2581322140608603e-07, + "logps/chosen": -50.415985107421875, + "logps/rejected": -68.30654907226562, + "loss": 0.428, + "losses/dpo": 0.7504759430885315, + "losses/sft": 1.4540705680847168, + "losses/total": 0.7504759430885315, + "ref_logps/chosen": -40.197364807128906, + "ref_logps/rejected": -46.552452087402344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0218619108200073, + "rewards/margins": 1.153548240661621, + "rewards/rejected": -2.175410270690918, + "step": 1314 + }, + { + "epoch": 1.24, + "grad_norm": 20.18872833251953, + "learning_rate": 3.2563833508219655e-07, + "logps/chosen": -58.441917419433594, + "logps/rejected": -72.74459838867188, + "loss": 0.4563, + "losses/dpo": 0.600630521774292, + "losses/sft": 1.7910703420639038, + "losses/total": 0.600630521774292, + "ref_logps/chosen": -46.625999450683594, + "ref_logps/rejected": -50.275848388671875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1815918684005737, + "rewards/margins": 1.0652837753295898, + "rewards/rejected": -2.246875524520874, + "step": 1315 + }, + { + "epoch": 1.24, + "grad_norm": 25.531356811523438, + "learning_rate": 3.254634487583071e-07, + "logps/chosen": -64.70285034179688, + "logps/rejected": -60.91915512084961, + "loss": 0.4955, + "losses/dpo": 0.3505529761314392, + "losses/sft": 1.782562017440796, + "losses/total": 0.3505529761314392, + "ref_logps/chosen": -50.220314025878906, + "ref_logps/rejected": -36.196380615234375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4482536315917969, + "rewards/margins": 1.0240238904953003, + "rewards/rejected": -2.4722776412963867, + "step": 1316 + }, + { + "epoch": 1.24, + "grad_norm": 16.17140769958496, + "learning_rate": 3.252885624344176e-07, + "logps/chosen": -43.879234313964844, + "logps/rejected": -75.45570373535156, + "loss": 0.2886, + "losses/dpo": 0.18516552448272705, + "losses/sft": 1.9407551288604736, + "losses/total": 0.18516552448272705, + "ref_logps/chosen": -33.99541473388672, + "ref_logps/rejected": -50.96930694580078, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9883820414543152, + "rewards/margins": 1.4602574110031128, + "rewards/rejected": -2.448639392852783, + "step": 1317 + }, + { + "epoch": 1.24, + "grad_norm": 24.757556915283203, + "learning_rate": 3.2511367611052814e-07, + "logps/chosen": -60.0771369934082, + "logps/rejected": -74.33728790283203, + "loss": 0.5302, + "losses/dpo": 0.23207569122314453, + "losses/sft": 1.9666743278503418, + "losses/total": 0.23207569122314453, + "ref_logps/chosen": -45.66221237182617, + "ref_logps/rejected": -50.77057647705078, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4414923191070557, + "rewards/margins": 0.9151792526245117, + "rewards/rejected": -2.3566718101501465, + "step": 1318 + }, + { + "epoch": 1.25, + "grad_norm": 15.23419189453125, + "learning_rate": 3.249387897866387e-07, + "logps/chosen": -37.39873504638672, + "logps/rejected": -70.27153015136719, + "loss": 0.3191, + "losses/dpo": 0.35991108417510986, + "losses/sft": 1.3433964252471924, + "losses/total": 0.35991108417510986, + "ref_logps/chosen": -28.353347778320312, + "ref_logps/rejected": -48.33560562133789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9045386910438538, + "rewards/margins": 1.2890541553497314, + "rewards/rejected": -2.1935927867889404, + "step": 1319 + }, + { + "epoch": 1.25, + "grad_norm": 22.617067337036133, + "learning_rate": 3.2476390346274916e-07, + "logps/chosen": -46.061668395996094, + "logps/rejected": -51.542694091796875, + "loss": 0.6039, + "losses/dpo": 0.4379119277000427, + "losses/sft": 1.9982529878616333, + "losses/total": 0.4379119277000427, + "ref_logps/chosen": -33.610599517822266, + "ref_logps/rejected": -33.450477600097656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2451066970825195, + "rewards/margins": 0.5641152262687683, + "rewards/rejected": -1.8092219829559326, + "step": 1320 + }, + { + "epoch": 1.25, + "grad_norm": 21.929431915283203, + "learning_rate": 3.2458901713885973e-07, + "logps/chosen": -45.45535659790039, + "logps/rejected": -55.51594543457031, + "loss": 0.4203, + "losses/dpo": 0.6898786425590515, + "losses/sft": 2.0693986415863037, + "losses/total": 0.6898786425590515, + "ref_logps/chosen": -35.48495864868164, + "ref_logps/rejected": -34.851654052734375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9970399141311646, + "rewards/margins": 1.0693888664245605, + "rewards/rejected": -2.0664286613464355, + "step": 1321 + }, + { + "epoch": 1.25, + "grad_norm": 22.96612548828125, + "learning_rate": 3.2441413081497024e-07, + "logps/chosen": -52.88302993774414, + "logps/rejected": -73.43191528320312, + "loss": 0.3806, + "losses/dpo": 0.2667972445487976, + "losses/sft": 1.705133080482483, + "losses/total": 0.2667972445487976, + "ref_logps/chosen": -41.457115173339844, + "ref_logps/rejected": -48.10161209106445, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1425915956497192, + "rewards/margins": 1.3904387950897217, + "rewards/rejected": -2.5330302715301514, + "step": 1322 + }, + { + "epoch": 1.25, + "grad_norm": 15.859878540039062, + "learning_rate": 3.242392444910808e-07, + "logps/chosen": -44.19548034667969, + "logps/rejected": -60.982975006103516, + "loss": 0.2992, + "losses/dpo": 0.4226863384246826, + "losses/sft": 1.8237709999084473, + "losses/total": 0.4226863384246826, + "ref_logps/chosen": -34.07893371582031, + "ref_logps/rejected": -36.695064544677734, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0116544961929321, + "rewards/margins": 1.4171364307403564, + "rewards/rejected": -2.428791046142578, + "step": 1323 + }, + { + "epoch": 1.25, + "grad_norm": 14.331469535827637, + "learning_rate": 3.240643581671913e-07, + "logps/chosen": -38.36085510253906, + "logps/rejected": -58.140342712402344, + "loss": 0.3015, + "losses/dpo": 0.28820958733558655, + "losses/sft": 1.4967849254608154, + "losses/total": 0.28820958733558655, + "ref_logps/chosen": -31.664566040039062, + "ref_logps/rejected": -37.260887145996094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6696290969848633, + "rewards/margins": 1.41831636428833, + "rewards/rejected": -2.0879454612731934, + "step": 1324 + }, + { + "epoch": 1.25, + "grad_norm": 19.79657554626465, + "learning_rate": 3.2388947184330183e-07, + "logps/chosen": -45.195281982421875, + "logps/rejected": -66.21366882324219, + "loss": 0.3472, + "losses/dpo": 0.1739564836025238, + "losses/sft": 1.7063114643096924, + "losses/total": 0.1739564836025238, + "ref_logps/chosen": -33.74018859863281, + "ref_logps/rejected": -41.121517181396484, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1455092430114746, + "rewards/margins": 1.3637057542800903, + "rewards/rejected": -2.5092148780822754, + "step": 1325 + }, + { + "epoch": 1.25, + "grad_norm": 22.342103958129883, + "learning_rate": 3.237145855194124e-07, + "logps/chosen": -42.53139114379883, + "logps/rejected": -48.85778045654297, + "loss": 0.5487, + "losses/dpo": 0.3733326494693756, + "losses/sft": 1.5709648132324219, + "losses/total": 0.3733326494693756, + "ref_logps/chosen": -33.1717643737793, + "ref_logps/rejected": -31.201541900634766, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.935962975025177, + "rewards/margins": 0.8296608924865723, + "rewards/rejected": -1.7656238079071045, + "step": 1326 + }, + { + "epoch": 1.25, + "grad_norm": 27.36860466003418, + "learning_rate": 3.2353969919552286e-07, + "logps/chosen": -45.33452224731445, + "logps/rejected": -56.8319206237793, + "loss": 0.638, + "losses/dpo": 0.9248436093330383, + "losses/sft": 1.8668464422225952, + "losses/total": 0.9248436093330383, + "ref_logps/chosen": -35.331947326660156, + "ref_logps/rejected": -41.05805969238281, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0002573728561401, + "rewards/margins": 0.5771288275718689, + "rewards/rejected": -1.5773861408233643, + "step": 1327 + }, + { + "epoch": 1.25, + "grad_norm": 27.762012481689453, + "learning_rate": 3.233648128716334e-07, + "logps/chosen": -51.885101318359375, + "logps/rejected": -61.62514114379883, + "loss": 0.5824, + "losses/dpo": 0.7015052437782288, + "losses/sft": 1.5709404945373535, + "losses/total": 0.7015052437782288, + "ref_logps/chosen": -36.931671142578125, + "ref_logps/rejected": -39.759273529052734, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4953434467315674, + "rewards/margins": 0.6912434101104736, + "rewards/rejected": -2.186586856842041, + "step": 1328 + }, + { + "epoch": 1.25, + "grad_norm": 23.887310028076172, + "learning_rate": 3.23189926547744e-07, + "logps/chosen": -61.18767547607422, + "logps/rejected": -87.2144546508789, + "loss": 0.4081, + "losses/dpo": 0.3507990837097168, + "losses/sft": 1.886390209197998, + "losses/total": 0.3507990837097168, + "ref_logps/chosen": -42.69068908691406, + "ref_logps/rejected": -57.786170959472656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8496990203857422, + "rewards/margins": 1.093129277229309, + "rewards/rejected": -2.942828416824341, + "step": 1329 + }, + { + "epoch": 1.26, + "grad_norm": 20.27486228942871, + "learning_rate": 3.230150402238545e-07, + "logps/chosen": -47.38467788696289, + "logps/rejected": -69.49496459960938, + "loss": 0.3936, + "losses/dpo": 0.28516754508018494, + "losses/sft": 1.731813669204712, + "losses/total": 0.28516754508018494, + "ref_logps/chosen": -36.72315216064453, + "ref_logps/rejected": -46.98150634765625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.066152572631836, + "rewards/margins": 1.1851938962936401, + "rewards/rejected": -2.2513465881347656, + "step": 1330 + }, + { + "epoch": 1.26, + "grad_norm": 21.366527557373047, + "learning_rate": 3.22840153899965e-07, + "logps/chosen": -44.96656036376953, + "logps/rejected": -62.3281135559082, + "loss": 0.4295, + "losses/dpo": 0.515168309211731, + "losses/sft": 1.5524519681930542, + "losses/total": 0.515168309211731, + "ref_logps/chosen": -36.836669921875, + "ref_logps/rejected": -43.83147430419922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8129887580871582, + "rewards/margins": 1.036675214767456, + "rewards/rejected": -1.8496639728546143, + "step": 1331 + }, + { + "epoch": 1.26, + "grad_norm": 17.08660888671875, + "learning_rate": 3.226652675760755e-07, + "logps/chosen": -41.66600799560547, + "logps/rejected": -62.67748260498047, + "loss": 0.4071, + "losses/dpo": 0.164664626121521, + "losses/sft": 2.1566896438598633, + "losses/total": 0.164664626121521, + "ref_logps/chosen": -28.76436996459961, + "ref_logps/rejected": -37.91204071044922, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.290163516998291, + "rewards/margins": 1.1863805055618286, + "rewards/rejected": -2.476544141769409, + "step": 1332 + }, + { + "epoch": 1.26, + "grad_norm": 20.865203857421875, + "learning_rate": 3.224903812521861e-07, + "logps/chosen": -64.59005737304688, + "logps/rejected": -69.01496124267578, + "loss": 0.4417, + "losses/dpo": 0.4876309037208557, + "losses/sft": 1.9798153638839722, + "losses/total": 0.4876309037208557, + "ref_logps/chosen": -50.089942932128906, + "ref_logps/rejected": -44.22984313964844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4500117301940918, + "rewards/margins": 1.0285006761550903, + "rewards/rejected": -2.4785122871398926, + "step": 1333 + }, + { + "epoch": 1.26, + "grad_norm": 21.650136947631836, + "learning_rate": 3.2231549492829655e-07, + "logps/chosen": -43.238407135009766, + "logps/rejected": -61.714271545410156, + "loss": 0.441, + "losses/dpo": 0.9948136806488037, + "losses/sft": 2.3222570419311523, + "losses/total": 0.9948136806488037, + "ref_logps/chosen": -32.03950881958008, + "ref_logps/rejected": -38.622928619384766, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.119889736175537, + "rewards/margins": 1.1892445087432861, + "rewards/rejected": -2.3091344833374023, + "step": 1334 + }, + { + "epoch": 1.26, + "grad_norm": 18.566551208496094, + "learning_rate": 3.221406086044071e-07, + "logps/chosen": -60.351646423339844, + "logps/rejected": -86.7037353515625, + "loss": 0.2427, + "losses/dpo": 0.09323880076408386, + "losses/sft": 1.6382561922073364, + "losses/total": 0.09323880076408386, + "ref_logps/chosen": -47.07814025878906, + "ref_logps/rejected": -54.51207733154297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3273508548736572, + "rewards/margins": 1.8918142318725586, + "rewards/rejected": -3.2191648483276367, + "step": 1335 + }, + { + "epoch": 1.26, + "grad_norm": 23.849151611328125, + "learning_rate": 3.219657222805177e-07, + "logps/chosen": -49.82647705078125, + "logps/rejected": -65.72238159179688, + "loss": 0.3579, + "losses/dpo": 0.34007012844085693, + "losses/sft": 2.0767719745635986, + "losses/total": 0.34007012844085693, + "ref_logps/chosen": -36.99418258666992, + "ref_logps/rejected": -39.154380798339844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2832295894622803, + "rewards/margins": 1.3735709190368652, + "rewards/rejected": -2.6568005084991455, + "step": 1336 + }, + { + "epoch": 1.26, + "grad_norm": 19.60880470275879, + "learning_rate": 3.217908359566282e-07, + "logps/chosen": -52.775386810302734, + "logps/rejected": -66.595458984375, + "loss": 0.3526, + "losses/dpo": 0.31136977672576904, + "losses/sft": 1.9005811214447021, + "losses/total": 0.31136977672576904, + "ref_logps/chosen": -39.96556091308594, + "ref_logps/rejected": -41.529136657714844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.280982255935669, + "rewards/margins": 1.2256507873535156, + "rewards/rejected": -2.5066330432891846, + "step": 1337 + }, + { + "epoch": 1.26, + "grad_norm": 24.112470626831055, + "learning_rate": 3.216159496327387e-07, + "logps/chosen": -57.16948699951172, + "logps/rejected": -61.171268463134766, + "loss": 0.4699, + "losses/dpo": 0.4526618719100952, + "losses/sft": 1.6857000589370728, + "losses/total": 0.4526618719100952, + "ref_logps/chosen": -41.94329833984375, + "ref_logps/rejected": -35.8074836730957, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5226187705993652, + "rewards/margins": 1.0137600898742676, + "rewards/rejected": -2.536378860473633, + "step": 1338 + }, + { + "epoch": 1.26, + "grad_norm": 28.725250244140625, + "learning_rate": 3.214410633088492e-07, + "logps/chosen": -56.87745666503906, + "logps/rejected": -64.42515563964844, + "loss": 0.5745, + "losses/dpo": 0.33915460109710693, + "losses/sft": 1.551383137702942, + "losses/total": 0.33915460109710693, + "ref_logps/chosen": -40.86872863769531, + "ref_logps/rejected": -40.371002197265625, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6008729934692383, + "rewards/margins": 0.8045421838760376, + "rewards/rejected": -2.4054150581359863, + "step": 1339 + }, + { + "epoch": 1.27, + "grad_norm": 14.856887817382812, + "learning_rate": 3.212661769849598e-07, + "logps/chosen": -60.09968566894531, + "logps/rejected": -94.28604125976562, + "loss": 0.195, + "losses/dpo": 0.2769324481487274, + "losses/sft": 2.1882357597351074, + "losses/total": 0.2769324481487274, + "ref_logps/chosen": -47.022762298583984, + "ref_logps/rejected": -61.86514663696289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.307692050933838, + "rewards/margins": 1.9343969821929932, + "rewards/rejected": -3.242089033126831, + "step": 1340 + }, + { + "epoch": 1.27, + "grad_norm": 30.66367530822754, + "learning_rate": 3.2109129066107024e-07, + "logps/chosen": -55.5614013671875, + "logps/rejected": -69.0540542602539, + "loss": 0.6662, + "losses/dpo": 1.352977991104126, + "losses/sft": 2.3313493728637695, + "losses/total": 1.352977991104126, + "ref_logps/chosen": -39.0507698059082, + "ref_logps/rejected": -41.118221282958984, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6510632038116455, + "rewards/margins": 1.142520546913147, + "rewards/rejected": -2.793583869934082, + "step": 1341 + }, + { + "epoch": 1.27, + "grad_norm": 38.804439544677734, + "learning_rate": 3.209164043371808e-07, + "logps/chosen": -72.30899047851562, + "logps/rejected": -66.9932861328125, + "loss": 0.5741, + "losses/dpo": 0.19433581829071045, + "losses/sft": 1.922597050666809, + "losses/total": 0.19433581829071045, + "ref_logps/chosen": -55.39595031738281, + "ref_logps/rejected": -43.16709899902344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6913042068481445, + "rewards/margins": 0.6913138628005981, + "rewards/rejected": -2.3826181888580322, + "step": 1342 + }, + { + "epoch": 1.27, + "grad_norm": 19.082738876342773, + "learning_rate": 3.207415180132914e-07, + "logps/chosen": -49.338626861572266, + "logps/rejected": -62.61579513549805, + "loss": 0.3741, + "losses/dpo": 0.7597951889038086, + "losses/sft": 2.1300644874572754, + "losses/total": 0.7597951889038086, + "ref_logps/chosen": -39.40080261230469, + "ref_logps/rejected": -40.771018981933594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9937824010848999, + "rewards/margins": 1.1906951665878296, + "rewards/rejected": -2.1844775676727295, + "step": 1343 + }, + { + "epoch": 1.27, + "grad_norm": 17.68570899963379, + "learning_rate": 3.205666316894019e-07, + "logps/chosen": -42.218475341796875, + "logps/rejected": -62.51219177246094, + "loss": 0.3369, + "losses/dpo": 0.24026553332805634, + "losses/sft": 0.9595241546630859, + "losses/total": 0.24026553332805634, + "ref_logps/chosen": -33.65283203125, + "ref_logps/rejected": -41.172088623046875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8565640449523926, + "rewards/margins": 1.2774463891983032, + "rewards/rejected": -2.1340105533599854, + "step": 1344 + }, + { + "epoch": 1.27, + "grad_norm": 21.236780166625977, + "learning_rate": 3.203917453655124e-07, + "logps/chosen": -50.389320373535156, + "logps/rejected": -55.241783142089844, + "loss": 0.4897, + "losses/dpo": 0.5423358678817749, + "losses/sft": 1.8764293193817139, + "losses/total": 0.5423358678817749, + "ref_logps/chosen": -37.61534881591797, + "ref_logps/rejected": -35.13709259033203, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2773971557617188, + "rewards/margins": 0.733071506023407, + "rewards/rejected": -2.0104687213897705, + "step": 1345 + }, + { + "epoch": 1.27, + "grad_norm": 27.776611328125, + "learning_rate": 3.202168590416229e-07, + "logps/chosen": -55.493804931640625, + "logps/rejected": -61.5127067565918, + "loss": 0.4846, + "losses/dpo": 0.6097148656845093, + "losses/sft": 2.082397699356079, + "losses/total": 0.6097148656845093, + "ref_logps/chosen": -41.849891662597656, + "ref_logps/rejected": -38.26130294799805, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3643914461135864, + "rewards/margins": 0.9607487916946411, + "rewards/rejected": -2.3251404762268066, + "step": 1346 + }, + { + "epoch": 1.27, + "grad_norm": 21.580564498901367, + "learning_rate": 3.200419727177335e-07, + "logps/chosen": -60.56695556640625, + "logps/rejected": -64.56971740722656, + "loss": 0.4147, + "losses/dpo": 0.6108167171478271, + "losses/sft": 2.1351778507232666, + "losses/total": 0.6108167171478271, + "ref_logps/chosen": -44.02821350097656, + "ref_logps/rejected": -38.69805145263672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6538739204406738, + "rewards/margins": 0.9332923889160156, + "rewards/rejected": -2.5871663093566895, + "step": 1347 + }, + { + "epoch": 1.27, + "grad_norm": 17.11998176574707, + "learning_rate": 3.19867086393844e-07, + "logps/chosen": -56.3470344543457, + "logps/rejected": -83.40293884277344, + "loss": 0.2419, + "losses/dpo": 0.2898802161216736, + "losses/sft": 2.107215642929077, + "losses/total": 0.2898802161216736, + "ref_logps/chosen": -41.44533920288086, + "ref_logps/rejected": -51.06003189086914, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4901695251464844, + "rewards/margins": 1.7441210746765137, + "rewards/rejected": -3.234290599822998, + "step": 1348 + }, + { + "epoch": 1.27, + "grad_norm": 24.633512496948242, + "learning_rate": 3.196922000699545e-07, + "logps/chosen": -60.06035232543945, + "logps/rejected": -66.82292175292969, + "loss": 0.4087, + "losses/dpo": 0.5050278306007385, + "losses/sft": 1.8513654470443726, + "losses/total": 0.5050278306007385, + "ref_logps/chosen": -44.01942443847656, + "ref_logps/rejected": -39.494895935058594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6040929555892944, + "rewards/margins": 1.1287102699279785, + "rewards/rejected": -2.7328033447265625, + "step": 1349 + }, + { + "epoch": 1.27, + "grad_norm": 24.701948165893555, + "learning_rate": 3.1951731374606507e-07, + "logps/chosen": -48.76957702636719, + "logps/rejected": -66.14830780029297, + "loss": 0.5414, + "losses/dpo": 0.43534407019615173, + "losses/sft": 2.3554751873016357, + "losses/total": 0.43534407019615173, + "ref_logps/chosen": -32.77943420410156, + "ref_logps/rejected": -44.04889678955078, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5990142822265625, + "rewards/margins": 0.6109272241592407, + "rewards/rejected": -2.2099416255950928, + "step": 1350 + }, + { + "epoch": 1.28, + "grad_norm": 22.263809204101562, + "learning_rate": 3.193424274221756e-07, + "logps/chosen": -47.04250717163086, + "logps/rejected": -62.94432067871094, + "loss": 0.4185, + "losses/dpo": 0.5180111527442932, + "losses/sft": 1.6933324337005615, + "losses/total": 0.5180111527442932, + "ref_logps/chosen": -34.896610260009766, + "ref_logps/rejected": -37.53900909423828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2145897150039673, + "rewards/margins": 1.325941562652588, + "rewards/rejected": -2.5405313968658447, + "step": 1351 + }, + { + "epoch": 1.28, + "grad_norm": 23.63612937927246, + "learning_rate": 3.191675410982861e-07, + "logps/chosen": -43.21095275878906, + "logps/rejected": -57.190086364746094, + "loss": 0.5392, + "losses/dpo": 0.35031503438949585, + "losses/sft": 1.4813015460968018, + "losses/total": 0.35031503438949585, + "ref_logps/chosen": -30.437179565429688, + "ref_logps/rejected": -38.99433898925781, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2773772478103638, + "rewards/margins": 0.5421980023384094, + "rewards/rejected": -1.8195751905441284, + "step": 1352 + }, + { + "epoch": 1.28, + "grad_norm": 18.869770050048828, + "learning_rate": 3.189926547743966e-07, + "logps/chosen": -57.13106918334961, + "logps/rejected": -87.74666595458984, + "loss": 0.3007, + "losses/dpo": 0.25921720266342163, + "losses/sft": 2.8428573608398438, + "losses/total": 0.25921720266342163, + "ref_logps/chosen": -38.957374572753906, + "ref_logps/rejected": -56.00676727294922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8173695802688599, + "rewards/margins": 1.3566205501556396, + "rewards/rejected": -3.17399001121521, + "step": 1353 + }, + { + "epoch": 1.28, + "grad_norm": 20.820863723754883, + "learning_rate": 3.188177684505072e-07, + "logps/chosen": -61.46854019165039, + "logps/rejected": -75.09683227539062, + "loss": 0.3858, + "losses/dpo": 0.42118579149246216, + "losses/sft": 2.2673394680023193, + "losses/total": 0.42118579149246216, + "ref_logps/chosen": -44.9959716796875, + "ref_logps/rejected": -47.12028503417969, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6472563743591309, + "rewards/margins": 1.150398850440979, + "rewards/rejected": -2.7976553440093994, + "step": 1354 + }, + { + "epoch": 1.28, + "grad_norm": 17.014755249023438, + "learning_rate": 3.186428821266177e-07, + "logps/chosen": -36.23109817504883, + "logps/rejected": -64.0101547241211, + "loss": 0.394, + "losses/dpo": 0.7347977161407471, + "losses/sft": 1.2118299007415771, + "losses/total": 0.7347977161407471, + "ref_logps/chosen": -28.18022346496582, + "ref_logps/rejected": -45.771461486816406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8050874471664429, + "rewards/margins": 1.0187814235687256, + "rewards/rejected": -1.823868989944458, + "step": 1355 + }, + { + "epoch": 1.28, + "grad_norm": 20.06956672668457, + "learning_rate": 3.184679958027282e-07, + "logps/chosen": -60.052703857421875, + "logps/rejected": -74.61544799804688, + "loss": 0.3505, + "losses/dpo": 0.35956865549087524, + "losses/sft": 1.5750818252563477, + "losses/total": 0.35956865549087524, + "ref_logps/chosen": -44.549537658691406, + "ref_logps/rejected": -47.12712097167969, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5503166913986206, + "rewards/margins": 1.1985156536102295, + "rewards/rejected": -2.7488322257995605, + "step": 1356 + }, + { + "epoch": 1.28, + "grad_norm": 16.932592391967773, + "learning_rate": 3.1829310947883876e-07, + "logps/chosen": -47.99407958984375, + "logps/rejected": -78.48458099365234, + "loss": 0.2493, + "losses/dpo": 0.12762005627155304, + "losses/sft": 1.5339864492416382, + "losses/total": 0.12762005627155304, + "ref_logps/chosen": -35.01840591430664, + "ref_logps/rejected": -48.10680389404297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2975674867630005, + "rewards/margins": 1.7402098178863525, + "rewards/rejected": -3.0377771854400635, + "step": 1357 + }, + { + "epoch": 1.28, + "grad_norm": 18.315305709838867, + "learning_rate": 3.181182231549493e-07, + "logps/chosen": -63.45601272583008, + "logps/rejected": -85.56204986572266, + "loss": 0.3312, + "losses/dpo": 0.18121808767318726, + "losses/sft": 1.7915785312652588, + "losses/total": 0.18121808767318726, + "ref_logps/chosen": -46.37159729003906, + "ref_logps/rejected": -52.41903305053711, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7084417343139648, + "rewards/margins": 1.605859637260437, + "rewards/rejected": -3.3143014907836914, + "step": 1358 + }, + { + "epoch": 1.28, + "grad_norm": 26.892333984375, + "learning_rate": 3.179433368310598e-07, + "logps/chosen": -51.82383728027344, + "logps/rejected": -53.26301574707031, + "loss": 0.5431, + "losses/dpo": 0.6771758794784546, + "losses/sft": 2.1927335262298584, + "losses/total": 0.6771758794784546, + "ref_logps/chosen": -37.56463623046875, + "ref_logps/rejected": -33.25959777832031, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4259207248687744, + "rewards/margins": 0.5744211673736572, + "rewards/rejected": -2.0003418922424316, + "step": 1359 + }, + { + "epoch": 1.28, + "grad_norm": 15.517579078674316, + "learning_rate": 3.177684505071703e-07, + "logps/chosen": -51.13165283203125, + "logps/rejected": -86.49072265625, + "loss": 0.2102, + "losses/dpo": 0.32153305411338806, + "losses/sft": 1.4930654764175415, + "losses/total": 0.32153305411338806, + "ref_logps/chosen": -39.39052963256836, + "ref_logps/rejected": -56.91254425048828, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1741122007369995, + "rewards/margins": 1.783705711364746, + "rewards/rejected": -2.957818031311035, + "step": 1360 + }, + { + "epoch": 1.29, + "grad_norm": 15.040142059326172, + "learning_rate": 3.1759356418328087e-07, + "logps/chosen": -51.7848014831543, + "logps/rejected": -74.01992797851562, + "loss": 0.247, + "losses/dpo": 0.16273543238639832, + "losses/sft": 1.7990646362304688, + "losses/total": 0.16273543238639832, + "ref_logps/chosen": -35.85475158691406, + "ref_logps/rejected": -42.82506561279297, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5930051803588867, + "rewards/margins": 1.5264816284179688, + "rewards/rejected": -3.1194868087768555, + "step": 1361 + }, + { + "epoch": 1.29, + "grad_norm": 22.02246856689453, + "learning_rate": 3.174186778593914e-07, + "logps/chosen": -52.05388641357422, + "logps/rejected": -68.54545593261719, + "loss": 0.3583, + "losses/dpo": 0.1601119190454483, + "losses/sft": 0.9818538427352905, + "losses/total": 0.1601119190454483, + "ref_logps/chosen": -39.668514251708984, + "ref_logps/rejected": -40.752655029296875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.238537311553955, + "rewards/margins": 1.5407426357269287, + "rewards/rejected": -2.779280185699463, + "step": 1362 + }, + { + "epoch": 1.29, + "grad_norm": 18.685449600219727, + "learning_rate": 3.172437915355019e-07, + "logps/chosen": -53.236289978027344, + "logps/rejected": -81.12696838378906, + "loss": 0.4615, + "losses/dpo": 0.44704434275627136, + "losses/sft": 2.2950265407562256, + "losses/total": 0.44704434275627136, + "ref_logps/chosen": -37.747615814208984, + "ref_logps/rejected": -48.26818084716797, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.548867106437683, + "rewards/margins": 1.7370113134384155, + "rewards/rejected": -3.2858781814575195, + "step": 1363 + }, + { + "epoch": 1.29, + "grad_norm": 26.25226402282715, + "learning_rate": 3.1706890521161246e-07, + "logps/chosen": -53.697139739990234, + "logps/rejected": -52.0499267578125, + "loss": 0.539, + "losses/dpo": 0.40041327476501465, + "losses/sft": 0.8750542402267456, + "losses/total": 0.40041327476501465, + "ref_logps/chosen": -39.54927062988281, + "ref_logps/rejected": -32.445838928222656, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.414786696434021, + "rewards/margins": 0.5456223487854004, + "rewards/rejected": -1.9604090452194214, + "step": 1364 + }, + { + "epoch": 1.29, + "grad_norm": 25.24365234375, + "learning_rate": 3.1689401888772297e-07, + "logps/chosen": -49.30266571044922, + "logps/rejected": -60.79222106933594, + "loss": 0.5297, + "losses/dpo": 0.4613967537879944, + "losses/sft": 1.035914421081543, + "losses/total": 0.4613967537879944, + "ref_logps/chosen": -35.4945068359375, + "ref_logps/rejected": -35.82539749145508, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3808157444000244, + "rewards/margins": 1.115866780281067, + "rewards/rejected": -2.4966824054718018, + "step": 1365 + }, + { + "epoch": 1.29, + "grad_norm": 24.446874618530273, + "learning_rate": 3.167191325638335e-07, + "logps/chosen": -62.15544128417969, + "logps/rejected": -69.06873321533203, + "loss": 0.3823, + "losses/dpo": 0.36644697189331055, + "losses/sft": 1.7216076850891113, + "losses/total": 0.36644697189331055, + "ref_logps/chosen": -48.019168853759766, + "ref_logps/rejected": -44.19029998779297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4136276245117188, + "rewards/margins": 1.0742160081863403, + "rewards/rejected": -2.4878435134887695, + "step": 1366 + }, + { + "epoch": 1.29, + "grad_norm": 18.419918060302734, + "learning_rate": 3.1654424623994405e-07, + "logps/chosen": -53.24803161621094, + "logps/rejected": -74.8726806640625, + "loss": 0.3131, + "losses/dpo": 0.2178664654493332, + "losses/sft": 2.127765655517578, + "losses/total": 0.2178664654493332, + "ref_logps/chosen": -41.76417922973633, + "ref_logps/rejected": -48.734710693359375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1483851671218872, + "rewards/margins": 1.465411901473999, + "rewards/rejected": -2.613797187805176, + "step": 1367 + }, + { + "epoch": 1.29, + "grad_norm": 21.006553649902344, + "learning_rate": 3.1636935991605456e-07, + "logps/chosen": -43.47159957885742, + "logps/rejected": -67.40238189697266, + "loss": 0.392, + "losses/dpo": 0.4687536060810089, + "losses/sft": 1.481898307800293, + "losses/total": 0.4687536060810089, + "ref_logps/chosen": -33.60332489013672, + "ref_logps/rejected": -43.203651428222656, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9868277311325073, + "rewards/margins": 1.4330453872680664, + "rewards/rejected": -2.4198732376098633, + "step": 1368 + }, + { + "epoch": 1.29, + "grad_norm": 28.368614196777344, + "learning_rate": 3.161944735921651e-07, + "logps/chosen": -52.62860870361328, + "logps/rejected": -73.36225891113281, + "loss": 0.4706, + "losses/dpo": 0.5309086441993713, + "losses/sft": 1.9961135387420654, + "losses/total": 0.5309086441993713, + "ref_logps/chosen": -35.80301284790039, + "ref_logps/rejected": -45.82862854003906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.682559609413147, + "rewards/margins": 1.0708030462265015, + "rewards/rejected": -2.7533626556396484, + "step": 1369 + }, + { + "epoch": 1.29, + "grad_norm": 22.93353843688965, + "learning_rate": 3.160195872682756e-07, + "logps/chosen": -47.724754333496094, + "logps/rejected": -70.8294906616211, + "loss": 0.3462, + "losses/dpo": 0.2852926254272461, + "losses/sft": 1.5040239095687866, + "losses/total": 0.2852926254272461, + "ref_logps/chosen": -35.95964813232422, + "ref_logps/rejected": -46.462371826171875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1765105724334717, + "rewards/margins": 1.2602012157440186, + "rewards/rejected": -2.4367117881774902, + "step": 1370 + }, + { + "epoch": 1.29, + "grad_norm": 23.429407119750977, + "learning_rate": 3.1584470094438615e-07, + "logps/chosen": -46.56145095825195, + "logps/rejected": -73.58964538574219, + "loss": 0.2879, + "losses/dpo": 0.13537660241127014, + "losses/sft": 1.4202475547790527, + "losses/total": 0.13537660241127014, + "ref_logps/chosen": -35.454078674316406, + "ref_logps/rejected": -47.185768127441406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1107375621795654, + "rewards/margins": 1.5296508073806763, + "rewards/rejected": -2.6403884887695312, + "step": 1371 + }, + { + "epoch": 1.3, + "grad_norm": 23.322397232055664, + "learning_rate": 3.1566981462049667e-07, + "logps/chosen": -37.70085906982422, + "logps/rejected": -47.00138854980469, + "loss": 0.5646, + "losses/dpo": 0.30335646867752075, + "losses/sft": 1.3127626180648804, + "losses/total": 0.30335646867752075, + "ref_logps/chosen": -24.70101547241211, + "ref_logps/rejected": -26.147720336914062, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2999846935272217, + "rewards/margins": 0.7853823304176331, + "rewards/rejected": -2.08536696434021, + "step": 1372 + }, + { + "epoch": 1.3, + "grad_norm": 21.259082794189453, + "learning_rate": 3.154949282966072e-07, + "logps/chosen": -47.568992614746094, + "logps/rejected": -59.967716217041016, + "loss": 0.424, + "losses/dpo": 0.12321989238262177, + "losses/sft": 2.2505016326904297, + "losses/total": 0.12321989238262177, + "ref_logps/chosen": -33.088985443115234, + "ref_logps/rejected": -33.45567321777344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4480009078979492, + "rewards/margins": 1.2032034397125244, + "rewards/rejected": -2.6512041091918945, + "step": 1373 + }, + { + "epoch": 1.3, + "grad_norm": 23.981849670410156, + "learning_rate": 3.1532004197271774e-07, + "logps/chosen": -43.099525451660156, + "logps/rejected": -77.929931640625, + "loss": 0.4545, + "losses/dpo": 0.20337752997875214, + "losses/sft": 1.3605133295059204, + "losses/total": 0.20337752997875214, + "ref_logps/chosen": -32.27179718017578, + "ref_logps/rejected": -53.765743255615234, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0827726125717163, + "rewards/margins": 1.333645224571228, + "rewards/rejected": -2.4164180755615234, + "step": 1374 + }, + { + "epoch": 1.3, + "grad_norm": 23.155954360961914, + "learning_rate": 3.1514515564882826e-07, + "logps/chosen": -47.0855712890625, + "logps/rejected": -75.8427963256836, + "loss": 0.4053, + "losses/dpo": 0.3050435781478882, + "losses/sft": 2.213414430618286, + "losses/total": 0.3050435781478882, + "ref_logps/chosen": -34.05945587158203, + "ref_logps/rejected": -50.41439437866211, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3026115894317627, + "rewards/margins": 1.2402284145355225, + "rewards/rejected": -2.542840003967285, + "step": 1375 + }, + { + "epoch": 1.3, + "grad_norm": 30.488391876220703, + "learning_rate": 3.1497026932493877e-07, + "logps/chosen": -59.1077995300293, + "logps/rejected": -55.779541015625, + "loss": 0.5633, + "losses/dpo": 0.2272609919309616, + "losses/sft": 1.6881656646728516, + "losses/total": 0.2272609919309616, + "ref_logps/chosen": -45.06895065307617, + "ref_logps/rejected": -34.236572265625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4038848876953125, + "rewards/margins": 0.7504122257232666, + "rewards/rejected": -2.154297113418579, + "step": 1376 + }, + { + "epoch": 1.3, + "grad_norm": 19.647987365722656, + "learning_rate": 3.147953830010493e-07, + "logps/chosen": -52.400299072265625, + "logps/rejected": -66.96345520019531, + "loss": 0.3621, + "losses/dpo": 0.5456865429878235, + "losses/sft": 1.9704679250717163, + "losses/total": 0.5456865429878235, + "ref_logps/chosen": -39.86430358886719, + "ref_logps/rejected": -41.286895751953125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2535994052886963, + "rewards/margins": 1.3140560388565063, + "rewards/rejected": -2.567655563354492, + "step": 1377 + }, + { + "epoch": 1.3, + "grad_norm": 21.903074264526367, + "learning_rate": 3.1462049667715985e-07, + "logps/chosen": -45.04633331298828, + "logps/rejected": -62.4086799621582, + "loss": 0.4614, + "losses/dpo": 0.383821040391922, + "losses/sft": 1.47364342212677, + "losses/total": 0.383821040391922, + "ref_logps/chosen": -33.184043884277344, + "ref_logps/rejected": -40.430763244628906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1862293481826782, + "rewards/margins": 1.0115625858306885, + "rewards/rejected": -2.1977920532226562, + "step": 1378 + }, + { + "epoch": 1.3, + "grad_norm": 28.027938842773438, + "learning_rate": 3.1444561035327036e-07, + "logps/chosen": -45.439125061035156, + "logps/rejected": -58.0157356262207, + "loss": 0.483, + "losses/dpo": 0.9206573963165283, + "losses/sft": 1.75630784034729, + "losses/total": 0.9206573963165283, + "ref_logps/chosen": -31.78700065612793, + "ref_logps/rejected": -34.04890441894531, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3652124404907227, + "rewards/margins": 1.0314708948135376, + "rewards/rejected": -2.3966832160949707, + "step": 1379 + }, + { + "epoch": 1.3, + "grad_norm": 25.44988250732422, + "learning_rate": 3.1427072402938087e-07, + "logps/chosen": -56.93769073486328, + "logps/rejected": -57.96370315551758, + "loss": 0.4546, + "losses/dpo": 0.6200724840164185, + "losses/sft": 2.593087911605835, + "losses/total": 0.6200724840164185, + "ref_logps/chosen": -42.7227783203125, + "ref_logps/rejected": -34.85062789916992, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4214911460876465, + "rewards/margins": 0.8898162245750427, + "rewards/rejected": -2.311307430267334, + "step": 1380 + }, + { + "epoch": 1.3, + "grad_norm": 15.13788890838623, + "learning_rate": 3.1409583770549144e-07, + "logps/chosen": -35.49049377441406, + "logps/rejected": -72.80972290039062, + "loss": 0.2828, + "losses/dpo": 0.5170881152153015, + "losses/sft": 1.7763038873672485, + "losses/total": 0.5170881152153015, + "ref_logps/chosen": -26.234588623046875, + "ref_logps/rejected": -44.85145950317383, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9255908131599426, + "rewards/margins": 1.8702361583709717, + "rewards/rejected": -2.7958269119262695, + "step": 1381 + }, + { + "epoch": 1.31, + "grad_norm": 27.159870147705078, + "learning_rate": 3.1392095138160195e-07, + "logps/chosen": -60.885982513427734, + "logps/rejected": -68.66365051269531, + "loss": 0.4944, + "losses/dpo": 0.6054320335388184, + "losses/sft": 2.1589083671569824, + "losses/total": 0.6054320335388184, + "ref_logps/chosen": -42.962127685546875, + "ref_logps/rejected": -41.3615837097168, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7923858165740967, + "rewards/margins": 0.9378213286399841, + "rewards/rejected": -2.7302069664001465, + "step": 1382 + }, + { + "epoch": 1.31, + "grad_norm": 30.14463233947754, + "learning_rate": 3.1374606505771246e-07, + "logps/chosen": -56.305328369140625, + "logps/rejected": -61.138851165771484, + "loss": 0.5178, + "losses/dpo": 0.6132975220680237, + "losses/sft": 1.7179806232452393, + "losses/total": 0.6132975220680237, + "ref_logps/chosen": -39.56067657470703, + "ref_logps/rejected": -36.21038818359375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6744654178619385, + "rewards/margins": 0.8183808326721191, + "rewards/rejected": -2.4928462505340576, + "step": 1383 + }, + { + "epoch": 1.31, + "grad_norm": 18.163667678833008, + "learning_rate": 3.13571178733823e-07, + "logps/chosen": -53.994598388671875, + "logps/rejected": -68.44343566894531, + "loss": 0.3387, + "losses/dpo": 0.42649635672569275, + "losses/sft": 1.394381046295166, + "losses/total": 0.42649635672569275, + "ref_logps/chosen": -40.39307403564453, + "ref_logps/rejected": -41.70367431640625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3601528406143188, + "rewards/margins": 1.3138233423233032, + "rewards/rejected": -2.673975944519043, + "step": 1384 + }, + { + "epoch": 1.31, + "grad_norm": 28.407211303710938, + "learning_rate": 3.1339629240993354e-07, + "logps/chosen": -50.175987243652344, + "logps/rejected": -71.09711456298828, + "loss": 0.5418, + "losses/dpo": 0.5175858736038208, + "losses/sft": 2.164062261581421, + "losses/total": 0.5175858736038208, + "ref_logps/chosen": -36.302520751953125, + "ref_logps/rejected": -47.9297981262207, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3873467445373535, + "rewards/margins": 0.929384708404541, + "rewards/rejected": -2.3167314529418945, + "step": 1385 + }, + { + "epoch": 1.31, + "grad_norm": 22.01079750061035, + "learning_rate": 3.132214060860441e-07, + "logps/chosen": -43.22167205810547, + "logps/rejected": -73.66404724121094, + "loss": 0.3278, + "losses/dpo": 0.2465500384569168, + "losses/sft": 1.84312903881073, + "losses/total": 0.2465500384569168, + "ref_logps/chosen": -31.679141998291016, + "ref_logps/rejected": -48.11632537841797, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1542531251907349, + "rewards/margins": 1.4005193710327148, + "rewards/rejected": -2.5547726154327393, + "step": 1386 + }, + { + "epoch": 1.31, + "grad_norm": 20.3621883392334, + "learning_rate": 3.1304651976215457e-07, + "logps/chosen": -42.49702453613281, + "logps/rejected": -68.95793151855469, + "loss": 0.3815, + "losses/dpo": 0.2193598747253418, + "losses/sft": 1.3440654277801514, + "losses/total": 0.2193598747253418, + "ref_logps/chosen": -32.190826416015625, + "ref_logps/rejected": -43.43907928466797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0306202173233032, + "rewards/margins": 1.5212647914886475, + "rewards/rejected": -2.551884889602661, + "step": 1387 + }, + { + "epoch": 1.31, + "grad_norm": 26.03891944885254, + "learning_rate": 3.1287163343826513e-07, + "logps/chosen": -52.6580810546875, + "logps/rejected": -67.25297546386719, + "loss": 0.5382, + "losses/dpo": 0.6871962547302246, + "losses/sft": 1.9108715057373047, + "losses/total": 0.6871962547302246, + "ref_logps/chosen": -36.66200256347656, + "ref_logps/rejected": -43.18291091918945, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5996079444885254, + "rewards/margins": 0.8073986172676086, + "rewards/rejected": -2.4070067405700684, + "step": 1388 + }, + { + "epoch": 1.31, + "grad_norm": 27.576459884643555, + "learning_rate": 3.1269674711437565e-07, + "logps/chosen": -58.50428771972656, + "logps/rejected": -77.48928833007812, + "loss": 0.4801, + "losses/dpo": 0.13750389218330383, + "losses/sft": 1.2845847606658936, + "losses/total": 0.13750389218330383, + "ref_logps/chosen": -45.2631721496582, + "ref_logps/rejected": -51.28300476074219, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3241115808486938, + "rewards/margins": 1.2965164184570312, + "rewards/rejected": -2.6206281185150146, + "step": 1389 + }, + { + "epoch": 1.31, + "grad_norm": 28.661888122558594, + "learning_rate": 3.1252186079048616e-07, + "logps/chosen": -47.94889831542969, + "logps/rejected": -68.34281921386719, + "loss": 0.6343, + "losses/dpo": 1.873807668685913, + "losses/sft": 2.6926257610321045, + "losses/total": 1.873807668685913, + "ref_logps/chosen": -35.423587799072266, + "ref_logps/rejected": -46.64491271972656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2525306940078735, + "rewards/margins": 0.917259693145752, + "rewards/rejected": -2.169790267944336, + "step": 1390 + }, + { + "epoch": 1.31, + "grad_norm": 19.96749496459961, + "learning_rate": 3.1234697446659667e-07, + "logps/chosen": -47.88706970214844, + "logps/rejected": -59.14627456665039, + "loss": 0.478, + "losses/dpo": 0.21075883507728577, + "losses/sft": 1.6604812145233154, + "losses/total": 0.21075883507728577, + "ref_logps/chosen": -36.14986801147461, + "ref_logps/rejected": -37.06296920776367, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.173720359802246, + "rewards/margins": 1.0346100330352783, + "rewards/rejected": -2.2083301544189453, + "step": 1391 + }, + { + "epoch": 1.31, + "grad_norm": 18.352394104003906, + "learning_rate": 3.1217208814270724e-07, + "logps/chosen": -48.96562957763672, + "logps/rejected": -89.93772888183594, + "loss": 0.3174, + "losses/dpo": 0.6201391816139221, + "losses/sft": 2.112382650375366, + "losses/total": 0.6201391816139221, + "ref_logps/chosen": -36.74701690673828, + "ref_logps/rejected": -60.49577713012695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2218611240386963, + "rewards/margins": 1.7223336696624756, + "rewards/rejected": -2.944194793701172, + "step": 1392 + }, + { + "epoch": 1.32, + "grad_norm": 30.134258270263672, + "learning_rate": 3.119972018188178e-07, + "logps/chosen": -71.5577392578125, + "logps/rejected": -68.81761169433594, + "loss": 0.6765, + "losses/dpo": 1.0463733673095703, + "losses/sft": 2.4263932704925537, + "losses/total": 1.0463733673095703, + "ref_logps/chosen": -53.1054801940918, + "ref_logps/rejected": -42.88414764404297, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8452259302139282, + "rewards/margins": 0.7481206655502319, + "rewards/rejected": -2.59334659576416, + "step": 1393 + }, + { + "epoch": 1.32, + "grad_norm": 12.316241264343262, + "learning_rate": 3.1182231549492826e-07, + "logps/chosen": -58.682777404785156, + "logps/rejected": -76.54867553710938, + "loss": 0.2377, + "losses/dpo": 0.2974410057067871, + "losses/sft": 1.4883127212524414, + "losses/total": 0.2974410057067871, + "ref_logps/chosen": -49.17033386230469, + "ref_logps/rejected": -50.62391662597656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9512447118759155, + "rewards/margins": 1.6412322521209717, + "rewards/rejected": -2.5924768447875977, + "step": 1394 + }, + { + "epoch": 1.32, + "grad_norm": 20.66306495666504, + "learning_rate": 3.1164742917103883e-07, + "logps/chosen": -49.01014709472656, + "logps/rejected": -64.79468536376953, + "loss": 0.3602, + "losses/dpo": 0.33817166090011597, + "losses/sft": 2.0505852699279785, + "losses/total": 0.33817166090011597, + "ref_logps/chosen": -39.86006546020508, + "ref_logps/rejected": -43.593971252441406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9150079488754272, + "rewards/margins": 1.2050637006759644, + "rewards/rejected": -2.1200718879699707, + "step": 1395 + }, + { + "epoch": 1.32, + "grad_norm": 16.288755416870117, + "learning_rate": 3.1147254284714934e-07, + "logps/chosen": -52.439876556396484, + "logps/rejected": -68.74030303955078, + "loss": 0.3846, + "losses/dpo": 0.46823808550834656, + "losses/sft": 1.661376714706421, + "losses/total": 0.46823808550834656, + "ref_logps/chosen": -42.11474609375, + "ref_logps/rejected": -45.83921813964844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0325132608413696, + "rewards/margins": 1.2575949430465698, + "rewards/rejected": -2.2901082038879395, + "step": 1396 + }, + { + "epoch": 1.32, + "grad_norm": 17.891496658325195, + "learning_rate": 3.1129765652325985e-07, + "logps/chosen": -39.93794631958008, + "logps/rejected": -59.983604431152344, + "loss": 0.3558, + "losses/dpo": 0.5310298204421997, + "losses/sft": 1.0889902114868164, + "losses/total": 0.5310298204421997, + "ref_logps/chosen": -31.426830291748047, + "ref_logps/rejected": -40.64684295654297, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8511111736297607, + "rewards/margins": 1.0825653076171875, + "rewards/rejected": -1.9336764812469482, + "step": 1397 + }, + { + "epoch": 1.32, + "grad_norm": 17.31990623474121, + "learning_rate": 3.1112277019937037e-07, + "logps/chosen": -47.70417404174805, + "logps/rejected": -65.0086669921875, + "loss": 0.348, + "losses/dpo": 0.32458168268203735, + "losses/sft": 2.0135648250579834, + "losses/total": 0.32458168268203735, + "ref_logps/chosen": -35.5030632019043, + "ref_logps/rejected": -41.95423126220703, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2201111316680908, + "rewards/margins": 1.0853321552276611, + "rewards/rejected": -2.305443286895752, + "step": 1398 + }, + { + "epoch": 1.32, + "grad_norm": 13.60582447052002, + "learning_rate": 3.1094788387548093e-07, + "logps/chosen": -35.148406982421875, + "logps/rejected": -60.91284942626953, + "loss": 0.2909, + "losses/dpo": 0.3278396725654602, + "losses/sft": 1.5964231491088867, + "losses/total": 0.3278396725654602, + "ref_logps/chosen": -26.8546085357666, + "ref_logps/rejected": -40.30609130859375, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8293799161911011, + "rewards/margins": 1.2312958240509033, + "rewards/rejected": -2.060675621032715, + "step": 1399 + }, + { + "epoch": 1.32, + "grad_norm": 18.040922164916992, + "learning_rate": 3.107729975515915e-07, + "logps/chosen": -36.44855499267578, + "logps/rejected": -61.811790466308594, + "loss": 0.3144, + "losses/dpo": 0.21064826846122742, + "losses/sft": 1.0480576753616333, + "losses/total": 0.21064826846122742, + "ref_logps/chosen": -25.492290496826172, + "ref_logps/rejected": -36.45253372192383, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0956261157989502, + "rewards/margins": 1.4402997493743896, + "rewards/rejected": -2.53592586517334, + "step": 1400 + }, + { + "epoch": 1.32, + "grad_norm": 15.514551162719727, + "learning_rate": 3.1059811122770196e-07, + "logps/chosen": -37.759986877441406, + "logps/rejected": -70.44426727294922, + "loss": 0.3003, + "losses/dpo": 0.13849696516990662, + "losses/sft": 1.3929380178451538, + "losses/total": 0.13849696516990662, + "ref_logps/chosen": -29.04631233215332, + "ref_logps/rejected": -47.98952865600586, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8713672161102295, + "rewards/margins": 1.3741066455841064, + "rewards/rejected": -2.245473861694336, + "step": 1401 + }, + { + "epoch": 1.32, + "grad_norm": 24.216726303100586, + "learning_rate": 3.104232249038125e-07, + "logps/chosen": -39.83381271362305, + "logps/rejected": -72.62610626220703, + "loss": 0.5245, + "losses/dpo": 0.3698168098926544, + "losses/sft": 1.223478078842163, + "losses/total": 0.3698168098926544, + "ref_logps/chosen": -27.672334671020508, + "ref_logps/rejected": -49.60409927368164, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2161478996276855, + "rewards/margins": 1.0860525369644165, + "rewards/rejected": -2.3022005558013916, + "step": 1402 + }, + { + "epoch": 1.32, + "grad_norm": 20.74520492553711, + "learning_rate": 3.1024833857992303e-07, + "logps/chosen": -63.94044876098633, + "logps/rejected": -74.19377136230469, + "loss": 0.3808, + "losses/dpo": 0.2923702001571655, + "losses/sft": 2.277377128601074, + "losses/total": 0.2923702001571655, + "ref_logps/chosen": -50.2935791015625, + "ref_logps/rejected": -46.86443328857422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3646868467330933, + "rewards/margins": 1.3682467937469482, + "rewards/rejected": -2.732933521270752, + "step": 1403 + }, + { + "epoch": 1.33, + "grad_norm": 17.602275848388672, + "learning_rate": 3.1007345225603355e-07, + "logps/chosen": -42.638694763183594, + "logps/rejected": -63.673919677734375, + "loss": 0.3501, + "losses/dpo": 0.6859549283981323, + "losses/sft": 1.8699051141738892, + "losses/total": 0.6859549283981323, + "ref_logps/chosen": -32.98509216308594, + "ref_logps/rejected": -38.486568450927734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9653604030609131, + "rewards/margins": 1.5533747673034668, + "rewards/rejected": -2.51873517036438, + "step": 1404 + }, + { + "epoch": 1.33, + "grad_norm": 24.334733963012695, + "learning_rate": 3.0989856593214406e-07, + "logps/chosen": -44.016693115234375, + "logps/rejected": -54.46356964111328, + "loss": 0.5681, + "losses/dpo": 0.5730810761451721, + "losses/sft": 1.4357812404632568, + "losses/total": 0.5730810761451721, + "ref_logps/chosen": -30.908496856689453, + "ref_logps/rejected": -32.35789489746094, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3108196258544922, + "rewards/margins": 0.8997477889060974, + "rewards/rejected": -2.2105674743652344, + "step": 1405 + }, + { + "epoch": 1.33, + "grad_norm": 24.985742568969727, + "learning_rate": 3.097236796082546e-07, + "logps/chosen": -60.19560241699219, + "logps/rejected": -63.56106185913086, + "loss": 0.4605, + "losses/dpo": 0.8694175481796265, + "losses/sft": 2.2830049991607666, + "losses/total": 0.8694175481796265, + "ref_logps/chosen": -47.15262985229492, + "ref_logps/rejected": -40.38645935058594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3042973279953003, + "rewards/margins": 1.0131628513336182, + "rewards/rejected": -2.317460060119629, + "step": 1406 + }, + { + "epoch": 1.33, + "grad_norm": 19.60630989074707, + "learning_rate": 3.095487932843652e-07, + "logps/chosen": -39.14239501953125, + "logps/rejected": -57.21702194213867, + "loss": 0.3744, + "losses/dpo": 0.5650507211685181, + "losses/sft": 1.6307978630065918, + "losses/total": 0.5650507211685181, + "ref_logps/chosen": -31.956689834594727, + "ref_logps/rejected": -36.87541198730469, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.718570351600647, + "rewards/margins": 1.3155903816223145, + "rewards/rejected": -2.034160852432251, + "step": 1407 + }, + { + "epoch": 1.33, + "grad_norm": 22.99588394165039, + "learning_rate": 3.0937390696047565e-07, + "logps/chosen": -62.28180694580078, + "logps/rejected": -73.65724182128906, + "loss": 0.4073, + "losses/dpo": 0.6211172342300415, + "losses/sft": 2.6106433868408203, + "losses/total": 0.6211172342300415, + "ref_logps/chosen": -50.00990295410156, + "ref_logps/rejected": -50.14404296875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.227190613746643, + "rewards/margins": 1.1241296529769897, + "rewards/rejected": -2.351320266723633, + "step": 1408 + }, + { + "epoch": 1.33, + "grad_norm": 18.683799743652344, + "learning_rate": 3.091990206365862e-07, + "logps/chosen": -42.29640197753906, + "logps/rejected": -61.958717346191406, + "loss": 0.3347, + "losses/dpo": 0.45394790172576904, + "losses/sft": 1.4220184087753296, + "losses/total": 0.45394790172576904, + "ref_logps/chosen": -35.667076110839844, + "ref_logps/rejected": -40.28544616699219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6629323959350586, + "rewards/margins": 1.5043952465057373, + "rewards/rejected": -2.167327642440796, + "step": 1409 + }, + { + "epoch": 1.33, + "grad_norm": 23.03899574279785, + "learning_rate": 3.0902413431269673e-07, + "logps/chosen": -54.78341293334961, + "logps/rejected": -68.1576919555664, + "loss": 0.4087, + "losses/dpo": 0.3472962975502014, + "losses/sft": 1.6087357997894287, + "losses/total": 0.3472962975502014, + "ref_logps/chosen": -40.278770446777344, + "ref_logps/rejected": -43.559349060058594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4504643678665161, + "rewards/margins": 1.0093703269958496, + "rewards/rejected": -2.459834575653076, + "step": 1410 + }, + { + "epoch": 1.33, + "grad_norm": 26.325695037841797, + "learning_rate": 3.0884924798880724e-07, + "logps/chosen": -47.2593994140625, + "logps/rejected": -60.71413803100586, + "loss": 0.5073, + "losses/dpo": 0.7224621772766113, + "losses/sft": 2.0751097202301025, + "losses/total": 0.7224621772766113, + "ref_logps/chosen": -35.848514556884766, + "ref_logps/rejected": -42.53636932373047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1410884857177734, + "rewards/margins": 0.676688551902771, + "rewards/rejected": -1.8177769184112549, + "step": 1411 + }, + { + "epoch": 1.33, + "grad_norm": 17.443208694458008, + "learning_rate": 3.086743616649178e-07, + "logps/chosen": -39.574119567871094, + "logps/rejected": -66.24822235107422, + "loss": 0.3471, + "losses/dpo": 0.4222831130027771, + "losses/sft": 1.7287424802780151, + "losses/total": 0.4222831130027771, + "ref_logps/chosen": -31.801837921142578, + "ref_logps/rejected": -44.20322799682617, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7772287130355835, + "rewards/margins": 1.4272706508636475, + "rewards/rejected": -2.2044992446899414, + "step": 1412 + }, + { + "epoch": 1.33, + "grad_norm": 19.533376693725586, + "learning_rate": 3.084994753410283e-07, + "logps/chosen": -49.30174255371094, + "logps/rejected": -71.27017211914062, + "loss": 0.2996, + "losses/dpo": 0.17927199602127075, + "losses/sft": 1.7610194683074951, + "losses/total": 0.17927199602127075, + "ref_logps/chosen": -38.06349563598633, + "ref_logps/rejected": -45.71255874633789, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1238248348236084, + "rewards/margins": 1.4319367408752441, + "rewards/rejected": -2.5557613372802734, + "step": 1413 + }, + { + "epoch": 1.34, + "grad_norm": 20.28409767150879, + "learning_rate": 3.083245890171389e-07, + "logps/chosen": -44.53003692626953, + "logps/rejected": -58.603981018066406, + "loss": 0.3487, + "losses/dpo": 0.4930720329284668, + "losses/sft": 1.788412094116211, + "losses/total": 0.4930720329284668, + "ref_logps/chosen": -34.771034240722656, + "ref_logps/rejected": -37.61746597290039, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9759001731872559, + "rewards/margins": 1.1227518320083618, + "rewards/rejected": -2.098651885986328, + "step": 1414 + }, + { + "epoch": 1.34, + "grad_norm": 24.667396545410156, + "learning_rate": 3.0814970269324935e-07, + "logps/chosen": -45.952110290527344, + "logps/rejected": -62.221256256103516, + "loss": 0.4718, + "losses/dpo": 0.6820125579833984, + "losses/sft": 2.0642268657684326, + "losses/total": 0.6820125579833984, + "ref_logps/chosen": -34.38420104980469, + "ref_logps/rejected": -40.26169204711914, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.156791090965271, + "rewards/margins": 1.0391653776168823, + "rewards/rejected": -2.1959567070007324, + "step": 1415 + }, + { + "epoch": 1.34, + "grad_norm": 15.919194221496582, + "learning_rate": 3.079748163693599e-07, + "logps/chosen": -42.049278259277344, + "logps/rejected": -68.30839538574219, + "loss": 0.3513, + "losses/dpo": 0.21697184443473816, + "losses/sft": 2.0377728939056396, + "losses/total": 0.21697184443473816, + "ref_logps/chosen": -32.20050811767578, + "ref_logps/rejected": -44.524757385253906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9848769903182983, + "rewards/margins": 1.3934866189956665, + "rewards/rejected": -2.378363609313965, + "step": 1416 + }, + { + "epoch": 1.34, + "grad_norm": 22.401674270629883, + "learning_rate": 3.077999300454704e-07, + "logps/chosen": -53.42433166503906, + "logps/rejected": -68.79466247558594, + "loss": 0.4149, + "losses/dpo": 0.3252949118614197, + "losses/sft": 1.8426626920700073, + "losses/total": 0.3252949118614197, + "ref_logps/chosen": -40.75001525878906, + "ref_logps/rejected": -44.611534118652344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.267431616783142, + "rewards/margins": 1.1508824825286865, + "rewards/rejected": -2.418313980102539, + "step": 1417 + }, + { + "epoch": 1.34, + "grad_norm": 23.449138641357422, + "learning_rate": 3.0762504372158094e-07, + "logps/chosen": -56.36857604980469, + "logps/rejected": -58.75748062133789, + "loss": 0.4542, + "losses/dpo": 0.45658814907073975, + "losses/sft": 1.5619760751724243, + "losses/total": 0.45658814907073975, + "ref_logps/chosen": -42.70182800292969, + "ref_logps/rejected": -36.784507751464844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3666751384735107, + "rewards/margins": 0.8306224942207336, + "rewards/rejected": -2.1972975730895996, + "step": 1418 + }, + { + "epoch": 1.34, + "grad_norm": 24.037595748901367, + "learning_rate": 3.074501573976915e-07, + "logps/chosen": -56.9433708190918, + "logps/rejected": -70.42808532714844, + "loss": 0.3817, + "losses/dpo": 0.2560586631298065, + "losses/sft": 1.9976487159729004, + "losses/total": 0.2560586631298065, + "ref_logps/chosen": -41.48696517944336, + "ref_logps/rejected": -40.96171188354492, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.545640468597412, + "rewards/margins": 1.4009971618652344, + "rewards/rejected": -2.9466376304626465, + "step": 1419 + }, + { + "epoch": 1.34, + "grad_norm": 38.60078430175781, + "learning_rate": 3.07275271073802e-07, + "logps/chosen": -57.37983703613281, + "logps/rejected": -69.86796569824219, + "loss": 0.578, + "losses/dpo": 0.7916027307510376, + "losses/sft": 2.2825205326080322, + "losses/total": 0.7916027307510376, + "ref_logps/chosen": -38.82680130004883, + "ref_logps/rejected": -44.12309265136719, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.8553032875061035, + "rewards/margins": 0.719184160232544, + "rewards/rejected": -2.5744876861572266, + "step": 1420 + }, + { + "epoch": 1.34, + "grad_norm": 14.173458099365234, + "learning_rate": 3.071003847499126e-07, + "logps/chosen": -41.91478729248047, + "logps/rejected": -71.73396301269531, + "loss": 0.2214, + "losses/dpo": 0.10589051246643066, + "losses/sft": 1.3087167739868164, + "losses/total": 0.10589051246643066, + "ref_logps/chosen": -34.511016845703125, + "ref_logps/rejected": -42.56721496582031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7403774261474609, + "rewards/margins": 2.1762967109680176, + "rewards/rejected": -2.9166741371154785, + "step": 1421 + }, + { + "epoch": 1.34, + "grad_norm": 40.23226547241211, + "learning_rate": 3.0692549842602304e-07, + "logps/chosen": -50.16059875488281, + "logps/rejected": -45.11272430419922, + "loss": 0.9759, + "losses/dpo": 1.4183539152145386, + "losses/sft": 2.296969413757324, + "losses/total": 1.4183539152145386, + "ref_logps/chosen": -31.263580322265625, + "ref_logps/rejected": -27.629045486450195, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.8897016048431396, + "rewards/margins": -0.14133356511592865, + "rewards/rejected": -1.7483680248260498, + "step": 1422 + }, + { + "epoch": 1.34, + "grad_norm": 25.430992126464844, + "learning_rate": 3.067506121021336e-07, + "logps/chosen": -56.25402069091797, + "logps/rejected": -82.1207275390625, + "loss": 0.372, + "losses/dpo": 0.34306663274765015, + "losses/sft": 1.7412848472595215, + "losses/total": 0.34306663274765015, + "ref_logps/chosen": -40.79608154296875, + "ref_logps/rejected": -54.380760192871094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.545793890953064, + "rewards/margins": 1.22820246219635, + "rewards/rejected": -2.773996591567993, + "step": 1423 + }, + { + "epoch": 1.34, + "grad_norm": 31.236656188964844, + "learning_rate": 3.065757257782441e-07, + "logps/chosen": -59.461517333984375, + "logps/rejected": -70.31063842773438, + "loss": 0.4914, + "losses/dpo": 0.8357799649238586, + "losses/sft": 2.1770999431610107, + "losses/total": 0.8357799649238586, + "ref_logps/chosen": -48.422119140625, + "ref_logps/rejected": -46.93023681640625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1039400100708008, + "rewards/margins": 1.2341006994247437, + "rewards/rejected": -2.338040590286255, + "step": 1424 + }, + { + "epoch": 1.35, + "grad_norm": 16.982175827026367, + "learning_rate": 3.0640083945435463e-07, + "logps/chosen": -50.047523498535156, + "logps/rejected": -78.69482421875, + "loss": 0.2798, + "losses/dpo": 0.4208105802536011, + "losses/sft": 1.4603816270828247, + "losses/total": 0.4208105802536011, + "ref_logps/chosen": -36.75709533691406, + "ref_logps/rejected": -50.03679656982422, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3290431499481201, + "rewards/margins": 1.5367591381072998, + "rewards/rejected": -2.865802049636841, + "step": 1425 + }, + { + "epoch": 1.35, + "grad_norm": 20.309284210205078, + "learning_rate": 3.062259531304652e-07, + "logps/chosen": -42.76316833496094, + "logps/rejected": -71.7971420288086, + "loss": 0.3374, + "losses/dpo": 0.29878053069114685, + "losses/sft": 1.239473819732666, + "losses/total": 0.29878053069114685, + "ref_logps/chosen": -30.08619499206543, + "ref_logps/rejected": -45.589942932128906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2676975727081299, + "rewards/margins": 1.3530223369598389, + "rewards/rejected": -2.6207199096679688, + "step": 1426 + }, + { + "epoch": 1.35, + "grad_norm": 24.290002822875977, + "learning_rate": 3.060510668065757e-07, + "logps/chosen": -48.008941650390625, + "logps/rejected": -68.45893859863281, + "loss": 0.4337, + "losses/dpo": 0.7064761519432068, + "losses/sft": 1.5562361478805542, + "losses/total": 0.7064761519432068, + "ref_logps/chosen": -33.58290100097656, + "ref_logps/rejected": -42.375648498535156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4426039457321167, + "rewards/margins": 1.1657254695892334, + "rewards/rejected": -2.6083292961120605, + "step": 1427 + }, + { + "epoch": 1.35, + "grad_norm": 21.337886810302734, + "learning_rate": 3.058761804826863e-07, + "logps/chosen": -42.58956527709961, + "logps/rejected": -69.51599884033203, + "loss": 0.348, + "losses/dpo": 0.37191101908683777, + "losses/sft": 1.7934852838516235, + "losses/total": 0.37191101908683777, + "ref_logps/chosen": -30.46814727783203, + "ref_logps/rejected": -41.44960403442383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2121418714523315, + "rewards/margins": 1.5944979190826416, + "rewards/rejected": -2.8066396713256836, + "step": 1428 + }, + { + "epoch": 1.35, + "grad_norm": 30.51302719116211, + "learning_rate": 3.0570129415879673e-07, + "logps/chosen": -57.298519134521484, + "logps/rejected": -81.44483947753906, + "loss": 0.5259, + "losses/dpo": 0.6635831594467163, + "losses/sft": 1.7359380722045898, + "losses/total": 0.6635831594467163, + "ref_logps/chosen": -40.06396484375, + "ref_logps/rejected": -50.34728240966797, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7234556674957275, + "rewards/margins": 1.386299967765808, + "rewards/rejected": -3.109755754470825, + "step": 1429 + }, + { + "epoch": 1.35, + "grad_norm": 28.02737045288086, + "learning_rate": 3.055264078349073e-07, + "logps/chosen": -41.67241668701172, + "logps/rejected": -62.38465881347656, + "loss": 0.5008, + "losses/dpo": 0.2610743045806885, + "losses/sft": 0.9479192495346069, + "losses/total": 0.2610743045806885, + "ref_logps/chosen": -29.95924186706543, + "ref_logps/rejected": -41.242156982421875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1713175773620605, + "rewards/margins": 0.9429323673248291, + "rewards/rejected": -2.1142499446868896, + "step": 1430 + }, + { + "epoch": 1.35, + "grad_norm": 25.366226196289062, + "learning_rate": 3.0535152151101787e-07, + "logps/chosen": -56.33328628540039, + "logps/rejected": -80.22727966308594, + "loss": 0.4365, + "losses/dpo": 0.8064418435096741, + "losses/sft": 2.8340845108032227, + "losses/total": 0.8064418435096741, + "ref_logps/chosen": -38.87138366699219, + "ref_logps/rejected": -50.27754211425781, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7461903095245361, + "rewards/margins": 1.2487839460372925, + "rewards/rejected": -2.994974374771118, + "step": 1431 + }, + { + "epoch": 1.35, + "grad_norm": 26.362878799438477, + "learning_rate": 3.051766351871283e-07, + "logps/chosen": -56.68375015258789, + "logps/rejected": -76.91268157958984, + "loss": 0.4751, + "losses/dpo": 0.31512925028800964, + "losses/sft": 1.7146873474121094, + "losses/total": 0.31512925028800964, + "ref_logps/chosen": -39.540618896484375, + "ref_logps/rejected": -48.338619232177734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.71431303024292, + "rewards/margins": 1.143093228340149, + "rewards/rejected": -2.8574063777923584, + "step": 1432 + }, + { + "epoch": 1.35, + "grad_norm": 14.078072547912598, + "learning_rate": 3.050017488632389e-07, + "logps/chosen": -42.78202819824219, + "logps/rejected": -55.35765075683594, + "loss": 0.2449, + "losses/dpo": 0.2670789062976837, + "losses/sft": 1.6147735118865967, + "losses/total": 0.2670789062976837, + "ref_logps/chosen": -32.7816276550293, + "ref_logps/rejected": -29.931459426879883, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0000396966934204, + "rewards/margins": 1.542580008506775, + "rewards/rejected": -2.542619466781616, + "step": 1433 + }, + { + "epoch": 1.35, + "grad_norm": 16.286357879638672, + "learning_rate": 3.048268625393494e-07, + "logps/chosen": -61.516666412353516, + "logps/rejected": -71.70602416992188, + "loss": 0.2722, + "losses/dpo": 0.5190421938896179, + "losses/sft": 1.8730074167251587, + "losses/total": 0.5190421938896179, + "ref_logps/chosen": -44.22786331176758, + "ref_logps/rejected": -39.38705825805664, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7288801670074463, + "rewards/margins": 1.503016710281372, + "rewards/rejected": -3.2318968772888184, + "step": 1434 + }, + { + "epoch": 1.36, + "grad_norm": 18.778362274169922, + "learning_rate": 3.0465197621545997e-07, + "logps/chosen": -51.56160354614258, + "logps/rejected": -79.54507446289062, + "loss": 0.2966, + "losses/dpo": 0.3342849016189575, + "losses/sft": 1.7648053169250488, + "losses/total": 0.3342849016189575, + "ref_logps/chosen": -40.895713806152344, + "ref_logps/rejected": -54.30652618408203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0665889978408813, + "rewards/margins": 1.4572656154632568, + "rewards/rejected": -2.5238547325134277, + "step": 1435 + }, + { + "epoch": 1.36, + "grad_norm": 21.250564575195312, + "learning_rate": 3.0447708989157043e-07, + "logps/chosen": -52.267730712890625, + "logps/rejected": -68.39076232910156, + "loss": 0.3329, + "losses/dpo": 0.338424414396286, + "losses/sft": 1.6868879795074463, + "losses/total": 0.338424414396286, + "ref_logps/chosen": -37.6220588684082, + "ref_logps/rejected": -38.4321403503418, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4645671844482422, + "rewards/margins": 1.5312950611114502, + "rewards/rejected": -2.9958624839782715, + "step": 1436 + }, + { + "epoch": 1.36, + "grad_norm": 23.66788673400879, + "learning_rate": 3.04302203567681e-07, + "logps/chosen": -49.62910461425781, + "logps/rejected": -62.755252838134766, + "loss": 0.3298, + "losses/dpo": 0.18312913179397583, + "losses/sft": 1.5485693216323853, + "losses/total": 0.18312913179397583, + "ref_logps/chosen": -37.44635009765625, + "ref_logps/rejected": -36.1099967956543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2182754278182983, + "rewards/margins": 1.4462497234344482, + "rewards/rejected": -2.664525270462036, + "step": 1437 + }, + { + "epoch": 1.36, + "grad_norm": 18.332170486450195, + "learning_rate": 3.0412731724379156e-07, + "logps/chosen": -51.35747528076172, + "logps/rejected": -77.13438415527344, + "loss": 0.2693, + "losses/dpo": 0.5719197392463684, + "losses/sft": 1.5391196012496948, + "losses/total": 0.5719197392463684, + "ref_logps/chosen": -41.13132858276367, + "ref_logps/rejected": -46.666290283203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.02261483669281, + "rewards/margins": 2.0241949558258057, + "rewards/rejected": -3.046809673309326, + "step": 1438 + }, + { + "epoch": 1.36, + "grad_norm": 35.252159118652344, + "learning_rate": 3.03952430919902e-07, + "logps/chosen": -48.03419494628906, + "logps/rejected": -80.33631134033203, + "loss": 0.559, + "losses/dpo": 0.896109938621521, + "losses/sft": 3.0311272144317627, + "losses/total": 0.896109938621521, + "ref_logps/chosen": -30.956846237182617, + "ref_logps/rejected": -51.07893371582031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7077350616455078, + "rewards/margins": 1.218003511428833, + "rewards/rejected": -2.9257383346557617, + "step": 1439 + }, + { + "epoch": 1.36, + "grad_norm": 23.185909271240234, + "learning_rate": 3.037775445960126e-07, + "logps/chosen": -53.763240814208984, + "logps/rejected": -66.82176208496094, + "loss": 0.471, + "losses/dpo": 0.2621922492980957, + "losses/sft": 1.472888469696045, + "losses/total": 0.2621922492980957, + "ref_logps/chosen": -37.556148529052734, + "ref_logps/rejected": -42.45654296875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6207095384597778, + "rewards/margins": 0.8158121109008789, + "rewards/rejected": -2.436521530151367, + "step": 1440 + }, + { + "epoch": 1.36, + "grad_norm": 21.757980346679688, + "learning_rate": 3.036026582721231e-07, + "logps/chosen": -57.9247932434082, + "logps/rejected": -71.8721923828125, + "loss": 0.3349, + "losses/dpo": 0.2935296893119812, + "losses/sft": 1.7554571628570557, + "losses/total": 0.2935296893119812, + "ref_logps/chosen": -43.217811584472656, + "ref_logps/rejected": -42.12742614746094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.470698595046997, + "rewards/margins": 1.5037784576416016, + "rewards/rejected": -2.9744772911071777, + "step": 1441 + }, + { + "epoch": 1.36, + "grad_norm": 22.38652801513672, + "learning_rate": 3.0342777194823366e-07, + "logps/chosen": -53.57727813720703, + "logps/rejected": -87.47329711914062, + "loss": 0.3578, + "losses/dpo": 0.18158069252967834, + "losses/sft": 1.5796535015106201, + "losses/total": 0.18158069252967834, + "ref_logps/chosen": -37.418819427490234, + "ref_logps/rejected": -52.9796142578125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.615846037864685, + "rewards/margins": 1.8335225582122803, + "rewards/rejected": -3.449368715286255, + "step": 1442 + }, + { + "epoch": 1.36, + "grad_norm": 21.699548721313477, + "learning_rate": 3.032528856243441e-07, + "logps/chosen": -64.91116333007812, + "logps/rejected": -92.02908325195312, + "loss": 0.3342, + "losses/dpo": 0.3280050754547119, + "losses/sft": 1.766418218612671, + "losses/total": 0.3280050754547119, + "ref_logps/chosen": -46.51850128173828, + "ref_logps/rejected": -56.66664123535156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.839266061782837, + "rewards/margins": 1.6969778537750244, + "rewards/rejected": -3.5362439155578613, + "step": 1443 + }, + { + "epoch": 1.36, + "grad_norm": 25.19713592529297, + "learning_rate": 3.030779993004547e-07, + "logps/chosen": -38.48619842529297, + "logps/rejected": -51.78202819824219, + "loss": 0.5595, + "losses/dpo": 0.8129055500030518, + "losses/sft": 3.3074753284454346, + "losses/total": 0.8129055500030518, + "ref_logps/chosen": -25.25677490234375, + "ref_logps/rejected": -31.497846603393555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3229424953460693, + "rewards/margins": 0.7054756283760071, + "rewards/rejected": -2.0284180641174316, + "step": 1444 + }, + { + "epoch": 1.36, + "grad_norm": 24.862638473510742, + "learning_rate": 3.0290311297656525e-07, + "logps/chosen": -64.99393463134766, + "logps/rejected": -86.62968444824219, + "loss": 0.4145, + "losses/dpo": 0.23780396580696106, + "losses/sft": 2.1835410594940186, + "losses/total": 0.23780396580696106, + "ref_logps/chosen": -47.015663146972656, + "ref_logps/rejected": -55.13080596923828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7978276014328003, + "rewards/margins": 1.352060317993164, + "rewards/rejected": -3.149887800216675, + "step": 1445 + }, + { + "epoch": 1.37, + "grad_norm": 26.846160888671875, + "learning_rate": 3.027282266526757e-07, + "logps/chosen": -48.17491912841797, + "logps/rejected": -71.64388275146484, + "loss": 0.5015, + "losses/dpo": 0.41429728269577026, + "losses/sft": 1.542240858078003, + "losses/total": 0.41429728269577026, + "ref_logps/chosen": -36.93295669555664, + "ref_logps/rejected": -49.83993911743164, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1241968870162964, + "rewards/margins": 1.0561978816986084, + "rewards/rejected": -2.1803946495056152, + "step": 1446 + }, + { + "epoch": 1.37, + "grad_norm": 17.389711380004883, + "learning_rate": 3.025533403287863e-07, + "logps/chosen": -64.23756408691406, + "logps/rejected": -86.47703552246094, + "loss": 0.2239, + "losses/dpo": 0.20563524961471558, + "losses/sft": 1.9172698259353638, + "losses/total": 0.20563524961471558, + "ref_logps/chosen": -44.17626953125, + "ref_logps/rejected": -47.868324279785156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0061302185058594, + "rewards/margins": 1.8547405004501343, + "rewards/rejected": -3.860870599746704, + "step": 1447 + }, + { + "epoch": 1.37, + "grad_norm": 25.225473403930664, + "learning_rate": 3.023784540048968e-07, + "logps/chosen": -53.32830810546875, + "logps/rejected": -63.27170944213867, + "loss": 0.4693, + "losses/dpo": 0.6495373845100403, + "losses/sft": 1.8110337257385254, + "losses/total": 0.6495373845100403, + "ref_logps/chosen": -39.959617614746094, + "ref_logps/rejected": -37.68694305419922, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3368688821792603, + "rewards/margins": 1.22160804271698, + "rewards/rejected": -2.5584769248962402, + "step": 1448 + }, + { + "epoch": 1.37, + "grad_norm": 21.963321685791016, + "learning_rate": 3.0220356768100736e-07, + "logps/chosen": -61.472476959228516, + "logps/rejected": -81.95194244384766, + "loss": 0.3652, + "losses/dpo": 0.244198277592659, + "losses/sft": 1.7694854736328125, + "losses/total": 0.244198277592659, + "ref_logps/chosen": -42.86567687988281, + "ref_logps/rejected": -47.24297332763672, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8606804609298706, + "rewards/margins": 1.6102163791656494, + "rewards/rejected": -3.4708969593048096, + "step": 1449 + }, + { + "epoch": 1.37, + "grad_norm": 20.733707427978516, + "learning_rate": 3.020286813571178e-07, + "logps/chosen": -53.143592834472656, + "logps/rejected": -80.09049224853516, + "loss": 0.4151, + "losses/dpo": 0.46113601326942444, + "losses/sft": 1.6883480548858643, + "losses/total": 0.46113601326942444, + "ref_logps/chosen": -39.02067565917969, + "ref_logps/rejected": -50.30956268310547, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.412292242050171, + "rewards/margins": 1.5658011436462402, + "rewards/rejected": -2.978093385696411, + "step": 1450 + }, + { + "epoch": 1.37, + "grad_norm": 15.747688293457031, + "learning_rate": 3.018537950332284e-07, + "logps/chosen": -35.794960021972656, + "logps/rejected": -54.41717529296875, + "loss": 0.3542, + "losses/dpo": 0.4572751522064209, + "losses/sft": 1.935394525527954, + "losses/total": 0.4572751522064209, + "ref_logps/chosen": -27.21696662902832, + "ref_logps/rejected": -32.192317962646484, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8577994704246521, + "rewards/margins": 1.3646862506866455, + "rewards/rejected": -2.2224857807159424, + "step": 1451 + }, + { + "epoch": 1.37, + "grad_norm": 27.470935821533203, + "learning_rate": 3.0167890870933895e-07, + "logps/chosen": -54.65156936645508, + "logps/rejected": -64.08354187011719, + "loss": 0.4168, + "losses/dpo": 0.21041765809059143, + "losses/sft": 1.51226007938385, + "losses/total": 0.21041765809059143, + "ref_logps/chosen": -41.28759765625, + "ref_logps/rejected": -38.83619689941406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.336397409439087, + "rewards/margins": 1.1883370876312256, + "rewards/rejected": -2.5247344970703125, + "step": 1452 + }, + { + "epoch": 1.37, + "grad_norm": 32.393550872802734, + "learning_rate": 3.015040223854494e-07, + "logps/chosen": -62.701499938964844, + "logps/rejected": -89.2132568359375, + "loss": 0.4346, + "losses/dpo": 0.16079044342041016, + "losses/sft": 1.9983454942703247, + "losses/total": 0.16079044342041016, + "ref_logps/chosen": -44.023834228515625, + "ref_logps/rejected": -56.803733825683594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8677663803100586, + "rewards/margins": 1.373186469078064, + "rewards/rejected": -3.240952730178833, + "step": 1453 + }, + { + "epoch": 1.37, + "grad_norm": 35.32954788208008, + "learning_rate": 3.0132913606156e-07, + "logps/chosen": -48.779273986816406, + "logps/rejected": -63.26791763305664, + "loss": 0.5552, + "losses/dpo": 0.24887573719024658, + "losses/sft": 2.008256196975708, + "losses/total": 0.24887573719024658, + "ref_logps/chosen": -34.924224853515625, + "ref_logps/rejected": -38.76007080078125, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3855046033859253, + "rewards/margins": 1.0652801990509033, + "rewards/rejected": -2.450784683227539, + "step": 1454 + }, + { + "epoch": 1.37, + "grad_norm": 18.07564926147461, + "learning_rate": 3.011542497376705e-07, + "logps/chosen": -50.26887512207031, + "logps/rejected": -73.92079162597656, + "loss": 0.3222, + "losses/dpo": 0.3506878912448883, + "losses/sft": 1.500986099243164, + "losses/total": 0.3506878912448883, + "ref_logps/chosen": -36.34586715698242, + "ref_logps/rejected": -47.58843231201172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3923009634017944, + "rewards/margins": 1.2409355640411377, + "rewards/rejected": -2.6332364082336426, + "step": 1455 + }, + { + "epoch": 1.37, + "grad_norm": 29.577266693115234, + "learning_rate": 3.0097936341378105e-07, + "logps/chosen": -49.849891662597656, + "logps/rejected": -57.964447021484375, + "loss": 0.5189, + "losses/dpo": 0.8094742298126221, + "losses/sft": 2.1526012420654297, + "losses/total": 0.8094742298126221, + "ref_logps/chosen": -34.91853713989258, + "ref_logps/rejected": -33.85536193847656, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4931353330612183, + "rewards/margins": 0.9177733659744263, + "rewards/rejected": -2.4109086990356445, + "step": 1456 + }, + { + "epoch": 1.38, + "grad_norm": 18.888126373291016, + "learning_rate": 3.0080447708989156e-07, + "logps/chosen": -45.32219314575195, + "logps/rejected": -58.74849319458008, + "loss": 0.4271, + "losses/dpo": 0.2755812406539917, + "losses/sft": 1.6777021884918213, + "losses/total": 0.2755812406539917, + "ref_logps/chosen": -30.581050872802734, + "ref_logps/rejected": -34.79866027832031, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4741144180297852, + "rewards/margins": 0.9208693504333496, + "rewards/rejected": -2.3949837684631348, + "step": 1457 + }, + { + "epoch": 1.38, + "grad_norm": 17.708778381347656, + "learning_rate": 3.006295907660021e-07, + "logps/chosen": -35.59089279174805, + "logps/rejected": -53.55289077758789, + "loss": 0.3655, + "losses/dpo": 0.2321612536907196, + "losses/sft": 1.954699158668518, + "losses/total": 0.2321612536907196, + "ref_logps/chosen": -25.250795364379883, + "ref_logps/rejected": -30.8167724609375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0340098142623901, + "rewards/margins": 1.239601731300354, + "rewards/rejected": -2.273611545562744, + "step": 1458 + }, + { + "epoch": 1.38, + "grad_norm": 27.67070198059082, + "learning_rate": 3.0045470444211264e-07, + "logps/chosen": -70.18630981445312, + "logps/rejected": -73.20052337646484, + "loss": 0.5085, + "losses/dpo": 0.4248794615268707, + "losses/sft": 1.938214659690857, + "losses/total": 0.4248794615268707, + "ref_logps/chosen": -53.95916748046875, + "ref_logps/rejected": -47.098670959472656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6227140426635742, + "rewards/margins": 0.9874710440635681, + "rewards/rejected": -2.610185146331787, + "step": 1459 + }, + { + "epoch": 1.38, + "grad_norm": 32.969573974609375, + "learning_rate": 3.002798181182231e-07, + "logps/chosen": -54.15985870361328, + "logps/rejected": -62.675636291503906, + "loss": 0.6297, + "losses/dpo": 0.2536635398864746, + "losses/sft": 1.526346206665039, + "losses/total": 0.2536635398864746, + "ref_logps/chosen": -35.572959899902344, + "ref_logps/rejected": -39.281192779541016, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.858689546585083, + "rewards/margins": 0.4807544946670532, + "rewards/rejected": -2.339444160461426, + "step": 1460 + }, + { + "epoch": 1.38, + "grad_norm": 27.672409057617188, + "learning_rate": 3.0010493179433367e-07, + "logps/chosen": -58.23644256591797, + "logps/rejected": -85.10983276367188, + "loss": 0.4249, + "losses/dpo": 0.39252281188964844, + "losses/sft": 1.9027915000915527, + "losses/total": 0.39252281188964844, + "ref_logps/chosen": -40.46833801269531, + "ref_logps/rejected": -53.45152282714844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7768104076385498, + "rewards/margins": 1.389020323753357, + "rewards/rejected": -3.1658310890197754, + "step": 1461 + }, + { + "epoch": 1.38, + "grad_norm": 18.22044563293457, + "learning_rate": 2.999300454704442e-07, + "logps/chosen": -59.07020568847656, + "logps/rejected": -64.58023071289062, + "loss": 0.332, + "losses/dpo": 0.5386080741882324, + "losses/sft": 1.5160285234451294, + "losses/total": 0.5386080741882324, + "ref_logps/chosen": -42.942867279052734, + "ref_logps/rejected": -35.57465362548828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6127336025238037, + "rewards/margins": 1.2878247499465942, + "rewards/rejected": -2.9005582332611084, + "step": 1462 + }, + { + "epoch": 1.38, + "grad_norm": 17.094030380249023, + "learning_rate": 2.9975515914655475e-07, + "logps/chosen": -38.967994689941406, + "logps/rejected": -64.69197082519531, + "loss": 0.3281, + "losses/dpo": 0.19840840995311737, + "losses/sft": 1.487196445465088, + "losses/total": 0.19840840995311737, + "ref_logps/chosen": -30.22754669189453, + "ref_logps/rejected": -40.487152099609375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8740450143814087, + "rewards/margins": 1.5464364290237427, + "rewards/rejected": -2.4204814434051514, + "step": 1463 + }, + { + "epoch": 1.38, + "grad_norm": 15.936551094055176, + "learning_rate": 2.9958027282266526e-07, + "logps/chosen": -43.809566497802734, + "logps/rejected": -52.186256408691406, + "loss": 0.3292, + "losses/dpo": 0.1375245749950409, + "losses/sft": 1.757676601409912, + "losses/total": 0.1375245749950409, + "ref_logps/chosen": -30.919689178466797, + "ref_logps/rejected": -26.887386322021484, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2889879941940308, + "rewards/margins": 1.2408989667892456, + "rewards/rejected": -2.5298869609832764, + "step": 1464 + }, + { + "epoch": 1.38, + "grad_norm": 17.255407333374023, + "learning_rate": 2.9940538649877577e-07, + "logps/chosen": -60.7708854675293, + "logps/rejected": -95.71295928955078, + "loss": 0.2841, + "losses/dpo": 0.23410826921463013, + "losses/sft": 2.4984688758850098, + "losses/total": 0.23410826921463013, + "ref_logps/chosen": -43.045684814453125, + "ref_logps/rejected": -57.78691864013672, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.772519826889038, + "rewards/margins": 2.020085096359253, + "rewards/rejected": -3.792604923248291, + "step": 1465 + }, + { + "epoch": 1.38, + "grad_norm": 31.450180053710938, + "learning_rate": 2.9923050017488634e-07, + "logps/chosen": -47.75181579589844, + "logps/rejected": -50.85969161987305, + "loss": 0.673, + "losses/dpo": 0.7621719241142273, + "losses/sft": 2.1680119037628174, + "losses/total": 0.7621719241142273, + "ref_logps/chosen": -31.87733268737793, + "ref_logps/rejected": -30.926931381225586, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5874487161636353, + "rewards/margins": 0.40582719445228577, + "rewards/rejected": -1.9932758808135986, + "step": 1466 + }, + { + "epoch": 1.39, + "grad_norm": 19.149681091308594, + "learning_rate": 2.990556138509968e-07, + "logps/chosen": -57.261173248291016, + "logps/rejected": -82.31720733642578, + "loss": 0.2558, + "losses/dpo": 0.191327303647995, + "losses/sft": 1.7014291286468506, + "losses/total": 0.191327303647995, + "ref_logps/chosen": -38.61812973022461, + "ref_logps/rejected": -47.27553176879883, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8643043041229248, + "rewards/margins": 1.6398630142211914, + "rewards/rejected": -3.504167318344116, + "step": 1467 + }, + { + "epoch": 1.39, + "grad_norm": 28.514707565307617, + "learning_rate": 2.9888072752710736e-07, + "logps/chosen": -63.502899169921875, + "logps/rejected": -64.31593322753906, + "loss": 0.6183, + "losses/dpo": 0.6624680161476135, + "losses/sft": 2.3497395515441895, + "losses/total": 0.6624680161476135, + "ref_logps/chosen": -46.184349060058594, + "ref_logps/rejected": -40.362693786621094, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7318546772003174, + "rewards/margins": 0.6634698510169983, + "rewards/rejected": -2.395324468612671, + "step": 1468 + }, + { + "epoch": 1.39, + "grad_norm": 25.650829315185547, + "learning_rate": 2.987058412032179e-07, + "logps/chosen": -53.402183532714844, + "logps/rejected": -61.13035583496094, + "loss": 0.4607, + "losses/dpo": 0.10315266996622086, + "losses/sft": 1.7648452520370483, + "losses/total": 0.10315266996622086, + "ref_logps/chosen": -42.29344940185547, + "ref_logps/rejected": -38.432212829589844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1108734607696533, + "rewards/margins": 1.1589409112930298, + "rewards/rejected": -2.2698144912719727, + "step": 1469 + }, + { + "epoch": 1.39, + "grad_norm": 25.22063446044922, + "learning_rate": 2.9853095487932844e-07, + "logps/chosen": -44.644264221191406, + "logps/rejected": -57.26899719238281, + "loss": 0.4648, + "losses/dpo": 0.6071901917457581, + "losses/sft": 2.1052732467651367, + "losses/total": 0.6071901917457581, + "ref_logps/chosen": -32.22168731689453, + "ref_logps/rejected": -34.735984802246094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2422577142715454, + "rewards/margins": 1.0110435485839844, + "rewards/rejected": -2.2533011436462402, + "step": 1470 + }, + { + "epoch": 1.39, + "grad_norm": 24.80872344970703, + "learning_rate": 2.9835606855543895e-07, + "logps/chosen": -56.374412536621094, + "logps/rejected": -71.41012573242188, + "loss": 0.4466, + "losses/dpo": 0.7869513630867004, + "losses/sft": 2.471348285675049, + "losses/total": 0.7869513630867004, + "ref_logps/chosen": -43.42095184326172, + "ref_logps/rejected": -49.27517318725586, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2953461408615112, + "rewards/margins": 0.9181485772132874, + "rewards/rejected": -2.2134947776794434, + "step": 1471 + }, + { + "epoch": 1.39, + "grad_norm": 18.416790008544922, + "learning_rate": 2.9818118223154947e-07, + "logps/chosen": -61.26020812988281, + "logps/rejected": -78.95443725585938, + "loss": 0.3026, + "losses/dpo": 0.23834973573684692, + "losses/sft": 1.5854521989822388, + "losses/total": 0.23834973573684692, + "ref_logps/chosen": -49.4541015625, + "ref_logps/rejected": -51.38745880126953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1806107759475708, + "rewards/margins": 1.5760869979858398, + "rewards/rejected": -2.7566978931427, + "step": 1472 + }, + { + "epoch": 1.39, + "grad_norm": 15.231879234313965, + "learning_rate": 2.9800629590766003e-07, + "logps/chosen": -44.0016975402832, + "logps/rejected": -75.897705078125, + "loss": 0.248, + "losses/dpo": 0.1659824699163437, + "losses/sft": 1.1497304439544678, + "losses/total": 0.1659824699163437, + "ref_logps/chosen": -32.45808792114258, + "ref_logps/rejected": -46.080848693847656, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1543606519699097, + "rewards/margins": 1.827324628829956, + "rewards/rejected": -2.981685161590576, + "step": 1473 + }, + { + "epoch": 1.39, + "grad_norm": 21.670198440551758, + "learning_rate": 2.978314095837705e-07, + "logps/chosen": -45.749427795410156, + "logps/rejected": -63.82823181152344, + "loss": 0.4681, + "losses/dpo": 0.43698054552078247, + "losses/sft": 1.4650715589523315, + "losses/total": 0.43698054552078247, + "ref_logps/chosen": -31.394006729125977, + "ref_logps/rejected": -38.07575225830078, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.435542106628418, + "rewards/margins": 1.139705777168274, + "rewards/rejected": -2.5752477645874023, + "step": 1474 + }, + { + "epoch": 1.39, + "grad_norm": 19.23586082458496, + "learning_rate": 2.9765652325988106e-07, + "logps/chosen": -64.34100341796875, + "logps/rejected": -79.12982940673828, + "loss": 0.3976, + "losses/dpo": 0.64848393201828, + "losses/sft": 1.8697618246078491, + "losses/total": 0.64848393201828, + "ref_logps/chosen": -50.03496551513672, + "ref_logps/rejected": -53.19569778442383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4306037425994873, + "rewards/margins": 1.1628097295761108, + "rewards/rejected": -2.5934133529663086, + "step": 1475 + }, + { + "epoch": 1.39, + "grad_norm": 21.026540756225586, + "learning_rate": 2.974816369359916e-07, + "logps/chosen": -42.318729400634766, + "logps/rejected": -58.60890579223633, + "loss": 0.4032, + "losses/dpo": 0.33661365509033203, + "losses/sft": 1.560653805732727, + "losses/total": 0.33661365509033203, + "ref_logps/chosen": -31.113370895385742, + "ref_logps/rejected": -36.7552490234375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1205358505249023, + "rewards/margins": 1.06482994556427, + "rewards/rejected": -2.185365676879883, + "step": 1476 + }, + { + "epoch": 1.39, + "grad_norm": 17.171052932739258, + "learning_rate": 2.9730675061210214e-07, + "logps/chosen": -45.365013122558594, + "logps/rejected": -73.35462951660156, + "loss": 0.3509, + "losses/dpo": 0.44881758093833923, + "losses/sft": 1.5224173069000244, + "losses/total": 0.44881758093833923, + "ref_logps/chosen": -31.196611404418945, + "ref_logps/rejected": -45.02224349975586, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4168400764465332, + "rewards/margins": 1.4163987636566162, + "rewards/rejected": -2.8332388401031494, + "step": 1477 + }, + { + "epoch": 1.4, + "grad_norm": 18.62702178955078, + "learning_rate": 2.9713186428821265e-07, + "logps/chosen": -48.18018341064453, + "logps/rejected": -66.46004486083984, + "loss": 0.3649, + "losses/dpo": 0.37515488266944885, + "losses/sft": 2.106121778488159, + "losses/total": 0.37515488266944885, + "ref_logps/chosen": -33.46634292602539, + "ref_logps/rejected": -39.04322052001953, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4713842868804932, + "rewards/margins": 1.2702980041503906, + "rewards/rejected": -2.741682529449463, + "step": 1478 + }, + { + "epoch": 1.4, + "grad_norm": 22.654354095458984, + "learning_rate": 2.9695697796432316e-07, + "logps/chosen": -44.82452392578125, + "logps/rejected": -61.94343566894531, + "loss": 0.4275, + "losses/dpo": 0.40967732667922974, + "losses/sft": 1.5727530717849731, + "losses/total": 0.40967732667922974, + "ref_logps/chosen": -30.690963745117188, + "ref_logps/rejected": -37.72716522216797, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.413355827331543, + "rewards/margins": 1.0082708597183228, + "rewards/rejected": -2.4216268062591553, + "step": 1479 + }, + { + "epoch": 1.4, + "grad_norm": 28.83608627319336, + "learning_rate": 2.967820916404337e-07, + "logps/chosen": -53.062374114990234, + "logps/rejected": -74.76139831542969, + "loss": 0.4279, + "losses/dpo": 0.2415730357170105, + "losses/sft": 1.272810935974121, + "losses/total": 0.2415730357170105, + "ref_logps/chosen": -39.616756439208984, + "ref_logps/rejected": -50.39042663574219, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3445615768432617, + "rewards/margins": 1.092535376548767, + "rewards/rejected": -2.4370968341827393, + "step": 1480 + }, + { + "epoch": 1.4, + "grad_norm": 25.0146427154541, + "learning_rate": 2.966072053165442e-07, + "logps/chosen": -44.36920928955078, + "logps/rejected": -57.59333038330078, + "loss": 0.4528, + "losses/dpo": 0.2863396406173706, + "losses/sft": 1.8024622201919556, + "losses/total": 0.2863396406173706, + "ref_logps/chosen": -34.70337677001953, + "ref_logps/rejected": -36.049468994140625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9665828943252563, + "rewards/margins": 1.1878032684326172, + "rewards/rejected": -2.154386043548584, + "step": 1481 + }, + { + "epoch": 1.4, + "grad_norm": 22.59343719482422, + "learning_rate": 2.9643231899265475e-07, + "logps/chosen": -54.02422332763672, + "logps/rejected": -72.16030883789062, + "loss": 0.3826, + "losses/dpo": 0.44329535961151123, + "losses/sft": 1.5579030513763428, + "losses/total": 0.44329535961151123, + "ref_logps/chosen": -41.91730499267578, + "ref_logps/rejected": -46.37119674682617, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2106914520263672, + "rewards/margins": 1.3682198524475098, + "rewards/rejected": -2.578911304473877, + "step": 1482 + }, + { + "epoch": 1.4, + "grad_norm": 13.133771896362305, + "learning_rate": 2.962574326687653e-07, + "logps/chosen": -31.992694854736328, + "logps/rejected": -65.34564208984375, + "loss": 0.2487, + "losses/dpo": 0.1788356602191925, + "losses/sft": 1.3065111637115479, + "losses/total": 0.1788356602191925, + "ref_logps/chosen": -24.564327239990234, + "ref_logps/rejected": -38.47550964355469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7428367137908936, + "rewards/margins": 1.9441765546798706, + "rewards/rejected": -2.6870133876800537, + "step": 1483 + }, + { + "epoch": 1.4, + "grad_norm": 20.612218856811523, + "learning_rate": 2.9608254634487583e-07, + "logps/chosen": -48.381187438964844, + "logps/rejected": -68.538818359375, + "loss": 0.3926, + "losses/dpo": 0.40987130999565125, + "losses/sft": 1.2583063840866089, + "losses/total": 0.40987130999565125, + "ref_logps/chosen": -36.965511322021484, + "ref_logps/rejected": -43.162193298339844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1415679454803467, + "rewards/margins": 1.3960951566696167, + "rewards/rejected": -2.537662982940674, + "step": 1484 + }, + { + "epoch": 1.4, + "grad_norm": 21.078365325927734, + "learning_rate": 2.9590766002098634e-07, + "logps/chosen": -57.50191116333008, + "logps/rejected": -72.59945678710938, + "loss": 0.3103, + "losses/dpo": 0.3797740340232849, + "losses/sft": 1.9937254190444946, + "losses/total": 0.3797740340232849, + "ref_logps/chosen": -42.138980865478516, + "ref_logps/rejected": -43.61561965942383, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.536293387413025, + "rewards/margins": 1.362090826034546, + "rewards/rejected": -2.8983843326568604, + "step": 1485 + }, + { + "epoch": 1.4, + "grad_norm": 26.72555160522461, + "learning_rate": 2.9573277369709686e-07, + "logps/chosen": -60.041465759277344, + "logps/rejected": -71.0130386352539, + "loss": 0.4958, + "losses/dpo": 0.41573166847229004, + "losses/sft": 2.1062893867492676, + "losses/total": 0.41573166847229004, + "ref_logps/chosen": -43.17316436767578, + "ref_logps/rejected": -43.25284194946289, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6868302822113037, + "rewards/margins": 1.089189887046814, + "rewards/rejected": -2.776020050048828, + "step": 1486 + }, + { + "epoch": 1.4, + "grad_norm": 20.682126998901367, + "learning_rate": 2.955578873732074e-07, + "logps/chosen": -45.147117614746094, + "logps/rejected": -73.24772644042969, + "loss": 0.3537, + "losses/dpo": 0.20876505970954895, + "losses/sft": 1.2642250061035156, + "losses/total": 0.20876505970954895, + "ref_logps/chosen": -32.60017395019531, + "ref_logps/rejected": -46.0899772644043, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2546948194503784, + "rewards/margins": 1.4610795974731445, + "rewards/rejected": -2.7157745361328125, + "step": 1487 + }, + { + "epoch": 1.41, + "grad_norm": 24.359195709228516, + "learning_rate": 2.953830010493179e-07, + "logps/chosen": -52.26470184326172, + "logps/rejected": -60.49019241333008, + "loss": 0.4173, + "losses/dpo": 0.5849593877792358, + "losses/sft": 1.9212629795074463, + "losses/total": 0.5849593877792358, + "ref_logps/chosen": -42.57030487060547, + "ref_logps/rejected": -39.783363342285156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9694399237632751, + "rewards/margins": 1.1012427806854248, + "rewards/rejected": -2.0706825256347656, + "step": 1488 + }, + { + "epoch": 1.41, + "grad_norm": 30.23659896850586, + "learning_rate": 2.9520811472542845e-07, + "logps/chosen": -50.95896911621094, + "logps/rejected": -55.51099395751953, + "loss": 0.511, + "losses/dpo": 0.6571680307388306, + "losses/sft": 1.5848480463027954, + "losses/total": 0.6571680307388306, + "ref_logps/chosen": -37.404823303222656, + "ref_logps/rejected": -31.563274383544922, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.355414628982544, + "rewards/margins": 1.0393571853637695, + "rewards/rejected": -2.3947718143463135, + "step": 1489 + }, + { + "epoch": 1.41, + "grad_norm": 22.450220108032227, + "learning_rate": 2.95033228401539e-07, + "logps/chosen": -45.04136657714844, + "logps/rejected": -64.29212951660156, + "loss": 0.4041, + "losses/dpo": 0.5002979636192322, + "losses/sft": 1.9214532375335693, + "losses/total": 0.5002979636192322, + "ref_logps/chosen": -34.148441314697266, + "ref_logps/rejected": -40.619911193847656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0892921686172485, + "rewards/margins": 1.2779297828674316, + "rewards/rejected": -2.3672218322753906, + "step": 1490 + }, + { + "epoch": 1.41, + "grad_norm": 32.676029205322266, + "learning_rate": 2.948583420776495e-07, + "logps/chosen": -47.213138580322266, + "logps/rejected": -56.650901794433594, + "loss": 0.7712, + "losses/dpo": 1.327339768409729, + "losses/sft": 2.4795992374420166, + "losses/total": 1.327339768409729, + "ref_logps/chosen": -31.19516944885254, + "ref_logps/rejected": -35.810768127441406, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6017969846725464, + "rewards/margins": 0.48221641778945923, + "rewards/rejected": -2.0840134620666504, + "step": 1491 + }, + { + "epoch": 1.41, + "grad_norm": 28.814014434814453, + "learning_rate": 2.9468345575376004e-07, + "logps/chosen": -56.77680969238281, + "logps/rejected": -82.70301055908203, + "loss": 0.4759, + "losses/dpo": 0.3210974335670471, + "losses/sft": 2.3065695762634277, + "losses/total": 0.3210974335670471, + "ref_logps/chosen": -39.640220642089844, + "ref_logps/rejected": -55.055564880371094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7136590480804443, + "rewards/margins": 1.0510858297348022, + "rewards/rejected": -2.764744758605957, + "step": 1492 + }, + { + "epoch": 1.41, + "grad_norm": 16.515443801879883, + "learning_rate": 2.9450856942987055e-07, + "logps/chosen": -52.44084167480469, + "logps/rejected": -69.65238952636719, + "loss": 0.3448, + "losses/dpo": 0.5896556377410889, + "losses/sft": 2.2405614852905273, + "losses/total": 0.5896556377410889, + "ref_logps/chosen": -39.882965087890625, + "ref_logps/rejected": -43.62897872924805, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2557874917984009, + "rewards/margins": 1.3465540409088135, + "rewards/rejected": -2.602341651916504, + "step": 1493 + }, + { + "epoch": 1.41, + "grad_norm": 21.969770431518555, + "learning_rate": 2.943336831059811e-07, + "logps/chosen": -46.63779830932617, + "logps/rejected": -69.41429138183594, + "loss": 0.3957, + "losses/dpo": 0.4231574237346649, + "losses/sft": 1.2778698205947876, + "losses/total": 0.4231574237346649, + "ref_logps/chosen": -32.38006591796875, + "ref_logps/rejected": -43.47391891479492, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4257733821868896, + "rewards/margins": 1.1682645082473755, + "rewards/rejected": -2.5940377712249756, + "step": 1494 + }, + { + "epoch": 1.41, + "grad_norm": 20.34341049194336, + "learning_rate": 2.941587967820916e-07, + "logps/chosen": -46.3596076965332, + "logps/rejected": -70.40168762207031, + "loss": 0.3418, + "losses/dpo": 0.40635621547698975, + "losses/sft": 1.805832862854004, + "losses/total": 0.40635621547698975, + "ref_logps/chosen": -32.84781265258789, + "ref_logps/rejected": -43.06932830810547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3511793613433838, + "rewards/margins": 1.3820571899414062, + "rewards/rejected": -2.73323655128479, + "step": 1495 + }, + { + "epoch": 1.41, + "grad_norm": 12.552985191345215, + "learning_rate": 2.9398391045820214e-07, + "logps/chosen": -47.052276611328125, + "logps/rejected": -82.99396514892578, + "loss": 0.1853, + "losses/dpo": 0.31122273206710815, + "losses/sft": 1.8056796789169312, + "losses/total": 0.31122273206710815, + "ref_logps/chosen": -36.09916305541992, + "ref_logps/rejected": -52.081565856933594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0953112840652466, + "rewards/margins": 1.9959288835525513, + "rewards/rejected": -3.0912399291992188, + "step": 1496 + }, + { + "epoch": 1.41, + "grad_norm": 19.483808517456055, + "learning_rate": 2.938090241343127e-07, + "logps/chosen": -39.39792251586914, + "logps/rejected": -74.99205780029297, + "loss": 0.3764, + "losses/dpo": 0.1508772224187851, + "losses/sft": 1.6205919981002808, + "losses/total": 0.1508772224187851, + "ref_logps/chosen": -28.510116577148438, + "ref_logps/rejected": -46.25724411010742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0887806415557861, + "rewards/margins": 1.7847009897232056, + "rewards/rejected": -2.8734817504882812, + "step": 1497 + }, + { + "epoch": 1.41, + "grad_norm": 23.783533096313477, + "learning_rate": 2.936341378104232e-07, + "logps/chosen": -49.414306640625, + "logps/rejected": -64.71704864501953, + "loss": 0.4158, + "losses/dpo": 0.6990461945533752, + "losses/sft": 1.6989450454711914, + "losses/total": 0.6990461945533752, + "ref_logps/chosen": -36.04567337036133, + "ref_logps/rejected": -39.98622131347656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3368635177612305, + "rewards/margins": 1.1362197399139404, + "rewards/rejected": -2.473083257675171, + "step": 1498 + }, + { + "epoch": 1.42, + "grad_norm": 22.386396408081055, + "learning_rate": 2.9345925148653373e-07, + "logps/chosen": -51.40376281738281, + "logps/rejected": -64.31002807617188, + "loss": 0.4081, + "losses/dpo": 0.33907002210617065, + "losses/sft": 1.927138090133667, + "losses/total": 0.33907002210617065, + "ref_logps/chosen": -40.528560638427734, + "ref_logps/rejected": -41.02455139160156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0875203609466553, + "rewards/margins": 1.2410271167755127, + "rewards/rejected": -2.328547477722168, + "step": 1499 + }, + { + "epoch": 1.42, + "grad_norm": 26.581707000732422, + "learning_rate": 2.9328436516264424e-07, + "logps/chosen": -49.87763595581055, + "logps/rejected": -64.03345489501953, + "loss": 0.4726, + "losses/dpo": 0.3879508376121521, + "losses/sft": 1.7184689044952393, + "losses/total": 0.3879508376121521, + "ref_logps/chosen": -37.52323913574219, + "ref_logps/rejected": -39.740928649902344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2354395389556885, + "rewards/margins": 1.1938132047653198, + "rewards/rejected": -2.4292526245117188, + "step": 1500 + }, + { + "epoch": 1.42, + "grad_norm": 24.524402618408203, + "learning_rate": 2.931094788387548e-07, + "logps/chosen": -54.667537689208984, + "logps/rejected": -64.08084106445312, + "loss": 0.4537, + "losses/dpo": 0.1589667797088623, + "losses/sft": 1.773787260055542, + "losses/total": 0.1589667797088623, + "ref_logps/chosen": -38.175533294677734, + "ref_logps/rejected": -38.80839920043945, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.649200439453125, + "rewards/margins": 0.8780432343482971, + "rewards/rejected": -2.5272438526153564, + "step": 1501 + }, + { + "epoch": 1.42, + "grad_norm": 15.457566261291504, + "learning_rate": 2.929345925148653e-07, + "logps/chosen": -52.253299713134766, + "logps/rejected": -71.617919921875, + "loss": 0.2313, + "losses/dpo": 0.20826825499534607, + "losses/sft": 1.8494374752044678, + "losses/total": 0.20826825499534607, + "ref_logps/chosen": -38.869773864746094, + "ref_logps/rejected": -41.653194427490234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3383526802062988, + "rewards/margins": 1.6581199169158936, + "rewards/rejected": -2.9964723587036133, + "step": 1502 + }, + { + "epoch": 1.42, + "grad_norm": 20.69858169555664, + "learning_rate": 2.9275970619097583e-07, + "logps/chosen": -44.85877990722656, + "logps/rejected": -70.6641845703125, + "loss": 0.3753, + "losses/dpo": 0.20970723032951355, + "losses/sft": 1.2575109004974365, + "losses/total": 0.20970723032951355, + "ref_logps/chosen": -33.47350311279297, + "ref_logps/rejected": -47.147457122802734, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1385278701782227, + "rewards/margins": 1.2131446599960327, + "rewards/rejected": -2.351672649383545, + "step": 1503 + }, + { + "epoch": 1.42, + "grad_norm": 19.670150756835938, + "learning_rate": 2.925848198670864e-07, + "logps/chosen": -47.65375518798828, + "logps/rejected": -76.619140625, + "loss": 0.373, + "losses/dpo": 0.4358763098716736, + "losses/sft": 1.4477770328521729, + "losses/total": 0.4358763098716736, + "ref_logps/chosen": -36.318790435791016, + "ref_logps/rejected": -51.11369323730469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1334965229034424, + "rewards/margins": 1.4170477390289307, + "rewards/rejected": -2.550544261932373, + "step": 1504 + }, + { + "epoch": 1.42, + "grad_norm": 37.48612594604492, + "learning_rate": 2.924099335431969e-07, + "logps/chosen": -53.99786376953125, + "logps/rejected": -70.99600219726562, + "loss": 0.6758, + "losses/dpo": 0.4331752359867096, + "losses/sft": 2.1719987392425537, + "losses/total": 0.4331752359867096, + "ref_logps/chosen": -36.0965576171875, + "ref_logps/rejected": -47.481536865234375, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.790130853652954, + "rewards/margins": 0.5613151788711548, + "rewards/rejected": -2.3514459133148193, + "step": 1505 + }, + { + "epoch": 1.42, + "grad_norm": 30.493154525756836, + "learning_rate": 2.922350472193074e-07, + "logps/chosen": -57.61941146850586, + "logps/rejected": -71.89325714111328, + "loss": 0.672, + "losses/dpo": 0.6709376573562622, + "losses/sft": 2.321643829345703, + "losses/total": 0.6709376573562622, + "ref_logps/chosen": -36.45568084716797, + "ref_logps/rejected": -44.72772979736328, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.116373062133789, + "rewards/margins": 0.6001795530319214, + "rewards/rejected": -2.716552495956421, + "step": 1506 + }, + { + "epoch": 1.42, + "grad_norm": 34.07032012939453, + "learning_rate": 2.9206016089541794e-07, + "logps/chosen": -45.518280029296875, + "logps/rejected": -56.7679443359375, + "loss": 0.613, + "losses/dpo": 0.5298448801040649, + "losses/sft": 1.6550202369689941, + "losses/total": 0.5298448801040649, + "ref_logps/chosen": -33.158653259277344, + "ref_logps/rejected": -39.70037841796875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2359625101089478, + "rewards/margins": 0.4707942008972168, + "rewards/rejected": -1.706756830215454, + "step": 1507 + }, + { + "epoch": 1.42, + "grad_norm": 23.747217178344727, + "learning_rate": 2.918852745715285e-07, + "logps/chosen": -51.798919677734375, + "logps/rejected": -66.12730407714844, + "loss": 0.4607, + "losses/dpo": 0.5760438442230225, + "losses/sft": 1.8068910837173462, + "losses/total": 0.5760438442230225, + "ref_logps/chosen": -40.55227279663086, + "ref_logps/rejected": -48.49240493774414, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.124664545059204, + "rewards/margins": 0.6388249397277832, + "rewards/rejected": -1.7634894847869873, + "step": 1508 + }, + { + "epoch": 1.42, + "grad_norm": 15.932150840759277, + "learning_rate": 2.91710388247639e-07, + "logps/chosen": -34.748321533203125, + "logps/rejected": -58.75572204589844, + "loss": 0.2881, + "losses/dpo": 0.24681426584720612, + "losses/sft": 1.252745270729065, + "losses/total": 0.24681426584720612, + "ref_logps/chosen": -24.019683837890625, + "ref_logps/rejected": -33.04027557373047, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0728639364242554, + "rewards/margins": 1.4986803531646729, + "rewards/rejected": -2.5715441703796387, + "step": 1509 + }, + { + "epoch": 1.43, + "grad_norm": 23.119163513183594, + "learning_rate": 2.9153550192374953e-07, + "logps/chosen": -45.46012878417969, + "logps/rejected": -58.938079833984375, + "loss": 0.5108, + "losses/dpo": 0.9658684730529785, + "losses/sft": 2.4378509521484375, + "losses/total": 0.9658684730529785, + "ref_logps/chosen": -32.790374755859375, + "ref_logps/rejected": -36.19810485839844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2669752836227417, + "rewards/margins": 1.007022500038147, + "rewards/rejected": -2.2739977836608887, + "step": 1510 + }, + { + "epoch": 1.43, + "grad_norm": 19.32199478149414, + "learning_rate": 2.913606155998601e-07, + "logps/chosen": -63.526588439941406, + "logps/rejected": -73.48130798339844, + "loss": 0.3019, + "losses/dpo": 0.3907686173915863, + "losses/sft": 1.6382976770401, + "losses/total": 0.3907686173915863, + "ref_logps/chosen": -49.78626251220703, + "ref_logps/rejected": -44.628684997558594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3740321397781372, + "rewards/margins": 1.51123046875, + "rewards/rejected": -2.8852624893188477, + "step": 1511 + }, + { + "epoch": 1.43, + "grad_norm": 19.381914138793945, + "learning_rate": 2.911857292759706e-07, + "logps/chosen": -63.94828414916992, + "logps/rejected": -77.06166076660156, + "loss": 0.4115, + "losses/dpo": 0.4323621392250061, + "losses/sft": 2.058326482772827, + "losses/total": 0.4323621392250061, + "ref_logps/chosen": -46.979366302490234, + "ref_logps/rejected": -49.90327072143555, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6968919038772583, + "rewards/margins": 1.0189476013183594, + "rewards/rejected": -2.715839385986328, + "step": 1512 + }, + { + "epoch": 1.43, + "grad_norm": 24.129758834838867, + "learning_rate": 2.910108429520811e-07, + "logps/chosen": -45.2930908203125, + "logps/rejected": -66.28303527832031, + "loss": 0.5376, + "losses/dpo": 0.5452848076820374, + "losses/sft": 1.6820714473724365, + "losses/total": 0.5452848076820374, + "ref_logps/chosen": -31.70247459411621, + "ref_logps/rejected": -43.183048248291016, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3590612411499023, + "rewards/margins": 0.9509379863739014, + "rewards/rejected": -2.309999465942383, + "step": 1513 + }, + { + "epoch": 1.43, + "grad_norm": 22.892858505249023, + "learning_rate": 2.9083595662819163e-07, + "logps/chosen": -52.986759185791016, + "logps/rejected": -59.184425354003906, + "loss": 0.3972, + "losses/dpo": 0.5723955631256104, + "losses/sft": 2.107868194580078, + "losses/total": 0.5723955631256104, + "ref_logps/chosen": -40.682411193847656, + "ref_logps/rejected": -35.80892562866211, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2304348945617676, + "rewards/margins": 1.1071151494979858, + "rewards/rejected": -2.337550163269043, + "step": 1514 + }, + { + "epoch": 1.43, + "grad_norm": 19.007612228393555, + "learning_rate": 2.906610703043022e-07, + "logps/chosen": -48.39806365966797, + "logps/rejected": -75.96879577636719, + "loss": 0.3577, + "losses/dpo": 0.5371244549751282, + "losses/sft": 1.5192457437515259, + "losses/total": 0.5371244549751282, + "ref_logps/chosen": -34.886680603027344, + "ref_logps/rejected": -47.24713134765625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3511383533477783, + "rewards/margins": 1.5210281610488892, + "rewards/rejected": -2.872166633605957, + "step": 1515 + }, + { + "epoch": 1.43, + "grad_norm": 25.711668014526367, + "learning_rate": 2.904861839804127e-07, + "logps/chosen": -51.43296813964844, + "logps/rejected": -85.52340698242188, + "loss": 0.4498, + "losses/dpo": 0.3060365617275238, + "losses/sft": 1.6344822645187378, + "losses/total": 0.3060365617275238, + "ref_logps/chosen": -37.094058990478516, + "ref_logps/rejected": -60.42805480957031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4338903427124023, + "rewards/margins": 1.0756444931030273, + "rewards/rejected": -2.5095348358154297, + "step": 1516 + }, + { + "epoch": 1.43, + "grad_norm": 25.63076400756836, + "learning_rate": 2.903112976565232e-07, + "logps/chosen": -48.82794189453125, + "logps/rejected": -67.44844818115234, + "loss": 0.4389, + "losses/dpo": 0.5563966631889343, + "losses/sft": 2.7458395957946777, + "losses/total": 0.5563966631889343, + "ref_logps/chosen": -33.88334274291992, + "ref_logps/rejected": -41.201087951660156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.494459867477417, + "rewards/margins": 1.1302764415740967, + "rewards/rejected": -2.6247363090515137, + "step": 1517 + }, + { + "epoch": 1.43, + "grad_norm": 20.57042121887207, + "learning_rate": 2.901364113326338e-07, + "logps/chosen": -47.47187423706055, + "logps/rejected": -82.17427825927734, + "loss": 0.352, + "losses/dpo": 0.3270842730998993, + "losses/sft": 1.8293434381484985, + "losses/total": 0.3270842730998993, + "ref_logps/chosen": -31.914215087890625, + "ref_logps/rejected": -49.63578796386719, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.555765986442566, + "rewards/margins": 1.6980829238891602, + "rewards/rejected": -3.2538490295410156, + "step": 1518 + }, + { + "epoch": 1.43, + "grad_norm": 23.56537628173828, + "learning_rate": 2.899615250087443e-07, + "logps/chosen": -61.43927001953125, + "logps/rejected": -64.43576049804688, + "loss": 0.4044, + "losses/dpo": 0.22194579243659973, + "losses/sft": 2.3176565170288086, + "losses/total": 0.22194579243659973, + "ref_logps/chosen": -44.38973617553711, + "ref_logps/rejected": -35.563785552978516, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7049531936645508, + "rewards/margins": 1.1822437047958374, + "rewards/rejected": -2.8871970176696777, + "step": 1519 + }, + { + "epoch": 1.44, + "grad_norm": 25.64571762084961, + "learning_rate": 2.897866386848548e-07, + "logps/chosen": -51.46670913696289, + "logps/rejected": -60.356689453125, + "loss": 0.503, + "losses/dpo": 0.4018670618534088, + "losses/sft": 2.062720775604248, + "losses/total": 0.4018670618534088, + "ref_logps/chosen": -36.0409049987793, + "ref_logps/rejected": -34.509521484375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5425803661346436, + "rewards/margins": 1.0421358346939087, + "rewards/rejected": -2.584716320037842, + "step": 1520 + }, + { + "epoch": 1.44, + "grad_norm": 32.24479293823242, + "learning_rate": 2.896117523609654e-07, + "logps/chosen": -54.112548828125, + "logps/rejected": -66.57638549804688, + "loss": 0.5873, + "losses/dpo": 0.8029208183288574, + "losses/sft": 2.2057156562805176, + "losses/total": 0.8029208183288574, + "ref_logps/chosen": -35.82487869262695, + "ref_logps/rejected": -41.952247619628906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8287665843963623, + "rewards/margins": 0.6336476802825928, + "rewards/rejected": -2.462414264678955, + "step": 1521 + }, + { + "epoch": 1.44, + "grad_norm": 23.201295852661133, + "learning_rate": 2.894368660370759e-07, + "logps/chosen": -51.750732421875, + "logps/rejected": -64.2300796508789, + "loss": 0.4232, + "losses/dpo": 0.38377732038497925, + "losses/sft": 1.3923503160476685, + "losses/total": 0.38377732038497925, + "ref_logps/chosen": -39.22446823120117, + "ref_logps/rejected": -42.31391143798828, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2526264190673828, + "rewards/margins": 0.9389901161193848, + "rewards/rejected": -2.1916165351867676, + "step": 1522 + }, + { + "epoch": 1.44, + "grad_norm": 22.923320770263672, + "learning_rate": 2.892619797131864e-07, + "logps/chosen": -56.127052307128906, + "logps/rejected": -66.88697052001953, + "loss": 0.4187, + "losses/dpo": 0.3056759536266327, + "losses/sft": 2.107492446899414, + "losses/total": 0.3056759536266327, + "ref_logps/chosen": -42.2886848449707, + "ref_logps/rejected": -41.96747589111328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3838367462158203, + "rewards/margins": 1.108112096786499, + "rewards/rejected": -2.4919488430023193, + "step": 1523 + }, + { + "epoch": 1.44, + "grad_norm": 31.246360778808594, + "learning_rate": 2.890870933892969e-07, + "logps/chosen": -59.89424514770508, + "logps/rejected": -86.02073669433594, + "loss": 0.4256, + "losses/dpo": 0.5294551253318787, + "losses/sft": 1.980635166168213, + "losses/total": 0.5294551253318787, + "ref_logps/chosen": -40.18248748779297, + "ref_logps/rejected": -49.21497344970703, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9711757898330688, + "rewards/margins": 1.7094006538391113, + "rewards/rejected": -3.6805765628814697, + "step": 1524 + }, + { + "epoch": 1.44, + "grad_norm": 16.346588134765625, + "learning_rate": 2.889122070654075e-07, + "logps/chosen": -61.171051025390625, + "logps/rejected": -80.6778564453125, + "loss": 0.2355, + "losses/dpo": 0.2049711048603058, + "losses/sft": 2.1592917442321777, + "losses/total": 0.2049711048603058, + "ref_logps/chosen": -41.30645751953125, + "ref_logps/rejected": -43.77112579345703, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.986459732055664, + "rewards/margins": 1.7042136192321777, + "rewards/rejected": -3.690673351287842, + "step": 1525 + }, + { + "epoch": 1.44, + "grad_norm": 19.330259323120117, + "learning_rate": 2.88737320741518e-07, + "logps/chosen": -47.257225036621094, + "logps/rejected": -61.823699951171875, + "loss": 0.3129, + "losses/dpo": 0.1936805546283722, + "losses/sft": 1.9064816236495972, + "losses/total": 0.1936805546283722, + "ref_logps/chosen": -37.022098541259766, + "ref_logps/rejected": -34.80775451660156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0235130786895752, + "rewards/margins": 1.6780810356140137, + "rewards/rejected": -2.701594114303589, + "step": 1526 + }, + { + "epoch": 1.44, + "grad_norm": 33.08138656616211, + "learning_rate": 2.885624344176285e-07, + "logps/chosen": -54.41649627685547, + "logps/rejected": -55.74816131591797, + "loss": 0.6738, + "losses/dpo": 0.8506128191947937, + "losses/sft": 1.9796555042266846, + "losses/total": 0.8506128191947937, + "ref_logps/chosen": -36.925262451171875, + "ref_logps/rejected": -31.216808319091797, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.749123454093933, + "rewards/margins": 0.7040117979049683, + "rewards/rejected": -2.4531352519989014, + "step": 1527 + }, + { + "epoch": 1.44, + "grad_norm": 32.28748321533203, + "learning_rate": 2.883875480937391e-07, + "logps/chosen": -65.16987609863281, + "logps/rejected": -69.23661804199219, + "loss": 0.5265, + "losses/dpo": 0.5600053668022156, + "losses/sft": 1.978285312652588, + "losses/total": 0.5600053668022156, + "ref_logps/chosen": -46.6458854675293, + "ref_logps/rejected": -45.33740234375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8523993492126465, + "rewards/margins": 0.537522554397583, + "rewards/rejected": -2.3899219036102295, + "step": 1528 + }, + { + "epoch": 1.44, + "grad_norm": 22.397335052490234, + "learning_rate": 2.882126617698496e-07, + "logps/chosen": -43.86811065673828, + "logps/rejected": -61.027381896972656, + "loss": 0.4417, + "losses/dpo": 0.3098493218421936, + "losses/sft": 1.9116106033325195, + "losses/total": 0.3098493218421936, + "ref_logps/chosen": -30.395709991455078, + "ref_logps/rejected": -37.6104736328125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3472402095794678, + "rewards/margins": 0.9944506883621216, + "rewards/rejected": -2.3416907787323, + "step": 1529 + }, + { + "epoch": 1.44, + "grad_norm": 19.612794876098633, + "learning_rate": 2.8803777544596015e-07, + "logps/chosen": -57.82060241699219, + "logps/rejected": -75.35323333740234, + "loss": 0.3016, + "losses/dpo": 0.3849601447582245, + "losses/sft": 2.273878812789917, + "losses/total": 0.3849601447582245, + "ref_logps/chosen": -43.81038284301758, + "ref_logps/rejected": -45.89567184448242, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4010215997695923, + "rewards/margins": 1.5447348356246948, + "rewards/rejected": -2.945756196975708, + "step": 1530 + }, + { + "epoch": 1.45, + "grad_norm": 29.8697509765625, + "learning_rate": 2.878628891220706e-07, + "logps/chosen": -61.79139709472656, + "logps/rejected": -71.67982482910156, + "loss": 0.4182, + "losses/dpo": 0.6001218557357788, + "losses/sft": 2.438936948776245, + "losses/total": 0.6001218557357788, + "ref_logps/chosen": -44.27633285522461, + "ref_logps/rejected": -39.547454833984375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7515063285827637, + "rewards/margins": 1.4617310762405396, + "rewards/rejected": -3.213237762451172, + "step": 1531 + }, + { + "epoch": 1.45, + "grad_norm": 19.21063232421875, + "learning_rate": 2.876880027981812e-07, + "logps/chosen": -58.567779541015625, + "logps/rejected": -81.89295196533203, + "loss": 0.3401, + "losses/dpo": 0.2961357831954956, + "losses/sft": 2.013099193572998, + "losses/total": 0.2961357831954956, + "ref_logps/chosen": -41.91876983642578, + "ref_logps/rejected": -51.12880325317383, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.664900779724121, + "rewards/margins": 1.4115142822265625, + "rewards/rejected": -3.0764150619506836, + "step": 1532 + }, + { + "epoch": 1.45, + "grad_norm": 22.92458152770996, + "learning_rate": 2.875131164742917e-07, + "logps/chosen": -58.74645233154297, + "logps/rejected": -77.60418701171875, + "loss": 0.3205, + "losses/dpo": 0.7321463227272034, + "losses/sft": 1.7700061798095703, + "losses/total": 0.7321463227272034, + "ref_logps/chosen": -44.00907897949219, + "ref_logps/rejected": -48.02771759033203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4737370014190674, + "rewards/margins": 1.483910322189331, + "rewards/rejected": -2.9576473236083984, + "step": 1533 + }, + { + "epoch": 1.45, + "grad_norm": 26.128801345825195, + "learning_rate": 2.873382301504022e-07, + "logps/chosen": -50.60676956176758, + "logps/rejected": -59.92283630371094, + "loss": 0.4765, + "losses/dpo": 0.5237456560134888, + "losses/sft": 1.5703126192092896, + "losses/total": 0.5237456560134888, + "ref_logps/chosen": -38.51496124267578, + "ref_logps/rejected": -38.8935661315918, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2091807126998901, + "rewards/margins": 0.8937462568283081, + "rewards/rejected": -2.1029269695281982, + "step": 1534 + }, + { + "epoch": 1.45, + "grad_norm": 23.607364654541016, + "learning_rate": 2.8716334382651277e-07, + "logps/chosen": -56.33033752441406, + "logps/rejected": -75.4780044555664, + "loss": 0.4796, + "losses/dpo": 0.688484787940979, + "losses/sft": 2.1886415481567383, + "losses/total": 0.688484787940979, + "ref_logps/chosen": -42.12576675415039, + "ref_logps/rejected": -53.68501663208008, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.420457124710083, + "rewards/margins": 0.7588416337966919, + "rewards/rejected": -2.1792988777160645, + "step": 1535 + }, + { + "epoch": 1.45, + "grad_norm": 20.63005256652832, + "learning_rate": 2.869884575026233e-07, + "logps/chosen": -55.593955993652344, + "logps/rejected": -73.67984771728516, + "loss": 0.3587, + "losses/dpo": 0.12686285376548767, + "losses/sft": 2.268369674682617, + "losses/total": 0.12686285376548767, + "ref_logps/chosen": -40.68044662475586, + "ref_logps/rejected": -45.844207763671875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4913510084152222, + "rewards/margins": 1.2922132015228271, + "rewards/rejected": -2.7835640907287598, + "step": 1536 + }, + { + "epoch": 1.45, + "grad_norm": 22.66666603088379, + "learning_rate": 2.8681357117873385e-07, + "logps/chosen": -52.40771484375, + "logps/rejected": -66.45508575439453, + "loss": 0.4372, + "losses/dpo": 0.35038965940475464, + "losses/sft": 1.7115205526351929, + "losses/total": 0.35038965940475464, + "ref_logps/chosen": -38.07029342651367, + "ref_logps/rejected": -40.9581184387207, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4337424039840698, + "rewards/margins": 1.1159547567367554, + "rewards/rejected": -2.549697160720825, + "step": 1537 + }, + { + "epoch": 1.45, + "grad_norm": 18.346418380737305, + "learning_rate": 2.866386848548443e-07, + "logps/chosen": -66.79084777832031, + "logps/rejected": -82.09329223632812, + "loss": 0.2885, + "losses/dpo": 0.2678743898868561, + "losses/sft": 1.9576998949050903, + "losses/total": 0.2678743898868561, + "ref_logps/chosen": -51.47205352783203, + "ref_logps/rejected": -52.53905487060547, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5318797826766968, + "rewards/margins": 1.423543930053711, + "rewards/rejected": -2.955423593521118, + "step": 1538 + }, + { + "epoch": 1.45, + "grad_norm": 22.38614845275879, + "learning_rate": 2.8646379853095487e-07, + "logps/chosen": -52.541385650634766, + "logps/rejected": -68.58695983886719, + "loss": 0.3613, + "losses/dpo": 0.3060445189476013, + "losses/sft": 1.8688775300979614, + "losses/total": 0.3060445189476013, + "ref_logps/chosen": -38.203617095947266, + "ref_logps/rejected": -40.814598083496094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4337767362594604, + "rewards/margins": 1.3434598445892334, + "rewards/rejected": -2.7772367000579834, + "step": 1539 + }, + { + "epoch": 1.45, + "grad_norm": 21.95256233215332, + "learning_rate": 2.8628891220706544e-07, + "logps/chosen": -52.6802978515625, + "logps/rejected": -76.41194152832031, + "loss": 0.3356, + "losses/dpo": 0.4440712332725525, + "losses/sft": 2.2653234004974365, + "losses/total": 0.4440712332725525, + "ref_logps/chosen": -36.10881805419922, + "ref_logps/rejected": -43.95797348022461, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6571482419967651, + "rewards/margins": 1.5882489681243896, + "rewards/rejected": -3.2453970909118652, + "step": 1540 + }, + { + "epoch": 1.46, + "grad_norm": 20.52392578125, + "learning_rate": 2.861140258831759e-07, + "logps/chosen": -49.52919387817383, + "logps/rejected": -58.325340270996094, + "loss": 0.4156, + "losses/dpo": 0.4757683575153351, + "losses/sft": 1.3999855518341064, + "losses/total": 0.4757683575153351, + "ref_logps/chosen": -38.35038757324219, + "ref_logps/rejected": -38.191673278808594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1178803443908691, + "rewards/margins": 0.8954866528511047, + "rewards/rejected": -2.013367176055908, + "step": 1541 + }, + { + "epoch": 1.46, + "grad_norm": 24.823984146118164, + "learning_rate": 2.8593913955928646e-07, + "logps/chosen": -59.465599060058594, + "logps/rejected": -78.2006607055664, + "loss": 0.4462, + "losses/dpo": 0.23168253898620605, + "losses/sft": 1.6652085781097412, + "losses/total": 0.23168253898620605, + "ref_logps/chosen": -41.725364685058594, + "ref_logps/rejected": -49.47913360595703, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7740230560302734, + "rewards/margins": 1.0981297492980957, + "rewards/rejected": -2.872152805328369, + "step": 1542 + }, + { + "epoch": 1.46, + "grad_norm": 28.500024795532227, + "learning_rate": 2.85764253235397e-07, + "logps/chosen": -49.46937942504883, + "logps/rejected": -76.01848602294922, + "loss": 0.5339, + "losses/dpo": 1.129459261894226, + "losses/sft": 2.185872793197632, + "losses/total": 1.129459261894226, + "ref_logps/chosen": -35.974552154541016, + "ref_logps/rejected": -50.56367492675781, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3494828939437866, + "rewards/margins": 1.1959989070892334, + "rewards/rejected": -2.5454816818237305, + "step": 1543 + }, + { + "epoch": 1.46, + "grad_norm": 27.28022003173828, + "learning_rate": 2.8558936691150754e-07, + "logps/chosen": -58.35121536254883, + "logps/rejected": -73.19402313232422, + "loss": 0.4723, + "losses/dpo": 0.4089778661727905, + "losses/sft": 1.9968239068984985, + "losses/total": 0.4089778661727905, + "ref_logps/chosen": -43.517208099365234, + "ref_logps/rejected": -48.74143981933594, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4834007024765015, + "rewards/margins": 0.9618576169013977, + "rewards/rejected": -2.445258140563965, + "step": 1544 + }, + { + "epoch": 1.46, + "grad_norm": 30.026042938232422, + "learning_rate": 2.85414480587618e-07, + "logps/chosen": -58.37890625, + "logps/rejected": -76.18202209472656, + "loss": 0.4926, + "losses/dpo": 0.6482977867126465, + "losses/sft": 2.542445659637451, + "losses/total": 0.6482977867126465, + "ref_logps/chosen": -38.98811340332031, + "ref_logps/rejected": -45.21678924560547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.9390795230865479, + "rewards/margins": 1.1574437618255615, + "rewards/rejected": -3.0965235233306885, + "step": 1545 + }, + { + "epoch": 1.46, + "grad_norm": 16.482799530029297, + "learning_rate": 2.8523959426372857e-07, + "logps/chosen": -51.419639587402344, + "logps/rejected": -72.48983001708984, + "loss": 0.2616, + "losses/dpo": 0.17937470972537994, + "losses/sft": 1.642836093902588, + "losses/total": 0.17937470972537994, + "ref_logps/chosen": -38.369903564453125, + "ref_logps/rejected": -41.711090087890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3049732446670532, + "rewards/margins": 1.7729005813598633, + "rewards/rejected": -3.077873706817627, + "step": 1546 + }, + { + "epoch": 1.46, + "grad_norm": 25.0074405670166, + "learning_rate": 2.8506470793983913e-07, + "logps/chosen": -58.674293518066406, + "logps/rejected": -80.90597534179688, + "loss": 0.4267, + "losses/dpo": 0.7177131175994873, + "losses/sft": 2.143411636352539, + "losses/total": 0.7177131175994873, + "ref_logps/chosen": -43.082618713378906, + "ref_logps/rejected": -51.89866256713867, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5591676235198975, + "rewards/margins": 1.3415639400482178, + "rewards/rejected": -2.9007315635681152, + "step": 1547 + }, + { + "epoch": 1.46, + "grad_norm": 17.16482162475586, + "learning_rate": 2.848898216159496e-07, + "logps/chosen": -52.59001922607422, + "logps/rejected": -62.16965866088867, + "loss": 0.3492, + "losses/dpo": 0.23446545004844666, + "losses/sft": 1.5199135541915894, + "losses/total": 0.23446545004844666, + "ref_logps/chosen": -38.15099334716797, + "ref_logps/rejected": -34.785945892333984, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4439024925231934, + "rewards/margins": 1.294468879699707, + "rewards/rejected": -2.7383713722229004, + "step": 1548 + }, + { + "epoch": 1.46, + "grad_norm": 27.55540657043457, + "learning_rate": 2.8471493529206016e-07, + "logps/chosen": -49.91382598876953, + "logps/rejected": -70.44082641601562, + "loss": 0.5861, + "losses/dpo": 0.7367491722106934, + "losses/sft": 1.6084750890731812, + "losses/total": 0.7367491722106934, + "ref_logps/chosen": -32.57075500488281, + "ref_logps/rejected": -45.83686828613281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.734306812286377, + "rewards/margins": 0.7260898351669312, + "rewards/rejected": -2.4603967666625977, + "step": 1549 + }, + { + "epoch": 1.46, + "grad_norm": 26.385900497436523, + "learning_rate": 2.8454004896817067e-07, + "logps/chosen": -50.995521545410156, + "logps/rejected": -68.32173156738281, + "loss": 0.4243, + "losses/dpo": 0.31904733180999756, + "losses/sft": 1.5386825799942017, + "losses/total": 0.31904733180999756, + "ref_logps/chosen": -37.816734313964844, + "ref_logps/rejected": -41.5258674621582, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3178787231445312, + "rewards/margins": 1.3617067337036133, + "rewards/rejected": -2.6795854568481445, + "step": 1550 + }, + { + "epoch": 1.46, + "grad_norm": 27.311309814453125, + "learning_rate": 2.8436516264428124e-07, + "logps/chosen": -52.601322174072266, + "logps/rejected": -76.10559844970703, + "loss": 0.5335, + "losses/dpo": 0.22984179854393005, + "losses/sft": 2.0700111389160156, + "losses/total": 0.22984179854393005, + "ref_logps/chosen": -39.199684143066406, + "ref_logps/rejected": -51.422935485839844, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3401637077331543, + "rewards/margins": 1.1281023025512695, + "rewards/rejected": -2.468266010284424, + "step": 1551 + }, + { + "epoch": 1.47, + "grad_norm": 23.3348388671875, + "learning_rate": 2.841902763203917e-07, + "logps/chosen": -57.84280776977539, + "logps/rejected": -86.03680419921875, + "loss": 0.3552, + "losses/dpo": 0.6518940925598145, + "losses/sft": 1.9520008563995361, + "losses/total": 0.6518940925598145, + "ref_logps/chosen": -40.45856857299805, + "ref_logps/rejected": -55.459381103515625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7384238243103027, + "rewards/margins": 1.3193187713623047, + "rewards/rejected": -3.0577425956726074, + "step": 1552 + }, + { + "epoch": 1.47, + "grad_norm": 17.599271774291992, + "learning_rate": 2.8401538999650226e-07, + "logps/chosen": -50.42523956298828, + "logps/rejected": -85.38443756103516, + "loss": 0.2557, + "losses/dpo": 0.20078754425048828, + "losses/sft": 1.3675918579101562, + "losses/total": 0.20078754425048828, + "ref_logps/chosen": -37.47771453857422, + "ref_logps/rejected": -56.087860107421875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2947523593902588, + "rewards/margins": 1.6349050998687744, + "rewards/rejected": -2.929657459259033, + "step": 1553 + }, + { + "epoch": 1.47, + "grad_norm": 24.149715423583984, + "learning_rate": 2.8384050367261283e-07, + "logps/chosen": -54.10523223876953, + "logps/rejected": -60.35800552368164, + "loss": 0.4512, + "losses/dpo": 0.21545279026031494, + "losses/sft": 2.0858535766601562, + "losses/total": 0.21545279026031494, + "ref_logps/chosen": -36.687259674072266, + "ref_logps/rejected": -34.079776763916016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7417970895767212, + "rewards/margins": 0.8860259056091309, + "rewards/rejected": -2.6278228759765625, + "step": 1554 + }, + { + "epoch": 1.47, + "grad_norm": 19.41289520263672, + "learning_rate": 2.836656173487233e-07, + "logps/chosen": -34.860260009765625, + "logps/rejected": -50.743934631347656, + "loss": 0.46, + "losses/dpo": 0.36985400319099426, + "losses/sft": 1.7566661834716797, + "losses/total": 0.36985400319099426, + "ref_logps/chosen": -26.2508487701416, + "ref_logps/rejected": -34.557228088378906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8609414100646973, + "rewards/margins": 0.7577290534973145, + "rewards/rejected": -1.6186704635620117, + "step": 1555 + }, + { + "epoch": 1.47, + "grad_norm": 23.737871170043945, + "learning_rate": 2.8349073102483385e-07, + "logps/chosen": -49.39778518676758, + "logps/rejected": -60.15223693847656, + "loss": 0.4537, + "losses/dpo": 0.5179659128189087, + "losses/sft": 1.335763692855835, + "losses/total": 0.5179659128189087, + "ref_logps/chosen": -36.8922119140625, + "ref_logps/rejected": -34.520545959472656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2505571842193604, + "rewards/margins": 1.3126121759414673, + "rewards/rejected": -2.563169479370117, + "step": 1556 + }, + { + "epoch": 1.47, + "grad_norm": 14.922785758972168, + "learning_rate": 2.8331584470094436e-07, + "logps/chosen": -46.69976043701172, + "logps/rejected": -57.63823699951172, + "loss": 0.2497, + "losses/dpo": 0.2243071049451828, + "losses/sft": 1.6626936197280884, + "losses/total": 0.2243071049451828, + "ref_logps/chosen": -33.87782287597656, + "ref_logps/rejected": -28.841154098510742, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2821935415267944, + "rewards/margins": 1.5975148677825928, + "rewards/rejected": -2.8797082901000977, + "step": 1557 + }, + { + "epoch": 1.47, + "grad_norm": 18.02433967590332, + "learning_rate": 2.8314095837705493e-07, + "logps/chosen": -46.7186279296875, + "logps/rejected": -71.32408905029297, + "loss": 0.3577, + "losses/dpo": 0.4005771577358246, + "losses/sft": 1.6510266065597534, + "losses/total": 0.4005771577358246, + "ref_logps/chosen": -35.55266571044922, + "ref_logps/rejected": -48.943687438964844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1165958642959595, + "rewards/margins": 1.1214449405670166, + "rewards/rejected": -2.2380409240722656, + "step": 1558 + }, + { + "epoch": 1.47, + "grad_norm": 32.218563079833984, + "learning_rate": 2.8296607205316544e-07, + "logps/chosen": -47.6123046875, + "logps/rejected": -64.24397277832031, + "loss": 0.5707, + "losses/dpo": 0.591403603553772, + "losses/sft": 1.5472232103347778, + "losses/total": 0.591403603553772, + "ref_logps/chosen": -35.12158203125, + "ref_logps/rejected": -43.7344970703125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.249072551727295, + "rewards/margins": 0.8018757104873657, + "rewards/rejected": -2.05094838142395, + "step": 1559 + }, + { + "epoch": 1.47, + "grad_norm": 18.46552848815918, + "learning_rate": 2.8279118572927596e-07, + "logps/chosen": -54.636051177978516, + "logps/rejected": -80.05775451660156, + "loss": 0.3058, + "losses/dpo": 0.2528434991836548, + "losses/sft": 2.1791927814483643, + "losses/total": 0.2528434991836548, + "ref_logps/chosen": -41.39192581176758, + "ref_logps/rejected": -51.63854217529297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3244123458862305, + "rewards/margins": 1.5175093412399292, + "rewards/rejected": -2.841921806335449, + "step": 1560 + }, + { + "epoch": 1.47, + "grad_norm": 29.531017303466797, + "learning_rate": 2.826162994053865e-07, + "logps/chosen": -53.762428283691406, + "logps/rejected": -54.455039978027344, + "loss": 0.4727, + "losses/dpo": 0.3079676032066345, + "losses/sft": 1.5105602741241455, + "losses/total": 0.3079676032066345, + "ref_logps/chosen": -40.69398880004883, + "ref_logps/rejected": -30.85037612915039, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3068442344665527, + "rewards/margins": 1.0536220073699951, + "rewards/rejected": -2.360466480255127, + "step": 1561 + }, + { + "epoch": 1.47, + "grad_norm": 26.355924606323242, + "learning_rate": 2.82441413081497e-07, + "logps/chosen": -63.31415939331055, + "logps/rejected": -83.74795532226562, + "loss": 0.3683, + "losses/dpo": 0.11343936622142792, + "losses/sft": 1.4646058082580566, + "losses/total": 0.11343936622142792, + "ref_logps/chosen": -44.809471130371094, + "ref_logps/rejected": -49.59527587890625, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.850468397140503, + "rewards/margins": 1.5647997856140137, + "rewards/rejected": -3.4152684211730957, + "step": 1562 + }, + { + "epoch": 1.48, + "grad_norm": 20.292633056640625, + "learning_rate": 2.8226652675760755e-07, + "logps/chosen": -53.957557678222656, + "logps/rejected": -73.56104278564453, + "loss": 0.3084, + "losses/dpo": 0.5798536539077759, + "losses/sft": 1.8515987396240234, + "losses/total": 0.5798536539077759, + "ref_logps/chosen": -40.87503433227539, + "ref_logps/rejected": -44.657562255859375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3082528114318848, + "rewards/margins": 1.5820951461791992, + "rewards/rejected": -2.890347957611084, + "step": 1563 + }, + { + "epoch": 1.48, + "grad_norm": 19.71109962463379, + "learning_rate": 2.8209164043371806e-07, + "logps/chosen": -47.39905548095703, + "logps/rejected": -75.1040267944336, + "loss": 0.342, + "losses/dpo": 0.25179505348205566, + "losses/sft": 1.9480195045471191, + "losses/total": 0.25179505348205566, + "ref_logps/chosen": -32.6230583190918, + "ref_logps/rejected": -43.57971954345703, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.47760009765625, + "rewards/margins": 1.674830436706543, + "rewards/rejected": -3.152430534362793, + "step": 1564 + }, + { + "epoch": 1.48, + "grad_norm": 23.368722915649414, + "learning_rate": 2.819167541098286e-07, + "logps/chosen": -42.84213638305664, + "logps/rejected": -71.26492309570312, + "loss": 0.3681, + "losses/dpo": 0.3649923801422119, + "losses/sft": 2.0411674976348877, + "losses/total": 0.3649923801422119, + "ref_logps/chosen": -27.057880401611328, + "ref_logps/rejected": -42.94731140136719, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.578425645828247, + "rewards/margins": 1.2533351182937622, + "rewards/rejected": -2.831760883331299, + "step": 1565 + }, + { + "epoch": 1.48, + "grad_norm": 26.619714736938477, + "learning_rate": 2.8174186778593914e-07, + "logps/chosen": -54.24474334716797, + "logps/rejected": -65.07306671142578, + "loss": 0.4966, + "losses/dpo": 0.3746645450592041, + "losses/sft": 2.3145899772644043, + "losses/total": 0.3746645450592041, + "ref_logps/chosen": -35.97780990600586, + "ref_logps/rejected": -36.86473846435547, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8266935348510742, + "rewards/margins": 0.9941390156745911, + "rewards/rejected": -2.8208324909210205, + "step": 1566 + }, + { + "epoch": 1.48, + "grad_norm": 32.63786697387695, + "learning_rate": 2.8156698146204965e-07, + "logps/chosen": -48.44013977050781, + "logps/rejected": -63.79279327392578, + "loss": 0.6562, + "losses/dpo": 0.6476283669471741, + "losses/sft": 1.9668405055999756, + "losses/total": 0.6476283669471741, + "ref_logps/chosen": -32.326194763183594, + "ref_logps/rejected": -43.733665466308594, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6113942861557007, + "rewards/margins": 0.39451873302459717, + "rewards/rejected": -2.005913019180298, + "step": 1567 + }, + { + "epoch": 1.48, + "grad_norm": 23.762386322021484, + "learning_rate": 2.813920951381602e-07, + "logps/chosen": -55.13279342651367, + "logps/rejected": -69.50912475585938, + "loss": 0.4667, + "losses/dpo": 0.49039798974990845, + "losses/sft": 1.7319002151489258, + "losses/total": 0.49039798974990845, + "ref_logps/chosen": -41.90827178955078, + "ref_logps/rejected": -43.75598907470703, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3224526643753052, + "rewards/margins": 1.2528605461120605, + "rewards/rejected": -2.575313091278076, + "step": 1568 + }, + { + "epoch": 1.48, + "grad_norm": 22.09877586364746, + "learning_rate": 2.812172088142707e-07, + "logps/chosen": -51.13863754272461, + "logps/rejected": -73.3948974609375, + "loss": 0.3349, + "losses/dpo": 0.2623365819454193, + "losses/sft": 1.7351672649383545, + "losses/total": 0.2623365819454193, + "ref_logps/chosen": -36.487667083740234, + "ref_logps/rejected": -45.0869255065918, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.465097188949585, + "rewards/margins": 1.3657000064849854, + "rewards/rejected": -2.8307971954345703, + "step": 1569 + }, + { + "epoch": 1.48, + "grad_norm": 24.633047103881836, + "learning_rate": 2.8104232249038124e-07, + "logps/chosen": -52.68830871582031, + "logps/rejected": -89.28404235839844, + "loss": 0.3344, + "losses/dpo": 0.2976940870285034, + "losses/sft": 1.8603380918502808, + "losses/total": 0.2976940870285034, + "ref_logps/chosen": -34.00613784790039, + "ref_logps/rejected": -53.935401916503906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8682169914245605, + "rewards/margins": 1.66664719581604, + "rewards/rejected": -3.5348639488220215, + "step": 1570 + }, + { + "epoch": 1.48, + "grad_norm": 31.272361755371094, + "learning_rate": 2.8086743616649175e-07, + "logps/chosen": -38.56789016723633, + "logps/rejected": -66.273193359375, + "loss": 0.5677, + "losses/dpo": 0.32743704319000244, + "losses/sft": 1.8801658153533936, + "losses/total": 0.32743704319000244, + "ref_logps/chosen": -27.371931076049805, + "ref_logps/rejected": -47.53741455078125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1195958852767944, + "rewards/margins": 0.7539824843406677, + "rewards/rejected": -1.873578429222107, + "step": 1571 + }, + { + "epoch": 1.48, + "grad_norm": 16.557924270629883, + "learning_rate": 2.806925498426023e-07, + "logps/chosen": -39.770877838134766, + "logps/rejected": -64.97610473632812, + "loss": 0.3123, + "losses/dpo": 0.07179497182369232, + "losses/sft": 1.649191975593567, + "losses/total": 0.07179497182369232, + "ref_logps/chosen": -30.08028793334961, + "ref_logps/rejected": -39.861572265625, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9690591096878052, + "rewards/margins": 1.542393684387207, + "rewards/rejected": -2.5114529132843018, + "step": 1572 + }, + { + "epoch": 1.49, + "grad_norm": 16.097074508666992, + "learning_rate": 2.8051766351871283e-07, + "logps/chosen": -38.5244255065918, + "logps/rejected": -56.38629913330078, + "loss": 0.3326, + "losses/dpo": 0.3651847541332245, + "losses/sft": 1.6834081411361694, + "losses/total": 0.3651847541332245, + "ref_logps/chosen": -29.10544776916504, + "ref_logps/rejected": -31.85451889038086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9418981075286865, + "rewards/margins": 1.511279821395874, + "rewards/rejected": -2.4531779289245605, + "step": 1573 + }, + { + "epoch": 1.49, + "grad_norm": 18.000150680541992, + "learning_rate": 2.8034277719482334e-07, + "logps/chosen": -53.27288818359375, + "logps/rejected": -86.36906433105469, + "loss": 0.2641, + "losses/dpo": 0.19226941466331482, + "losses/sft": 2.676643133163452, + "losses/total": 0.19226941466331482, + "ref_logps/chosen": -35.221588134765625, + "ref_logps/rejected": -52.99406433105469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8051300048828125, + "rewards/margins": 1.5323700904846191, + "rewards/rejected": -3.3375000953674316, + "step": 1574 + }, + { + "epoch": 1.49, + "grad_norm": 16.803436279296875, + "learning_rate": 2.801678908709339e-07, + "logps/chosen": -50.18499755859375, + "logps/rejected": -70.54275512695312, + "loss": 0.3553, + "losses/dpo": 0.4526533782482147, + "losses/sft": 1.590066909790039, + "losses/total": 0.4526533782482147, + "ref_logps/chosen": -35.84791564941406, + "ref_logps/rejected": -42.39314270019531, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.433708667755127, + "rewards/margins": 1.3812527656555176, + "rewards/rejected": -2.8149614334106445, + "step": 1575 + }, + { + "epoch": 1.49, + "grad_norm": 23.92568588256836, + "learning_rate": 2.7999300454704437e-07, + "logps/chosen": -48.44969940185547, + "logps/rejected": -63.28421401977539, + "loss": 0.414, + "losses/dpo": 0.35964441299438477, + "losses/sft": 1.9736664295196533, + "losses/total": 0.35964441299438477, + "ref_logps/chosen": -33.312408447265625, + "ref_logps/rejected": -39.735782623291016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5137293338775635, + "rewards/margins": 0.8411141633987427, + "rewards/rejected": -2.3548436164855957, + "step": 1576 + }, + { + "epoch": 1.49, + "grad_norm": 19.621519088745117, + "learning_rate": 2.7981811822315494e-07, + "logps/chosen": -45.52754211425781, + "logps/rejected": -61.8480224609375, + "loss": 0.4199, + "losses/dpo": 0.34698039293289185, + "losses/sft": 1.7505427598953247, + "losses/total": 0.34698039293289185, + "ref_logps/chosen": -34.88029861450195, + "ref_logps/rejected": -38.03106689453125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.064724326133728, + "rewards/margins": 1.3169710636138916, + "rewards/rejected": -2.381695508956909, + "step": 1577 + }, + { + "epoch": 1.49, + "grad_norm": 22.111160278320312, + "learning_rate": 2.796432318992655e-07, + "logps/chosen": -58.804664611816406, + "logps/rejected": -77.11788177490234, + "loss": 0.4165, + "losses/dpo": 0.511218786239624, + "losses/sft": 1.8728396892547607, + "losses/total": 0.511218786239624, + "ref_logps/chosen": -40.30754852294922, + "ref_logps/rejected": -48.91407012939453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.849711537361145, + "rewards/margins": 0.9706699252128601, + "rewards/rejected": -2.8203814029693604, + "step": 1578 + }, + { + "epoch": 1.49, + "grad_norm": 19.60536003112793, + "learning_rate": 2.79468345575376e-07, + "logps/chosen": -46.77680206298828, + "logps/rejected": -58.81962585449219, + "loss": 0.3809, + "losses/dpo": 0.42948460578918457, + "losses/sft": 1.9717991352081299, + "losses/total": 0.42948460578918457, + "ref_logps/chosen": -33.579078674316406, + "ref_logps/rejected": -35.79740524291992, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.319772481918335, + "rewards/margins": 0.9824498891830444, + "rewards/rejected": -2.30222225189209, + "step": 1579 + }, + { + "epoch": 1.49, + "grad_norm": 31.59912109375, + "learning_rate": 2.792934592514865e-07, + "logps/chosen": -52.323333740234375, + "logps/rejected": -80.10462951660156, + "loss": 0.4946, + "losses/dpo": 0.305744469165802, + "losses/sft": 1.9969563484191895, + "losses/total": 0.305744469165802, + "ref_logps/chosen": -36.05377197265625, + "ref_logps/rejected": -51.194400787353516, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6269561052322388, + "rewards/margins": 1.2640661001205444, + "rewards/rejected": -2.891022205352783, + "step": 1580 + }, + { + "epoch": 1.49, + "grad_norm": 26.642765045166016, + "learning_rate": 2.7911857292759704e-07, + "logps/chosen": -39.11742401123047, + "logps/rejected": -50.89830780029297, + "loss": 0.595, + "losses/dpo": 1.009821891784668, + "losses/sft": 2.328462600708008, + "losses/total": 1.009821891784668, + "ref_logps/chosen": -26.55136489868164, + "ref_logps/rejected": -32.081886291503906, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2566059827804565, + "rewards/margins": 0.6250360012054443, + "rewards/rejected": -1.8816421031951904, + "step": 1581 + }, + { + "epoch": 1.49, + "grad_norm": 17.78432273864746, + "learning_rate": 2.789436866037076e-07, + "logps/chosen": -49.73016357421875, + "logps/rejected": -53.685997009277344, + "loss": 0.4254, + "losses/dpo": 0.4622114896774292, + "losses/sft": 2.1299214363098145, + "losses/total": 0.4622114896774292, + "ref_logps/chosen": -37.85810852050781, + "ref_logps/rejected": -31.274436950683594, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1872059106826782, + "rewards/margins": 1.0539498329162598, + "rewards/rejected": -2.2411556243896484, + "step": 1582 + }, + { + "epoch": 1.49, + "grad_norm": 39.04930114746094, + "learning_rate": 2.7876880027981806e-07, + "logps/chosen": -58.037784576416016, + "logps/rejected": -70.38892364501953, + "loss": 0.6527, + "losses/dpo": 0.2935391366481781, + "losses/sft": 1.4305685758590698, + "losses/total": 0.2935391366481781, + "ref_logps/chosen": -41.22814178466797, + "ref_logps/rejected": -46.40721893310547, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6809645891189575, + "rewards/margins": 0.7172063589096069, + "rewards/rejected": -2.3981709480285645, + "step": 1583 + }, + { + "epoch": 1.5, + "grad_norm": 15.458442687988281, + "learning_rate": 2.7859391395592863e-07, + "logps/chosen": -42.144535064697266, + "logps/rejected": -68.74707794189453, + "loss": 0.2711, + "losses/dpo": 0.3337159752845764, + "losses/sft": 1.76071035861969, + "losses/total": 0.3337159752845764, + "ref_logps/chosen": -30.88461685180664, + "ref_logps/rejected": -42.31597900390625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.125991702079773, + "rewards/margins": 1.5171184539794922, + "rewards/rejected": -2.6431102752685547, + "step": 1584 + }, + { + "epoch": 1.5, + "grad_norm": 17.06437873840332, + "learning_rate": 2.784190276320392e-07, + "logps/chosen": -42.097023010253906, + "logps/rejected": -63.55470275878906, + "loss": 0.3731, + "losses/dpo": 0.447040855884552, + "losses/sft": 1.8044205904006958, + "losses/total": 0.447040855884552, + "ref_logps/chosen": -33.403717041015625, + "ref_logps/rejected": -42.10512924194336, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8693302869796753, + "rewards/margins": 1.2756270170211792, + "rewards/rejected": -2.1449573040008545, + "step": 1585 + }, + { + "epoch": 1.5, + "grad_norm": 21.871400833129883, + "learning_rate": 2.782441413081497e-07, + "logps/chosen": -47.751651763916016, + "logps/rejected": -65.92152404785156, + "loss": 0.4306, + "losses/dpo": 0.540984034538269, + "losses/sft": 1.7272223234176636, + "losses/total": 0.540984034538269, + "ref_logps/chosen": -30.07518768310547, + "ref_logps/rejected": -39.9300537109375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.767646312713623, + "rewards/margins": 0.8315005302429199, + "rewards/rejected": -2.599146842956543, + "step": 1586 + }, + { + "epoch": 1.5, + "grad_norm": 13.564169883728027, + "learning_rate": 2.780692549842602e-07, + "logps/chosen": -50.20806121826172, + "logps/rejected": -80.58341979980469, + "loss": 0.2536, + "losses/dpo": 0.25744491815567017, + "losses/sft": 2.364044427871704, + "losses/total": 0.25744491815567017, + "ref_logps/chosen": -37.42918395996094, + "ref_logps/rejected": -49.507225036621094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2778877019882202, + "rewards/margins": 1.8297314643859863, + "rewards/rejected": -3.107619047164917, + "step": 1587 + }, + { + "epoch": 1.5, + "grad_norm": 21.28896713256836, + "learning_rate": 2.7789436866037073e-07, + "logps/chosen": -49.551422119140625, + "logps/rejected": -72.1412353515625, + "loss": 0.3334, + "losses/dpo": 0.690658688545227, + "losses/sft": 2.1990537643432617, + "losses/total": 0.690658688545227, + "ref_logps/chosen": -34.629905700683594, + "ref_logps/rejected": -41.382835388183594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4921514987945557, + "rewards/margins": 1.5836892127990723, + "rewards/rejected": -3.075840473175049, + "step": 1588 + }, + { + "epoch": 1.5, + "grad_norm": 35.028602600097656, + "learning_rate": 2.777194823364813e-07, + "logps/chosen": -51.10850524902344, + "logps/rejected": -65.39685821533203, + "loss": 0.5529, + "losses/dpo": 0.848666787147522, + "losses/sft": 1.9485321044921875, + "losses/total": 0.848666787147522, + "ref_logps/chosen": -33.31840133666992, + "ref_logps/rejected": -39.65271759033203, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.779010534286499, + "rewards/margins": 0.7954035997390747, + "rewards/rejected": -2.5744142532348633, + "step": 1589 + }, + { + "epoch": 1.5, + "grad_norm": 24.854259490966797, + "learning_rate": 2.7754459601259176e-07, + "logps/chosen": -45.76518630981445, + "logps/rejected": -61.96608352661133, + "loss": 0.3931, + "losses/dpo": 0.2894047796726227, + "losses/sft": 1.5795376300811768, + "losses/total": 0.2894047796726227, + "ref_logps/chosen": -32.27042007446289, + "ref_logps/rejected": -35.597530364990234, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3494765758514404, + "rewards/margins": 1.2873790264129639, + "rewards/rejected": -2.636855363845825, + "step": 1590 + }, + { + "epoch": 1.5, + "grad_norm": 30.77260971069336, + "learning_rate": 2.773697096887023e-07, + "logps/chosen": -66.782958984375, + "logps/rejected": -73.30743408203125, + "loss": 0.6087, + "losses/dpo": 0.8013497591018677, + "losses/sft": 1.7161396741867065, + "losses/total": 0.8013497591018677, + "ref_logps/chosen": -44.82712936401367, + "ref_logps/rejected": -44.11414337158203, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.195582866668701, + "rewards/margins": 0.7237467169761658, + "rewards/rejected": -2.9193296432495117, + "step": 1591 + }, + { + "epoch": 1.5, + "grad_norm": 21.13929557800293, + "learning_rate": 2.771948233648129e-07, + "logps/chosen": -41.46681594848633, + "logps/rejected": -68.57009887695312, + "loss": 0.306, + "losses/dpo": 0.15988892316818237, + "losses/sft": 1.5330132246017456, + "losses/total": 0.15988892316818237, + "ref_logps/chosen": -29.467641830444336, + "ref_logps/rejected": -41.549949645996094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1999173164367676, + "rewards/margins": 1.502097249031067, + "rewards/rejected": -2.702014446258545, + "step": 1592 + }, + { + "epoch": 1.5, + "grad_norm": 13.868195533752441, + "learning_rate": 2.770199370409234e-07, + "logps/chosen": -49.54917907714844, + "logps/rejected": -89.79420471191406, + "loss": 0.2149, + "losses/dpo": 0.1659305840730667, + "losses/sft": 2.0476858615875244, + "losses/total": 0.1659305840730667, + "ref_logps/chosen": -36.629356384277344, + "ref_logps/rejected": -55.23522186279297, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.291982650756836, + "rewards/margins": 2.1639156341552734, + "rewards/rejected": -3.4558980464935303, + "step": 1593 + }, + { + "epoch": 1.51, + "grad_norm": 29.07862091064453, + "learning_rate": 2.768450507170339e-07, + "logps/chosen": -53.14488220214844, + "logps/rejected": -69.19747161865234, + "loss": 0.4449, + "losses/dpo": 0.26309823989868164, + "losses/sft": 2.200120687484741, + "losses/total": 0.26309823989868164, + "ref_logps/chosen": -38.730194091796875, + "ref_logps/rejected": -44.16493225097656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4414689540863037, + "rewards/margins": 1.0617849826812744, + "rewards/rejected": -2.503253936767578, + "step": 1594 + }, + { + "epoch": 1.51, + "grad_norm": 15.981719970703125, + "learning_rate": 2.7667016439314443e-07, + "logps/chosen": -50.74475860595703, + "logps/rejected": -72.94822692871094, + "loss": 0.2646, + "losses/dpo": 0.23204100131988525, + "losses/sft": 1.665698766708374, + "losses/total": 0.23204100131988525, + "ref_logps/chosen": -35.56131362915039, + "ref_logps/rejected": -44.1530876159668, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5183440446853638, + "rewards/margins": 1.3611699342727661, + "rewards/rejected": -2.87951397895813, + "step": 1595 + }, + { + "epoch": 1.51, + "grad_norm": 21.13315773010254, + "learning_rate": 2.76495278069255e-07, + "logps/chosen": -42.29929733276367, + "logps/rejected": -63.52539825439453, + "loss": 0.3857, + "losses/dpo": 0.37726065516471863, + "losses/sft": 2.1821720600128174, + "losses/total": 0.37726065516471863, + "ref_logps/chosen": -29.5195255279541, + "ref_logps/rejected": -38.37900924682617, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2779769897460938, + "rewards/margins": 1.2366619110107422, + "rewards/rejected": -2.514638662338257, + "step": 1596 + }, + { + "epoch": 1.51, + "grad_norm": 14.99760627746582, + "learning_rate": 2.7632039174536545e-07, + "logps/chosen": -38.87886428833008, + "logps/rejected": -70.91326141357422, + "loss": 0.2672, + "losses/dpo": 0.24154485762119293, + "losses/sft": 2.2009530067443848, + "losses/total": 0.24154485762119293, + "ref_logps/chosen": -28.733173370361328, + "ref_logps/rejected": -42.064971923828125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0145691633224487, + "rewards/margins": 1.8702595233917236, + "rewards/rejected": -2.884828567504883, + "step": 1597 + }, + { + "epoch": 1.51, + "grad_norm": 31.890321731567383, + "learning_rate": 2.76145505421476e-07, + "logps/chosen": -59.677406311035156, + "logps/rejected": -83.85343933105469, + "loss": 0.2931, + "losses/dpo": 0.45264574885368347, + "losses/sft": 1.9471641778945923, + "losses/total": 0.45264574885368347, + "ref_logps/chosen": -43.396541595458984, + "ref_logps/rejected": -51.72345733642578, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6280864477157593, + "rewards/margins": 1.5849120616912842, + "rewards/rejected": -3.212998628616333, + "step": 1598 + }, + { + "epoch": 1.51, + "grad_norm": 26.067333221435547, + "learning_rate": 2.759706190975866e-07, + "logps/chosen": -49.743629455566406, + "logps/rejected": -63.852630615234375, + "loss": 0.3928, + "losses/dpo": 0.20356914401054382, + "losses/sft": 1.668600082397461, + "losses/total": 0.20356914401054382, + "ref_logps/chosen": -36.41566848754883, + "ref_logps/rejected": -36.86611557006836, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3327960968017578, + "rewards/margins": 1.36585533618927, + "rewards/rejected": -2.6986515522003174, + "step": 1599 + }, + { + "epoch": 1.51, + "grad_norm": 27.520532608032227, + "learning_rate": 2.757957327736971e-07, + "logps/chosen": -54.62408447265625, + "logps/rejected": -74.0721435546875, + "loss": 0.3644, + "losses/dpo": 0.3439791798591614, + "losses/sft": 1.6578291654586792, + "losses/total": 0.3439791798591614, + "ref_logps/chosen": -40.1357536315918, + "ref_logps/rejected": -42.285789489746094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4488331079483032, + "rewards/margins": 1.7298022508621216, + "rewards/rejected": -3.178635358810425, + "step": 1600 + }, + { + "epoch": 1.51, + "grad_norm": 36.437744140625, + "learning_rate": 2.756208464498076e-07, + "logps/chosen": -54.70110321044922, + "logps/rejected": -62.645633697509766, + "loss": 0.5619, + "losses/dpo": 0.44855254888534546, + "losses/sft": 1.701101303100586, + "losses/total": 0.44855254888534546, + "ref_logps/chosen": -35.591583251953125, + "ref_logps/rejected": -33.868446350097656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.910952091217041, + "rewards/margins": 0.9667667746543884, + "rewards/rejected": -2.877718925476074, + "step": 1601 + }, + { + "epoch": 1.51, + "grad_norm": 37.80801010131836, + "learning_rate": 2.754459601259181e-07, + "logps/chosen": -58.61450958251953, + "logps/rejected": -88.08565521240234, + "loss": 0.6349, + "losses/dpo": 0.6244620680809021, + "losses/sft": 1.8752890825271606, + "losses/total": 0.6244620680809021, + "ref_logps/chosen": -38.55999755859375, + "ref_logps/rejected": -56.104698181152344, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.005451202392578, + "rewards/margins": 1.1926450729370117, + "rewards/rejected": -3.19809627532959, + "step": 1602 + }, + { + "epoch": 1.51, + "grad_norm": 37.187156677246094, + "learning_rate": 2.752710738020287e-07, + "logps/chosen": -68.95501708984375, + "logps/rejected": -74.0699691772461, + "loss": 0.6101, + "losses/dpo": 0.6041665077209473, + "losses/sft": 1.6167887449264526, + "losses/total": 0.6041665077209473, + "ref_logps/chosen": -53.5874137878418, + "ref_logps/rejected": -48.076377868652344, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5367608070373535, + "rewards/margins": 1.0625981092453003, + "rewards/rejected": -2.5993587970733643, + "step": 1603 + }, + { + "epoch": 1.51, + "grad_norm": 24.92690086364746, + "learning_rate": 2.750961874781392e-07, + "logps/chosen": -48.456871032714844, + "logps/rejected": -64.7651596069336, + "loss": 0.406, + "losses/dpo": 0.4590799808502197, + "losses/sft": 2.4898979663848877, + "losses/total": 0.4590799808502197, + "ref_logps/chosen": -32.762123107910156, + "ref_logps/rejected": -37.21025085449219, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5694751739501953, + "rewards/margins": 1.1860153675079346, + "rewards/rejected": -2.755490779876709, + "step": 1604 + }, + { + "epoch": 1.52, + "grad_norm": 30.134885787963867, + "learning_rate": 2.749213011542497e-07, + "logps/chosen": -50.478050231933594, + "logps/rejected": -75.0337142944336, + "loss": 0.5033, + "losses/dpo": 0.37894198298454285, + "losses/sft": 2.071441650390625, + "losses/total": 0.37894198298454285, + "ref_logps/chosen": -33.355438232421875, + "ref_logps/rejected": -46.22139358520508, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.712261438369751, + "rewards/margins": 1.168969988822937, + "rewards/rejected": -2.8812315464019775, + "step": 1605 + }, + { + "epoch": 1.52, + "grad_norm": 26.25534439086914, + "learning_rate": 2.747464148303603e-07, + "logps/chosen": -46.702392578125, + "logps/rejected": -62.1175651550293, + "loss": 0.4033, + "losses/dpo": 0.3480875790119171, + "losses/sft": 1.6217899322509766, + "losses/total": 0.3480875790119171, + "ref_logps/chosen": -32.0451545715332, + "ref_logps/rejected": -36.74039840698242, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4657235145568848, + "rewards/margins": 1.0719932317733765, + "rewards/rejected": -2.537716865539551, + "step": 1606 + }, + { + "epoch": 1.52, + "grad_norm": 20.10713005065918, + "learning_rate": 2.745715285064708e-07, + "logps/chosen": -44.955055236816406, + "logps/rejected": -77.20957946777344, + "loss": 0.3435, + "losses/dpo": 0.3660425543785095, + "losses/sft": 1.945845365524292, + "losses/total": 0.3660425543785095, + "ref_logps/chosen": -32.77898025512695, + "ref_logps/rejected": -50.41749572753906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2176077365875244, + "rewards/margins": 1.461601734161377, + "rewards/rejected": -2.6792097091674805, + "step": 1607 + }, + { + "epoch": 1.52, + "grad_norm": 20.93253517150879, + "learning_rate": 2.743966421825813e-07, + "logps/chosen": -52.94804382324219, + "logps/rejected": -66.85520935058594, + "loss": 0.378, + "losses/dpo": 0.10774748027324677, + "losses/sft": 1.7420648336410522, + "losses/total": 0.10774748027324677, + "ref_logps/chosen": -40.665367126464844, + "ref_logps/rejected": -43.21400451660156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2282679080963135, + "rewards/margins": 1.1358526945114136, + "rewards/rejected": -2.3641204833984375, + "step": 1608 + }, + { + "epoch": 1.52, + "grad_norm": 21.789220809936523, + "learning_rate": 2.742217558586918e-07, + "logps/chosen": -59.301334381103516, + "logps/rejected": -65.62680053710938, + "loss": 0.389, + "losses/dpo": 0.24781490862369537, + "losses/sft": 2.024040699005127, + "losses/total": 0.24781490862369537, + "ref_logps/chosen": -45.508140563964844, + "ref_logps/rejected": -40.55152893066406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.379319429397583, + "rewards/margins": 1.1282069683074951, + "rewards/rejected": -2.507526397705078, + "step": 1609 + }, + { + "epoch": 1.52, + "grad_norm": 17.608909606933594, + "learning_rate": 2.740468695348024e-07, + "logps/chosen": -45.20891571044922, + "logps/rejected": -88.14855194091797, + "loss": 0.2216, + "losses/dpo": 0.3600003123283386, + "losses/sft": 2.1840012073516846, + "losses/total": 0.3600003123283386, + "ref_logps/chosen": -32.8494873046875, + "ref_logps/rejected": -53.32741165161133, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2359428405761719, + "rewards/margins": 2.246171474456787, + "rewards/rejected": -3.482114315032959, + "step": 1610 + }, + { + "epoch": 1.52, + "grad_norm": 27.919546127319336, + "learning_rate": 2.738719832109129e-07, + "logps/chosen": -41.939048767089844, + "logps/rejected": -66.14915466308594, + "loss": 0.4301, + "losses/dpo": 0.5740477442741394, + "losses/sft": 1.614914894104004, + "losses/total": 0.5740477442741394, + "ref_logps/chosen": -30.83543586730957, + "ref_logps/rejected": -42.79560089111328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1103609800338745, + "rewards/margins": 1.224994421005249, + "rewards/rejected": -2.335355520248413, + "step": 1611 + }, + { + "epoch": 1.52, + "grad_norm": 25.810455322265625, + "learning_rate": 2.736970968870234e-07, + "logps/chosen": -52.598716735839844, + "logps/rejected": -77.66197204589844, + "loss": 0.335, + "losses/dpo": 0.33409860730171204, + "losses/sft": 1.7981247901916504, + "losses/total": 0.33409860730171204, + "ref_logps/chosen": -37.69508743286133, + "ref_logps/rejected": -47.655296325683594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4903628826141357, + "rewards/margins": 1.5103046894073486, + "rewards/rejected": -3.0006678104400635, + "step": 1612 + }, + { + "epoch": 1.52, + "grad_norm": 27.2560977935791, + "learning_rate": 2.7352221056313397e-07, + "logps/chosen": -50.28767395019531, + "logps/rejected": -61.483577728271484, + "loss": 0.5036, + "losses/dpo": 0.6138076782226562, + "losses/sft": 1.6396846771240234, + "losses/total": 0.6138076782226562, + "ref_logps/chosen": -37.69236755371094, + "ref_logps/rejected": -36.72338104248047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2595303058624268, + "rewards/margins": 1.216489553451538, + "rewards/rejected": -2.476019859313965, + "step": 1613 + }, + { + "epoch": 1.52, + "grad_norm": 19.068532943725586, + "learning_rate": 2.733473242392445e-07, + "logps/chosen": -49.49738693237305, + "logps/rejected": -75.90866088867188, + "loss": 0.2989, + "losses/dpo": 0.23595218360424042, + "losses/sft": 2.039900779724121, + "losses/total": 0.23595218360424042, + "ref_logps/chosen": -34.36888885498047, + "ref_logps/rejected": -45.201568603515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.512850046157837, + "rewards/margins": 1.5578593015670776, + "rewards/rejected": -3.070709466934204, + "step": 1614 + }, + { + "epoch": 1.53, + "grad_norm": 20.726409912109375, + "learning_rate": 2.73172437915355e-07, + "logps/chosen": -45.3119010925293, + "logps/rejected": -67.6421127319336, + "loss": 0.357, + "losses/dpo": 0.5576993823051453, + "losses/sft": 1.5033469200134277, + "losses/total": 0.5576993823051453, + "ref_logps/chosen": -30.938488006591797, + "ref_logps/rejected": -38.47358703613281, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4373414516448975, + "rewards/margins": 1.479511022567749, + "rewards/rejected": -2.9168524742126465, + "step": 1615 + }, + { + "epoch": 1.53, + "grad_norm": 14.721166610717773, + "learning_rate": 2.729975515914655e-07, + "logps/chosen": -54.51541519165039, + "logps/rejected": -85.97838592529297, + "loss": 0.1938, + "losses/dpo": 0.15921086072921753, + "losses/sft": 1.546753168106079, + "losses/total": 0.15921086072921753, + "ref_logps/chosen": -41.91796112060547, + "ref_logps/rejected": -51.820594787597656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2597455978393555, + "rewards/margins": 2.156033515930176, + "rewards/rejected": -3.4157791137695312, + "step": 1616 + }, + { + "epoch": 1.53, + "grad_norm": 14.837723731994629, + "learning_rate": 2.728226652675761e-07, + "logps/chosen": -49.117774963378906, + "logps/rejected": -88.85385131835938, + "loss": 0.2487, + "losses/dpo": 0.2684962749481201, + "losses/sft": 2.1303884983062744, + "losses/total": 0.2684962749481201, + "ref_logps/chosen": -37.05741882324219, + "ref_logps/rejected": -53.95323944091797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2060357332229614, + "rewards/margins": 2.2840256690979004, + "rewards/rejected": -3.4900612831115723, + "step": 1617 + }, + { + "epoch": 1.53, + "grad_norm": 30.443340301513672, + "learning_rate": 2.726477789436866e-07, + "logps/chosen": -44.841835021972656, + "logps/rejected": -62.176063537597656, + "loss": 0.4849, + "losses/dpo": 0.45843416452407837, + "losses/sft": 1.3392354249954224, + "losses/total": 0.45843416452407837, + "ref_logps/chosen": -32.64509963989258, + "ref_logps/rejected": -38.978004455566406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2196736335754395, + "rewards/margins": 1.100132942199707, + "rewards/rejected": -2.3198065757751465, + "step": 1618 + }, + { + "epoch": 1.53, + "grad_norm": 24.091711044311523, + "learning_rate": 2.724728926197971e-07, + "logps/chosen": -55.11222839355469, + "logps/rejected": -65.99240112304688, + "loss": 0.4108, + "losses/dpo": 0.4813131093978882, + "losses/sft": 1.751856803894043, + "losses/total": 0.4813131093978882, + "ref_logps/chosen": -39.802066802978516, + "ref_logps/rejected": -42.212806701660156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.53101646900177, + "rewards/margins": 0.8469429612159729, + "rewards/rejected": -2.3779592514038086, + "step": 1619 + }, + { + "epoch": 1.53, + "grad_norm": 31.9316349029541, + "learning_rate": 2.7229800629590767e-07, + "logps/chosen": -50.680057525634766, + "logps/rejected": -77.05933380126953, + "loss": 0.6088, + "losses/dpo": 0.34035658836364746, + "losses/sft": 1.5466454029083252, + "losses/total": 0.34035658836364746, + "ref_logps/chosen": -35.495094299316406, + "ref_logps/rejected": -51.928260803222656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5184965133666992, + "rewards/margins": 0.9946112632751465, + "rewards/rejected": -2.5131077766418457, + "step": 1620 + }, + { + "epoch": 1.53, + "grad_norm": 25.840150833129883, + "learning_rate": 2.721231199720182e-07, + "logps/chosen": -53.757896423339844, + "logps/rejected": -69.1905746459961, + "loss": 0.4877, + "losses/dpo": 0.4141101837158203, + "losses/sft": 1.570658564567566, + "losses/total": 0.4141101837158203, + "ref_logps/chosen": -40.60039520263672, + "ref_logps/rejected": -43.09496307373047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3157503604888916, + "rewards/margins": 1.2938103675842285, + "rewards/rejected": -2.609560966491699, + "step": 1621 + }, + { + "epoch": 1.53, + "grad_norm": 22.033201217651367, + "learning_rate": 2.719482336481287e-07, + "logps/chosen": -47.648292541503906, + "logps/rejected": -65.65187072753906, + "loss": 0.4527, + "losses/dpo": 0.6945185661315918, + "losses/sft": 1.9615315198898315, + "losses/total": 0.6945185661315918, + "ref_logps/chosen": -37.45869445800781, + "ref_logps/rejected": -43.24253845214844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0189597606658936, + "rewards/margins": 1.2219738960266113, + "rewards/rejected": -2.240933418273926, + "step": 1622 + }, + { + "epoch": 1.53, + "grad_norm": 24.417654037475586, + "learning_rate": 2.7177334732423926e-07, + "logps/chosen": -57.761863708496094, + "logps/rejected": -74.31767272949219, + "loss": 0.3645, + "losses/dpo": 0.14395514130592346, + "losses/sft": 1.4621645212173462, + "losses/total": 0.14395514130592346, + "ref_logps/chosen": -42.51466751098633, + "ref_logps/rejected": -48.25796127319336, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5247198343276978, + "rewards/margins": 1.0812511444091797, + "rewards/rejected": -2.605971336364746, + "step": 1623 + }, + { + "epoch": 1.53, + "grad_norm": 18.523330688476562, + "learning_rate": 2.7159846100034977e-07, + "logps/chosen": -52.2335205078125, + "logps/rejected": -81.66960144042969, + "loss": 0.3326, + "losses/dpo": 0.20169439911842346, + "losses/sft": 1.5033025741577148, + "losses/total": 0.20169439911842346, + "ref_logps/chosen": -40.988014221191406, + "ref_logps/rejected": -52.44248962402344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1245503425598145, + "rewards/margins": 1.7981605529785156, + "rewards/rejected": -2.92271089553833, + "step": 1624 + }, + { + "epoch": 1.53, + "grad_norm": 21.227066040039062, + "learning_rate": 2.714235746764603e-07, + "logps/chosen": -44.20942306518555, + "logps/rejected": -67.27243041992188, + "loss": 0.3199, + "losses/dpo": 0.3612087368965149, + "losses/sft": 1.516041874885559, + "losses/total": 0.3612087368965149, + "ref_logps/chosen": -31.2517147064209, + "ref_logps/rejected": -37.37519073486328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2957711219787598, + "rewards/margins": 1.693953037261963, + "rewards/rejected": -2.9897239208221436, + "step": 1625 + }, + { + "epoch": 1.54, + "grad_norm": 18.714885711669922, + "learning_rate": 2.712486883525708e-07, + "logps/chosen": -39.56517028808594, + "logps/rejected": -75.67193603515625, + "loss": 0.3049, + "losses/dpo": 0.4981847405433655, + "losses/sft": 1.671005129814148, + "losses/total": 0.4981847405433655, + "ref_logps/chosen": -29.99589729309082, + "ref_logps/rejected": -50.809783935546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9569271206855774, + "rewards/margins": 1.529287576675415, + "rewards/rejected": -2.4862146377563477, + "step": 1626 + }, + { + "epoch": 1.54, + "grad_norm": 18.798629760742188, + "learning_rate": 2.7107380202868136e-07, + "logps/chosen": -37.07057189941406, + "logps/rejected": -50.39424133300781, + "loss": 0.3917, + "losses/dpo": 0.2724947929382324, + "losses/sft": 1.7756692171096802, + "losses/total": 0.2724947929382324, + "ref_logps/chosen": -28.9957332611084, + "ref_logps/rejected": -31.157318115234375, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8074840307235718, + "rewards/margins": 1.1162086725234985, + "rewards/rejected": -1.9236927032470703, + "step": 1627 + }, + { + "epoch": 1.54, + "grad_norm": 26.653005599975586, + "learning_rate": 2.708989157047919e-07, + "logps/chosen": -60.65376281738281, + "logps/rejected": -73.37484741210938, + "loss": 0.4239, + "losses/dpo": 0.45428338646888733, + "losses/sft": 2.1997427940368652, + "losses/total": 0.45428338646888733, + "ref_logps/chosen": -43.859615325927734, + "ref_logps/rejected": -44.42409896850586, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6794148683547974, + "rewards/margins": 1.2156600952148438, + "rewards/rejected": -2.8950748443603516, + "step": 1628 + }, + { + "epoch": 1.54, + "grad_norm": 20.16916847229004, + "learning_rate": 2.707240293809024e-07, + "logps/chosen": -55.38658142089844, + "logps/rejected": -60.91698455810547, + "loss": 0.3616, + "losses/dpo": 0.20826703310012817, + "losses/sft": 1.8224296569824219, + "losses/total": 0.20826703310012817, + "ref_logps/chosen": -39.526954650878906, + "ref_logps/rejected": -34.66309356689453, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5859628915786743, + "rewards/margins": 1.0394259691238403, + "rewards/rejected": -2.6253886222839355, + "step": 1629 + }, + { + "epoch": 1.54, + "grad_norm": 20.150941848754883, + "learning_rate": 2.7054914305701295e-07, + "logps/chosen": -51.59336471557617, + "logps/rejected": -88.48597717285156, + "loss": 0.3071, + "losses/dpo": 0.16902276873588562, + "losses/sft": 2.2553014755249023, + "losses/total": 0.16902276873588562, + "ref_logps/chosen": -38.70440673828125, + "ref_logps/rejected": -56.46617126464844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.288895606994629, + "rewards/margins": 1.913084864616394, + "rewards/rejected": -3.2019805908203125, + "step": 1630 + }, + { + "epoch": 1.54, + "grad_norm": 32.25161361694336, + "learning_rate": 2.7037425673312347e-07, + "logps/chosen": -51.89531707763672, + "logps/rejected": -66.19013977050781, + "loss": 0.5398, + "losses/dpo": 0.570817232131958, + "losses/sft": 1.7185765504837036, + "losses/total": 0.570817232131958, + "ref_logps/chosen": -37.96915054321289, + "ref_logps/rejected": -41.259674072265625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3926165103912354, + "rewards/margins": 1.1004307270050049, + "rewards/rejected": -2.493046998977661, + "step": 1631 + }, + { + "epoch": 1.54, + "grad_norm": 23.103656768798828, + "learning_rate": 2.70199370409234e-07, + "logps/chosen": -43.867820739746094, + "logps/rejected": -60.219703674316406, + "loss": 0.4104, + "losses/dpo": 0.4848886728286743, + "losses/sft": 1.5061545372009277, + "losses/total": 0.4848886728286743, + "ref_logps/chosen": -31.769641876220703, + "ref_logps/rejected": -36.95215606689453, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2098181247711182, + "rewards/margins": 1.1169370412826538, + "rewards/rejected": -2.3267550468444824, + "step": 1632 + }, + { + "epoch": 1.54, + "grad_norm": 23.330171585083008, + "learning_rate": 2.700244840853445e-07, + "logps/chosen": -77.50054168701172, + "logps/rejected": -99.7005615234375, + "loss": 0.3197, + "losses/dpo": 0.1342008411884308, + "losses/sft": 2.5591113567352295, + "losses/total": 0.1342008411884308, + "ref_logps/chosen": -57.61627197265625, + "ref_logps/rejected": -63.15528869628906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9884271621704102, + "rewards/margins": 1.6661008596420288, + "rewards/rejected": -3.6545281410217285, + "step": 1633 + }, + { + "epoch": 1.54, + "grad_norm": 33.40924072265625, + "learning_rate": 2.6984959776145506e-07, + "logps/chosen": -62.27132797241211, + "logps/rejected": -86.947509765625, + "loss": 0.5329, + "losses/dpo": 0.6631467342376709, + "losses/sft": 2.3627607822418213, + "losses/total": 0.6631467342376709, + "ref_logps/chosen": -43.612518310546875, + "ref_logps/rejected": -57.89971923828125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.865881085395813, + "rewards/margins": 1.0388987064361572, + "rewards/rejected": -2.9047799110412598, + "step": 1634 + }, + { + "epoch": 1.54, + "grad_norm": 28.160261154174805, + "learning_rate": 2.6967471143756557e-07, + "logps/chosen": -56.79981994628906, + "logps/rejected": -83.97128295898438, + "loss": 0.4519, + "losses/dpo": 0.4497472047805786, + "losses/sft": 2.3514842987060547, + "losses/total": 0.4497472047805786, + "ref_logps/chosen": -41.22658157348633, + "ref_logps/rejected": -52.50547790527344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.557323932647705, + "rewards/margins": 1.5892574787139893, + "rewards/rejected": -3.1465816497802734, + "step": 1635 + }, + { + "epoch": 1.54, + "grad_norm": 22.036100387573242, + "learning_rate": 2.694998251136761e-07, + "logps/chosen": -43.375343322753906, + "logps/rejected": -76.14234924316406, + "loss": 0.3429, + "losses/dpo": 0.5489202737808228, + "losses/sft": 1.8918821811676025, + "losses/total": 0.5489202737808228, + "ref_logps/chosen": -30.27825164794922, + "ref_logps/rejected": -47.488224029541016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3097093105316162, + "rewards/margins": 1.5557036399841309, + "rewards/rejected": -2.865412950515747, + "step": 1636 + }, + { + "epoch": 1.55, + "grad_norm": 16.719146728515625, + "learning_rate": 2.6932493878978665e-07, + "logps/chosen": -54.67706298828125, + "logps/rejected": -82.48965454101562, + "loss": 0.2847, + "losses/dpo": 0.2864099144935608, + "losses/sft": 2.197626829147339, + "losses/total": 0.2864099144935608, + "ref_logps/chosen": -41.088531494140625, + "ref_logps/rejected": -51.77882385253906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3588535785675049, + "rewards/margins": 1.7122292518615723, + "rewards/rejected": -3.0710830688476562, + "step": 1637 + }, + { + "epoch": 1.55, + "grad_norm": 22.877870559692383, + "learning_rate": 2.6915005246589716e-07, + "logps/chosen": -37.54486083984375, + "logps/rejected": -51.50291442871094, + "loss": 0.4067, + "losses/dpo": 0.47823959589004517, + "losses/sft": 2.042141914367676, + "losses/total": 0.47823959589004517, + "ref_logps/chosen": -26.308597564697266, + "ref_logps/rejected": -30.071317672729492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.123626470565796, + "rewards/margins": 1.0195329189300537, + "rewards/rejected": -2.1431596279144287, + "step": 1638 + }, + { + "epoch": 1.55, + "grad_norm": 22.970375061035156, + "learning_rate": 2.6897516614200767e-07, + "logps/chosen": -48.15448760986328, + "logps/rejected": -62.06604766845703, + "loss": 0.4829, + "losses/dpo": 0.6983827352523804, + "losses/sft": 2.161217451095581, + "losses/total": 0.6983827352523804, + "ref_logps/chosen": -35.08259201049805, + "ref_logps/rejected": -38.79847717285156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.30718994140625, + "rewards/margins": 1.0195668935775757, + "rewards/rejected": -2.3267569541931152, + "step": 1639 + }, + { + "epoch": 1.55, + "grad_norm": 41.678348541259766, + "learning_rate": 2.688002798181182e-07, + "logps/chosen": -48.52545166015625, + "logps/rejected": -63.07997512817383, + "loss": 0.5523, + "losses/dpo": 0.23472118377685547, + "losses/sft": 1.5837814807891846, + "losses/total": 0.23472118377685547, + "ref_logps/chosen": -32.400516510009766, + "ref_logps/rejected": -38.260520935058594, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6124935150146484, + "rewards/margins": 0.8694522976875305, + "rewards/rejected": -2.4819459915161133, + "step": 1640 + }, + { + "epoch": 1.55, + "grad_norm": 28.9033145904541, + "learning_rate": 2.6862539349422875e-07, + "logps/chosen": -71.91659545898438, + "logps/rejected": -96.31245422363281, + "loss": 0.3554, + "losses/dpo": 0.05449855327606201, + "losses/sft": 1.3932918310165405, + "losses/total": 0.05449855327606201, + "ref_logps/chosen": -52.96250915527344, + "ref_logps/rejected": -60.115394592285156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8954088687896729, + "rewards/margins": 1.7242975234985352, + "rewards/rejected": -3.619706630706787, + "step": 1641 + }, + { + "epoch": 1.55, + "grad_norm": 25.54038429260254, + "learning_rate": 2.684505071703393e-07, + "logps/chosen": -50.156150817871094, + "logps/rejected": -72.30618286132812, + "loss": 0.4543, + "losses/dpo": 0.5111289024353027, + "losses/sft": 2.578420400619507, + "losses/total": 0.5111289024353027, + "ref_logps/chosen": -34.52027893066406, + "ref_logps/rejected": -43.94939422607422, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5635875463485718, + "rewards/margins": 1.2720911502838135, + "rewards/rejected": -2.835678815841675, + "step": 1642 + }, + { + "epoch": 1.55, + "grad_norm": 21.259483337402344, + "learning_rate": 2.682756208464498e-07, + "logps/chosen": -49.59820556640625, + "logps/rejected": -66.58502197265625, + "loss": 0.369, + "losses/dpo": 0.3234396278858185, + "losses/sft": 1.5548697710037231, + "losses/total": 0.3234396278858185, + "ref_logps/chosen": -36.21709442138672, + "ref_logps/rejected": -41.82677459716797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3381109237670898, + "rewards/margins": 1.13771390914917, + "rewards/rejected": -2.4758248329162598, + "step": 1643 + }, + { + "epoch": 1.55, + "grad_norm": 31.227542877197266, + "learning_rate": 2.6810073452256034e-07, + "logps/chosen": -45.606712341308594, + "logps/rejected": -58.460350036621094, + "loss": 0.4999, + "losses/dpo": 0.5389057993888855, + "losses/sft": 2.197465419769287, + "losses/total": 0.5389057993888855, + "ref_logps/chosen": -30.810718536376953, + "ref_logps/rejected": -35.18593215942383, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4795989990234375, + "rewards/margins": 0.8478430509567261, + "rewards/rejected": -2.327441930770874, + "step": 1644 + }, + { + "epoch": 1.55, + "grad_norm": 31.755725860595703, + "learning_rate": 2.6792584819867085e-07, + "logps/chosen": -54.021392822265625, + "logps/rejected": -85.29512023925781, + "loss": 0.4119, + "losses/dpo": 0.22851862013339996, + "losses/sft": 1.5314711332321167, + "losses/total": 0.22851862013339996, + "ref_logps/chosen": -37.89067840576172, + "ref_logps/rejected": -53.333343505859375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6130714416503906, + "rewards/margins": 1.5831066370010376, + "rewards/rejected": -3.1961779594421387, + "step": 1645 + }, + { + "epoch": 1.55, + "grad_norm": 25.32545280456543, + "learning_rate": 2.6775096187478137e-07, + "logps/chosen": -51.02509689331055, + "logps/rejected": -56.782466888427734, + "loss": 0.5397, + "losses/dpo": 0.5141239166259766, + "losses/sft": 1.764752984046936, + "losses/total": 0.5141239166259766, + "ref_logps/chosen": -35.31363296508789, + "ref_logps/rejected": -33.657630920410156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5711463689804077, + "rewards/margins": 0.741337776184082, + "rewards/rejected": -2.3124840259552, + "step": 1646 + }, + { + "epoch": 1.56, + "grad_norm": 20.92083168029785, + "learning_rate": 2.675760755508919e-07, + "logps/chosen": -53.08032989501953, + "logps/rejected": -71.9874267578125, + "loss": 0.2893, + "losses/dpo": 0.13308952748775482, + "losses/sft": 1.7331342697143555, + "losses/total": 0.13308952748775482, + "ref_logps/chosen": -39.94524383544922, + "ref_logps/rejected": -41.948890686035156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3135087490081787, + "rewards/margins": 1.690345048904419, + "rewards/rejected": -3.0038537979125977, + "step": 1647 + }, + { + "epoch": 1.56, + "grad_norm": 24.365474700927734, + "learning_rate": 2.6740118922700245e-07, + "logps/chosen": -53.47990417480469, + "logps/rejected": -75.38375854492188, + "loss": 0.4943, + "losses/dpo": 0.44575968384742737, + "losses/sft": 2.0612235069274902, + "losses/total": 0.44575968384742737, + "ref_logps/chosen": -36.23787307739258, + "ref_logps/rejected": -48.19116973876953, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.72420334815979, + "rewards/margins": 0.995055079460144, + "rewards/rejected": -2.7192583084106445, + "step": 1648 + }, + { + "epoch": 1.56, + "grad_norm": 19.783899307250977, + "learning_rate": 2.67226302903113e-07, + "logps/chosen": -64.78406524658203, + "logps/rejected": -82.91340637207031, + "loss": 0.2551, + "losses/dpo": 0.1174703985452652, + "losses/sft": 1.5330079793930054, + "losses/total": 0.1174703985452652, + "ref_logps/chosen": -49.928321838378906, + "ref_logps/rejected": -51.219703674316406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4855741262435913, + "rewards/margins": 1.6837958097457886, + "rewards/rejected": -3.16936993598938, + "step": 1649 + }, + { + "epoch": 1.56, + "grad_norm": 18.13228988647461, + "learning_rate": 2.6705141657922347e-07, + "logps/chosen": -52.760826110839844, + "logps/rejected": -77.49868774414062, + "loss": 0.321, + "losses/dpo": 0.5770677328109741, + "losses/sft": 2.273347854614258, + "losses/total": 0.5770677328109741, + "ref_logps/chosen": -40.98771667480469, + "ref_logps/rejected": -51.59224319458008, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1773107051849365, + "rewards/margins": 1.413333535194397, + "rewards/rejected": -2.590644359588623, + "step": 1650 + }, + { + "epoch": 1.56, + "grad_norm": 18.541601181030273, + "learning_rate": 2.6687653025533404e-07, + "logps/chosen": -65.10570526123047, + "logps/rejected": -84.50265502929688, + "loss": 0.2826, + "losses/dpo": 0.32084178924560547, + "losses/sft": 2.3292148113250732, + "losses/total": 0.32084178924560547, + "ref_logps/chosen": -46.8994140625, + "ref_logps/rejected": -51.53701400756836, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.820629358291626, + "rewards/margins": 1.475934624671936, + "rewards/rejected": -3.2965638637542725, + "step": 1651 + }, + { + "epoch": 1.56, + "grad_norm": 23.196929931640625, + "learning_rate": 2.6670164393144455e-07, + "logps/chosen": -62.20363998413086, + "logps/rejected": -77.71561431884766, + "loss": 0.4543, + "losses/dpo": 0.6370358467102051, + "losses/sft": 1.830292820930481, + "losses/total": 0.6370358467102051, + "ref_logps/chosen": -44.85289001464844, + "ref_logps/rejected": -48.890987396240234, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.735074758529663, + "rewards/margins": 1.1473876237869263, + "rewards/rejected": -2.882462501525879, + "step": 1652 + }, + { + "epoch": 1.56, + "grad_norm": 25.40617561340332, + "learning_rate": 2.6652675760755506e-07, + "logps/chosen": -50.79237365722656, + "logps/rejected": -76.96620178222656, + "loss": 0.4269, + "losses/dpo": 0.5880008339881897, + "losses/sft": 1.894627332687378, + "losses/total": 0.5880008339881897, + "ref_logps/chosen": -33.071617126464844, + "ref_logps/rejected": -46.53327178955078, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.772075891494751, + "rewards/margins": 1.2712171077728271, + "rewards/rejected": -3.043292999267578, + "step": 1653 + }, + { + "epoch": 1.56, + "grad_norm": 18.516530990600586, + "learning_rate": 2.663518712836656e-07, + "logps/chosen": -53.31241226196289, + "logps/rejected": -80.26522827148438, + "loss": 0.2536, + "losses/dpo": 0.2803356349468231, + "losses/sft": 1.9406237602233887, + "losses/total": 0.2803356349468231, + "ref_logps/chosen": -41.60315704345703, + "ref_logps/rejected": -51.36479568481445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.170925259590149, + "rewards/margins": 1.7191179990768433, + "rewards/rejected": -2.890043258666992, + "step": 1654 + }, + { + "epoch": 1.56, + "grad_norm": 19.091054916381836, + "learning_rate": 2.6617698495977614e-07, + "logps/chosen": -45.498844146728516, + "logps/rejected": -71.2254638671875, + "loss": 0.3605, + "losses/dpo": 0.2558470070362091, + "losses/sft": 1.3483762741088867, + "losses/total": 0.2558470070362091, + "ref_logps/chosen": -31.576784133911133, + "ref_logps/rejected": -46.127159118652344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.392205834388733, + "rewards/margins": 1.1176249980926514, + "rewards/rejected": -2.5098307132720947, + "step": 1655 + }, + { + "epoch": 1.56, + "grad_norm": 34.910762786865234, + "learning_rate": 2.660020986358867e-07, + "logps/chosen": -56.819129943847656, + "logps/rejected": -62.712806701660156, + "loss": 0.6916, + "losses/dpo": 0.5331999659538269, + "losses/sft": 1.8937010765075684, + "losses/total": 0.5331999659538269, + "ref_logps/chosen": -39.472007751464844, + "ref_logps/rejected": -37.18384552001953, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7347123622894287, + "rewards/margins": 0.818183958530426, + "rewards/rejected": -2.552896499633789, + "step": 1656 + }, + { + "epoch": 1.56, + "grad_norm": 26.515329360961914, + "learning_rate": 2.6582721231199716e-07, + "logps/chosen": -61.10515213012695, + "logps/rejected": -85.40309143066406, + "loss": 0.4208, + "losses/dpo": 0.44674190878868103, + "losses/sft": 1.8360155820846558, + "losses/total": 0.44674190878868103, + "ref_logps/chosen": -44.87999725341797, + "ref_logps/rejected": -52.64459991455078, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6225149631500244, + "rewards/margins": 1.653334140777588, + "rewards/rejected": -3.2758491039276123, + "step": 1657 + }, + { + "epoch": 1.57, + "grad_norm": 25.264232635498047, + "learning_rate": 2.6565232598810773e-07, + "logps/chosen": -44.700599670410156, + "logps/rejected": -66.07460021972656, + "loss": 0.4791, + "losses/dpo": 0.3718392848968506, + "losses/sft": 1.9944740533828735, + "losses/total": 0.3718392848968506, + "ref_logps/chosen": -28.533042907714844, + "ref_logps/rejected": -39.207847595214844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.616755723953247, + "rewards/margins": 1.0699198246002197, + "rewards/rejected": -2.686675548553467, + "step": 1658 + }, + { + "epoch": 1.57, + "grad_norm": 20.162647247314453, + "learning_rate": 2.6547743966421824e-07, + "logps/chosen": -51.169822692871094, + "logps/rejected": -77.40350341796875, + "loss": 0.36, + "losses/dpo": 0.5586111545562744, + "losses/sft": 1.6747393608093262, + "losses/total": 0.5586111545562744, + "ref_logps/chosen": -39.93372344970703, + "ref_logps/rejected": -50.12847900390625, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.123610019683838, + "rewards/margins": 1.6038925647735596, + "rewards/rejected": -2.7275025844573975, + "step": 1659 + }, + { + "epoch": 1.57, + "grad_norm": 32.805057525634766, + "learning_rate": 2.6530255334032876e-07, + "logps/chosen": -61.154266357421875, + "logps/rejected": -76.336181640625, + "loss": 0.5574, + "losses/dpo": 0.4673042297363281, + "losses/sft": 1.8928769826889038, + "losses/total": 0.4673042297363281, + "ref_logps/chosen": -43.338523864746094, + "ref_logps/rejected": -46.407073974609375, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7815742492675781, + "rewards/margins": 1.2113360166549683, + "rewards/rejected": -2.992910385131836, + "step": 1660 + }, + { + "epoch": 1.57, + "grad_norm": 28.616901397705078, + "learning_rate": 2.6512766701643927e-07, + "logps/chosen": -57.51754379272461, + "logps/rejected": -72.57643127441406, + "loss": 0.3595, + "losses/dpo": 0.4811883568763733, + "losses/sft": 1.6532338857650757, + "losses/total": 0.4811883568763733, + "ref_logps/chosen": -38.726295471191406, + "ref_logps/rejected": -39.933048248291016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8791247606277466, + "rewards/margins": 1.3852133750915527, + "rewards/rejected": -3.2643380165100098, + "step": 1661 + }, + { + "epoch": 1.57, + "grad_norm": 14.62951374053955, + "learning_rate": 2.6495278069254983e-07, + "logps/chosen": -57.11421203613281, + "logps/rejected": -89.01399993896484, + "loss": 0.1998, + "losses/dpo": 0.2647951543331146, + "losses/sft": 2.361064910888672, + "losses/total": 0.2647951543331146, + "ref_logps/chosen": -44.09925079345703, + "ref_logps/rejected": -52.31884002685547, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3014960289001465, + "rewards/margins": 2.3680200576782227, + "rewards/rejected": -3.669516086578369, + "step": 1662 + }, + { + "epoch": 1.57, + "grad_norm": 22.190095901489258, + "learning_rate": 2.647778943686604e-07, + "logps/chosen": -41.692527770996094, + "logps/rejected": -59.03276062011719, + "loss": 0.4231, + "losses/dpo": 0.24633584916591644, + "losses/sft": 1.4761253595352173, + "losses/total": 0.24633584916591644, + "ref_logps/chosen": -29.166818618774414, + "ref_logps/rejected": -34.794349670410156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2525708675384521, + "rewards/margins": 1.171270728111267, + "rewards/rejected": -2.4238414764404297, + "step": 1663 + }, + { + "epoch": 1.57, + "grad_norm": 28.424457550048828, + "learning_rate": 2.6460300804477086e-07, + "logps/chosen": -59.99970245361328, + "logps/rejected": -88.26316833496094, + "loss": 0.4575, + "losses/dpo": 0.7439466118812561, + "losses/sft": 1.7997181415557861, + "losses/total": 0.7439466118812561, + "ref_logps/chosen": -39.7232551574707, + "ref_logps/rejected": -54.9466667175293, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0276451110839844, + "rewards/margins": 1.3040049076080322, + "rewards/rejected": -3.3316500186920166, + "step": 1664 + }, + { + "epoch": 1.57, + "grad_norm": 24.59447479248047, + "learning_rate": 2.644281217208814e-07, + "logps/chosen": -45.171573638916016, + "logps/rejected": -69.65135192871094, + "loss": 0.4677, + "losses/dpo": 0.2128482609987259, + "losses/sft": 1.6712005138397217, + "losses/total": 0.2128482609987259, + "ref_logps/chosen": -31.02631378173828, + "ref_logps/rejected": -42.3289794921875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4145258665084839, + "rewards/margins": 1.317711591720581, + "rewards/rejected": -2.7322373390197754, + "step": 1665 + }, + { + "epoch": 1.57, + "grad_norm": 25.404090881347656, + "learning_rate": 2.6425323539699194e-07, + "logps/chosen": -57.798797607421875, + "logps/rejected": -80.9564437866211, + "loss": 0.3881, + "losses/dpo": 0.3141580820083618, + "losses/sft": 2.369131326675415, + "losses/total": 0.3141580820083618, + "ref_logps/chosen": -42.959590911865234, + "ref_logps/rejected": -51.40434646606445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.483920693397522, + "rewards/margins": 1.4712886810302734, + "rewards/rejected": -2.955209255218506, + "step": 1666 + }, + { + "epoch": 1.57, + "grad_norm": 19.500904083251953, + "learning_rate": 2.6407834907310245e-07, + "logps/chosen": -50.6585693359375, + "logps/rejected": -72.68571472167969, + "loss": 0.2503, + "losses/dpo": 0.250453382730484, + "losses/sft": 2.021021842956543, + "losses/total": 0.250453382730484, + "ref_logps/chosen": -36.266387939453125, + "ref_logps/rejected": -42.38813018798828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.439218282699585, + "rewards/margins": 1.5905399322509766, + "rewards/rejected": -3.0297582149505615, + "step": 1667 + }, + { + "epoch": 1.58, + "grad_norm": 13.217467308044434, + "learning_rate": 2.63903462749213e-07, + "logps/chosen": -40.942283630371094, + "logps/rejected": -55.480979919433594, + "loss": 0.3213, + "losses/dpo": 0.23293673992156982, + "losses/sft": 2.005419969558716, + "losses/total": 0.23293673992156982, + "ref_logps/chosen": -31.410551071166992, + "ref_logps/rejected": -32.45996856689453, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.953173041343689, + "rewards/margins": 1.3489274978637695, + "rewards/rejected": -2.302100658416748, + "step": 1668 + }, + { + "epoch": 1.58, + "grad_norm": 16.91592025756836, + "learning_rate": 2.6372857642532353e-07, + "logps/chosen": -40.017459869384766, + "logps/rejected": -72.77808380126953, + "loss": 0.306, + "losses/dpo": 0.19263622164726257, + "losses/sft": 1.4955772161483765, + "losses/total": 0.19263622164726257, + "ref_logps/chosen": -32.276084899902344, + "ref_logps/rejected": -49.59219741821289, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7741374373435974, + "rewards/margins": 1.5444512367248535, + "rewards/rejected": -2.3185884952545166, + "step": 1669 + }, + { + "epoch": 1.58, + "grad_norm": 31.049604415893555, + "learning_rate": 2.635536901014341e-07, + "logps/chosen": -54.75788879394531, + "logps/rejected": -63.67683410644531, + "loss": 0.5912, + "losses/dpo": 0.6454762816429138, + "losses/sft": 2.1500191688537598, + "losses/total": 0.6454762816429138, + "ref_logps/chosen": -37.41011047363281, + "ref_logps/rejected": -37.62797546386719, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7347784042358398, + "rewards/margins": 0.8701076507568359, + "rewards/rejected": -2.604886054992676, + "step": 1670 + }, + { + "epoch": 1.58, + "grad_norm": 22.56049919128418, + "learning_rate": 2.6337880377754455e-07, + "logps/chosen": -43.30486297607422, + "logps/rejected": -70.1745834350586, + "loss": 0.5217, + "losses/dpo": 0.5436393022537231, + "losses/sft": 1.8744052648544312, + "losses/total": 0.5436393022537231, + "ref_logps/chosen": -32.65325164794922, + "ref_logps/rejected": -49.66826629638672, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0651612281799316, + "rewards/margins": 0.9854701161384583, + "rewards/rejected": -2.050631284713745, + "step": 1671 + }, + { + "epoch": 1.58, + "grad_norm": 21.70183563232422, + "learning_rate": 2.632039174536551e-07, + "logps/chosen": -44.057373046875, + "logps/rejected": -65.52379608154297, + "loss": 0.3543, + "losses/dpo": 0.5216940641403198, + "losses/sft": 1.9431930780410767, + "losses/total": 0.5216940641403198, + "ref_logps/chosen": -31.411155700683594, + "ref_logps/rejected": -37.00748825073242, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2646220922470093, + "rewards/margins": 1.5870087146759033, + "rewards/rejected": -2.851630687713623, + "step": 1672 + }, + { + "epoch": 1.58, + "grad_norm": 27.148149490356445, + "learning_rate": 2.6302903112976563e-07, + "logps/chosen": -50.910274505615234, + "logps/rejected": -69.40679931640625, + "loss": 0.4255, + "losses/dpo": 0.9526625871658325, + "losses/sft": 2.315486192703247, + "losses/total": 0.9526625871658325, + "ref_logps/chosen": -34.414894104003906, + "ref_logps/rejected": -37.466373443603516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6495378017425537, + "rewards/margins": 1.544505000114441, + "rewards/rejected": -3.194042682647705, + "step": 1673 + }, + { + "epoch": 1.58, + "grad_norm": 21.605085372924805, + "learning_rate": 2.6285414480587614e-07, + "logps/chosen": -56.68227005004883, + "logps/rejected": -72.84638977050781, + "loss": 0.3459, + "losses/dpo": 0.34026867151260376, + "losses/sft": 2.0914828777313232, + "losses/total": 0.34026867151260376, + "ref_logps/chosen": -40.51249694824219, + "ref_logps/rejected": -44.0089111328125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6169772148132324, + "rewards/margins": 1.266770362854004, + "rewards/rejected": -2.8837475776672363, + "step": 1674 + }, + { + "epoch": 1.58, + "grad_norm": 34.19541549682617, + "learning_rate": 2.626792584819867e-07, + "logps/chosen": -52.255653381347656, + "logps/rejected": -67.04866027832031, + "loss": 0.5724, + "losses/dpo": 0.5703568458557129, + "losses/sft": 1.9263702630996704, + "losses/total": 0.5703568458557129, + "ref_logps/chosen": -37.738319396972656, + "ref_logps/rejected": -44.528045654296875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4517334699630737, + "rewards/margins": 0.8003284931182861, + "rewards/rejected": -2.2520618438720703, + "step": 1675 + }, + { + "epoch": 1.58, + "grad_norm": 22.78449058532715, + "learning_rate": 2.625043721580972e-07, + "logps/chosen": -49.107826232910156, + "logps/rejected": -69.46508026123047, + "loss": 0.3994, + "losses/dpo": 0.26648449897766113, + "losses/sft": 1.4606882333755493, + "losses/total": 0.26648449897766113, + "ref_logps/chosen": -37.83285903930664, + "ref_logps/rejected": -45.494712829589844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1274969577789307, + "rewards/margins": 1.2695398330688477, + "rewards/rejected": -2.397036552429199, + "step": 1676 + }, + { + "epoch": 1.58, + "grad_norm": 18.713668823242188, + "learning_rate": 2.623294858342078e-07, + "logps/chosen": -45.18793869018555, + "logps/rejected": -84.1526107788086, + "loss": 0.317, + "losses/dpo": 0.2993185520172119, + "losses/sft": 1.3882665634155273, + "losses/total": 0.2993185520172119, + "ref_logps/chosen": -30.645986557006836, + "ref_logps/rejected": -53.584232330322266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.454195261001587, + "rewards/margins": 1.6026426553726196, + "rewards/rejected": -3.056837797164917, + "step": 1677 + }, + { + "epoch": 1.58, + "grad_norm": 16.389495849609375, + "learning_rate": 2.6215459951031825e-07, + "logps/chosen": -36.11703872680664, + "logps/rejected": -75.21994018554688, + "loss": 0.2332, + "losses/dpo": 0.28319504857063293, + "losses/sft": 1.540827751159668, + "losses/total": 0.28319504857063293, + "ref_logps/chosen": -26.941329956054688, + "ref_logps/rejected": -48.799434661865234, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9175708889961243, + "rewards/margins": 1.7244800329208374, + "rewards/rejected": -2.6420507431030273, + "step": 1678 + }, + { + "epoch": 1.59, + "grad_norm": 19.882858276367188, + "learning_rate": 2.619797131864288e-07, + "logps/chosen": -49.78032302856445, + "logps/rejected": -84.12419891357422, + "loss": 0.2925, + "losses/dpo": 0.1579287052154541, + "losses/sft": 1.5552613735198975, + "losses/total": 0.1579287052154541, + "ref_logps/chosen": -35.98012924194336, + "ref_logps/rejected": -52.107017517089844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3800195455551147, + "rewards/margins": 1.8216983079910278, + "rewards/rejected": -3.2017178535461426, + "step": 1679 + }, + { + "epoch": 1.59, + "grad_norm": 28.187620162963867, + "learning_rate": 2.618048268625393e-07, + "logps/chosen": -51.121551513671875, + "logps/rejected": -71.55033111572266, + "loss": 0.3887, + "losses/dpo": 0.23457235097885132, + "losses/sft": 2.2999722957611084, + "losses/total": 0.23457235097885132, + "ref_logps/chosen": -35.76118850708008, + "ref_logps/rejected": -39.54771423339844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5360362529754639, + "rewards/margins": 1.6642255783081055, + "rewards/rejected": -3.2002618312835693, + "step": 1680 + }, + { + "epoch": 1.59, + "grad_norm": 26.547409057617188, + "learning_rate": 2.6162994053864984e-07, + "logps/chosen": -40.57977294921875, + "logps/rejected": -59.65150451660156, + "loss": 0.5212, + "losses/dpo": 0.30037161707878113, + "losses/sft": 1.3291321992874146, + "losses/total": 0.30037161707878113, + "ref_logps/chosen": -29.0831298828125, + "ref_logps/rejected": -34.76066589355469, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1496641635894775, + "rewards/margins": 1.339420199394226, + "rewards/rejected": -2.489084243774414, + "step": 1681 + }, + { + "epoch": 1.59, + "grad_norm": 16.880813598632812, + "learning_rate": 2.614550542147604e-07, + "logps/chosen": -41.66114807128906, + "logps/rejected": -65.62359619140625, + "loss": 0.3193, + "losses/dpo": 0.536454439163208, + "losses/sft": 1.594664454460144, + "losses/total": 0.536454439163208, + "ref_logps/chosen": -31.709991455078125, + "ref_logps/rejected": -38.73860168457031, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9951153993606567, + "rewards/margins": 1.6933845281600952, + "rewards/rejected": -2.688499927520752, + "step": 1682 + }, + { + "epoch": 1.59, + "grad_norm": 22.443984985351562, + "learning_rate": 2.612801678908709e-07, + "logps/chosen": -59.49350357055664, + "logps/rejected": -79.27086639404297, + "loss": 0.3134, + "losses/dpo": 0.24396274983882904, + "losses/sft": 2.112079620361328, + "losses/total": 0.24396274983882904, + "ref_logps/chosen": -43.74391174316406, + "ref_logps/rejected": -49.0337028503418, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5749597549438477, + "rewards/margins": 1.448757290840149, + "rewards/rejected": -3.023717164993286, + "step": 1683 + }, + { + "epoch": 1.59, + "grad_norm": 21.76535987854004, + "learning_rate": 2.611052815669815e-07, + "logps/chosen": -62.51936721801758, + "logps/rejected": -90.518798828125, + "loss": 0.3522, + "losses/dpo": 0.1277112364768982, + "losses/sft": 2.0342190265655518, + "losses/total": 0.1277112364768982, + "ref_logps/chosen": -44.36508560180664, + "ref_logps/rejected": -56.39969253540039, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8154282569885254, + "rewards/margins": 1.5964818000793457, + "rewards/rejected": -3.411910057067871, + "step": 1684 + }, + { + "epoch": 1.59, + "grad_norm": 20.25614356994629, + "learning_rate": 2.6093039524309194e-07, + "logps/chosen": -42.00189208984375, + "logps/rejected": -63.54865264892578, + "loss": 0.3939, + "losses/dpo": 0.6064974665641785, + "losses/sft": 1.8426246643066406, + "losses/total": 0.6064974665641785, + "ref_logps/chosen": -30.157485961914062, + "ref_logps/rejected": -38.62895965576172, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1844403743743896, + "rewards/margins": 1.3075294494628906, + "rewards/rejected": -2.4919698238372803, + "step": 1685 + }, + { + "epoch": 1.59, + "grad_norm": 24.392690658569336, + "learning_rate": 2.607555089192025e-07, + "logps/chosen": -55.11760330200195, + "logps/rejected": -78.53960418701172, + "loss": 0.4283, + "losses/dpo": 0.4085690379142761, + "losses/sft": 2.050025224685669, + "losses/total": 0.4085690379142761, + "ref_logps/chosen": -41.50851058959961, + "ref_logps/rejected": -53.802818298339844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3609095811843872, + "rewards/margins": 1.1127686500549316, + "rewards/rejected": -2.4736781120300293, + "step": 1686 + }, + { + "epoch": 1.59, + "grad_norm": 19.779436111450195, + "learning_rate": 2.605806225953131e-07, + "logps/chosen": -46.925655364990234, + "logps/rejected": -76.889892578125, + "loss": 0.2828, + "losses/dpo": 0.3575228154659271, + "losses/sft": 1.8481152057647705, + "losses/total": 0.3575228154659271, + "ref_logps/chosen": -35.66778564453125, + "ref_logps/rejected": -49.306495666503906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1257871389389038, + "rewards/margins": 1.6325528621673584, + "rewards/rejected": -2.7583398818969727, + "step": 1687 + }, + { + "epoch": 1.59, + "grad_norm": 20.928817749023438, + "learning_rate": 2.6040573627142353e-07, + "logps/chosen": -42.58091354370117, + "logps/rejected": -69.19400024414062, + "loss": 0.4106, + "losses/dpo": 0.26089322566986084, + "losses/sft": 1.6918455362319946, + "losses/total": 0.26089322566986084, + "ref_logps/chosen": -31.497961044311523, + "ref_logps/rejected": -46.74159240722656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.108295202255249, + "rewards/margins": 1.1369454860687256, + "rewards/rejected": -2.2452406883239746, + "step": 1688 + }, + { + "epoch": 1.59, + "grad_norm": 21.269784927368164, + "learning_rate": 2.602308499475341e-07, + "logps/chosen": -60.73674011230469, + "logps/rejected": -78.62924194335938, + "loss": 0.3499, + "losses/dpo": 0.3991255760192871, + "losses/sft": 2.6420505046844482, + "losses/total": 0.3991255760192871, + "ref_logps/chosen": -42.557193756103516, + "ref_logps/rejected": -45.440250396728516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8179550170898438, + "rewards/margins": 1.5009442567825317, + "rewards/rejected": -3.318899154663086, + "step": 1689 + }, + { + "epoch": 1.6, + "grad_norm": 25.46879005432129, + "learning_rate": 2.600559636236446e-07, + "logps/chosen": -53.05203628540039, + "logps/rejected": -69.90511322021484, + "loss": 0.526, + "losses/dpo": 0.5232917070388794, + "losses/sft": 1.971091628074646, + "losses/total": 0.5232917070388794, + "ref_logps/chosen": -37.96042251586914, + "ref_logps/rejected": -44.83235168457031, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5091618299484253, + "rewards/margins": 0.9981141090393066, + "rewards/rejected": -2.5072760581970215, + "step": 1690 + }, + { + "epoch": 1.6, + "grad_norm": 19.873798370361328, + "learning_rate": 2.598810772997552e-07, + "logps/chosen": -37.886085510253906, + "logps/rejected": -65.10121154785156, + "loss": 0.3292, + "losses/dpo": 0.15399359166622162, + "losses/sft": 2.109825611114502, + "losses/total": 0.15399359166622162, + "ref_logps/chosen": -21.938209533691406, + "ref_logps/rejected": -35.62663269042969, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.59478759765625, + "rewards/margins": 1.352670669555664, + "rewards/rejected": -2.947458267211914, + "step": 1691 + }, + { + "epoch": 1.6, + "grad_norm": 30.11886215209961, + "learning_rate": 2.5970619097586564e-07, + "logps/chosen": -61.93041229248047, + "logps/rejected": -73.73487854003906, + "loss": 0.5023, + "losses/dpo": 0.4248866140842438, + "losses/sft": 1.7431190013885498, + "losses/total": 0.4248866140842438, + "ref_logps/chosen": -44.71243667602539, + "ref_logps/rejected": -45.64496994018555, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7217977046966553, + "rewards/margins": 1.087193489074707, + "rewards/rejected": -2.808990955352783, + "step": 1692 + }, + { + "epoch": 1.6, + "grad_norm": 27.565887451171875, + "learning_rate": 2.595313046519762e-07, + "logps/chosen": -48.117340087890625, + "logps/rejected": -63.55048751831055, + "loss": 0.4001, + "losses/dpo": 0.8755173087120056, + "losses/sft": 1.6353271007537842, + "losses/total": 0.8755173087120056, + "ref_logps/chosen": -35.32022476196289, + "ref_logps/rejected": -39.41569137573242, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2797114849090576, + "rewards/margins": 1.1337684392929077, + "rewards/rejected": -2.413480043411255, + "step": 1693 + }, + { + "epoch": 1.6, + "grad_norm": 25.21326446533203, + "learning_rate": 2.5935641832808677e-07, + "logps/chosen": -46.920196533203125, + "logps/rejected": -59.816200256347656, + "loss": 0.4287, + "losses/dpo": 0.43476080894470215, + "losses/sft": 2.567338228225708, + "losses/total": 0.43476080894470215, + "ref_logps/chosen": -33.12450408935547, + "ref_logps/rejected": -36.84740447998047, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.37956964969635, + "rewards/margins": 0.9173097610473633, + "rewards/rejected": -2.296879291534424, + "step": 1694 + }, + { + "epoch": 1.6, + "grad_norm": 27.18939971923828, + "learning_rate": 2.5918153200419723e-07, + "logps/chosen": -55.09325408935547, + "logps/rejected": -64.12335968017578, + "loss": 0.4843, + "losses/dpo": 0.21715635061264038, + "losses/sft": 1.8934741020202637, + "losses/total": 0.21715635061264038, + "ref_logps/chosen": -40.22957992553711, + "ref_logps/rejected": -38.47683334350586, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4863673448562622, + "rewards/margins": 1.0782852172851562, + "rewards/rejected": -2.564652442932129, + "step": 1695 + }, + { + "epoch": 1.6, + "grad_norm": 13.418107986450195, + "learning_rate": 2.590066456803078e-07, + "logps/chosen": -43.864192962646484, + "logps/rejected": -69.68255615234375, + "loss": 0.2231, + "losses/dpo": 0.21122479438781738, + "losses/sft": 1.6132981777191162, + "losses/total": 0.21122479438781738, + "ref_logps/chosen": -33.222068786621094, + "ref_logps/rejected": -39.89115524291992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0642125606536865, + "rewards/margins": 1.9149274826049805, + "rewards/rejected": -2.979140281677246, + "step": 1696 + }, + { + "epoch": 1.6, + "grad_norm": 24.44149398803711, + "learning_rate": 2.588317593564183e-07, + "logps/chosen": -50.114990234375, + "logps/rejected": -73.626220703125, + "loss": 0.5198, + "losses/dpo": 0.5436519980430603, + "losses/sft": 1.7331504821777344, + "losses/total": 0.5436519980430603, + "ref_logps/chosen": -36.66084289550781, + "ref_logps/rejected": -49.82818603515625, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3454147577285767, + "rewards/margins": 1.0343890190124512, + "rewards/rejected": -2.3798036575317383, + "step": 1697 + }, + { + "epoch": 1.6, + "grad_norm": 14.38016128540039, + "learning_rate": 2.5865687303252887e-07, + "logps/chosen": -54.12076187133789, + "logps/rejected": -78.1153564453125, + "loss": 0.2204, + "losses/dpo": 0.20779266953468323, + "losses/sft": 2.3309385776519775, + "losses/total": 0.20779266953468323, + "ref_logps/chosen": -39.74736785888672, + "ref_logps/rejected": -42.970306396484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4373393058776855, + "rewards/margins": 2.0771656036376953, + "rewards/rejected": -3.51450514793396, + "step": 1698 + }, + { + "epoch": 1.6, + "grad_norm": 23.805593490600586, + "learning_rate": 2.5848198670863933e-07, + "logps/chosen": -51.08177947998047, + "logps/rejected": -76.69293975830078, + "loss": 0.437, + "losses/dpo": 0.5563952326774597, + "losses/sft": 2.206334352493286, + "losses/total": 0.5563952326774597, + "ref_logps/chosen": -35.916412353515625, + "ref_logps/rejected": -49.044219970703125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5165367126464844, + "rewards/margins": 1.248335361480713, + "rewards/rejected": -2.7648720741271973, + "step": 1699 + }, + { + "epoch": 1.61, + "grad_norm": 29.400575637817383, + "learning_rate": 2.583071003847499e-07, + "logps/chosen": -61.05107879638672, + "logps/rejected": -70.8686752319336, + "loss": 0.5572, + "losses/dpo": 0.2740618586540222, + "losses/sft": 1.9630038738250732, + "losses/total": 0.2740618586540222, + "ref_logps/chosen": -45.12006378173828, + "ref_logps/rejected": -43.93709182739258, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5931012630462646, + "rewards/margins": 1.1000571250915527, + "rewards/rejected": -2.6931581497192383, + "step": 1700 + }, + { + "epoch": 1.61, + "grad_norm": 23.972097396850586, + "learning_rate": 2.5813221406086046e-07, + "logps/chosen": -58.94755172729492, + "logps/rejected": -86.1441879272461, + "loss": 0.3476, + "losses/dpo": 0.3983757197856903, + "losses/sft": 2.3936402797698975, + "losses/total": 0.3983757197856903, + "ref_logps/chosen": -43.79438781738281, + "ref_logps/rejected": -56.778839111328125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5153162479400635, + "rewards/margins": 1.4212180376052856, + "rewards/rejected": -2.9365344047546387, + "step": 1701 + }, + { + "epoch": 1.61, + "grad_norm": 31.20714569091797, + "learning_rate": 2.579573277369709e-07, + "logps/chosen": -51.1915283203125, + "logps/rejected": -68.49287414550781, + "loss": 0.5542, + "losses/dpo": 0.6483299732208252, + "losses/sft": 2.033721685409546, + "losses/total": 0.6483299732208252, + "ref_logps/chosen": -38.686920166015625, + "ref_logps/rejected": -42.646610260009766, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2504607439041138, + "rewards/margins": 1.3341654539108276, + "rewards/rejected": -2.5846261978149414, + "step": 1702 + }, + { + "epoch": 1.61, + "grad_norm": 21.918899536132812, + "learning_rate": 2.577824414130815e-07, + "logps/chosen": -47.61374282836914, + "logps/rejected": -63.86582946777344, + "loss": 0.4125, + "losses/dpo": 0.46828797459602356, + "losses/sft": 1.3913002014160156, + "losses/total": 0.46828797459602356, + "ref_logps/chosen": -32.2750358581543, + "ref_logps/rejected": -37.893798828125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5338705778121948, + "rewards/margins": 1.0633323192596436, + "rewards/rejected": -2.597202777862549, + "step": 1703 + }, + { + "epoch": 1.61, + "grad_norm": 19.991924285888672, + "learning_rate": 2.57607555089192e-07, + "logps/chosen": -50.15476989746094, + "logps/rejected": -62.726287841796875, + "loss": 0.3294, + "losses/dpo": 0.21432074904441833, + "losses/sft": 1.2985972166061401, + "losses/total": 0.21432074904441833, + "ref_logps/chosen": -36.133060455322266, + "ref_logps/rejected": -34.09256362915039, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4021708965301514, + "rewards/margins": 1.4612014293670654, + "rewards/rejected": -2.863372325897217, + "step": 1704 + }, + { + "epoch": 1.61, + "grad_norm": 20.620197296142578, + "learning_rate": 2.5743266876530257e-07, + "logps/chosen": -45.40034103393555, + "logps/rejected": -56.69420623779297, + "loss": 0.4422, + "losses/dpo": 0.40340331196784973, + "losses/sft": 1.82687509059906, + "losses/total": 0.40340331196784973, + "ref_logps/chosen": -33.02241897583008, + "ref_logps/rejected": -34.886810302734375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2377923727035522, + "rewards/margins": 0.9429475665092468, + "rewards/rejected": -2.1807398796081543, + "step": 1705 + }, + { + "epoch": 1.61, + "grad_norm": 24.2882022857666, + "learning_rate": 2.57257782441413e-07, + "logps/chosen": -45.427650451660156, + "logps/rejected": -69.88075256347656, + "loss": 0.47, + "losses/dpo": 0.3295285105705261, + "losses/sft": 2.1115331649780273, + "losses/total": 0.3295285105705261, + "ref_logps/chosen": -30.624160766601562, + "ref_logps/rejected": -43.11790466308594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.480348825454712, + "rewards/margins": 1.1959362030029297, + "rewards/rejected": -2.6762850284576416, + "step": 1706 + }, + { + "epoch": 1.61, + "grad_norm": 21.574094772338867, + "learning_rate": 2.570828961175236e-07, + "logps/chosen": -42.071937561035156, + "logps/rejected": -82.47891235351562, + "loss": 0.3394, + "losses/dpo": 0.1716458797454834, + "losses/sft": 1.3351582288742065, + "losses/total": 0.1716458797454834, + "ref_logps/chosen": -32.68022918701172, + "ref_logps/rejected": -55.13825988769531, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9391703605651855, + "rewards/margins": 1.7948949337005615, + "rewards/rejected": -2.734065294265747, + "step": 1707 + }, + { + "epoch": 1.61, + "grad_norm": 21.97496223449707, + "learning_rate": 2.5690800979363416e-07, + "logps/chosen": -48.086212158203125, + "logps/rejected": -77.50769805908203, + "loss": 0.3312, + "losses/dpo": 0.23380061984062195, + "losses/sft": 1.7336677312850952, + "losses/total": 0.23380061984062195, + "ref_logps/chosen": -36.06480407714844, + "ref_logps/rejected": -49.543487548828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2021414041519165, + "rewards/margins": 1.5942800045013428, + "rewards/rejected": -2.7964212894439697, + "step": 1708 + }, + { + "epoch": 1.61, + "grad_norm": 20.616004943847656, + "learning_rate": 2.567331234697446e-07, + "logps/chosen": -45.64478302001953, + "logps/rejected": -59.34964370727539, + "loss": 0.489, + "losses/dpo": 0.5231375694274902, + "losses/sft": 2.4794270992279053, + "losses/total": 0.5231375694274902, + "ref_logps/chosen": -31.097517013549805, + "ref_logps/rejected": -33.70814514160156, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4547265768051147, + "rewards/margins": 1.1094229221343994, + "rewards/rejected": -2.5641493797302246, + "step": 1709 + }, + { + "epoch": 1.61, + "grad_norm": 41.78421401977539, + "learning_rate": 2.565582371458552e-07, + "logps/chosen": -53.465431213378906, + "logps/rejected": -66.07946014404297, + "loss": 0.8232, + "losses/dpo": 0.3527677059173584, + "losses/sft": 1.5076885223388672, + "losses/total": 0.3527677059173584, + "ref_logps/chosen": -36.02337646484375, + "ref_logps/rejected": -42.72295379638672, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7442052364349365, + "rewards/margins": 0.5914453864097595, + "rewards/rejected": -2.3356504440307617, + "step": 1710 + }, + { + "epoch": 1.62, + "grad_norm": 26.656536102294922, + "learning_rate": 2.563833508219657e-07, + "logps/chosen": -64.93763732910156, + "logps/rejected": -102.15889739990234, + "loss": 0.3415, + "losses/dpo": 0.388261079788208, + "losses/sft": 1.999680519104004, + "losses/total": 0.388261079788208, + "ref_logps/chosen": -47.505767822265625, + "ref_logps/rejected": -68.66729736328125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7431871891021729, + "rewards/margins": 1.6059727668762207, + "rewards/rejected": -3.3491601943969727, + "step": 1711 + }, + { + "epoch": 1.62, + "grad_norm": 21.65997886657715, + "learning_rate": 2.5620846449807626e-07, + "logps/chosen": -50.296051025390625, + "logps/rejected": -63.11650085449219, + "loss": 0.4245, + "losses/dpo": 0.29651695489883423, + "losses/sft": 1.7380163669586182, + "losses/total": 0.29651695489883423, + "ref_logps/chosen": -36.176063537597656, + "ref_logps/rejected": -36.67198944091797, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4119987487792969, + "rewards/margins": 1.2324522733688354, + "rewards/rejected": -2.644451141357422, + "step": 1712 + }, + { + "epoch": 1.62, + "grad_norm": 30.8131160736084, + "learning_rate": 2.5603357817418677e-07, + "logps/chosen": -58.23980712890625, + "logps/rejected": -76.20166778564453, + "loss": 0.5437, + "losses/dpo": 0.6521898508071899, + "losses/sft": 2.064173936843872, + "losses/total": 0.6521898508071899, + "ref_logps/chosen": -41.63606262207031, + "ref_logps/rejected": -51.62317657470703, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.660374641418457, + "rewards/margins": 0.7974750399589539, + "rewards/rejected": -2.4578497409820557, + "step": 1713 + }, + { + "epoch": 1.62, + "grad_norm": 30.45929718017578, + "learning_rate": 2.558586918502973e-07, + "logps/chosen": -50.106475830078125, + "logps/rejected": -60.24506378173828, + "loss": 0.5065, + "losses/dpo": 0.42407292127609253, + "losses/sft": 1.9886914491653442, + "losses/total": 0.42407292127609253, + "ref_logps/chosen": -36.871726989746094, + "ref_logps/rejected": -36.309322357177734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3234748840332031, + "rewards/margins": 1.0700993537902832, + "rewards/rejected": -2.3935742378234863, + "step": 1714 + }, + { + "epoch": 1.62, + "grad_norm": 22.826793670654297, + "learning_rate": 2.5568380552640785e-07, + "logps/chosen": -48.91212463378906, + "logps/rejected": -75.21824645996094, + "loss": 0.3376, + "losses/dpo": 0.5897175669670105, + "losses/sft": 2.2067716121673584, + "losses/total": 0.5897175669670105, + "ref_logps/chosen": -33.7586669921875, + "ref_logps/rejected": -47.553443908691406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5153456926345825, + "rewards/margins": 1.2511353492736816, + "rewards/rejected": -2.7664811611175537, + "step": 1715 + }, + { + "epoch": 1.62, + "grad_norm": 14.791064262390137, + "learning_rate": 2.555089192025183e-07, + "logps/chosen": -52.70648956298828, + "logps/rejected": -71.68559265136719, + "loss": 0.2707, + "losses/dpo": 0.2725614607334137, + "losses/sft": 1.9950833320617676, + "losses/total": 0.2725614607334137, + "ref_logps/chosen": -39.049720764160156, + "ref_logps/rejected": -43.878639221191406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3656775951385498, + "rewards/margins": 1.41501784324646, + "rewards/rejected": -2.7806954383850098, + "step": 1716 + }, + { + "epoch": 1.62, + "grad_norm": 18.773130416870117, + "learning_rate": 2.553340328786289e-07, + "logps/chosen": -43.274932861328125, + "logps/rejected": -56.1312370300293, + "loss": 0.4545, + "losses/dpo": 0.2814927101135254, + "losses/sft": 1.5184011459350586, + "losses/total": 0.2814927101135254, + "ref_logps/chosen": -30.005794525146484, + "ref_logps/rejected": -32.11858367919922, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.326913833618164, + "rewards/margins": 1.0743515491485596, + "rewards/rejected": -2.4012653827667236, + "step": 1717 + }, + { + "epoch": 1.62, + "grad_norm": 24.9932918548584, + "learning_rate": 2.551591465547394e-07, + "logps/chosen": -51.914886474609375, + "logps/rejected": -59.898231506347656, + "loss": 0.5483, + "losses/dpo": 0.7883964776992798, + "losses/sft": 3.383769989013672, + "losses/total": 0.7883964776992798, + "ref_logps/chosen": -34.69055938720703, + "ref_logps/rejected": -36.196292877197266, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7224326133728027, + "rewards/margins": 0.6477614641189575, + "rewards/rejected": -2.3701939582824707, + "step": 1718 + }, + { + "epoch": 1.62, + "grad_norm": 22.577491760253906, + "learning_rate": 2.5498426023084995e-07, + "logps/chosen": -42.70201110839844, + "logps/rejected": -65.31559753417969, + "loss": 0.4347, + "losses/dpo": 0.5611782073974609, + "losses/sft": 2.379319667816162, + "losses/total": 0.5611782073974609, + "ref_logps/chosen": -31.390506744384766, + "ref_logps/rejected": -40.67528533935547, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1311503648757935, + "rewards/margins": 1.3328800201416016, + "rewards/rejected": -2.4640302658081055, + "step": 1719 + }, + { + "epoch": 1.62, + "grad_norm": 20.03665542602539, + "learning_rate": 2.5480937390696047e-07, + "logps/chosen": -42.88938522338867, + "logps/rejected": -61.84242248535156, + "loss": 0.3576, + "losses/dpo": 0.3116549849510193, + "losses/sft": 1.7542328834533691, + "losses/total": 0.3116549849510193, + "ref_logps/chosen": -31.158885955810547, + "ref_logps/rejected": -38.53749084472656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.173049807548523, + "rewards/margins": 1.157443642616272, + "rewards/rejected": -2.330493450164795, + "step": 1720 + }, + { + "epoch": 1.63, + "grad_norm": 18.225561141967773, + "learning_rate": 2.54634487583071e-07, + "logps/chosen": -48.095458984375, + "logps/rejected": -86.5268783569336, + "loss": 0.299, + "losses/dpo": 0.16750237345695496, + "losses/sft": 1.5937292575836182, + "losses/total": 0.16750237345695496, + "ref_logps/chosen": -35.68586730957031, + "ref_logps/rejected": -59.16970443725586, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2409590482711792, + "rewards/margins": 1.4947586059570312, + "rewards/rejected": -2.735717535018921, + "step": 1721 + }, + { + "epoch": 1.63, + "grad_norm": 33.022579193115234, + "learning_rate": 2.5445960125918155e-07, + "logps/chosen": -58.868778228759766, + "logps/rejected": -59.140811920166016, + "loss": 0.6316, + "losses/dpo": 0.2693566679954529, + "losses/sft": 1.7555820941925049, + "losses/total": 0.2693566679954529, + "ref_logps/chosen": -42.4088134765625, + "ref_logps/rejected": -33.35874557495117, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6459965705871582, + "rewards/margins": 0.9322102069854736, + "rewards/rejected": -2.5782065391540527, + "step": 1722 + }, + { + "epoch": 1.63, + "grad_norm": 25.231719970703125, + "learning_rate": 2.54284714935292e-07, + "logps/chosen": -53.8870849609375, + "logps/rejected": -73.10115814208984, + "loss": 0.5203, + "losses/dpo": 0.18047204613685608, + "losses/sft": 2.002629280090332, + "losses/total": 0.18047204613685608, + "ref_logps/chosen": -38.624595642089844, + "ref_logps/rejected": -48.08985900878906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5262489318847656, + "rewards/margins": 0.9748811721801758, + "rewards/rejected": -2.5011301040649414, + "step": 1723 + }, + { + "epoch": 1.63, + "grad_norm": 20.354001998901367, + "learning_rate": 2.5410982861140257e-07, + "logps/chosen": -57.29927062988281, + "logps/rejected": -60.07427215576172, + "loss": 0.4153, + "losses/dpo": 0.4300401508808136, + "losses/sft": 1.3612242937088013, + "losses/total": 0.4300401508808136, + "ref_logps/chosen": -43.414794921875, + "ref_logps/rejected": -34.77241516113281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.388447642326355, + "rewards/margins": 1.1417381763458252, + "rewards/rejected": -2.5301856994628906, + "step": 1724 + }, + { + "epoch": 1.63, + "grad_norm": 17.782976150512695, + "learning_rate": 2.539349422875131e-07, + "logps/chosen": -42.95454788208008, + "logps/rejected": -68.92463684082031, + "loss": 0.305, + "losses/dpo": 0.427029013633728, + "losses/sft": 1.9913402795791626, + "losses/total": 0.427029013633728, + "ref_logps/chosen": -31.01287078857422, + "ref_logps/rejected": -39.82344055175781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1941677331924438, + "rewards/margins": 1.7159514427185059, + "rewards/rejected": -2.9101195335388184, + "step": 1725 + }, + { + "epoch": 1.63, + "grad_norm": 24.13286018371582, + "learning_rate": 2.5376005596362365e-07, + "logps/chosen": -50.24007797241211, + "logps/rejected": -69.79496765136719, + "loss": 0.476, + "losses/dpo": 0.5065603852272034, + "losses/sft": 1.876365065574646, + "losses/total": 0.5065603852272034, + "ref_logps/chosen": -35.224449157714844, + "ref_logps/rejected": -43.55268859863281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5015625953674316, + "rewards/margins": 1.1226651668548584, + "rewards/rejected": -2.624228000640869, + "step": 1726 + }, + { + "epoch": 1.63, + "grad_norm": 18.875690460205078, + "learning_rate": 2.5358516963973416e-07, + "logps/chosen": -51.90456771850586, + "logps/rejected": -74.50015258789062, + "loss": 0.3144, + "losses/dpo": 0.3848721385002136, + "losses/sft": 1.942346215248108, + "losses/total": 0.3848721385002136, + "ref_logps/chosen": -38.46068572998047, + "ref_logps/rejected": -46.12248611450195, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3443881273269653, + "rewards/margins": 1.493377923965454, + "rewards/rejected": -2.837766170501709, + "step": 1727 + }, + { + "epoch": 1.63, + "grad_norm": 28.37961769104004, + "learning_rate": 2.534102833158447e-07, + "logps/chosen": -48.577064514160156, + "logps/rejected": -82.97050476074219, + "loss": 0.425, + "losses/dpo": 0.6688879132270813, + "losses/sft": 1.9931738376617432, + "losses/total": 0.6688879132270813, + "ref_logps/chosen": -35.99244689941406, + "ref_logps/rejected": -54.301673889160156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2584621906280518, + "rewards/margins": 1.6084208488464355, + "rewards/rejected": -2.8668832778930664, + "step": 1728 + }, + { + "epoch": 1.63, + "grad_norm": 22.02532386779785, + "learning_rate": 2.5323539699195524e-07, + "logps/chosen": -56.200321197509766, + "logps/rejected": -90.24491882324219, + "loss": 0.3496, + "losses/dpo": 0.3043467700481415, + "losses/sft": 2.3032021522521973, + "losses/total": 0.3043467700481415, + "ref_logps/chosen": -43.64707946777344, + "ref_logps/rejected": -58.17082214355469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.255324363708496, + "rewards/margins": 1.9520853757858276, + "rewards/rejected": -3.207409381866455, + "step": 1729 + }, + { + "epoch": 1.63, + "grad_norm": 19.41638946533203, + "learning_rate": 2.530605106680657e-07, + "logps/chosen": -50.2297477722168, + "logps/rejected": -71.56086730957031, + "loss": 0.3032, + "losses/dpo": 0.34405726194381714, + "losses/sft": 1.7360780239105225, + "losses/total": 0.34405726194381714, + "ref_logps/chosen": -35.395469665527344, + "ref_logps/rejected": -41.67345428466797, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4834280014038086, + "rewards/margins": 1.505313515663147, + "rewards/rejected": -2.988741397857666, + "step": 1730 + }, + { + "epoch": 1.63, + "grad_norm": 22.186115264892578, + "learning_rate": 2.5288562434417627e-07, + "logps/chosen": -45.201332092285156, + "logps/rejected": -68.27487182617188, + "loss": 0.4001, + "losses/dpo": 0.6906545162200928, + "losses/sft": 2.455472707748413, + "losses/total": 0.6906545162200928, + "ref_logps/chosen": -29.865217208862305, + "ref_logps/rejected": -40.72291564941406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.533611536026001, + "rewards/margins": 1.2215843200683594, + "rewards/rejected": -2.7551958560943604, + "step": 1731 + }, + { + "epoch": 1.64, + "grad_norm": 27.745742797851562, + "learning_rate": 2.5271073802028683e-07, + "logps/chosen": -54.785179138183594, + "logps/rejected": -77.94574737548828, + "loss": 0.4921, + "losses/dpo": 0.34052154421806335, + "losses/sft": 1.7407050132751465, + "losses/total": 0.34052154421806335, + "ref_logps/chosen": -40.07048797607422, + "ref_logps/rejected": -51.32563781738281, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4714692831039429, + "rewards/margins": 1.190542221069336, + "rewards/rejected": -2.6620116233825684, + "step": 1732 + }, + { + "epoch": 1.64, + "grad_norm": 18.940242767333984, + "learning_rate": 2.5253585169639734e-07, + "logps/chosen": -32.22381591796875, + "logps/rejected": -62.058349609375, + "loss": 0.3785, + "losses/dpo": 0.43926823139190674, + "losses/sft": 1.482996940612793, + "losses/total": 0.43926823139190674, + "ref_logps/chosen": -23.436565399169922, + "ref_logps/rejected": -40.592166900634766, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8787248134613037, + "rewards/margins": 1.2678937911987305, + "rewards/rejected": -2.146618604660034, + "step": 1733 + }, + { + "epoch": 1.64, + "grad_norm": 22.96949005126953, + "learning_rate": 2.5236096537250786e-07, + "logps/chosen": -49.87434005737305, + "logps/rejected": -64.28363037109375, + "loss": 0.4364, + "losses/dpo": 0.3585287034511566, + "losses/sft": 2.1542539596557617, + "losses/total": 0.3585287034511566, + "ref_logps/chosen": -37.18541717529297, + "ref_logps/rejected": -41.434173583984375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.268892526626587, + "rewards/margins": 1.0160531997680664, + "rewards/rejected": -2.2849457263946533, + "step": 1734 + }, + { + "epoch": 1.64, + "grad_norm": 20.29282569885254, + "learning_rate": 2.5218607904861837e-07, + "logps/chosen": -47.359031677246094, + "logps/rejected": -63.904075622558594, + "loss": 0.3902, + "losses/dpo": 0.5772249698638916, + "losses/sft": 2.259298324584961, + "losses/total": 0.5772249698638916, + "ref_logps/chosen": -33.2038688659668, + "ref_logps/rejected": -36.245792388916016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4155160188674927, + "rewards/margins": 1.3503127098083496, + "rewards/rejected": -2.7658286094665527, + "step": 1735 + }, + { + "epoch": 1.64, + "grad_norm": 20.01504135131836, + "learning_rate": 2.5201119272472893e-07, + "logps/chosen": -61.14841079711914, + "logps/rejected": -79.07331848144531, + "loss": 0.2827, + "losses/dpo": 0.37559935450553894, + "losses/sft": 1.6319624185562134, + "losses/total": 0.37559935450553894, + "ref_logps/chosen": -45.3342399597168, + "ref_logps/rejected": -49.348487854003906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5814169645309448, + "rewards/margins": 1.391066074371338, + "rewards/rejected": -2.9724831581115723, + "step": 1736 + }, + { + "epoch": 1.64, + "grad_norm": 15.476624488830566, + "learning_rate": 2.518363064008394e-07, + "logps/chosen": -46.7214469909668, + "logps/rejected": -66.35993194580078, + "loss": 0.2801, + "losses/dpo": 0.38920146226882935, + "losses/sft": 1.9700355529785156, + "losses/total": 0.38920146226882935, + "ref_logps/chosen": -36.837059020996094, + "ref_logps/rejected": -41.472259521484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9884388446807861, + "rewards/margins": 1.5003283023834229, + "rewards/rejected": -2.488767147064209, + "step": 1737 + }, + { + "epoch": 1.64, + "grad_norm": 21.71673583984375, + "learning_rate": 2.5166142007694996e-07, + "logps/chosen": -53.2696533203125, + "logps/rejected": -74.0630111694336, + "loss": 0.341, + "losses/dpo": 0.22557701170444489, + "losses/sft": 2.205395221710205, + "losses/total": 0.22557701170444489, + "ref_logps/chosen": -35.446632385253906, + "ref_logps/rejected": -43.976463317871094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7823021411895752, + "rewards/margins": 1.2263524532318115, + "rewards/rejected": -3.008654832839966, + "step": 1738 + }, + { + "epoch": 1.64, + "grad_norm": 18.874605178833008, + "learning_rate": 2.514865337530605e-07, + "logps/chosen": -49.429691314697266, + "logps/rejected": -72.20341491699219, + "loss": 0.2869, + "losses/dpo": 0.1943396031856537, + "losses/sft": 1.6803804636001587, + "losses/total": 0.1943396031856537, + "ref_logps/chosen": -34.68839645385742, + "ref_logps/rejected": -40.69697952270508, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4741297960281372, + "rewards/margins": 1.6765141487121582, + "rewards/rejected": -3.150644063949585, + "step": 1739 + }, + { + "epoch": 1.64, + "grad_norm": 21.336071014404297, + "learning_rate": 2.5131164742917104e-07, + "logps/chosen": -41.82895278930664, + "logps/rejected": -83.10308074951172, + "loss": 0.3066, + "losses/dpo": 0.13714835047721863, + "losses/sft": 1.9981573820114136, + "losses/total": 0.13714835047721863, + "ref_logps/chosen": -30.48394775390625, + "ref_logps/rejected": -53.38444900512695, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.134500503540039, + "rewards/margins": 1.8373630046844482, + "rewards/rejected": -2.9718635082244873, + "step": 1740 + }, + { + "epoch": 1.64, + "grad_norm": 20.28631019592285, + "learning_rate": 2.5113676110528155e-07, + "logps/chosen": -45.91974639892578, + "logps/rejected": -65.12422180175781, + "loss": 0.3384, + "losses/dpo": 0.2829352915287018, + "losses/sft": 1.9542996883392334, + "losses/total": 0.2829352915287018, + "ref_logps/chosen": -33.374568939208984, + "ref_logps/rejected": -38.56147003173828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2545177936553955, + "rewards/margins": 1.4017577171325684, + "rewards/rejected": -2.656275510787964, + "step": 1741 + }, + { + "epoch": 1.64, + "grad_norm": 24.799890518188477, + "learning_rate": 2.5096187478139206e-07, + "logps/chosen": -50.457008361816406, + "logps/rejected": -59.93446731567383, + "loss": 0.4073, + "losses/dpo": 0.26259562373161316, + "losses/sft": 1.6940741539001465, + "losses/total": 0.26259562373161316, + "ref_logps/chosen": -37.06106185913086, + "ref_logps/rejected": -37.168182373046875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3395946025848389, + "rewards/margins": 0.9370344877243042, + "rewards/rejected": -2.2766289710998535, + "step": 1742 + }, + { + "epoch": 1.65, + "grad_norm": 17.957420349121094, + "learning_rate": 2.5078698845750263e-07, + "logps/chosen": -40.55392074584961, + "logps/rejected": -55.7088508605957, + "loss": 0.3382, + "losses/dpo": 0.5663778781890869, + "losses/sft": 1.5110385417938232, + "losses/total": 0.5663778781890869, + "ref_logps/chosen": -29.02698516845703, + "ref_logps/rejected": -31.44099235534668, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1526936292648315, + "rewards/margins": 1.274092197418213, + "rewards/rejected": -2.426785945892334, + "step": 1743 + }, + { + "epoch": 1.65, + "grad_norm": 26.556346893310547, + "learning_rate": 2.506121021336131e-07, + "logps/chosen": -53.62139892578125, + "logps/rejected": -65.5887451171875, + "loss": 0.4345, + "losses/dpo": 0.18863216042518616, + "losses/sft": 1.981642723083496, + "losses/total": 0.18863216042518616, + "ref_logps/chosen": -39.930259704589844, + "ref_logps/rejected": -38.34013748168945, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3691136837005615, + "rewards/margins": 1.3557474613189697, + "rewards/rejected": -2.7248611450195312, + "step": 1744 + }, + { + "epoch": 1.65, + "grad_norm": 24.39154052734375, + "learning_rate": 2.5043721580972365e-07, + "logps/chosen": -45.27629852294922, + "logps/rejected": -60.03491973876953, + "loss": 0.4674, + "losses/dpo": 0.4523618817329407, + "losses/sft": 1.9528025388717651, + "losses/total": 0.4523618817329407, + "ref_logps/chosen": -30.986499786376953, + "ref_logps/rejected": -36.605712890625, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4289798736572266, + "rewards/margins": 0.913940966129303, + "rewards/rejected": -2.3429207801818848, + "step": 1745 + }, + { + "epoch": 1.65, + "grad_norm": 20.71787452697754, + "learning_rate": 2.502623294858342e-07, + "logps/chosen": -44.60021209716797, + "logps/rejected": -66.5357666015625, + "loss": 0.4076, + "losses/dpo": 0.26822173595428467, + "losses/sft": 1.697623372077942, + "losses/total": 0.26822173595428467, + "ref_logps/chosen": -31.552837371826172, + "ref_logps/rejected": -39.9516487121582, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3047373294830322, + "rewards/margins": 1.3536741733551025, + "rewards/rejected": -2.6584115028381348, + "step": 1746 + }, + { + "epoch": 1.65, + "grad_norm": 13.634176254272461, + "learning_rate": 2.5008744316194473e-07, + "logps/chosen": -61.19767379760742, + "logps/rejected": -84.01016235351562, + "loss": 0.2112, + "losses/dpo": 0.1825573891401291, + "losses/sft": 1.556693196296692, + "losses/total": 0.1825573891401291, + "ref_logps/chosen": -47.61400604248047, + "ref_logps/rejected": -50.663639068603516, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3583669662475586, + "rewards/margins": 1.976285457611084, + "rewards/rejected": -3.3346526622772217, + "step": 1747 + }, + { + "epoch": 1.65, + "grad_norm": 18.066137313842773, + "learning_rate": 2.4991255683805525e-07, + "logps/chosen": -45.3166389465332, + "logps/rejected": -62.234588623046875, + "loss": 0.2815, + "losses/dpo": 0.2690800130367279, + "losses/sft": 1.589748501777649, + "losses/total": 0.2690800130367279, + "ref_logps/chosen": -34.692745208740234, + "ref_logps/rejected": -38.325172424316406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0623897314071655, + "rewards/margins": 1.3285517692565918, + "rewards/rejected": -2.3909413814544678, + "step": 1748 + }, + { + "epoch": 1.65, + "grad_norm": 27.016061782836914, + "learning_rate": 2.4973767051416576e-07, + "logps/chosen": -61.679107666015625, + "logps/rejected": -79.778564453125, + "loss": 0.3811, + "losses/dpo": 1.023167610168457, + "losses/sft": 2.07210636138916, + "losses/total": 1.023167610168457, + "ref_logps/chosen": -50.669517517089844, + "ref_logps/rejected": -50.098106384277344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1009588241577148, + "rewards/margins": 1.8670872449874878, + "rewards/rejected": -2.968045711517334, + "step": 1749 + }, + { + "epoch": 1.65, + "grad_norm": 22.37818717956543, + "learning_rate": 2.495627841902763e-07, + "logps/chosen": -62.192989349365234, + "logps/rejected": -83.9250717163086, + "loss": 0.3155, + "losses/dpo": 0.3115991950035095, + "losses/sft": 2.187771797180176, + "losses/total": 0.3115991950035095, + "ref_logps/chosen": -44.19801712036133, + "ref_logps/rejected": -50.246856689453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7994972467422485, + "rewards/margins": 1.5683242082595825, + "rewards/rejected": -3.36782169342041, + "step": 1750 + }, + { + "epoch": 1.65, + "grad_norm": 37.1116943359375, + "learning_rate": 2.4938789786638684e-07, + "logps/chosen": -61.98405075073242, + "logps/rejected": -70.7381591796875, + "loss": 0.6037, + "losses/dpo": 0.713307797908783, + "losses/sft": 2.509040594100952, + "losses/total": 0.713307797908783, + "ref_logps/chosen": -43.37413024902344, + "ref_logps/rejected": -42.15391159057617, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.860992193222046, + "rewards/margins": 0.9974318742752075, + "rewards/rejected": -2.858423948287964, + "step": 1751 + }, + { + "epoch": 1.65, + "grad_norm": 22.407917022705078, + "learning_rate": 2.4921301154249735e-07, + "logps/chosen": -47.93756866455078, + "logps/rejected": -72.85267639160156, + "loss": 0.3235, + "losses/dpo": 0.15469565987586975, + "losses/sft": 1.903242588043213, + "losses/total": 0.15469565987586975, + "ref_logps/chosen": -34.365089416503906, + "ref_logps/rejected": -41.953590393066406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.357248067855835, + "rewards/margins": 1.7326595783233643, + "rewards/rejected": -3.089907646179199, + "step": 1752 + }, + { + "epoch": 1.66, + "grad_norm": 26.822011947631836, + "learning_rate": 2.490381252186079e-07, + "logps/chosen": -45.707340240478516, + "logps/rejected": -50.74597930908203, + "loss": 0.5444, + "losses/dpo": 0.6730902194976807, + "losses/sft": 1.9205435514450073, + "losses/total": 0.6730902194976807, + "ref_logps/chosen": -31.910297393798828, + "ref_logps/rejected": -30.508390426635742, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3797039985656738, + "rewards/margins": 0.6440549492835999, + "rewards/rejected": -2.023758888244629, + "step": 1753 + }, + { + "epoch": 1.66, + "grad_norm": 27.741172790527344, + "learning_rate": 2.4886323889471843e-07, + "logps/chosen": -56.74720001220703, + "logps/rejected": -73.21231079101562, + "loss": 0.5465, + "losses/dpo": 0.47176164388656616, + "losses/sft": 1.7883340120315552, + "losses/total": 0.47176164388656616, + "ref_logps/chosen": -42.71295166015625, + "ref_logps/rejected": -50.758323669433594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4034249782562256, + "rewards/margins": 0.841973602771759, + "rewards/rejected": -2.24539852142334, + "step": 1754 + }, + { + "epoch": 1.66, + "grad_norm": 17.044225692749023, + "learning_rate": 2.4868835257082894e-07, + "logps/chosen": -43.494998931884766, + "logps/rejected": -74.65469360351562, + "loss": 0.2601, + "losses/dpo": 0.15053287148475647, + "losses/sft": 1.662393569946289, + "losses/total": 0.15053287148475647, + "ref_logps/chosen": -32.523048400878906, + "ref_logps/rejected": -46.49173355102539, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0971951484680176, + "rewards/margins": 1.7191004753112793, + "rewards/rejected": -2.816295623779297, + "step": 1755 + }, + { + "epoch": 1.66, + "grad_norm": 16.60759925842285, + "learning_rate": 2.4851346624693945e-07, + "logps/chosen": -55.989253997802734, + "logps/rejected": -74.5605239868164, + "loss": 0.2856, + "losses/dpo": 0.4219213128089905, + "losses/sft": 1.8973370790481567, + "losses/total": 0.4219213128089905, + "ref_logps/chosen": -41.978431701660156, + "ref_logps/rejected": -43.160606384277344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4010822772979736, + "rewards/margins": 1.7389092445373535, + "rewards/rejected": -3.139991521835327, + "step": 1756 + }, + { + "epoch": 1.66, + "grad_norm": 29.04009246826172, + "learning_rate": 2.4833857992305e-07, + "logps/chosen": -63.5703010559082, + "logps/rejected": -71.96419525146484, + "loss": 0.6452, + "losses/dpo": 0.6840000152587891, + "losses/sft": 1.913475751876831, + "losses/total": 0.6840000152587891, + "ref_logps/chosen": -40.3055534362793, + "ref_logps/rejected": -42.871620178222656, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.326474666595459, + "rewards/margins": 0.5827829837799072, + "rewards/rejected": -2.9092578887939453, + "step": 1757 + }, + { + "epoch": 1.66, + "grad_norm": 28.937654495239258, + "learning_rate": 2.4816369359916053e-07, + "logps/chosen": -61.76683044433594, + "logps/rejected": -68.48959350585938, + "loss": 0.5078, + "losses/dpo": 0.5240499973297119, + "losses/sft": 2.167008638381958, + "losses/total": 0.5240499973297119, + "ref_logps/chosen": -40.633323669433594, + "ref_logps/rejected": -39.12541961669922, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1133508682250977, + "rewards/margins": 0.8230666518211365, + "rewards/rejected": -2.936417579650879, + "step": 1758 + }, + { + "epoch": 1.66, + "grad_norm": 30.774147033691406, + "learning_rate": 2.4798880727527104e-07, + "logps/chosen": -54.717796325683594, + "logps/rejected": -81.8855209350586, + "loss": 0.4419, + "losses/dpo": 0.23568269610404968, + "losses/sft": 2.6098146438598633, + "losses/total": 0.23568269610404968, + "ref_logps/chosen": -37.91327667236328, + "ref_logps/rejected": -51.40283966064453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6804518699645996, + "rewards/margins": 1.3678154945373535, + "rewards/rejected": -3.048267364501953, + "step": 1759 + }, + { + "epoch": 1.66, + "grad_norm": 33.1243896484375, + "learning_rate": 2.478139209513816e-07, + "logps/chosen": -53.85862350463867, + "logps/rejected": -70.88557434082031, + "loss": 0.5587, + "losses/dpo": 0.8125826120376587, + "losses/sft": 2.3513290882110596, + "losses/total": 0.8125826120376587, + "ref_logps/chosen": -38.626739501953125, + "ref_logps/rejected": -43.82019805908203, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.523188591003418, + "rewards/margins": 1.1833494901657104, + "rewards/rejected": -2.706537961959839, + "step": 1760 + }, + { + "epoch": 1.66, + "grad_norm": 25.510570526123047, + "learning_rate": 2.476390346274921e-07, + "logps/chosen": -43.83403015136719, + "logps/rejected": -66.24166107177734, + "loss": 0.4548, + "losses/dpo": 0.2545833885669708, + "losses/sft": 2.516761064529419, + "losses/total": 0.2545833885669708, + "ref_logps/chosen": -33.473411560058594, + "ref_logps/rejected": -42.44950485229492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0360615253448486, + "rewards/margins": 1.3431546688079834, + "rewards/rejected": -2.379216194152832, + "step": 1761 + }, + { + "epoch": 1.66, + "grad_norm": 19.211475372314453, + "learning_rate": 2.4746414830360263e-07, + "logps/chosen": -42.900978088378906, + "logps/rejected": -78.2021484375, + "loss": 0.2391, + "losses/dpo": 0.23803257942199707, + "losses/sft": 1.5480340719223022, + "losses/total": 0.23803257942199707, + "ref_logps/chosen": -30.61618423461914, + "ref_logps/rejected": -44.52288055419922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.228479266166687, + "rewards/margins": 2.1394476890563965, + "rewards/rejected": -3.367927074432373, + "step": 1762 + }, + { + "epoch": 1.66, + "grad_norm": 27.00369644165039, + "learning_rate": 2.4728926197971315e-07, + "logps/chosen": -49.99366760253906, + "logps/rejected": -70.42268371582031, + "loss": 0.5345, + "losses/dpo": 0.39466962218284607, + "losses/sft": 1.1136701107025146, + "losses/total": 0.39466962218284607, + "ref_logps/chosen": -33.49297332763672, + "ref_logps/rejected": -44.156715393066406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6500697135925293, + "rewards/margins": 0.9765269756317139, + "rewards/rejected": -2.626596689224243, + "step": 1763 + }, + { + "epoch": 1.67, + "grad_norm": 25.0563907623291, + "learning_rate": 2.471143756558237e-07, + "logps/chosen": -54.64145278930664, + "logps/rejected": -72.87681579589844, + "loss": 0.4251, + "losses/dpo": 0.4190717339515686, + "losses/sft": 2.3476083278656006, + "losses/total": 0.4190717339515686, + "ref_logps/chosen": -39.182796478271484, + "ref_logps/rejected": -43.9580078125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.545865774154663, + "rewards/margins": 1.3460148572921753, + "rewards/rejected": -2.891880512237549, + "step": 1764 + }, + { + "epoch": 1.67, + "grad_norm": 23.983139038085938, + "learning_rate": 2.469394893319342e-07, + "logps/chosen": -56.65501403808594, + "logps/rejected": -71.4188461303711, + "loss": 0.3988, + "losses/dpo": 0.41120821237564087, + "losses/sft": 2.1404871940612793, + "losses/total": 0.41120821237564087, + "ref_logps/chosen": -38.815696716308594, + "ref_logps/rejected": -41.478057861328125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.783931851387024, + "rewards/margins": 1.2101467847824097, + "rewards/rejected": -2.9940786361694336, + "step": 1765 + }, + { + "epoch": 1.67, + "grad_norm": 23.112445831298828, + "learning_rate": 2.4676460300804474e-07, + "logps/chosen": -47.889896392822266, + "logps/rejected": -62.0364990234375, + "loss": 0.4304, + "losses/dpo": 0.3706853687763214, + "losses/sft": 1.5867184400558472, + "losses/total": 0.3706853687763214, + "ref_logps/chosen": -33.91569519042969, + "ref_logps/rejected": -39.546409606933594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3974204063415527, + "rewards/margins": 0.8515884876251221, + "rewards/rejected": -2.249008893966675, + "step": 1766 + }, + { + "epoch": 1.67, + "grad_norm": 24.360191345214844, + "learning_rate": 2.465897166841553e-07, + "logps/chosen": -63.62725067138672, + "logps/rejected": -86.77616882324219, + "loss": 0.3422, + "losses/dpo": 0.5358516573905945, + "losses/sft": 1.811553716659546, + "losses/total": 0.5358516573905945, + "ref_logps/chosen": -45.95429992675781, + "ref_logps/rejected": -51.953426361083984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7672953605651855, + "rewards/margins": 1.7149794101715088, + "rewards/rejected": -3.4822750091552734, + "step": 1767 + }, + { + "epoch": 1.67, + "grad_norm": 15.364474296569824, + "learning_rate": 2.464148303602658e-07, + "logps/chosen": -46.399410247802734, + "logps/rejected": -83.53336334228516, + "loss": 0.2582, + "losses/dpo": 0.35614290833473206, + "losses/sft": 1.4065749645233154, + "losses/total": 0.35614290833473206, + "ref_logps/chosen": -32.357181549072266, + "ref_logps/rejected": -49.31428146362305, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4042229652404785, + "rewards/margins": 2.0176854133605957, + "rewards/rejected": -3.421908378601074, + "step": 1768 + }, + { + "epoch": 1.67, + "grad_norm": 23.35055160522461, + "learning_rate": 2.4623994403637633e-07, + "logps/chosen": -58.016815185546875, + "logps/rejected": -59.55890655517578, + "loss": 0.4711, + "losses/dpo": 0.3933718800544739, + "losses/sft": 1.6000556945800781, + "losses/total": 0.3933718800544739, + "ref_logps/chosen": -41.461463928222656, + "ref_logps/rejected": -34.128299713134766, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6555355787277222, + "rewards/margins": 0.8875244855880737, + "rewards/rejected": -2.543060302734375, + "step": 1769 + }, + { + "epoch": 1.67, + "grad_norm": 18.778644561767578, + "learning_rate": 2.460650577124869e-07, + "logps/chosen": -45.72222900390625, + "logps/rejected": -76.72969818115234, + "loss": 0.2618, + "losses/dpo": 0.06690526008605957, + "losses/sft": 2.0953080654144287, + "losses/total": 0.06690526008605957, + "ref_logps/chosen": -30.757808685302734, + "ref_logps/rejected": -44.511863708496094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.496442198753357, + "rewards/margins": 1.7253413200378418, + "rewards/rejected": -3.221783399581909, + "step": 1770 + }, + { + "epoch": 1.67, + "grad_norm": 15.959135055541992, + "learning_rate": 2.458901713885974e-07, + "logps/chosen": -32.28383255004883, + "logps/rejected": -55.138572692871094, + "loss": 0.4931, + "losses/dpo": 0.7304418087005615, + "losses/sft": 1.6448898315429688, + "losses/total": 0.7304418087005615, + "ref_logps/chosen": -20.5487060546875, + "ref_logps/rejected": -32.69231414794922, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.173512578010559, + "rewards/margins": 1.0711129903793335, + "rewards/rejected": -2.2446253299713135, + "step": 1771 + }, + { + "epoch": 1.67, + "grad_norm": 29.38340187072754, + "learning_rate": 2.457152850647079e-07, + "logps/chosen": -52.144081115722656, + "logps/rejected": -65.97552490234375, + "loss": 0.597, + "losses/dpo": 0.8049647808074951, + "losses/sft": 1.864599347114563, + "losses/total": 0.8049647808074951, + "ref_logps/chosen": -34.577274322509766, + "ref_logps/rejected": -41.40080261230469, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.7566807270050049, + "rewards/margins": 0.7007919549942017, + "rewards/rejected": -2.457472562789917, + "step": 1772 + }, + { + "epoch": 1.67, + "grad_norm": 23.83207130432129, + "learning_rate": 2.4554039874081843e-07, + "logps/chosen": -59.02446746826172, + "logps/rejected": -75.85979461669922, + "loss": 0.3537, + "losses/dpo": 0.5540534853935242, + "losses/sft": 2.0470051765441895, + "losses/total": 0.5540534853935242, + "ref_logps/chosen": -41.64139175415039, + "ref_logps/rejected": -43.01557540893555, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7383074760437012, + "rewards/margins": 1.5461148023605347, + "rewards/rejected": -3.2844221591949463, + "step": 1773 + }, + { + "epoch": 1.68, + "grad_norm": 29.883790969848633, + "learning_rate": 2.45365512416929e-07, + "logps/chosen": -82.52509307861328, + "logps/rejected": -81.92755126953125, + "loss": 0.4266, + "losses/dpo": 0.9312683343887329, + "losses/sft": 2.6990251541137695, + "losses/total": 0.9312683343887329, + "ref_logps/chosen": -64.4759521484375, + "ref_logps/rejected": -50.77394104003906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8049136400222778, + "rewards/margins": 1.3104476928710938, + "rewards/rejected": -3.115361213684082, + "step": 1774 + }, + { + "epoch": 1.68, + "grad_norm": 31.216472625732422, + "learning_rate": 2.451906260930395e-07, + "logps/chosen": -56.650150299072266, + "logps/rejected": -72.89379119873047, + "loss": 0.571, + "losses/dpo": 0.6400648951530457, + "losses/sft": 2.34333872795105, + "losses/total": 0.6400648951530457, + "ref_logps/chosen": -36.189632415771484, + "ref_logps/rejected": -42.272918701171875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0460517406463623, + "rewards/margins": 1.0160362720489502, + "rewards/rejected": -3.0620880126953125, + "step": 1775 + }, + { + "epoch": 1.68, + "grad_norm": 22.13832664489746, + "learning_rate": 2.4501573976915e-07, + "logps/chosen": -54.67356491088867, + "logps/rejected": -63.080604553222656, + "loss": 0.4464, + "losses/dpo": 0.5262210965156555, + "losses/sft": 1.7890100479125977, + "losses/total": 0.5262210965156555, + "ref_logps/chosen": -39.76952362060547, + "ref_logps/rejected": -37.833702087402344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4904041290283203, + "rewards/margins": 1.0342861413955688, + "rewards/rejected": -2.524690628051758, + "step": 1776 + }, + { + "epoch": 1.68, + "grad_norm": 21.75943374633789, + "learning_rate": 2.448408534452606e-07, + "logps/chosen": -46.36650085449219, + "logps/rejected": -64.93172454833984, + "loss": 0.3591, + "losses/dpo": 0.40173521637916565, + "losses/sft": 1.4916573762893677, + "losses/total": 0.40173521637916565, + "ref_logps/chosen": -32.63059616088867, + "ref_logps/rejected": -37.73809051513672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3735904693603516, + "rewards/margins": 1.3457728624343872, + "rewards/rejected": -2.719363212585449, + "step": 1777 + }, + { + "epoch": 1.68, + "grad_norm": 26.07839012145996, + "learning_rate": 2.446659671213711e-07, + "logps/chosen": -52.337528228759766, + "logps/rejected": -62.33589553833008, + "loss": 0.487, + "losses/dpo": 0.8857058882713318, + "losses/sft": 1.9764080047607422, + "losses/total": 0.8857058882713318, + "ref_logps/chosen": -38.39671325683594, + "ref_logps/rejected": -40.387664794921875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3940818309783936, + "rewards/margins": 0.8007413148880005, + "rewards/rejected": -2.1948230266571045, + "step": 1778 + }, + { + "epoch": 1.68, + "grad_norm": 22.114421844482422, + "learning_rate": 2.444910807974816e-07, + "logps/chosen": -56.41883850097656, + "logps/rejected": -71.63005065917969, + "loss": 0.386, + "losses/dpo": 0.44728463888168335, + "losses/sft": 1.8242136240005493, + "losses/total": 0.44728463888168335, + "ref_logps/chosen": -40.64788055419922, + "ref_logps/rejected": -43.111515045166016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5770957469940186, + "rewards/margins": 1.2747578620910645, + "rewards/rejected": -2.851853847503662, + "step": 1779 + }, + { + "epoch": 1.68, + "grad_norm": 17.93875503540039, + "learning_rate": 2.443161944735921e-07, + "logps/chosen": -52.26482009887695, + "logps/rejected": -69.38674926757812, + "loss": 0.3061, + "losses/dpo": 0.43446084856987, + "losses/sft": 2.333432674407959, + "losses/total": 0.43446084856987, + "ref_logps/chosen": -37.2119026184082, + "ref_logps/rejected": -37.800071716308594, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5052918195724487, + "rewards/margins": 1.653376579284668, + "rewards/rejected": -3.158668279647827, + "step": 1780 + }, + { + "epoch": 1.68, + "grad_norm": 18.611656188964844, + "learning_rate": 2.441413081497027e-07, + "logps/chosen": -46.145103454589844, + "logps/rejected": -76.09770965576172, + "loss": 0.3005, + "losses/dpo": 0.16587722301483154, + "losses/sft": 1.7813081741333008, + "losses/total": 0.16587722301483154, + "ref_logps/chosen": -31.090377807617188, + "ref_logps/rejected": -44.38664245605469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5054725408554077, + "rewards/margins": 1.665634036064148, + "rewards/rejected": -3.1711065769195557, + "step": 1781 + }, + { + "epoch": 1.68, + "grad_norm": 26.09534454345703, + "learning_rate": 2.439664218258132e-07, + "logps/chosen": -52.15563201904297, + "logps/rejected": -80.50919342041016, + "loss": 0.4189, + "losses/dpo": 0.33090147376060486, + "losses/sft": 1.8237643241882324, + "losses/total": 0.33090147376060486, + "ref_logps/chosen": -35.654212951660156, + "ref_logps/rejected": -51.00825119018555, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.650141954421997, + "rewards/margins": 1.2999521493911743, + "rewards/rejected": -2.950094223022461, + "step": 1782 + }, + { + "epoch": 1.68, + "grad_norm": 18.725366592407227, + "learning_rate": 2.4379153550192377e-07, + "logps/chosen": -56.691383361816406, + "logps/rejected": -75.12642669677734, + "loss": 0.3411, + "losses/dpo": 0.416388601064682, + "losses/sft": 1.8497592210769653, + "losses/total": 0.416388601064682, + "ref_logps/chosen": -41.28091812133789, + "ref_logps/rejected": -44.83116149902344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.541046380996704, + "rewards/margins": 1.4884802103042603, + "rewards/rejected": -3.029526710510254, + "step": 1783 + }, + { + "epoch": 1.68, + "grad_norm": 18.87398910522461, + "learning_rate": 2.436166491780343e-07, + "logps/chosen": -54.67528533935547, + "logps/rejected": -76.38240814208984, + "loss": 0.2635, + "losses/dpo": 0.1386934071779251, + "losses/sft": 1.6716876029968262, + "losses/total": 0.1386934071779251, + "ref_logps/chosen": -43.75564193725586, + "ref_logps/rejected": -48.06443405151367, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0919640064239502, + "rewards/margins": 1.7398333549499512, + "rewards/rejected": -2.8317971229553223, + "step": 1784 + }, + { + "epoch": 1.69, + "grad_norm": 18.562440872192383, + "learning_rate": 2.434417628541448e-07, + "logps/chosen": -60.26049041748047, + "logps/rejected": -77.65402221679688, + "loss": 0.2842, + "losses/dpo": 0.2546020746231079, + "losses/sft": 2.0679266452789307, + "losses/total": 0.2546020746231079, + "ref_logps/chosen": -44.27286911010742, + "ref_logps/rejected": -45.871578216552734, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5987615585327148, + "rewards/margins": 1.5794830322265625, + "rewards/rejected": -3.1782445907592773, + "step": 1785 + }, + { + "epoch": 1.69, + "grad_norm": 26.28315544128418, + "learning_rate": 2.432668765302553e-07, + "logps/chosen": -58.116798400878906, + "logps/rejected": -72.88306427001953, + "loss": 0.4248, + "losses/dpo": 0.24877868592739105, + "losses/sft": 1.925865650177002, + "losses/total": 0.24877868592739105, + "ref_logps/chosen": -40.48954391479492, + "ref_logps/rejected": -42.69841766357422, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7627257108688354, + "rewards/margins": 1.2557388544082642, + "rewards/rejected": -3.0184645652770996, + "step": 1786 + }, + { + "epoch": 1.69, + "grad_norm": 21.263511657714844, + "learning_rate": 2.430919902063658e-07, + "logps/chosen": -49.32164001464844, + "logps/rejected": -70.8302993774414, + "loss": 0.4289, + "losses/dpo": 0.45973828434944153, + "losses/sft": 1.9106676578521729, + "losses/total": 0.45973828434944153, + "ref_logps/chosen": -34.64125061035156, + "ref_logps/rejected": -46.22222137451172, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4680389165878296, + "rewards/margins": 0.9927690625190735, + "rewards/rejected": -2.460808038711548, + "step": 1787 + }, + { + "epoch": 1.69, + "grad_norm": 16.062267303466797, + "learning_rate": 2.429171038824764e-07, + "logps/chosen": -60.7998046875, + "logps/rejected": -80.55281066894531, + "loss": 0.2391, + "losses/dpo": 0.3039817214012146, + "losses/sft": 1.987055778503418, + "losses/total": 0.3039817214012146, + "ref_logps/chosen": -48.479469299316406, + "ref_logps/rejected": -52.036865234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2320337295532227, + "rewards/margins": 1.6195610761642456, + "rewards/rejected": -2.8515946865081787, + "step": 1788 + }, + { + "epoch": 1.69, + "grad_norm": 21.14324188232422, + "learning_rate": 2.427422175585869e-07, + "logps/chosen": -48.07803726196289, + "logps/rejected": -82.919189453125, + "loss": 0.2616, + "losses/dpo": 0.27730491757392883, + "losses/sft": 1.4679170846939087, + "losses/total": 0.27730491757392883, + "ref_logps/chosen": -32.23628616333008, + "ref_logps/rejected": -51.782989501953125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5841751098632812, + "rewards/margins": 1.5294442176818848, + "rewards/rejected": -3.113619565963745, + "step": 1789 + }, + { + "epoch": 1.69, + "grad_norm": 20.190723419189453, + "learning_rate": 2.4256733123469746e-07, + "logps/chosen": -45.28055953979492, + "logps/rejected": -67.60964965820312, + "loss": 0.3713, + "losses/dpo": 0.5195133686065674, + "losses/sft": 1.902703046798706, + "losses/total": 0.5195133686065674, + "ref_logps/chosen": -30.42159080505371, + "ref_logps/rejected": -40.092803955078125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4858973026275635, + "rewards/margins": 1.2657880783081055, + "rewards/rejected": -2.75168514251709, + "step": 1790 + }, + { + "epoch": 1.69, + "grad_norm": 29.569889068603516, + "learning_rate": 2.42392444910808e-07, + "logps/chosen": -56.91025924682617, + "logps/rejected": -62.00165557861328, + "loss": 0.5529, + "losses/dpo": 0.6529324650764465, + "losses/sft": 1.5989012718200684, + "losses/total": 0.6529324650764465, + "ref_logps/chosen": -39.31761169433594, + "ref_logps/rejected": -37.57944869995117, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7592642307281494, + "rewards/margins": 0.6829563975334167, + "rewards/rejected": -2.442220687866211, + "step": 1791 + }, + { + "epoch": 1.69, + "grad_norm": 21.72402000427246, + "learning_rate": 2.422175585869185e-07, + "logps/chosen": -44.800167083740234, + "logps/rejected": -58.93064498901367, + "loss": 0.4815, + "losses/dpo": 0.8620520830154419, + "losses/sft": 1.5104798078536987, + "losses/total": 0.8620520830154419, + "ref_logps/chosen": -32.5340461730957, + "ref_logps/rejected": -36.773292541503906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.226611852645874, + "rewards/margins": 0.9891231656074524, + "rewards/rejected": -2.2157349586486816, + "step": 1792 + }, + { + "epoch": 1.69, + "grad_norm": 20.296966552734375, + "learning_rate": 2.42042672263029e-07, + "logps/chosen": -70.43276977539062, + "logps/rejected": -84.78248596191406, + "loss": 0.3108, + "losses/dpo": 0.308792382478714, + "losses/sft": 2.2755091190338135, + "losses/total": 0.308792382478714, + "ref_logps/chosen": -50.647308349609375, + "ref_logps/rejected": -51.45112609863281, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.978546380996704, + "rewards/margins": 1.354589581489563, + "rewards/rejected": -3.3331360816955566, + "step": 1793 + }, + { + "epoch": 1.69, + "grad_norm": 24.659976959228516, + "learning_rate": 2.418677859391395e-07, + "logps/chosen": -53.954463958740234, + "logps/rejected": -70.03739929199219, + "loss": 0.4518, + "losses/dpo": 0.7430610656738281, + "losses/sft": 2.3112823963165283, + "losses/total": 0.7430610656738281, + "ref_logps/chosen": -38.437843322753906, + "ref_logps/rejected": -42.495086669921875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5516622066497803, + "rewards/margins": 1.202569603919983, + "rewards/rejected": -2.7542319297790527, + "step": 1794 + }, + { + "epoch": 1.69, + "grad_norm": 22.181167602539062, + "learning_rate": 2.416928996152501e-07, + "logps/chosen": -43.811004638671875, + "logps/rejected": -59.92433166503906, + "loss": 0.4551, + "losses/dpo": 0.6735576391220093, + "losses/sft": 1.6574383974075317, + "losses/total": 0.6735576391220093, + "ref_logps/chosen": -28.613182067871094, + "ref_logps/rejected": -34.89199447631836, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5197821855545044, + "rewards/margins": 0.983451783657074, + "rewards/rejected": -2.5032339096069336, + "step": 1795 + }, + { + "epoch": 1.7, + "grad_norm": 30.978641510009766, + "learning_rate": 2.415180132913606e-07, + "logps/chosen": -50.89833450317383, + "logps/rejected": -57.69483947753906, + "loss": 0.5876, + "losses/dpo": 0.6637569665908813, + "losses/sft": 1.7893388271331787, + "losses/total": 0.6637569665908813, + "ref_logps/chosen": -36.480552673339844, + "ref_logps/rejected": -36.3235969543457, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4417781829833984, + "rewards/margins": 0.6953457593917847, + "rewards/rejected": -2.1371238231658936, + "step": 1796 + }, + { + "epoch": 1.7, + "grad_norm": 19.025787353515625, + "learning_rate": 2.4134312696747116e-07, + "logps/chosen": -53.97319030761719, + "logps/rejected": -72.19430541992188, + "loss": 0.3084, + "losses/dpo": 0.26776963472366333, + "losses/sft": 1.8403483629226685, + "losses/total": 0.26776963472366333, + "ref_logps/chosen": -40.94664764404297, + "ref_logps/rejected": -42.826873779296875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3026540279388428, + "rewards/margins": 1.6340895891189575, + "rewards/rejected": -2.9367434978485107, + "step": 1797 + }, + { + "epoch": 1.7, + "grad_norm": 13.881430625915527, + "learning_rate": 2.4116824064358167e-07, + "logps/chosen": -42.53218078613281, + "logps/rejected": -77.91055297851562, + "loss": 0.175, + "losses/dpo": 0.23638613522052765, + "losses/sft": 1.7883586883544922, + "losses/total": 0.23638613522052765, + "ref_logps/chosen": -32.17339324951172, + "ref_logps/rejected": -46.677001953125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0358785390853882, + "rewards/margins": 2.0874767303466797, + "rewards/rejected": -3.1233553886413574, + "step": 1798 + }, + { + "epoch": 1.7, + "grad_norm": 25.556428909301758, + "learning_rate": 2.409933543196922e-07, + "logps/chosen": -54.38060760498047, + "logps/rejected": -74.64018249511719, + "loss": 0.5343, + "losses/dpo": 0.4501485526561737, + "losses/sft": 2.3382349014282227, + "losses/total": 0.4501485526561737, + "ref_logps/chosen": -36.06700897216797, + "ref_logps/rejected": -45.476924896240234, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8313599824905396, + "rewards/margins": 1.0849659442901611, + "rewards/rejected": -2.916325807571411, + "step": 1799 + }, + { + "epoch": 1.7, + "grad_norm": 20.34830093383789, + "learning_rate": 2.408184679958027e-07, + "logps/chosen": -54.23207092285156, + "logps/rejected": -78.97080993652344, + "loss": 0.3925, + "losses/dpo": 0.3393729627132416, + "losses/sft": 1.8011932373046875, + "losses/total": 0.3393729627132416, + "ref_logps/chosen": -38.857635498046875, + "ref_logps/rejected": -48.183135986328125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5374433994293213, + "rewards/margins": 1.5413247346878052, + "rewards/rejected": -3.078768253326416, + "step": 1800 + }, + { + "epoch": 1.7, + "grad_norm": 14.515795707702637, + "learning_rate": 2.406435816719132e-07, + "logps/chosen": -37.18846893310547, + "logps/rejected": -69.02476501464844, + "loss": 0.2884, + "losses/dpo": 0.4819752871990204, + "losses/sft": 1.9720385074615479, + "losses/total": 0.4819752871990204, + "ref_logps/chosen": -27.404996871948242, + "ref_logps/rejected": -42.84521484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9783473610877991, + "rewards/margins": 1.6396076679229736, + "rewards/rejected": -2.617954969406128, + "step": 1801 + }, + { + "epoch": 1.7, + "grad_norm": 26.68324851989746, + "learning_rate": 2.404686953480238e-07, + "logps/chosen": -71.21405029296875, + "logps/rejected": -81.84751892089844, + "loss": 0.4024, + "losses/dpo": 0.29949310421943665, + "losses/sft": 1.6894594430923462, + "losses/total": 0.29949310421943665, + "ref_logps/chosen": -49.875282287597656, + "ref_logps/rejected": -49.16812515258789, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1338772773742676, + "rewards/margins": 1.1340622901916504, + "rewards/rejected": -3.267939567565918, + "step": 1802 + }, + { + "epoch": 1.7, + "grad_norm": 27.291259765625, + "learning_rate": 2.402938090241343e-07, + "logps/chosen": -57.330352783203125, + "logps/rejected": -70.62774658203125, + "loss": 0.4482, + "losses/dpo": 0.527060866355896, + "losses/sft": 2.0806970596313477, + "losses/total": 0.527060866355896, + "ref_logps/chosen": -41.38750076293945, + "ref_logps/rejected": -45.277442932128906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.594285249710083, + "rewards/margins": 0.9407455921173096, + "rewards/rejected": -2.5350308418273926, + "step": 1803 + }, + { + "epoch": 1.7, + "grad_norm": 20.700218200683594, + "learning_rate": 2.4011892270024485e-07, + "logps/chosen": -49.646339416503906, + "logps/rejected": -70.36936950683594, + "loss": 0.3494, + "losses/dpo": 0.31294238567352295, + "losses/sft": 1.506961464881897, + "losses/total": 0.31294238567352295, + "ref_logps/chosen": -38.689517974853516, + "ref_logps/rejected": -47.67042922973633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0956823825836182, + "rewards/margins": 1.1742115020751953, + "rewards/rejected": -2.2698938846588135, + "step": 1804 + }, + { + "epoch": 1.7, + "grad_norm": 25.93865394592285, + "learning_rate": 2.3994403637635537e-07, + "logps/chosen": -48.94581985473633, + "logps/rejected": -72.06764221191406, + "loss": 0.5139, + "losses/dpo": 0.4126269519329071, + "losses/sft": 1.966224193572998, + "losses/total": 0.4126269519329071, + "ref_logps/chosen": -35.31593704223633, + "ref_logps/rejected": -50.644996643066406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.362987995147705, + "rewards/margins": 0.779276967048645, + "rewards/rejected": -2.1422648429870605, + "step": 1805 + }, + { + "epoch": 1.71, + "grad_norm": 21.34590721130371, + "learning_rate": 2.397691500524659e-07, + "logps/chosen": -50.95314025878906, + "logps/rejected": -68.23500061035156, + "loss": 0.3399, + "losses/dpo": 0.3922712206840515, + "losses/sft": 2.487985134124756, + "losses/total": 0.3922712206840515, + "ref_logps/chosen": -35.26377487182617, + "ref_logps/rejected": -39.783836364746094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5689367055892944, + "rewards/margins": 1.276179313659668, + "rewards/rejected": -2.845115900039673, + "step": 1806 + }, + { + "epoch": 1.71, + "grad_norm": 13.201313018798828, + "learning_rate": 2.395942637285764e-07, + "logps/chosen": -48.08342361450195, + "logps/rejected": -62.19560241699219, + "loss": 0.2348, + "losses/dpo": 0.19551517069339752, + "losses/sft": 1.8445029258728027, + "losses/total": 0.19551517069339752, + "ref_logps/chosen": -39.11737823486328, + "ref_logps/rejected": -35.42494201660156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8966045379638672, + "rewards/margins": 1.7804611921310425, + "rewards/rejected": -2.677065849304199, + "step": 1807 + }, + { + "epoch": 1.71, + "grad_norm": 19.576393127441406, + "learning_rate": 2.3941937740468696e-07, + "logps/chosen": -62.76116943359375, + "logps/rejected": -70.84951782226562, + "loss": 0.3568, + "losses/dpo": 0.3549603223800659, + "losses/sft": 1.8085447549819946, + "losses/total": 0.3549603223800659, + "ref_logps/chosen": -45.12492752075195, + "ref_logps/rejected": -43.808570861816406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7636239528656006, + "rewards/margins": 0.9404711127281189, + "rewards/rejected": -2.7040953636169434, + "step": 1808 + }, + { + "epoch": 1.71, + "grad_norm": 20.147262573242188, + "learning_rate": 2.3924449108079747e-07, + "logps/chosen": -52.591129302978516, + "logps/rejected": -63.63362121582031, + "loss": 0.3674, + "losses/dpo": 0.4235965311527252, + "losses/sft": 1.3365919589996338, + "losses/total": 0.4235965311527252, + "ref_logps/chosen": -38.298648834228516, + "ref_logps/rejected": -35.4205436706543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4292478561401367, + "rewards/margins": 1.3920602798461914, + "rewards/rejected": -2.821308135986328, + "step": 1809 + }, + { + "epoch": 1.71, + "grad_norm": 25.43413734436035, + "learning_rate": 2.39069604756908e-07, + "logps/chosen": -45.40858840942383, + "logps/rejected": -62.31193923950195, + "loss": 0.4528, + "losses/dpo": 0.768020510673523, + "losses/sft": 1.4819835424423218, + "losses/total": 0.768020510673523, + "ref_logps/chosen": -28.7589111328125, + "ref_logps/rejected": -34.95357131958008, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6649675369262695, + "rewards/margins": 1.0708694458007812, + "rewards/rejected": -2.735836982727051, + "step": 1810 + }, + { + "epoch": 1.71, + "grad_norm": 23.087696075439453, + "learning_rate": 2.3889471843301855e-07, + "logps/chosen": -53.17631530761719, + "logps/rejected": -74.31806945800781, + "loss": 0.442, + "losses/dpo": 0.7953958511352539, + "losses/sft": 2.4478635787963867, + "losses/total": 0.7953958511352539, + "ref_logps/chosen": -36.81160354614258, + "ref_logps/rejected": -43.09166717529297, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6364715099334717, + "rewards/margins": 1.4861682653427124, + "rewards/rejected": -3.1226396560668945, + "step": 1811 + }, + { + "epoch": 1.71, + "grad_norm": 35.091495513916016, + "learning_rate": 2.3871983210912906e-07, + "logps/chosen": -50.18351745605469, + "logps/rejected": -70.42749786376953, + "loss": 0.5903, + "losses/dpo": 0.6043030023574829, + "losses/sft": 2.051079750061035, + "losses/total": 0.6043030023574829, + "ref_logps/chosen": -33.16597366333008, + "ref_logps/rejected": -46.168888092041016, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.701754093170166, + "rewards/margins": 0.7241073846817017, + "rewards/rejected": -2.4258618354797363, + "step": 1812 + }, + { + "epoch": 1.71, + "grad_norm": 20.404088973999023, + "learning_rate": 2.3854494578523957e-07, + "logps/chosen": -51.82007598876953, + "logps/rejected": -72.42236328125, + "loss": 0.3263, + "losses/dpo": 0.2008453756570816, + "losses/sft": 2.5513551235198975, + "losses/total": 0.2008453756570816, + "ref_logps/chosen": -36.613616943359375, + "ref_logps/rejected": -43.31108474731445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.520646095275879, + "rewards/margins": 1.39048171043396, + "rewards/rejected": -2.911127805709839, + "step": 1813 + }, + { + "epoch": 1.71, + "grad_norm": 22.575740814208984, + "learning_rate": 2.383700594613501e-07, + "logps/chosen": -41.14979934692383, + "logps/rejected": -57.995208740234375, + "loss": 0.4619, + "losses/dpo": 0.6094987392425537, + "losses/sft": 1.7752008438110352, + "losses/total": 0.6094987392425537, + "ref_logps/chosen": -28.672632217407227, + "ref_logps/rejected": -35.17646789550781, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2477166652679443, + "rewards/margins": 1.034157633781433, + "rewards/rejected": -2.281874179840088, + "step": 1814 + }, + { + "epoch": 1.71, + "grad_norm": 36.38962936401367, + "learning_rate": 2.3819517313746065e-07, + "logps/chosen": -60.74071502685547, + "logps/rejected": -69.07544708251953, + "loss": 0.8009, + "losses/dpo": 1.2549885511398315, + "losses/sft": 1.732917070388794, + "losses/total": 1.2549885511398315, + "ref_logps/chosen": -42.26467514038086, + "ref_logps/rejected": -47.442230224609375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8476041555404663, + "rewards/margins": 0.3157173991203308, + "rewards/rejected": -2.1633217334747314, + "step": 1815 + }, + { + "epoch": 1.71, + "grad_norm": 24.11556053161621, + "learning_rate": 2.3802028681357116e-07, + "logps/chosen": -61.94044876098633, + "logps/rejected": -86.11589813232422, + "loss": 0.3338, + "losses/dpo": 0.6760409474372864, + "losses/sft": 2.08918833732605, + "losses/total": 0.6760409474372864, + "ref_logps/chosen": -41.16832733154297, + "ref_logps/rejected": -50.388816833496094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.077211856842041, + "rewards/margins": 1.4954955577850342, + "rewards/rejected": -3.572707414627075, + "step": 1816 + }, + { + "epoch": 1.72, + "grad_norm": 14.405179023742676, + "learning_rate": 2.3784540048968168e-07, + "logps/chosen": -38.53070068359375, + "logps/rejected": -71.51303100585938, + "loss": 0.2274, + "losses/dpo": 0.13569137454032898, + "losses/sft": 1.565115213394165, + "losses/total": 0.13569137454032898, + "ref_logps/chosen": -28.008386611938477, + "ref_logps/rejected": -41.23868942260742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0522313117980957, + "rewards/margins": 1.975203037261963, + "rewards/rejected": -3.0274343490600586, + "step": 1817 + }, + { + "epoch": 1.72, + "grad_norm": 28.90049934387207, + "learning_rate": 2.3767051416579224e-07, + "logps/chosen": -43.190391540527344, + "logps/rejected": -56.24452209472656, + "loss": 0.4207, + "losses/dpo": 0.4833914637565613, + "losses/sft": 2.234956979751587, + "losses/total": 0.4833914637565613, + "ref_logps/chosen": -30.222129821777344, + "ref_logps/rejected": -30.783945083618164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2968266010284424, + "rewards/margins": 1.2492311000823975, + "rewards/rejected": -2.54605770111084, + "step": 1818 + }, + { + "epoch": 1.72, + "grad_norm": 27.203447341918945, + "learning_rate": 2.3749562784190275e-07, + "logps/chosen": -54.47657012939453, + "logps/rejected": -90.49632263183594, + "loss": 0.3377, + "losses/dpo": 0.3628811240196228, + "losses/sft": 1.9849289655685425, + "losses/total": 0.3628811240196228, + "ref_logps/chosen": -38.0615234375, + "ref_logps/rejected": -56.2956428527832, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6415046453475952, + "rewards/margins": 1.7785634994506836, + "rewards/rejected": -3.4200680255889893, + "step": 1819 + }, + { + "epoch": 1.72, + "grad_norm": 23.622251510620117, + "learning_rate": 2.373207415180133e-07, + "logps/chosen": -62.47504806518555, + "logps/rejected": -79.90737915039062, + "loss": 0.3726, + "losses/dpo": 0.4988325238227844, + "losses/sft": 1.9776464700698853, + "losses/total": 0.4988325238227844, + "ref_logps/chosen": -44.18340301513672, + "ref_logps/rejected": -45.385475158691406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8291640281677246, + "rewards/margins": 1.6230263710021973, + "rewards/rejected": -3.4521901607513428, + "step": 1820 + }, + { + "epoch": 1.72, + "grad_norm": 32.48872756958008, + "learning_rate": 2.371458551941238e-07, + "logps/chosen": -56.083763122558594, + "logps/rejected": -79.61029052734375, + "loss": 0.5124, + "losses/dpo": 0.7199397087097168, + "losses/sft": 2.0983541011810303, + "losses/total": 0.7199397087097168, + "ref_logps/chosen": -35.626197814941406, + "ref_logps/rejected": -48.16398620605469, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0457568168640137, + "rewards/margins": 1.0988742113113403, + "rewards/rejected": -3.1446309089660645, + "step": 1821 + }, + { + "epoch": 1.72, + "grad_norm": 27.68033218383789, + "learning_rate": 2.3697096887023435e-07, + "logps/chosen": -59.46272277832031, + "logps/rejected": -69.81183624267578, + "loss": 0.4971, + "losses/dpo": 0.8627828359603882, + "losses/sft": 2.3563098907470703, + "losses/total": 0.8627828359603882, + "ref_logps/chosen": -39.85981750488281, + "ref_logps/rejected": -39.03014373779297, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.960290789604187, + "rewards/margins": 1.1178789138793945, + "rewards/rejected": -3.078169822692871, + "step": 1822 + }, + { + "epoch": 1.72, + "grad_norm": 22.5125675201416, + "learning_rate": 2.3679608254634486e-07, + "logps/chosen": -57.32041931152344, + "logps/rejected": -76.5188217163086, + "loss": 0.4545, + "losses/dpo": 0.425339937210083, + "losses/sft": 1.8877969980239868, + "losses/total": 0.425339937210083, + "ref_logps/chosen": -42.733543395996094, + "ref_logps/rejected": -46.36111068725586, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4586875438690186, + "rewards/margins": 1.5570838451385498, + "rewards/rejected": -3.0157713890075684, + "step": 1823 + }, + { + "epoch": 1.72, + "grad_norm": 22.97613525390625, + "learning_rate": 2.3662119622245537e-07, + "logps/chosen": -63.38905334472656, + "logps/rejected": -82.48291015625, + "loss": 0.3041, + "losses/dpo": 0.2486165463924408, + "losses/sft": 1.890404224395752, + "losses/total": 0.2486165463924408, + "ref_logps/chosen": -47.25609588623047, + "ref_logps/rejected": -48.80116271972656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6132957935333252, + "rewards/margins": 1.7548785209655762, + "rewards/rejected": -3.3681745529174805, + "step": 1824 + }, + { + "epoch": 1.72, + "grad_norm": 31.059612274169922, + "learning_rate": 2.3644630989856594e-07, + "logps/chosen": -52.537757873535156, + "logps/rejected": -71.47386169433594, + "loss": 0.5254, + "losses/dpo": 0.19074425101280212, + "losses/sft": 2.104780435562134, + "losses/total": 0.19074425101280212, + "ref_logps/chosen": -36.833351135253906, + "ref_logps/rejected": -42.596961975097656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5704405307769775, + "rewards/margins": 1.3172492980957031, + "rewards/rejected": -2.8876895904541016, + "step": 1825 + }, + { + "epoch": 1.72, + "grad_norm": 15.249720573425293, + "learning_rate": 2.3627142357467645e-07, + "logps/chosen": -55.86733627319336, + "logps/rejected": -93.32683563232422, + "loss": 0.1699, + "losses/dpo": 0.02240651845932007, + "losses/sft": 1.7759265899658203, + "losses/total": 0.02240651845932007, + "ref_logps/chosen": -44.871089935302734, + "ref_logps/rejected": -56.358184814453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0996246337890625, + "rewards/margins": 2.597240686416626, + "rewards/rejected": -3.6968655586242676, + "step": 1826 + }, + { + "epoch": 1.73, + "grad_norm": 24.925495147705078, + "learning_rate": 2.36096537250787e-07, + "logps/chosen": -55.65700149536133, + "logps/rejected": -80.70500183105469, + "loss": 0.3965, + "losses/dpo": 1.0382367372512817, + "losses/sft": 2.458404302597046, + "losses/total": 1.0382367372512817, + "ref_logps/chosen": -37.813629150390625, + "ref_logps/rejected": -45.53327941894531, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7843371629714966, + "rewards/margins": 1.7328346967697144, + "rewards/rejected": -3.517171859741211, + "step": 1827 + }, + { + "epoch": 1.73, + "grad_norm": 20.223031997680664, + "learning_rate": 2.359216509268975e-07, + "logps/chosen": -59.71818161010742, + "logps/rejected": -75.98500061035156, + "loss": 0.3323, + "losses/dpo": 0.2974328398704529, + "losses/sft": 2.027266263961792, + "losses/total": 0.2974328398704529, + "ref_logps/chosen": -44.14948272705078, + "ref_logps/rejected": -45.65176773071289, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5568698644638062, + "rewards/margins": 1.4764533042907715, + "rewards/rejected": -3.033323287963867, + "step": 1828 + }, + { + "epoch": 1.73, + "grad_norm": 30.462846755981445, + "learning_rate": 2.3574676460300804e-07, + "logps/chosen": -65.40301513671875, + "logps/rejected": -89.72885131835938, + "loss": 0.4634, + "losses/dpo": 0.7341820001602173, + "losses/sft": 2.5555121898651123, + "losses/total": 0.7341820001602173, + "ref_logps/chosen": -43.19352722167969, + "ref_logps/rejected": -52.105194091796875, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2209486961364746, + "rewards/margins": 1.5414172410964966, + "rewards/rejected": -3.7623658180236816, + "step": 1829 + }, + { + "epoch": 1.73, + "grad_norm": 16.414480209350586, + "learning_rate": 2.3557187827911855e-07, + "logps/chosen": -49.07080078125, + "logps/rejected": -79.08071899414062, + "loss": 0.2538, + "losses/dpo": 0.0965033695101738, + "losses/sft": 1.7428237199783325, + "losses/total": 0.0965033695101738, + "ref_logps/chosen": -36.794944763183594, + "ref_logps/rejected": -48.857421875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2275853157043457, + "rewards/margins": 1.7947438955307007, + "rewards/rejected": -3.022329330444336, + "step": 1830 + }, + { + "epoch": 1.73, + "grad_norm": 28.070436477661133, + "learning_rate": 2.353969919552291e-07, + "logps/chosen": -59.75160217285156, + "logps/rejected": -60.827232360839844, + "loss": 0.5082, + "losses/dpo": 0.4128730595111847, + "losses/sft": 1.4651991128921509, + "losses/total": 0.4128730595111847, + "ref_logps/chosen": -39.54951477050781, + "ref_logps/rejected": -33.90257263183594, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0202085971832275, + "rewards/margins": 0.672257125377655, + "rewards/rejected": -2.6924657821655273, + "step": 1831 + }, + { + "epoch": 1.73, + "grad_norm": 25.752349853515625, + "learning_rate": 2.3522210563133963e-07, + "logps/chosen": -49.78515625, + "logps/rejected": -65.71405792236328, + "loss": 0.4287, + "losses/dpo": 0.28923720121383667, + "losses/sft": 1.5678766965866089, + "losses/total": 0.28923720121383667, + "ref_logps/chosen": -36.99530792236328, + "ref_logps/rejected": -40.70927429199219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.278984785079956, + "rewards/margins": 1.2214933633804321, + "rewards/rejected": -2.5004782676696777, + "step": 1832 + }, + { + "epoch": 1.73, + "grad_norm": 21.68288803100586, + "learning_rate": 2.3504721930745014e-07, + "logps/chosen": -46.7327880859375, + "logps/rejected": -76.64437866210938, + "loss": 0.306, + "losses/dpo": 0.20786938071250916, + "losses/sft": 1.6630686521530151, + "losses/total": 0.20786938071250916, + "ref_logps/chosen": -36.01597213745117, + "ref_logps/rejected": -48.47999954223633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0716817378997803, + "rewards/margins": 1.744756817817688, + "rewards/rejected": -2.816438674926758, + "step": 1833 + }, + { + "epoch": 1.73, + "grad_norm": 22.000045776367188, + "learning_rate": 2.3487233298356068e-07, + "logps/chosen": -48.887638092041016, + "logps/rejected": -71.68235778808594, + "loss": 0.3812, + "losses/dpo": 0.35574308037757874, + "losses/sft": 2.0993213653564453, + "losses/total": 0.35574308037757874, + "ref_logps/chosen": -33.489990234375, + "ref_logps/rejected": -42.93003845214844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.539764642715454, + "rewards/margins": 1.3354679346084595, + "rewards/rejected": -2.875232458114624, + "step": 1834 + }, + { + "epoch": 1.73, + "grad_norm": 23.827600479125977, + "learning_rate": 2.346974466596712e-07, + "logps/chosen": -51.1121940612793, + "logps/rejected": -70.45458984375, + "loss": 0.3488, + "losses/dpo": 0.39547669887542725, + "losses/sft": 2.0413146018981934, + "losses/total": 0.39547669887542725, + "ref_logps/chosen": -37.81718444824219, + "ref_logps/rejected": -43.214599609375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3295011520385742, + "rewards/margins": 1.3944984674453735, + "rewards/rejected": -2.723999500274658, + "step": 1835 + }, + { + "epoch": 1.73, + "grad_norm": 31.912322998046875, + "learning_rate": 2.3452256033578173e-07, + "logps/chosen": -63.27443313598633, + "logps/rejected": -74.21241760253906, + "loss": 0.5333, + "losses/dpo": 0.7827017903327942, + "losses/sft": 2.633052110671997, + "losses/total": 0.7827017903327942, + "ref_logps/chosen": -44.273338317871094, + "ref_logps/rejected": -47.25540542602539, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9001094102859497, + "rewards/margins": 0.7955919504165649, + "rewards/rejected": -2.6957013607025146, + "step": 1836 + }, + { + "epoch": 1.73, + "grad_norm": 24.76863670349121, + "learning_rate": 2.3434767401189225e-07, + "logps/chosen": -72.81598663330078, + "logps/rejected": -78.52141571044922, + "loss": 0.4588, + "losses/dpo": 0.5126616358757019, + "losses/sft": 2.036393880844116, + "losses/total": 0.5126616358757019, + "ref_logps/chosen": -48.431392669677734, + "ref_logps/rejected": -45.08470153808594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4384589195251465, + "rewards/margins": 0.9052125215530396, + "rewards/rejected": -3.3436715602874756, + "step": 1837 + }, + { + "epoch": 1.74, + "grad_norm": 28.28955841064453, + "learning_rate": 2.3417278768800279e-07, + "logps/chosen": -53.83306121826172, + "logps/rejected": -74.90179443359375, + "loss": 0.4319, + "losses/dpo": 0.41800588369369507, + "losses/sft": 2.649404287338257, + "losses/total": 0.41800588369369507, + "ref_logps/chosen": -38.21922302246094, + "ref_logps/rejected": -47.19895935058594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5613840818405151, + "rewards/margins": 1.2088998556137085, + "rewards/rejected": -2.7702839374542236, + "step": 1838 + }, + { + "epoch": 1.74, + "grad_norm": 33.06009292602539, + "learning_rate": 2.3399790136411333e-07, + "logps/chosen": -56.79712677001953, + "logps/rejected": -64.22746276855469, + "loss": 0.5691, + "losses/dpo": 0.3352791666984558, + "losses/sft": 1.94242262840271, + "losses/total": 0.3352791666984558, + "ref_logps/chosen": -40.31622314453125, + "ref_logps/rejected": -40.16978454589844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6480904817581177, + "rewards/margins": 0.757677435874939, + "rewards/rejected": -2.4057679176330566, + "step": 1839 + }, + { + "epoch": 1.74, + "grad_norm": 19.5128231048584, + "learning_rate": 2.3382301504022384e-07, + "logps/chosen": -46.164161682128906, + "logps/rejected": -75.56565856933594, + "loss": 0.2953, + "losses/dpo": 0.6225314140319824, + "losses/sft": 1.8770743608474731, + "losses/total": 0.6225314140319824, + "ref_logps/chosen": -27.919498443603516, + "ref_logps/rejected": -39.514408111572266, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8244662284851074, + "rewards/margins": 1.7806590795516968, + "rewards/rejected": -3.6051251888275146, + "step": 1840 + }, + { + "epoch": 1.74, + "grad_norm": 21.03430938720703, + "learning_rate": 2.3364812871633438e-07, + "logps/chosen": -60.95615768432617, + "logps/rejected": -65.29434967041016, + "loss": 0.4591, + "losses/dpo": 0.2938690781593323, + "losses/sft": 1.9467700719833374, + "losses/total": 0.2938690781593323, + "ref_logps/chosen": -45.059661865234375, + "ref_logps/rejected": -38.9007568359375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5896495580673218, + "rewards/margins": 1.0497106313705444, + "rewards/rejected": -2.639360189437866, + "step": 1841 + }, + { + "epoch": 1.74, + "grad_norm": 24.941936492919922, + "learning_rate": 2.334732423924449e-07, + "logps/chosen": -53.41764831542969, + "logps/rejected": -66.00311279296875, + "loss": 0.396, + "losses/dpo": 0.2544431984424591, + "losses/sft": 1.4687739610671997, + "losses/total": 0.2544431984424591, + "ref_logps/chosen": -37.06456756591797, + "ref_logps/rejected": -36.036800384521484, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.635308027267456, + "rewards/margins": 1.3613231182098389, + "rewards/rejected": -2.996631145477295, + "step": 1842 + }, + { + "epoch": 1.74, + "grad_norm": 29.51819610595703, + "learning_rate": 2.3329835606855543e-07, + "logps/chosen": -63.126502990722656, + "logps/rejected": -66.65913391113281, + "loss": 0.51, + "losses/dpo": 0.37955766916275024, + "losses/sft": 1.9559439420700073, + "losses/total": 0.37955766916275024, + "ref_logps/chosen": -46.511863708496094, + "ref_logps/rejected": -41.78247833251953, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6614642143249512, + "rewards/margins": 0.8262010216712952, + "rewards/rejected": -2.4876651763916016, + "step": 1843 + }, + { + "epoch": 1.74, + "grad_norm": 36.848724365234375, + "learning_rate": 2.3312346974466597e-07, + "logps/chosen": -42.624046325683594, + "logps/rejected": -54.71479415893555, + "loss": 0.4782, + "losses/dpo": 0.4191596508026123, + "losses/sft": 2.1579864025115967, + "losses/total": 0.4191596508026123, + "ref_logps/chosen": -29.775489807128906, + "ref_logps/rejected": -30.61602210998535, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2848554849624634, + "rewards/margins": 1.1250216960906982, + "rewards/rejected": -2.409877300262451, + "step": 1844 + }, + { + "epoch": 1.74, + "grad_norm": 23.67749786376953, + "learning_rate": 2.3294858342077648e-07, + "logps/chosen": -49.564029693603516, + "logps/rejected": -71.5465087890625, + "loss": 0.3629, + "losses/dpo": 0.5933958292007446, + "losses/sft": 2.0923280715942383, + "losses/total": 0.5933958292007446, + "ref_logps/chosen": -37.54664993286133, + "ref_logps/rejected": -43.87751770019531, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.201737880706787, + "rewards/margins": 1.565162181854248, + "rewards/rejected": -2.766900062561035, + "step": 1845 + }, + { + "epoch": 1.74, + "grad_norm": 20.85272216796875, + "learning_rate": 2.3277369709688702e-07, + "logps/chosen": -46.61496353149414, + "logps/rejected": -62.098121643066406, + "loss": 0.3089, + "losses/dpo": 0.19887515902519226, + "losses/sft": 1.8970520496368408, + "losses/total": 0.19887515902519226, + "ref_logps/chosen": -33.598785400390625, + "ref_logps/rejected": -34.05204772949219, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3016180992126465, + "rewards/margins": 1.5029888153076172, + "rewards/rejected": -2.8046069145202637, + "step": 1846 + }, + { + "epoch": 1.74, + "grad_norm": 33.110694885253906, + "learning_rate": 2.3259881077299753e-07, + "logps/chosen": -66.85472106933594, + "logps/rejected": -73.36692810058594, + "loss": 0.494, + "losses/dpo": 0.4233478903770447, + "losses/sft": 2.028045415878296, + "losses/total": 0.4233478903770447, + "ref_logps/chosen": -49.42625427246094, + "ref_logps/rejected": -45.749610900878906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.742846965789795, + "rewards/margins": 1.0188846588134766, + "rewards/rejected": -2.7617316246032715, + "step": 1847 + }, + { + "epoch": 1.75, + "grad_norm": 22.06517791748047, + "learning_rate": 2.3242392444910807e-07, + "logps/chosen": -61.929054260253906, + "logps/rejected": -74.1783447265625, + "loss": 0.3416, + "losses/dpo": 0.2992900013923645, + "losses/sft": 2.08903431892395, + "losses/total": 0.2992900013923645, + "ref_logps/chosen": -46.72382736206055, + "ref_logps/rejected": -44.66014862060547, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5205228328704834, + "rewards/margins": 1.4312971830368042, + "rewards/rejected": -2.951820135116577, + "step": 1848 + }, + { + "epoch": 1.75, + "grad_norm": 24.4124755859375, + "learning_rate": 2.3224903812521858e-07, + "logps/chosen": -49.27168273925781, + "logps/rejected": -66.37191772460938, + "loss": 0.4083, + "losses/dpo": 0.501542329788208, + "losses/sft": 1.5513527393341064, + "losses/total": 0.501542329788208, + "ref_logps/chosen": -37.170188903808594, + "ref_logps/rejected": -41.29085159301758, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2101495265960693, + "rewards/margins": 1.297957181930542, + "rewards/rejected": -2.5081067085266113, + "step": 1849 + }, + { + "epoch": 1.75, + "grad_norm": 23.928943634033203, + "learning_rate": 2.3207415180132915e-07, + "logps/chosen": -49.31248474121094, + "logps/rejected": -69.08738708496094, + "loss": 0.3848, + "losses/dpo": 0.411376416683197, + "losses/sft": 1.747654914855957, + "losses/total": 0.411376416683197, + "ref_logps/chosen": -31.206275939941406, + "ref_logps/rejected": -38.59733963012695, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8106211423873901, + "rewards/margins": 1.2383838891983032, + "rewards/rejected": -3.0490050315856934, + "step": 1850 + }, + { + "epoch": 1.75, + "grad_norm": 27.012725830078125, + "learning_rate": 2.3189926547743966e-07, + "logps/chosen": -60.662445068359375, + "logps/rejected": -81.42811584472656, + "loss": 0.3731, + "losses/dpo": 0.3123415410518646, + "losses/sft": 2.4386916160583496, + "losses/total": 0.3123415410518646, + "ref_logps/chosen": -43.75713348388672, + "ref_logps/rejected": -49.12187957763672, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6905311346054077, + "rewards/margins": 1.5400934219360352, + "rewards/rejected": -3.2306246757507324, + "step": 1851 + }, + { + "epoch": 1.75, + "grad_norm": 30.193464279174805, + "learning_rate": 2.3172437915355018e-07, + "logps/chosen": -63.7210693359375, + "logps/rejected": -66.50696563720703, + "loss": 0.4058, + "losses/dpo": 0.6251382231712341, + "losses/sft": 2.281372547149658, + "losses/total": 0.6251382231712341, + "ref_logps/chosen": -44.03181457519531, + "ref_logps/rejected": -36.02568435668945, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9689252376556396, + "rewards/margins": 1.0792032480239868, + "rewards/rejected": -3.048128604888916, + "step": 1852 + }, + { + "epoch": 1.75, + "grad_norm": 25.303871154785156, + "learning_rate": 2.3154949282966071e-07, + "logps/chosen": -53.38203430175781, + "logps/rejected": -78.54144287109375, + "loss": 0.3124, + "losses/dpo": 0.5744849443435669, + "losses/sft": 2.2462775707244873, + "losses/total": 0.5744849443435669, + "ref_logps/chosen": -40.01557159423828, + "ref_logps/rejected": -49.328025817871094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.336646556854248, + "rewards/margins": 1.584695816040039, + "rewards/rejected": -2.921342372894287, + "step": 1853 + }, + { + "epoch": 1.75, + "grad_norm": 18.691640853881836, + "learning_rate": 2.3137460650577123e-07, + "logps/chosen": -44.97010803222656, + "logps/rejected": -75.71627044677734, + "loss": 0.321, + "losses/dpo": 0.175616055727005, + "losses/sft": 1.6014487743377686, + "losses/total": 0.175616055727005, + "ref_logps/chosen": -32.952781677246094, + "ref_logps/rejected": -47.409908294677734, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.201733112335205, + "rewards/margins": 1.6289032697677612, + "rewards/rejected": -2.8306362628936768, + "step": 1854 + }, + { + "epoch": 1.75, + "grad_norm": 15.648659706115723, + "learning_rate": 2.3119972018188177e-07, + "logps/chosen": -34.525760650634766, + "logps/rejected": -66.15505981445312, + "loss": 0.276, + "losses/dpo": 0.4646318554878235, + "losses/sft": 1.5340651273727417, + "losses/total": 0.4646318554878235, + "ref_logps/chosen": -27.546180725097656, + "ref_logps/rejected": -41.33037567138672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6979577541351318, + "rewards/margins": 1.7845112085342407, + "rewards/rejected": -2.482469081878662, + "step": 1855 + }, + { + "epoch": 1.75, + "grad_norm": 21.946725845336914, + "learning_rate": 2.3102483385799228e-07, + "logps/chosen": -53.30470275878906, + "logps/rejected": -75.9454574584961, + "loss": 0.3318, + "losses/dpo": 0.5257679224014282, + "losses/sft": 1.831146240234375, + "losses/total": 0.5257679224014282, + "ref_logps/chosen": -38.76506042480469, + "ref_logps/rejected": -46.32561492919922, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4539637565612793, + "rewards/margins": 1.5080205202102661, + "rewards/rejected": -2.961984395980835, + "step": 1856 + }, + { + "epoch": 1.75, + "grad_norm": 24.31056022644043, + "learning_rate": 2.3084994753410284e-07, + "logps/chosen": -61.453392028808594, + "logps/rejected": -68.18570709228516, + "loss": 0.459, + "losses/dpo": 0.37328892946243286, + "losses/sft": 2.5342836380004883, + "losses/total": 0.37328892946243286, + "ref_logps/chosen": -43.84550857543945, + "ref_logps/rejected": -42.47376251220703, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7607884407043457, + "rewards/margins": 0.810405969619751, + "rewards/rejected": -2.5711944103240967, + "step": 1857 + }, + { + "epoch": 1.75, + "grad_norm": 25.795124053955078, + "learning_rate": 2.3067506121021336e-07, + "logps/chosen": -58.45408630371094, + "logps/rejected": -75.3165283203125, + "loss": 0.4595, + "losses/dpo": 0.39884889125823975, + "losses/sft": 1.8553930521011353, + "losses/total": 0.39884889125823975, + "ref_logps/chosen": -39.70906448364258, + "ref_logps/rejected": -43.39494705200195, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8745019435882568, + "rewards/margins": 1.3176555633544922, + "rewards/rejected": -3.192157506942749, + "step": 1858 + }, + { + "epoch": 1.76, + "grad_norm": 15.022128105163574, + "learning_rate": 2.3050017488632387e-07, + "logps/chosen": -58.80267333984375, + "logps/rejected": -86.05952453613281, + "loss": 0.2289, + "losses/dpo": 0.3939743638038635, + "losses/sft": 2.242748737335205, + "losses/total": 0.3939743638038635, + "ref_logps/chosen": -44.415199279785156, + "ref_logps/rejected": -53.97506332397461, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4387474060058594, + "rewards/margins": 1.7696987390518188, + "rewards/rejected": -3.2084462642669678, + "step": 1859 + }, + { + "epoch": 1.76, + "grad_norm": 31.836544036865234, + "learning_rate": 2.303252885624344e-07, + "logps/chosen": -58.42820739746094, + "logps/rejected": -78.09474182128906, + "loss": 0.5332, + "losses/dpo": 0.4501548111438751, + "losses/sft": 1.7386133670806885, + "losses/total": 0.4501548111438751, + "ref_logps/chosen": -41.048240661621094, + "ref_logps/rejected": -49.97856903076172, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7379968166351318, + "rewards/margins": 1.073620319366455, + "rewards/rejected": -2.811617136001587, + "step": 1860 + }, + { + "epoch": 1.76, + "grad_norm": 29.845956802368164, + "learning_rate": 2.3015040223854492e-07, + "logps/chosen": -46.830650329589844, + "logps/rejected": -53.74964141845703, + "loss": 0.5937, + "losses/dpo": 0.7010772824287415, + "losses/sft": 1.4836887121200562, + "losses/total": 0.7010772824287415, + "ref_logps/chosen": -33.31001281738281, + "ref_logps/rejected": -32.46253204345703, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3520640134811401, + "rewards/margins": 0.7766469717025757, + "rewards/rejected": -2.128710985183716, + "step": 1861 + }, + { + "epoch": 1.76, + "grad_norm": 27.349044799804688, + "learning_rate": 2.2997551591465546e-07, + "logps/chosen": -61.350276947021484, + "logps/rejected": -81.38579559326172, + "loss": 0.4225, + "losses/dpo": 0.19630475342273712, + "losses/sft": 1.9088757038116455, + "losses/total": 0.19630475342273712, + "ref_logps/chosen": -41.3489990234375, + "ref_logps/rejected": -50.82018280029297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0001277923583984, + "rewards/margins": 1.0564332008361816, + "rewards/rejected": -3.056561231613159, + "step": 1862 + }, + { + "epoch": 1.76, + "grad_norm": 23.62197494506836, + "learning_rate": 2.29800629590766e-07, + "logps/chosen": -49.401824951171875, + "logps/rejected": -73.51557159423828, + "loss": 0.4025, + "losses/dpo": 0.6817601919174194, + "losses/sft": 1.966322660446167, + "losses/total": 0.6817601919174194, + "ref_logps/chosen": -36.905059814453125, + "ref_logps/rejected": -44.787437438964844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2496763467788696, + "rewards/margins": 1.6231366395950317, + "rewards/rejected": -2.8728129863739014, + "step": 1863 + }, + { + "epoch": 1.76, + "grad_norm": 22.961708068847656, + "learning_rate": 2.2962574326687654e-07, + "logps/chosen": -46.41362762451172, + "logps/rejected": -69.81623840332031, + "loss": 0.4347, + "losses/dpo": 0.19165581464767456, + "losses/sft": 1.7985776662826538, + "losses/total": 0.19165581464767456, + "ref_logps/chosen": -32.241111755371094, + "ref_logps/rejected": -44.07400131225586, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4172515869140625, + "rewards/margins": 1.1569719314575195, + "rewards/rejected": -2.574223518371582, + "step": 1864 + }, + { + "epoch": 1.76, + "grad_norm": 32.46565246582031, + "learning_rate": 2.2945085694298705e-07, + "logps/chosen": -61.946746826171875, + "logps/rejected": -59.74442672729492, + "loss": 0.6418, + "losses/dpo": 0.5373279452323914, + "losses/sft": 2.1824910640716553, + "losses/total": 0.5373279452323914, + "ref_logps/chosen": -45.1789665222168, + "ref_logps/rejected": -36.91200256347656, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6767781972885132, + "rewards/margins": 0.6064640283584595, + "rewards/rejected": -2.2832422256469727, + "step": 1865 + }, + { + "epoch": 1.76, + "grad_norm": 25.684823989868164, + "learning_rate": 2.2927597061909756e-07, + "logps/chosen": -58.58709716796875, + "logps/rejected": -70.03372192382812, + "loss": 0.4293, + "losses/dpo": 0.32440823316574097, + "losses/sft": 1.9917763471603394, + "losses/total": 0.32440823316574097, + "ref_logps/chosen": -42.10264587402344, + "ref_logps/rejected": -42.35670852661133, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6484451293945312, + "rewards/margins": 1.1192561388015747, + "rewards/rejected": -2.7677011489868164, + "step": 1866 + }, + { + "epoch": 1.76, + "grad_norm": 21.3986759185791, + "learning_rate": 2.291010842952081e-07, + "logps/chosen": -48.57206344604492, + "logps/rejected": -74.04078674316406, + "loss": 0.3395, + "losses/dpo": 0.37597930431365967, + "losses/sft": 1.6027623414993286, + "losses/total": 0.37597930431365967, + "ref_logps/chosen": -34.158077239990234, + "ref_logps/rejected": -44.176666259765625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4413988590240479, + "rewards/margins": 1.5450128316879272, + "rewards/rejected": -2.9864115715026855, + "step": 1867 + }, + { + "epoch": 1.76, + "grad_norm": 23.15447425842285, + "learning_rate": 2.2892619797131862e-07, + "logps/chosen": -62.379058837890625, + "logps/rejected": -90.55410766601562, + "loss": 0.4447, + "losses/dpo": 0.4550441801548004, + "losses/sft": 2.0679984092712402, + "losses/total": 0.4550441801548004, + "ref_logps/chosen": -45.16303634643555, + "ref_logps/rejected": -58.38995361328125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7216026782989502, + "rewards/margins": 1.4948129653930664, + "rewards/rejected": -3.2164158821105957, + "step": 1868 + }, + { + "epoch": 1.76, + "grad_norm": 24.27429962158203, + "learning_rate": 2.2875131164742918e-07, + "logps/chosen": -49.776588439941406, + "logps/rejected": -69.47943115234375, + "loss": 0.4242, + "losses/dpo": 0.5417318940162659, + "losses/sft": 2.0911362171173096, + "losses/total": 0.5417318940162659, + "ref_logps/chosen": -35.80033493041992, + "ref_logps/rejected": -44.90788269042969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.397625207901001, + "rewards/margins": 1.0595295429229736, + "rewards/rejected": -2.4571547508239746, + "step": 1869 + }, + { + "epoch": 1.77, + "grad_norm": 23.325531005859375, + "learning_rate": 2.285764253235397e-07, + "logps/chosen": -47.95051574707031, + "logps/rejected": -61.81595993041992, + "loss": 0.4171, + "losses/dpo": 0.7720044255256653, + "losses/sft": 1.924033761024475, + "losses/total": 0.7720044255256653, + "ref_logps/chosen": -35.24822235107422, + "ref_logps/rejected": -38.147430419921875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2702295780181885, + "rewards/margins": 1.0966236591339111, + "rewards/rejected": -2.3668532371520996, + "step": 1870 + }, + { + "epoch": 1.77, + "grad_norm": 22.157634735107422, + "learning_rate": 2.2840153899965023e-07, + "logps/chosen": -40.46189880371094, + "logps/rejected": -58.247581481933594, + "loss": 0.4623, + "losses/dpo": 0.4775947034358978, + "losses/sft": 1.3777129650115967, + "losses/total": 0.4775947034358978, + "ref_logps/chosen": -26.85367774963379, + "ref_logps/rejected": -37.701988220214844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3608219623565674, + "rewards/margins": 0.6937375068664551, + "rewards/rejected": -2.0545594692230225, + "step": 1871 + }, + { + "epoch": 1.77, + "grad_norm": 20.059139251708984, + "learning_rate": 2.2822665267576075e-07, + "logps/chosen": -47.75612258911133, + "logps/rejected": -69.51106262207031, + "loss": 0.4207, + "losses/dpo": 0.5288746356964111, + "losses/sft": 1.7966135740280151, + "losses/total": 0.5288746356964111, + "ref_logps/chosen": -33.53255844116211, + "ref_logps/rejected": -42.634315490722656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4223566055297852, + "rewards/margins": 1.2653181552886963, + "rewards/rejected": -2.6876747608184814, + "step": 1872 + }, + { + "epoch": 1.77, + "grad_norm": 32.08280944824219, + "learning_rate": 2.2805176635187126e-07, + "logps/chosen": -56.78495788574219, + "logps/rejected": -76.56369018554688, + "loss": 0.5646, + "losses/dpo": 0.4444050192832947, + "losses/sft": 2.156550407409668, + "losses/total": 0.4444050192832947, + "ref_logps/chosen": -38.83460998535156, + "ref_logps/rejected": -50.37071990966797, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.795034408569336, + "rewards/margins": 0.8242623805999756, + "rewards/rejected": -2.6192967891693115, + "step": 1873 + }, + { + "epoch": 1.77, + "grad_norm": 16.14358139038086, + "learning_rate": 2.278768800279818e-07, + "logps/chosen": -39.84698486328125, + "logps/rejected": -73.8427734375, + "loss": 0.2922, + "losses/dpo": 0.3803018629550934, + "losses/sft": 1.5653516054153442, + "losses/total": 0.3803018629550934, + "ref_logps/chosen": -29.583412170410156, + "ref_logps/rejected": -42.70185852050781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0263575315475464, + "rewards/margins": 2.0877342224121094, + "rewards/rejected": -3.114091634750366, + "step": 1874 + }, + { + "epoch": 1.77, + "grad_norm": 22.408313751220703, + "learning_rate": 2.277019937040923e-07, + "logps/chosen": -54.31877899169922, + "logps/rejected": -65.66378021240234, + "loss": 0.3885, + "losses/dpo": 0.44363775849342346, + "losses/sft": 1.8331607580184937, + "losses/total": 0.44363775849342346, + "ref_logps/chosen": -37.31925964355469, + "ref_logps/rejected": -39.318138122558594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6999518871307373, + "rewards/margins": 0.9346121549606323, + "rewards/rejected": -2.63456392288208, + "step": 1875 + }, + { + "epoch": 1.77, + "grad_norm": 22.377567291259766, + "learning_rate": 2.2752710738020288e-07, + "logps/chosen": -38.93741989135742, + "logps/rejected": -54.418758392333984, + "loss": 0.4623, + "losses/dpo": 0.25928351283073425, + "losses/sft": 1.7262192964553833, + "losses/total": 0.25928351283073425, + "ref_logps/chosen": -30.986614227294922, + "ref_logps/rejected": -32.59572982788086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7950806617736816, + "rewards/margins": 1.3872219324111938, + "rewards/rejected": -2.182302474975586, + "step": 1876 + }, + { + "epoch": 1.77, + "grad_norm": 23.78643035888672, + "learning_rate": 2.273522210563134e-07, + "logps/chosen": -40.39408493041992, + "logps/rejected": -62.54866027832031, + "loss": 0.4088, + "losses/dpo": 0.5053045153617859, + "losses/sft": 1.2312461137771606, + "losses/total": 0.5053045153617859, + "ref_logps/chosen": -29.41988754272461, + "ref_logps/rejected": -38.173805236816406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0974197387695312, + "rewards/margins": 1.3400654792785645, + "rewards/rejected": -2.4374852180480957, + "step": 1877 + }, + { + "epoch": 1.77, + "grad_norm": 24.650856018066406, + "learning_rate": 2.2717733473242393e-07, + "logps/chosen": -59.539485931396484, + "logps/rejected": -70.56353759765625, + "loss": 0.4161, + "losses/dpo": 0.5482534170150757, + "losses/sft": 2.2541956901550293, + "losses/total": 0.5482534170150757, + "ref_logps/chosen": -42.4941520690918, + "ref_logps/rejected": -42.365028381347656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.704533576965332, + "rewards/margins": 1.1153171062469482, + "rewards/rejected": -2.8198506832122803, + "step": 1878 + }, + { + "epoch": 1.77, + "grad_norm": 18.941280364990234, + "learning_rate": 2.2700244840853444e-07, + "logps/chosen": -46.671268463134766, + "logps/rejected": -68.30020141601562, + "loss": 0.2868, + "losses/dpo": 0.24786376953125, + "losses/sft": 1.248977780342102, + "losses/total": 0.24786376953125, + "ref_logps/chosen": -35.240299224853516, + "ref_logps/rejected": -41.1260871887207, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.143097162246704, + "rewards/margins": 1.574313759803772, + "rewards/rejected": -2.7174108028411865, + "step": 1879 + }, + { + "epoch": 1.78, + "grad_norm": 21.882057189941406, + "learning_rate": 2.2682756208464495e-07, + "logps/chosen": -51.05032730102539, + "logps/rejected": -79.54434204101562, + "loss": 0.3798, + "losses/dpo": 0.3674437999725342, + "losses/sft": 1.7481441497802734, + "losses/total": 0.3674437999725342, + "ref_logps/chosen": -37.21192169189453, + "ref_logps/rejected": -53.069183349609375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3838404417037964, + "rewards/margins": 1.263675332069397, + "rewards/rejected": -2.6475157737731934, + "step": 1880 + }, + { + "epoch": 1.78, + "grad_norm": 19.097293853759766, + "learning_rate": 2.266526757607555e-07, + "logps/chosen": -49.598358154296875, + "logps/rejected": -68.81123352050781, + "loss": 0.4006, + "losses/dpo": 0.2633708417415619, + "losses/sft": 1.7340539693832397, + "losses/total": 0.2633708417415619, + "ref_logps/chosen": -32.70411682128906, + "ref_logps/rejected": -41.301082611083984, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6894242763519287, + "rewards/margins": 1.0615906715393066, + "rewards/rejected": -2.7510147094726562, + "step": 1881 + }, + { + "epoch": 1.78, + "grad_norm": 27.385522842407227, + "learning_rate": 2.2647778943686603e-07, + "logps/chosen": -55.36388397216797, + "logps/rejected": -72.80226135253906, + "loss": 0.4856, + "losses/dpo": 0.5916553139686584, + "losses/sft": 1.4422340393066406, + "losses/total": 0.5916553139686584, + "ref_logps/chosen": -41.77886199951172, + "ref_logps/rejected": -47.21708679199219, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.358501672744751, + "rewards/margins": 1.200016736984253, + "rewards/rejected": -2.558518409729004, + "step": 1882 + }, + { + "epoch": 1.78, + "grad_norm": 22.030410766601562, + "learning_rate": 2.2630290311297657e-07, + "logps/chosen": -49.24067687988281, + "logps/rejected": -82.13379669189453, + "loss": 0.3313, + "losses/dpo": 0.2792872190475464, + "losses/sft": 1.9741783142089844, + "losses/total": 0.2792872190475464, + "ref_logps/chosen": -33.81756591796875, + "ref_logps/rejected": -51.035560607910156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.542311668395996, + "rewards/margins": 1.567512035369873, + "rewards/rejected": -3.10982346534729, + "step": 1883 + }, + { + "epoch": 1.78, + "grad_norm": 19.523178100585938, + "learning_rate": 2.2612801678908708e-07, + "logps/chosen": -48.15979766845703, + "logps/rejected": -64.07328796386719, + "loss": 0.3526, + "losses/dpo": 0.44792261719703674, + "losses/sft": 1.7503501176834106, + "losses/total": 0.44792261719703674, + "ref_logps/chosen": -36.24867248535156, + "ref_logps/rejected": -37.80896759033203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1911125183105469, + "rewards/margins": 1.435319423675537, + "rewards/rejected": -2.626431941986084, + "step": 1884 + }, + { + "epoch": 1.78, + "grad_norm": 28.6821346282959, + "learning_rate": 2.2595313046519762e-07, + "logps/chosen": -37.17652893066406, + "logps/rejected": -51.31489562988281, + "loss": 0.5727, + "losses/dpo": 0.5008769631385803, + "losses/sft": 1.2679917812347412, + "losses/total": 0.5008769631385803, + "ref_logps/chosen": -24.934823989868164, + "ref_logps/rejected": -32.09640121459961, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.224170446395874, + "rewards/margins": 0.6976792216300964, + "rewards/rejected": -1.9218497276306152, + "step": 1885 + }, + { + "epoch": 1.78, + "grad_norm": 15.205330848693848, + "learning_rate": 2.2577824414130813e-07, + "logps/chosen": -54.19927215576172, + "logps/rejected": -79.3831787109375, + "loss": 0.2253, + "losses/dpo": 0.2139306217432022, + "losses/sft": 1.5638699531555176, + "losses/total": 0.2139306217432022, + "ref_logps/chosen": -40.40202331542969, + "ref_logps/rejected": -50.48923873901367, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3797242641448975, + "rewards/margins": 1.5096697807312012, + "rewards/rejected": -2.8893942832946777, + "step": 1886 + }, + { + "epoch": 1.78, + "grad_norm": 32.752071380615234, + "learning_rate": 2.2560335781741865e-07, + "logps/chosen": -61.3519172668457, + "logps/rejected": -73.46245574951172, + "loss": 0.6125, + "losses/dpo": 0.27496880292892456, + "losses/sft": 1.528637409210205, + "losses/total": 0.27496880292892456, + "ref_logps/chosen": -43.78230667114258, + "ref_logps/rejected": -47.55207061767578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7569612264633179, + "rewards/margins": 0.8340772390365601, + "rewards/rejected": -2.591038227081299, + "step": 1887 + }, + { + "epoch": 1.78, + "grad_norm": 15.8610200881958, + "learning_rate": 2.2542847149352919e-07, + "logps/chosen": -50.55162811279297, + "logps/rejected": -79.12870788574219, + "loss": 0.2502, + "losses/dpo": 0.138380765914917, + "losses/sft": 2.214749574661255, + "losses/total": 0.138380765914917, + "ref_logps/chosen": -37.85062026977539, + "ref_logps/rejected": -50.561622619628906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2701011896133423, + "rewards/margins": 1.5866076946258545, + "rewards/rejected": -2.8567090034484863, + "step": 1888 + }, + { + "epoch": 1.78, + "grad_norm": 22.326772689819336, + "learning_rate": 2.2525358516963973e-07, + "logps/chosen": -55.27908706665039, + "logps/rejected": -86.50544738769531, + "loss": 0.3719, + "losses/dpo": 0.1820342242717743, + "losses/sft": 1.4802669286727905, + "losses/total": 0.1820342242717743, + "ref_logps/chosen": -37.24418258666992, + "ref_logps/rejected": -55.28599166870117, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.803490161895752, + "rewards/margins": 1.3184552192687988, + "rewards/rejected": -3.121945381164551, + "step": 1889 + }, + { + "epoch": 1.78, + "grad_norm": 21.98392677307129, + "learning_rate": 2.2507869884575026e-07, + "logps/chosen": -58.31983947753906, + "logps/rejected": -82.4329833984375, + "loss": 0.3394, + "losses/dpo": 0.06195071339607239, + "losses/sft": 1.7827625274658203, + "losses/total": 0.06195071339607239, + "ref_logps/chosen": -43.131805419921875, + "ref_logps/rejected": -50.879005432128906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.518803358078003, + "rewards/margins": 1.6365940570831299, + "rewards/rejected": -3.155397415161133, + "step": 1890 + }, + { + "epoch": 1.79, + "grad_norm": 25.910991668701172, + "learning_rate": 2.2490381252186078e-07, + "logps/chosen": -61.02592468261719, + "logps/rejected": -102.15164947509766, + "loss": 0.4079, + "losses/dpo": 0.2999795079231262, + "losses/sft": 1.4605165719985962, + "losses/total": 0.2999795079231262, + "ref_logps/chosen": -40.20536422729492, + "ref_logps/rejected": -63.65682601928711, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0820555686950684, + "rewards/margins": 1.7674269676208496, + "rewards/rejected": -3.849482774734497, + "step": 1891 + }, + { + "epoch": 1.79, + "grad_norm": 27.1877384185791, + "learning_rate": 2.2472892619797132e-07, + "logps/chosen": -56.80752944946289, + "logps/rejected": -65.45315551757812, + "loss": 0.5826, + "losses/dpo": 0.9997936487197876, + "losses/sft": 2.1011176109313965, + "losses/total": 0.9997936487197876, + "ref_logps/chosen": -39.66331481933594, + "ref_logps/rejected": -40.86894226074219, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7144218683242798, + "rewards/margins": 0.7440000772476196, + "rewards/rejected": -2.4584219455718994, + "step": 1892 + }, + { + "epoch": 1.79, + "grad_norm": 21.7919921875, + "learning_rate": 2.2455403987408183e-07, + "logps/chosen": -54.760337829589844, + "logps/rejected": -55.419273376464844, + "loss": 0.3595, + "losses/dpo": 0.6955257058143616, + "losses/sft": 2.0487747192382812, + "losses/total": 0.6955257058143616, + "ref_logps/chosen": -44.6641845703125, + "ref_logps/rejected": -34.3460693359375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.009615182876587, + "rewards/margins": 1.097705364227295, + "rewards/rejected": -2.1073203086853027, + "step": 1893 + }, + { + "epoch": 1.79, + "grad_norm": 15.337175369262695, + "learning_rate": 2.2437915355019234e-07, + "logps/chosen": -51.50416564941406, + "logps/rejected": -100.27093505859375, + "loss": 0.174, + "losses/dpo": 0.35395389795303345, + "losses/sft": 2.671549081802368, + "losses/total": 0.35395389795303345, + "ref_logps/chosen": -34.17205047607422, + "ref_logps/rejected": -58.02793884277344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7332121133804321, + "rewards/margins": 2.4910879135131836, + "rewards/rejected": -4.224300384521484, + "step": 1894 + }, + { + "epoch": 1.79, + "grad_norm": 17.456411361694336, + "learning_rate": 2.242042672263029e-07, + "logps/chosen": -47.11567687988281, + "logps/rejected": -61.513282775878906, + "loss": 0.3349, + "losses/dpo": 0.255015105009079, + "losses/sft": 1.9937318563461304, + "losses/total": 0.255015105009079, + "ref_logps/chosen": -35.42218780517578, + "ref_logps/rejected": -36.297943115234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1693490743637085, + "rewards/margins": 1.3521851301193237, + "rewards/rejected": -2.5215344429016113, + "step": 1895 + }, + { + "epoch": 1.79, + "grad_norm": 21.69206428527832, + "learning_rate": 2.2402938090241342e-07, + "logps/chosen": -56.75226593017578, + "logps/rejected": -63.572509765625, + "loss": 0.421, + "losses/dpo": 0.4266464412212372, + "losses/sft": 2.0657413005828857, + "losses/total": 0.4266464412212372, + "ref_logps/chosen": -40.8559455871582, + "ref_logps/rejected": -36.38270568847656, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5896315574645996, + "rewards/margins": 1.1293489933013916, + "rewards/rejected": -2.718980550765991, + "step": 1896 + }, + { + "epoch": 1.79, + "grad_norm": 24.08820343017578, + "learning_rate": 2.2385449457852396e-07, + "logps/chosen": -52.32395935058594, + "logps/rejected": -86.34336853027344, + "loss": 0.3095, + "losses/dpo": 0.132833331823349, + "losses/sft": 1.6938538551330566, + "losses/total": 0.132833331823349, + "ref_logps/chosen": -37.865142822265625, + "ref_logps/rejected": -53.67011642456055, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4458816051483154, + "rewards/margins": 1.8214439153671265, + "rewards/rejected": -3.2673256397247314, + "step": 1897 + }, + { + "epoch": 1.79, + "grad_norm": 24.440874099731445, + "learning_rate": 2.2367960825463447e-07, + "logps/chosen": -48.53782272338867, + "logps/rejected": -75.16322326660156, + "loss": 0.3891, + "losses/dpo": 0.7033171057701111, + "losses/sft": 2.48795223236084, + "losses/total": 0.7033171057701111, + "ref_logps/chosen": -33.62128829956055, + "ref_logps/rejected": -41.39613723754883, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.491653561592102, + "rewards/margins": 1.885054349899292, + "rewards/rejected": -3.3767080307006836, + "step": 1898 + }, + { + "epoch": 1.79, + "grad_norm": 28.448230743408203, + "learning_rate": 2.23504721930745e-07, + "logps/chosen": -51.7152099609375, + "logps/rejected": -67.97134399414062, + "loss": 0.4612, + "losses/dpo": 0.4561270773410797, + "losses/sft": 2.098419189453125, + "losses/total": 0.4561270773410797, + "ref_logps/chosen": -34.344764709472656, + "ref_logps/rejected": -39.8526611328125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.737044334411621, + "rewards/margins": 1.0748240947723389, + "rewards/rejected": -2.811868190765381, + "step": 1899 + }, + { + "epoch": 1.79, + "grad_norm": 27.743558883666992, + "learning_rate": 2.2332983560685552e-07, + "logps/chosen": -72.33219909667969, + "logps/rejected": -100.0615005493164, + "loss": 0.4026, + "losses/dpo": 0.13542529940605164, + "losses/sft": 2.0535051822662354, + "losses/total": 0.13542529940605164, + "ref_logps/chosen": -52.81183624267578, + "ref_logps/rejected": -67.53926086425781, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9520361423492432, + "rewards/margins": 1.3001880645751953, + "rewards/rejected": -3.2522242069244385, + "step": 1900 + }, + { + "epoch": 1.8, + "grad_norm": 23.906633377075195, + "learning_rate": 2.2315494928296604e-07, + "logps/chosen": -67.08051300048828, + "logps/rejected": -79.08184814453125, + "loss": 0.3348, + "losses/dpo": 0.3648269772529602, + "losses/sft": 1.8850147724151611, + "losses/total": 0.3648269772529602, + "ref_logps/chosen": -48.83037185668945, + "ref_logps/rejected": -46.057395935058594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.825014591217041, + "rewards/margins": 1.4774305820465088, + "rewards/rejected": -3.302445411682129, + "step": 1901 + }, + { + "epoch": 1.8, + "grad_norm": 28.172666549682617, + "learning_rate": 2.229800629590766e-07, + "logps/chosen": -51.03199005126953, + "logps/rejected": -72.94522094726562, + "loss": 0.4074, + "losses/dpo": 0.616869330406189, + "losses/sft": 1.9095754623413086, + "losses/total": 0.616869330406189, + "ref_logps/chosen": -33.90076446533203, + "ref_logps/rejected": -44.803680419921875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7131222486495972, + "rewards/margins": 1.1010315418243408, + "rewards/rejected": -2.8141536712646484, + "step": 1902 + }, + { + "epoch": 1.8, + "grad_norm": 25.836366653442383, + "learning_rate": 2.2280517663518711e-07, + "logps/chosen": -51.030887603759766, + "logps/rejected": -74.72064208984375, + "loss": 0.4934, + "losses/dpo": 0.9905093312263489, + "losses/sft": 2.5590171813964844, + "losses/total": 0.9905093312263489, + "ref_logps/chosen": -33.1800537109375, + "ref_logps/rejected": -44.894935607910156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7850837707519531, + "rewards/margins": 1.1974868774414062, + "rewards/rejected": -2.9825706481933594, + "step": 1903 + }, + { + "epoch": 1.8, + "grad_norm": 26.964326858520508, + "learning_rate": 2.2263029031129765e-07, + "logps/chosen": -52.39605712890625, + "logps/rejected": -62.85914993286133, + "loss": 0.6155, + "losses/dpo": 0.25205937027931213, + "losses/sft": 1.5752063989639282, + "losses/total": 0.25205937027931213, + "ref_logps/chosen": -34.76396179199219, + "ref_logps/rejected": -37.6327018737793, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.763209581375122, + "rewards/margins": 0.7594348192214966, + "rewards/rejected": -2.522644519805908, + "step": 1904 + }, + { + "epoch": 1.8, + "grad_norm": 22.541709899902344, + "learning_rate": 2.2245540398740817e-07, + "logps/chosen": -42.903236389160156, + "logps/rejected": -58.880409240722656, + "loss": 0.484, + "losses/dpo": 0.5868808627128601, + "losses/sft": 1.7349990606307983, + "losses/total": 0.5868808627128601, + "ref_logps/chosen": -29.372034072875977, + "ref_logps/rejected": -36.09337615966797, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3531203269958496, + "rewards/margins": 0.9255828261375427, + "rewards/rejected": -2.278702974319458, + "step": 1905 + }, + { + "epoch": 1.8, + "grad_norm": 19.29807472229004, + "learning_rate": 2.222805176635187e-07, + "logps/chosen": -51.947608947753906, + "logps/rejected": -80.24134063720703, + "loss": 0.3714, + "losses/dpo": 0.3035661280155182, + "losses/sft": 2.176285743713379, + "losses/total": 0.3035661280155182, + "ref_logps/chosen": -37.67027282714844, + "ref_logps/rejected": -48.94196701049805, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4277340173721313, + "rewards/margins": 1.7022035121917725, + "rewards/rejected": -3.1299376487731934, + "step": 1906 + }, + { + "epoch": 1.8, + "grad_norm": 15.007548332214355, + "learning_rate": 2.2210563133962922e-07, + "logps/chosen": -49.67732238769531, + "logps/rejected": -91.13972473144531, + "loss": 0.184, + "losses/dpo": 0.048366762697696686, + "losses/sft": 1.3094068765640259, + "losses/total": 0.048366762697696686, + "ref_logps/chosen": -36.67182159423828, + "ref_logps/rejected": -52.19459533691406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3005497455596924, + "rewards/margins": 2.593963146209717, + "rewards/rejected": -3.8945131301879883, + "step": 1907 + }, + { + "epoch": 1.8, + "grad_norm": 22.120275497436523, + "learning_rate": 2.2193074501573976e-07, + "logps/chosen": -42.411258697509766, + "logps/rejected": -73.38336181640625, + "loss": 0.3994, + "losses/dpo": 0.4103068709373474, + "losses/sft": 1.3370667695999146, + "losses/total": 0.4103068709373474, + "ref_logps/chosen": -28.12499237060547, + "ref_logps/rejected": -46.879371643066406, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4286270141601562, + "rewards/margins": 1.2217724323272705, + "rewards/rejected": -2.6503994464874268, + "step": 1908 + }, + { + "epoch": 1.8, + "grad_norm": 21.12205696105957, + "learning_rate": 2.217558586918503e-07, + "logps/chosen": -50.89750671386719, + "logps/rejected": -72.23309326171875, + "loss": 0.2866, + "losses/dpo": 0.6609861850738525, + "losses/sft": 1.9567790031433105, + "losses/total": 0.6609861850738525, + "ref_logps/chosen": -39.11732482910156, + "ref_logps/rejected": -41.536781311035156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1780179738998413, + "rewards/margins": 1.8916127681732178, + "rewards/rejected": -3.0696308612823486, + "step": 1909 + }, + { + "epoch": 1.8, + "grad_norm": 27.44991683959961, + "learning_rate": 2.215809723679608e-07, + "logps/chosen": -58.320369720458984, + "logps/rejected": -66.22344970703125, + "loss": 0.6954, + "losses/dpo": 1.1899237632751465, + "losses/sft": 2.1805202960968018, + "losses/total": 1.1899237632751465, + "ref_logps/chosen": -42.65129089355469, + "ref_logps/rejected": -42.52025604248047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5669080018997192, + "rewards/margins": 0.8034118413925171, + "rewards/rejected": -2.3703198432922363, + "step": 1910 + }, + { + "epoch": 1.8, + "grad_norm": 24.695268630981445, + "learning_rate": 2.2140608604407135e-07, + "logps/chosen": -50.96708679199219, + "logps/rejected": -64.78813171386719, + "loss": 0.5172, + "losses/dpo": 0.35559552907943726, + "losses/sft": 1.9054945707321167, + "losses/total": 0.35559552907943726, + "ref_logps/chosen": -39.908721923828125, + "ref_logps/rejected": -42.99611282348633, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1058368682861328, + "rewards/margins": 1.0733649730682373, + "rewards/rejected": -2.17920184135437, + "step": 1911 + }, + { + "epoch": 1.81, + "grad_norm": 28.08316993713379, + "learning_rate": 2.2123119972018186e-07, + "logps/chosen": -58.56736755371094, + "logps/rejected": -68.26663970947266, + "loss": 0.4828, + "losses/dpo": 0.7495432496070862, + "losses/sft": 2.0531625747680664, + "losses/total": 0.7495432496070862, + "ref_logps/chosen": -47.00447082519531, + "ref_logps/rejected": -43.70998764038086, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1562894582748413, + "rewards/margins": 1.2993755340576172, + "rewards/rejected": -2.455665111541748, + "step": 1912 + }, + { + "epoch": 1.81, + "grad_norm": 24.238311767578125, + "learning_rate": 2.210563133962924e-07, + "logps/chosen": -40.96937561035156, + "logps/rejected": -79.12947845458984, + "loss": 0.3677, + "losses/dpo": 0.2563820481300354, + "losses/sft": 1.4979825019836426, + "losses/total": 0.2563820481300354, + "ref_logps/chosen": -31.857439041137695, + "ref_logps/rejected": -51.90716552734375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9111933708190918, + "rewards/margins": 1.8110376596450806, + "rewards/rejected": -2.722230911254883, + "step": 1913 + }, + { + "epoch": 1.81, + "grad_norm": 15.501830101013184, + "learning_rate": 2.2088142707240294e-07, + "logps/chosen": -56.0780029296875, + "logps/rejected": -72.89102172851562, + "loss": 0.2175, + "losses/dpo": 0.11090554296970367, + "losses/sft": 1.774894118309021, + "losses/total": 0.11090554296970367, + "ref_logps/chosen": -42.51144027709961, + "ref_logps/rejected": -39.92144775390625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3566561937332153, + "rewards/margins": 1.9403011798858643, + "rewards/rejected": -3.296957492828369, + "step": 1914 + }, + { + "epoch": 1.81, + "grad_norm": 23.787960052490234, + "learning_rate": 2.2070654074851348e-07, + "logps/chosen": -48.06635284423828, + "logps/rejected": -66.04922485351562, + "loss": 0.4061, + "losses/dpo": 0.4237924814224243, + "losses/sft": 1.9574851989746094, + "losses/total": 0.4237924814224243, + "ref_logps/chosen": -35.15850067138672, + "ref_logps/rejected": -41.25889205932617, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2907847166061401, + "rewards/margins": 1.1882489919662476, + "rewards/rejected": -2.4790337085723877, + "step": 1915 + }, + { + "epoch": 1.81, + "grad_norm": 15.393126487731934, + "learning_rate": 2.20531654424624e-07, + "logps/chosen": -53.83302307128906, + "logps/rejected": -91.80354309082031, + "loss": 0.2203, + "losses/dpo": 0.16131183505058289, + "losses/sft": 1.8837714195251465, + "losses/total": 0.16131183505058289, + "ref_logps/chosen": -39.37416076660156, + "ref_logps/rejected": -56.51863098144531, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.445886492729187, + "rewards/margins": 2.0826048851013184, + "rewards/rejected": -3.528491497039795, + "step": 1916 + }, + { + "epoch": 1.81, + "grad_norm": 21.387184143066406, + "learning_rate": 2.203567681007345e-07, + "logps/chosen": -60.99091339111328, + "logps/rejected": -82.54985046386719, + "loss": 0.4402, + "losses/dpo": 0.2399728000164032, + "losses/sft": 2.080650568008423, + "losses/total": 0.2399728000164032, + "ref_logps/chosen": -44.19255065917969, + "ref_logps/rejected": -50.358707427978516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.679836392402649, + "rewards/margins": 1.539278507232666, + "rewards/rejected": -3.2191147804260254, + "step": 1917 + }, + { + "epoch": 1.81, + "grad_norm": 36.407676696777344, + "learning_rate": 2.2018188177684504e-07, + "logps/chosen": -66.58694458007812, + "logps/rejected": -55.33207702636719, + "loss": 0.7924, + "losses/dpo": 1.0841635465621948, + "losses/sft": 1.6932896375656128, + "losses/total": 1.0841635465621948, + "ref_logps/chosen": -52.79731750488281, + "ref_logps/rejected": -36.23854064941406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3789634704589844, + "rewards/margins": 0.5303901433944702, + "rewards/rejected": -1.9093537330627441, + "step": 1918 + }, + { + "epoch": 1.81, + "grad_norm": 25.639163970947266, + "learning_rate": 2.2000699545295555e-07, + "logps/chosen": -73.86323547363281, + "logps/rejected": -64.93158721923828, + "loss": 0.5128, + "losses/dpo": 0.8661154508590698, + "losses/sft": 1.9866821765899658, + "losses/total": 0.8661154508590698, + "ref_logps/chosen": -57.46168518066406, + "ref_logps/rejected": -38.84584045410156, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6401550769805908, + "rewards/margins": 0.9684202075004578, + "rewards/rejected": -2.6085753440856934, + "step": 1919 + }, + { + "epoch": 1.81, + "grad_norm": 27.515398025512695, + "learning_rate": 2.198321091290661e-07, + "logps/chosen": -43.468780517578125, + "logps/rejected": -51.47468948364258, + "loss": 0.8092, + "losses/dpo": 1.607559084892273, + "losses/sft": 2.081766366958618, + "losses/total": 1.607559084892273, + "ref_logps/chosen": -31.525959014892578, + "ref_logps/rejected": -33.119041442871094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1942824125289917, + "rewards/margins": 0.6412827968597412, + "rewards/rejected": -1.835565209388733, + "step": 1920 + }, + { + "epoch": 1.81, + "grad_norm": 25.673067092895508, + "learning_rate": 2.1965722280517663e-07, + "logps/chosen": -67.26519012451172, + "logps/rejected": -68.62635803222656, + "loss": 0.4484, + "losses/dpo": 0.341810017824173, + "losses/sft": 1.7387341260910034, + "losses/total": 0.341810017824173, + "ref_logps/chosen": -49.15861892700195, + "ref_logps/rejected": -40.55122375488281, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8106573820114136, + "rewards/margins": 0.9968562126159668, + "rewards/rejected": -2.80751371383667, + "step": 1921 + }, + { + "epoch": 1.81, + "grad_norm": 20.926837921142578, + "learning_rate": 2.1948233648128717e-07, + "logps/chosen": -48.886878967285156, + "logps/rejected": -79.37821197509766, + "loss": 0.3511, + "losses/dpo": 0.1634785532951355, + "losses/sft": 1.8016397953033447, + "losses/total": 0.1634785532951355, + "ref_logps/chosen": -34.55718231201172, + "ref_logps/rejected": -49.34617614746094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4329702854156494, + "rewards/margins": 1.5702334642410278, + "rewards/rejected": -3.003203868865967, + "step": 1922 + }, + { + "epoch": 1.82, + "grad_norm": 19.447141647338867, + "learning_rate": 2.1930745015739768e-07, + "logps/chosen": -50.00474548339844, + "logps/rejected": -65.13369750976562, + "loss": 0.3732, + "losses/dpo": 0.50125652551651, + "losses/sft": 1.497388243675232, + "losses/total": 0.50125652551651, + "ref_logps/chosen": -39.18120574951172, + "ref_logps/rejected": -41.847801208496094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0823540687561035, + "rewards/margins": 1.2462358474731445, + "rewards/rejected": -2.328589916229248, + "step": 1923 + }, + { + "epoch": 1.82, + "grad_norm": 23.20139503479004, + "learning_rate": 2.191325638335082e-07, + "logps/chosen": -54.2501220703125, + "logps/rejected": -69.33674621582031, + "loss": 0.4377, + "losses/dpo": 0.40815237164497375, + "losses/sft": 1.551851511001587, + "losses/total": 0.40815237164497375, + "ref_logps/chosen": -42.55793762207031, + "ref_logps/rejected": -42.96839904785156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1692180633544922, + "rewards/margins": 1.4676170349121094, + "rewards/rejected": -2.6368350982666016, + "step": 1924 + }, + { + "epoch": 1.82, + "grad_norm": 18.53638458251953, + "learning_rate": 2.1895767750961874e-07, + "logps/chosen": -46.935935974121094, + "logps/rejected": -68.35185241699219, + "loss": 0.3251, + "losses/dpo": 0.5569038987159729, + "losses/sft": 2.3086202144622803, + "losses/total": 0.5569038987159729, + "ref_logps/chosen": -34.66331481933594, + "ref_logps/rejected": -40.83999252319336, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2272617816925049, + "rewards/margins": 1.5239248275756836, + "rewards/rejected": -2.7511866092681885, + "step": 1925 + }, + { + "epoch": 1.82, + "grad_norm": 25.34766960144043, + "learning_rate": 2.1878279118572925e-07, + "logps/chosen": -47.13518524169922, + "logps/rejected": -63.05792236328125, + "loss": 0.5489, + "losses/dpo": 1.0756182670593262, + "losses/sft": 2.5425522327423096, + "losses/total": 1.0756182670593262, + "ref_logps/chosen": -32.12316131591797, + "ref_logps/rejected": -38.68975067138672, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.50120210647583, + "rewards/margins": 0.9356152415275574, + "rewards/rejected": -2.436817169189453, + "step": 1926 + }, + { + "epoch": 1.82, + "grad_norm": 24.83631706237793, + "learning_rate": 2.1860790486183981e-07, + "logps/chosen": -54.74627685546875, + "logps/rejected": -76.66191101074219, + "loss": 0.3896, + "losses/dpo": 0.22895357012748718, + "losses/sft": 2.293583393096924, + "losses/total": 0.22895357012748718, + "ref_logps/chosen": -38.66905975341797, + "ref_logps/rejected": -44.893951416015625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6077216863632202, + "rewards/margins": 1.5690736770629883, + "rewards/rejected": -3.176795244216919, + "step": 1927 + }, + { + "epoch": 1.82, + "grad_norm": 14.02584171295166, + "learning_rate": 2.1843301853795033e-07, + "logps/chosen": -47.70808410644531, + "logps/rejected": -81.25924682617188, + "loss": 0.2523, + "losses/dpo": 0.3759603202342987, + "losses/sft": 2.0829734802246094, + "losses/total": 0.3759603202342987, + "ref_logps/chosen": -33.322837829589844, + "ref_logps/rejected": -46.36687469482422, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4385244846343994, + "rewards/margins": 2.050712823867798, + "rewards/rejected": -3.4892373085021973, + "step": 1928 + }, + { + "epoch": 1.82, + "grad_norm": 19.891864776611328, + "learning_rate": 2.1825813221406087e-07, + "logps/chosen": -49.841156005859375, + "logps/rejected": -73.42411041259766, + "loss": 0.3344, + "losses/dpo": 0.23590995371341705, + "losses/sft": 1.7683762311935425, + "losses/total": 0.23590995371341705, + "ref_logps/chosen": -34.62481689453125, + "ref_logps/rejected": -43.10502624511719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5216338634490967, + "rewards/margins": 1.5102744102478027, + "rewards/rejected": -3.0319082736968994, + "step": 1929 + }, + { + "epoch": 1.82, + "grad_norm": 19.316604614257812, + "learning_rate": 2.1808324589017138e-07, + "logps/chosen": -47.80896759033203, + "logps/rejected": -75.6249008178711, + "loss": 0.3223, + "losses/dpo": 0.2652219533920288, + "losses/sft": 1.5195528268814087, + "losses/total": 0.2652219533920288, + "ref_logps/chosen": -34.58601760864258, + "ref_logps/rejected": -48.44448471069336, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3222953081130981, + "rewards/margins": 1.395746111869812, + "rewards/rejected": -2.71804141998291, + "step": 1930 + }, + { + "epoch": 1.82, + "grad_norm": 26.199796676635742, + "learning_rate": 2.179083595662819e-07, + "logps/chosen": -48.245792388916016, + "logps/rejected": -78.83716583251953, + "loss": 0.4373, + "losses/dpo": 0.5776351690292358, + "losses/sft": 1.9075573682785034, + "losses/total": 0.5776351690292358, + "ref_logps/chosen": -33.773094177246094, + "ref_logps/rejected": -52.131649017333984, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.447270154953003, + "rewards/margins": 1.2232816219329834, + "rewards/rejected": -2.6705517768859863, + "step": 1931 + }, + { + "epoch": 1.82, + "grad_norm": 24.241119384765625, + "learning_rate": 2.1773347324239243e-07, + "logps/chosen": -51.85051727294922, + "logps/rejected": -64.8966064453125, + "loss": 0.316, + "losses/dpo": 0.11948569118976593, + "losses/sft": 1.355791687965393, + "losses/total": 0.11948569118976593, + "ref_logps/chosen": -41.326324462890625, + "ref_logps/rejected": -37.862464904785156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0524190664291382, + "rewards/margins": 1.6509957313537598, + "rewards/rejected": -2.7034149169921875, + "step": 1932 + }, + { + "epoch": 1.83, + "grad_norm": 26.71185302734375, + "learning_rate": 2.1755858691850294e-07, + "logps/chosen": -43.790252685546875, + "logps/rejected": -63.17491149902344, + "loss": 0.5615, + "losses/dpo": 0.666718602180481, + "losses/sft": 2.239082098007202, + "losses/total": 0.666718602180481, + "ref_logps/chosen": -29.334993362426758, + "ref_logps/rejected": -38.764217376708984, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4455257654190063, + "rewards/margins": 0.9955437779426575, + "rewards/rejected": -2.4410696029663086, + "step": 1933 + }, + { + "epoch": 1.83, + "grad_norm": 18.04010772705078, + "learning_rate": 2.173837005946135e-07, + "logps/chosen": -48.863975524902344, + "logps/rejected": -67.9365234375, + "loss": 0.2917, + "losses/dpo": 0.16975978016853333, + "losses/sft": 1.395847201347351, + "losses/total": 0.16975978016853333, + "ref_logps/chosen": -37.93406295776367, + "ref_logps/rejected": -39.69021224975586, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.092991590499878, + "rewards/margins": 1.731640100479126, + "rewards/rejected": -2.824631690979004, + "step": 1934 + }, + { + "epoch": 1.83, + "grad_norm": 16.366809844970703, + "learning_rate": 2.1720881427072402e-07, + "logps/chosen": -48.70531463623047, + "logps/rejected": -82.24748229980469, + "loss": 0.2509, + "losses/dpo": 0.45168188214302063, + "losses/sft": 2.0609686374664307, + "losses/total": 0.45168188214302063, + "ref_logps/chosen": -35.18280792236328, + "ref_logps/rejected": -51.04685974121094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3522506952285767, + "rewards/margins": 1.7678117752075195, + "rewards/rejected": -3.1200623512268066, + "step": 1935 + }, + { + "epoch": 1.83, + "grad_norm": 21.454904556274414, + "learning_rate": 2.1703392794683456e-07, + "logps/chosen": -44.24781036376953, + "logps/rejected": -55.08128356933594, + "loss": 0.4836, + "losses/dpo": 0.3794158697128296, + "losses/sft": 1.919634461402893, + "losses/total": 0.3794158697128296, + "ref_logps/chosen": -33.00543975830078, + "ref_logps/rejected": -37.07406997680664, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.124237060546875, + "rewards/margins": 0.6764841079711914, + "rewards/rejected": -1.8007211685180664, + "step": 1936 + }, + { + "epoch": 1.83, + "grad_norm": 21.988210678100586, + "learning_rate": 2.1685904162294507e-07, + "logps/chosen": -40.5090446472168, + "logps/rejected": -56.91093444824219, + "loss": 0.3527, + "losses/dpo": 0.8546777963638306, + "losses/sft": 1.8751733303070068, + "losses/total": 0.8546777963638306, + "ref_logps/chosen": -30.719322204589844, + "ref_logps/rejected": -32.112998962402344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9789724349975586, + "rewards/margins": 1.5008208751678467, + "rewards/rejected": -2.4797935485839844, + "step": 1937 + }, + { + "epoch": 1.83, + "grad_norm": 21.11084747314453, + "learning_rate": 2.1668415529905559e-07, + "logps/chosen": -42.49399185180664, + "logps/rejected": -57.91488265991211, + "loss": 0.4605, + "losses/dpo": 0.33433669805526733, + "losses/sft": 1.4056564569473267, + "losses/total": 0.33433669805526733, + "ref_logps/chosen": -28.660724639892578, + "ref_logps/rejected": -35.102294921875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.383326530456543, + "rewards/margins": 0.8979320526123047, + "rewards/rejected": -2.2812585830688477, + "step": 1938 + }, + { + "epoch": 1.83, + "grad_norm": 22.22531509399414, + "learning_rate": 2.1650926897516613e-07, + "logps/chosen": -46.26589584350586, + "logps/rejected": -59.38967514038086, + "loss": 0.4035, + "losses/dpo": 0.7754935026168823, + "losses/sft": 2.324876070022583, + "losses/total": 0.7754935026168823, + "ref_logps/chosen": -31.906780242919922, + "ref_logps/rejected": -33.66438293457031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4359118938446045, + "rewards/margins": 1.1366177797317505, + "rewards/rejected": -2.5725297927856445, + "step": 1939 + }, + { + "epoch": 1.83, + "grad_norm": 28.79388427734375, + "learning_rate": 2.1633438265127666e-07, + "logps/chosen": -55.67766571044922, + "logps/rejected": -59.03409957885742, + "loss": 0.5738, + "losses/dpo": 0.6611247658729553, + "losses/sft": 1.9232362508773804, + "losses/total": 0.6611247658729553, + "ref_logps/chosen": -38.92405700683594, + "ref_logps/rejected": -36.29399490356445, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.675360918045044, + "rewards/margins": 0.5986493825912476, + "rewards/rejected": -2.274010419845581, + "step": 1940 + }, + { + "epoch": 1.83, + "grad_norm": 22.21906852722168, + "learning_rate": 2.161594963273872e-07, + "logps/chosen": -62.16954040527344, + "logps/rejected": -79.95361328125, + "loss": 0.3554, + "losses/dpo": 0.3510781228542328, + "losses/sft": 2.206805944442749, + "losses/total": 0.3510781228542328, + "ref_logps/chosen": -45.09418487548828, + "ref_logps/rejected": -49.92413330078125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.707535982131958, + "rewards/margins": 1.295412540435791, + "rewards/rejected": -3.002948522567749, + "step": 1941 + }, + { + "epoch": 1.83, + "grad_norm": 25.15227508544922, + "learning_rate": 2.1598461000349772e-07, + "logps/chosen": -62.75640869140625, + "logps/rejected": -76.89555358886719, + "loss": 0.4364, + "losses/dpo": 0.23652169108390808, + "losses/sft": 1.976174235343933, + "losses/total": 0.23652169108390808, + "ref_logps/chosen": -46.537086486816406, + "ref_logps/rejected": -44.752655029296875, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.621931791305542, + "rewards/margins": 1.5923576354980469, + "rewards/rejected": -3.214289426803589, + "step": 1942 + }, + { + "epoch": 1.83, + "grad_norm": 27.23257064819336, + "learning_rate": 2.1580972367960826e-07, + "logps/chosen": -56.82284927368164, + "logps/rejected": -80.75686645507812, + "loss": 0.4509, + "losses/dpo": 0.2272796928882599, + "losses/sft": 1.745611548423767, + "losses/total": 0.2272796928882599, + "ref_logps/chosen": -42.16802978515625, + "ref_logps/rejected": -52.64835739135742, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4654819965362549, + "rewards/margins": 1.3453688621520996, + "rewards/rejected": -2.8108506202697754, + "step": 1943 + }, + { + "epoch": 1.84, + "grad_norm": 27.91211700439453, + "learning_rate": 2.1563483735571877e-07, + "logps/chosen": -57.24657440185547, + "logps/rejected": -81.26807403564453, + "loss": 0.4403, + "losses/dpo": 0.4680829644203186, + "losses/sft": 2.068673610687256, + "losses/total": 0.4680829644203186, + "ref_logps/chosen": -41.270355224609375, + "ref_logps/rejected": -54.46880340576172, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5976223945617676, + "rewards/margins": 1.0823047161102295, + "rewards/rejected": -2.679926872253418, + "step": 1944 + }, + { + "epoch": 1.84, + "grad_norm": 17.17719268798828, + "learning_rate": 2.1545995103182928e-07, + "logps/chosen": -52.65362548828125, + "logps/rejected": -76.73627471923828, + "loss": 0.268, + "losses/dpo": 0.25786781311035156, + "losses/sft": 2.301936626434326, + "losses/total": 0.25786781311035156, + "ref_logps/chosen": -39.370201110839844, + "ref_logps/rejected": -48.470130920410156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3283424377441406, + "rewards/margins": 1.4982726573944092, + "rewards/rejected": -2.826615333557129, + "step": 1945 + }, + { + "epoch": 1.84, + "grad_norm": 17.44093894958496, + "learning_rate": 2.1528506470793985e-07, + "logps/chosen": -46.42760467529297, + "logps/rejected": -64.9055404663086, + "loss": 0.2867, + "losses/dpo": 0.15678484737873077, + "losses/sft": 1.9859936237335205, + "losses/total": 0.15678484737873077, + "ref_logps/chosen": -34.89744567871094, + "ref_logps/rejected": -38.69559860229492, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1530159711837769, + "rewards/margins": 1.4679782390594482, + "rewards/rejected": -2.6209943294525146, + "step": 1946 + }, + { + "epoch": 1.84, + "grad_norm": 22.280567169189453, + "learning_rate": 2.1511017838405036e-07, + "logps/chosen": -47.924156188964844, + "logps/rejected": -75.62379455566406, + "loss": 0.4304, + "losses/dpo": 0.24968941509723663, + "losses/sft": 1.5886414051055908, + "losses/total": 0.24968941509723663, + "ref_logps/chosen": -32.75593948364258, + "ref_logps/rejected": -47.03985595703125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.516822099685669, + "rewards/margins": 1.3415718078613281, + "rewards/rejected": -2.858393669128418, + "step": 1947 + }, + { + "epoch": 1.84, + "grad_norm": 26.11994171142578, + "learning_rate": 2.149352920601609e-07, + "logps/chosen": -61.862754821777344, + "logps/rejected": -74.32380676269531, + "loss": 0.424, + "losses/dpo": 0.24698521196842194, + "losses/sft": 1.761254906654358, + "losses/total": 0.24698521196842194, + "ref_logps/chosen": -44.40204620361328, + "ref_logps/rejected": -42.56153106689453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7460711002349854, + "rewards/margins": 1.4301559925079346, + "rewards/rejected": -3.17622709274292, + "step": 1948 + }, + { + "epoch": 1.84, + "grad_norm": 26.30391502380371, + "learning_rate": 2.147604057362714e-07, + "logps/chosen": -64.70724487304688, + "logps/rejected": -87.25625610351562, + "loss": 0.4695, + "losses/dpo": 0.5290414690971375, + "losses/sft": 2.2446036338806152, + "losses/total": 0.5290414690971375, + "ref_logps/chosen": -41.936336517333984, + "ref_logps/rejected": -50.98778533935547, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2770910263061523, + "rewards/margins": 1.3497557640075684, + "rewards/rejected": -3.6268467903137207, + "step": 1949 + }, + { + "epoch": 1.84, + "grad_norm": 21.30093765258789, + "learning_rate": 2.1458551941238195e-07, + "logps/chosen": -47.91936492919922, + "logps/rejected": -73.26781463623047, + "loss": 0.2849, + "losses/dpo": 0.19459158182144165, + "losses/sft": 1.247519850730896, + "losses/total": 0.19459158182144165, + "ref_logps/chosen": -34.45432662963867, + "ref_logps/rejected": -43.84803771972656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3465039730072021, + "rewards/margins": 1.5954742431640625, + "rewards/rejected": -2.9419782161712646, + "step": 1950 + }, + { + "epoch": 1.84, + "grad_norm": 24.17926788330078, + "learning_rate": 2.1441063308849246e-07, + "logps/chosen": -50.253482818603516, + "logps/rejected": -58.501869201660156, + "loss": 0.4728, + "losses/dpo": 0.5476719737052917, + "losses/sft": 2.300201654434204, + "losses/total": 0.5476719737052917, + "ref_logps/chosen": -34.257896423339844, + "ref_logps/rejected": -33.900978088378906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5995583534240723, + "rewards/margins": 0.8605306148529053, + "rewards/rejected": -2.4600892066955566, + "step": 1951 + }, + { + "epoch": 1.84, + "grad_norm": 26.85236358642578, + "learning_rate": 2.1423574676460298e-07, + "logps/chosen": -44.034934997558594, + "logps/rejected": -66.96251678466797, + "loss": 0.5054, + "losses/dpo": 0.23814263939857483, + "losses/sft": 1.3473509550094604, + "losses/total": 0.23814263939857483, + "ref_logps/chosen": -29.715856552124023, + "ref_logps/rejected": -41.30472946166992, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4319078922271729, + "rewards/margins": 1.1338706016540527, + "rewards/rejected": -2.5657787322998047, + "step": 1952 + }, + { + "epoch": 1.84, + "grad_norm": 19.77062225341797, + "learning_rate": 2.1406086044071354e-07, + "logps/chosen": -52.25318145751953, + "logps/rejected": -66.92807006835938, + "loss": 0.3831, + "losses/dpo": 0.5790616273880005, + "losses/sft": 1.7756602764129639, + "losses/total": 0.5790616273880005, + "ref_logps/chosen": -39.21463394165039, + "ref_logps/rejected": -39.28076934814453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3038547039031982, + "rewards/margins": 1.460875153541565, + "rewards/rejected": -2.7647299766540527, + "step": 1953 + }, + { + "epoch": 1.85, + "grad_norm": 22.926481246948242, + "learning_rate": 2.1388597411682405e-07, + "logps/chosen": -52.59754943847656, + "logps/rejected": -73.46942138671875, + "loss": 0.4746, + "losses/dpo": 0.2514271140098572, + "losses/sft": 1.4299970865249634, + "losses/total": 0.2514271140098572, + "ref_logps/chosen": -37.889503479003906, + "ref_logps/rejected": -47.44058609008789, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4708044528961182, + "rewards/margins": 1.1320796012878418, + "rewards/rejected": -2.602884292602539, + "step": 1954 + }, + { + "epoch": 1.85, + "grad_norm": 19.8757266998291, + "learning_rate": 2.137110877929346e-07, + "logps/chosen": -50.41484069824219, + "logps/rejected": -72.51498413085938, + "loss": 0.3462, + "losses/dpo": 0.1194656640291214, + "losses/sft": 2.324993848800659, + "losses/total": 0.1194656640291214, + "ref_logps/chosen": -36.714115142822266, + "ref_logps/rejected": -44.33071517944336, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.370072841644287, + "rewards/margins": 1.4483534097671509, + "rewards/rejected": -2.8184261322021484, + "step": 1955 + }, + { + "epoch": 1.85, + "grad_norm": 28.71787452697754, + "learning_rate": 2.135362014690451e-07, + "logps/chosen": -48.01447296142578, + "logps/rejected": -62.46440124511719, + "loss": 0.6122, + "losses/dpo": 1.064414143562317, + "losses/sft": 1.727824330329895, + "losses/total": 1.064414143562317, + "ref_logps/chosen": -33.594505310058594, + "ref_logps/rejected": -39.40873718261719, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4419970512390137, + "rewards/margins": 0.8635697364807129, + "rewards/rejected": -2.3055667877197266, + "step": 1956 + }, + { + "epoch": 1.85, + "grad_norm": 23.21286964416504, + "learning_rate": 2.1336131514515564e-07, + "logps/chosen": -58.0375862121582, + "logps/rejected": -70.63639831542969, + "loss": 0.37, + "losses/dpo": 0.5791358947753906, + "losses/sft": 1.890541911125183, + "losses/total": 0.5791358947753906, + "ref_logps/chosen": -45.19242477416992, + "ref_logps/rejected": -45.46302032470703, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2845160961151123, + "rewards/margins": 1.2328214645385742, + "rewards/rejected": -2.5173375606536865, + "step": 1957 + }, + { + "epoch": 1.85, + "grad_norm": 15.949946403503418, + "learning_rate": 2.1318642882126616e-07, + "logps/chosen": -33.374114990234375, + "logps/rejected": -72.46894836425781, + "loss": 0.2829, + "losses/dpo": 0.22898954153060913, + "losses/sft": 1.339084267616272, + "losses/total": 0.22898954153060913, + "ref_logps/chosen": -24.883380889892578, + "ref_logps/rejected": -48.905128479003906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8490734100341797, + "rewards/margins": 1.507308840751648, + "rewards/rejected": -2.356382369995117, + "step": 1958 + }, + { + "epoch": 1.85, + "grad_norm": 21.047752380371094, + "learning_rate": 2.130115424973767e-07, + "logps/chosen": -53.520713806152344, + "logps/rejected": -69.3406982421875, + "loss": 0.3651, + "losses/dpo": 0.5611324906349182, + "losses/sft": 1.5742043256759644, + "losses/total": 0.5611324906349182, + "ref_logps/chosen": -38.946136474609375, + "ref_logps/rejected": -40.23695755004883, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4574576616287231, + "rewards/margins": 1.4529160261154175, + "rewards/rejected": -2.9103739261627197, + "step": 1959 + }, + { + "epoch": 1.85, + "grad_norm": 14.455061912536621, + "learning_rate": 2.1283665617348724e-07, + "logps/chosen": -53.4486083984375, + "logps/rejected": -89.52093505859375, + "loss": 0.2338, + "losses/dpo": 0.2984086573123932, + "losses/sft": 1.398449182510376, + "losses/total": 0.2984086573123932, + "ref_logps/chosen": -42.681556701660156, + "ref_logps/rejected": -58.29749298095703, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.076704740524292, + "rewards/margins": 2.0456392765045166, + "rewards/rejected": -3.1223440170288086, + "step": 1960 + }, + { + "epoch": 1.85, + "grad_norm": 22.436206817626953, + "learning_rate": 2.1266176984959775e-07, + "logps/chosen": -77.03755187988281, + "logps/rejected": -88.14137268066406, + "loss": 0.2678, + "losses/dpo": 0.19570860266685486, + "losses/sft": 2.1246986389160156, + "losses/total": 0.19570860266685486, + "ref_logps/chosen": -60.108638763427734, + "ref_logps/rejected": -54.540679931640625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6928906440734863, + "rewards/margins": 1.667177677154541, + "rewards/rejected": -3.3600685596466064, + "step": 1961 + }, + { + "epoch": 1.85, + "grad_norm": 24.994171142578125, + "learning_rate": 2.124868835257083e-07, + "logps/chosen": -58.44688034057617, + "logps/rejected": -67.09022521972656, + "loss": 0.4604, + "losses/dpo": 0.3520793914794922, + "losses/sft": 1.989675760269165, + "losses/total": 0.3520793914794922, + "ref_logps/chosen": -43.235015869140625, + "ref_logps/rejected": -42.84651184082031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.521186113357544, + "rewards/margins": 0.9031853675842285, + "rewards/rejected": -2.4243717193603516, + "step": 1962 + }, + { + "epoch": 1.85, + "grad_norm": 23.424358367919922, + "learning_rate": 2.123119972018188e-07, + "logps/chosen": -54.1328010559082, + "logps/rejected": -72.67156982421875, + "loss": 0.3887, + "losses/dpo": 0.4511217474937439, + "losses/sft": 1.9106628894805908, + "losses/total": 0.4511217474937439, + "ref_logps/chosen": -41.939208984375, + "ref_logps/rejected": -46.20673370361328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2193596363067627, + "rewards/margins": 1.4271247386932373, + "rewards/rejected": -2.646484375, + "step": 1963 + }, + { + "epoch": 1.85, + "grad_norm": 15.990822792053223, + "learning_rate": 2.1213711087792934e-07, + "logps/chosen": -41.56805419921875, + "logps/rejected": -67.26626586914062, + "loss": 0.3243, + "losses/dpo": 0.2783471345901489, + "losses/sft": 1.7799925804138184, + "losses/total": 0.2783471345901489, + "ref_logps/chosen": -32.52330780029297, + "ref_logps/rejected": -42.371612548828125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9044744968414307, + "rewards/margins": 1.5849908590316772, + "rewards/rejected": -2.4894652366638184, + "step": 1964 + }, + { + "epoch": 1.86, + "grad_norm": 31.50393295288086, + "learning_rate": 2.1196222455403988e-07, + "logps/chosen": -64.89741516113281, + "logps/rejected": -79.29817199707031, + "loss": 0.4631, + "losses/dpo": 0.8046008944511414, + "losses/sft": 2.1458544731140137, + "losses/total": 0.8046008944511414, + "ref_logps/chosen": -49.99016571044922, + "ref_logps/rejected": -51.007633209228516, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.490724802017212, + "rewards/margins": 1.3383294343948364, + "rewards/rejected": -2.829054355621338, + "step": 1965 + }, + { + "epoch": 1.86, + "grad_norm": 23.86702537536621, + "learning_rate": 2.117873382301504e-07, + "logps/chosen": -49.21942138671875, + "logps/rejected": -75.91116333007812, + "loss": 0.4431, + "losses/dpo": 0.4857032299041748, + "losses/sft": 1.9156571626663208, + "losses/total": 0.4857032299041748, + "ref_logps/chosen": -35.834869384765625, + "ref_logps/rejected": -50.44806671142578, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3384552001953125, + "rewards/margins": 1.2078537940979004, + "rewards/rejected": -2.546308994293213, + "step": 1966 + }, + { + "epoch": 1.86, + "grad_norm": 25.070886611938477, + "learning_rate": 2.1161245190626093e-07, + "logps/chosen": -50.53087615966797, + "logps/rejected": -68.46553802490234, + "loss": 0.467, + "losses/dpo": 0.4369206428527832, + "losses/sft": 1.8337554931640625, + "losses/total": 0.4369206428527832, + "ref_logps/chosen": -37.08546447753906, + "ref_logps/rejected": -43.849159240722656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3445405960083008, + "rewards/margins": 1.1170969009399414, + "rewards/rejected": -2.461637496948242, + "step": 1967 + }, + { + "epoch": 1.86, + "grad_norm": 20.981592178344727, + "learning_rate": 2.1143756558237144e-07, + "logps/chosen": -48.76057434082031, + "logps/rejected": -72.07445526123047, + "loss": 0.3442, + "losses/dpo": 0.6560444831848145, + "losses/sft": 1.9757767915725708, + "losses/total": 0.6560444831848145, + "ref_logps/chosen": -33.7701530456543, + "ref_logps/rejected": -43.66492462158203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.49904203414917, + "rewards/margins": 1.3419103622436523, + "rewards/rejected": -2.8409523963928223, + "step": 1968 + }, + { + "epoch": 1.86, + "grad_norm": 24.32599449157715, + "learning_rate": 2.1126267925848198e-07, + "logps/chosen": -53.34295654296875, + "logps/rejected": -72.85565185546875, + "loss": 0.364, + "losses/dpo": 0.22470355033874512, + "losses/sft": 1.8783470392227173, + "losses/total": 0.22470355033874512, + "ref_logps/chosen": -37.051368713378906, + "ref_logps/rejected": -42.297752380371094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6291587352752686, + "rewards/margins": 1.426631212234497, + "rewards/rejected": -3.0557899475097656, + "step": 1969 + }, + { + "epoch": 1.86, + "grad_norm": 20.983388900756836, + "learning_rate": 2.110877929345925e-07, + "logps/chosen": -53.12214660644531, + "logps/rejected": -67.75334167480469, + "loss": 0.3623, + "losses/dpo": 0.15745952725410461, + "losses/sft": 1.4009560346603394, + "losses/total": 0.15745952725410461, + "ref_logps/chosen": -41.74241638183594, + "ref_logps/rejected": -41.97954177856445, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1379724740982056, + "rewards/margins": 1.4394073486328125, + "rewards/rejected": -2.5773799419403076, + "step": 1970 + }, + { + "epoch": 1.86, + "grad_norm": 20.522003173828125, + "learning_rate": 2.1091290661070303e-07, + "logps/chosen": -47.230072021484375, + "logps/rejected": -71.14032745361328, + "loss": 0.3682, + "losses/dpo": 0.6743268370628357, + "losses/sft": 1.6973276138305664, + "losses/total": 0.6743268370628357, + "ref_logps/chosen": -34.39399719238281, + "ref_logps/rejected": -42.80510711669922, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.283607840538025, + "rewards/margins": 1.5499138832092285, + "rewards/rejected": -2.833521604537964, + "step": 1971 + }, + { + "epoch": 1.86, + "grad_norm": 17.255136489868164, + "learning_rate": 2.1073802028681357e-07, + "logps/chosen": -47.24890899658203, + "logps/rejected": -74.16438293457031, + "loss": 0.269, + "losses/dpo": 0.42768216133117676, + "losses/sft": 2.1388845443725586, + "losses/total": 0.42768216133117676, + "ref_logps/chosen": -35.4140739440918, + "ref_logps/rejected": -44.785301208496094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1834834814071655, + "rewards/margins": 1.7544245719909668, + "rewards/rejected": -2.9379079341888428, + "step": 1972 + }, + { + "epoch": 1.86, + "grad_norm": 21.254287719726562, + "learning_rate": 2.1056313396292408e-07, + "logps/chosen": -54.706207275390625, + "logps/rejected": -81.81231689453125, + "loss": 0.3673, + "losses/dpo": 0.5571094155311584, + "losses/sft": 2.0986173152923584, + "losses/total": 0.5571094155311584, + "ref_logps/chosen": -40.530128479003906, + "ref_logps/rejected": -48.939971923828125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4176077842712402, + "rewards/margins": 1.869626522064209, + "rewards/rejected": -3.287234306335449, + "step": 1973 + }, + { + "epoch": 1.86, + "grad_norm": 18.496536254882812, + "learning_rate": 2.1038824763903462e-07, + "logps/chosen": -54.24553680419922, + "logps/rejected": -74.15406036376953, + "loss": 0.3115, + "losses/dpo": 0.4126966893672943, + "losses/sft": 2.081036329269409, + "losses/total": 0.4126966893672943, + "ref_logps/chosen": -41.41372299194336, + "ref_logps/rejected": -46.38686752319336, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2831816673278809, + "rewards/margins": 1.4935376644134521, + "rewards/rejected": -2.776719331741333, + "step": 1974 + }, + { + "epoch": 1.86, + "grad_norm": 33.76393508911133, + "learning_rate": 2.1021336131514514e-07, + "logps/chosen": -55.534854888916016, + "logps/rejected": -78.84536743164062, + "loss": 0.6097, + "losses/dpo": 0.2924114763736725, + "losses/sft": 1.6932238340377808, + "losses/total": 0.2924114763736725, + "ref_logps/chosen": -38.251007080078125, + "ref_logps/rejected": -48.34223175048828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7283849716186523, + "rewards/margins": 1.3219285011291504, + "rewards/rejected": -3.0503134727478027, + "step": 1975 + }, + { + "epoch": 1.87, + "grad_norm": 20.681589126586914, + "learning_rate": 2.1003847499125568e-07, + "logps/chosen": -65.04346466064453, + "logps/rejected": -80.50343322753906, + "loss": 0.3028, + "losses/dpo": 0.29852724075317383, + "losses/sft": 2.32308292388916, + "losses/total": 0.29852724075317383, + "ref_logps/chosen": -48.152496337890625, + "ref_logps/rejected": -49.82281494140625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6890971660614014, + "rewards/margins": 1.3789646625518799, + "rewards/rejected": -3.068061590194702, + "step": 1976 + }, + { + "epoch": 1.87, + "grad_norm": 27.520835876464844, + "learning_rate": 2.098635886673662e-07, + "logps/chosen": -44.01696014404297, + "logps/rejected": -71.24827575683594, + "loss": 0.4254, + "losses/dpo": 0.835005521774292, + "losses/sft": 1.3879934549331665, + "losses/total": 0.835005521774292, + "ref_logps/chosen": -32.514488220214844, + "ref_logps/rejected": -47.72511291503906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1502470970153809, + "rewards/margins": 1.2020692825317383, + "rewards/rejected": -2.352316379547119, + "step": 1977 + }, + { + "epoch": 1.87, + "grad_norm": 26.55544662475586, + "learning_rate": 2.0968870234347675e-07, + "logps/chosen": -51.0809440612793, + "logps/rejected": -57.31834411621094, + "loss": 0.5645, + "losses/dpo": 0.3740193247795105, + "losses/sft": 1.4272371530532837, + "losses/total": 0.3740193247795105, + "ref_logps/chosen": -37.47397994995117, + "ref_logps/rejected": -36.893035888671875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3606963157653809, + "rewards/margins": 0.6818346977233887, + "rewards/rejected": -2.0425310134887695, + "step": 1978 + }, + { + "epoch": 1.87, + "grad_norm": 23.518918991088867, + "learning_rate": 2.0951381601958727e-07, + "logps/chosen": -51.69788360595703, + "logps/rejected": -70.53106689453125, + "loss": 0.4227, + "losses/dpo": 0.46027320623397827, + "losses/sft": 2.0539302825927734, + "losses/total": 0.46027320623397827, + "ref_logps/chosen": -38.222206115722656, + "ref_logps/rejected": -46.31585693359375, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3475680351257324, + "rewards/margins": 1.0739524364471436, + "rewards/rejected": -2.421520709991455, + "step": 1979 + }, + { + "epoch": 1.87, + "grad_norm": 19.48638343811035, + "learning_rate": 2.0933892969569778e-07, + "logps/chosen": -47.801025390625, + "logps/rejected": -76.14320373535156, + "loss": 0.2804, + "losses/dpo": 0.06693714112043381, + "losses/sft": 1.7252448797225952, + "losses/total": 0.06693714112043381, + "ref_logps/chosen": -37.69862365722656, + "ref_logps/rejected": -43.908775329589844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.010240077972412, + "rewards/margins": 2.213202476501465, + "rewards/rejected": -3.223442554473877, + "step": 1980 + }, + { + "epoch": 1.87, + "grad_norm": 19.88045310974121, + "learning_rate": 2.0916404337180832e-07, + "logps/chosen": -45.399925231933594, + "logps/rejected": -62.11358642578125, + "loss": 0.3748, + "losses/dpo": 0.19484129548072815, + "losses/sft": 1.7399189472198486, + "losses/total": 0.19484129548072815, + "ref_logps/chosen": -35.95106506347656, + "ref_logps/rejected": -39.6606559753418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9448865652084351, + "rewards/margins": 1.3004069328308105, + "rewards/rejected": -2.245293617248535, + "step": 1981 + }, + { + "epoch": 1.87, + "grad_norm": 22.77387809753418, + "learning_rate": 2.0898915704791883e-07, + "logps/chosen": -49.070858001708984, + "logps/rejected": -62.60783386230469, + "loss": 0.399, + "losses/dpo": 0.4592130184173584, + "losses/sft": 1.679467797279358, + "losses/total": 0.4592130184173584, + "ref_logps/chosen": -36.97407531738281, + "ref_logps/rejected": -37.21660232543945, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2096785306930542, + "rewards/margins": 1.3294445276260376, + "rewards/rejected": -2.539123058319092, + "step": 1982 + }, + { + "epoch": 1.87, + "grad_norm": 31.689556121826172, + "learning_rate": 2.0881427072402937e-07, + "logps/chosen": -68.82721710205078, + "logps/rejected": -70.45463562011719, + "loss": 0.4845, + "losses/dpo": 0.4984961152076721, + "losses/sft": 2.4181315898895264, + "losses/total": 0.4984961152076721, + "ref_logps/chosen": -51.806785583496094, + "ref_logps/rejected": -42.296478271484375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.702043056488037, + "rewards/margins": 1.1137725114822388, + "rewards/rejected": -2.8158156871795654, + "step": 1983 + }, + { + "epoch": 1.87, + "grad_norm": 21.13364028930664, + "learning_rate": 2.0863938440013988e-07, + "logps/chosen": -36.618003845214844, + "logps/rejected": -72.97779846191406, + "loss": 0.3157, + "losses/dpo": 0.06762217730283737, + "losses/sft": 2.127976655960083, + "losses/total": 0.06762217730283737, + "ref_logps/chosen": -25.291261672973633, + "ref_logps/rejected": -45.15331268310547, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.132674217224121, + "rewards/margins": 1.6497740745544434, + "rewards/rejected": -2.7824482917785645, + "step": 1984 + }, + { + "epoch": 1.87, + "grad_norm": 15.921688079833984, + "learning_rate": 2.0846449807625045e-07, + "logps/chosen": -54.70018005371094, + "logps/rejected": -80.42591857910156, + "loss": 0.2678, + "losses/dpo": 0.3374122381210327, + "losses/sft": 1.7384850978851318, + "losses/total": 0.3374122381210327, + "ref_logps/chosen": -39.920433044433594, + "ref_logps/rejected": -50.794761657714844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4779746532440186, + "rewards/margins": 1.4851419925689697, + "rewards/rejected": -2.9631166458129883, + "step": 1985 + }, + { + "epoch": 1.88, + "grad_norm": 18.273723602294922, + "learning_rate": 2.0828961175236096e-07, + "logps/chosen": -61.510658264160156, + "logps/rejected": -87.7957534790039, + "loss": 0.2687, + "losses/dpo": 0.21693351864814758, + "losses/sft": 1.8314409255981445, + "losses/total": 0.21693351864814758, + "ref_logps/chosen": -46.84638595581055, + "ref_logps/rejected": -53.92215347290039, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4664275646209717, + "rewards/margins": 1.9209327697753906, + "rewards/rejected": -3.3873603343963623, + "step": 1986 + }, + { + "epoch": 1.88, + "grad_norm": 27.942218780517578, + "learning_rate": 2.0811472542847147e-07, + "logps/chosen": -41.060115814208984, + "logps/rejected": -77.3597412109375, + "loss": 0.4036, + "losses/dpo": 0.17683270573616028, + "losses/sft": 2.055321216583252, + "losses/total": 0.17683270573616028, + "ref_logps/chosen": -27.92916488647461, + "ref_logps/rejected": -49.761436462402344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3130953311920166, + "rewards/margins": 1.4467356204986572, + "rewards/rejected": -2.759830951690674, + "step": 1987 + }, + { + "epoch": 1.88, + "grad_norm": 11.769414901733398, + "learning_rate": 2.07939839104582e-07, + "logps/chosen": -43.42335510253906, + "logps/rejected": -74.48541259765625, + "loss": 0.2253, + "losses/dpo": 0.20665507018566132, + "losses/sft": 1.1677836179733276, + "losses/total": 0.20665507018566132, + "ref_logps/chosen": -32.017337799072266, + "ref_logps/rejected": -43.09758377075195, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1406021118164062, + "rewards/margins": 1.9981807470321655, + "rewards/rejected": -3.1387829780578613, + "step": 1988 + }, + { + "epoch": 1.88, + "grad_norm": 23.761144638061523, + "learning_rate": 2.0776495278069253e-07, + "logps/chosen": -68.45650482177734, + "logps/rejected": -88.13902282714844, + "loss": 0.3371, + "losses/dpo": 0.3239016532897949, + "losses/sft": 2.3723411560058594, + "losses/total": 0.3239016532897949, + "ref_logps/chosen": -48.0863037109375, + "ref_logps/rejected": -49.539794921875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.037020206451416, + "rewards/margins": 1.8229024410247803, + "rewards/rejected": -3.8599228858947754, + "step": 1989 + }, + { + "epoch": 1.88, + "grad_norm": 25.765422821044922, + "learning_rate": 2.0759006645680306e-07, + "logps/chosen": -58.637203216552734, + "logps/rejected": -72.1034164428711, + "loss": 0.571, + "losses/dpo": 0.9255797863006592, + "losses/sft": 1.4871019124984741, + "losses/total": 0.9255797863006592, + "ref_logps/chosen": -40.592498779296875, + "ref_logps/rejected": -43.86729431152344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.804470181465149, + "rewards/margins": 1.0191423892974854, + "rewards/rejected": -2.823612689971924, + "step": 1990 + }, + { + "epoch": 1.88, + "grad_norm": 24.1730899810791, + "learning_rate": 2.074151801329136e-07, + "logps/chosen": -52.826576232910156, + "logps/rejected": -78.01376342773438, + "loss": 0.4196, + "losses/dpo": 0.5060432553291321, + "losses/sft": 2.1354801654815674, + "losses/total": 0.5060432553291321, + "ref_logps/chosen": -36.949371337890625, + "ref_logps/rejected": -45.54932403564453, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5877199172973633, + "rewards/margins": 1.6587238311767578, + "rewards/rejected": -3.246443748474121, + "step": 1991 + }, + { + "epoch": 1.88, + "grad_norm": 42.56746292114258, + "learning_rate": 2.0724029380902414e-07, + "logps/chosen": -58.30364990234375, + "logps/rejected": -66.47650146484375, + "loss": 0.688, + "losses/dpo": 0.6619449853897095, + "losses/sft": 2.4093542098999023, + "losses/total": 0.6619449853897095, + "ref_logps/chosen": -38.64124298095703, + "ref_logps/rejected": -41.48748016357422, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.966240644454956, + "rewards/margins": 0.5326617360115051, + "rewards/rejected": -2.4989023208618164, + "step": 1992 + }, + { + "epoch": 1.88, + "grad_norm": 17.661104202270508, + "learning_rate": 2.0706540748513466e-07, + "logps/chosen": -47.92689895629883, + "logps/rejected": -61.3677978515625, + "loss": 0.3667, + "losses/dpo": 0.6560609340667725, + "losses/sft": 1.7580631971359253, + "losses/total": 0.6560609340667725, + "ref_logps/chosen": -35.225093841552734, + "ref_logps/rejected": -33.523284912109375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2701804637908936, + "rewards/margins": 1.5142710208892822, + "rewards/rejected": -2.784451484680176, + "step": 1993 + }, + { + "epoch": 1.88, + "grad_norm": 22.866487503051758, + "learning_rate": 2.0689052116124517e-07, + "logps/chosen": -48.848724365234375, + "logps/rejected": -68.41227722167969, + "loss": 0.3555, + "losses/dpo": 0.14448793232440948, + "losses/sft": 1.8898380994796753, + "losses/total": 0.14448793232440948, + "ref_logps/chosen": -32.324951171875, + "ref_logps/rejected": -38.35407638549805, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6523773670196533, + "rewards/margins": 1.3534433841705322, + "rewards/rejected": -3.0058207511901855, + "step": 1994 + }, + { + "epoch": 1.88, + "grad_norm": 26.90878677368164, + "learning_rate": 2.067156348373557e-07, + "logps/chosen": -52.19657897949219, + "logps/rejected": -80.27163696289062, + "loss": 0.4643, + "losses/dpo": 0.4463430345058441, + "losses/sft": 2.5013411045074463, + "losses/total": 0.4463430345058441, + "ref_logps/chosen": -33.78961181640625, + "ref_logps/rejected": -46.71401596069336, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8406972885131836, + "rewards/margins": 1.5150644779205322, + "rewards/rejected": -3.355762004852295, + "step": 1995 + }, + { + "epoch": 1.88, + "grad_norm": 26.384733200073242, + "learning_rate": 2.0654074851346622e-07, + "logps/chosen": -68.16213989257812, + "logps/rejected": -86.7209701538086, + "loss": 0.4057, + "losses/dpo": 0.24786734580993652, + "losses/sft": 2.254863739013672, + "losses/total": 0.24786734580993652, + "ref_logps/chosen": -49.461891174316406, + "ref_logps/rejected": -55.3970832824707, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8700259923934937, + "rewards/margins": 1.2623631954193115, + "rewards/rejected": -3.1323890686035156, + "step": 1996 + }, + { + "epoch": 1.89, + "grad_norm": 23.183576583862305, + "learning_rate": 2.0636586218957679e-07, + "logps/chosen": -55.484580993652344, + "logps/rejected": -76.90269470214844, + "loss": 0.3383, + "losses/dpo": 0.3613179922103882, + "losses/sft": 1.9333280324935913, + "losses/total": 0.3613179922103882, + "ref_logps/chosen": -38.94062042236328, + "ref_logps/rejected": -41.81243133544922, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6543956995010376, + "rewards/margins": 1.854630708694458, + "rewards/rejected": -3.509026288986206, + "step": 1997 + }, + { + "epoch": 1.89, + "grad_norm": 25.89813804626465, + "learning_rate": 2.061909758656873e-07, + "logps/chosen": -41.725730895996094, + "logps/rejected": -76.85957336425781, + "loss": 0.4249, + "losses/dpo": 0.5903616547584534, + "losses/sft": 1.9699132442474365, + "losses/total": 0.5903616547584534, + "ref_logps/chosen": -27.31644630432129, + "ref_logps/rejected": -48.239173889160156, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4409284591674805, + "rewards/margins": 1.4211115837097168, + "rewards/rejected": -2.8620400428771973, + "step": 1998 + }, + { + "epoch": 1.89, + "grad_norm": 21.803895950317383, + "learning_rate": 2.0601608954179784e-07, + "logps/chosen": -57.87916564941406, + "logps/rejected": -86.5878677368164, + "loss": 0.3366, + "losses/dpo": 0.16372033953666687, + "losses/sft": 1.9345643520355225, + "losses/total": 0.16372033953666687, + "ref_logps/chosen": -39.35773468017578, + "ref_logps/rejected": -54.01332092285156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8521428108215332, + "rewards/margins": 1.4053120613098145, + "rewards/rejected": -3.2574548721313477, + "step": 1999 + }, + { + "epoch": 1.89, + "grad_norm": 15.345725059509277, + "learning_rate": 2.0584120321790835e-07, + "logps/chosen": -66.094482421875, + "logps/rejected": -80.86222076416016, + "loss": 0.2922, + "losses/dpo": 0.48337864875793457, + "losses/sft": 2.0383682250976562, + "losses/total": 0.48337864875793457, + "ref_logps/chosen": -48.794952392578125, + "ref_logps/rejected": -45.4182243347168, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7299530506134033, + "rewards/margins": 1.8144464492797852, + "rewards/rejected": -3.5443994998931885, + "step": 2000 + }, + { + "epoch": 1.89, + "grad_norm": 25.278440475463867, + "learning_rate": 2.0566631689401886e-07, + "logps/chosen": -56.98828887939453, + "logps/rejected": -76.40863800048828, + "loss": 0.4741, + "losses/dpo": 0.2761775553226471, + "losses/sft": 1.8638036251068115, + "losses/total": 0.2761775553226471, + "ref_logps/chosen": -42.440155029296875, + "ref_logps/rejected": -50.553497314453125, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4548134803771973, + "rewards/margins": 1.1307004690170288, + "rewards/rejected": -2.5855140686035156, + "step": 2001 + }, + { + "epoch": 1.89, + "grad_norm": 19.58589744567871, + "learning_rate": 2.054914305701294e-07, + "logps/chosen": -53.86283874511719, + "logps/rejected": -68.95311737060547, + "loss": 0.2785, + "losses/dpo": 0.339206337928772, + "losses/sft": 1.8253734111785889, + "losses/total": 0.339206337928772, + "ref_logps/chosen": -40.300052642822266, + "ref_logps/rejected": -39.351966857910156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.356278657913208, + "rewards/margins": 1.6038364171981812, + "rewards/rejected": -2.9601149559020996, + "step": 2002 + }, + { + "epoch": 1.89, + "grad_norm": 28.8936824798584, + "learning_rate": 2.0531654424623991e-07, + "logps/chosen": -60.06389617919922, + "logps/rejected": -86.05029296875, + "loss": 0.4235, + "losses/dpo": 0.7493062019348145, + "losses/sft": 2.340275287628174, + "losses/total": 0.7493062019348145, + "ref_logps/chosen": -41.57643127441406, + "ref_logps/rejected": -52.40216827392578, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.8487460613250732, + "rewards/margins": 1.5160664319992065, + "rewards/rejected": -3.3648126125335693, + "step": 2003 + }, + { + "epoch": 1.89, + "grad_norm": 24.26761245727539, + "learning_rate": 2.0514165792235048e-07, + "logps/chosen": -56.69879913330078, + "logps/rejected": -73.68850708007812, + "loss": 0.4476, + "losses/dpo": 0.8404563069343567, + "losses/sft": 2.1810801029205322, + "losses/total": 0.8404563069343567, + "ref_logps/chosen": -38.946495056152344, + "ref_logps/rejected": -41.723838806152344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7752306461334229, + "rewards/margins": 1.4212366342544556, + "rewards/rejected": -3.196467399597168, + "step": 2004 + }, + { + "epoch": 1.89, + "grad_norm": 21.48341178894043, + "learning_rate": 2.04966771598461e-07, + "logps/chosen": -46.665130615234375, + "logps/rejected": -63.29737854003906, + "loss": 0.3826, + "losses/dpo": 0.19626843929290771, + "losses/sft": 1.126360297203064, + "losses/total": 0.19626843929290771, + "ref_logps/chosen": -36.079742431640625, + "ref_logps/rejected": -40.45587921142578, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0585389137268066, + "rewards/margins": 1.2256114482879639, + "rewards/rejected": -2.2841501235961914, + "step": 2005 + }, + { + "epoch": 1.89, + "grad_norm": 20.516582489013672, + "learning_rate": 2.0479188527457153e-07, + "logps/chosen": -52.32270812988281, + "logps/rejected": -69.18130493164062, + "loss": 0.3467, + "losses/dpo": 0.3455691933631897, + "losses/sft": 1.6257274150848389, + "losses/total": 0.3455691933631897, + "ref_logps/chosen": -40.26002502441406, + "ref_logps/rejected": -43.94281005859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.206268310546875, + "rewards/margins": 1.3175818920135498, + "rewards/rejected": -2.523850202560425, + "step": 2006 + }, + { + "epoch": 1.9, + "grad_norm": 27.236385345458984, + "learning_rate": 2.0461699895068204e-07, + "logps/chosen": -60.64058303833008, + "logps/rejected": -82.799072265625, + "loss": 0.451, + "losses/dpo": 0.28404921293258667, + "losses/sft": 1.7321925163269043, + "losses/total": 0.28404921293258667, + "ref_logps/chosen": -46.73709487915039, + "ref_logps/rejected": -54.20802688598633, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3903485536575317, + "rewards/margins": 1.4687564373016357, + "rewards/rejected": -2.859104871749878, + "step": 2007 + }, + { + "epoch": 1.9, + "grad_norm": 15.246626853942871, + "learning_rate": 2.0444211262679256e-07, + "logps/chosen": -51.04586410522461, + "logps/rejected": -79.669921875, + "loss": 0.2003, + "losses/dpo": 0.24426805973052979, + "losses/sft": 1.8733670711517334, + "losses/total": 0.24426805973052979, + "ref_logps/chosen": -36.87551498413086, + "ref_logps/rejected": -46.27311706542969, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4170351028442383, + "rewards/margins": 1.9226452112197876, + "rewards/rejected": -3.3396801948547363, + "step": 2008 + }, + { + "epoch": 1.9, + "grad_norm": 26.957326889038086, + "learning_rate": 2.042672263029031e-07, + "logps/chosen": -64.02755737304688, + "logps/rejected": -87.56922912597656, + "loss": 0.3679, + "losses/dpo": 0.39934104681015015, + "losses/sft": 2.0048184394836426, + "losses/total": 0.39934104681015015, + "ref_logps/chosen": -45.204986572265625, + "ref_logps/rejected": -55.2584342956543, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8822574615478516, + "rewards/margins": 1.3488216400146484, + "rewards/rejected": -3.2310791015625, + "step": 2009 + }, + { + "epoch": 1.9, + "grad_norm": 28.889867782592773, + "learning_rate": 2.0409233997901364e-07, + "logps/chosen": -45.40325927734375, + "logps/rejected": -64.39208984375, + "loss": 0.5301, + "losses/dpo": 0.3618355393409729, + "losses/sft": 1.6301692724227905, + "losses/total": 0.3618355393409729, + "ref_logps/chosen": -32.46398162841797, + "ref_logps/rejected": -43.23768615722656, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.293927550315857, + "rewards/margins": 0.8215130567550659, + "rewards/rejected": -2.115440607070923, + "step": 2010 + }, + { + "epoch": 1.9, + "grad_norm": 26.232376098632812, + "learning_rate": 2.0391745365512417e-07, + "logps/chosen": -58.55329132080078, + "logps/rejected": -81.92396545410156, + "loss": 0.3592, + "losses/dpo": 0.4822644591331482, + "losses/sft": 2.224480628967285, + "losses/total": 0.4822644591331482, + "ref_logps/chosen": -41.89851379394531, + "ref_logps/rejected": -51.11354446411133, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6654775142669678, + "rewards/margins": 1.4155640602111816, + "rewards/rejected": -3.0810418128967285, + "step": 2011 + }, + { + "epoch": 1.9, + "grad_norm": 18.64391326904297, + "learning_rate": 2.037425673312347e-07, + "logps/chosen": -57.3790283203125, + "logps/rejected": -89.85823822021484, + "loss": 0.321, + "losses/dpo": 0.5200005769729614, + "losses/sft": 1.751615285873413, + "losses/total": 0.5200005769729614, + "ref_logps/chosen": -38.839447021484375, + "ref_logps/rejected": -53.599388122558594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8539587259292603, + "rewards/margins": 1.7719268798828125, + "rewards/rejected": -3.6258857250213623, + "step": 2012 + }, + { + "epoch": 1.9, + "grad_norm": 26.9904727935791, + "learning_rate": 2.0356768100734523e-07, + "logps/chosen": -68.7083740234375, + "logps/rejected": -76.96910095214844, + "loss": 0.4193, + "losses/dpo": 0.272158682346344, + "losses/sft": 1.799604892730713, + "losses/total": 0.272158682346344, + "ref_logps/chosen": -50.99106979370117, + "ref_logps/rejected": -49.56971740722656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.771729826927185, + "rewards/margins": 0.9682086706161499, + "rewards/rejected": -2.739938497543335, + "step": 2013 + }, + { + "epoch": 1.9, + "grad_norm": 26.16299819946289, + "learning_rate": 2.0339279468345574e-07, + "logps/chosen": -45.748558044433594, + "logps/rejected": -64.01423645019531, + "loss": 0.5407, + "losses/dpo": 0.621539294719696, + "losses/sft": 1.330641508102417, + "losses/total": 0.621539294719696, + "ref_logps/chosen": -32.998722076416016, + "ref_logps/rejected": -40.929996490478516, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2749831676483154, + "rewards/margins": 1.0334405899047852, + "rewards/rejected": -2.3084237575531006, + "step": 2014 + }, + { + "epoch": 1.9, + "grad_norm": 33.948951721191406, + "learning_rate": 2.0321790835956625e-07, + "logps/chosen": -50.650291442871094, + "logps/rejected": -73.06553649902344, + "loss": 0.5064, + "losses/dpo": 0.5304527878761292, + "losses/sft": 2.369108200073242, + "losses/total": 0.5304527878761292, + "ref_logps/chosen": -35.19801330566406, + "ref_logps/rejected": -44.235130310058594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5452282428741455, + "rewards/margins": 1.3378121852874756, + "rewards/rejected": -2.883040428161621, + "step": 2015 + }, + { + "epoch": 1.9, + "grad_norm": 20.02587890625, + "learning_rate": 2.030430220356768e-07, + "logps/chosen": -47.7255859375, + "logps/rejected": -68.58493041992188, + "loss": 0.3913, + "losses/dpo": 0.43606191873550415, + "losses/sft": 1.7529096603393555, + "losses/total": 0.43606191873550415, + "ref_logps/chosen": -34.093971252441406, + "ref_logps/rejected": -42.21705627441406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3631618022918701, + "rewards/margins": 1.2736256122589111, + "rewards/rejected": -2.636787176132202, + "step": 2016 + }, + { + "epoch": 1.9, + "grad_norm": 28.85910415649414, + "learning_rate": 2.0286813571178733e-07, + "logps/chosen": -52.251495361328125, + "logps/rejected": -71.4780044555664, + "loss": 0.4898, + "losses/dpo": 0.20923466980457306, + "losses/sft": 1.697746992111206, + "losses/total": 0.20923466980457306, + "ref_logps/chosen": -35.547203063964844, + "ref_logps/rejected": -44.104331970214844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.670428991317749, + "rewards/margins": 1.0669379234313965, + "rewards/rejected": -2.7373671531677246, + "step": 2017 + }, + { + "epoch": 1.91, + "grad_norm": 20.186370849609375, + "learning_rate": 2.0269324938789787e-07, + "logps/chosen": -50.92329406738281, + "logps/rejected": -73.6741714477539, + "loss": 0.3167, + "losses/dpo": 0.4310111701488495, + "losses/sft": 1.2951059341430664, + "losses/total": 0.4310111701488495, + "ref_logps/chosen": -35.7137451171875, + "ref_logps/rejected": -42.59636688232422, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5209550857543945, + "rewards/margins": 1.5868254899978638, + "rewards/rejected": -3.107780694961548, + "step": 2018 + }, + { + "epoch": 1.91, + "grad_norm": 24.85271644592285, + "learning_rate": 2.0251836306400838e-07, + "logps/chosen": -48.684146881103516, + "logps/rejected": -63.44342041015625, + "loss": 0.3108, + "losses/dpo": 0.4532914161682129, + "losses/sft": 2.3797152042388916, + "losses/total": 0.4532914161682129, + "ref_logps/chosen": -36.48652648925781, + "ref_logps/rejected": -36.149147033691406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.219761848449707, + "rewards/margins": 1.5096651315689087, + "rewards/rejected": -2.729426860809326, + "step": 2019 + }, + { + "epoch": 1.91, + "grad_norm": 23.203615188598633, + "learning_rate": 2.0234347674011892e-07, + "logps/chosen": -55.359474182128906, + "logps/rejected": -64.22511291503906, + "loss": 0.4585, + "losses/dpo": 0.5014011859893799, + "losses/sft": 2.2300329208374023, + "losses/total": 0.5014011859893799, + "ref_logps/chosen": -38.677513122558594, + "ref_logps/rejected": -36.93783950805664, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6681957244873047, + "rewards/margins": 1.060531735420227, + "rewards/rejected": -2.728727340698242, + "step": 2020 + }, + { + "epoch": 1.91, + "grad_norm": 32.913204193115234, + "learning_rate": 2.0216859041622943e-07, + "logps/chosen": -59.758995056152344, + "logps/rejected": -73.78396606445312, + "loss": 0.5848, + "losses/dpo": 0.42151883244514465, + "losses/sft": 1.5064970254898071, + "losses/total": 0.42151883244514465, + "ref_logps/chosen": -45.86241912841797, + "ref_logps/rejected": -53.4886474609375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3896574974060059, + "rewards/margins": 0.6398743391036987, + "rewards/rejected": -2.029531955718994, + "step": 2021 + }, + { + "epoch": 1.91, + "grad_norm": 16.41496467590332, + "learning_rate": 2.0199370409233995e-07, + "logps/chosen": -50.31555938720703, + "logps/rejected": -63.7568359375, + "loss": 0.3858, + "losses/dpo": 0.5128999352455139, + "losses/sft": 1.543710708618164, + "losses/total": 0.5128999352455139, + "ref_logps/chosen": -37.41289520263672, + "ref_logps/rejected": -38.08341598510742, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.290266752243042, + "rewards/margins": 1.277075171470642, + "rewards/rejected": -2.5673418045043945, + "step": 2022 + }, + { + "epoch": 1.91, + "grad_norm": 24.01865005493164, + "learning_rate": 2.018188177684505e-07, + "logps/chosen": -45.16056823730469, + "logps/rejected": -82.82772827148438, + "loss": 0.2841, + "losses/dpo": 0.35318487882614136, + "losses/sft": 1.541651964187622, + "losses/total": 0.35318487882614136, + "ref_logps/chosen": -33.652015686035156, + "ref_logps/rejected": -51.68000793457031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1508554220199585, + "rewards/margins": 1.9639167785644531, + "rewards/rejected": -3.114772319793701, + "step": 2023 + }, + { + "epoch": 1.91, + "grad_norm": 22.34075164794922, + "learning_rate": 2.0164393144456102e-07, + "logps/chosen": -48.46492004394531, + "logps/rejected": -67.80860900878906, + "loss": 0.4035, + "losses/dpo": 0.3765292763710022, + "losses/sft": 2.082228660583496, + "losses/total": 0.3765292763710022, + "ref_logps/chosen": -34.32884216308594, + "ref_logps/rejected": -43.48926544189453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4136078357696533, + "rewards/margins": 1.0183261632919312, + "rewards/rejected": -2.431934118270874, + "step": 2024 + }, + { + "epoch": 1.91, + "grad_norm": 21.50850486755371, + "learning_rate": 2.0146904512067156e-07, + "logps/chosen": -45.23872375488281, + "logps/rejected": -73.0084228515625, + "loss": 0.3491, + "losses/dpo": 0.17151105403900146, + "losses/sft": 2.066784143447876, + "losses/total": 0.17151105403900146, + "ref_logps/chosen": -30.906810760498047, + "ref_logps/rejected": -43.360382080078125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4331917762756348, + "rewards/margins": 1.5316121578216553, + "rewards/rejected": -2.96480393409729, + "step": 2025 + }, + { + "epoch": 1.91, + "grad_norm": 23.876951217651367, + "learning_rate": 2.0129415879678208e-07, + "logps/chosen": -43.144287109375, + "logps/rejected": -57.90933609008789, + "loss": 0.4229, + "losses/dpo": 0.38650426268577576, + "losses/sft": 1.7937887907028198, + "losses/total": 0.38650426268577576, + "ref_logps/chosen": -32.53365707397461, + "ref_logps/rejected": -36.2231330871582, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0610628128051758, + "rewards/margins": 1.1075578927993774, + "rewards/rejected": -2.1686205863952637, + "step": 2026 + }, + { + "epoch": 1.91, + "grad_norm": 31.716066360473633, + "learning_rate": 2.0111927247289261e-07, + "logps/chosen": -57.36821365356445, + "logps/rejected": -70.847900390625, + "loss": 0.5411, + "losses/dpo": 0.8157985806465149, + "losses/sft": 2.6244165897369385, + "losses/total": 0.8157985806465149, + "ref_logps/chosen": -41.74764633178711, + "ref_logps/rejected": -46.83079528808594, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5620566606521606, + "rewards/margins": 0.8396540284156799, + "rewards/rejected": -2.4017105102539062, + "step": 2027 + }, + { + "epoch": 1.92, + "grad_norm": 29.944028854370117, + "learning_rate": 2.0094438614900313e-07, + "logps/chosen": -54.41606521606445, + "logps/rejected": -74.38653564453125, + "loss": 0.4916, + "losses/dpo": 0.35407572984695435, + "losses/sft": 1.6522469520568848, + "losses/total": 0.35407572984695435, + "ref_logps/chosen": -38.94879150390625, + "ref_logps/rejected": -46.490760803222656, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5467274188995361, + "rewards/margins": 1.2428498268127441, + "rewards/rejected": -2.7895772457122803, + "step": 2028 + }, + { + "epoch": 1.92, + "grad_norm": 19.687559127807617, + "learning_rate": 2.007694998251137e-07, + "logps/chosen": -42.60370635986328, + "logps/rejected": -69.28800964355469, + "loss": 0.3099, + "losses/dpo": 0.27244699001312256, + "losses/sft": 1.6144404411315918, + "losses/total": 0.27244699001312256, + "ref_logps/chosen": -28.908899307250977, + "ref_logps/rejected": -42.39570236206055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3694807291030884, + "rewards/margins": 1.3197505474090576, + "rewards/rejected": -2.6892311573028564, + "step": 2029 + }, + { + "epoch": 1.92, + "grad_norm": 18.690031051635742, + "learning_rate": 2.005946135012242e-07, + "logps/chosen": -43.9360466003418, + "logps/rejected": -65.29751586914062, + "loss": 0.2687, + "losses/dpo": 0.14580021798610687, + "losses/sft": 2.0838897228240967, + "losses/total": 0.14580021798610687, + "ref_logps/chosen": -31.547157287597656, + "ref_logps/rejected": -34.753456115722656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2388887405395508, + "rewards/margins": 1.8155173063278198, + "rewards/rejected": -3.05440616607666, + "step": 2030 + }, + { + "epoch": 1.92, + "grad_norm": 19.959712982177734, + "learning_rate": 2.0041972717733472e-07, + "logps/chosen": -54.709930419921875, + "logps/rejected": -75.29872131347656, + "loss": 0.2827, + "losses/dpo": 0.2894763648509979, + "losses/sft": 1.991902232170105, + "losses/total": 0.2894763648509979, + "ref_logps/chosen": -41.17220687866211, + "ref_logps/rejected": -46.030967712402344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3537721633911133, + "rewards/margins": 1.5730029344558716, + "rewards/rejected": -2.9267749786376953, + "step": 2031 + }, + { + "epoch": 1.92, + "grad_norm": 23.293468475341797, + "learning_rate": 2.0024484085344526e-07, + "logps/chosen": -48.56928253173828, + "logps/rejected": -69.32160186767578, + "loss": 0.3616, + "losses/dpo": 0.3449220061302185, + "losses/sft": 1.977399468421936, + "losses/total": 0.3449220061302185, + "ref_logps/chosen": -34.26426315307617, + "ref_logps/rejected": -43.83550262451172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.43050217628479, + "rewards/margins": 1.1181080341339111, + "rewards/rejected": -2.548610210418701, + "step": 2032 + }, + { + "epoch": 1.92, + "grad_norm": 26.006484985351562, + "learning_rate": 2.0006995452955577e-07, + "logps/chosen": -46.61804962158203, + "logps/rejected": -50.099266052246094, + "loss": 0.4767, + "losses/dpo": 0.5687950849533081, + "losses/sft": 1.6788700819015503, + "losses/total": 0.5687950849533081, + "ref_logps/chosen": -33.719215393066406, + "ref_logps/rejected": -29.503496170043945, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2898836135864258, + "rewards/margins": 0.7696936130523682, + "rewards/rejected": -2.059576988220215, + "step": 2033 + }, + { + "epoch": 1.92, + "grad_norm": 16.64080810546875, + "learning_rate": 1.998950682056663e-07, + "logps/chosen": -69.95222473144531, + "logps/rejected": -103.65144348144531, + "loss": 0.236, + "losses/dpo": 0.3266555666923523, + "losses/sft": 1.8553285598754883, + "losses/total": 0.3266555666923523, + "ref_logps/chosen": -50.67909622192383, + "ref_logps/rejected": -62.793434143066406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9273128509521484, + "rewards/margins": 2.1584882736206055, + "rewards/rejected": -4.085801124572754, + "step": 2034 + }, + { + "epoch": 1.92, + "grad_norm": 25.264097213745117, + "learning_rate": 1.9972018188177682e-07, + "logps/chosen": -48.751007080078125, + "logps/rejected": -76.67971801757812, + "loss": 0.4608, + "losses/dpo": 0.21231618523597717, + "losses/sft": 1.9976848363876343, + "losses/total": 0.21231618523597717, + "ref_logps/chosen": -35.7194709777832, + "ref_logps/rejected": -49.99412536621094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3031532764434814, + "rewards/margins": 1.3654061555862427, + "rewards/rejected": -2.6685593128204346, + "step": 2035 + }, + { + "epoch": 1.92, + "grad_norm": 41.93486022949219, + "learning_rate": 1.995452955578874e-07, + "logps/chosen": -77.87896728515625, + "logps/rejected": -84.36576843261719, + "loss": 0.8346, + "losses/dpo": 0.505761981010437, + "losses/sft": 2.4345896244049072, + "losses/total": 0.505761981010437, + "ref_logps/chosen": -46.7550163269043, + "ref_logps/rejected": -50.0225944519043, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.1123950481414795, + "rewards/margins": 0.32192283868789673, + "rewards/rejected": -3.4343180656433105, + "step": 2036 + }, + { + "epoch": 1.92, + "grad_norm": 19.129859924316406, + "learning_rate": 1.993704092339979e-07, + "logps/chosen": -44.94675064086914, + "logps/rejected": -74.08683013916016, + "loss": 0.2664, + "losses/dpo": 0.23209071159362793, + "losses/sft": 1.801204800605774, + "losses/total": 0.23209071159362793, + "ref_logps/chosen": -32.547752380371094, + "ref_logps/rejected": -45.430458068847656, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2398995161056519, + "rewards/margins": 1.6257374286651611, + "rewards/rejected": -2.8656368255615234, + "step": 2037 + }, + { + "epoch": 1.92, + "grad_norm": 19.115253448486328, + "learning_rate": 1.991955229101084e-07, + "logps/chosen": -45.65440368652344, + "logps/rejected": -60.15361404418945, + "loss": 0.3216, + "losses/dpo": 0.2604219615459442, + "losses/sft": 1.5795276165008545, + "losses/total": 0.2604219615459442, + "ref_logps/chosen": -34.540252685546875, + "ref_logps/rejected": -36.18833923339844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.111415147781372, + "rewards/margins": 1.2851128578186035, + "rewards/rejected": -2.3965280055999756, + "step": 2038 + }, + { + "epoch": 1.93, + "grad_norm": 29.1007137298584, + "learning_rate": 1.9902063658621895e-07, + "logps/chosen": -41.4874267578125, + "logps/rejected": -60.68952178955078, + "loss": 0.5793, + "losses/dpo": 0.27138128876686096, + "losses/sft": 1.2251626253128052, + "losses/total": 0.27138128876686096, + "ref_logps/chosen": -28.983592987060547, + "ref_logps/rejected": -37.920616149902344, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2503833770751953, + "rewards/margins": 1.0265073776245117, + "rewards/rejected": -2.276890754699707, + "step": 2039 + }, + { + "epoch": 1.93, + "grad_norm": 28.733631134033203, + "learning_rate": 1.9884575026232946e-07, + "logps/chosen": -52.36991500854492, + "logps/rejected": -50.15070343017578, + "loss": 0.5886, + "losses/dpo": 0.5438276529312134, + "losses/sft": 1.6661931276321411, + "losses/total": 0.5438276529312134, + "ref_logps/chosen": -36.1097412109375, + "ref_logps/rejected": -29.813556671142578, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6260173320770264, + "rewards/margins": 0.4076976776123047, + "rewards/rejected": -2.033715009689331, + "step": 2040 + }, + { + "epoch": 1.93, + "grad_norm": 15.451970100402832, + "learning_rate": 1.9867086393844e-07, + "logps/chosen": -49.75353240966797, + "logps/rejected": -75.8673095703125, + "loss": 0.246, + "losses/dpo": 0.4540598392486572, + "losses/sft": 2.013915538787842, + "losses/total": 0.4540598392486572, + "ref_logps/chosen": -36.06848907470703, + "ref_logps/rejected": -42.807682037353516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3685040473937988, + "rewards/margins": 1.9374585151672363, + "rewards/rejected": -3.305962562561035, + "step": 2041 + }, + { + "epoch": 1.93, + "grad_norm": 18.874589920043945, + "learning_rate": 1.9849597761455054e-07, + "logps/chosen": -47.55992126464844, + "logps/rejected": -69.62950897216797, + "loss": 0.3843, + "losses/dpo": 0.4438169598579407, + "losses/sft": 1.8290280103683472, + "losses/total": 0.4438169598579407, + "ref_logps/chosen": -34.679588317871094, + "ref_logps/rejected": -41.335845947265625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2880332469940186, + "rewards/margins": 1.5413331985473633, + "rewards/rejected": -2.829366445541382, + "step": 2042 + }, + { + "epoch": 1.93, + "grad_norm": 36.710052490234375, + "learning_rate": 1.9832109129066108e-07, + "logps/chosen": -60.273834228515625, + "logps/rejected": -69.26152801513672, + "loss": 0.5931, + "losses/dpo": 0.38301146030426025, + "losses/sft": 1.7894673347473145, + "losses/total": 0.38301146030426025, + "ref_logps/chosen": -41.36894989013672, + "ref_logps/rejected": -41.500797271728516, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.890488624572754, + "rewards/margins": 0.8855847716331482, + "rewards/rejected": -2.776073455810547, + "step": 2043 + }, + { + "epoch": 1.93, + "grad_norm": 20.80672836303711, + "learning_rate": 1.981462049667716e-07, + "logps/chosen": -54.35045623779297, + "logps/rejected": -74.58406066894531, + "loss": 0.379, + "losses/dpo": 0.47416791319847107, + "losses/sft": 3.190709352493286, + "losses/total": 0.47416791319847107, + "ref_logps/chosen": -36.59331130981445, + "ref_logps/rejected": -40.45694351196289, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7757145166397095, + "rewards/margins": 1.6369965076446533, + "rewards/rejected": -3.4127109050750732, + "step": 2044 + }, + { + "epoch": 1.93, + "grad_norm": 28.5202693939209, + "learning_rate": 1.979713186428821e-07, + "logps/chosen": -65.97419738769531, + "logps/rejected": -87.1558837890625, + "loss": 0.3507, + "losses/dpo": 0.40661272406578064, + "losses/sft": 2.788334369659424, + "losses/total": 0.40661272406578064, + "ref_logps/chosen": -45.286468505859375, + "ref_logps/rejected": -47.588104248046875, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.068773031234741, + "rewards/margins": 1.8880045413970947, + "rewards/rejected": -3.956777811050415, + "step": 2045 + }, + { + "epoch": 1.93, + "grad_norm": 28.022443771362305, + "learning_rate": 1.9779643231899265e-07, + "logps/chosen": -51.276031494140625, + "logps/rejected": -62.86093521118164, + "loss": 0.4797, + "losses/dpo": 0.5778504610061646, + "losses/sft": 1.8465807437896729, + "losses/total": 0.5778504610061646, + "ref_logps/chosen": -34.459999084472656, + "ref_logps/rejected": -37.06925582885742, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6816034317016602, + "rewards/margins": 0.8975642919540405, + "rewards/rejected": -2.5791680812835693, + "step": 2046 + }, + { + "epoch": 1.93, + "grad_norm": 20.93613624572754, + "learning_rate": 1.9762154599510316e-07, + "logps/chosen": -52.076847076416016, + "logps/rejected": -69.5322036743164, + "loss": 0.2863, + "losses/dpo": 0.21077489852905273, + "losses/sft": 1.5609585046768188, + "losses/total": 0.21077489852905273, + "ref_logps/chosen": -38.66948318481445, + "ref_logps/rejected": -40.144004821777344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3407361507415771, + "rewards/margins": 1.5980840921401978, + "rewards/rejected": -2.9388203620910645, + "step": 2047 + }, + { + "epoch": 1.93, + "grad_norm": 22.441720962524414, + "learning_rate": 1.974466596712137e-07, + "logps/chosen": -45.73637771606445, + "logps/rejected": -67.10670471191406, + "loss": 0.4889, + "losses/dpo": 0.36217594146728516, + "losses/sft": 1.6394838094711304, + "losses/total": 0.36217594146728516, + "ref_logps/chosen": -34.477806091308594, + "ref_logps/rejected": -44.96718215942383, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1258573532104492, + "rewards/margins": 1.08809494972229, + "rewards/rejected": -2.2139523029327393, + "step": 2048 + }, + { + "epoch": 1.93, + "grad_norm": 20.267269134521484, + "learning_rate": 1.9727177334732424e-07, + "logps/chosen": -46.09368896484375, + "logps/rejected": -75.48944091796875, + "loss": 0.3961, + "losses/dpo": 0.6624510884284973, + "losses/sft": 1.5255367755889893, + "losses/total": 0.6624510884284973, + "ref_logps/chosen": -33.08018112182617, + "ref_logps/rejected": -46.07804870605469, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3013508319854736, + "rewards/margins": 1.6397885084152222, + "rewards/rejected": -2.9411392211914062, + "step": 2049 + }, + { + "epoch": 1.94, + "grad_norm": 29.125268936157227, + "learning_rate": 1.9709688702343478e-07, + "logps/chosen": -55.18638610839844, + "logps/rejected": -75.74964141845703, + "loss": 0.4844, + "losses/dpo": 0.1501118689775467, + "losses/sft": 1.7127957344055176, + "losses/total": 0.1501118689775467, + "ref_logps/chosen": -36.874794006347656, + "ref_logps/rejected": -43.76411056518555, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8311591148376465, + "rewards/margins": 1.3673936128616333, + "rewards/rejected": -3.1985528469085693, + "step": 2050 + }, + { + "epoch": 1.94, + "grad_norm": 24.710039138793945, + "learning_rate": 1.969220006995453e-07, + "logps/chosen": -54.13361358642578, + "logps/rejected": -61.68814468383789, + "loss": 0.4008, + "losses/dpo": 0.29393234848976135, + "losses/sft": 1.48407781124115, + "losses/total": 0.29393234848976135, + "ref_logps/chosen": -40.47198486328125, + "ref_logps/rejected": -36.94309997558594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.366162657737732, + "rewards/margins": 1.1083418130874634, + "rewards/rejected": -2.4745047092437744, + "step": 2051 + }, + { + "epoch": 1.94, + "grad_norm": 21.571746826171875, + "learning_rate": 1.967471143756558e-07, + "logps/chosen": -57.80361557006836, + "logps/rejected": -79.19660949707031, + "loss": 0.2927, + "losses/dpo": 0.33083444833755493, + "losses/sft": 2.3784596920013428, + "losses/total": 0.33083444833755493, + "ref_logps/chosen": -39.48957443237305, + "ref_logps/rejected": -44.828399658203125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8314040899276733, + "rewards/margins": 1.605417013168335, + "rewards/rejected": -3.4368209838867188, + "step": 2052 + }, + { + "epoch": 1.94, + "grad_norm": 20.24934196472168, + "learning_rate": 1.9657222805176634e-07, + "logps/chosen": -47.747398376464844, + "logps/rejected": -67.80306243896484, + "loss": 0.3692, + "losses/dpo": 0.6854740381240845, + "losses/sft": 1.8549668788909912, + "losses/total": 0.6854740381240845, + "ref_logps/chosen": -34.80888366699219, + "ref_logps/rejected": -39.37554931640625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.293851375579834, + "rewards/margins": 1.5489002466201782, + "rewards/rejected": -2.8427515029907227, + "step": 2053 + }, + { + "epoch": 1.94, + "grad_norm": 25.91911506652832, + "learning_rate": 1.9639734172787685e-07, + "logps/chosen": -59.826622009277344, + "logps/rejected": -93.56526184082031, + "loss": 0.3855, + "losses/dpo": 0.1819000095129013, + "losses/sft": 2.266031265258789, + "losses/total": 0.1819000095129013, + "ref_logps/chosen": -40.761009216308594, + "ref_logps/rejected": -59.559513092041016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9065617322921753, + "rewards/margins": 1.4940130710601807, + "rewards/rejected": -3.4005746841430664, + "step": 2054 + }, + { + "epoch": 1.94, + "grad_norm": 23.563228607177734, + "learning_rate": 1.9622245540398742e-07, + "logps/chosen": -52.501014709472656, + "logps/rejected": -77.13671875, + "loss": 0.4442, + "losses/dpo": 0.277631938457489, + "losses/sft": 2.213420867919922, + "losses/total": 0.277631938457489, + "ref_logps/chosen": -36.81894302368164, + "ref_logps/rejected": -48.306800842285156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5682072639465332, + "rewards/margins": 1.314785361289978, + "rewards/rejected": -2.882992744445801, + "step": 2055 + }, + { + "epoch": 1.94, + "grad_norm": 28.440704345703125, + "learning_rate": 1.9604756908009793e-07, + "logps/chosen": -61.124229431152344, + "logps/rejected": -82.30738067626953, + "loss": 0.4886, + "losses/dpo": 0.276885986328125, + "losses/sft": 1.9950237274169922, + "losses/total": 0.276885986328125, + "ref_logps/chosen": -40.2645263671875, + "ref_logps/rejected": -49.9613151550293, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.085969924926758, + "rewards/margins": 1.1486362218856812, + "rewards/rejected": -3.2346062660217285, + "step": 2056 + }, + { + "epoch": 1.94, + "grad_norm": 16.89289093017578, + "learning_rate": 1.9587268275620847e-07, + "logps/chosen": -57.330413818359375, + "logps/rejected": -78.9949722290039, + "loss": 0.2761, + "losses/dpo": 0.2522817850112915, + "losses/sft": 1.7089128494262695, + "losses/total": 0.2522817850112915, + "ref_logps/chosen": -42.53447723388672, + "ref_logps/rejected": -49.11338806152344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4795938730239868, + "rewards/margins": 1.5085647106170654, + "rewards/rejected": -2.9881584644317627, + "step": 2057 + }, + { + "epoch": 1.94, + "grad_norm": 23.73087501525879, + "learning_rate": 1.9569779643231898e-07, + "logps/chosen": -56.779701232910156, + "logps/rejected": -72.22625732421875, + "loss": 0.3604, + "losses/dpo": 0.1695684939622879, + "losses/sft": 1.6324191093444824, + "losses/total": 0.1695684939622879, + "ref_logps/chosen": -39.02884292602539, + "ref_logps/rejected": -40.934547424316406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7750858068466187, + "rewards/margins": 1.3540852069854736, + "rewards/rejected": -3.129171133041382, + "step": 2058 + }, + { + "epoch": 1.94, + "grad_norm": 19.350387573242188, + "learning_rate": 1.955229101084295e-07, + "logps/chosen": -50.196685791015625, + "logps/rejected": -69.0809097290039, + "loss": 0.2892, + "losses/dpo": 0.282158762216568, + "losses/sft": 1.8599509000778198, + "losses/total": 0.282158762216568, + "ref_logps/chosen": -38.94902801513672, + "ref_logps/rejected": -40.440338134765625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1247658729553223, + "rewards/margins": 1.7392911911010742, + "rewards/rejected": -2.8640570640563965, + "step": 2059 + }, + { + "epoch": 1.95, + "grad_norm": 20.693626403808594, + "learning_rate": 1.9534802378454004e-07, + "logps/chosen": -48.73835754394531, + "logps/rejected": -63.906105041503906, + "loss": 0.416, + "losses/dpo": 0.2316254824399948, + "losses/sft": 1.894715428352356, + "losses/total": 0.2316254824399948, + "ref_logps/chosen": -37.42266082763672, + "ref_logps/rejected": -42.40538024902344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1315703392028809, + "rewards/margins": 1.0185023546218872, + "rewards/rejected": -2.1500725746154785, + "step": 2060 + }, + { + "epoch": 1.95, + "grad_norm": 30.93864631652832, + "learning_rate": 1.9517313746065057e-07, + "logps/chosen": -58.22099685668945, + "logps/rejected": -66.14900970458984, + "loss": 0.5575, + "losses/dpo": 0.45409464836120605, + "losses/sft": 1.9882922172546387, + "losses/total": 0.45409464836120605, + "ref_logps/chosen": -40.06817626953125, + "ref_logps/rejected": -40.44312286376953, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8152823448181152, + "rewards/margins": 0.7553058862686157, + "rewards/rejected": -2.5705881118774414, + "step": 2061 + }, + { + "epoch": 1.95, + "grad_norm": 27.666088104248047, + "learning_rate": 1.9499825113676111e-07, + "logps/chosen": -60.009605407714844, + "logps/rejected": -72.30021667480469, + "loss": 0.4208, + "losses/dpo": 0.6343302130699158, + "losses/sft": 2.0791141986846924, + "losses/total": 0.6343302130699158, + "ref_logps/chosen": -40.573551177978516, + "ref_logps/rejected": -39.11536407470703, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9436051845550537, + "rewards/margins": 1.3748797178268433, + "rewards/rejected": -3.3184847831726074, + "step": 2062 + }, + { + "epoch": 1.95, + "grad_norm": 34.38661193847656, + "learning_rate": 1.9482336481287163e-07, + "logps/chosen": -49.46623229980469, + "logps/rejected": -57.707603454589844, + "loss": 0.6619, + "losses/dpo": 0.7517745494842529, + "losses/sft": 1.9962821006774902, + "losses/total": 0.7517745494842529, + "ref_logps/chosen": -33.148826599121094, + "ref_logps/rejected": -34.319637298583984, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6317403316497803, + "rewards/margins": 0.7070561647415161, + "rewards/rejected": -2.338796615600586, + "step": 2063 + }, + { + "epoch": 1.95, + "grad_norm": 21.083423614501953, + "learning_rate": 1.9464847848898217e-07, + "logps/chosen": -56.04933547973633, + "logps/rejected": -65.13681030273438, + "loss": 0.37, + "losses/dpo": 0.23565126955509186, + "losses/sft": 1.7713210582733154, + "losses/total": 0.23565126955509186, + "ref_logps/chosen": -44.885215759277344, + "ref_logps/rejected": -38.86481475830078, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1164121627807617, + "rewards/margins": 1.5107877254486084, + "rewards/rejected": -2.627200126647949, + "step": 2064 + }, + { + "epoch": 1.95, + "grad_norm": 20.880590438842773, + "learning_rate": 1.9447359216509268e-07, + "logps/chosen": -49.9166259765625, + "logps/rejected": -69.47760009765625, + "loss": 0.4008, + "losses/dpo": 0.26993870735168457, + "losses/sft": 1.7678112983703613, + "losses/total": 0.26993870735168457, + "ref_logps/chosen": -35.69708251953125, + "ref_logps/rejected": -41.96161651611328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4219542741775513, + "rewards/margins": 1.3296444416046143, + "rewards/rejected": -2.751598834991455, + "step": 2065 + }, + { + "epoch": 1.95, + "grad_norm": 17.91613006591797, + "learning_rate": 1.942987058412032e-07, + "logps/chosen": -66.38330841064453, + "logps/rejected": -88.3729476928711, + "loss": 0.2564, + "losses/dpo": 0.15901872515678406, + "losses/sft": 1.9070210456848145, + "losses/total": 0.15901872515678406, + "ref_logps/chosen": -48.571468353271484, + "ref_logps/rejected": -52.37348937988281, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.781184434890747, + "rewards/margins": 1.8187617063522339, + "rewards/rejected": -3.5999460220336914, + "step": 2066 + }, + { + "epoch": 1.95, + "grad_norm": 27.85806655883789, + "learning_rate": 1.9412381951731373e-07, + "logps/chosen": -61.115875244140625, + "logps/rejected": -78.0401840209961, + "loss": 0.4132, + "losses/dpo": 0.8243873119354248, + "losses/sft": 1.2907639741897583, + "losses/total": 0.8243873119354248, + "ref_logps/chosen": -44.757720947265625, + "ref_logps/rejected": -47.41073989868164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6358152627944946, + "rewards/margins": 1.427128553390503, + "rewards/rejected": -3.062943935394287, + "step": 2067 + }, + { + "epoch": 1.95, + "grad_norm": 20.662166595458984, + "learning_rate": 1.9394893319342427e-07, + "logps/chosen": -55.93009567260742, + "logps/rejected": -70.70684814453125, + "loss": 0.3395, + "losses/dpo": 0.2904793620109558, + "losses/sft": 2.2393345832824707, + "losses/total": 0.2904793620109558, + "ref_logps/chosen": -40.470787048339844, + "ref_logps/rejected": -43.35810089111328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5459303855895996, + "rewards/margins": 1.1889444589614868, + "rewards/rejected": -2.734874963760376, + "step": 2068 + }, + { + "epoch": 1.95, + "grad_norm": 28.276548385620117, + "learning_rate": 1.937740468695348e-07, + "logps/chosen": -44.18762969970703, + "logps/rejected": -60.83019256591797, + "loss": 0.5104, + "losses/dpo": 0.8664460182189941, + "losses/sft": 1.5060625076293945, + "losses/total": 0.8664460182189941, + "ref_logps/chosen": -29.414011001586914, + "ref_logps/rejected": -36.614662170410156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4773621559143066, + "rewards/margins": 0.9441909193992615, + "rewards/rejected": -2.421552896499634, + "step": 2069 + }, + { + "epoch": 1.95, + "grad_norm": 24.872291564941406, + "learning_rate": 1.9359916054564532e-07, + "logps/chosen": -55.625770568847656, + "logps/rejected": -78.26005554199219, + "loss": 0.3554, + "losses/dpo": 0.5299415588378906, + "losses/sft": 1.4370397329330444, + "losses/total": 0.5299415588378906, + "ref_logps/chosen": -43.666263580322266, + "ref_logps/rejected": -52.23921203613281, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1959506273269653, + "rewards/margins": 1.406134009361267, + "rewards/rejected": -2.6020846366882324, + "step": 2070 + }, + { + "epoch": 1.96, + "grad_norm": 34.20013427734375, + "learning_rate": 1.9342427422175586e-07, + "logps/chosen": -63.907798767089844, + "logps/rejected": -61.06121063232422, + "loss": 0.6904, + "losses/dpo": 0.7830640077590942, + "losses/sft": 2.0493459701538086, + "losses/total": 0.7830640077590942, + "ref_logps/chosen": -43.97303009033203, + "ref_logps/rejected": -38.165138244628906, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.9934767484664917, + "rewards/margins": 0.2961304485797882, + "rewards/rejected": -2.289607048034668, + "step": 2071 + }, + { + "epoch": 1.96, + "grad_norm": 22.545995712280273, + "learning_rate": 1.9324938789786637e-07, + "logps/chosen": -45.397300720214844, + "logps/rejected": -64.16326141357422, + "loss": 0.4307, + "losses/dpo": 0.2381553053855896, + "losses/sft": 1.8798092603683472, + "losses/total": 0.2381553053855896, + "ref_logps/chosen": -34.55494689941406, + "ref_logps/rejected": -40.66218566894531, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0842351913452148, + "rewards/margins": 1.2658724784851074, + "rewards/rejected": -2.3501076698303223, + "step": 2072 + }, + { + "epoch": 1.96, + "grad_norm": 23.276906967163086, + "learning_rate": 1.9307450157397688e-07, + "logps/chosen": -58.593563079833984, + "logps/rejected": -74.5292739868164, + "loss": 0.392, + "losses/dpo": 0.3592626452445984, + "losses/sft": 1.6102502346038818, + "losses/total": 0.3592626452445984, + "ref_logps/chosen": -46.083213806152344, + "ref_logps/rejected": -46.22300720214844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2510348558425903, + "rewards/margins": 1.579592227935791, + "rewards/rejected": -2.830626964569092, + "step": 2073 + }, + { + "epoch": 1.96, + "grad_norm": 24.0294132232666, + "learning_rate": 1.9289961525008745e-07, + "logps/chosen": -55.75016784667969, + "logps/rejected": -65.87483978271484, + "loss": 0.5777, + "losses/dpo": 1.2023175954818726, + "losses/sft": 2.2473299503326416, + "losses/total": 1.2023175954818726, + "ref_logps/chosen": -37.31719207763672, + "ref_logps/rejected": -38.586856842041016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8432972431182861, + "rewards/margins": 0.8855009078979492, + "rewards/rejected": -2.7287983894348145, + "step": 2074 + }, + { + "epoch": 1.96, + "grad_norm": 25.856199264526367, + "learning_rate": 1.9272472892619796e-07, + "logps/chosen": -48.15317153930664, + "logps/rejected": -69.64471435546875, + "loss": 0.3503, + "losses/dpo": 0.45898646116256714, + "losses/sft": 1.573825716972351, + "losses/total": 0.45898646116256714, + "ref_logps/chosen": -33.36845397949219, + "ref_logps/rejected": -39.878990173339844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4784717559814453, + "rewards/margins": 1.498100996017456, + "rewards/rejected": -2.9765727519989014, + "step": 2075 + }, + { + "epoch": 1.96, + "grad_norm": 27.411643981933594, + "learning_rate": 1.925498426023085e-07, + "logps/chosen": -53.39967346191406, + "logps/rejected": -71.10379791259766, + "loss": 0.4396, + "losses/dpo": 0.21336789429187775, + "losses/sft": 1.7873870134353638, + "losses/total": 0.21336789429187775, + "ref_logps/chosen": -37.71217727661133, + "ref_logps/rejected": -44.63653564453125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5687496662139893, + "rewards/margins": 1.0779763460159302, + "rewards/rejected": -2.646726131439209, + "step": 2076 + }, + { + "epoch": 1.96, + "grad_norm": 26.992263793945312, + "learning_rate": 1.9237495627841901e-07, + "logps/chosen": -58.543304443359375, + "logps/rejected": -61.3272705078125, + "loss": 0.4392, + "losses/dpo": 0.49617934226989746, + "losses/sft": 1.79050874710083, + "losses/total": 0.49617934226989746, + "ref_logps/chosen": -43.81988525390625, + "ref_logps/rejected": -37.0499267578125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4723420143127441, + "rewards/margins": 0.9553926587104797, + "rewards/rejected": -2.427734613418579, + "step": 2077 + }, + { + "epoch": 1.96, + "grad_norm": 29.41219711303711, + "learning_rate": 1.9220006995452955e-07, + "logps/chosen": -51.70094299316406, + "logps/rejected": -58.129615783691406, + "loss": 0.489, + "losses/dpo": 0.48410069942474365, + "losses/sft": 1.8122549057006836, + "losses/total": 0.48410069942474365, + "ref_logps/chosen": -37.28096389770508, + "ref_logps/rejected": -34.442848205566406, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4419982433319092, + "rewards/margins": 0.9266785383224487, + "rewards/rejected": -2.3686766624450684, + "step": 2078 + }, + { + "epoch": 1.96, + "grad_norm": 30.24034309387207, + "learning_rate": 1.9202518363064007e-07, + "logps/chosen": -65.14990997314453, + "logps/rejected": -65.8658218383789, + "loss": 0.5403, + "losses/dpo": 0.5930138826370239, + "losses/sft": 1.9618332386016846, + "losses/total": 0.5930138826370239, + "ref_logps/chosen": -49.0767822265625, + "ref_logps/rejected": -39.36961364746094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.6073129177093506, + "rewards/margins": 1.0423080921173096, + "rewards/rejected": -2.64962100982666, + "step": 2079 + }, + { + "epoch": 1.96, + "grad_norm": 21.096139907836914, + "learning_rate": 1.9185029730675058e-07, + "logps/chosen": -54.708377838134766, + "logps/rejected": -84.69854736328125, + "loss": 0.358, + "losses/dpo": 0.3152535855770111, + "losses/sft": 2.2408792972564697, + "losses/total": 0.3152535855770111, + "ref_logps/chosen": -38.508750915527344, + "ref_logps/rejected": -52.90130615234375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6199626922607422, + "rewards/margins": 1.5597611665725708, + "rewards/rejected": -3.1797237396240234, + "step": 2080 + }, + { + "epoch": 1.97, + "grad_norm": 17.17258644104004, + "learning_rate": 1.9167541098286114e-07, + "logps/chosen": -47.74473190307617, + "logps/rejected": -74.9290542602539, + "loss": 0.3151, + "losses/dpo": 0.2950161099433899, + "losses/sft": 1.737703800201416, + "losses/total": 0.2950161099433899, + "ref_logps/chosen": -35.02825164794922, + "ref_logps/rejected": -48.01458740234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2716484069824219, + "rewards/margins": 1.4197978973388672, + "rewards/rejected": -2.691446304321289, + "step": 2081 + }, + { + "epoch": 1.97, + "grad_norm": 29.31932258605957, + "learning_rate": 1.9150052465897166e-07, + "logps/chosen": -53.92249298095703, + "logps/rejected": -71.12378692626953, + "loss": 0.5637, + "losses/dpo": 0.7377325296401978, + "losses/sft": 1.9557472467422485, + "losses/total": 0.7377325296401978, + "ref_logps/chosen": -38.39622497558594, + "ref_logps/rejected": -45.32025146484375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.552626609802246, + "rewards/margins": 1.0277271270751953, + "rewards/rejected": -2.5803537368774414, + "step": 2082 + }, + { + "epoch": 1.97, + "grad_norm": 13.754678726196289, + "learning_rate": 1.913256383350822e-07, + "logps/chosen": -47.44029998779297, + "logps/rejected": -83.66126251220703, + "loss": 0.2069, + "losses/dpo": 0.10512280464172363, + "losses/sft": 1.3472740650177002, + "losses/total": 0.10512280464172363, + "ref_logps/chosen": -36.48375701904297, + "ref_logps/rejected": -51.67913055419922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0956543684005737, + "rewards/margins": 2.1025586128234863, + "rewards/rejected": -3.1982131004333496, + "step": 2083 + }, + { + "epoch": 1.97, + "grad_norm": 19.123947143554688, + "learning_rate": 1.911507520111927e-07, + "logps/chosen": -40.42908477783203, + "logps/rejected": -65.13935089111328, + "loss": 0.4052, + "losses/dpo": 0.3695049285888672, + "losses/sft": 1.5850168466567993, + "losses/total": 0.3695049285888672, + "ref_logps/chosen": -31.06500244140625, + "ref_logps/rejected": -42.063072204589844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9364081621170044, + "rewards/margins": 1.371219515800476, + "rewards/rejected": -2.3076276779174805, + "step": 2084 + }, + { + "epoch": 1.97, + "grad_norm": 19.283388137817383, + "learning_rate": 1.9097586568730325e-07, + "logps/chosen": -53.13050842285156, + "logps/rejected": -76.23269653320312, + "loss": 0.3761, + "losses/dpo": 0.26982805132865906, + "losses/sft": 2.076272964477539, + "losses/total": 0.26982805132865906, + "ref_logps/chosen": -34.79920196533203, + "ref_logps/rejected": -43.05308532714844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8331308364868164, + "rewards/margins": 1.4848307371139526, + "rewards/rejected": -3.3179616928100586, + "step": 2085 + }, + { + "epoch": 1.97, + "grad_norm": 22.932968139648438, + "learning_rate": 1.9080097936341376e-07, + "logps/chosen": -51.976802825927734, + "logps/rejected": -66.0677261352539, + "loss": 0.3818, + "losses/dpo": 0.4107745885848999, + "losses/sft": 2.1810503005981445, + "losses/total": 0.4107745885848999, + "ref_logps/chosen": -37.13493347167969, + "ref_logps/rejected": -40.08832550048828, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4841868877410889, + "rewards/margins": 1.113753080368042, + "rewards/rejected": -2.597939968109131, + "step": 2086 + }, + { + "epoch": 1.97, + "grad_norm": 25.720481872558594, + "learning_rate": 1.906260930395243e-07, + "logps/chosen": -55.9951057434082, + "logps/rejected": -74.78225708007812, + "loss": 0.4095, + "losses/dpo": 0.7387577295303345, + "losses/sft": 2.833451986312866, + "losses/total": 0.7387577295303345, + "ref_logps/chosen": -39.956939697265625, + "ref_logps/rejected": -46.588706970214844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6038166284561157, + "rewards/margins": 1.2155382633209229, + "rewards/rejected": -2.819355010986328, + "step": 2087 + }, + { + "epoch": 1.97, + "grad_norm": 13.170822143554688, + "learning_rate": 1.9045120671563484e-07, + "logps/chosen": -52.139076232910156, + "logps/rejected": -82.11978912353516, + "loss": 0.2314, + "losses/dpo": 0.15874797105789185, + "losses/sft": 1.5309741497039795, + "losses/total": 0.15874797105789185, + "ref_logps/chosen": -40.98955535888672, + "ref_logps/rejected": -51.77434158325195, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1149518489837646, + "rewards/margins": 1.9195929765701294, + "rewards/rejected": -3.0345449447631836, + "step": 2088 + }, + { + "epoch": 1.97, + "grad_norm": 18.979862213134766, + "learning_rate": 1.9027632039174535e-07, + "logps/chosen": -49.205665588378906, + "logps/rejected": -60.659915924072266, + "loss": 0.4428, + "losses/dpo": 0.36194419860839844, + "losses/sft": 1.6680033206939697, + "losses/total": 0.36194419860839844, + "ref_logps/chosen": -32.610618591308594, + "ref_logps/rejected": -34.23227310180664, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6595051288604736, + "rewards/margins": 0.9832589626312256, + "rewards/rejected": -2.642764091491699, + "step": 2089 + }, + { + "epoch": 1.97, + "grad_norm": 27.069564819335938, + "learning_rate": 1.901014340678559e-07, + "logps/chosen": -46.990013122558594, + "logps/rejected": -64.36250305175781, + "loss": 0.4694, + "losses/dpo": 0.8395521640777588, + "losses/sft": 1.9547569751739502, + "losses/total": 0.8395521640777588, + "ref_logps/chosen": -32.83867645263672, + "ref_logps/rejected": -37.38503646850586, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4151335954666138, + "rewards/margins": 1.2826130390167236, + "rewards/rejected": -2.697746753692627, + "step": 2090 + }, + { + "epoch": 1.97, + "grad_norm": 22.355756759643555, + "learning_rate": 1.899265477439664e-07, + "logps/chosen": -56.93726348876953, + "logps/rejected": -73.70066833496094, + "loss": 0.3696, + "losses/dpo": 0.34817594289779663, + "losses/sft": 1.9821481704711914, + "losses/total": 0.34817594289779663, + "ref_logps/chosen": -46.03300476074219, + "ref_logps/rejected": -48.89353561401367, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.090425968170166, + "rewards/margins": 1.3902873992919922, + "rewards/rejected": -2.480713367462158, + "step": 2091 + }, + { + "epoch": 1.98, + "grad_norm": 23.368337631225586, + "learning_rate": 1.8975166142007694e-07, + "logps/chosen": -52.14710235595703, + "logps/rejected": -65.72732543945312, + "loss": 0.3966, + "losses/dpo": 0.4847814738750458, + "losses/sft": 1.4096633195877075, + "losses/total": 0.4847814738750458, + "ref_logps/chosen": -38.912132263183594, + "ref_logps/rejected": -39.4206657409668, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3234974145889282, + "rewards/margins": 1.3071682453155518, + "rewards/rejected": -2.6306657791137695, + "step": 2092 + }, + { + "epoch": 1.98, + "grad_norm": 15.476760864257812, + "learning_rate": 1.8957677509618748e-07, + "logps/chosen": -40.99615478515625, + "logps/rejected": -61.18220520019531, + "loss": 0.2826, + "losses/dpo": 0.48680445551872253, + "losses/sft": 1.4409599304199219, + "losses/total": 0.48680445551872253, + "ref_logps/chosen": -30.948347091674805, + "ref_logps/rejected": -36.278907775878906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.004780888557434, + "rewards/margins": 1.485548973083496, + "rewards/rejected": -2.4903297424316406, + "step": 2093 + }, + { + "epoch": 1.98, + "grad_norm": 19.184370040893555, + "learning_rate": 1.89401888772298e-07, + "logps/chosen": -54.1899528503418, + "logps/rejected": -67.83609771728516, + "loss": 0.3333, + "losses/dpo": 0.15602180361747742, + "losses/sft": 1.990742564201355, + "losses/total": 0.15602180361747742, + "ref_logps/chosen": -43.29187774658203, + "ref_logps/rejected": -42.60942840576172, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0898075103759766, + "rewards/margins": 1.4328596591949463, + "rewards/rejected": -2.522667169570923, + "step": 2094 + }, + { + "epoch": 1.98, + "grad_norm": 22.313430786132812, + "learning_rate": 1.8922700244840853e-07, + "logps/chosen": -48.01663589477539, + "logps/rejected": -75.1309814453125, + "loss": 0.3928, + "losses/dpo": 0.09186310321092606, + "losses/sft": 1.7343701124191284, + "losses/total": 0.09186310321092606, + "ref_logps/chosen": -36.94061279296875, + "ref_logps/rejected": -50.28507995605469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1076024770736694, + "rewards/margins": 1.3769872188568115, + "rewards/rejected": -2.4845895767211914, + "step": 2095 + }, + { + "epoch": 1.98, + "grad_norm": 21.59351348876953, + "learning_rate": 1.8905211612451905e-07, + "logps/chosen": -58.79646301269531, + "logps/rejected": -79.04988098144531, + "loss": 0.3616, + "losses/dpo": 0.47118711471557617, + "losses/sft": 1.982682466506958, + "losses/total": 0.47118711471557617, + "ref_logps/chosen": -44.02228546142578, + "ref_logps/rejected": -45.99781036376953, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4774174690246582, + "rewards/margins": 1.8277900218963623, + "rewards/rejected": -3.3052072525024414, + "step": 2096 + }, + { + "epoch": 1.98, + "grad_norm": 19.19754981994629, + "learning_rate": 1.8887722980062959e-07, + "logps/chosen": -49.19921875, + "logps/rejected": -77.63159942626953, + "loss": 0.309, + "losses/dpo": 0.16407868266105652, + "losses/sft": 1.052739143371582, + "losses/total": 0.16407868266105652, + "ref_logps/chosen": -35.50464630126953, + "ref_logps/rejected": -50.42033767700195, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3694572448730469, + "rewards/margins": 1.351669192314148, + "rewards/rejected": -2.7211265563964844, + "step": 2097 + }, + { + "epoch": 1.98, + "grad_norm": 19.97472381591797, + "learning_rate": 1.887023434767401e-07, + "logps/chosen": -55.17011260986328, + "logps/rejected": -62.014442443847656, + "loss": 0.3012, + "losses/dpo": 0.2652393579483032, + "losses/sft": 1.9611105918884277, + "losses/total": 0.2652393579483032, + "ref_logps/chosen": -44.76509475708008, + "ref_logps/rejected": -36.59451675415039, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0405018329620361, + "rewards/margins": 1.5014909505844116, + "rewards/rejected": -2.541992664337158, + "step": 2098 + }, + { + "epoch": 1.98, + "grad_norm": 19.623929977416992, + "learning_rate": 1.8852745715285064e-07, + "logps/chosen": -47.913177490234375, + "logps/rejected": -59.980003356933594, + "loss": 0.4096, + "losses/dpo": 0.4777611494064331, + "losses/sft": 1.5093517303466797, + "losses/total": 0.4777611494064331, + "ref_logps/chosen": -34.349632263183594, + "ref_logps/rejected": -36.96465301513672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3563544750213623, + "rewards/margins": 0.945180356502533, + "rewards/rejected": -2.301534652709961, + "step": 2099 + }, + { + "epoch": 1.98, + "grad_norm": 23.5931339263916, + "learning_rate": 1.8835257082896118e-07, + "logps/chosen": -55.432037353515625, + "logps/rejected": -84.23030090332031, + "loss": 0.4203, + "losses/dpo": 0.47841909527778625, + "losses/sft": 2.050821542739868, + "losses/total": 0.47841909527778625, + "ref_logps/chosen": -42.95378112792969, + "ref_logps/rejected": -57.02083206176758, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2478258609771729, + "rewards/margins": 1.4731216430664062, + "rewards/rejected": -2.720947265625, + "step": 2100 + }, + { + "epoch": 1.98, + "grad_norm": 17.061511993408203, + "learning_rate": 1.881776845050717e-07, + "logps/chosen": -52.63528823852539, + "logps/rejected": -78.40866088867188, + "loss": 0.306, + "losses/dpo": 0.6880185604095459, + "losses/sft": 1.5328595638275146, + "losses/total": 0.6880185604095459, + "ref_logps/chosen": -39.48440933227539, + "ref_logps/rejected": -47.792110443115234, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3150876760482788, + "rewards/margins": 1.746567726135254, + "rewards/rejected": -3.0616555213928223, + "step": 2101 + }, + { + "epoch": 1.98, + "grad_norm": 24.723827362060547, + "learning_rate": 1.8800279818118223e-07, + "logps/chosen": -50.295284271240234, + "logps/rejected": -59.07971954345703, + "loss": 0.4753, + "losses/dpo": 0.6563800573348999, + "losses/sft": 2.3130757808685303, + "losses/total": 0.6563800573348999, + "ref_logps/chosen": -37.324432373046875, + "ref_logps/rejected": -37.981285095214844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2970848083496094, + "rewards/margins": 0.8127583265304565, + "rewards/rejected": -2.1098432540893555, + "step": 2102 + }, + { + "epoch": 1.99, + "grad_norm": 29.858367919921875, + "learning_rate": 1.8782791185729274e-07, + "logps/chosen": -52.0495719909668, + "logps/rejected": -59.01878356933594, + "loss": 0.5749, + "losses/dpo": 1.0192652940750122, + "losses/sft": 2.6816837787628174, + "losses/total": 1.0192652940750122, + "ref_logps/chosen": -36.212440490722656, + "ref_logps/rejected": -33.46721649169922, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.5837132930755615, + "rewards/margins": 0.9714435338973999, + "rewards/rejected": -2.555156707763672, + "step": 2103 + }, + { + "epoch": 1.99, + "grad_norm": 17.373641967773438, + "learning_rate": 1.8765302553340328e-07, + "logps/chosen": -55.62825012207031, + "logps/rejected": -65.21308898925781, + "loss": 0.3592, + "losses/dpo": 0.35648399591445923, + "losses/sft": 1.9869621992111206, + "losses/total": 0.35648399591445923, + "ref_logps/chosen": -40.22018814086914, + "ref_logps/rejected": -37.518821716308594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.540806531906128, + "rewards/margins": 1.2286204099655151, + "rewards/rejected": -2.7694268226623535, + "step": 2104 + }, + { + "epoch": 1.99, + "grad_norm": 16.910202026367188, + "learning_rate": 1.874781392095138e-07, + "logps/chosen": -54.608497619628906, + "logps/rejected": -80.50454711914062, + "loss": 0.2619, + "losses/dpo": 0.21352556347846985, + "losses/sft": 1.5494791269302368, + "losses/total": 0.21352556347846985, + "ref_logps/chosen": -40.56264877319336, + "ref_logps/rejected": -50.489662170410156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4045848846435547, + "rewards/margins": 1.5969035625457764, + "rewards/rejected": -3.001488447189331, + "step": 2105 + }, + { + "epoch": 1.99, + "grad_norm": 30.094318389892578, + "learning_rate": 1.8730325288562436e-07, + "logps/chosen": -58.43780517578125, + "logps/rejected": -79.49662780761719, + "loss": 0.5818, + "losses/dpo": 0.9518269896507263, + "losses/sft": 1.4864927530288696, + "losses/total": 0.9518269896507263, + "ref_logps/chosen": -44.42962646484375, + "ref_logps/rejected": -53.615257263183594, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4008177518844604, + "rewards/margins": 1.1873186826705933, + "rewards/rejected": -2.5881364345550537, + "step": 2106 + }, + { + "epoch": 1.99, + "grad_norm": 24.94447898864746, + "learning_rate": 1.8712836656173487e-07, + "logps/chosen": -57.46205139160156, + "logps/rejected": -69.63911437988281, + "loss": 0.4224, + "losses/dpo": 0.7341169118881226, + "losses/sft": 1.85963773727417, + "losses/total": 0.7341169118881226, + "ref_logps/chosen": -42.052757263183594, + "ref_logps/rejected": -43.21053695678711, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5409293174743652, + "rewards/margins": 1.1019287109375, + "rewards/rejected": -2.6428580284118652, + "step": 2107 + }, + { + "epoch": 1.99, + "grad_norm": 13.211400985717773, + "learning_rate": 1.8695348023784538e-07, + "logps/chosen": -49.99925231933594, + "logps/rejected": -67.35913848876953, + "loss": 0.2116, + "losses/dpo": 0.27112096548080444, + "losses/sft": 2.0193769931793213, + "losses/total": 0.27112096548080444, + "ref_logps/chosen": -37.098487854003906, + "ref_logps/rejected": -36.51006317138672, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.290076494216919, + "rewards/margins": 1.7948315143585205, + "rewards/rejected": -3.0849080085754395, + "step": 2108 + }, + { + "epoch": 1.99, + "grad_norm": 16.180646896362305, + "learning_rate": 1.8677859391395592e-07, + "logps/chosen": -51.18617248535156, + "logps/rejected": -88.51129913330078, + "loss": 0.2823, + "losses/dpo": 0.27311259508132935, + "losses/sft": 1.8351768255233765, + "losses/total": 0.27311259508132935, + "ref_logps/chosen": -37.81904983520508, + "ref_logps/rejected": -59.578643798828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3367122411727905, + "rewards/margins": 1.5565526485443115, + "rewards/rejected": -2.8932647705078125, + "step": 2109 + }, + { + "epoch": 1.99, + "grad_norm": 19.404233932495117, + "learning_rate": 1.8660370759006644e-07, + "logps/chosen": -46.71330261230469, + "logps/rejected": -58.63636779785156, + "loss": 0.4132, + "losses/dpo": 0.39510804414749146, + "losses/sft": 1.4505623579025269, + "losses/total": 0.39510804414749146, + "ref_logps/chosen": -37.53622055053711, + "ref_logps/rejected": -36.431819915771484, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9177082180976868, + "rewards/margins": 1.3027459383010864, + "rewards/rejected": -2.220454216003418, + "step": 2110 + }, + { + "epoch": 1.99, + "grad_norm": 26.47079086303711, + "learning_rate": 1.8642882126617697e-07, + "logps/chosen": -57.6226806640625, + "logps/rejected": -80.47708129882812, + "loss": 0.408, + "losses/dpo": 0.5040897130966187, + "losses/sft": 2.2119622230529785, + "losses/total": 0.5040897130966187, + "ref_logps/chosen": -41.14360809326172, + "ref_logps/rejected": -48.865867614746094, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6479074954986572, + "rewards/margins": 1.5132136344909668, + "rewards/rejected": -3.161121368408203, + "step": 2111 + }, + { + "epoch": 1.99, + "grad_norm": 17.380420684814453, + "learning_rate": 1.862539349422875e-07, + "logps/chosen": -48.38188934326172, + "logps/rejected": -70.9681625366211, + "loss": 0.26, + "losses/dpo": 0.30021780729293823, + "losses/sft": 1.6236364841461182, + "losses/total": 0.30021780729293823, + "ref_logps/chosen": -36.0531005859375, + "ref_logps/rejected": -40.83494567871094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2328790426254272, + "rewards/margins": 1.7804433107376099, + "rewards/rejected": -3.013322353363037, + "step": 2112 + }, + { + "epoch": 2.0, + "grad_norm": 18.04389762878418, + "learning_rate": 1.8607904861839805e-07, + "logps/chosen": -60.78657531738281, + "logps/rejected": -67.65718078613281, + "loss": 0.3145, + "losses/dpo": 0.5886095762252808, + "losses/sft": 1.701097011566162, + "losses/total": 0.5886095762252808, + "ref_logps/chosen": -50.240760803222656, + "ref_logps/rejected": -41.739891052246094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0545814037322998, + "rewards/margins": 1.5371477603912354, + "rewards/rejected": -2.591729164123535, + "step": 2113 + }, + { + "epoch": 2.0, + "grad_norm": 25.632863998413086, + "learning_rate": 1.8590416229450857e-07, + "logps/chosen": -50.28253936767578, + "logps/rejected": -65.70915222167969, + "loss": 0.5052, + "losses/dpo": 0.36404773592948914, + "losses/sft": 1.6944377422332764, + "losses/total": 0.36404773592948914, + "ref_logps/chosen": -34.70207977294922, + "ref_logps/rejected": -39.907432556152344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5580458641052246, + "rewards/margins": 1.0221258401870728, + "rewards/rejected": -2.580171823501587, + "step": 2114 + }, + { + "epoch": 2.0, + "grad_norm": 19.195222854614258, + "learning_rate": 1.8572927597061908e-07, + "logps/chosen": -53.40088653564453, + "logps/rejected": -58.343116760253906, + "loss": 0.3877, + "losses/dpo": 0.39997634291648865, + "losses/sft": 1.6913156509399414, + "losses/total": 0.39997634291648865, + "ref_logps/chosen": -39.987152099609375, + "ref_logps/rejected": -35.145591735839844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3413735628128052, + "rewards/margins": 0.9783786535263062, + "rewards/rejected": -2.3197522163391113, + "step": 2115 + }, + { + "epoch": 2.0, + "grad_norm": 24.312637329101562, + "learning_rate": 1.8555438964672962e-07, + "logps/chosen": -47.844505310058594, + "logps/rejected": -61.53788757324219, + "loss": 0.4046, + "losses/dpo": 0.4707256555557251, + "losses/sft": 1.9867011308670044, + "losses/total": 0.4707256555557251, + "ref_logps/chosen": -36.274505615234375, + "ref_logps/rejected": -40.71784973144531, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.156999945640564, + "rewards/margins": 0.9250036478042603, + "rewards/rejected": -2.082003593444824, + "step": 2116 + }, + { + "epoch": 2.0, + "grad_norm": 20.24012565612793, + "learning_rate": 1.8537950332284013e-07, + "logps/chosen": -53.673866271972656, + "logps/rejected": -69.09025573730469, + "loss": 0.3986, + "losses/dpo": 0.5724376440048218, + "losses/sft": 1.5727691650390625, + "losses/total": 0.5724376440048218, + "ref_logps/chosen": -38.131690979003906, + "ref_logps/rejected": -42.203712463378906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5542174577713013, + "rewards/margins": 1.1344361305236816, + "rewards/rejected": -2.6886537075042725, + "step": 2117 + }, + { + "epoch": 2.0, + "grad_norm": 23.998689651489258, + "learning_rate": 1.8520461699895067e-07, + "logps/chosen": -35.5611572265625, + "logps/rejected": -53.264007568359375, + "loss": 0.5241, + "losses/dpo": 0.7268857955932617, + "losses/sft": 2.0944571495056152, + "losses/total": 0.7268857955932617, + "ref_logps/chosen": -26.67676544189453, + "ref_logps/rejected": -33.97700500488281, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8884388208389282, + "rewards/margins": 1.0402616262435913, + "rewards/rejected": -1.9287004470825195, + "step": 2118 + }, + { + "epoch": 2.0, + "grad_norm": 14.622540473937988, + "learning_rate": 1.850297306750612e-07, + "logps/chosen": -54.903717041015625, + "logps/rejected": -72.09414672851562, + "loss": 0.2194, + "losses/dpo": 0.27196836471557617, + "losses/sft": 1.6568127870559692, + "losses/total": 0.27196836471557617, + "ref_logps/chosen": -45.634002685546875, + "ref_logps/rejected": -46.37932586669922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9269713759422302, + "rewards/margins": 1.6445108652114868, + "rewards/rejected": -2.5714821815490723, + "step": 2119 + }, + { + "epoch": 2.0, + "grad_norm": 15.972982406616211, + "learning_rate": 1.8485484435117175e-07, + "logps/chosen": -43.17649841308594, + "logps/rejected": -59.56483840942383, + "loss": 0.2986, + "losses/dpo": 0.48384755849838257, + "losses/sft": 1.9165600538253784, + "losses/total": 0.48384755849838257, + "ref_logps/chosen": -33.55433654785156, + "ref_logps/rejected": -35.56121063232422, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9622160196304321, + "rewards/margins": 1.4381465911865234, + "rewards/rejected": -2.400362730026245, + "step": 2120 + }, + { + "epoch": 2.0, + "grad_norm": 11.680267333984375, + "learning_rate": 1.8467995802728226e-07, + "logps/chosen": -35.333065032958984, + "logps/rejected": -59.890071868896484, + "loss": 0.2523, + "losses/dpo": 0.1491500437259674, + "losses/sft": 1.8207087516784668, + "losses/total": 0.1491500437259674, + "ref_logps/chosen": -29.129497528076172, + "ref_logps/rejected": -34.53155517578125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6203566193580627, + "rewards/margins": 1.9154951572418213, + "rewards/rejected": -2.5358517169952393, + "step": 2121 + }, + { + "epoch": 2.0, + "grad_norm": 18.440881729125977, + "learning_rate": 1.8450507170339277e-07, + "logps/chosen": -50.24641418457031, + "logps/rejected": -69.30216979980469, + "loss": 0.3082, + "losses/dpo": 0.06355299055576324, + "losses/sft": 1.8499056100845337, + "losses/total": 0.06355299055576324, + "ref_logps/chosen": -38.864601135253906, + "ref_logps/rejected": -40.225372314453125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.138181447982788, + "rewards/margins": 1.7694981098175049, + "rewards/rejected": -2.907679796218872, + "step": 2122 + }, + { + "epoch": 2.0, + "grad_norm": 12.123652458190918, + "learning_rate": 1.843301853795033e-07, + "logps/chosen": -47.44609451293945, + "logps/rejected": -98.56100463867188, + "loss": 0.1402, + "losses/dpo": 0.2347048819065094, + "losses/sft": 1.9066039323806763, + "losses/total": 0.2347048819065094, + "ref_logps/chosen": -36.865753173828125, + "ref_logps/rejected": -62.88535690307617, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.058034062385559, + "rewards/margins": 2.509530544281006, + "rewards/rejected": -3.5675647258758545, + "step": 2123 + }, + { + "epoch": 2.01, + "grad_norm": 10.315705299377441, + "learning_rate": 1.8415529905561382e-07, + "logps/chosen": -49.76079559326172, + "logps/rejected": -82.68477630615234, + "loss": 0.2081, + "losses/dpo": 0.13203921914100647, + "losses/sft": 1.5670490264892578, + "losses/total": 0.13203921914100647, + "ref_logps/chosen": -38.33154296875, + "ref_logps/rejected": -49.986724853515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1429252624511719, + "rewards/margins": 2.1268796920776367, + "rewards/rejected": -3.2698049545288086, + "step": 2124 + }, + { + "epoch": 2.01, + "grad_norm": 25.119314193725586, + "learning_rate": 1.839804127317244e-07, + "logps/chosen": -50.736183166503906, + "logps/rejected": -63.65363693237305, + "loss": 0.476, + "losses/dpo": 0.7009780406951904, + "losses/sft": 1.9088724851608276, + "losses/total": 0.7009780406951904, + "ref_logps/chosen": -34.78553771972656, + "ref_logps/rejected": -39.48884582519531, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.595064401626587, + "rewards/margins": 0.8214143514633179, + "rewards/rejected": -2.4164786338806152, + "step": 2125 + }, + { + "epoch": 2.01, + "grad_norm": 17.129047393798828, + "learning_rate": 1.838055264078349e-07, + "logps/chosen": -46.27547836303711, + "logps/rejected": -69.67829132080078, + "loss": 0.3594, + "losses/dpo": 0.6327574849128723, + "losses/sft": 1.9159936904907227, + "losses/total": 0.6327574849128723, + "ref_logps/chosen": -31.954605102539062, + "ref_logps/rejected": -41.980838775634766, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4320874214172363, + "rewards/margins": 1.3376576900482178, + "rewards/rejected": -2.769745349884033, + "step": 2126 + }, + { + "epoch": 2.01, + "grad_norm": 9.828198432922363, + "learning_rate": 1.8363064008394544e-07, + "logps/chosen": -47.61317443847656, + "logps/rejected": -81.50820922851562, + "loss": 0.1344, + "losses/dpo": 0.10161048173904419, + "losses/sft": 1.8394410610198975, + "losses/total": 0.10161048173904419, + "ref_logps/chosen": -37.03186798095703, + "ref_logps/rejected": -45.87834167480469, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0581305027008057, + "rewards/margins": 2.5048563480377197, + "rewards/rejected": -3.5629868507385254, + "step": 2127 + }, + { + "epoch": 2.01, + "grad_norm": 14.764139175415039, + "learning_rate": 1.8345575376005595e-07, + "logps/chosen": -57.10142517089844, + "logps/rejected": -86.08834075927734, + "loss": 0.2137, + "losses/dpo": 0.19319021701812744, + "losses/sft": 2.0294692516326904, + "losses/total": 0.19319021701812744, + "ref_logps/chosen": -42.53826904296875, + "ref_logps/rejected": -51.38894271850586, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.456315517425537, + "rewards/margins": 2.0136241912841797, + "rewards/rejected": -3.4699394702911377, + "step": 2128 + }, + { + "epoch": 2.01, + "grad_norm": 13.709147453308105, + "learning_rate": 1.8328086743616647e-07, + "logps/chosen": -51.933265686035156, + "logps/rejected": -71.26970672607422, + "loss": 0.2108, + "losses/dpo": 0.36864811182022095, + "losses/sft": 1.5049283504486084, + "losses/total": 0.36864811182022095, + "ref_logps/chosen": -43.06371307373047, + "ref_logps/rejected": -41.1356086730957, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.886955201625824, + "rewards/margins": 2.1264548301696777, + "rewards/rejected": -3.0134103298187256, + "step": 2129 + }, + { + "epoch": 2.01, + "grad_norm": 9.689056396484375, + "learning_rate": 1.83105981112277e-07, + "logps/chosen": -55.64209747314453, + "logps/rejected": -85.40646362304688, + "loss": 0.1437, + "losses/dpo": 0.190652996301651, + "losses/sft": 1.7045769691467285, + "losses/total": 0.190652996301651, + "ref_logps/chosen": -43.312255859375, + "ref_logps/rejected": -47.68393325805664, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2329840660095215, + "rewards/margins": 2.539268970489502, + "rewards/rejected": -3.7722532749176025, + "step": 2130 + }, + { + "epoch": 2.01, + "grad_norm": 22.669729232788086, + "learning_rate": 1.8293109478838752e-07, + "logps/chosen": -56.211883544921875, + "logps/rejected": -68.6495361328125, + "loss": 0.4304, + "losses/dpo": 0.40455734729766846, + "losses/sft": 2.158808469772339, + "losses/total": 0.40455734729766846, + "ref_logps/chosen": -40.2336311340332, + "ref_logps/rejected": -40.402957916259766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5978256464004517, + "rewards/margins": 1.2268320322036743, + "rewards/rejected": -2.824657678604126, + "step": 2131 + }, + { + "epoch": 2.01, + "grad_norm": 19.535429000854492, + "learning_rate": 1.8275620846449808e-07, + "logps/chosen": -57.2041015625, + "logps/rejected": -61.618003845214844, + "loss": 0.2841, + "losses/dpo": 0.3089786767959595, + "losses/sft": 1.9661866426467896, + "losses/total": 0.3089786767959595, + "ref_logps/chosen": -42.45734405517578, + "ref_logps/rejected": -32.064273834228516, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4746756553649902, + "rewards/margins": 1.4806971549987793, + "rewards/rejected": -2.9553728103637695, + "step": 2132 + }, + { + "epoch": 2.01, + "grad_norm": 14.503403663635254, + "learning_rate": 1.825813221406086e-07, + "logps/chosen": -41.284278869628906, + "logps/rejected": -65.96885681152344, + "loss": 0.2964, + "losses/dpo": 0.4060315489768982, + "losses/sft": 1.5218085050582886, + "losses/total": 0.4060315489768982, + "ref_logps/chosen": -33.32521438598633, + "ref_logps/rejected": -43.205238342285156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7959065437316895, + "rewards/margins": 1.4804561138153076, + "rewards/rejected": -2.276362895965576, + "step": 2133 + }, + { + "epoch": 2.02, + "grad_norm": 14.652665138244629, + "learning_rate": 1.8240643581671914e-07, + "logps/chosen": -52.136600494384766, + "logps/rejected": -74.16392517089844, + "loss": 0.2013, + "losses/dpo": 0.1409742534160614, + "losses/sft": 1.9089434146881104, + "losses/total": 0.1409742534160614, + "ref_logps/chosen": -40.16832733154297, + "ref_logps/rejected": -43.59336853027344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1968274116516113, + "rewards/margins": 1.860228419303894, + "rewards/rejected": -3.057055950164795, + "step": 2134 + }, + { + "epoch": 2.02, + "grad_norm": 20.786081314086914, + "learning_rate": 1.8223154949282965e-07, + "logps/chosen": -56.99998474121094, + "logps/rejected": -78.87982940673828, + "loss": 0.255, + "losses/dpo": 0.13895924389362335, + "losses/sft": 2.1962175369262695, + "losses/total": 0.13895924389362335, + "ref_logps/chosen": -41.885986328125, + "ref_logps/rejected": -46.557987213134766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5114002227783203, + "rewards/margins": 1.7207839488983154, + "rewards/rejected": -3.2321841716766357, + "step": 2135 + }, + { + "epoch": 2.02, + "grad_norm": 20.18816375732422, + "learning_rate": 1.820566631689402e-07, + "logps/chosen": -53.30295944213867, + "logps/rejected": -76.03685760498047, + "loss": 0.2507, + "losses/dpo": 0.23952838778495789, + "losses/sft": 1.6842877864837646, + "losses/total": 0.23952838778495789, + "ref_logps/chosen": -37.580692291259766, + "ref_logps/rejected": -43.69908905029297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5722270011901855, + "rewards/margins": 1.6615500450134277, + "rewards/rejected": -3.2337770462036133, + "step": 2136 + }, + { + "epoch": 2.02, + "grad_norm": 16.824499130249023, + "learning_rate": 1.818817768450507e-07, + "logps/chosen": -50.670997619628906, + "logps/rejected": -68.41815185546875, + "loss": 0.272, + "losses/dpo": 0.13589981198310852, + "losses/sft": 1.4028607606887817, + "losses/total": 0.13589981198310852, + "ref_logps/chosen": -38.59898376464844, + "ref_logps/rejected": -37.87329864501953, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2072012424468994, + "rewards/margins": 1.847284197807312, + "rewards/rejected": -3.054485321044922, + "step": 2137 + }, + { + "epoch": 2.02, + "grad_norm": 10.899084091186523, + "learning_rate": 1.8170689052116124e-07, + "logps/chosen": -51.74102020263672, + "logps/rejected": -79.1762466430664, + "loss": 0.1997, + "losses/dpo": 0.2777785062789917, + "losses/sft": 2.1594667434692383, + "losses/total": 0.2777785062789917, + "ref_logps/chosen": -37.96143341064453, + "ref_logps/rejected": -43.343299865722656, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3779584169387817, + "rewards/margins": 2.205335855484009, + "rewards/rejected": -3.583294153213501, + "step": 2138 + }, + { + "epoch": 2.02, + "grad_norm": 18.899595260620117, + "learning_rate": 1.8153200419727178e-07, + "logps/chosen": -65.54669189453125, + "logps/rejected": -69.53129577636719, + "loss": 0.2631, + "losses/dpo": 0.20167754590511322, + "losses/sft": 1.8834009170532227, + "losses/total": 0.20167754590511322, + "ref_logps/chosen": -51.37293243408203, + "ref_logps/rejected": -39.78549575805664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4173754453659058, + "rewards/margins": 1.5572043657302856, + "rewards/rejected": -2.9745798110961914, + "step": 2139 + }, + { + "epoch": 2.02, + "grad_norm": 18.19244384765625, + "learning_rate": 1.813571178733823e-07, + "logps/chosen": -60.87751388549805, + "logps/rejected": -95.88334655761719, + "loss": 0.2329, + "losses/dpo": 0.21829812228679657, + "losses/sft": 2.4819176197052, + "losses/total": 0.21829812228679657, + "ref_logps/chosen": -42.347862243652344, + "ref_logps/rejected": -58.603275299072266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8529653549194336, + "rewards/margins": 1.8750416040420532, + "rewards/rejected": -3.7280068397521973, + "step": 2140 + }, + { + "epoch": 2.02, + "grad_norm": 19.8094539642334, + "learning_rate": 1.8118223154949283e-07, + "logps/chosen": -49.105247497558594, + "logps/rejected": -79.30419921875, + "loss": 0.3108, + "losses/dpo": 0.5008683800697327, + "losses/sft": 2.0632882118225098, + "losses/total": 0.5008683800697327, + "ref_logps/chosen": -32.95293426513672, + "ref_logps/rejected": -48.30475616455078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6152312755584717, + "rewards/margins": 1.4847133159637451, + "rewards/rejected": -3.099944591522217, + "step": 2141 + }, + { + "epoch": 2.02, + "grad_norm": 22.921703338623047, + "learning_rate": 1.8100734522560334e-07, + "logps/chosen": -51.896026611328125, + "logps/rejected": -66.95953369140625, + "loss": 0.318, + "losses/dpo": 0.3040695786476135, + "losses/sft": 2.3770389556884766, + "losses/total": 0.3040695786476135, + "ref_logps/chosen": -37.69776916503906, + "ref_logps/rejected": -36.614402770996094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.419825792312622, + "rewards/margins": 1.614687204360962, + "rewards/rejected": -3.034513235092163, + "step": 2142 + }, + { + "epoch": 2.02, + "grad_norm": 17.001493453979492, + "learning_rate": 1.8083245890171388e-07, + "logps/chosen": -43.463356018066406, + "logps/rejected": -73.8758544921875, + "loss": 0.2646, + "losses/dpo": 0.2626388669013977, + "losses/sft": 1.924068570137024, + "losses/total": 0.2626388669013977, + "ref_logps/chosen": -28.334320068359375, + "ref_logps/rejected": -43.8366813659668, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5129038095474243, + "rewards/margins": 1.491014003753662, + "rewards/rejected": -3.003917694091797, + "step": 2143 + }, + { + "epoch": 2.02, + "grad_norm": 15.26026725769043, + "learning_rate": 1.806575725778244e-07, + "logps/chosen": -51.992671966552734, + "logps/rejected": -87.82942199707031, + "loss": 0.2071, + "losses/dpo": 0.13501951098442078, + "losses/sft": 2.489450693130493, + "losses/total": 0.13501951098442078, + "ref_logps/chosen": -35.501312255859375, + "ref_logps/rejected": -49.98847961425781, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6491355895996094, + "rewards/margins": 2.134958267211914, + "rewards/rejected": -3.7840940952301025, + "step": 2144 + }, + { + "epoch": 2.03, + "grad_norm": 18.45328140258789, + "learning_rate": 1.8048268625393493e-07, + "logps/chosen": -55.810699462890625, + "logps/rejected": -96.5757064819336, + "loss": 0.2232, + "losses/dpo": 0.2247193455696106, + "losses/sft": 1.5977500677108765, + "losses/total": 0.2247193455696106, + "ref_logps/chosen": -36.68391418457031, + "ref_logps/rejected": -55.360992431640625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9126784801483154, + "rewards/margins": 2.208792209625244, + "rewards/rejected": -4.121470928192139, + "step": 2145 + }, + { + "epoch": 2.03, + "grad_norm": 21.106897354125977, + "learning_rate": 1.8030779993004547e-07, + "logps/chosen": -56.17461013793945, + "logps/rejected": -68.20716857910156, + "loss": 0.3971, + "losses/dpo": 0.15008476376533508, + "losses/sft": 2.1184966564178467, + "losses/total": 0.15008476376533508, + "ref_logps/chosen": -36.314857482910156, + "ref_logps/rejected": -34.541873931884766, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9859752655029297, + "rewards/margins": 1.3805540800094604, + "rewards/rejected": -3.3665294647216797, + "step": 2146 + }, + { + "epoch": 2.03, + "grad_norm": 22.801605224609375, + "learning_rate": 1.8013291360615599e-07, + "logps/chosen": -54.46410369873047, + "logps/rejected": -81.90934753417969, + "loss": 0.3284, + "losses/dpo": 0.43667668104171753, + "losses/sft": 2.5421383380889893, + "losses/total": 0.43667668104171753, + "ref_logps/chosen": -37.55340576171875, + "ref_logps/rejected": -50.67803192138672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6910698413848877, + "rewards/margins": 1.4320616722106934, + "rewards/rejected": -3.123131275177002, + "step": 2147 + }, + { + "epoch": 2.03, + "grad_norm": 22.096450805664062, + "learning_rate": 1.7995802728226652e-07, + "logps/chosen": -60.6259765625, + "logps/rejected": -68.10839080810547, + "loss": 0.2962, + "losses/dpo": 0.1869063675403595, + "losses/sft": 2.394782304763794, + "losses/total": 0.1869063675403595, + "ref_logps/chosen": -41.70875549316406, + "ref_logps/rejected": -34.14208221435547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8917219638824463, + "rewards/margins": 1.504908561706543, + "rewards/rejected": -3.3966307640075684, + "step": 2148 + }, + { + "epoch": 2.03, + "grad_norm": 15.411166191101074, + "learning_rate": 1.7978314095837704e-07, + "logps/chosen": -59.88125991821289, + "logps/rejected": -90.34550476074219, + "loss": 0.1929, + "losses/dpo": 0.11514449119567871, + "losses/sft": 1.7957857847213745, + "losses/total": 0.11514449119567871, + "ref_logps/chosen": -44.15142822265625, + "ref_logps/rejected": -51.37074279785156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5729832649230957, + "rewards/margins": 2.324493169784546, + "rewards/rejected": -3.8974764347076416, + "step": 2149 + }, + { + "epoch": 2.03, + "grad_norm": 15.002555847167969, + "learning_rate": 1.7960825463448758e-07, + "logps/chosen": -54.14069747924805, + "logps/rejected": -77.42867279052734, + "loss": 0.3054, + "losses/dpo": 0.29670092463493347, + "losses/sft": 2.0884249210357666, + "losses/total": 0.29670092463493347, + "ref_logps/chosen": -36.3447265625, + "ref_logps/rejected": -43.1048469543457, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7795970439910889, + "rewards/margins": 1.6527860164642334, + "rewards/rejected": -3.4323830604553223, + "step": 2150 + }, + { + "epoch": 2.03, + "grad_norm": 18.32962417602539, + "learning_rate": 1.7943336831059812e-07, + "logps/chosen": -61.80403137207031, + "logps/rejected": -96.60762023925781, + "loss": 0.2212, + "losses/dpo": 0.5263907313346863, + "losses/sft": 2.3820464611053467, + "losses/total": 0.5263907313346863, + "ref_logps/chosen": -43.27747344970703, + "ref_logps/rejected": -55.29908752441406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8526558876037598, + "rewards/margins": 2.278197765350342, + "rewards/rejected": -4.130853652954102, + "step": 2151 + }, + { + "epoch": 2.03, + "grad_norm": 25.970869064331055, + "learning_rate": 1.7925848198670863e-07, + "logps/chosen": -58.544456481933594, + "logps/rejected": -80.18570709228516, + "loss": 0.3449, + "losses/dpo": 0.4173913598060608, + "losses/sft": 2.4565041065216064, + "losses/total": 0.4173913598060608, + "ref_logps/chosen": -38.74884033203125, + "ref_logps/rejected": -47.55836486816406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9795615673065186, + "rewards/margins": 1.2831730842590332, + "rewards/rejected": -3.2627346515655518, + "step": 2152 + }, + { + "epoch": 2.03, + "grad_norm": 23.25876808166504, + "learning_rate": 1.7908359566281917e-07, + "logps/chosen": -49.929603576660156, + "logps/rejected": -59.478755950927734, + "loss": 0.3903, + "losses/dpo": 0.45677483081817627, + "losses/sft": 1.8607100248336792, + "losses/total": 0.45677483081817627, + "ref_logps/chosen": -35.993507385253906, + "ref_logps/rejected": -31.64887237548828, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3936094045639038, + "rewards/margins": 1.3893787860870361, + "rewards/rejected": -2.7829880714416504, + "step": 2153 + }, + { + "epoch": 2.03, + "grad_norm": 16.28001594543457, + "learning_rate": 1.7890870933892968e-07, + "logps/chosen": -57.345603942871094, + "logps/rejected": -71.04620361328125, + "loss": 0.2178, + "losses/dpo": 0.14801138639450073, + "losses/sft": 1.5319006443023682, + "losses/total": 0.14801138639450073, + "ref_logps/chosen": -42.26116180419922, + "ref_logps/rejected": -38.639469146728516, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.508443832397461, + "rewards/margins": 1.7322297096252441, + "rewards/rejected": -3.240673542022705, + "step": 2154 + }, + { + "epoch": 2.03, + "grad_norm": 16.706239700317383, + "learning_rate": 1.7873382301504022e-07, + "logps/chosen": -59.98960876464844, + "logps/rejected": -82.62277221679688, + "loss": 0.3025, + "losses/dpo": 0.26598334312438965, + "losses/sft": 2.105565071105957, + "losses/total": 0.26598334312438965, + "ref_logps/chosen": -41.27506637573242, + "ref_logps/rejected": -49.014793395996094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8714544773101807, + "rewards/margins": 1.4893431663513184, + "rewards/rejected": -3.360797643661499, + "step": 2155 + }, + { + "epoch": 2.04, + "grad_norm": 10.767970085144043, + "learning_rate": 1.7855893669115073e-07, + "logps/chosen": -52.65675354003906, + "logps/rejected": -82.47964477539062, + "loss": 0.146, + "losses/dpo": 0.11722319573163986, + "losses/sft": 1.8464301824569702, + "losses/total": 0.11722319573163986, + "ref_logps/chosen": -38.047515869140625, + "ref_logps/rejected": -45.859710693359375, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4609239101409912, + "rewards/margins": 2.2010693550109863, + "rewards/rejected": -3.6619935035705566, + "step": 2156 + }, + { + "epoch": 2.04, + "grad_norm": 12.871431350708008, + "learning_rate": 1.783840503672613e-07, + "logps/chosen": -64.88345336914062, + "logps/rejected": -89.14042663574219, + "loss": 0.169, + "losses/dpo": 0.2960633635520935, + "losses/sft": 1.85858154296875, + "losses/total": 0.2960633635520935, + "ref_logps/chosen": -47.82654571533203, + "ref_logps/rejected": -50.038429260253906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7056903839111328, + "rewards/margins": 2.204509973526001, + "rewards/rejected": -3.910200595855713, + "step": 2157 + }, + { + "epoch": 2.04, + "grad_norm": 24.01546287536621, + "learning_rate": 1.782091640433718e-07, + "logps/chosen": -75.90113067626953, + "logps/rejected": -83.4007568359375, + "loss": 0.2923, + "losses/dpo": 0.31727007031440735, + "losses/sft": 2.601719856262207, + "losses/total": 0.31727007031440735, + "ref_logps/chosen": -53.00567626953125, + "ref_logps/rejected": -45.32004928588867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2895455360412598, + "rewards/margins": 1.5185257196426392, + "rewards/rejected": -3.8080711364746094, + "step": 2158 + }, + { + "epoch": 2.04, + "grad_norm": 24.344449996948242, + "learning_rate": 1.7803427771948232e-07, + "logps/chosen": -60.256370544433594, + "logps/rejected": -76.394287109375, + "loss": 0.3597, + "losses/dpo": 0.22860893607139587, + "losses/sft": 1.3986338376998901, + "losses/total": 0.22860893607139587, + "ref_logps/chosen": -41.57615661621094, + "ref_logps/rejected": -44.063560485839844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8680219650268555, + "rewards/margins": 1.365051507949829, + "rewards/rejected": -3.2330734729766846, + "step": 2159 + }, + { + "epoch": 2.04, + "grad_norm": 15.661075592041016, + "learning_rate": 1.7785939139559286e-07, + "logps/chosen": -48.448280334472656, + "logps/rejected": -78.65946197509766, + "loss": 0.1818, + "losses/dpo": 0.18213243782520294, + "losses/sft": 1.4669668674468994, + "losses/total": 0.18213243782520294, + "ref_logps/chosen": -34.93203353881836, + "ref_logps/rejected": -44.397987365722656, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3516244888305664, + "rewards/margins": 2.074523448944092, + "rewards/rejected": -3.426147937774658, + "step": 2160 + }, + { + "epoch": 2.04, + "grad_norm": 20.772863388061523, + "learning_rate": 1.7768450507170337e-07, + "logps/chosen": -63.82762145996094, + "logps/rejected": -91.49402618408203, + "loss": 0.2212, + "losses/dpo": 0.3095247745513916, + "losses/sft": 1.7047816514968872, + "losses/total": 0.3095247745513916, + "ref_logps/chosen": -43.230716705322266, + "ref_logps/rejected": -50.3248405456543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0596907138824463, + "rewards/margins": 2.057227611541748, + "rewards/rejected": -4.116918087005615, + "step": 2161 + }, + { + "epoch": 2.04, + "grad_norm": 27.207130432128906, + "learning_rate": 1.7750961874781391e-07, + "logps/chosen": -59.497711181640625, + "logps/rejected": -70.29346466064453, + "loss": 0.4447, + "losses/dpo": 0.7661054134368896, + "losses/sft": 2.3019745349884033, + "losses/total": 0.7661054134368896, + "ref_logps/chosen": -38.863189697265625, + "ref_logps/rejected": -35.97392272949219, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0634524822235107, + "rewards/margins": 1.3685017824172974, + "rewards/rejected": -3.4319543838500977, + "step": 2162 + }, + { + "epoch": 2.04, + "grad_norm": 17.928508758544922, + "learning_rate": 1.7733473242392443e-07, + "logps/chosen": -47.27583312988281, + "logps/rejected": -63.323204040527344, + "loss": 0.2901, + "losses/dpo": 0.27205145359039307, + "losses/sft": 2.077524185180664, + "losses/total": 0.27205145359039307, + "ref_logps/chosen": -29.81635284423828, + "ref_logps/rejected": -30.439565658569336, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7459475994110107, + "rewards/margins": 1.5424165725708008, + "rewards/rejected": -3.2883639335632324, + "step": 2163 + }, + { + "epoch": 2.04, + "grad_norm": 20.832271575927734, + "learning_rate": 1.77159846100035e-07, + "logps/chosen": -58.44321823120117, + "logps/rejected": -92.69669342041016, + "loss": 0.2528, + "losses/dpo": 0.5324923992156982, + "losses/sft": 2.9317092895507812, + "losses/total": 0.5324923992156982, + "ref_logps/chosen": -38.508975982666016, + "ref_logps/rejected": -52.49767303466797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.993424415588379, + "rewards/margins": 2.026477336883545, + "rewards/rejected": -4.019901752471924, + "step": 2164 + }, + { + "epoch": 2.04, + "grad_norm": 18.230239868164062, + "learning_rate": 1.769849597761455e-07, + "logps/chosen": -56.35995864868164, + "logps/rejected": -76.60353088378906, + "loss": 0.2634, + "losses/dpo": 0.2897110879421234, + "losses/sft": 1.3430176973342896, + "losses/total": 0.2897110879421234, + "ref_logps/chosen": -46.06782531738281, + "ref_logps/rejected": -47.96770095825195, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0292131900787354, + "rewards/margins": 1.8343701362609863, + "rewards/rejected": -2.8635830879211426, + "step": 2165 + }, + { + "epoch": 2.05, + "grad_norm": 17.788854598999023, + "learning_rate": 1.7681007345225602e-07, + "logps/chosen": -61.61143493652344, + "logps/rejected": -70.13768005371094, + "loss": 0.2113, + "losses/dpo": 0.0938451886177063, + "losses/sft": 1.7500905990600586, + "losses/total": 0.0938451886177063, + "ref_logps/chosen": -45.088768005371094, + "ref_logps/rejected": -35.49890899658203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6522667407989502, + "rewards/margins": 1.8116098642349243, + "rewards/rejected": -3.463876724243164, + "step": 2166 + }, + { + "epoch": 2.05, + "grad_norm": 17.123401641845703, + "learning_rate": 1.7663518712836656e-07, + "logps/chosen": -65.25563049316406, + "logps/rejected": -88.04917907714844, + "loss": 0.2419, + "losses/dpo": 0.1335296332836151, + "losses/sft": 2.229015827178955, + "losses/total": 0.1335296332836151, + "ref_logps/chosen": -47.218082427978516, + "ref_logps/rejected": -50.403404235839844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8037550449371338, + "rewards/margins": 1.960822582244873, + "rewards/rejected": -3.764577865600586, + "step": 2167 + }, + { + "epoch": 2.05, + "grad_norm": 15.332640647888184, + "learning_rate": 1.7646030080447707e-07, + "logps/chosen": -56.073631286621094, + "logps/rejected": -86.1395263671875, + "loss": 0.1611, + "losses/dpo": 0.3132442831993103, + "losses/sft": 2.0467052459716797, + "losses/total": 0.3132442831993103, + "ref_logps/chosen": -41.04229736328125, + "ref_logps/rejected": -48.011940002441406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5031332969665527, + "rewards/margins": 2.3096251487731934, + "rewards/rejected": -3.8127589225769043, + "step": 2168 + }, + { + "epoch": 2.05, + "grad_norm": 17.61914825439453, + "learning_rate": 1.762854144805876e-07, + "logps/chosen": -53.263023376464844, + "logps/rejected": -79.85975646972656, + "loss": 0.2624, + "losses/dpo": 0.1332562267780304, + "losses/sft": 2.0333592891693115, + "losses/total": 0.1332562267780304, + "ref_logps/chosen": -36.185855865478516, + "ref_logps/rejected": -45.8450813293457, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.707716941833496, + "rewards/margins": 1.6937508583068848, + "rewards/rejected": -3.401467800140381, + "step": 2169 + }, + { + "epoch": 2.05, + "grad_norm": 14.759425163269043, + "learning_rate": 1.7611052815669815e-07, + "logps/chosen": -53.97357940673828, + "logps/rejected": -79.67366027832031, + "loss": 0.1691, + "losses/dpo": 0.2415875792503357, + "losses/sft": 1.868658423423767, + "losses/total": 0.2415875792503357, + "ref_logps/chosen": -38.85692596435547, + "ref_logps/rejected": -42.45916748046875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5116651058197021, + "rewards/margins": 2.209784507751465, + "rewards/rejected": -3.721449613571167, + "step": 2170 + }, + { + "epoch": 2.05, + "grad_norm": 19.53839111328125, + "learning_rate": 1.7593564183280869e-07, + "logps/chosen": -53.13072967529297, + "logps/rejected": -66.05118560791016, + "loss": 0.2711, + "losses/dpo": 0.1980120837688446, + "losses/sft": 1.6273363828659058, + "losses/total": 0.1980120837688446, + "ref_logps/chosen": -34.85951614379883, + "ref_logps/rejected": -32.383243560791016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8271217346191406, + "rewards/margins": 1.5396722555160522, + "rewards/rejected": -3.3667941093444824, + "step": 2171 + }, + { + "epoch": 2.05, + "grad_norm": 19.09229850769043, + "learning_rate": 1.757607555089192e-07, + "logps/chosen": -52.424198150634766, + "logps/rejected": -73.0693359375, + "loss": 0.2773, + "losses/dpo": 0.15910747647285461, + "losses/sft": 1.816064476966858, + "losses/total": 0.15910747647285461, + "ref_logps/chosen": -35.33361053466797, + "ref_logps/rejected": -38.2588996887207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7090588808059692, + "rewards/margins": 1.771984577178955, + "rewards/rejected": -3.481043577194214, + "step": 2172 + }, + { + "epoch": 2.05, + "grad_norm": 14.572972297668457, + "learning_rate": 1.755858691850297e-07, + "logps/chosen": -52.50975799560547, + "logps/rejected": -74.72636413574219, + "loss": 0.2217, + "losses/dpo": 0.1505233347415924, + "losses/sft": 1.9595788717269897, + "losses/total": 0.1505233347415924, + "ref_logps/chosen": -35.77153015136719, + "ref_logps/rejected": -40.46814727783203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6738229990005493, + "rewards/margins": 1.7519986629486084, + "rewards/rejected": -3.4258217811584473, + "step": 2173 + }, + { + "epoch": 2.05, + "grad_norm": 27.026321411132812, + "learning_rate": 1.7541098286114025e-07, + "logps/chosen": -45.954315185546875, + "logps/rejected": -65.09857940673828, + "loss": 0.3316, + "losses/dpo": 0.600679337978363, + "losses/sft": 1.7634345293045044, + "losses/total": 0.600679337978363, + "ref_logps/chosen": -33.33518600463867, + "ref_logps/rejected": -37.236328125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2619129419326782, + "rewards/margins": 1.524311900138855, + "rewards/rejected": -2.786224842071533, + "step": 2174 + }, + { + "epoch": 2.05, + "grad_norm": 18.01077651977539, + "learning_rate": 1.7523609653725076e-07, + "logps/chosen": -48.266197204589844, + "logps/rejected": -92.97135162353516, + "loss": 0.1879, + "losses/dpo": 0.11259466409683228, + "losses/sft": 1.7341797351837158, + "losses/total": 0.11259466409683228, + "ref_logps/chosen": -33.4748649597168, + "ref_logps/rejected": -53.5361328125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4791333675384521, + "rewards/margins": 2.464388370513916, + "rewards/rejected": -3.9435219764709473, + "step": 2175 + }, + { + "epoch": 2.05, + "grad_norm": 27.163389205932617, + "learning_rate": 1.7506121021336133e-07, + "logps/chosen": -59.63398742675781, + "logps/rejected": -82.30131530761719, + "loss": 0.2348, + "losses/dpo": 0.13854947686195374, + "losses/sft": 1.8816542625427246, + "losses/total": 0.13854947686195374, + "ref_logps/chosen": -39.3703727722168, + "ref_logps/rejected": -42.33588409423828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0263609886169434, + "rewards/margins": 1.970181941986084, + "rewards/rejected": -3.9965429306030273, + "step": 2176 + }, + { + "epoch": 2.06, + "grad_norm": 13.131577491760254, + "learning_rate": 1.7488632388947184e-07, + "logps/chosen": -65.90166473388672, + "logps/rejected": -77.4763412475586, + "loss": 0.151, + "losses/dpo": 0.18051180243492126, + "losses/sft": 2.112351417541504, + "losses/total": 0.18051180243492126, + "ref_logps/chosen": -54.28382873535156, + "ref_logps/rejected": -45.44095230102539, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1617836952209473, + "rewards/margins": 2.041755199432373, + "rewards/rejected": -3.203538656234741, + "step": 2177 + }, + { + "epoch": 2.06, + "grad_norm": 14.968267440795898, + "learning_rate": 1.7471143756558238e-07, + "logps/chosen": -70.57845306396484, + "logps/rejected": -102.93708801269531, + "loss": 0.1333, + "losses/dpo": 0.15224751830101013, + "losses/sft": 1.9976760149002075, + "losses/total": 0.15224751830101013, + "ref_logps/chosen": -49.856834411621094, + "ref_logps/rejected": -55.65885925292969, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0721611976623535, + "rewards/margins": 2.6556618213653564, + "rewards/rejected": -4.727823257446289, + "step": 2178 + }, + { + "epoch": 2.06, + "grad_norm": 27.20005989074707, + "learning_rate": 1.745365512416929e-07, + "logps/chosen": -49.359806060791016, + "logps/rejected": -64.03297424316406, + "loss": 0.3851, + "losses/dpo": 0.2683928906917572, + "losses/sft": 1.762873649597168, + "losses/total": 0.2683928906917572, + "ref_logps/chosen": -32.485660552978516, + "ref_logps/rejected": -34.089698791503906, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.687414526939392, + "rewards/margins": 1.3069125413894653, + "rewards/rejected": -2.9943268299102783, + "step": 2179 + }, + { + "epoch": 2.06, + "grad_norm": 28.27048683166504, + "learning_rate": 1.743616649178034e-07, + "logps/chosen": -66.8674087524414, + "logps/rejected": -74.0071792602539, + "loss": 0.3318, + "losses/dpo": 0.17064473032951355, + "losses/sft": 1.7419644594192505, + "losses/total": 0.17064473032951355, + "ref_logps/chosen": -46.5545654296875, + "ref_logps/rejected": -38.44732666015625, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0312840938568115, + "rewards/margins": 1.5247018337249756, + "rewards/rejected": -3.555985927581787, + "step": 2180 + }, + { + "epoch": 2.06, + "grad_norm": 22.70830535888672, + "learning_rate": 1.7418677859391394e-07, + "logps/chosen": -53.06361770629883, + "logps/rejected": -75.39220428466797, + "loss": 0.2433, + "losses/dpo": 0.2415391504764557, + "losses/sft": 1.8355822563171387, + "losses/total": 0.2415391504764557, + "ref_logps/chosen": -36.476383209228516, + "ref_logps/rejected": -40.51774215698242, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6587237119674683, + "rewards/margins": 1.8287230730056763, + "rewards/rejected": -3.4874467849731445, + "step": 2181 + }, + { + "epoch": 2.06, + "grad_norm": 19.944292068481445, + "learning_rate": 1.7401189227002446e-07, + "logps/chosen": -58.89207077026367, + "logps/rejected": -87.72540283203125, + "loss": 0.2242, + "losses/dpo": 0.32031840085983276, + "losses/sft": 1.9566829204559326, + "losses/total": 0.32031840085983276, + "ref_logps/chosen": -41.21376037597656, + "ref_logps/rejected": -48.45860290527344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.767831563949585, + "rewards/margins": 2.158848762512207, + "rewards/rejected": -3.926680088043213, + "step": 2182 + }, + { + "epoch": 2.06, + "grad_norm": 16.069318771362305, + "learning_rate": 1.7383700594613502e-07, + "logps/chosen": -47.496742248535156, + "logps/rejected": -70.46304321289062, + "loss": 0.2476, + "losses/dpo": 0.2841242253780365, + "losses/sft": 2.312047243118286, + "losses/total": 0.2841242253780365, + "ref_logps/chosen": -32.34196472167969, + "ref_logps/rejected": -36.13676452636719, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5154778957366943, + "rewards/margins": 1.9171501398086548, + "rewards/rejected": -3.4326279163360596, + "step": 2183 + }, + { + "epoch": 2.06, + "grad_norm": 22.96808433532715, + "learning_rate": 1.7366211962224554e-07, + "logps/chosen": -59.409332275390625, + "logps/rejected": -86.48368835449219, + "loss": 0.3256, + "losses/dpo": 0.2149156630039215, + "losses/sft": 1.9455467462539673, + "losses/total": 0.2149156630039215, + "ref_logps/chosen": -39.371360778808594, + "ref_logps/rejected": -43.310508728027344, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0037968158721924, + "rewards/margins": 2.313520669937134, + "rewards/rejected": -4.317317962646484, + "step": 2184 + }, + { + "epoch": 2.06, + "grad_norm": 20.13286590576172, + "learning_rate": 1.7348723329835607e-07, + "logps/chosen": -54.029640197753906, + "logps/rejected": -86.71771240234375, + "loss": 0.1987, + "losses/dpo": 0.06851854175329208, + "losses/sft": 1.7582980394363403, + "losses/total": 0.06851854175329208, + "ref_logps/chosen": -33.388877868652344, + "ref_logps/rejected": -44.847572326660156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0640759468078613, + "rewards/margins": 2.1229381561279297, + "rewards/rejected": -4.187013626098633, + "step": 2185 + }, + { + "epoch": 2.06, + "grad_norm": 12.484709739685059, + "learning_rate": 1.733123469744666e-07, + "logps/chosen": -69.00020599365234, + "logps/rejected": -92.46781921386719, + "loss": 0.1367, + "losses/dpo": 0.04129623621702194, + "losses/sft": 1.7258503437042236, + "losses/total": 0.04129623621702194, + "ref_logps/chosen": -46.61714172363281, + "ref_logps/rejected": -46.315940856933594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2383062839508057, + "rewards/margins": 2.3768811225891113, + "rewards/rejected": -4.615187644958496, + "step": 2186 + }, + { + "epoch": 2.07, + "grad_norm": 21.23772430419922, + "learning_rate": 1.731374606505771e-07, + "logps/chosen": -53.44989776611328, + "logps/rejected": -92.73365783691406, + "loss": 0.2705, + "losses/dpo": 0.08861647546291351, + "losses/sft": 2.0424864292144775, + "losses/total": 0.08861647546291351, + "ref_logps/chosen": -36.1646728515625, + "ref_logps/rejected": -48.25510025024414, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7285223007202148, + "rewards/margins": 2.7193336486816406, + "rewards/rejected": -4.4478559494018555, + "step": 2187 + }, + { + "epoch": 2.07, + "grad_norm": 33.41069793701172, + "learning_rate": 1.7296257432668764e-07, + "logps/chosen": -60.85960006713867, + "logps/rejected": -75.24000549316406, + "loss": 0.3661, + "losses/dpo": 0.09842763841152191, + "losses/sft": 1.9827691316604614, + "losses/total": 0.09842763841152191, + "ref_logps/chosen": -38.05207824707031, + "ref_logps/rejected": -38.01402282714844, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.280752182006836, + "rewards/margins": 1.4418456554412842, + "rewards/rejected": -3.722598075866699, + "step": 2188 + }, + { + "epoch": 2.07, + "grad_norm": 13.806150436401367, + "learning_rate": 1.7278768800279818e-07, + "logps/chosen": -55.86123275756836, + "logps/rejected": -85.80702209472656, + "loss": 0.1695, + "losses/dpo": 0.24726992845535278, + "losses/sft": 1.9747188091278076, + "losses/total": 0.24726992845535278, + "ref_logps/chosen": -35.86094665527344, + "ref_logps/rejected": -44.97712326049805, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.000028133392334, + "rewards/margins": 2.0829615592956543, + "rewards/rejected": -4.0829901695251465, + "step": 2189 + }, + { + "epoch": 2.07, + "grad_norm": 14.162352561950684, + "learning_rate": 1.7261280167890872e-07, + "logps/chosen": -58.23813247680664, + "logps/rejected": -93.10269165039062, + "loss": 0.1762, + "losses/dpo": 0.07838228344917297, + "losses/sft": 2.18127703666687, + "losses/total": 0.07838228344917297, + "ref_logps/chosen": -40.983055114746094, + "ref_logps/rejected": -51.26152801513672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7255079746246338, + "rewards/margins": 2.4586076736450195, + "rewards/rejected": -4.184115886688232, + "step": 2190 + }, + { + "epoch": 2.07, + "grad_norm": 26.535694122314453, + "learning_rate": 1.7243791535501923e-07, + "logps/chosen": -55.426780700683594, + "logps/rejected": -79.62908935546875, + "loss": 0.3449, + "losses/dpo": 0.19528482854366302, + "losses/sft": 1.7993446588516235, + "losses/total": 0.19528482854366302, + "ref_logps/chosen": -37.25511169433594, + "ref_logps/rejected": -43.912296295166016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8171671628952026, + "rewards/margins": 1.7545119524002075, + "rewards/rejected": -3.57167911529541, + "step": 2191 + }, + { + "epoch": 2.07, + "grad_norm": 24.744495391845703, + "learning_rate": 1.7226302903112977e-07, + "logps/chosen": -53.65546417236328, + "logps/rejected": -85.93426513671875, + "loss": 0.3072, + "losses/dpo": 0.4467526972293854, + "losses/sft": 2.5786640644073486, + "losses/total": 0.4467526972293854, + "ref_logps/chosen": -33.4858283996582, + "ref_logps/rejected": -45.773956298828125, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.016963481903076, + "rewards/margins": 1.999066948890686, + "rewards/rejected": -4.016030311584473, + "step": 2192 + }, + { + "epoch": 2.07, + "grad_norm": 23.44725227355957, + "learning_rate": 1.7208814270724028e-07, + "logps/chosen": -53.96575927734375, + "logps/rejected": -87.75813293457031, + "loss": 0.2393, + "losses/dpo": 0.07793407142162323, + "losses/sft": 1.6732468605041504, + "losses/total": 0.07793407142162323, + "ref_logps/chosen": -37.87188720703125, + "ref_logps/rejected": -51.79331970214844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6093873977661133, + "rewards/margins": 1.9870942831039429, + "rewards/rejected": -3.5964813232421875, + "step": 2193 + }, + { + "epoch": 2.07, + "grad_norm": 24.691877365112305, + "learning_rate": 1.719132563833508e-07, + "logps/chosen": -55.128334045410156, + "logps/rejected": -85.45429992675781, + "loss": 0.3788, + "losses/dpo": 0.9169511795043945, + "losses/sft": 2.684892177581787, + "losses/total": 0.9169511795043945, + "ref_logps/chosen": -33.98554229736328, + "ref_logps/rejected": -42.699195861816406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.114279270172119, + "rewards/margins": 2.1612305641174316, + "rewards/rejected": -4.275509834289551, + "step": 2194 + }, + { + "epoch": 2.07, + "grad_norm": 18.590097427368164, + "learning_rate": 1.7173837005946133e-07, + "logps/chosen": -42.8539924621582, + "logps/rejected": -72.0229721069336, + "loss": 0.2931, + "losses/dpo": 0.15254491567611694, + "losses/sft": 2.111268997192383, + "losses/total": 0.15254491567611694, + "ref_logps/chosen": -27.071752548217773, + "ref_logps/rejected": -38.403770446777344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5782239437103271, + "rewards/margins": 1.783695936203003, + "rewards/rejected": -3.36191987991333, + "step": 2195 + }, + { + "epoch": 2.07, + "grad_norm": 21.303430557250977, + "learning_rate": 1.7156348373557187e-07, + "logps/chosen": -56.064388275146484, + "logps/rejected": -73.31047058105469, + "loss": 0.2569, + "losses/dpo": 0.05567019432783127, + "losses/sft": 1.4970711469650269, + "losses/total": 0.05567019432783127, + "ref_logps/chosen": -41.055992126464844, + "ref_logps/rejected": -38.608367919921875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5008397102355957, + "rewards/margins": 1.9693713188171387, + "rewards/rejected": -3.4702110290527344, + "step": 2196 + }, + { + "epoch": 2.07, + "grad_norm": 24.07655143737793, + "learning_rate": 1.713885974116824e-07, + "logps/chosen": -52.34514617919922, + "logps/rejected": -80.76176452636719, + "loss": 0.2951, + "losses/dpo": 0.19542747735977173, + "losses/sft": 1.5525329113006592, + "losses/total": 0.19542747735977173, + "ref_logps/chosen": -34.239925384521484, + "ref_logps/rejected": -45.56114196777344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8105223178863525, + "rewards/margins": 1.7095403671264648, + "rewards/rejected": -3.5200626850128174, + "step": 2197 + }, + { + "epoch": 2.08, + "grad_norm": 22.42549705505371, + "learning_rate": 1.7121371108779292e-07, + "logps/chosen": -54.095611572265625, + "logps/rejected": -80.62397766113281, + "loss": 0.234, + "losses/dpo": 0.21732285618782043, + "losses/sft": 2.2127344608306885, + "losses/total": 0.21732285618782043, + "ref_logps/chosen": -37.54641342163086, + "ref_logps/rejected": -45.050636291503906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6549193859100342, + "rewards/margins": 1.9024146795272827, + "rewards/rejected": -3.5573339462280273, + "step": 2198 + }, + { + "epoch": 2.08, + "grad_norm": 20.114450454711914, + "learning_rate": 1.7103882476390346e-07, + "logps/chosen": -53.59613037109375, + "logps/rejected": -94.48097229003906, + "loss": 0.1915, + "losses/dpo": 0.1815064698457718, + "losses/sft": 2.0497143268585205, + "losses/total": 0.1815064698457718, + "ref_logps/chosen": -37.92587661743164, + "ref_logps/rejected": -53.89605712890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.567025065422058, + "rewards/margins": 2.4914662837982178, + "rewards/rejected": -4.058491230010986, + "step": 2199 + }, + { + "epoch": 2.08, + "grad_norm": 13.921630859375, + "learning_rate": 1.7086393844001398e-07, + "logps/chosen": -65.02278900146484, + "logps/rejected": -94.15165710449219, + "loss": 0.1687, + "losses/dpo": 0.11342591047286987, + "losses/sft": 1.8795005083084106, + "losses/total": 0.11342591047286987, + "ref_logps/chosen": -47.23157501220703, + "ref_logps/rejected": -52.740516662597656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.779120922088623, + "rewards/margins": 2.361992835998535, + "rewards/rejected": -4.141113758087158, + "step": 2200 + }, + { + "epoch": 2.08, + "grad_norm": 21.20457649230957, + "learning_rate": 1.706890521161245e-07, + "logps/chosen": -59.54458236694336, + "logps/rejected": -89.32866668701172, + "loss": 0.2722, + "losses/dpo": 0.10039615631103516, + "losses/sft": 2.3329107761383057, + "losses/total": 0.10039615631103516, + "ref_logps/chosen": -38.11788558959961, + "ref_logps/rejected": -48.12678909301758, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.142669916152954, + "rewards/margins": 1.97751784324646, + "rewards/rejected": -4.120187759399414, + "step": 2201 + }, + { + "epoch": 2.08, + "grad_norm": 13.924773216247559, + "learning_rate": 1.7051416579223505e-07, + "logps/chosen": -53.46916198730469, + "logps/rejected": -84.70620727539062, + "loss": 0.1943, + "losses/dpo": 0.16763654351234436, + "losses/sft": 2.394869327545166, + "losses/total": 0.16763654351234436, + "ref_logps/chosen": -31.43370819091797, + "ref_logps/rejected": -40.78105926513672, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.203545093536377, + "rewards/margins": 2.1889700889587402, + "rewards/rejected": -4.392515182495117, + "step": 2202 + }, + { + "epoch": 2.08, + "grad_norm": 23.022014617919922, + "learning_rate": 1.7033927946834557e-07, + "logps/chosen": -57.59918212890625, + "logps/rejected": -76.6629409790039, + "loss": 0.2921, + "losses/dpo": 0.47700318694114685, + "losses/sft": 1.7552341222763062, + "losses/total": 0.47700318694114685, + "ref_logps/chosen": -38.358924865722656, + "ref_logps/rejected": -40.03480911254883, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9240257740020752, + "rewards/margins": 1.7387876510620117, + "rewards/rejected": -3.662813425064087, + "step": 2203 + }, + { + "epoch": 2.08, + "grad_norm": 19.173295974731445, + "learning_rate": 1.701643931444561e-07, + "logps/chosen": -57.08106994628906, + "logps/rejected": -85.2384033203125, + "loss": 0.2732, + "losses/dpo": 0.09806828200817108, + "losses/sft": 2.107323169708252, + "losses/total": 0.09806828200817108, + "ref_logps/chosen": -35.34751892089844, + "ref_logps/rejected": -44.77887725830078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1733551025390625, + "rewards/margins": 1.8725978136062622, + "rewards/rejected": -4.045952796936035, + "step": 2204 + }, + { + "epoch": 2.08, + "grad_norm": 28.111894607543945, + "learning_rate": 1.6998950682056662e-07, + "logps/chosen": -55.997066497802734, + "logps/rejected": -73.00859069824219, + "loss": 0.3705, + "losses/dpo": 0.20976343750953674, + "losses/sft": 2.3299331665039062, + "losses/total": 0.20976343750953674, + "ref_logps/chosen": -36.010276794433594, + "ref_logps/rejected": -41.71709442138672, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9986789226531982, + "rewards/margins": 1.1304705142974854, + "rewards/rejected": -3.1291494369506836, + "step": 2205 + }, + { + "epoch": 2.08, + "grad_norm": 27.258811950683594, + "learning_rate": 1.6981462049667716e-07, + "logps/chosen": -63.16986083984375, + "logps/rejected": -98.24568939208984, + "loss": 0.222, + "losses/dpo": 0.19202932715415955, + "losses/sft": 2.4920687675476074, + "losses/total": 0.19202932715415955, + "ref_logps/chosen": -39.95744323730469, + "ref_logps/rejected": -51.04297637939453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.321241855621338, + "rewards/margins": 2.39902925491333, + "rewards/rejected": -4.720271110534668, + "step": 2206 + }, + { + "epoch": 2.08, + "grad_norm": 19.09151268005371, + "learning_rate": 1.6963973417278767e-07, + "logps/chosen": -62.666236877441406, + "logps/rejected": -93.16124725341797, + "loss": 0.186, + "losses/dpo": 0.10941722989082336, + "losses/sft": 1.7997444868087769, + "losses/total": 0.10941722989082336, + "ref_logps/chosen": -40.200965881347656, + "ref_logps/rejected": -48.780181884765625, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2465267181396484, + "rewards/margins": 2.191579818725586, + "rewards/rejected": -4.438106536865234, + "step": 2207 + }, + { + "epoch": 2.08, + "grad_norm": 16.559520721435547, + "learning_rate": 1.6946484784889818e-07, + "logps/chosen": -56.985877990722656, + "logps/rejected": -87.59817504882812, + "loss": 0.1791, + "losses/dpo": 0.21834111213684082, + "losses/sft": 1.5174407958984375, + "losses/total": 0.21834111213684082, + "ref_logps/chosen": -35.917991638183594, + "ref_logps/rejected": -43.31269073486328, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.106788396835327, + "rewards/margins": 2.321760654449463, + "rewards/rejected": -4.428549289703369, + "step": 2208 + }, + { + "epoch": 2.09, + "grad_norm": 17.215805053710938, + "learning_rate": 1.6928996152500875e-07, + "logps/chosen": -45.57110595703125, + "logps/rejected": -71.7991943359375, + "loss": 0.2052, + "losses/dpo": 0.3538817763328552, + "losses/sft": 1.5875787734985352, + "losses/total": 0.3538817763328552, + "ref_logps/chosen": -30.913644790649414, + "ref_logps/rejected": -34.5597038269043, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4657464027404785, + "rewards/margins": 2.258202314376831, + "rewards/rejected": -3.7239487171173096, + "step": 2209 + }, + { + "epoch": 2.09, + "grad_norm": 8.342625617980957, + "learning_rate": 1.6911507520111926e-07, + "logps/chosen": -61.34052658081055, + "logps/rejected": -111.30182647705078, + "loss": 0.083, + "losses/dpo": 0.08819499611854553, + "losses/sft": 1.736279010772705, + "losses/total": 0.08819499611854553, + "ref_logps/chosen": -41.58523178100586, + "ref_logps/rejected": -60.833126068115234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.975529670715332, + "rewards/margins": 3.071340560913086, + "rewards/rejected": -5.04686975479126, + "step": 2210 + }, + { + "epoch": 2.09, + "grad_norm": 14.353584289550781, + "learning_rate": 1.689401888772298e-07, + "logps/chosen": -57.32757568359375, + "logps/rejected": -83.46308898925781, + "loss": 0.1248, + "losses/dpo": 0.10038065910339355, + "losses/sft": 1.7414997816085815, + "losses/total": 0.10038065910339355, + "ref_logps/chosen": -38.945091247558594, + "ref_logps/rejected": -41.35407257080078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8382482528686523, + "rewards/margins": 2.372653007507324, + "rewards/rejected": -4.210901260375977, + "step": 2211 + }, + { + "epoch": 2.09, + "grad_norm": 15.132885932922363, + "learning_rate": 1.6876530255334031e-07, + "logps/chosen": -48.53065490722656, + "logps/rejected": -79.5280990600586, + "loss": 0.1678, + "losses/dpo": 0.06708650290966034, + "losses/sft": 2.090761661529541, + "losses/total": 0.06708650290966034, + "ref_logps/chosen": -32.15243148803711, + "ref_logps/rejected": -40.45890808105469, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6378223896026611, + "rewards/margins": 2.269096612930298, + "rewards/rejected": -3.906919002532959, + "step": 2212 + }, + { + "epoch": 2.09, + "grad_norm": 24.768259048461914, + "learning_rate": 1.6859041622945085e-07, + "logps/chosen": -53.58648681640625, + "logps/rejected": -76.79092407226562, + "loss": 0.2226, + "losses/dpo": 0.31338661909103394, + "losses/sft": 2.219318389892578, + "losses/total": 0.31338661909103394, + "ref_logps/chosen": -32.16387176513672, + "ref_logps/rejected": -37.08013153076172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1422619819641113, + "rewards/margins": 1.8288161754608154, + "rewards/rejected": -3.9710781574249268, + "step": 2213 + }, + { + "epoch": 2.09, + "grad_norm": 18.416728973388672, + "learning_rate": 1.6841552990556137e-07, + "logps/chosen": -55.35419845581055, + "logps/rejected": -78.583984375, + "loss": 0.2559, + "losses/dpo": 0.12346776574850082, + "losses/sft": 2.094236373901367, + "losses/total": 0.12346776574850082, + "ref_logps/chosen": -37.566490173339844, + "ref_logps/rejected": -40.12315368652344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7787704467773438, + "rewards/margins": 2.067312717437744, + "rewards/rejected": -3.846083164215088, + "step": 2214 + }, + { + "epoch": 2.09, + "grad_norm": 20.469013214111328, + "learning_rate": 1.682406435816719e-07, + "logps/chosen": -72.93456268310547, + "logps/rejected": -98.66339874267578, + "loss": 0.2145, + "losses/dpo": 0.22982379794120789, + "losses/sft": 2.3891358375549316, + "losses/total": 0.22982379794120789, + "ref_logps/chosen": -45.680511474609375, + "ref_logps/rejected": -49.3587646484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.72540545463562, + "rewards/margins": 2.2050580978393555, + "rewards/rejected": -4.9304633140563965, + "step": 2215 + }, + { + "epoch": 2.09, + "grad_norm": 21.647539138793945, + "learning_rate": 1.6806575725778244e-07, + "logps/chosen": -49.575714111328125, + "logps/rejected": -80.34425354003906, + "loss": 0.239, + "losses/dpo": 0.13555070757865906, + "losses/sft": 2.7519843578338623, + "losses/total": 0.13555070757865906, + "ref_logps/chosen": -31.77227783203125, + "ref_logps/rejected": -41.82489776611328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7803436517715454, + "rewards/margins": 2.071591854095459, + "rewards/rejected": -3.851935863494873, + "step": 2216 + }, + { + "epoch": 2.09, + "grad_norm": 18.783519744873047, + "learning_rate": 1.6789087093389296e-07, + "logps/chosen": -42.88027572631836, + "logps/rejected": -84.32542419433594, + "loss": 0.2282, + "losses/dpo": 0.16614983975887299, + "losses/sft": 1.6410008668899536, + "losses/total": 0.16614983975887299, + "ref_logps/chosen": -26.285659790039062, + "ref_logps/rejected": -43.653663635253906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.659461498260498, + "rewards/margins": 2.4077138900756836, + "rewards/rejected": -4.06717586517334, + "step": 2217 + }, + { + "epoch": 2.09, + "grad_norm": 27.009485244750977, + "learning_rate": 1.677159846100035e-07, + "logps/chosen": -59.341453552246094, + "logps/rejected": -77.44983673095703, + "loss": 0.3232, + "losses/dpo": 0.41764286160469055, + "losses/sft": 2.077141761779785, + "losses/total": 0.41764286160469055, + "ref_logps/chosen": -37.908653259277344, + "ref_logps/rejected": -39.44845199584961, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.143280029296875, + "rewards/margins": 1.6568586826324463, + "rewards/rejected": -3.8001389503479004, + "step": 2218 + }, + { + "epoch": 2.1, + "grad_norm": 26.723899841308594, + "learning_rate": 1.67541098286114e-07, + "logps/chosen": -69.6722640991211, + "logps/rejected": -91.57075500488281, + "loss": 0.3215, + "losses/dpo": 0.11519400030374527, + "losses/sft": 2.6588547229766846, + "losses/total": 0.11519400030374527, + "ref_logps/chosen": -41.187618255615234, + "ref_logps/rejected": -47.43374252319336, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8484644889831543, + "rewards/margins": 1.5652365684509277, + "rewards/rejected": -4.413701057434082, + "step": 2219 + }, + { + "epoch": 2.1, + "grad_norm": 18.43507194519043, + "learning_rate": 1.6736621196222455e-07, + "logps/chosen": -63.70801544189453, + "logps/rejected": -100.65912628173828, + "loss": 0.1566, + "losses/dpo": 0.28203755617141724, + "losses/sft": 1.9518189430236816, + "losses/total": 0.28203755617141724, + "ref_logps/chosen": -43.11907196044922, + "ref_logps/rejected": -53.56328582763672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.058894395828247, + "rewards/margins": 2.6506900787353516, + "rewards/rejected": -4.709585189819336, + "step": 2220 + }, + { + "epoch": 2.1, + "grad_norm": 27.70952796936035, + "learning_rate": 1.6719132563833509e-07, + "logps/chosen": -55.762664794921875, + "logps/rejected": -93.76734924316406, + "loss": 0.3236, + "losses/dpo": 0.7263743281364441, + "losses/sft": 3.2946126461029053, + "losses/total": 0.7263743281364441, + "ref_logps/chosen": -32.23370361328125, + "ref_logps/rejected": -50.9365348815918, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.352896213531494, + "rewards/margins": 1.9301854372024536, + "rewards/rejected": -4.283081531524658, + "step": 2221 + }, + { + "epoch": 2.1, + "grad_norm": 19.451852798461914, + "learning_rate": 1.670164393144456e-07, + "logps/chosen": -53.164573669433594, + "logps/rejected": -83.31100463867188, + "loss": 0.2098, + "losses/dpo": 0.13225436210632324, + "losses/sft": 1.889532208442688, + "losses/total": 0.13225436210632324, + "ref_logps/chosen": -35.533294677734375, + "ref_logps/rejected": -42.04703140258789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7631275653839111, + "rewards/margins": 2.3632702827453613, + "rewards/rejected": -4.126398086547852, + "step": 2222 + }, + { + "epoch": 2.1, + "grad_norm": 25.091676712036133, + "learning_rate": 1.6684155299055614e-07, + "logps/chosen": -53.45108413696289, + "logps/rejected": -78.944580078125, + "loss": 0.317, + "losses/dpo": 0.30559539794921875, + "losses/sft": 2.2687339782714844, + "losses/total": 0.30559539794921875, + "ref_logps/chosen": -34.379608154296875, + "ref_logps/rejected": -40.762237548828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9071475267410278, + "rewards/margins": 1.9110870361328125, + "rewards/rejected": -3.81823468208313, + "step": 2223 + }, + { + "epoch": 2.1, + "grad_norm": 15.996298789978027, + "learning_rate": 1.6666666666666665e-07, + "logps/chosen": -53.738426208496094, + "logps/rejected": -82.813720703125, + "loss": 0.1778, + "losses/dpo": 0.152542382478714, + "losses/sft": 1.60818612575531, + "losses/total": 0.152542382478714, + "ref_logps/chosen": -34.05722427368164, + "ref_logps/rejected": -41.73652648925781, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9681199789047241, + "rewards/margins": 2.1395998001098633, + "rewards/rejected": -4.107719421386719, + "step": 2224 + }, + { + "epoch": 2.1, + "grad_norm": 36.061119079589844, + "learning_rate": 1.664917803427772e-07, + "logps/chosen": -61.44285202026367, + "logps/rejected": -101.45341491699219, + "loss": 0.3586, + "losses/dpo": 0.6123057007789612, + "losses/sft": 2.1946911811828613, + "losses/total": 0.6123057007789612, + "ref_logps/chosen": -39.097747802734375, + "ref_logps/rejected": -60.448699951171875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.234510660171509, + "rewards/margins": 1.8659616708755493, + "rewards/rejected": -4.100472450256348, + "step": 2225 + }, + { + "epoch": 2.1, + "grad_norm": 33.13676071166992, + "learning_rate": 1.663168940188877e-07, + "logps/chosen": -65.36542510986328, + "logps/rejected": -83.72738647460938, + "loss": 0.2638, + "losses/dpo": 0.5983446836471558, + "losses/sft": 1.8281232118606567, + "losses/total": 0.5983446836471558, + "ref_logps/chosen": -45.387176513671875, + "ref_logps/rejected": -42.81666564941406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9978246688842773, + "rewards/margins": 2.093247413635254, + "rewards/rejected": -4.091072082519531, + "step": 2226 + }, + { + "epoch": 2.1, + "grad_norm": 14.86474609375, + "learning_rate": 1.6614200769499824e-07, + "logps/chosen": -67.95303344726562, + "logps/rejected": -97.45545959472656, + "loss": 0.1262, + "losses/dpo": 0.13807989656925201, + "losses/sft": 2.1071507930755615, + "losses/total": 0.13807989656925201, + "ref_logps/chosen": -43.59563446044922, + "ref_logps/rejected": -46.4262580871582, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.435739517211914, + "rewards/margins": 2.667180299758911, + "rewards/rejected": -5.102920055389404, + "step": 2227 + }, + { + "epoch": 2.1, + "grad_norm": 23.756662368774414, + "learning_rate": 1.6596712137110878e-07, + "logps/chosen": -56.06706237792969, + "logps/rejected": -78.91806030273438, + "loss": 0.2815, + "losses/dpo": 0.2714054584503174, + "losses/sft": 1.6447714567184448, + "losses/total": 0.2714054584503174, + "ref_logps/chosen": -38.203189849853516, + "ref_logps/rejected": -41.72379684448242, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7863867282867432, + "rewards/margins": 1.9330400228500366, + "rewards/rejected": -3.7194266319274902, + "step": 2228 + }, + { + "epoch": 2.1, + "grad_norm": 40.492034912109375, + "learning_rate": 1.657922350472193e-07, + "logps/chosen": -68.50590515136719, + "logps/rejected": -88.46195220947266, + "loss": 0.541, + "losses/dpo": 0.8684618473052979, + "losses/sft": 3.044482946395874, + "losses/total": 0.8684618473052979, + "ref_logps/chosen": -44.93730926513672, + "ref_logps/rejected": -46.64177322387695, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3568596839904785, + "rewards/margins": 1.8251579999923706, + "rewards/rejected": -4.182017803192139, + "step": 2229 + }, + { + "epoch": 2.11, + "grad_norm": 30.27342414855957, + "learning_rate": 1.6561734872332983e-07, + "logps/chosen": -54.731815338134766, + "logps/rejected": -78.69754028320312, + "loss": 0.2927, + "losses/dpo": 0.08893805742263794, + "losses/sft": 2.030031442642212, + "losses/total": 0.08893805742263794, + "ref_logps/chosen": -36.54650115966797, + "ref_logps/rejected": -40.523658752441406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8185312747955322, + "rewards/margins": 1.9988573789596558, + "rewards/rejected": -3.8173885345458984, + "step": 2230 + }, + { + "epoch": 2.11, + "grad_norm": 18.653667449951172, + "learning_rate": 1.6544246239944034e-07, + "logps/chosen": -57.56175994873047, + "logps/rejected": -84.8160400390625, + "loss": 0.2574, + "losses/dpo": 0.560514509677887, + "losses/sft": 1.7918487787246704, + "losses/total": 0.560514509677887, + "ref_logps/chosen": -39.770137786865234, + "ref_logps/rejected": -47.24848937988281, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7791621685028076, + "rewards/margins": 1.977592945098877, + "rewards/rejected": -3.7567548751831055, + "step": 2231 + }, + { + "epoch": 2.11, + "grad_norm": 14.561543464660645, + "learning_rate": 1.6526757607555088e-07, + "logps/chosen": -42.29571533203125, + "logps/rejected": -69.97169494628906, + "loss": 0.1708, + "losses/dpo": 0.30205655097961426, + "losses/sft": 2.2472383975982666, + "losses/total": 0.30205655097961426, + "ref_logps/chosen": -29.076812744140625, + "ref_logps/rejected": -33.68364715576172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.321890115737915, + "rewards/margins": 2.306915044784546, + "rewards/rejected": -3.628805160522461, + "step": 2232 + }, + { + "epoch": 2.11, + "grad_norm": 20.32581901550293, + "learning_rate": 1.650926897516614e-07, + "logps/chosen": -49.80039978027344, + "logps/rejected": -70.1690902709961, + "loss": 0.2397, + "losses/dpo": 0.2494000494480133, + "losses/sft": 1.9949127435684204, + "losses/total": 0.2494000494480133, + "ref_logps/chosen": -34.77843475341797, + "ref_logps/rejected": -38.26068878173828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5021963119506836, + "rewards/margins": 1.6886440515518188, + "rewards/rejected": -3.190840482711792, + "step": 2233 + }, + { + "epoch": 2.11, + "grad_norm": 18.533443450927734, + "learning_rate": 1.6491780342777196e-07, + "logps/chosen": -75.72285461425781, + "logps/rejected": -93.07483673095703, + "loss": 0.1894, + "losses/dpo": 0.14606128633022308, + "losses/sft": 2.5543980598449707, + "losses/total": 0.14606128633022308, + "ref_logps/chosen": -51.62957000732422, + "ref_logps/rejected": -47.83635711669922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.409327983856201, + "rewards/margins": 2.11452054977417, + "rewards/rejected": -4.523848533630371, + "step": 2234 + }, + { + "epoch": 2.11, + "grad_norm": 17.314498901367188, + "learning_rate": 1.6474291710388247e-07, + "logps/chosen": -54.58754348754883, + "logps/rejected": -100.722900390625, + "loss": 0.1374, + "losses/dpo": 0.12343157827854156, + "losses/sft": 2.27484130859375, + "losses/total": 0.12343157827854156, + "ref_logps/chosen": -34.945777893066406, + "ref_logps/rejected": -53.31879425048828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9641767740249634, + "rewards/margins": 2.7762346267700195, + "rewards/rejected": -4.740411758422852, + "step": 2235 + }, + { + "epoch": 2.11, + "grad_norm": 23.696046829223633, + "learning_rate": 1.64568030779993e-07, + "logps/chosen": -49.654815673828125, + "logps/rejected": -80.75139617919922, + "loss": 0.2483, + "losses/dpo": 0.24056974053382874, + "losses/sft": 1.8114200830459595, + "losses/total": 0.24056974053382874, + "ref_logps/chosen": -30.18017578125, + "ref_logps/rejected": -42.11722946166992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.947464108467102, + "rewards/margins": 1.9159528017044067, + "rewards/rejected": -3.8634166717529297, + "step": 2236 + }, + { + "epoch": 2.11, + "grad_norm": 18.896560668945312, + "learning_rate": 1.6439314445610353e-07, + "logps/chosen": -55.08682632446289, + "logps/rejected": -81.15058898925781, + "loss": 0.2281, + "losses/dpo": 0.13519428670406342, + "losses/sft": 1.9493145942687988, + "losses/total": 0.13519428670406342, + "ref_logps/chosen": -36.99998092651367, + "ref_logps/rejected": -43.503700256347656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8086843490600586, + "rewards/margins": 1.956005334854126, + "rewards/rejected": -3.7646899223327637, + "step": 2237 + }, + { + "epoch": 2.11, + "grad_norm": 21.233287811279297, + "learning_rate": 1.6421825813221404e-07, + "logps/chosen": -67.05801391601562, + "logps/rejected": -84.22810363769531, + "loss": 0.2588, + "losses/dpo": 0.3462636172771454, + "losses/sft": 2.207815170288086, + "losses/total": 0.3462636172771454, + "ref_logps/chosen": -47.680606842041016, + "ref_logps/rejected": -43.420562744140625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9377408027648926, + "rewards/margins": 2.143012523651123, + "rewards/rejected": -4.080753326416016, + "step": 2238 + }, + { + "epoch": 2.11, + "grad_norm": 26.304241180419922, + "learning_rate": 1.6404337180832458e-07, + "logps/chosen": -50.78749084472656, + "logps/rejected": -84.84367370605469, + "loss": 0.2854, + "losses/dpo": 0.4090076684951782, + "losses/sft": 1.5576115846633911, + "losses/total": 0.4090076684951782, + "ref_logps/chosen": -33.83174514770508, + "ref_logps/rejected": -45.708221435546875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6955742835998535, + "rewards/margins": 2.2179718017578125, + "rewards/rejected": -3.913546085357666, + "step": 2239 + }, + { + "epoch": 2.12, + "grad_norm": 19.78229522705078, + "learning_rate": 1.638684854844351e-07, + "logps/chosen": -60.204002380371094, + "logps/rejected": -91.46977233886719, + "loss": 0.2501, + "losses/dpo": 0.09898929297924042, + "losses/sft": 2.0589511394500732, + "losses/total": 0.09898929297924042, + "ref_logps/chosen": -39.20741653442383, + "ref_logps/rejected": -47.30803680419922, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.099658250808716, + "rewards/margins": 2.3165149688720703, + "rewards/rejected": -4.416172981262207, + "step": 2240 + }, + { + "epoch": 2.12, + "grad_norm": 21.187986373901367, + "learning_rate": 1.6369359916054566e-07, + "logps/chosen": -51.58329772949219, + "logps/rejected": -97.73701477050781, + "loss": 0.266, + "losses/dpo": 0.1460612267255783, + "losses/sft": 2.5049760341644287, + "losses/total": 0.1460612267255783, + "ref_logps/chosen": -35.03326416015625, + "ref_logps/rejected": -55.8988037109375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.655003309249878, + "rewards/margins": 2.528817892074585, + "rewards/rejected": -4.183821201324463, + "step": 2241 + }, + { + "epoch": 2.12, + "grad_norm": 14.329180717468262, + "learning_rate": 1.6351871283665617e-07, + "logps/chosen": -44.3607177734375, + "logps/rejected": -79.14237213134766, + "loss": 0.1686, + "losses/dpo": 0.26841437816619873, + "losses/sft": 1.3265010118484497, + "losses/total": 0.26841437816619873, + "ref_logps/chosen": -27.848522186279297, + "ref_logps/rejected": -39.2000732421875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6512196063995361, + "rewards/margins": 2.343010425567627, + "rewards/rejected": -3.994230270385742, + "step": 2242 + }, + { + "epoch": 2.12, + "grad_norm": 22.371397018432617, + "learning_rate": 1.633438265127667e-07, + "logps/chosen": -45.22007751464844, + "logps/rejected": -79.30278778076172, + "loss": 0.2533, + "losses/dpo": 0.1291871964931488, + "losses/sft": 1.2261956930160522, + "losses/total": 0.1291871964931488, + "ref_logps/chosen": -31.777246475219727, + "ref_logps/rejected": -44.50048065185547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3442833423614502, + "rewards/margins": 2.1359472274780273, + "rewards/rejected": -3.4802303314208984, + "step": 2243 + }, + { + "epoch": 2.12, + "grad_norm": 23.07261085510254, + "learning_rate": 1.6316894018887722e-07, + "logps/chosen": -51.61261749267578, + "logps/rejected": -77.51898193359375, + "loss": 0.2759, + "losses/dpo": 0.06007974594831467, + "losses/sft": 1.7587203979492188, + "losses/total": 0.06007974594831467, + "ref_logps/chosen": -35.39130401611328, + "ref_logps/rejected": -44.41795349121094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6221314668655396, + "rewards/margins": 1.6879708766937256, + "rewards/rejected": -3.3101024627685547, + "step": 2244 + }, + { + "epoch": 2.12, + "grad_norm": 19.950515747070312, + "learning_rate": 1.6299405386498773e-07, + "logps/chosen": -63.1268310546875, + "logps/rejected": -98.84368896484375, + "loss": 0.2335, + "losses/dpo": 0.08571727573871613, + "losses/sft": 1.9807275533676147, + "losses/total": 0.08571727573871613, + "ref_logps/chosen": -41.314613342285156, + "ref_logps/rejected": -56.26796340942383, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1812214851379395, + "rewards/margins": 2.0763511657714844, + "rewards/rejected": -4.257572650909424, + "step": 2245 + }, + { + "epoch": 2.12, + "grad_norm": 16.623170852661133, + "learning_rate": 1.6281916754109827e-07, + "logps/chosen": -44.955604553222656, + "logps/rejected": -86.53844451904297, + "loss": 0.1753, + "losses/dpo": 0.20481280982494354, + "losses/sft": 1.9196656942367554, + "losses/total": 0.20481280982494354, + "ref_logps/chosen": -29.323665618896484, + "ref_logps/rejected": -46.16213607788086, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.563193917274475, + "rewards/margins": 2.4744367599487305, + "rewards/rejected": -4.037631034851074, + "step": 2246 + }, + { + "epoch": 2.12, + "grad_norm": 26.565893173217773, + "learning_rate": 1.626442812172088e-07, + "logps/chosen": -56.64362716674805, + "logps/rejected": -81.39777374267578, + "loss": 0.3296, + "losses/dpo": 0.10279586911201477, + "losses/sft": 2.5590388774871826, + "losses/total": 0.10279586911201477, + "ref_logps/chosen": -36.619625091552734, + "ref_logps/rejected": -41.5498046875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0024003982543945, + "rewards/margins": 1.9823968410491943, + "rewards/rejected": -3.984797239303589, + "step": 2247 + }, + { + "epoch": 2.12, + "grad_norm": 27.30438232421875, + "learning_rate": 1.6246939489331935e-07, + "logps/chosen": -52.283355712890625, + "logps/rejected": -92.72384643554688, + "loss": 0.2662, + "losses/dpo": 0.31035855412483215, + "losses/sft": 1.8905383348464966, + "losses/total": 0.31035855412483215, + "ref_logps/chosen": -37.64241027832031, + "ref_logps/rejected": -55.542701721191406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.464094638824463, + "rewards/margins": 2.2540202140808105, + "rewards/rejected": -3.7181148529052734, + "step": 2248 + }, + { + "epoch": 2.12, + "grad_norm": 30.367029190063477, + "learning_rate": 1.6229450856942986e-07, + "logps/chosen": -60.2539176940918, + "logps/rejected": -81.2086181640625, + "loss": 0.2924, + "losses/dpo": 0.27630990743637085, + "losses/sft": 2.526815891265869, + "losses/total": 0.27630990743637085, + "ref_logps/chosen": -41.47901153564453, + "ref_logps/rejected": -44.62622833251953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.877490520477295, + "rewards/margins": 1.7807488441467285, + "rewards/rejected": -3.6582391262054443, + "step": 2249 + }, + { + "epoch": 2.12, + "grad_norm": 19.49413299560547, + "learning_rate": 1.621196222455404e-07, + "logps/chosen": -44.23246765136719, + "logps/rejected": -82.84221649169922, + "loss": 0.2119, + "losses/dpo": 0.2684689164161682, + "losses/sft": 2.4392786026000977, + "losses/total": 0.2684689164161682, + "ref_logps/chosen": -28.507184982299805, + "ref_logps/rejected": -42.323036193847656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5725280046463013, + "rewards/margins": 2.4793901443481445, + "rewards/rejected": -4.0519185066223145, + "step": 2250 + }, + { + "epoch": 2.13, + "grad_norm": 23.003732681274414, + "learning_rate": 1.6194473592165092e-07, + "logps/chosen": -55.03348159790039, + "logps/rejected": -86.842041015625, + "loss": 0.2321, + "losses/dpo": 0.07002858817577362, + "losses/sft": 2.1327624320983887, + "losses/total": 0.07002858817577362, + "ref_logps/chosen": -37.50593948364258, + "ref_logps/rejected": -46.40522003173828, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.752753734588623, + "rewards/margins": 2.290928363800049, + "rewards/rejected": -4.043682098388672, + "step": 2251 + }, + { + "epoch": 2.13, + "grad_norm": 17.487110137939453, + "learning_rate": 1.6176984959776143e-07, + "logps/chosen": -55.22237014770508, + "logps/rejected": -87.433837890625, + "loss": 0.1756, + "losses/dpo": 0.3281095623970032, + "losses/sft": 2.2306060791015625, + "losses/total": 0.3281095623970032, + "ref_logps/chosen": -37.505985260009766, + "ref_logps/rejected": -44.09942626953125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7716386318206787, + "rewards/margins": 2.561802864074707, + "rewards/rejected": -4.333441734313965, + "step": 2252 + }, + { + "epoch": 2.13, + "grad_norm": 22.838956832885742, + "learning_rate": 1.61594963273872e-07, + "logps/chosen": -56.50987243652344, + "logps/rejected": -90.54911804199219, + "loss": 0.2184, + "losses/dpo": 0.3154241740703583, + "losses/sft": 2.2732532024383545, + "losses/total": 0.3154241740703583, + "ref_logps/chosen": -36.47102737426758, + "ref_logps/rejected": -44.3578987121582, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.003884792327881, + "rewards/margins": 2.615237236022949, + "rewards/rejected": -4.61912202835083, + "step": 2253 + }, + { + "epoch": 2.13, + "grad_norm": 25.773296356201172, + "learning_rate": 1.614200769499825e-07, + "logps/chosen": -58.50932312011719, + "logps/rejected": -88.87077331542969, + "loss": 0.2326, + "losses/dpo": 0.19233441352844238, + "losses/sft": 1.700931429862976, + "losses/total": 0.19233441352844238, + "ref_logps/chosen": -42.74064636230469, + "ref_logps/rejected": -52.24690628051758, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5768675804138184, + "rewards/margins": 2.085519790649414, + "rewards/rejected": -3.6623873710632324, + "step": 2254 + }, + { + "epoch": 2.13, + "grad_norm": 21.362497329711914, + "learning_rate": 1.6124519062609305e-07, + "logps/chosen": -55.05992889404297, + "logps/rejected": -71.38827514648438, + "loss": 0.2201, + "losses/dpo": 0.2599198520183563, + "losses/sft": 1.693109393119812, + "losses/total": 0.2599198520183563, + "ref_logps/chosen": -38.30255889892578, + "ref_logps/rejected": -35.69434356689453, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6757371425628662, + "rewards/margins": 1.8936560153961182, + "rewards/rejected": -3.5693931579589844, + "step": 2255 + }, + { + "epoch": 2.13, + "grad_norm": 23.30718421936035, + "learning_rate": 1.6107030430220356e-07, + "logps/chosen": -65.42486572265625, + "logps/rejected": -80.0082778930664, + "loss": 0.3433, + "losses/dpo": 0.48402729630470276, + "losses/sft": 1.9216808080673218, + "losses/total": 0.48402729630470276, + "ref_logps/chosen": -44.13976287841797, + "ref_logps/rejected": -42.1544189453125, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1285104751586914, + "rewards/margins": 1.6568758487701416, + "rewards/rejected": -3.785386562347412, + "step": 2256 + }, + { + "epoch": 2.13, + "grad_norm": 42.49635314941406, + "learning_rate": 1.608954179783141e-07, + "logps/chosen": -68.43677520751953, + "logps/rejected": -88.00084686279297, + "loss": 0.5192, + "losses/dpo": 0.1933937668800354, + "losses/sft": 2.2337167263031006, + "losses/total": 0.1933937668800354, + "ref_logps/chosen": -45.55704879760742, + "ref_logps/rejected": -53.633323669433594, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.2879724502563477, + "rewards/margins": 1.1487798690795898, + "rewards/rejected": -3.4367523193359375, + "step": 2257 + }, + { + "epoch": 2.13, + "grad_norm": 25.39251708984375, + "learning_rate": 1.607205316544246e-07, + "logps/chosen": -42.743568420410156, + "logps/rejected": -71.08970642089844, + "loss": 0.3061, + "losses/dpo": 0.3308146595954895, + "losses/sft": 1.471168041229248, + "losses/total": 0.3308146595954895, + "ref_logps/chosen": -31.67058753967285, + "ref_logps/rejected": -40.524105072021484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1072981357574463, + "rewards/margins": 1.9492619037628174, + "rewards/rejected": -3.0565600395202637, + "step": 2258 + }, + { + "epoch": 2.13, + "grad_norm": 28.404516220092773, + "learning_rate": 1.6054564533053512e-07, + "logps/chosen": -56.04657745361328, + "logps/rejected": -76.16471862792969, + "loss": 0.3164, + "losses/dpo": 0.20249655842781067, + "losses/sft": 2.0203540325164795, + "losses/total": 0.20249655842781067, + "ref_logps/chosen": -40.08431625366211, + "ref_logps/rejected": -42.63153076171875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5962259769439697, + "rewards/margins": 1.7570924758911133, + "rewards/rejected": -3.353318691253662, + "step": 2259 + }, + { + "epoch": 2.13, + "grad_norm": 20.589021682739258, + "learning_rate": 1.603707590066457e-07, + "logps/chosen": -50.516075134277344, + "logps/rejected": -76.57281494140625, + "loss": 0.2487, + "losses/dpo": 0.12869106233119965, + "losses/sft": 2.1255719661712646, + "losses/total": 0.12869106233119965, + "ref_logps/chosen": -31.348102569580078, + "ref_logps/rejected": -39.09846496582031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9167975187301636, + "rewards/margins": 1.83063805103302, + "rewards/rejected": -3.7474355697631836, + "step": 2260 + }, + { + "epoch": 2.14, + "grad_norm": 22.93833351135254, + "learning_rate": 1.601958726827562e-07, + "logps/chosen": -70.66360473632812, + "logps/rejected": -97.74938201904297, + "loss": 0.2547, + "losses/dpo": 0.40152662992477417, + "losses/sft": 2.035598039627075, + "losses/total": 0.40152662992477417, + "ref_logps/chosen": -46.9935302734375, + "ref_logps/rejected": -54.04365539550781, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3670077323913574, + "rewards/margins": 2.0035650730133057, + "rewards/rejected": -4.370573043823242, + "step": 2261 + }, + { + "epoch": 2.14, + "grad_norm": 25.003437042236328, + "learning_rate": 1.6002098635886674e-07, + "logps/chosen": -52.92638397216797, + "logps/rejected": -78.58967590332031, + "loss": 0.2604, + "losses/dpo": 0.11700643599033356, + "losses/sft": 1.5392807722091675, + "losses/total": 0.11700643599033356, + "ref_logps/chosen": -40.420047760009766, + "ref_logps/rejected": -44.13471603393555, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2506334781646729, + "rewards/margins": 2.1948623657226562, + "rewards/rejected": -3.445496082305908, + "step": 2262 + }, + { + "epoch": 2.14, + "grad_norm": 28.93943214416504, + "learning_rate": 1.5984610003497725e-07, + "logps/chosen": -50.6925048828125, + "logps/rejected": -64.04227447509766, + "loss": 0.374, + "losses/dpo": 0.17544978857040405, + "losses/sft": 2.4645628929138184, + "losses/total": 0.17544978857040405, + "ref_logps/chosen": -31.019001007080078, + "ref_logps/rejected": -31.54002571105957, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9673502445220947, + "rewards/margins": 1.2828750610351562, + "rewards/rejected": -3.250225305557251, + "step": 2263 + }, + { + "epoch": 2.14, + "grad_norm": 24.567184448242188, + "learning_rate": 1.596712137110878e-07, + "logps/chosen": -71.976318359375, + "logps/rejected": -95.36058044433594, + "loss": 0.2598, + "losses/dpo": 0.327330619096756, + "losses/sft": 1.4970035552978516, + "losses/total": 0.327330619096756, + "ref_logps/chosen": -53.477752685546875, + "ref_logps/rejected": -55.2637939453125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8498564958572388, + "rewards/margins": 2.1598219871520996, + "rewards/rejected": -4.009678840637207, + "step": 2264 + }, + { + "epoch": 2.14, + "grad_norm": 21.226322174072266, + "learning_rate": 1.594963273871983e-07, + "logps/chosen": -40.626991271972656, + "logps/rejected": -77.2824935913086, + "loss": 0.2938, + "losses/dpo": 0.13791534304618835, + "losses/sft": 1.3078718185424805, + "losses/total": 0.13791534304618835, + "ref_logps/chosen": -24.102535247802734, + "ref_logps/rejected": -38.710750579833984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6524457931518555, + "rewards/margins": 2.204728126525879, + "rewards/rejected": -3.8571739196777344, + "step": 2265 + }, + { + "epoch": 2.14, + "grad_norm": 23.8439884185791, + "learning_rate": 1.5932144106330884e-07, + "logps/chosen": -53.080467224121094, + "logps/rejected": -75.99609375, + "loss": 0.2822, + "losses/dpo": 0.33289408683776855, + "losses/sft": 2.283602237701416, + "losses/total": 0.33289408683776855, + "ref_logps/chosen": -33.83545684814453, + "ref_logps/rejected": -36.57042694091797, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9245011806488037, + "rewards/margins": 2.0180652141571045, + "rewards/rejected": -3.94256591796875, + "step": 2266 + }, + { + "epoch": 2.14, + "grad_norm": 26.943132400512695, + "learning_rate": 1.5914655473941938e-07, + "logps/chosen": -62.90143966674805, + "logps/rejected": -75.03677368164062, + "loss": 0.3197, + "losses/dpo": 0.2554304599761963, + "losses/sft": 1.8869599103927612, + "losses/total": 0.2554304599761963, + "ref_logps/chosen": -47.20684051513672, + "ref_logps/rejected": -44.09348678588867, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5694596767425537, + "rewards/margins": 1.5248687267303467, + "rewards/rejected": -3.0943284034729004, + "step": 2267 + }, + { + "epoch": 2.14, + "grad_norm": 18.618547439575195, + "learning_rate": 1.589716684155299e-07, + "logps/chosen": -61.20868682861328, + "logps/rejected": -79.85617065429688, + "loss": 0.1833, + "losses/dpo": 0.13341785967350006, + "losses/sft": 2.1886203289031982, + "losses/total": 0.13341785967350006, + "ref_logps/chosen": -40.288238525390625, + "ref_logps/rejected": -38.06578063964844, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0920448303222656, + "rewards/margins": 2.0869946479797363, + "rewards/rejected": -4.17903995513916, + "step": 2268 + }, + { + "epoch": 2.14, + "grad_norm": 22.201250076293945, + "learning_rate": 1.5879678209164043e-07, + "logps/chosen": -55.31974411010742, + "logps/rejected": -76.48590087890625, + "loss": 0.2601, + "losses/dpo": 0.23466461896896362, + "losses/sft": 1.6558467149734497, + "losses/total": 0.23466461896896362, + "ref_logps/chosen": -39.18632507324219, + "ref_logps/rejected": -42.365272521972656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.613342046737671, + "rewards/margins": 1.7987215518951416, + "rewards/rejected": -3.4120635986328125, + "step": 2269 + }, + { + "epoch": 2.14, + "grad_norm": 15.449275970458984, + "learning_rate": 1.5862189576775095e-07, + "logps/chosen": -44.29328536987305, + "logps/rejected": -90.58779907226562, + "loss": 0.1799, + "losses/dpo": 0.27283793687820435, + "losses/sft": 2.3422796726226807, + "losses/total": 0.27283793687820435, + "ref_logps/chosen": -30.93407440185547, + "ref_logps/rejected": -53.70518493652344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.335921049118042, + "rewards/margins": 2.3523406982421875, + "rewards/rejected": -3.6882619857788086, + "step": 2270 + }, + { + "epoch": 2.14, + "grad_norm": 17.470685958862305, + "learning_rate": 1.5844700944386149e-07, + "logps/chosen": -50.318607330322266, + "logps/rejected": -78.00550079345703, + "loss": 0.2057, + "losses/dpo": 0.35125651955604553, + "losses/sft": 1.649762749671936, + "losses/total": 0.35125651955604553, + "ref_logps/chosen": -34.96794128417969, + "ref_logps/rejected": -42.396392822265625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.535066843032837, + "rewards/margins": 2.025843858718872, + "rewards/rejected": -3.560910701751709, + "step": 2271 + }, + { + "epoch": 2.15, + "grad_norm": 26.762725830078125, + "learning_rate": 1.5827212311997203e-07, + "logps/chosen": -58.390323638916016, + "logps/rejected": -72.4156265258789, + "loss": 0.3026, + "losses/dpo": 0.1839708685874939, + "losses/sft": 1.2107266187667847, + "losses/total": 0.1839708685874939, + "ref_logps/chosen": -40.38214874267578, + "ref_logps/rejected": -38.73516845703125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8008177280426025, + "rewards/margins": 1.567227840423584, + "rewards/rejected": -3.3680455684661865, + "step": 2272 + }, + { + "epoch": 2.15, + "grad_norm": 34.8433952331543, + "learning_rate": 1.5809723679608254e-07, + "logps/chosen": -57.27212142944336, + "logps/rejected": -68.38531494140625, + "loss": 0.4535, + "losses/dpo": 0.6827790141105652, + "losses/sft": 2.280339241027832, + "losses/total": 0.6827790141105652, + "ref_logps/chosen": -35.589073181152344, + "ref_logps/rejected": -33.419410705566406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.168304443359375, + "rewards/margins": 1.3282856941223145, + "rewards/rejected": -3.4965901374816895, + "step": 2273 + }, + { + "epoch": 2.15, + "grad_norm": 19.23264503479004, + "learning_rate": 1.5792235047219308e-07, + "logps/chosen": -50.28895568847656, + "logps/rejected": -67.99697875976562, + "loss": 0.2277, + "losses/dpo": 0.14969190955162048, + "losses/sft": 2.3513457775115967, + "losses/total": 0.14969190955162048, + "ref_logps/chosen": -33.68083953857422, + "ref_logps/rejected": -35.101356506347656, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6608121395111084, + "rewards/margins": 1.6287498474121094, + "rewards/rejected": -3.2895619869232178, + "step": 2274 + }, + { + "epoch": 2.15, + "grad_norm": 18.94283676147461, + "learning_rate": 1.577474641483036e-07, + "logps/chosen": -63.68626403808594, + "logps/rejected": -87.64292907714844, + "loss": 0.2229, + "losses/dpo": 0.16320599615573883, + "losses/sft": 2.310617685317993, + "losses/total": 0.16320599615573883, + "ref_logps/chosen": -46.33479309082031, + "ref_logps/rejected": -48.5651741027832, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.73514723777771, + "rewards/margins": 2.172628402709961, + "rewards/rejected": -3.907775640487671, + "step": 2275 + }, + { + "epoch": 2.15, + "grad_norm": 20.575897216796875, + "learning_rate": 1.5757257782441413e-07, + "logps/chosen": -38.73346710205078, + "logps/rejected": -79.06417846679688, + "loss": 0.2254, + "losses/dpo": 0.18299530446529388, + "losses/sft": 1.768555760383606, + "losses/total": 0.18299530446529388, + "ref_logps/chosen": -25.789077758789062, + "ref_logps/rejected": -43.497650146484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2944390773773193, + "rewards/margins": 2.2622132301330566, + "rewards/rejected": -3.556652069091797, + "step": 2276 + }, + { + "epoch": 2.15, + "grad_norm": 31.062278747558594, + "learning_rate": 1.5739769150052464e-07, + "logps/chosen": -47.12458801269531, + "logps/rejected": -69.7198486328125, + "loss": 0.44, + "losses/dpo": 0.2919631004333496, + "losses/sft": 1.9819501638412476, + "losses/total": 0.2919631004333496, + "ref_logps/chosen": -27.849842071533203, + "ref_logps/rejected": -35.79384231567383, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9274744987487793, + "rewards/margins": 1.4651265144348145, + "rewards/rejected": -3.392601251602173, + "step": 2277 + }, + { + "epoch": 2.15, + "grad_norm": 29.310911178588867, + "learning_rate": 1.5722280517663518e-07, + "logps/chosen": -45.972023010253906, + "logps/rejected": -67.21701049804688, + "loss": 0.3201, + "losses/dpo": 0.2422136515378952, + "losses/sft": 1.3804807662963867, + "losses/total": 0.2422136515378952, + "ref_logps/chosen": -30.895526885986328, + "ref_logps/rejected": -33.24668884277344, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.5076490640640259, + "rewards/margins": 1.8893829584121704, + "rewards/rejected": -3.3970322608947754, + "step": 2278 + }, + { + "epoch": 2.15, + "grad_norm": 21.477489471435547, + "learning_rate": 1.5704791885274572e-07, + "logps/chosen": -58.60751724243164, + "logps/rejected": -84.25736999511719, + "loss": 0.2386, + "losses/dpo": 0.3093772530555725, + "losses/sft": 1.692646861076355, + "losses/total": 0.3093772530555725, + "ref_logps/chosen": -38.81452560424805, + "ref_logps/rejected": -44.74578094482422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.979299545288086, + "rewards/margins": 1.971860408782959, + "rewards/rejected": -3.951159715652466, + "step": 2279 + }, + { + "epoch": 2.15, + "grad_norm": 15.371859550476074, + "learning_rate": 1.5687303252885623e-07, + "logps/chosen": -43.326560974121094, + "logps/rejected": -79.91537475585938, + "loss": 0.1764, + "losses/dpo": 0.07203559577465057, + "losses/sft": 1.0480899810791016, + "losses/total": 0.07203559577465057, + "ref_logps/chosen": -27.940738677978516, + "ref_logps/rejected": -42.891754150390625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5385822057724, + "rewards/margins": 2.1637799739837646, + "rewards/rejected": -3.702362060546875, + "step": 2280 + }, + { + "epoch": 2.15, + "grad_norm": 21.000402450561523, + "learning_rate": 1.5669814620496677e-07, + "logps/chosen": -66.86967468261719, + "logps/rejected": -110.08772277832031, + "loss": 0.2189, + "losses/dpo": 0.3041834831237793, + "losses/sft": 2.226001024246216, + "losses/total": 0.3041834831237793, + "ref_logps/chosen": -46.83509063720703, + "ref_logps/rejected": -66.58212280273438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0034589767456055, + "rewards/margins": 2.347100257873535, + "rewards/rejected": -4.350558757781982, + "step": 2281 + }, + { + "epoch": 2.15, + "grad_norm": 26.565950393676758, + "learning_rate": 1.5652325988107728e-07, + "logps/chosen": -61.623512268066406, + "logps/rejected": -82.12686157226562, + "loss": 0.2483, + "losses/dpo": 0.4800388216972351, + "losses/sft": 2.0719287395477295, + "losses/total": 0.4800388216972351, + "ref_logps/chosen": -42.07328414916992, + "ref_logps/rejected": -42.147159576416016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9550228118896484, + "rewards/margins": 2.042947292327881, + "rewards/rejected": -3.9979701042175293, + "step": 2282 + }, + { + "epoch": 2.16, + "grad_norm": 14.342329025268555, + "learning_rate": 1.5634837355718782e-07, + "logps/chosen": -52.11302947998047, + "logps/rejected": -91.74695587158203, + "loss": 0.1433, + "losses/dpo": 0.2717965245246887, + "losses/sft": 2.016220808029175, + "losses/total": 0.2717965245246887, + "ref_logps/chosen": -35.9696044921875, + "ref_logps/rejected": -49.544063568115234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6143429279327393, + "rewards/margins": 2.6059460639953613, + "rewards/rejected": -4.2202887535095215, + "step": 2283 + }, + { + "epoch": 2.16, + "grad_norm": 11.542847633361816, + "learning_rate": 1.5617348723329834e-07, + "logps/chosen": -69.41544342041016, + "logps/rejected": -99.43051147460938, + "loss": 0.1041, + "losses/dpo": 0.06336472183465958, + "losses/sft": 2.059053897857666, + "losses/total": 0.06336472183465958, + "ref_logps/chosen": -46.148780822753906, + "ref_logps/rejected": -48.774330139160156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3266663551330566, + "rewards/margins": 2.7389512062072754, + "rewards/rejected": -5.065617561340332, + "step": 2284 + }, + { + "epoch": 2.16, + "grad_norm": 19.886808395385742, + "learning_rate": 1.559986009094089e-07, + "logps/chosen": -47.85782241821289, + "logps/rejected": -90.40819549560547, + "loss": 0.1926, + "losses/dpo": 0.3982144594192505, + "losses/sft": 2.302834987640381, + "losses/total": 0.3982144594192505, + "ref_logps/chosen": -28.37134552001953, + "ref_logps/rejected": -47.10148239135742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9486477375030518, + "rewards/margins": 2.382023572921753, + "rewards/rejected": -4.330671310424805, + "step": 2285 + }, + { + "epoch": 2.16, + "grad_norm": 12.676795959472656, + "learning_rate": 1.5582371458551941e-07, + "logps/chosen": -56.64424514770508, + "logps/rejected": -94.72074890136719, + "loss": 0.1119, + "losses/dpo": 0.1810685694217682, + "losses/sft": 1.5159043073654175, + "losses/total": 0.1810685694217682, + "ref_logps/chosen": -40.41544723510742, + "ref_logps/rejected": -49.2695198059082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6228796243667603, + "rewards/margins": 2.9222428798675537, + "rewards/rejected": -4.5451226234436035, + "step": 2286 + }, + { + "epoch": 2.16, + "grad_norm": 28.70562744140625, + "learning_rate": 1.5564882826162993e-07, + "logps/chosen": -54.82651901245117, + "logps/rejected": -84.83499145507812, + "loss": 0.3099, + "losses/dpo": 0.09100537747144699, + "losses/sft": 1.8288042545318604, + "losses/total": 0.09100537747144699, + "ref_logps/chosen": -33.36064910888672, + "ref_logps/rejected": -43.051212310791016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1465868949890137, + "rewards/margins": 2.0317912101745605, + "rewards/rejected": -4.178378105163574, + "step": 2287 + }, + { + "epoch": 2.16, + "grad_norm": 26.49138832092285, + "learning_rate": 1.5547394193774047e-07, + "logps/chosen": -65.35050964355469, + "logps/rejected": -86.64130401611328, + "loss": 0.2187, + "losses/dpo": 0.055660516023635864, + "losses/sft": 2.095733165740967, + "losses/total": 0.055660516023635864, + "ref_logps/chosen": -45.377235412597656, + "ref_logps/rejected": -45.432273864746094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9973276853561401, + "rewards/margins": 2.123575448989868, + "rewards/rejected": -4.120903015136719, + "step": 2288 + }, + { + "epoch": 2.16, + "grad_norm": 22.42824935913086, + "learning_rate": 1.5529905561385098e-07, + "logps/chosen": -66.8343276977539, + "logps/rejected": -81.79641723632812, + "loss": 0.2325, + "losses/dpo": 0.15890973806381226, + "losses/sft": 1.6447639465332031, + "losses/total": 0.15890973806381226, + "ref_logps/chosen": -46.97641372680664, + "ref_logps/rejected": -41.37504577636719, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.985791563987732, + "rewards/margins": 2.0563454627990723, + "rewards/rejected": -4.0421366691589355, + "step": 2289 + }, + { + "epoch": 2.16, + "grad_norm": 20.946821212768555, + "learning_rate": 1.5512416928996152e-07, + "logps/chosen": -55.57846450805664, + "logps/rejected": -80.79353332519531, + "loss": 0.2134, + "losses/dpo": 0.09192565083503723, + "losses/sft": 1.7822967767715454, + "losses/total": 0.09192565083503723, + "ref_logps/chosen": -34.669883728027344, + "ref_logps/rejected": -38.41657257080078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0908584594726562, + "rewards/margins": 2.1468381881713867, + "rewards/rejected": -4.237696647644043, + "step": 2290 + }, + { + "epoch": 2.16, + "grad_norm": 18.632497787475586, + "learning_rate": 1.5494928296607203e-07, + "logps/chosen": -55.583988189697266, + "logps/rejected": -84.01304626464844, + "loss": 0.1876, + "losses/dpo": 0.1862010955810547, + "losses/sft": 1.525383710861206, + "losses/total": 0.1862010955810547, + "ref_logps/chosen": -36.119598388671875, + "ref_logps/rejected": -43.54994201660156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9464391469955444, + "rewards/margins": 2.0998711585998535, + "rewards/rejected": -4.0463104248046875, + "step": 2291 + }, + { + "epoch": 2.16, + "grad_norm": 34.278316497802734, + "learning_rate": 1.547743966421826e-07, + "logps/chosen": -69.1971435546875, + "logps/rejected": -104.88549041748047, + "loss": 0.2795, + "losses/dpo": 0.1957712173461914, + "losses/sft": 1.8775060176849365, + "losses/total": 0.1957712173461914, + "ref_logps/chosen": -42.132774353027344, + "ref_logps/rejected": -55.01546096801758, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7064366340637207, + "rewards/margins": 2.280566453933716, + "rewards/rejected": -4.987003326416016, + "step": 2292 + }, + { + "epoch": 2.17, + "grad_norm": 25.116737365722656, + "learning_rate": 1.545995103182931e-07, + "logps/chosen": -54.52226257324219, + "logps/rejected": -87.70243835449219, + "loss": 0.2278, + "losses/dpo": 0.30233073234558105, + "losses/sft": 2.790884017944336, + "losses/total": 0.30233073234558105, + "ref_logps/chosen": -33.725563049316406, + "ref_logps/rejected": -46.07609558105469, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.079669952392578, + "rewards/margins": 2.0829648971557617, + "rewards/rejected": -4.16263484954834, + "step": 2293 + }, + { + "epoch": 2.17, + "grad_norm": 15.063234329223633, + "learning_rate": 1.5442462399440362e-07, + "logps/chosen": -57.47674560546875, + "logps/rejected": -90.12452697753906, + "loss": 0.1634, + "losses/dpo": 0.2507159113883972, + "losses/sft": 2.0140225887298584, + "losses/total": 0.2507159113883972, + "ref_logps/chosen": -35.16191101074219, + "ref_logps/rejected": -45.17369842529297, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2314834594726562, + "rewards/margins": 2.263598918914795, + "rewards/rejected": -4.495082855224609, + "step": 2294 + }, + { + "epoch": 2.17, + "grad_norm": 13.159616470336914, + "learning_rate": 1.5424973767051416e-07, + "logps/chosen": -54.27346420288086, + "logps/rejected": -81.01445007324219, + "loss": 0.1805, + "losses/dpo": 0.2564374506473541, + "losses/sft": 1.9482932090759277, + "losses/total": 0.2564374506473541, + "ref_logps/chosen": -36.50727081298828, + "ref_logps/rejected": -41.780303955078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7766196727752686, + "rewards/margins": 2.1467955112457275, + "rewards/rejected": -3.923415184020996, + "step": 2295 + }, + { + "epoch": 2.17, + "grad_norm": 12.745832443237305, + "learning_rate": 1.5407485134662467e-07, + "logps/chosen": -68.29960632324219, + "logps/rejected": -98.3267822265625, + "loss": 0.1033, + "losses/dpo": 0.14031359553337097, + "losses/sft": 2.2993533611297607, + "losses/total": 0.14031359553337097, + "ref_logps/chosen": -49.136695861816406, + "ref_logps/rejected": -52.492549896240234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9162909984588623, + "rewards/margins": 2.6671321392059326, + "rewards/rejected": -4.583423137664795, + "step": 2296 + }, + { + "epoch": 2.17, + "grad_norm": 31.119064331054688, + "learning_rate": 1.538999650227352e-07, + "logps/chosen": -56.37617492675781, + "logps/rejected": -82.12806701660156, + "loss": 0.3443, + "losses/dpo": 0.18578927218914032, + "losses/sft": 2.2522242069244385, + "losses/total": 0.18578927218914032, + "ref_logps/chosen": -35.081905364990234, + "ref_logps/rejected": -43.395755767822266, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1294267177581787, + "rewards/margins": 1.7438042163848877, + "rewards/rejected": -3.8732309341430664, + "step": 2297 + }, + { + "epoch": 2.17, + "grad_norm": 25.81895637512207, + "learning_rate": 1.5372507869884575e-07, + "logps/chosen": -50.75843048095703, + "logps/rejected": -70.24896240234375, + "loss": 0.2726, + "losses/dpo": 0.7891396284103394, + "losses/sft": 2.258631944656372, + "losses/total": 0.7891396284103394, + "ref_logps/chosen": -38.64788055419922, + "ref_logps/rejected": -35.67439270019531, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.211055040359497, + "rewards/margins": 2.2464025020599365, + "rewards/rejected": -3.4574575424194336, + "step": 2298 + }, + { + "epoch": 2.17, + "grad_norm": 21.256067276000977, + "learning_rate": 1.535501923749563e-07, + "logps/chosen": -62.33543395996094, + "logps/rejected": -79.2490463256836, + "loss": 0.2035, + "losses/dpo": 0.13268226385116577, + "losses/sft": 1.95466148853302, + "losses/total": 0.13268226385116577, + "ref_logps/chosen": -42.913124084472656, + "ref_logps/rejected": -37.233734130859375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9422309398651123, + "rewards/margins": 2.25930118560791, + "rewards/rejected": -4.201531410217285, + "step": 2299 + }, + { + "epoch": 2.17, + "grad_norm": 25.383174896240234, + "learning_rate": 1.533753060510668e-07, + "logps/chosen": -52.142601013183594, + "logps/rejected": -67.98534393310547, + "loss": 0.3036, + "losses/dpo": 0.09045708924531937, + "losses/sft": 2.255509853363037, + "losses/total": 0.09045708924531937, + "ref_logps/chosen": -32.18153381347656, + "ref_logps/rejected": -32.76335906982422, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9961068630218506, + "rewards/margins": 1.5260915756225586, + "rewards/rejected": -3.522198438644409, + "step": 2300 + }, + { + "epoch": 2.17, + "grad_norm": 20.05816650390625, + "learning_rate": 1.5320041972717732e-07, + "logps/chosen": -67.32510375976562, + "logps/rejected": -99.2700424194336, + "loss": 0.1995, + "losses/dpo": 0.29701942205429077, + "losses/sft": 2.7833328247070312, + "losses/total": 0.29701942205429077, + "ref_logps/chosen": -44.362266540527344, + "ref_logps/rejected": -53.67930603027344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2962844371795654, + "rewards/margins": 2.262789249420166, + "rewards/rejected": -4.5590739250183105, + "step": 2301 + }, + { + "epoch": 2.17, + "grad_norm": 23.345857620239258, + "learning_rate": 1.5302553340328785e-07, + "logps/chosen": -65.83466339111328, + "logps/rejected": -79.39578247070312, + "loss": 0.2934, + "losses/dpo": 0.2013949155807495, + "losses/sft": 2.1690316200256348, + "losses/total": 0.2013949155807495, + "ref_logps/chosen": -45.309478759765625, + "ref_logps/rejected": -41.431556701660156, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0525190830230713, + "rewards/margins": 1.743903636932373, + "rewards/rejected": -3.7964227199554443, + "step": 2302 + }, + { + "epoch": 2.17, + "grad_norm": 29.08433723449707, + "learning_rate": 1.5285064707939837e-07, + "logps/chosen": -53.37352752685547, + "logps/rejected": -73.21650695800781, + "loss": 0.3333, + "losses/dpo": 0.2476210743188858, + "losses/sft": 1.8561046123504639, + "losses/total": 0.2476210743188858, + "ref_logps/chosen": -33.61847686767578, + "ref_logps/rejected": -36.27830505371094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9755051136016846, + "rewards/margins": 1.718315839767456, + "rewards/rejected": -3.6938209533691406, + "step": 2303 + }, + { + "epoch": 2.18, + "grad_norm": 22.244579315185547, + "learning_rate": 1.5267576075550893e-07, + "logps/chosen": -56.64630889892578, + "logps/rejected": -78.9195556640625, + "loss": 0.2829, + "losses/dpo": 0.75873863697052, + "losses/sft": 2.722460985183716, + "losses/total": 0.75873863697052, + "ref_logps/chosen": -39.257652282714844, + "ref_logps/rejected": -40.22932434082031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.738865613937378, + "rewards/margins": 2.130156993865967, + "rewards/rejected": -3.8690226078033447, + "step": 2304 + }, + { + "epoch": 2.18, + "grad_norm": 16.203853607177734, + "learning_rate": 1.5250087443161945e-07, + "logps/chosen": -55.15812301635742, + "logps/rejected": -80.3203125, + "loss": 0.1578, + "losses/dpo": 0.15829437971115112, + "losses/sft": 2.101869821548462, + "losses/total": 0.15829437971115112, + "ref_logps/chosen": -40.08538818359375, + "ref_logps/rejected": -42.42623519897461, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5072734355926514, + "rewards/margins": 2.2821338176727295, + "rewards/rejected": -3.78940749168396, + "step": 2305 + }, + { + "epoch": 2.18, + "grad_norm": 23.39341163635254, + "learning_rate": 1.5232598810772998e-07, + "logps/chosen": -57.079349517822266, + "logps/rejected": -79.1810531616211, + "loss": 0.2492, + "losses/dpo": 0.20406943559646606, + "losses/sft": 1.9828931093215942, + "losses/total": 0.20406943559646606, + "ref_logps/chosen": -35.075340270996094, + "ref_logps/rejected": -40.44786071777344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2004010677337646, + "rewards/margins": 1.6729178428649902, + "rewards/rejected": -3.873319149017334, + "step": 2306 + }, + { + "epoch": 2.18, + "grad_norm": 27.756380081176758, + "learning_rate": 1.521511017838405e-07, + "logps/chosen": -56.9161376953125, + "logps/rejected": -84.33606719970703, + "loss": 0.2193, + "losses/dpo": 0.2271655797958374, + "losses/sft": 1.7130274772644043, + "losses/total": 0.2271655797958374, + "ref_logps/chosen": -37.48028564453125, + "ref_logps/rejected": -43.27394485473633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9435847997665405, + "rewards/margins": 2.162627696990967, + "rewards/rejected": -4.106212615966797, + "step": 2307 + }, + { + "epoch": 2.18, + "grad_norm": 21.934593200683594, + "learning_rate": 1.51976215459951e-07, + "logps/chosen": -49.73581314086914, + "logps/rejected": -78.15467834472656, + "loss": 0.1955, + "losses/dpo": 0.25588667392730713, + "losses/sft": 2.7339768409729004, + "losses/total": 0.25588667392730713, + "ref_logps/chosen": -32.321189880371094, + "ref_logps/rejected": -37.73955535888672, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7414628267288208, + "rewards/margins": 2.3000504970550537, + "rewards/rejected": -4.041513442993164, + "step": 2308 + }, + { + "epoch": 2.18, + "grad_norm": 29.851484298706055, + "learning_rate": 1.5180132913606155e-07, + "logps/chosen": -64.11297607421875, + "logps/rejected": -87.09363555908203, + "loss": 0.2467, + "losses/dpo": 0.11850249767303467, + "losses/sft": 2.186098575592041, + "losses/total": 0.11850249767303467, + "ref_logps/chosen": -41.07861328125, + "ref_logps/rejected": -43.50823974609375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3034369945526123, + "rewards/margins": 2.055102825164795, + "rewards/rejected": -4.358539581298828, + "step": 2309 + }, + { + "epoch": 2.18, + "grad_norm": 40.93378448486328, + "learning_rate": 1.5162644281217206e-07, + "logps/chosen": -49.8791618347168, + "logps/rejected": -77.52201843261719, + "loss": 0.4609, + "losses/dpo": 1.0263172388076782, + "losses/sft": 2.1066715717315674, + "losses/total": 1.0263172388076782, + "ref_logps/chosen": -27.901844024658203, + "ref_logps/rejected": -38.298492431640625, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1977317333221436, + "rewards/margins": 1.724621295928955, + "rewards/rejected": -3.9223532676696777, + "step": 2310 + }, + { + "epoch": 2.18, + "grad_norm": 18.958892822265625, + "learning_rate": 1.5145155648828263e-07, + "logps/chosen": -48.95719909667969, + "logps/rejected": -90.12373352050781, + "loss": 0.1953, + "losses/dpo": 0.17861512303352356, + "losses/sft": 1.2062950134277344, + "losses/total": 0.17861512303352356, + "ref_logps/chosen": -34.45989990234375, + "ref_logps/rejected": -47.54639434814453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4497298002243042, + "rewards/margins": 2.8080039024353027, + "rewards/rejected": -4.257733345031738, + "step": 2311 + }, + { + "epoch": 2.18, + "grad_norm": 30.231172561645508, + "learning_rate": 1.5127667016439314e-07, + "logps/chosen": -73.6708755493164, + "logps/rejected": -94.07293701171875, + "loss": 0.3743, + "losses/dpo": 0.8147287964820862, + "losses/sft": 2.1452088356018066, + "losses/total": 0.8147287964820862, + "ref_logps/chosen": -49.843849182128906, + "ref_logps/rejected": -49.35814666748047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.382702350616455, + "rewards/margins": 2.088776111602783, + "rewards/rejected": -4.471478462219238, + "step": 2312 + }, + { + "epoch": 2.18, + "grad_norm": 18.43680763244629, + "learning_rate": 1.5110178384050368e-07, + "logps/chosen": -58.45404052734375, + "logps/rejected": -72.94013977050781, + "loss": 0.2686, + "losses/dpo": 0.482033908367157, + "losses/sft": 2.4288012981414795, + "losses/total": 0.482033908367157, + "ref_logps/chosen": -38.907203674316406, + "ref_logps/rejected": -37.18342590332031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.954683780670166, + "rewards/margins": 1.6209882497787476, + "rewards/rejected": -3.575672149658203, + "step": 2313 + }, + { + "epoch": 2.19, + "grad_norm": 16.642179489135742, + "learning_rate": 1.509268975166142e-07, + "logps/chosen": -55.08019256591797, + "logps/rejected": -93.86299896240234, + "loss": 0.1586, + "losses/dpo": 0.08929822593927383, + "losses/sft": 2.444059371948242, + "losses/total": 0.08929822593927383, + "ref_logps/chosen": -36.147682189941406, + "ref_logps/rejected": -48.686256408691406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8932510614395142, + "rewards/margins": 2.6244235038757324, + "rewards/rejected": -4.517674922943115, + "step": 2314 + }, + { + "epoch": 2.19, + "grad_norm": 23.275371551513672, + "learning_rate": 1.507520111927247e-07, + "logps/chosen": -66.2259750366211, + "logps/rejected": -101.51622772216797, + "loss": 0.187, + "losses/dpo": 0.17488284409046173, + "losses/sft": 1.972433090209961, + "losses/total": 0.17488284409046173, + "ref_logps/chosen": -41.19746398925781, + "ref_logps/rejected": -51.9842529296875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5028512477874756, + "rewards/margins": 2.4503462314605713, + "rewards/rejected": -4.953197479248047, + "step": 2315 + }, + { + "epoch": 2.19, + "grad_norm": 22.9316349029541, + "learning_rate": 1.5057712486883524e-07, + "logps/chosen": -56.81782531738281, + "logps/rejected": -85.59149169921875, + "loss": 0.2141, + "losses/dpo": 0.27787432074546814, + "losses/sft": 2.329253673553467, + "losses/total": 0.27787432074546814, + "ref_logps/chosen": -39.2373161315918, + "ref_logps/rejected": -47.52972412109375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7580509185791016, + "rewards/margins": 2.048125743865967, + "rewards/rejected": -3.8061766624450684, + "step": 2316 + }, + { + "epoch": 2.19, + "grad_norm": 22.330493927001953, + "learning_rate": 1.5040223854494578e-07, + "logps/chosen": -58.29429244995117, + "logps/rejected": -87.17301177978516, + "loss": 0.199, + "losses/dpo": 0.48983046412467957, + "losses/sft": 2.282857656478882, + "losses/total": 0.48983046412467957, + "ref_logps/chosen": -38.17634201049805, + "ref_logps/rejected": -46.41453552246094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0117950439453125, + "rewards/margins": 2.0640530586242676, + "rewards/rejected": -4.075847625732422, + "step": 2317 + }, + { + "epoch": 2.19, + "grad_norm": 16.657691955566406, + "learning_rate": 1.5022735222105632e-07, + "logps/chosen": -57.7027587890625, + "logps/rejected": -90.79986572265625, + "loss": 0.1598, + "losses/dpo": 0.19808822870254517, + "losses/sft": 1.4461119174957275, + "losses/total": 0.19808822870254517, + "ref_logps/chosen": -39.55608367919922, + "ref_logps/rejected": -47.212581634521484, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8146674633026123, + "rewards/margins": 2.544060468673706, + "rewards/rejected": -4.358727931976318, + "step": 2318 + }, + { + "epoch": 2.19, + "grad_norm": 19.468774795532227, + "learning_rate": 1.5005246589716683e-07, + "logps/chosen": -61.18832015991211, + "logps/rejected": -84.00276947021484, + "loss": 0.2029, + "losses/dpo": 0.11266574263572693, + "losses/sft": 1.8466558456420898, + "losses/total": 0.11266574263572693, + "ref_logps/chosen": -40.79590606689453, + "ref_logps/rejected": -38.97797393798828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0392415523529053, + "rewards/margins": 2.463238477706909, + "rewards/rejected": -4.5024800300598145, + "step": 2319 + }, + { + "epoch": 2.19, + "grad_norm": 23.325109481811523, + "learning_rate": 1.4987757957327737e-07, + "logps/chosen": -62.5383415222168, + "logps/rejected": -99.259033203125, + "loss": 0.1531, + "losses/dpo": 0.12385757267475128, + "losses/sft": 2.454174518585205, + "losses/total": 0.12385757267475128, + "ref_logps/chosen": -38.148841857910156, + "ref_logps/rejected": -49.837745666503906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4389500617980957, + "rewards/margins": 2.503178596496582, + "rewards/rejected": -4.942128658294678, + "step": 2320 + }, + { + "epoch": 2.19, + "grad_norm": 22.025278091430664, + "learning_rate": 1.4970269324938789e-07, + "logps/chosen": -53.976966857910156, + "logps/rejected": -79.06675720214844, + "loss": 0.1789, + "losses/dpo": 0.37636545300483704, + "losses/sft": 2.159593343734741, + "losses/total": 0.37636545300483704, + "ref_logps/chosen": -35.98680114746094, + "ref_logps/rejected": -38.93898010253906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7990168333053589, + "rewards/margins": 2.2137606143951416, + "rewards/rejected": -4.012777328491211, + "step": 2321 + }, + { + "epoch": 2.19, + "grad_norm": 27.695343017578125, + "learning_rate": 1.495278069254984e-07, + "logps/chosen": -58.63128662109375, + "logps/rejected": -84.05570220947266, + "loss": 0.2667, + "losses/dpo": 0.187092125415802, + "losses/sft": 2.039384603500366, + "losses/total": 0.187092125415802, + "ref_logps/chosen": -38.57199478149414, + "ref_logps/rejected": -41.511451721191406, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0059292316436768, + "rewards/margins": 2.2484960556030273, + "rewards/rejected": -4.254425525665283, + "step": 2322 + }, + { + "epoch": 2.19, + "grad_norm": 33.03314208984375, + "learning_rate": 1.4935292060160894e-07, + "logps/chosen": -52.55841064453125, + "logps/rejected": -89.72498321533203, + "loss": 0.3684, + "losses/dpo": 0.05208424851298332, + "losses/sft": 1.2749131917953491, + "losses/total": 0.05208424851298332, + "ref_logps/chosen": -39.29067611694336, + "ref_logps/rejected": -52.81770324707031, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3267734050750732, + "rewards/margins": 2.363955020904541, + "rewards/rejected": -3.690728187561035, + "step": 2323 + }, + { + "epoch": 2.19, + "grad_norm": 36.880924224853516, + "learning_rate": 1.4917803427771948e-07, + "logps/chosen": -65.55006408691406, + "logps/rejected": -76.8304672241211, + "loss": 0.4278, + "losses/dpo": 0.6489154696464539, + "losses/sft": 2.4154653549194336, + "losses/total": 0.6489154696464539, + "ref_logps/chosen": -44.63751220703125, + "ref_logps/rejected": -39.63740921020508, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0912556648254395, + "rewards/margins": 1.628050684928894, + "rewards/rejected": -3.719306230545044, + "step": 2324 + }, + { + "epoch": 2.2, + "grad_norm": 26.76791000366211, + "learning_rate": 1.4900314795383002e-07, + "logps/chosen": -43.804527282714844, + "logps/rejected": -57.542274475097656, + "loss": 0.4118, + "losses/dpo": 0.37537091970443726, + "losses/sft": 1.997344732284546, + "losses/total": 0.37537091970443726, + "ref_logps/chosen": -29.272216796875, + "ref_logps/rejected": -33.132381439208984, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4532310962677002, + "rewards/margins": 0.9877585172653198, + "rewards/rejected": -2.4409894943237305, + "step": 2325 + }, + { + "epoch": 2.2, + "grad_norm": 27.466733932495117, + "learning_rate": 1.4882826162994053e-07, + "logps/chosen": -44.07056427001953, + "logps/rejected": -63.016815185546875, + "loss": 0.3469, + "losses/dpo": 0.39591869711875916, + "losses/sft": 1.7456421852111816, + "losses/total": 0.39591869711875916, + "ref_logps/chosen": -26.791919708251953, + "ref_logps/rejected": -32.60139465332031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7278647422790527, + "rewards/margins": 1.3136770725250244, + "rewards/rejected": -3.041541576385498, + "step": 2326 + }, + { + "epoch": 2.2, + "grad_norm": 22.862457275390625, + "learning_rate": 1.4865337530605107e-07, + "logps/chosen": -66.59981536865234, + "logps/rejected": -80.0736083984375, + "loss": 0.2588, + "losses/dpo": 0.19616463780403137, + "losses/sft": 2.4209187030792236, + "losses/total": 0.19616463780403137, + "ref_logps/chosen": -45.21485900878906, + "ref_logps/rejected": -40.36358642578125, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.138495445251465, + "rewards/margins": 1.8325070142745972, + "rewards/rejected": -3.9710023403167725, + "step": 2327 + }, + { + "epoch": 2.2, + "grad_norm": 16.2176570892334, + "learning_rate": 1.4847848898216158e-07, + "logps/chosen": -45.099464416503906, + "logps/rejected": -69.90950775146484, + "loss": 0.1891, + "losses/dpo": 0.08385226875543594, + "losses/sft": 1.7653790712356567, + "losses/total": 0.08385226875543594, + "ref_logps/chosen": -30.71889305114746, + "ref_logps/rejected": -34.14701843261719, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4380574226379395, + "rewards/margins": 2.1381921768188477, + "rewards/rejected": -3.576249599456787, + "step": 2328 + }, + { + "epoch": 2.2, + "grad_norm": 12.499853134155273, + "learning_rate": 1.483036026582721e-07, + "logps/chosen": -48.76898956298828, + "logps/rejected": -87.23249053955078, + "loss": 0.1081, + "losses/dpo": 0.11485821008682251, + "losses/sft": 2.2997212409973145, + "losses/total": 0.11485821008682251, + "ref_logps/chosen": -34.328887939453125, + "ref_logps/rejected": -44.967140197753906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4440099000930786, + "rewards/margins": 2.782524585723877, + "rewards/rejected": -4.226534843444824, + "step": 2329 + }, + { + "epoch": 2.2, + "grad_norm": 15.476034164428711, + "learning_rate": 1.4812871633438266e-07, + "logps/chosen": -56.94268035888672, + "logps/rejected": -84.33056640625, + "loss": 0.148, + "losses/dpo": 0.1531893014907837, + "losses/sft": 1.8435088396072388, + "losses/total": 0.1531893014907837, + "ref_logps/chosen": -37.59951400756836, + "ref_logps/rejected": -44.7179069519043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9343163967132568, + "rewards/margins": 2.0269503593444824, + "rewards/rejected": -3.9612669944763184, + "step": 2330 + }, + { + "epoch": 2.2, + "grad_norm": 19.033321380615234, + "learning_rate": 1.4795383001049317e-07, + "logps/chosen": -55.84258270263672, + "logps/rejected": -97.86953735351562, + "loss": 0.2389, + "losses/dpo": 0.44569632411003113, + "losses/sft": 2.1921885013580322, + "losses/total": 0.44569632411003113, + "ref_logps/chosen": -36.10467529296875, + "ref_logps/rejected": -53.268577575683594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9737910032272339, + "rewards/margins": 2.486304521560669, + "rewards/rejected": -4.460095405578613, + "step": 2331 + }, + { + "epoch": 2.2, + "grad_norm": 26.962873458862305, + "learning_rate": 1.477789436866037e-07, + "logps/chosen": -53.0608024597168, + "logps/rejected": -69.47300720214844, + "loss": 0.3438, + "losses/dpo": 0.6293489933013916, + "losses/sft": 2.8936233520507812, + "losses/total": 0.6293489933013916, + "ref_logps/chosen": -35.521522521972656, + "ref_logps/rejected": -37.66744613647461, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7539280652999878, + "rewards/margins": 1.4266278743743896, + "rewards/rejected": -3.180555820465088, + "step": 2332 + }, + { + "epoch": 2.2, + "grad_norm": 17.8859806060791, + "learning_rate": 1.4760405736271422e-07, + "logps/chosen": -49.18883514404297, + "logps/rejected": -88.13423156738281, + "loss": 0.202, + "losses/dpo": 0.19137877225875854, + "losses/sft": 2.218813180923462, + "losses/total": 0.19137877225875854, + "ref_logps/chosen": -27.740039825439453, + "ref_logps/rejected": -39.538360595703125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1448793411254883, + "rewards/margins": 2.714707851409912, + "rewards/rejected": -4.8595871925354, + "step": 2333 + }, + { + "epoch": 2.2, + "grad_norm": 23.893566131591797, + "learning_rate": 1.4742917103882476e-07, + "logps/chosen": -57.67502975463867, + "logps/rejected": -82.77272033691406, + "loss": 0.2789, + "losses/dpo": 0.17580771446228027, + "losses/sft": 2.105759382247925, + "losses/total": 0.17580771446228027, + "ref_logps/chosen": -38.39775085449219, + "ref_logps/rejected": -43.19354248046875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9277279376983643, + "rewards/margins": 2.0301897525787354, + "rewards/rejected": -3.9579176902770996, + "step": 2334 + }, + { + "epoch": 2.2, + "grad_norm": 16.83787727355957, + "learning_rate": 1.4725428471493527e-07, + "logps/chosen": -60.886390686035156, + "logps/rejected": -92.79360961914062, + "loss": 0.1671, + "losses/dpo": 0.2394506186246872, + "losses/sft": 1.7804149389266968, + "losses/total": 0.2394506186246872, + "ref_logps/chosen": -40.23788070678711, + "ref_logps/rejected": -47.62163543701172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0648512840270996, + "rewards/margins": 2.4523463249206543, + "rewards/rejected": -4.517197608947754, + "step": 2335 + }, + { + "epoch": 2.21, + "grad_norm": 22.526973724365234, + "learning_rate": 1.470793983910458e-07, + "logps/chosen": -62.29762268066406, + "logps/rejected": -85.05091094970703, + "loss": 0.2611, + "losses/dpo": 0.11467424035072327, + "losses/sft": 1.7647268772125244, + "losses/total": 0.11467424035072327, + "ref_logps/chosen": -40.084190368652344, + "ref_logps/rejected": -41.87248992919922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2213430404663086, + "rewards/margins": 2.096498966217041, + "rewards/rejected": -4.317842483520508, + "step": 2336 + }, + { + "epoch": 2.21, + "grad_norm": 12.491043090820312, + "learning_rate": 1.4690451206715635e-07, + "logps/chosen": -52.1961669921875, + "logps/rejected": -92.15484619140625, + "loss": 0.0957, + "losses/dpo": 0.10348942875862122, + "losses/sft": 2.396301507949829, + "losses/total": 0.10348942875862122, + "ref_logps/chosen": -31.215700149536133, + "ref_logps/rejected": -45.03858947753906, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.09804630279541, + "rewards/margins": 2.613579511642456, + "rewards/rejected": -4.711626052856445, + "step": 2337 + }, + { + "epoch": 2.21, + "grad_norm": 18.863603591918945, + "learning_rate": 1.4672962574326687e-07, + "logps/chosen": -46.332733154296875, + "logps/rejected": -79.79254150390625, + "loss": 0.2002, + "losses/dpo": 0.18961940705776215, + "losses/sft": 2.0179660320281982, + "losses/total": 0.18961940705776215, + "ref_logps/chosen": -31.654754638671875, + "ref_logps/rejected": -40.82588195800781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4677982330322266, + "rewards/margins": 2.4288666248321533, + "rewards/rejected": -3.896665096282959, + "step": 2338 + }, + { + "epoch": 2.21, + "grad_norm": 25.21454620361328, + "learning_rate": 1.465547394193774e-07, + "logps/chosen": -65.52156066894531, + "logps/rejected": -100.70330047607422, + "loss": 0.2241, + "losses/dpo": 0.26555219292640686, + "losses/sft": 1.5325287580490112, + "losses/total": 0.26555219292640686, + "ref_logps/chosen": -36.212493896484375, + "ref_logps/rejected": -48.46067810058594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9309067726135254, + "rewards/margins": 2.293355703353882, + "rewards/rejected": -5.224262237548828, + "step": 2339 + }, + { + "epoch": 2.21, + "grad_norm": 20.153215408325195, + "learning_rate": 1.4637985309548792e-07, + "logps/chosen": -44.94344711303711, + "logps/rejected": -74.34300231933594, + "loss": 0.2596, + "losses/dpo": 0.20611099898815155, + "losses/sft": 1.7320812940597534, + "losses/total": 0.20611099898815155, + "ref_logps/chosen": -30.651456832885742, + "ref_logps/rejected": -37.26879119873047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4291990995407104, + "rewards/margins": 2.27822208404541, + "rewards/rejected": -3.70742130279541, + "step": 2340 + }, + { + "epoch": 2.21, + "grad_norm": 20.724573135375977, + "learning_rate": 1.4620496677159846e-07, + "logps/chosen": -61.433876037597656, + "logps/rejected": -94.84131622314453, + "loss": 0.2727, + "losses/dpo": 0.18200325965881348, + "losses/sft": 2.288672924041748, + "losses/total": 0.18200325965881348, + "ref_logps/chosen": -39.890411376953125, + "ref_logps/rejected": -47.58531188964844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.154345989227295, + "rewards/margins": 2.5712552070617676, + "rewards/rejected": -4.725600719451904, + "step": 2341 + }, + { + "epoch": 2.21, + "grad_norm": 17.30550765991211, + "learning_rate": 1.4603008044770897e-07, + "logps/chosen": -43.660831451416016, + "logps/rejected": -93.08619689941406, + "loss": 0.204, + "losses/dpo": 0.06266417354345322, + "losses/sft": 1.5552688837051392, + "losses/total": 0.06266417354345322, + "ref_logps/chosen": -27.48443031311035, + "ref_logps/rejected": -51.1119270324707, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6176401376724243, + "rewards/margins": 2.5797863006591797, + "rewards/rejected": -4.197426795959473, + "step": 2342 + }, + { + "epoch": 2.21, + "grad_norm": 18.430200576782227, + "learning_rate": 1.458551941238195e-07, + "logps/chosen": -49.0999755859375, + "logps/rejected": -91.00870513916016, + "loss": 0.1564, + "losses/dpo": 0.10836179554462433, + "losses/sft": 1.6327718496322632, + "losses/total": 0.10836179554462433, + "ref_logps/chosen": -32.151344299316406, + "ref_logps/rejected": -47.25897216796875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6948630809783936, + "rewards/margins": 2.680109977722168, + "rewards/rejected": -4.374973297119141, + "step": 2343 + }, + { + "epoch": 2.21, + "grad_norm": 22.52871322631836, + "learning_rate": 1.4568030779993005e-07, + "logps/chosen": -63.45011520385742, + "logps/rejected": -87.78423309326172, + "loss": 0.2481, + "losses/dpo": 0.36345982551574707, + "losses/sft": 3.078723669052124, + "losses/total": 0.36345982551574707, + "ref_logps/chosen": -43.00407791137695, + "ref_logps/rejected": -46.24573516845703, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0446040630340576, + "rewards/margins": 2.109245777130127, + "rewards/rejected": -4.1538496017456055, + "step": 2344 + }, + { + "epoch": 2.21, + "grad_norm": 24.400203704833984, + "learning_rate": 1.4550542147604056e-07, + "logps/chosen": -53.13116455078125, + "logps/rejected": -88.30104064941406, + "loss": 0.22, + "losses/dpo": 0.12786290049552917, + "losses/sft": 2.163546562194824, + "losses/total": 0.12786290049552917, + "ref_logps/chosen": -34.274723052978516, + "ref_logps/rejected": -46.45326614379883, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8856441974639893, + "rewards/margins": 2.29913330078125, + "rewards/rejected": -4.18477725982666, + "step": 2345 + }, + { + "epoch": 2.22, + "grad_norm": 18.022647857666016, + "learning_rate": 1.453305351521511e-07, + "logps/chosen": -43.55058288574219, + "logps/rejected": -78.72930908203125, + "loss": 0.171, + "losses/dpo": 0.03637959808111191, + "losses/sft": 2.0351409912109375, + "losses/total": 0.03637959808111191, + "ref_logps/chosen": -30.565048217773438, + "ref_logps/rejected": -40.70380401611328, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.298553228378296, + "rewards/margins": 2.503997564315796, + "rewards/rejected": -3.8025505542755127, + "step": 2346 + }, + { + "epoch": 2.22, + "grad_norm": 32.07356643676758, + "learning_rate": 1.451556488282616e-07, + "logps/chosen": -61.853782653808594, + "logps/rejected": -83.10594177246094, + "loss": 0.3792, + "losses/dpo": 0.44469523429870605, + "losses/sft": 2.6296372413635254, + "losses/total": 0.44469523429870605, + "ref_logps/chosen": -36.15470504760742, + "ref_logps/rejected": -42.85236740112305, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5699076652526855, + "rewards/margins": 1.455450177192688, + "rewards/rejected": -4.025358200073242, + "step": 2347 + }, + { + "epoch": 2.22, + "grad_norm": 16.023487091064453, + "learning_rate": 1.4498076250437215e-07, + "logps/chosen": -55.09426498413086, + "logps/rejected": -93.76437377929688, + "loss": 0.1763, + "losses/dpo": 0.07367486506700516, + "losses/sft": 2.2977771759033203, + "losses/total": 0.07367486506700516, + "ref_logps/chosen": -34.59999084472656, + "ref_logps/rejected": -49.805213928222656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.049427032470703, + "rewards/margins": 2.3464887142181396, + "rewards/rejected": -4.395915508270264, + "step": 2348 + }, + { + "epoch": 2.22, + "grad_norm": 16.10032844543457, + "learning_rate": 1.448058761804827e-07, + "logps/chosen": -58.19569396972656, + "logps/rejected": -78.85516357421875, + "loss": 0.2249, + "losses/dpo": 0.41694408655166626, + "losses/sft": 1.5890512466430664, + "losses/total": 0.41694408655166626, + "ref_logps/chosen": -43.76419448852539, + "ref_logps/rejected": -40.66889572143555, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4431498050689697, + "rewards/margins": 2.3754773139953613, + "rewards/rejected": -3.818627119064331, + "step": 2349 + }, + { + "epoch": 2.22, + "grad_norm": 17.756059646606445, + "learning_rate": 1.446309898565932e-07, + "logps/chosen": -58.808738708496094, + "logps/rejected": -93.82316589355469, + "loss": 0.1624, + "losses/dpo": 0.36172622442245483, + "losses/sft": 2.4447214603424072, + "losses/total": 0.36172622442245483, + "ref_logps/chosen": -40.85578536987305, + "ref_logps/rejected": -51.608863830566406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7952957153320312, + "rewards/margins": 2.4261350631713867, + "rewards/rejected": -4.221430778503418, + "step": 2350 + }, + { + "epoch": 2.22, + "grad_norm": 22.392757415771484, + "learning_rate": 1.4445610353270374e-07, + "logps/chosen": -39.621299743652344, + "logps/rejected": -76.97991943359375, + "loss": 0.271, + "losses/dpo": 0.08346450328826904, + "losses/sft": 1.575729489326477, + "losses/total": 0.08346450328826904, + "ref_logps/chosen": -23.799175262451172, + "ref_logps/rejected": -36.59522247314453, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.582212209701538, + "rewards/margins": 2.4562578201293945, + "rewards/rejected": -4.038470268249512, + "step": 2351 + }, + { + "epoch": 2.22, + "grad_norm": 11.101428031921387, + "learning_rate": 1.4428121720881425e-07, + "logps/chosen": -46.938114166259766, + "logps/rejected": -87.33726501464844, + "loss": 0.0978, + "losses/dpo": 0.07818686217069626, + "losses/sft": 2.0432565212249756, + "losses/total": 0.07818686217069626, + "ref_logps/chosen": -34.40842819213867, + "ref_logps/rejected": -44.21738815307617, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2529685497283936, + "rewards/margins": 3.059019088745117, + "rewards/rejected": -4.31198787689209, + "step": 2352 + }, + { + "epoch": 2.22, + "grad_norm": 18.5439395904541, + "learning_rate": 1.441063308849248e-07, + "logps/chosen": -54.774200439453125, + "logps/rejected": -93.36119079589844, + "loss": 0.1625, + "losses/dpo": 0.08087873458862305, + "losses/sft": 2.0139360427856445, + "losses/total": 0.08087873458862305, + "ref_logps/chosen": -33.428253173828125, + "ref_logps/rejected": -44.51591110229492, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1345949172973633, + "rewards/margins": 2.7499332427978516, + "rewards/rejected": -4.884528160095215, + "step": 2353 + }, + { + "epoch": 2.22, + "grad_norm": 29.636119842529297, + "learning_rate": 1.439314445610353e-07, + "logps/chosen": -71.17277526855469, + "logps/rejected": -88.97742462158203, + "loss": 0.341, + "losses/dpo": 0.40482181310653687, + "losses/sft": 2.618036985397339, + "losses/total": 0.40482181310653687, + "ref_logps/chosen": -44.85466766357422, + "ref_logps/rejected": -45.272884368896484, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.631810188293457, + "rewards/margins": 1.7386436462402344, + "rewards/rejected": -4.370453834533691, + "step": 2354 + }, + { + "epoch": 2.22, + "grad_norm": 31.361713409423828, + "learning_rate": 1.4375655823714585e-07, + "logps/chosen": -63.979164123535156, + "logps/rejected": -75.9061050415039, + "loss": 0.3415, + "losses/dpo": 0.20038992166519165, + "losses/sft": 2.0793874263763428, + "losses/total": 0.20038992166519165, + "ref_logps/chosen": -43.39042282104492, + "ref_logps/rejected": -38.34117889404297, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0588741302490234, + "rewards/margins": 1.6976183652877808, + "rewards/rejected": -3.7564926147460938, + "step": 2355 + }, + { + "epoch": 2.22, + "grad_norm": 28.45371437072754, + "learning_rate": 1.4358167191325638e-07, + "logps/chosen": -51.93365478515625, + "logps/rejected": -75.81988525390625, + "loss": 0.3713, + "losses/dpo": 0.4351145029067993, + "losses/sft": 2.2038331031799316, + "losses/total": 0.4351145029067993, + "ref_logps/chosen": -32.08757019042969, + "ref_logps/rejected": -40.00370788574219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9846086502075195, + "rewards/margins": 1.5970089435577393, + "rewards/rejected": -3.581617593765259, + "step": 2356 + }, + { + "epoch": 2.23, + "grad_norm": 15.803486824035645, + "learning_rate": 1.4340678558936692e-07, + "logps/chosen": -48.984432220458984, + "logps/rejected": -86.11408233642578, + "loss": 0.1101, + "losses/dpo": 0.014929844997823238, + "losses/sft": 1.5590956211090088, + "losses/total": 0.014929844997823238, + "ref_logps/chosen": -37.49921417236328, + "ref_logps/rejected": -46.27743148803711, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.148522138595581, + "rewards/margins": 2.8351430892944336, + "rewards/rejected": -3.9836652278900146, + "step": 2357 + }, + { + "epoch": 2.23, + "grad_norm": 22.702421188354492, + "learning_rate": 1.4323189926547744e-07, + "logps/chosen": -51.1621208190918, + "logps/rejected": -63.68791961669922, + "loss": 0.315, + "losses/dpo": 0.33401402831077576, + "losses/sft": 1.608689308166504, + "losses/total": 0.33401402831077576, + "ref_logps/chosen": -36.11867904663086, + "ref_logps/rejected": -29.403697967529297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5043443441390991, + "rewards/margins": 1.9240778684616089, + "rewards/rejected": -3.428422451019287, + "step": 2358 + }, + { + "epoch": 2.23, + "grad_norm": 20.503326416015625, + "learning_rate": 1.4305701294158795e-07, + "logps/chosen": -56.55320739746094, + "logps/rejected": -74.6923599243164, + "loss": 0.1809, + "losses/dpo": 0.2541070878505707, + "losses/sft": 1.267479658126831, + "losses/total": 0.2541070878505707, + "ref_logps/chosen": -37.958709716796875, + "ref_logps/rejected": -35.907806396484375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.859449863433838, + "rewards/margins": 2.01900577545166, + "rewards/rejected": -3.878455638885498, + "step": 2359 + }, + { + "epoch": 2.23, + "grad_norm": 24.33203887939453, + "learning_rate": 1.428821266176985e-07, + "logps/chosen": -61.42462158203125, + "logps/rejected": -82.93911743164062, + "loss": 0.2563, + "losses/dpo": 0.2705605626106262, + "losses/sft": 2.0370235443115234, + "losses/total": 0.2705605626106262, + "ref_logps/chosen": -42.64265823364258, + "ref_logps/rejected": -45.80293273925781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8781962394714355, + "rewards/margins": 1.8354219198226929, + "rewards/rejected": -3.713618278503418, + "step": 2360 + }, + { + "epoch": 2.23, + "grad_norm": 25.733675003051758, + "learning_rate": 1.42707240293809e-07, + "logps/chosen": -53.19091033935547, + "logps/rejected": -80.37325286865234, + "loss": 0.3386, + "losses/dpo": 0.33262988924980164, + "losses/sft": 2.1511356830596924, + "losses/total": 0.33262988924980164, + "ref_logps/chosen": -36.03321075439453, + "ref_logps/rejected": -45.60382843017578, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7157700061798096, + "rewards/margins": 1.7611720561981201, + "rewards/rejected": -3.4769420623779297, + "step": 2361 + }, + { + "epoch": 2.23, + "grad_norm": 21.893434524536133, + "learning_rate": 1.4253235396991957e-07, + "logps/chosen": -46.79673767089844, + "logps/rejected": -77.75418090820312, + "loss": 0.2763, + "losses/dpo": 0.23092472553253174, + "losses/sft": 2.5948309898376465, + "losses/total": 0.23092472553253174, + "ref_logps/chosen": -31.810062408447266, + "ref_logps/rejected": -43.440086364746094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4986672401428223, + "rewards/margins": 1.932742953300476, + "rewards/rejected": -3.431410312652588, + "step": 2362 + }, + { + "epoch": 2.23, + "grad_norm": 31.391372680664062, + "learning_rate": 1.4235746764603008e-07, + "logps/chosen": -51.307395935058594, + "logps/rejected": -78.97651672363281, + "loss": 0.289, + "losses/dpo": 0.30398237705230713, + "losses/sft": 1.74521803855896, + "losses/total": 0.30398237705230713, + "ref_logps/chosen": -35.805572509765625, + "ref_logps/rejected": -42.899356842041016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5501821041107178, + "rewards/margins": 2.057533025741577, + "rewards/rejected": -3.607715368270874, + "step": 2363 + }, + { + "epoch": 2.23, + "grad_norm": 9.573360443115234, + "learning_rate": 1.4218258132214062e-07, + "logps/chosen": -56.36796569824219, + "logps/rejected": -96.30144500732422, + "loss": 0.1066, + "losses/dpo": 0.18614837527275085, + "losses/sft": 2.1442477703094482, + "losses/total": 0.18614837527275085, + "ref_logps/chosen": -38.60167694091797, + "ref_logps/rejected": -52.30154037475586, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7766289710998535, + "rewards/margins": 2.6233623027801514, + "rewards/rejected": -4.399991035461426, + "step": 2364 + }, + { + "epoch": 2.23, + "grad_norm": 12.25535774230957, + "learning_rate": 1.4200769499825113e-07, + "logps/chosen": -57.207275390625, + "logps/rejected": -87.8372802734375, + "loss": 0.1169, + "losses/dpo": 0.1619815230369568, + "losses/sft": 2.0699262619018555, + "losses/total": 0.1619815230369568, + "ref_logps/chosen": -42.47515106201172, + "ref_logps/rejected": -45.31751251220703, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.473212718963623, + "rewards/margins": 2.778763771057129, + "rewards/rejected": -4.251976490020752, + "step": 2365 + }, + { + "epoch": 2.23, + "grad_norm": 21.626710891723633, + "learning_rate": 1.4183280867436164e-07, + "logps/chosen": -50.645103454589844, + "logps/rejected": -66.97547912597656, + "loss": 0.2146, + "losses/dpo": 0.21027088165283203, + "losses/sft": 1.540096640586853, + "losses/total": 0.21027088165283203, + "ref_logps/chosen": -31.896881103515625, + "ref_logps/rejected": -30.79705238342285, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8748222589492798, + "rewards/margins": 1.7430202960968018, + "rewards/rejected": -3.617842674255371, + "step": 2366 + }, + { + "epoch": 2.24, + "grad_norm": 31.71278190612793, + "learning_rate": 1.4165792235047218e-07, + "logps/chosen": -53.707672119140625, + "logps/rejected": -86.32237243652344, + "loss": 0.3126, + "losses/dpo": 0.06811884790658951, + "losses/sft": 2.075042486190796, + "losses/total": 0.06811884790658951, + "ref_logps/chosen": -35.07553482055664, + "ref_logps/rejected": -44.44194793701172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8632137775421143, + "rewards/margins": 2.3248291015625, + "rewards/rejected": -4.188042640686035, + "step": 2367 + }, + { + "epoch": 2.24, + "grad_norm": 36.270774841308594, + "learning_rate": 1.4148303602658272e-07, + "logps/chosen": -59.30921173095703, + "logps/rejected": -73.14798736572266, + "loss": 0.491, + "losses/dpo": 0.2740963399410248, + "losses/sft": 2.3004026412963867, + "losses/total": 0.2740963399410248, + "ref_logps/chosen": -34.68144607543945, + "ref_logps/rejected": -36.50506591796875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4627766609191895, + "rewards/margins": 1.2015153169631958, + "rewards/rejected": -3.6642918586730957, + "step": 2368 + }, + { + "epoch": 2.24, + "grad_norm": 25.790407180786133, + "learning_rate": 1.4130814970269326e-07, + "logps/chosen": -50.60401916503906, + "logps/rejected": -75.64742279052734, + "loss": 0.2233, + "losses/dpo": 0.2719334661960602, + "losses/sft": 2.431874990463257, + "losses/total": 0.2719334661960602, + "ref_logps/chosen": -33.43584442138672, + "ref_logps/rejected": -35.54055404663086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7168177366256714, + "rewards/margins": 2.2938692569732666, + "rewards/rejected": -4.010686874389648, + "step": 2369 + }, + { + "epoch": 2.24, + "grad_norm": 21.304140090942383, + "learning_rate": 1.4113326337880377e-07, + "logps/chosen": -48.10052490234375, + "logps/rejected": -89.85772705078125, + "loss": 0.1887, + "losses/dpo": 0.12991724908351898, + "losses/sft": 1.800195574760437, + "losses/total": 0.12991724908351898, + "ref_logps/chosen": -32.903175354003906, + "ref_logps/rejected": -51.25528335571289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.519735336303711, + "rewards/margins": 2.3405098915100098, + "rewards/rejected": -3.8602449893951416, + "step": 2370 + }, + { + "epoch": 2.24, + "grad_norm": 21.90868377685547, + "learning_rate": 1.409583770549143e-07, + "logps/chosen": -55.55950164794922, + "logps/rejected": -85.34005737304688, + "loss": 0.3056, + "losses/dpo": 0.6460038423538208, + "losses/sft": 1.9490422010421753, + "losses/total": 0.6460038423538208, + "ref_logps/chosen": -33.34410095214844, + "ref_logps/rejected": -44.41395568847656, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2215399742126465, + "rewards/margins": 1.871070384979248, + "rewards/rejected": -4.0926103591918945, + "step": 2371 + }, + { + "epoch": 2.24, + "grad_norm": 35.088531494140625, + "learning_rate": 1.4078349073102483e-07, + "logps/chosen": -60.84961700439453, + "logps/rejected": -65.9256591796875, + "loss": 0.4156, + "losses/dpo": 0.5359123945236206, + "losses/sft": 3.1174604892730713, + "losses/total": 0.5359123945236206, + "ref_logps/chosen": -38.57980728149414, + "ref_logps/rejected": -30.240612030029297, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.226980686187744, + "rewards/margins": 1.3415242433547974, + "rewards/rejected": -3.568504810333252, + "step": 2372 + }, + { + "epoch": 2.24, + "grad_norm": 24.721359252929688, + "learning_rate": 1.4060860440713534e-07, + "logps/chosen": -74.81805419921875, + "logps/rejected": -91.27822875976562, + "loss": 0.2603, + "losses/dpo": 0.43100637197494507, + "losses/sft": 2.326233148574829, + "losses/total": 0.43100637197494507, + "ref_logps/chosen": -49.39601135253906, + "ref_logps/rejected": -43.19419860839844, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5422043800354004, + "rewards/margins": 2.2661991119384766, + "rewards/rejected": -4.808403491973877, + "step": 2373 + }, + { + "epoch": 2.24, + "grad_norm": 33.49697494506836, + "learning_rate": 1.4043371808324588e-07, + "logps/chosen": -46.592891693115234, + "logps/rejected": -66.65484619140625, + "loss": 0.412, + "losses/dpo": 0.2121744453907013, + "losses/sft": 1.344349980354309, + "losses/total": 0.2121744453907013, + "ref_logps/chosen": -31.860713958740234, + "ref_logps/rejected": -37.57476043701172, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4732177257537842, + "rewards/margins": 1.4347916841506958, + "rewards/rejected": -2.9080092906951904, + "step": 2374 + }, + { + "epoch": 2.24, + "grad_norm": 18.769556045532227, + "learning_rate": 1.4025883175935642e-07, + "logps/chosen": -58.47972869873047, + "logps/rejected": -79.76788330078125, + "loss": 0.1919, + "losses/dpo": 0.21788561344146729, + "losses/sft": 2.000558376312256, + "losses/total": 0.21788561344146729, + "ref_logps/chosen": -42.12378692626953, + "ref_logps/rejected": -41.602020263671875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.635594129562378, + "rewards/margins": 2.1809921264648438, + "rewards/rejected": -3.8165862560272217, + "step": 2375 + }, + { + "epoch": 2.24, + "grad_norm": 22.678112030029297, + "learning_rate": 1.4008394543546696e-07, + "logps/chosen": -46.83934020996094, + "logps/rejected": -74.54325866699219, + "loss": 0.2744, + "losses/dpo": 0.034999292343854904, + "losses/sft": 1.8867835998535156, + "losses/total": 0.034999292343854904, + "ref_logps/chosen": -30.624225616455078, + "ref_logps/rejected": -38.76499938964844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.621511697769165, + "rewards/margins": 1.9563138484954834, + "rewards/rejected": -3.5778255462646484, + "step": 2376 + }, + { + "epoch": 2.24, + "grad_norm": 21.544294357299805, + "learning_rate": 1.3990905911157747e-07, + "logps/chosen": -58.8060302734375, + "logps/rejected": -82.62645721435547, + "loss": 0.2119, + "losses/dpo": 0.2927456498146057, + "losses/sft": 2.033172607421875, + "losses/total": 0.2927456498146057, + "ref_logps/chosen": -37.91547393798828, + "ref_logps/rejected": -43.22669219970703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0890557765960693, + "rewards/margins": 1.850920557975769, + "rewards/rejected": -3.939976453781128, + "step": 2377 + }, + { + "epoch": 2.25, + "grad_norm": 22.206777572631836, + "learning_rate": 1.39734172787688e-07, + "logps/chosen": -55.01869201660156, + "logps/rejected": -75.8746109008789, + "loss": 0.2801, + "losses/dpo": 0.16016580164432526, + "losses/sft": 2.433382034301758, + "losses/total": 0.16016580164432526, + "ref_logps/chosen": -32.20459747314453, + "ref_logps/rejected": -33.72629165649414, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.281409740447998, + "rewards/margins": 1.9334220886230469, + "rewards/rejected": -4.214831829071045, + "step": 2378 + }, + { + "epoch": 2.25, + "grad_norm": 16.628137588500977, + "learning_rate": 1.3955928646379852e-07, + "logps/chosen": -52.125919342041016, + "logps/rejected": -88.80117797851562, + "loss": 0.1552, + "losses/dpo": 0.12228988856077194, + "losses/sft": 1.8678429126739502, + "losses/total": 0.12228988856077194, + "ref_logps/chosen": -30.66082000732422, + "ref_logps/rejected": -42.06886291503906, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.146510124206543, + "rewards/margins": 2.5267210006713867, + "rewards/rejected": -4.6732306480407715, + "step": 2379 + }, + { + "epoch": 2.25, + "grad_norm": 28.080888748168945, + "learning_rate": 1.3938440013990903e-07, + "logps/chosen": -56.184120178222656, + "logps/rejected": -83.13200378417969, + "loss": 0.3831, + "losses/dpo": 0.04662206023931503, + "losses/sft": 2.3052868843078613, + "losses/total": 0.04662206023931503, + "ref_logps/chosen": -35.02573776245117, + "ref_logps/rejected": -39.637577056884766, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1158385276794434, + "rewards/margins": 2.2336044311523438, + "rewards/rejected": -4.349442481994629, + "step": 2380 + }, + { + "epoch": 2.25, + "grad_norm": 19.871089935302734, + "learning_rate": 1.392095138160196e-07, + "logps/chosen": -62.149200439453125, + "logps/rejected": -101.46636962890625, + "loss": 0.1846, + "losses/dpo": 0.22456835210323334, + "losses/sft": 2.095123767852783, + "losses/total": 0.22456835210323334, + "ref_logps/chosen": -41.12150192260742, + "ref_logps/rejected": -53.73661422729492, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1027703285217285, + "rewards/margins": 2.6702051162719727, + "rewards/rejected": -4.772974967956543, + "step": 2381 + }, + { + "epoch": 2.25, + "grad_norm": 32.6857795715332, + "learning_rate": 1.390346274921301e-07, + "logps/chosen": -49.76288604736328, + "logps/rejected": -80.27776336669922, + "loss": 0.3087, + "losses/dpo": 0.052292875945568085, + "losses/sft": 1.8107963800430298, + "losses/total": 0.052292875945568085, + "ref_logps/chosen": -31.52740478515625, + "ref_logps/rejected": -39.43199920654297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8235479593276978, + "rewards/margins": 2.261028528213501, + "rewards/rejected": -4.084576606750488, + "step": 2382 + }, + { + "epoch": 2.25, + "grad_norm": 11.212374687194824, + "learning_rate": 1.3885974116824065e-07, + "logps/chosen": -48.82699966430664, + "logps/rejected": -77.65071105957031, + "loss": 0.1289, + "losses/dpo": 0.1088331937789917, + "losses/sft": 1.497929573059082, + "losses/total": 0.1088331937789917, + "ref_logps/chosen": -35.838314056396484, + "ref_logps/rejected": -37.895538330078125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2988685369491577, + "rewards/margins": 2.676649570465088, + "rewards/rejected": -3.975518226623535, + "step": 2383 + }, + { + "epoch": 2.25, + "grad_norm": 24.601694107055664, + "learning_rate": 1.3868485484435116e-07, + "logps/chosen": -56.765804290771484, + "logps/rejected": -81.39866638183594, + "loss": 0.2784, + "losses/dpo": 0.2070329636335373, + "losses/sft": 2.3070905208587646, + "losses/total": 0.2070329636335373, + "ref_logps/chosen": -37.4853515625, + "ref_logps/rejected": -43.59917449951172, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9280447959899902, + "rewards/margins": 1.851904273033142, + "rewards/rejected": -3.779949188232422, + "step": 2384 + }, + { + "epoch": 2.25, + "grad_norm": 22.381103515625, + "learning_rate": 1.385099685204617e-07, + "logps/chosen": -51.57588195800781, + "logps/rejected": -84.54014587402344, + "loss": 0.2206, + "losses/dpo": 0.3153095245361328, + "losses/sft": 2.236558437347412, + "losses/total": 0.3153095245361328, + "ref_logps/chosen": -31.67160415649414, + "ref_logps/rejected": -43.47623062133789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.990427851676941, + "rewards/margins": 2.115964412689209, + "rewards/rejected": -4.106391906738281, + "step": 2385 + }, + { + "epoch": 2.25, + "grad_norm": 12.015800476074219, + "learning_rate": 1.3833508219657221e-07, + "logps/chosen": -64.45236206054688, + "logps/rejected": -99.07717895507812, + "loss": 0.1092, + "losses/dpo": 0.09402421116828918, + "losses/sft": 2.278538465499878, + "losses/total": 0.09402421116828918, + "ref_logps/chosen": -44.67734146118164, + "ref_logps/rejected": -53.46605682373047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9775025844573975, + "rewards/margins": 2.5836098194122314, + "rewards/rejected": -4.561112403869629, + "step": 2386 + }, + { + "epoch": 2.25, + "grad_norm": 29.570575714111328, + "learning_rate": 1.3816019587268273e-07, + "logps/chosen": -59.985172271728516, + "logps/rejected": -72.76171112060547, + "loss": 0.3698, + "losses/dpo": 0.46376681327819824, + "losses/sft": 2.2356157302856445, + "losses/total": 0.46376681327819824, + "ref_logps/chosen": -41.023773193359375, + "ref_logps/rejected": -40.67151641845703, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8961400985717773, + "rewards/margins": 1.3128788471221924, + "rewards/rejected": -3.2090187072753906, + "step": 2387 + }, + { + "epoch": 2.25, + "grad_norm": 25.337995529174805, + "learning_rate": 1.379853095487933e-07, + "logps/chosen": -60.974884033203125, + "logps/rejected": -80.356689453125, + "loss": 0.2601, + "losses/dpo": 0.15893080830574036, + "losses/sft": 2.2057037353515625, + "losses/total": 0.15893080830574036, + "ref_logps/chosen": -43.78391647338867, + "ref_logps/rejected": -43.60730743408203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7190968990325928, + "rewards/margins": 1.9558415412902832, + "rewards/rejected": -3.674938440322876, + "step": 2388 + }, + { + "epoch": 2.26, + "grad_norm": 37.78849411010742, + "learning_rate": 1.378104232249038e-07, + "logps/chosen": -61.546714782714844, + "logps/rejected": -69.55838775634766, + "loss": 0.5174, + "losses/dpo": 0.7308023571968079, + "losses/sft": 2.1762781143188477, + "losses/total": 0.7308023571968079, + "ref_logps/chosen": -35.81056594848633, + "ref_logps/rejected": -31.574665069580078, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.573615074157715, + "rewards/margins": 1.2247569561004639, + "rewards/rejected": -3.7983717918395996, + "step": 2389 + }, + { + "epoch": 2.26, + "grad_norm": 32.76581954956055, + "learning_rate": 1.3763553690101434e-07, + "logps/chosen": -62.23737716674805, + "logps/rejected": -102.82528686523438, + "loss": 0.2494, + "losses/dpo": 0.10765311121940613, + "losses/sft": 2.1158807277679443, + "losses/total": 0.10765311121940613, + "ref_logps/chosen": -40.06000518798828, + "ref_logps/rejected": -56.96181869506836, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2177374362945557, + "rewards/margins": 2.3686089515686035, + "rewards/rejected": -4.586346626281738, + "step": 2390 + }, + { + "epoch": 2.26, + "grad_norm": 17.851558685302734, + "learning_rate": 1.3746065057712486e-07, + "logps/chosen": -54.43299102783203, + "logps/rejected": -95.2369613647461, + "loss": 0.1583, + "losses/dpo": 0.08966552466154099, + "losses/sft": 1.845613956451416, + "losses/total": 0.08966552466154099, + "ref_logps/chosen": -34.57926940917969, + "ref_logps/rejected": -48.206199645996094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9853723049163818, + "rewards/margins": 2.7177042961120605, + "rewards/rejected": -4.7030768394470215, + "step": 2391 + }, + { + "epoch": 2.26, + "grad_norm": 23.088077545166016, + "learning_rate": 1.372857642532354e-07, + "logps/chosen": -51.7119026184082, + "logps/rejected": -79.51332092285156, + "loss": 0.3184, + "losses/dpo": 0.5947718024253845, + "losses/sft": 1.8849107027053833, + "losses/total": 0.5947718024253845, + "ref_logps/chosen": -34.84232711791992, + "ref_logps/rejected": -42.829750061035156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6869573593139648, + "rewards/margins": 1.9813995361328125, + "rewards/rejected": -3.6683568954467773, + "step": 2392 + }, + { + "epoch": 2.26, + "grad_norm": 27.97321128845215, + "learning_rate": 1.371108779293459e-07, + "logps/chosen": -60.52677917480469, + "logps/rejected": -85.35513305664062, + "loss": 0.2969, + "losses/dpo": 0.27566102147102356, + "losses/sft": 1.8344444036483765, + "losses/total": 0.27566102147102356, + "ref_logps/chosen": -41.73499298095703, + "ref_logps/rejected": -47.36833953857422, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8791784048080444, + "rewards/margins": 1.9195013046264648, + "rewards/rejected": -3.798679828643799, + "step": 2393 + }, + { + "epoch": 2.26, + "grad_norm": 24.059640884399414, + "learning_rate": 1.3693599160545645e-07, + "logps/chosen": -64.58656311035156, + "logps/rejected": -81.40239715576172, + "loss": 0.2808, + "losses/dpo": 0.1934591382741928, + "losses/sft": 1.9751237630844116, + "losses/total": 0.1934591382741928, + "ref_logps/chosen": -43.479068756103516, + "ref_logps/rejected": -39.68476104736328, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1107492446899414, + "rewards/margins": 2.06101393699646, + "rewards/rejected": -4.1717634201049805, + "step": 2394 + }, + { + "epoch": 2.26, + "grad_norm": 32.36083221435547, + "learning_rate": 1.3676110528156699e-07, + "logps/chosen": -58.72663497924805, + "logps/rejected": -70.34164428710938, + "loss": 0.3824, + "losses/dpo": 0.33511149883270264, + "losses/sft": 1.4628716707229614, + "losses/total": 0.33511149883270264, + "ref_logps/chosen": -39.878639221191406, + "ref_logps/rejected": -37.69023895263672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8847999572753906, + "rewards/margins": 1.3803400993347168, + "rewards/rejected": -3.2651400566101074, + "step": 2395 + }, + { + "epoch": 2.26, + "grad_norm": 24.17458152770996, + "learning_rate": 1.365862189576775e-07, + "logps/chosen": -49.89300537109375, + "logps/rejected": -68.76500701904297, + "loss": 0.3302, + "losses/dpo": 0.3179434835910797, + "losses/sft": 1.9142531156539917, + "losses/total": 0.3179434835910797, + "ref_logps/chosen": -33.66688919067383, + "ref_logps/rejected": -36.99160385131836, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6226119995117188, + "rewards/margins": 1.5547285079956055, + "rewards/rejected": -3.177340507507324, + "step": 2396 + }, + { + "epoch": 2.26, + "grad_norm": 14.464322090148926, + "learning_rate": 1.3641133263378804e-07, + "logps/chosen": -54.700801849365234, + "logps/rejected": -98.97001647949219, + "loss": 0.1051, + "losses/dpo": 0.13569232821464539, + "losses/sft": 2.107184886932373, + "losses/total": 0.13569232821464539, + "ref_logps/chosen": -35.661781311035156, + "ref_logps/rejected": -47.221923828125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.903902292251587, + "rewards/margins": 3.270906448364258, + "rewards/rejected": -5.174808979034424, + "step": 2397 + }, + { + "epoch": 2.26, + "grad_norm": 24.608543395996094, + "learning_rate": 1.3623644630989855e-07, + "logps/chosen": -56.23943328857422, + "logps/rejected": -78.3583755493164, + "loss": 0.264, + "losses/dpo": 0.2555888593196869, + "losses/sft": 2.392543315887451, + "losses/total": 0.2555888593196869, + "ref_logps/chosen": -36.91911315917969, + "ref_logps/rejected": -44.044044494628906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9320323467254639, + "rewards/margins": 1.4994008541107178, + "rewards/rejected": -3.4314332008361816, + "step": 2398 + }, + { + "epoch": 2.27, + "grad_norm": 22.063926696777344, + "learning_rate": 1.360615599860091e-07, + "logps/chosen": -59.61087417602539, + "logps/rejected": -84.21319580078125, + "loss": 0.2006, + "losses/dpo": 0.30503690242767334, + "losses/sft": 2.4765117168426514, + "losses/total": 0.30503690242767334, + "ref_logps/chosen": -41.401397705078125, + "ref_logps/rejected": -44.19184875488281, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8209476470947266, + "rewards/margins": 2.1811866760253906, + "rewards/rejected": -4.002134323120117, + "step": 2399 + }, + { + "epoch": 2.27, + "grad_norm": 29.38550567626953, + "learning_rate": 1.3588667366211963e-07, + "logps/chosen": -71.275146484375, + "logps/rejected": -89.43241882324219, + "loss": 0.2534, + "losses/dpo": 0.06601622700691223, + "losses/sft": 2.1628527641296387, + "losses/total": 0.06601622700691223, + "ref_logps/chosen": -51.447181701660156, + "ref_logps/rejected": -47.4654426574707, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9827961921691895, + "rewards/margins": 2.213902473449707, + "rewards/rejected": -4.196698188781738, + "step": 2400 + }, + { + "epoch": 2.27, + "grad_norm": 16.175073623657227, + "learning_rate": 1.3571178733823014e-07, + "logps/chosen": -59.07041931152344, + "logps/rejected": -92.49278259277344, + "loss": 0.1621, + "losses/dpo": 0.3248475193977356, + "losses/sft": 1.802823781967163, + "losses/total": 0.3248475193977356, + "ref_logps/chosen": -40.613304138183594, + "ref_logps/rejected": -49.56654357910156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8457117080688477, + "rewards/margins": 2.4469127655029297, + "rewards/rejected": -4.292624473571777, + "step": 2401 + }, + { + "epoch": 2.27, + "grad_norm": 20.280170440673828, + "learning_rate": 1.3553690101434068e-07, + "logps/chosen": -32.33776092529297, + "logps/rejected": -64.39653015136719, + "loss": 0.2531, + "losses/dpo": 0.3441128134727478, + "losses/sft": 1.8245842456817627, + "losses/total": 0.3441128134727478, + "ref_logps/chosen": -23.20067024230957, + "ref_logps/rejected": -34.498069763183594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9137091040611267, + "rewards/margins": 2.076137065887451, + "rewards/rejected": -2.9898462295532227, + "step": 2402 + }, + { + "epoch": 2.27, + "grad_norm": 18.15727424621582, + "learning_rate": 1.353620146904512e-07, + "logps/chosen": -64.0350341796875, + "logps/rejected": -92.22100830078125, + "loss": 0.184, + "losses/dpo": 0.1796242594718933, + "losses/sft": 2.6987340450286865, + "losses/total": 0.1796242594718933, + "ref_logps/chosen": -40.579315185546875, + "ref_logps/rejected": -46.809898376464844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.345571517944336, + "rewards/margins": 2.195540428161621, + "rewards/rejected": -4.541111946105957, + "step": 2403 + }, + { + "epoch": 2.27, + "grad_norm": 16.40632438659668, + "learning_rate": 1.3518712836656173e-07, + "logps/chosen": -54.407630920410156, + "logps/rejected": -84.37345886230469, + "loss": 0.1674, + "losses/dpo": 0.29198169708251953, + "losses/sft": 1.8292515277862549, + "losses/total": 0.29198169708251953, + "ref_logps/chosen": -43.080963134765625, + "ref_logps/rejected": -48.06843566894531, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.132666826248169, + "rewards/margins": 2.497835159301758, + "rewards/rejected": -3.630502223968506, + "step": 2404 + }, + { + "epoch": 2.27, + "grad_norm": 15.126834869384766, + "learning_rate": 1.3501224204267225e-07, + "logps/chosen": -51.35548400878906, + "logps/rejected": -89.8807373046875, + "loss": 0.17, + "losses/dpo": 0.10265373438596725, + "losses/sft": 2.852729320526123, + "losses/total": 0.10265373438596725, + "ref_logps/chosen": -35.860774993896484, + "ref_logps/rejected": -51.35160446166992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5494709014892578, + "rewards/margins": 2.303443193435669, + "rewards/rejected": -3.8529140949249268, + "step": 2405 + }, + { + "epoch": 2.27, + "grad_norm": 26.159523010253906, + "learning_rate": 1.3483735571878278e-07, + "logps/chosen": -58.79133605957031, + "logps/rejected": -61.3942985534668, + "loss": 0.3852, + "losses/dpo": 0.44203728437423706, + "losses/sft": 2.013800859451294, + "losses/total": 0.44203728437423706, + "ref_logps/chosen": -35.162986755371094, + "ref_logps/rejected": -27.51243019104004, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.362834930419922, + "rewards/margins": 1.0253522396087646, + "rewards/rejected": -3.3881869316101074, + "step": 2406 + }, + { + "epoch": 2.27, + "grad_norm": 12.882963180541992, + "learning_rate": 1.3466246939489332e-07, + "logps/chosen": -47.99653244018555, + "logps/rejected": -77.51039123535156, + "loss": 0.1679, + "losses/dpo": 0.18950128555297852, + "losses/sft": 1.9544509649276733, + "losses/total": 0.18950128555297852, + "ref_logps/chosen": -30.21261215209961, + "ref_logps/rejected": -35.950096130371094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7783918380737305, + "rewards/margins": 2.3776371479034424, + "rewards/rejected": -4.156029224395752, + "step": 2407 + }, + { + "epoch": 2.27, + "grad_norm": 22.991592407226562, + "learning_rate": 1.3448758307100384e-07, + "logps/chosen": -55.77557373046875, + "logps/rejected": -82.40957641601562, + "loss": 0.4157, + "losses/dpo": 0.35862863063812256, + "losses/sft": 2.105030059814453, + "losses/total": 0.35862863063812256, + "ref_logps/chosen": -35.95896911621094, + "ref_logps/rejected": -44.39405059814453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9816608428955078, + "rewards/margins": 1.8198909759521484, + "rewards/rejected": -3.8015518188476562, + "step": 2408 + }, + { + "epoch": 2.27, + "grad_norm": 19.473819732666016, + "learning_rate": 1.3431269674711438e-07, + "logps/chosen": -66.45231628417969, + "logps/rejected": -86.8429946899414, + "loss": 0.1994, + "losses/dpo": 0.23618318140506744, + "losses/sft": 2.5551085472106934, + "losses/total": 0.23618318140506744, + "ref_logps/chosen": -46.22987365722656, + "ref_logps/rejected": -47.191810607910156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.022244453430176, + "rewards/margins": 1.9428741931915283, + "rewards/rejected": -3.965118646621704, + "step": 2409 + }, + { + "epoch": 2.28, + "grad_norm": 22.52762794494629, + "learning_rate": 1.341378104232249e-07, + "logps/chosen": -54.11133575439453, + "logps/rejected": -70.26641845703125, + "loss": 0.2801, + "losses/dpo": 0.39310312271118164, + "losses/sft": 2.870839834213257, + "losses/total": 0.39310312271118164, + "ref_logps/chosen": -36.081295013427734, + "ref_logps/rejected": -33.66963195800781, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8030041456222534, + "rewards/margins": 1.8566750288009644, + "rewards/rejected": -3.6596791744232178, + "step": 2410 + }, + { + "epoch": 2.28, + "grad_norm": 24.872575759887695, + "learning_rate": 1.3396292409933543e-07, + "logps/chosen": -49.60558319091797, + "logps/rejected": -70.62821960449219, + "loss": 0.3292, + "losses/dpo": 0.5849834084510803, + "losses/sft": 2.177232027053833, + "losses/total": 0.5849834084510803, + "ref_logps/chosen": -29.09119987487793, + "ref_logps/rejected": -37.09852600097656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.051438808441162, + "rewards/margins": 1.301531195640564, + "rewards/rejected": -3.3529698848724365, + "step": 2411 + }, + { + "epoch": 2.28, + "grad_norm": 17.264997482299805, + "learning_rate": 1.3378803777544594e-07, + "logps/chosen": -60.85906219482422, + "logps/rejected": -89.72657012939453, + "loss": 0.1469, + "losses/dpo": 0.2169148176908493, + "losses/sft": 2.3806140422821045, + "losses/total": 0.2169148176908493, + "ref_logps/chosen": -37.93870544433594, + "ref_logps/rejected": -45.111961364746094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2920358180999756, + "rewards/margins": 2.169424533843994, + "rewards/rejected": -4.461460590362549, + "step": 2412 + }, + { + "epoch": 2.28, + "grad_norm": 23.59300422668457, + "learning_rate": 1.336131514515565e-07, + "logps/chosen": -47.2181396484375, + "logps/rejected": -77.3600082397461, + "loss": 0.3153, + "losses/dpo": 0.057991232722997665, + "losses/sft": 1.221850872039795, + "losses/total": 0.057991232722997665, + "ref_logps/chosen": -30.850021362304688, + "ref_logps/rejected": -41.15699005126953, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6368123292922974, + "rewards/margins": 1.9834893941879272, + "rewards/rejected": -3.6203017234802246, + "step": 2413 + }, + { + "epoch": 2.28, + "grad_norm": 12.763352394104004, + "learning_rate": 1.3343826512766702e-07, + "logps/chosen": -55.571128845214844, + "logps/rejected": -85.33956146240234, + "loss": 0.1619, + "losses/dpo": 0.10864780843257904, + "losses/sft": 2.114583730697632, + "losses/total": 0.10864780843257904, + "ref_logps/chosen": -36.87445068359375, + "ref_logps/rejected": -42.569793701171875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8696682453155518, + "rewards/margins": 2.4073095321655273, + "rewards/rejected": -4.2769775390625, + "step": 2414 + }, + { + "epoch": 2.28, + "grad_norm": 16.972850799560547, + "learning_rate": 1.3326337880377753e-07, + "logps/chosen": -63.30195617675781, + "logps/rejected": -95.20246124267578, + "loss": 0.1504, + "losses/dpo": 0.12854431569576263, + "losses/sft": 1.3681620359420776, + "losses/total": 0.12854431569576263, + "ref_logps/chosen": -43.029457092285156, + "ref_logps/rejected": -50.25859069824219, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.027250051498413, + "rewards/margins": 2.467137336730957, + "rewards/rejected": -4.494387626647949, + "step": 2415 + }, + { + "epoch": 2.28, + "grad_norm": 18.43023681640625, + "learning_rate": 1.3308849247988807e-07, + "logps/chosen": -53.49689483642578, + "logps/rejected": -77.71681213378906, + "loss": 0.2667, + "losses/dpo": 0.3872087895870209, + "losses/sft": 1.7538005113601685, + "losses/total": 0.3872087895870209, + "ref_logps/chosen": -35.99681091308594, + "ref_logps/rejected": -41.74824523925781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7500083446502686, + "rewards/margins": 1.8468482494354248, + "rewards/rejected": -3.5968565940856934, + "step": 2416 + }, + { + "epoch": 2.28, + "grad_norm": 30.0272159576416, + "learning_rate": 1.3291360615599858e-07, + "logps/chosen": -50.79872131347656, + "logps/rejected": -79.16336822509766, + "loss": 0.4276, + "losses/dpo": 0.20233452320098877, + "losses/sft": 1.5577571392059326, + "losses/total": 0.20233452320098877, + "ref_logps/chosen": -33.007076263427734, + "ref_logps/rejected": -43.126792907714844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.779164433479309, + "rewards/margins": 1.824493169784546, + "rewards/rejected": -3.6036574840545654, + "step": 2417 + }, + { + "epoch": 2.28, + "grad_norm": 23.389415740966797, + "learning_rate": 1.3273871983210912e-07, + "logps/chosen": -63.302425384521484, + "logps/rejected": -95.86807250976562, + "loss": 0.2034, + "losses/dpo": 0.2668130695819855, + "losses/sft": 3.084998369216919, + "losses/total": 0.2668130695819855, + "ref_logps/chosen": -39.01983642578125, + "ref_logps/rejected": -48.74082946777344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4282588958740234, + "rewards/margins": 2.2844650745391846, + "rewards/rejected": -4.712724208831787, + "step": 2418 + }, + { + "epoch": 2.28, + "grad_norm": 29.843721389770508, + "learning_rate": 1.3256383350821963e-07, + "logps/chosen": -70.4579086303711, + "logps/rejected": -96.57888793945312, + "loss": 0.2756, + "losses/dpo": 0.23706570267677307, + "losses/sft": 2.3109374046325684, + "losses/total": 0.23706570267677307, + "ref_logps/chosen": -51.540157318115234, + "ref_logps/rejected": -55.11073303222656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8917758464813232, + "rewards/margins": 2.2550394535064697, + "rewards/rejected": -4.146815299987793, + "step": 2419 + }, + { + "epoch": 2.29, + "grad_norm": 26.076583862304688, + "learning_rate": 1.323889471843302e-07, + "logps/chosen": -59.901817321777344, + "logps/rejected": -90.6585464477539, + "loss": 0.4032, + "losses/dpo": 0.1096503883600235, + "losses/sft": 1.7867217063903809, + "losses/total": 0.1096503883600235, + "ref_logps/chosen": -35.881282806396484, + "ref_logps/rejected": -43.73113250732422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4020538330078125, + "rewards/margins": 2.290687322616577, + "rewards/rejected": -4.692741394042969, + "step": 2420 + }, + { + "epoch": 2.29, + "grad_norm": 24.19892692565918, + "learning_rate": 1.322140608604407e-07, + "logps/chosen": -46.25697326660156, + "logps/rejected": -83.18038940429688, + "loss": 0.2321, + "losses/dpo": 0.3132858872413635, + "losses/sft": 2.0489869117736816, + "losses/total": 0.3132858872413635, + "ref_logps/chosen": -27.728519439697266, + "ref_logps/rejected": -42.803306579589844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8528456687927246, + "rewards/margins": 2.1848621368408203, + "rewards/rejected": -4.037708282470703, + "step": 2421 + }, + { + "epoch": 2.29, + "grad_norm": 15.319928169250488, + "learning_rate": 1.3203917453655123e-07, + "logps/chosen": -42.185203552246094, + "logps/rejected": -74.96870422363281, + "loss": 0.2152, + "losses/dpo": 0.1868710219860077, + "losses/sft": 1.4592739343643188, + "losses/total": 0.1868710219860077, + "ref_logps/chosen": -27.96023178100586, + "ref_logps/rejected": -39.26026916503906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.422497034072876, + "rewards/margins": 2.148346185684204, + "rewards/rejected": -3.570843458175659, + "step": 2422 + }, + { + "epoch": 2.29, + "grad_norm": 18.8427677154541, + "learning_rate": 1.3186428821266176e-07, + "logps/chosen": -54.79179382324219, + "logps/rejected": -75.80045318603516, + "loss": 0.2228, + "losses/dpo": 0.2674533426761627, + "losses/sft": 2.0315146446228027, + "losses/total": 0.2674533426761627, + "ref_logps/chosen": -40.59490966796875, + "ref_logps/rejected": -39.550743103027344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4196884632110596, + "rewards/margins": 2.2052831649780273, + "rewards/rejected": -3.624971389770508, + "step": 2423 + }, + { + "epoch": 2.29, + "grad_norm": 24.370243072509766, + "learning_rate": 1.3168940188877228e-07, + "logps/chosen": -65.93751525878906, + "logps/rejected": -82.40924072265625, + "loss": 0.2725, + "losses/dpo": 0.1942099779844284, + "losses/sft": 1.8575749397277832, + "losses/total": 0.1942099779844284, + "ref_logps/chosen": -43.61890411376953, + "ref_logps/rejected": -39.99061584472656, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.231860637664795, + "rewards/margins": 2.0100016593933105, + "rewards/rejected": -4.2418622970581055, + "step": 2424 + }, + { + "epoch": 2.29, + "grad_norm": 12.209527015686035, + "learning_rate": 1.3151451556488282e-07, + "logps/chosen": -52.96317672729492, + "logps/rejected": -84.84678649902344, + "loss": 0.1069, + "losses/dpo": 0.06721833348274231, + "losses/sft": 1.5265836715698242, + "losses/total": 0.06721833348274231, + "ref_logps/chosen": -39.00221252441406, + "ref_logps/rejected": -44.65850830078125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3960963487625122, + "rewards/margins": 2.6227309703826904, + "rewards/rejected": -4.018827438354492, + "step": 2425 + }, + { + "epoch": 2.29, + "grad_norm": 13.12696647644043, + "learning_rate": 1.3133962924099336e-07, + "logps/chosen": -54.62923049926758, + "logps/rejected": -87.98492431640625, + "loss": 0.1217, + "losses/dpo": 0.11109773069620132, + "losses/sft": 1.8667696714401245, + "losses/total": 0.11109773069620132, + "ref_logps/chosen": -38.25688934326172, + "ref_logps/rejected": -47.481788635253906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6372339725494385, + "rewards/margins": 2.4130802154541016, + "rewards/rejected": -4.050313949584961, + "step": 2426 + }, + { + "epoch": 2.29, + "grad_norm": 13.050585746765137, + "learning_rate": 1.311647429171039e-07, + "logps/chosen": -52.75262451171875, + "logps/rejected": -86.94068908691406, + "loss": 0.1198, + "losses/dpo": 0.1460084170103073, + "losses/sft": 2.224973440170288, + "losses/total": 0.1460084170103073, + "ref_logps/chosen": -35.61548614501953, + "ref_logps/rejected": -42.66188049316406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7137138843536377, + "rewards/margins": 2.7141664028167725, + "rewards/rejected": -4.42788028717041, + "step": 2427 + }, + { + "epoch": 2.29, + "grad_norm": 36.96118927001953, + "learning_rate": 1.309898565932144e-07, + "logps/chosen": -71.69375610351562, + "logps/rejected": -80.9984359741211, + "loss": 0.4443, + "losses/dpo": 0.6484484076499939, + "losses/sft": 1.8623161315917969, + "losses/total": 0.6484484076499939, + "ref_logps/chosen": -52.135955810546875, + "ref_logps/rejected": -45.18701934814453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9557801485061646, + "rewards/margins": 1.6253615617752075, + "rewards/rejected": -3.581141948699951, + "step": 2428 + }, + { + "epoch": 2.29, + "grad_norm": 21.450525283813477, + "learning_rate": 1.3081497026932492e-07, + "logps/chosen": -46.33113098144531, + "logps/rejected": -80.69264221191406, + "loss": 0.2107, + "losses/dpo": 0.37929439544677734, + "losses/sft": 1.9647760391235352, + "losses/total": 0.37929439544677734, + "ref_logps/chosen": -31.963848114013672, + "ref_logps/rejected": -43.37226104736328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4367282390594482, + "rewards/margins": 2.295309543609619, + "rewards/rejected": -3.7320377826690674, + "step": 2429 + }, + { + "epoch": 2.29, + "grad_norm": 27.140365600585938, + "learning_rate": 1.3064008394543546e-07, + "logps/chosen": -61.80070495605469, + "logps/rejected": -75.57101440429688, + "loss": 0.2846, + "losses/dpo": 0.18821236491203308, + "losses/sft": 2.1133620738983154, + "losses/total": 0.18821236491203308, + "ref_logps/chosen": -40.08721160888672, + "ref_logps/rejected": -39.165260314941406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.171349287033081, + "rewards/margins": 1.469226598739624, + "rewards/rejected": -3.640575885772705, + "step": 2430 + }, + { + "epoch": 2.3, + "grad_norm": 22.137954711914062, + "learning_rate": 1.3046519762154597e-07, + "logps/chosen": -65.85840606689453, + "logps/rejected": -113.12633514404297, + "loss": 0.1886, + "losses/dpo": 0.03326120227575302, + "losses/sft": 2.66357421875, + "losses/total": 0.03326120227575302, + "ref_logps/chosen": -45.154388427734375, + "ref_logps/rejected": -63.15598678588867, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.070401668548584, + "rewards/margins": 2.926633358001709, + "rewards/rejected": -4.997035026550293, + "step": 2431 + }, + { + "epoch": 2.3, + "grad_norm": 17.694807052612305, + "learning_rate": 1.3029031129765654e-07, + "logps/chosen": -64.27177429199219, + "logps/rejected": -83.77233123779297, + "loss": 0.2356, + "losses/dpo": 0.09184356778860092, + "losses/sft": 1.846799373626709, + "losses/total": 0.09184356778860092, + "ref_logps/chosen": -45.82080841064453, + "ref_logps/rejected": -42.62569808959961, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8450958728790283, + "rewards/margins": 2.2695670127868652, + "rewards/rejected": -4.114663124084473, + "step": 2432 + }, + { + "epoch": 2.3, + "grad_norm": 16.929241180419922, + "learning_rate": 1.3011542497376705e-07, + "logps/chosen": -66.10371398925781, + "logps/rejected": -96.54380798339844, + "loss": 0.1577, + "losses/dpo": 0.0324687696993351, + "losses/sft": 1.9238481521606445, + "losses/total": 0.0324687696993351, + "ref_logps/chosen": -47.739498138427734, + "ref_logps/rejected": -52.19666290283203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8364222049713135, + "rewards/margins": 2.598292112350464, + "rewards/rejected": -4.434714317321777, + "step": 2433 + }, + { + "epoch": 2.3, + "grad_norm": 21.235267639160156, + "learning_rate": 1.299405386498776e-07, + "logps/chosen": -58.0411376953125, + "logps/rejected": -95.41883087158203, + "loss": 0.2436, + "losses/dpo": 0.2698054909706116, + "losses/sft": 2.0384953022003174, + "losses/total": 0.2698054909706116, + "ref_logps/chosen": -36.53667449951172, + "ref_logps/rejected": -47.482093811035156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1504464149475098, + "rewards/margins": 2.6432273387908936, + "rewards/rejected": -4.793673515319824, + "step": 2434 + }, + { + "epoch": 2.3, + "grad_norm": 32.499664306640625, + "learning_rate": 1.297656523259881e-07, + "logps/chosen": -51.987548828125, + "logps/rejected": -71.0145034790039, + "loss": 0.3761, + "losses/dpo": 0.8973588943481445, + "losses/sft": 2.3473894596099854, + "losses/total": 0.8973588943481445, + "ref_logps/chosen": -34.76100158691406, + "ref_logps/rejected": -36.838016510009766, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7226548194885254, + "rewards/margins": 1.6949940919876099, + "rewards/rejected": -3.4176487922668457, + "step": 2435 + }, + { + "epoch": 2.3, + "grad_norm": 24.999605178833008, + "learning_rate": 1.2959076600209861e-07, + "logps/chosen": -60.05938720703125, + "logps/rejected": -89.8451919555664, + "loss": 0.2166, + "losses/dpo": 0.13032609224319458, + "losses/sft": 2.4050872325897217, + "losses/total": 0.13032609224319458, + "ref_logps/chosen": -39.30598449707031, + "ref_logps/rejected": -45.87055587768555, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0753402709960938, + "rewards/margins": 2.3221235275268555, + "rewards/rejected": -4.397463798522949, + "step": 2436 + }, + { + "epoch": 2.3, + "grad_norm": 17.191051483154297, + "learning_rate": 1.2941587967820915e-07, + "logps/chosen": -53.162628173828125, + "logps/rejected": -86.98213958740234, + "loss": 0.1366, + "losses/dpo": 0.09298518300056458, + "losses/sft": 2.3359131813049316, + "losses/total": 0.09298518300056458, + "ref_logps/chosen": -36.64933395385742, + "ref_logps/rejected": -43.11638641357422, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6513291597366333, + "rewards/margins": 2.735246181488037, + "rewards/rejected": -4.386575222015381, + "step": 2437 + }, + { + "epoch": 2.3, + "grad_norm": 22.428136825561523, + "learning_rate": 1.2924099335431967e-07, + "logps/chosen": -57.398590087890625, + "logps/rejected": -96.95325469970703, + "loss": 0.196, + "losses/dpo": 0.14919836819171906, + "losses/sft": 2.094348192214966, + "losses/total": 0.14919836819171906, + "ref_logps/chosen": -29.801013946533203, + "ref_logps/rejected": -48.45726013183594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7597575187683105, + "rewards/margins": 2.089841842651367, + "rewards/rejected": -4.849599361419678, + "step": 2438 + }, + { + "epoch": 2.3, + "grad_norm": 16.206666946411133, + "learning_rate": 1.2906610703043023e-07, + "logps/chosen": -48.533172607421875, + "logps/rejected": -82.30545806884766, + "loss": 0.21, + "losses/dpo": 0.15454255044460297, + "losses/sft": 1.960620641708374, + "losses/total": 0.15454255044460297, + "ref_logps/chosen": -29.086952209472656, + "ref_logps/rejected": -43.64354705810547, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9446223974227905, + "rewards/margins": 1.9215692281723022, + "rewards/rejected": -3.8661916255950928, + "step": 2439 + }, + { + "epoch": 2.3, + "grad_norm": 17.442277908325195, + "learning_rate": 1.2889122070654074e-07, + "logps/chosen": -56.63894271850586, + "logps/rejected": -85.51051330566406, + "loss": 0.168, + "losses/dpo": 0.13820341229438782, + "losses/sft": 1.964359164237976, + "losses/total": 0.13820341229438782, + "ref_logps/chosen": -39.571189880371094, + "ref_logps/rejected": -46.58123779296875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.706775426864624, + "rewards/margins": 2.1861517429351807, + "rewards/rejected": -3.8929271697998047, + "step": 2440 + }, + { + "epoch": 2.31, + "grad_norm": 18.458139419555664, + "learning_rate": 1.2871633438265128e-07, + "logps/chosen": -56.450103759765625, + "logps/rejected": -84.12791442871094, + "loss": 0.1951, + "losses/dpo": 0.12744677066802979, + "losses/sft": 1.800889492034912, + "losses/total": 0.12744677066802979, + "ref_logps/chosen": -38.28672790527344, + "ref_logps/rejected": -47.266780853271484, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8163377046585083, + "rewards/margins": 1.8697752952575684, + "rewards/rejected": -3.686112880706787, + "step": 2441 + }, + { + "epoch": 2.31, + "grad_norm": 28.57596206665039, + "learning_rate": 1.285414480587618e-07, + "logps/chosen": -67.91671752929688, + "logps/rejected": -78.28376770019531, + "loss": 0.382, + "losses/dpo": 0.8102523684501648, + "losses/sft": 2.4306821823120117, + "losses/total": 0.8102523684501648, + "ref_logps/chosen": -42.26472473144531, + "ref_logps/rejected": -38.39464569091797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.565199851989746, + "rewards/margins": 1.4237124919891357, + "rewards/rejected": -3.988912582397461, + "step": 2442 + }, + { + "epoch": 2.31, + "grad_norm": 24.497249603271484, + "learning_rate": 1.283665617348723e-07, + "logps/chosen": -57.40618133544922, + "logps/rejected": -76.92585754394531, + "loss": 0.2653, + "losses/dpo": 0.3706018626689911, + "losses/sft": 2.0360872745513916, + "losses/total": 0.3706018626689911, + "ref_logps/chosen": -34.803768157958984, + "ref_logps/rejected": -35.438758850097656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2602412700653076, + "rewards/margins": 1.8884680271148682, + "rewards/rejected": -4.148709297180176, + "step": 2443 + }, + { + "epoch": 2.31, + "grad_norm": 23.49471092224121, + "learning_rate": 1.2819167541098285e-07, + "logps/chosen": -43.90263366699219, + "logps/rejected": -105.9566421508789, + "loss": 0.1804, + "losses/dpo": 0.03583944961428642, + "losses/sft": 1.913006067276001, + "losses/total": 0.03583944961428642, + "ref_logps/chosen": -26.37704849243164, + "ref_logps/rejected": -59.380035400390625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7525585889816284, + "rewards/margins": 2.905101776123047, + "rewards/rejected": -4.657660484313965, + "step": 2444 + }, + { + "epoch": 2.31, + "grad_norm": 19.033655166625977, + "learning_rate": 1.2801678908709339e-07, + "logps/chosen": -57.934593200683594, + "logps/rejected": -95.11457824707031, + "loss": 0.1616, + "losses/dpo": 0.3845723867416382, + "losses/sft": 2.000866651535034, + "losses/total": 0.3845723867416382, + "ref_logps/chosen": -35.448612213134766, + "ref_logps/rejected": -45.77138900756836, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.248598575592041, + "rewards/margins": 2.6857199668884277, + "rewards/rejected": -4.934318542480469, + "step": 2445 + }, + { + "epoch": 2.31, + "grad_norm": 21.187564849853516, + "learning_rate": 1.2784190276320393e-07, + "logps/chosen": -41.606624603271484, + "logps/rejected": -76.82884216308594, + "loss": 0.2662, + "losses/dpo": 0.26558205485343933, + "losses/sft": 1.6570028066635132, + "losses/total": 0.26558205485343933, + "ref_logps/chosen": -26.780683517456055, + "ref_logps/rejected": -36.45180892944336, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4825940132141113, + "rewards/margins": 2.5551092624664307, + "rewards/rejected": -4.037703514099121, + "step": 2446 + }, + { + "epoch": 2.31, + "grad_norm": 20.586835861206055, + "learning_rate": 1.2766701643931444e-07, + "logps/chosen": -52.53807830810547, + "logps/rejected": -95.30747985839844, + "loss": 0.178, + "losses/dpo": 0.025985386222600937, + "losses/sft": 1.456533670425415, + "losses/total": 0.025985386222600937, + "ref_logps/chosen": -37.67286682128906, + "ref_logps/rejected": -50.75, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4865206480026245, + "rewards/margins": 2.9692273139953613, + "rewards/rejected": -4.455747604370117, + "step": 2447 + }, + { + "epoch": 2.31, + "grad_norm": 20.420146942138672, + "learning_rate": 1.2749213011542498e-07, + "logps/chosen": -64.54875183105469, + "logps/rejected": -94.70415496826172, + "loss": 0.2212, + "losses/dpo": 0.1658926010131836, + "losses/sft": 1.9655648469924927, + "losses/total": 0.1658926010131836, + "ref_logps/chosen": -41.79730224609375, + "ref_logps/rejected": -47.0723991394043, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2751455307006836, + "rewards/margins": 2.488029718399048, + "rewards/rejected": -4.7631754875183105, + "step": 2448 + }, + { + "epoch": 2.31, + "grad_norm": 31.835233688354492, + "learning_rate": 1.273172437915355e-07, + "logps/chosen": -76.6187744140625, + "logps/rejected": -98.84121704101562, + "loss": 0.3508, + "losses/dpo": 0.2180909961462021, + "losses/sft": 2.0946733951568604, + "losses/total": 0.2180909961462021, + "ref_logps/chosen": -51.33055114746094, + "ref_logps/rejected": -54.965362548828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5288217067718506, + "rewards/margins": 1.8587641716003418, + "rewards/rejected": -4.387585639953613, + "step": 2449 + }, + { + "epoch": 2.31, + "grad_norm": 18.53763198852539, + "learning_rate": 1.27142357467646e-07, + "logps/chosen": -54.00996017456055, + "logps/rejected": -84.0213623046875, + "loss": 0.2441, + "losses/dpo": 0.48685282468795776, + "losses/sft": 2.1103055477142334, + "losses/total": 0.48685282468795776, + "ref_logps/chosen": -34.60890197753906, + "ref_logps/rejected": -45.56768798828125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.940105676651001, + "rewards/margins": 1.9052613973617554, + "rewards/rejected": -3.845366954803467, + "step": 2450 + }, + { + "epoch": 2.31, + "grad_norm": 27.337562561035156, + "learning_rate": 1.2696747114375654e-07, + "logps/chosen": -58.65935516357422, + "logps/rejected": -75.87437438964844, + "loss": 0.2933, + "losses/dpo": 0.14911752939224243, + "losses/sft": 1.7263939380645752, + "losses/total": 0.14911752939224243, + "ref_logps/chosen": -40.06737518310547, + "ref_logps/rejected": -37.62965393066406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8591978549957275, + "rewards/margins": 1.9652740955352783, + "rewards/rejected": -3.824471950531006, + "step": 2451 + }, + { + "epoch": 2.32, + "grad_norm": 20.54986000061035, + "learning_rate": 1.2679258481986708e-07, + "logps/chosen": -63.70112228393555, + "logps/rejected": -99.12324523925781, + "loss": 0.2124, + "losses/dpo": 0.25114142894744873, + "losses/sft": 1.7811933755874634, + "losses/total": 0.25114142894744873, + "ref_logps/chosen": -42.85333251953125, + "ref_logps/rejected": -56.14357376098633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0847787857055664, + "rewards/margins": 2.2131879329681396, + "rewards/rejected": -4.297966957092285, + "step": 2452 + }, + { + "epoch": 2.32, + "grad_norm": 25.26395034790039, + "learning_rate": 1.2661769849597762e-07, + "logps/chosen": -55.94580078125, + "logps/rejected": -68.66764831542969, + "loss": 0.3117, + "losses/dpo": 0.12438103556632996, + "losses/sft": 2.262509822845459, + "losses/total": 0.12438103556632996, + "ref_logps/chosen": -37.385475158691406, + "ref_logps/rejected": -33.213523864746094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8560330867767334, + "rewards/margins": 1.6893800497055054, + "rewards/rejected": -3.545413017272949, + "step": 2453 + }, + { + "epoch": 2.32, + "grad_norm": 20.834056854248047, + "learning_rate": 1.2644281217208813e-07, + "logps/chosen": -51.35419464111328, + "logps/rejected": -78.75160217285156, + "loss": 0.2838, + "losses/dpo": 0.26044735312461853, + "losses/sft": 2.5525283813476562, + "losses/total": 0.26044735312461853, + "ref_logps/chosen": -32.40174102783203, + "ref_logps/rejected": -41.621246337890625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8952455520629883, + "rewards/margins": 1.8177897930145264, + "rewards/rejected": -3.7130353450775146, + "step": 2454 + }, + { + "epoch": 2.32, + "grad_norm": 10.098645210266113, + "learning_rate": 1.2626792584819867e-07, + "logps/chosen": -54.063453674316406, + "logps/rejected": -97.94427490234375, + "loss": 0.0728, + "losses/dpo": 0.08777850866317749, + "losses/sft": 2.1690478324890137, + "losses/total": 0.08777850866317749, + "ref_logps/chosen": -35.80142593383789, + "ref_logps/rejected": -46.71717834472656, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8262027502059937, + "rewards/margins": 3.296506404876709, + "rewards/rejected": -5.122709274291992, + "step": 2455 + }, + { + "epoch": 2.32, + "grad_norm": 15.536396026611328, + "learning_rate": 1.2609303952430918e-07, + "logps/chosen": -56.88234329223633, + "logps/rejected": -98.4262466430664, + "loss": 0.1293, + "losses/dpo": 0.10162603110074997, + "losses/sft": 2.2636406421661377, + "losses/total": 0.10162603110074997, + "ref_logps/chosen": -39.09235382080078, + "ref_logps/rejected": -51.033958435058594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7789990901947021, + "rewards/margins": 2.9602303504943848, + "rewards/rejected": -4.739229202270508, + "step": 2456 + }, + { + "epoch": 2.32, + "grad_norm": 32.14033889770508, + "learning_rate": 1.259181532004197e-07, + "logps/chosen": -50.93202209472656, + "logps/rejected": -88.27909088134766, + "loss": 0.283, + "losses/dpo": 0.331728458404541, + "losses/sft": 2.056981325149536, + "losses/total": 0.331728458404541, + "ref_logps/chosen": -32.174049377441406, + "ref_logps/rejected": -43.49720764160156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8757970333099365, + "rewards/margins": 2.602391242980957, + "rewards/rejected": -4.478188514709473, + "step": 2457 + }, + { + "epoch": 2.32, + "grad_norm": 20.82699203491211, + "learning_rate": 1.2574326687653026e-07, + "logps/chosen": -44.46734619140625, + "logps/rejected": -68.64436340332031, + "loss": 0.2503, + "losses/dpo": 0.1542724221944809, + "losses/sft": 2.5364983081817627, + "losses/total": 0.1542724221944809, + "ref_logps/chosen": -27.635202407836914, + "ref_logps/rejected": -34.36957550048828, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6832144260406494, + "rewards/margins": 1.7442653179168701, + "rewards/rejected": -3.4274797439575195, + "step": 2458 + }, + { + "epoch": 2.32, + "grad_norm": 17.210628509521484, + "learning_rate": 1.2556838055264078e-07, + "logps/chosen": -56.43772888183594, + "logps/rejected": -88.28271484375, + "loss": 0.1769, + "losses/dpo": 0.3156144618988037, + "losses/sft": 2.474200487136841, + "losses/total": 0.3156144618988037, + "ref_logps/chosen": -31.542987823486328, + "ref_logps/rejected": -42.54148483276367, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.489474058151245, + "rewards/margins": 2.0846495628356934, + "rewards/rejected": -4.574123382568359, + "step": 2459 + }, + { + "epoch": 2.32, + "grad_norm": 28.804651260375977, + "learning_rate": 1.2539349422875131e-07, + "logps/chosen": -76.59201049804688, + "logps/rejected": -86.26632690429688, + "loss": 0.3618, + "losses/dpo": 0.4398012161254883, + "losses/sft": 2.768176317214966, + "losses/total": 0.4398012161254883, + "ref_logps/chosen": -51.46018981933594, + "ref_logps/rejected": -43.8438720703125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5131821632385254, + "rewards/margins": 1.7290639877319336, + "rewards/rejected": -4.242246150970459, + "step": 2460 + }, + { + "epoch": 2.32, + "grad_norm": 20.570898056030273, + "learning_rate": 1.2521860790486183e-07, + "logps/chosen": -60.46925735473633, + "logps/rejected": -95.93611145019531, + "loss": 0.2352, + "losses/dpo": 0.1121564507484436, + "losses/sft": 2.411712169647217, + "losses/total": 0.1121564507484436, + "ref_logps/chosen": -35.38029479980469, + "ref_logps/rejected": -45.97947311401367, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5088961124420166, + "rewards/margins": 2.4867677688598633, + "rewards/rejected": -4.995663642883301, + "step": 2461 + }, + { + "epoch": 2.32, + "grad_norm": 20.29084587097168, + "learning_rate": 1.2504372158097237e-07, + "logps/chosen": -60.67136001586914, + "logps/rejected": -100.66184997558594, + "loss": 0.1832, + "losses/dpo": 0.059751611202955246, + "losses/sft": 1.9690364599227905, + "losses/total": 0.059751611202955246, + "ref_logps/chosen": -37.356910705566406, + "ref_logps/rejected": -52.58418273925781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3314452171325684, + "rewards/margins": 2.476321220397949, + "rewards/rejected": -4.807766914367676, + "step": 2462 + }, + { + "epoch": 2.33, + "grad_norm": 23.233488082885742, + "learning_rate": 1.2486883525708288e-07, + "logps/chosen": -58.75841522216797, + "logps/rejected": -90.21353912353516, + "loss": 0.2688, + "losses/dpo": 0.18277209997177124, + "losses/sft": 2.602585792541504, + "losses/total": 0.18277209997177124, + "ref_logps/chosen": -39.41297912597656, + "ref_logps/rejected": -47.370506286621094, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9345438480377197, + "rewards/margins": 2.349759578704834, + "rewards/rejected": -4.284303665161133, + "step": 2463 + }, + { + "epoch": 2.33, + "grad_norm": 46.90460968017578, + "learning_rate": 1.2469394893319342e-07, + "logps/chosen": -62.881072998046875, + "logps/rejected": -76.18502807617188, + "loss": 0.6715, + "losses/dpo": 0.14692619442939758, + "losses/sft": 1.9703174829483032, + "losses/total": 0.14692619442939758, + "ref_logps/chosen": -38.223548889160156, + "ref_logps/rejected": -42.19207763671875, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4657530784606934, + "rewards/margins": 0.933542013168335, + "rewards/rejected": -3.399294853210449, + "step": 2464 + }, + { + "epoch": 2.33, + "grad_norm": 25.74579620361328, + "learning_rate": 1.2451906260930396e-07, + "logps/chosen": -57.81740951538086, + "logps/rejected": -80.21636199951172, + "loss": 0.3415, + "losses/dpo": 0.6382368803024292, + "losses/sft": 2.155486583709717, + "losses/total": 0.6382368803024292, + "ref_logps/chosen": -36.00206756591797, + "ref_logps/rejected": -42.33823013305664, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1815338134765625, + "rewards/margins": 1.6062794923782349, + "rewards/rejected": -3.787813186645508, + "step": 2465 + }, + { + "epoch": 2.33, + "grad_norm": 26.63974380493164, + "learning_rate": 1.2434417628541447e-07, + "logps/chosen": -63.47370910644531, + "logps/rejected": -84.77128601074219, + "loss": 0.3019, + "losses/dpo": 0.5920904874801636, + "losses/sft": 2.2707266807556152, + "losses/total": 0.5920904874801636, + "ref_logps/chosen": -44.79814147949219, + "ref_logps/rejected": -45.82119369506836, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8675568103790283, + "rewards/margins": 2.0274524688720703, + "rewards/rejected": -3.8950095176696777, + "step": 2466 + }, + { + "epoch": 2.33, + "grad_norm": 18.14527702331543, + "learning_rate": 1.24169289961525e-07, + "logps/chosen": -35.8369026184082, + "logps/rejected": -73.15737915039062, + "loss": 0.2277, + "losses/dpo": 0.23918096721172333, + "losses/sft": 1.267582654953003, + "losses/total": 0.23918096721172333, + "ref_logps/chosen": -25.805511474609375, + "ref_logps/rejected": -39.50239944458008, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0031393766403198, + "rewards/margins": 2.362358570098877, + "rewards/rejected": -3.3654980659484863, + "step": 2467 + }, + { + "epoch": 2.33, + "grad_norm": 10.962759017944336, + "learning_rate": 1.2399440363763552e-07, + "logps/chosen": -58.845863342285156, + "logps/rejected": -109.3837890625, + "loss": 0.1015, + "losses/dpo": 0.18106605112552643, + "losses/sft": 1.651471734046936, + "losses/total": 0.18106605112552643, + "ref_logps/chosen": -42.6986083984375, + "ref_logps/rejected": -62.83644485473633, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6147253513336182, + "rewards/margins": 3.040008544921875, + "rewards/rejected": -4.654733657836914, + "step": 2468 + }, + { + "epoch": 2.33, + "grad_norm": 37.31141662597656, + "learning_rate": 1.2381951731374606e-07, + "logps/chosen": -52.12660598754883, + "logps/rejected": -74.52386474609375, + "loss": 0.4707, + "losses/dpo": 0.13884441554546356, + "losses/sft": 2.2521181106567383, + "losses/total": 0.13884441554546356, + "ref_logps/chosen": -32.10844802856445, + "ref_logps/rejected": -39.178443908691406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0018160343170166, + "rewards/margins": 1.5327260494232178, + "rewards/rejected": -3.5345423221588135, + "step": 2469 + }, + { + "epoch": 2.33, + "grad_norm": 23.709136962890625, + "learning_rate": 1.2364463098985657e-07, + "logps/chosen": -57.718055725097656, + "logps/rejected": -94.78102111816406, + "loss": 0.1607, + "losses/dpo": 0.23916247487068176, + "losses/sft": 1.9120935201644897, + "losses/total": 0.23916247487068176, + "ref_logps/chosen": -38.44097137451172, + "ref_logps/rejected": -45.318538665771484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9277087450027466, + "rewards/margins": 3.018540382385254, + "rewards/rejected": -4.946249008178711, + "step": 2470 + }, + { + "epoch": 2.33, + "grad_norm": 22.588701248168945, + "learning_rate": 1.234697446659671e-07, + "logps/chosen": -63.049468994140625, + "logps/rejected": -91.10270690917969, + "loss": 0.2421, + "losses/dpo": 0.20788276195526123, + "losses/sft": 2.512965679168701, + "losses/total": 0.20788276195526123, + "ref_logps/chosen": -41.786827087402344, + "ref_logps/rejected": -46.390743255615234, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1262640953063965, + "rewards/margins": 2.3449323177337646, + "rewards/rejected": -4.47119665145874, + "step": 2471 + }, + { + "epoch": 2.33, + "grad_norm": 29.13214111328125, + "learning_rate": 1.2329485834207765e-07, + "logps/chosen": -48.514976501464844, + "logps/rejected": -75.62982177734375, + "loss": 0.4032, + "losses/dpo": 0.09179812669754028, + "losses/sft": 1.7202165126800537, + "losses/total": 0.09179812669754028, + "ref_logps/chosen": -28.74802589416504, + "ref_logps/rejected": -34.98958206176758, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9766952991485596, + "rewards/margins": 2.0873286724090576, + "rewards/rejected": -4.064023971557617, + "step": 2472 + }, + { + "epoch": 2.34, + "grad_norm": 33.538761138916016, + "learning_rate": 1.2311997201818816e-07, + "logps/chosen": -57.00736999511719, + "logps/rejected": -96.07063293457031, + "loss": 0.3945, + "losses/dpo": 0.2000495195388794, + "losses/sft": 2.2454514503479004, + "losses/total": 0.2000495195388794, + "ref_logps/chosen": -33.50537872314453, + "ref_logps/rejected": -52.868507385253906, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.350198984146118, + "rewards/margins": 1.9700140953063965, + "rewards/rejected": -4.320213794708252, + "step": 2473 + }, + { + "epoch": 2.34, + "grad_norm": 17.343101501464844, + "learning_rate": 1.229450856942987e-07, + "logps/chosen": -57.00798034667969, + "logps/rejected": -80.87181091308594, + "loss": 0.1811, + "losses/dpo": 0.1261172890663147, + "losses/sft": 2.116879940032959, + "losses/total": 0.1261172890663147, + "ref_logps/chosen": -40.87028503417969, + "ref_logps/rejected": -41.24085235595703, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.613769292831421, + "rewards/margins": 2.3493261337280273, + "rewards/rejected": -3.963095188140869, + "step": 2474 + }, + { + "epoch": 2.34, + "grad_norm": 25.425739288330078, + "learning_rate": 1.2277019937040922e-07, + "logps/chosen": -55.171783447265625, + "logps/rejected": -92.42965698242188, + "loss": 0.1827, + "losses/dpo": 0.08154292404651642, + "losses/sft": 1.7210462093353271, + "losses/total": 0.08154292404651642, + "ref_logps/chosen": -36.099735260009766, + "ref_logps/rejected": -47.89049530029297, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9072051048278809, + "rewards/margins": 2.5467114448547363, + "rewards/rejected": -4.453916549682617, + "step": 2475 + }, + { + "epoch": 2.34, + "grad_norm": 24.284379959106445, + "learning_rate": 1.2259531304651976e-07, + "logps/chosen": -51.57073974609375, + "logps/rejected": -72.46035766601562, + "loss": 0.246, + "losses/dpo": 0.1052037850022316, + "losses/sft": 2.075737714767456, + "losses/total": 0.1052037850022316, + "ref_logps/chosen": -37.08552932739258, + "ref_logps/rejected": -37.51117706298828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4485211372375488, + "rewards/margins": 2.0463967323303223, + "rewards/rejected": -3.494917869567871, + "step": 2476 + }, + { + "epoch": 2.34, + "grad_norm": 23.2425479888916, + "learning_rate": 1.224204267226303e-07, + "logps/chosen": -46.36212158203125, + "logps/rejected": -75.45295715332031, + "loss": 0.2452, + "losses/dpo": 0.4087217152118683, + "losses/sft": 2.0530078411102295, + "losses/total": 0.4087217152118683, + "ref_logps/chosen": -27.779163360595703, + "ref_logps/rejected": -38.845252990722656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8582955598831177, + "rewards/margins": 1.802475094795227, + "rewards/rejected": -3.6607704162597656, + "step": 2477 + }, + { + "epoch": 2.34, + "grad_norm": 26.23789405822754, + "learning_rate": 1.222455403987408e-07, + "logps/chosen": -60.44539260864258, + "logps/rejected": -90.34058380126953, + "loss": 0.2432, + "losses/dpo": 0.21942941844463348, + "losses/sft": 1.828418254852295, + "losses/total": 0.21942941844463348, + "ref_logps/chosen": -38.12716293334961, + "ref_logps/rejected": -46.1883544921875, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.231823444366455, + "rewards/margins": 2.1833996772766113, + "rewards/rejected": -4.415222644805908, + "step": 2478 + }, + { + "epoch": 2.34, + "grad_norm": 31.031585693359375, + "learning_rate": 1.2207065407485135e-07, + "logps/chosen": -59.69524002075195, + "logps/rejected": -94.47356414794922, + "loss": 0.2372, + "losses/dpo": 0.3599066436290741, + "losses/sft": 1.6405057907104492, + "losses/total": 0.3599066436290741, + "ref_logps/chosen": -36.046424865722656, + "ref_logps/rejected": -46.17823791503906, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3648810386657715, + "rewards/margins": 2.464651107788086, + "rewards/rejected": -4.829532623291016, + "step": 2479 + }, + { + "epoch": 2.34, + "grad_norm": 23.124414443969727, + "learning_rate": 1.2189576775096189e-07, + "logps/chosen": -52.421485900878906, + "logps/rejected": -87.51934051513672, + "loss": 0.2083, + "losses/dpo": 0.15195035934448242, + "losses/sft": 2.460550308227539, + "losses/total": 0.15195035934448242, + "ref_logps/chosen": -36.099639892578125, + "ref_logps/rejected": -41.90242385864258, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6321848630905151, + "rewards/margins": 2.92950701713562, + "rewards/rejected": -4.561691761016846, + "step": 2480 + }, + { + "epoch": 2.34, + "grad_norm": 24.015825271606445, + "learning_rate": 1.217208814270724e-07, + "logps/chosen": -51.32763671875, + "logps/rejected": -71.25943756103516, + "loss": 0.2565, + "losses/dpo": 0.23391401767730713, + "losses/sft": 2.285999059677124, + "losses/total": 0.23391401767730713, + "ref_logps/chosen": -33.14604949951172, + "ref_logps/rejected": -38.33258056640625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8181586265563965, + "rewards/margins": 1.4745268821716309, + "rewards/rejected": -3.2926855087280273, + "step": 2481 + }, + { + "epoch": 2.34, + "grad_norm": 25.586483001708984, + "learning_rate": 1.215459951031829e-07, + "logps/chosen": -53.34920883178711, + "logps/rejected": -75.64665222167969, + "loss": 0.2474, + "losses/dpo": 0.4022758901119232, + "losses/sft": 1.703517198562622, + "losses/total": 0.4022758901119232, + "ref_logps/chosen": -32.40521240234375, + "ref_logps/rejected": -35.93869400024414, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.094399929046631, + "rewards/margins": 1.8763959407806396, + "rewards/rejected": -3.9707958698272705, + "step": 2482 + }, + { + "epoch": 2.34, + "grad_norm": 17.13437843322754, + "learning_rate": 1.2137110877929345e-07, + "logps/chosen": -56.20327377319336, + "logps/rejected": -77.69754791259766, + "loss": 0.1637, + "losses/dpo": 0.13188134133815765, + "losses/sft": 1.2916014194488525, + "losses/total": 0.13188134133815765, + "ref_logps/chosen": -35.04595184326172, + "ref_logps/rejected": -34.603843688964844, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.115732431411743, + "rewards/margins": 2.1936378479003906, + "rewards/rejected": -4.309370040893555, + "step": 2483 + }, + { + "epoch": 2.35, + "grad_norm": 28.694690704345703, + "learning_rate": 1.21196222455404e-07, + "logps/chosen": -55.59763717651367, + "logps/rejected": -94.02726745605469, + "loss": 0.1727, + "losses/dpo": 0.08780666440725327, + "losses/sft": 2.1725800037384033, + "losses/total": 0.08780666440725327, + "ref_logps/chosen": -33.64509582519531, + "ref_logps/rejected": -46.372535705566406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.195254325866699, + "rewards/margins": 2.570219039916992, + "rewards/rejected": -4.765473365783691, + "step": 2484 + }, + { + "epoch": 2.35, + "grad_norm": 32.29655075073242, + "learning_rate": 1.210213361315145e-07, + "logps/chosen": -53.50304412841797, + "logps/rejected": -76.4126205444336, + "loss": 0.4029, + "losses/dpo": 0.2864738702774048, + "losses/sft": 2.3736648559570312, + "losses/total": 0.2864738702774048, + "ref_logps/chosen": -33.7008056640625, + "ref_logps/rejected": -41.582855224609375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9802236557006836, + "rewards/margins": 1.502752423286438, + "rewards/rejected": -3.482976198196411, + "step": 2485 + }, + { + "epoch": 2.35, + "grad_norm": 10.374073028564453, + "learning_rate": 1.2084644980762504e-07, + "logps/chosen": -51.38214874267578, + "logps/rejected": -111.7244873046875, + "loss": 0.0784, + "losses/dpo": 0.04466250538825989, + "losses/sft": 2.186174154281616, + "losses/total": 0.04466250538825989, + "ref_logps/chosen": -35.338314056396484, + "ref_logps/rejected": -59.774009704589844, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6043837070465088, + "rewards/margins": 3.5906643867492676, + "rewards/rejected": -5.195047855377197, + "step": 2486 + }, + { + "epoch": 2.35, + "grad_norm": 16.83829116821289, + "learning_rate": 1.2067156348373558e-07, + "logps/chosen": -48.547996520996094, + "logps/rejected": -87.23783111572266, + "loss": 0.1686, + "losses/dpo": 0.15708595514297485, + "losses/sft": 1.4696367979049683, + "losses/total": 0.15708595514297485, + "ref_logps/chosen": -27.32805061340332, + "ref_logps/rejected": -43.719696044921875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.121994972229004, + "rewards/margins": 2.2298190593719482, + "rewards/rejected": -4.351813793182373, + "step": 2487 + }, + { + "epoch": 2.35, + "grad_norm": 24.458301544189453, + "learning_rate": 1.204966771598461e-07, + "logps/chosen": -63.58038330078125, + "logps/rejected": -87.37181091308594, + "loss": 0.2324, + "losses/dpo": 0.24954506754875183, + "losses/sft": 2.0932254791259766, + "losses/total": 0.24954506754875183, + "ref_logps/chosen": -42.60572052001953, + "ref_logps/rejected": -40.25506591796875, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.097466468811035, + "rewards/margins": 2.6142077445983887, + "rewards/rejected": -4.711674690246582, + "step": 2488 + }, + { + "epoch": 2.35, + "grad_norm": 21.959930419921875, + "learning_rate": 1.203217908359566e-07, + "logps/chosen": -64.03722381591797, + "logps/rejected": -78.38963317871094, + "loss": 0.2408, + "losses/dpo": 0.4552140235900879, + "losses/sft": 2.167886734008789, + "losses/total": 0.4552140235900879, + "ref_logps/chosen": -45.30060577392578, + "ref_logps/rejected": -40.939735412597656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8736615180969238, + "rewards/margins": 1.871328353881836, + "rewards/rejected": -3.7449898719787598, + "step": 2489 + }, + { + "epoch": 2.35, + "grad_norm": 23.76074981689453, + "learning_rate": 1.2014690451206714e-07, + "logps/chosen": -54.80952835083008, + "logps/rejected": -82.96211242675781, + "loss": 0.2265, + "losses/dpo": 0.3063853085041046, + "losses/sft": 2.179708480834961, + "losses/total": 0.3063853085041046, + "ref_logps/chosen": -34.65270233154297, + "ref_logps/rejected": -41.05809783935547, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0156826972961426, + "rewards/margins": 2.1747193336486816, + "rewards/rejected": -4.190402030944824, + "step": 2490 + }, + { + "epoch": 2.35, + "grad_norm": 17.532278060913086, + "learning_rate": 1.1997201818817768e-07, + "logps/chosen": -65.1349105834961, + "logps/rejected": -83.5277099609375, + "loss": 0.2153, + "losses/dpo": 0.08171360939741135, + "losses/sft": 1.9716815948486328, + "losses/total": 0.08171360939741135, + "ref_logps/chosen": -44.480445861816406, + "ref_logps/rejected": -41.43211364746094, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0654468536376953, + "rewards/margins": 2.1441125869750977, + "rewards/rejected": -4.209559440612793, + "step": 2491 + }, + { + "epoch": 2.35, + "grad_norm": 25.128236770629883, + "learning_rate": 1.197971318642882e-07, + "logps/chosen": -66.88798522949219, + "logps/rejected": -73.8001708984375, + "loss": 0.2689, + "losses/dpo": 0.44392144680023193, + "losses/sft": 2.020564317703247, + "losses/total": 0.44392144680023193, + "ref_logps/chosen": -41.911354064941406, + "ref_logps/rejected": -32.91697692871094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4976630210876465, + "rewards/margins": 1.5906562805175781, + "rewards/rejected": -4.088319301605225, + "step": 2492 + }, + { + "epoch": 2.35, + "grad_norm": 15.019405364990234, + "learning_rate": 1.1962224554039873e-07, + "logps/chosen": -52.15613555908203, + "logps/rejected": -87.82102966308594, + "loss": 0.144, + "losses/dpo": 0.09639427065849304, + "losses/sft": 2.326068878173828, + "losses/total": 0.09639427065849304, + "ref_logps/chosen": -34.46449279785156, + "ref_logps/rejected": -47.426231384277344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7691645622253418, + "rewards/margins": 2.270314931869507, + "rewards/rejected": -4.0394792556762695, + "step": 2493 + }, + { + "epoch": 2.36, + "grad_norm": 27.142698287963867, + "learning_rate": 1.1944735921650927e-07, + "logps/chosen": -49.20192337036133, + "logps/rejected": -75.52554321289062, + "loss": 0.2958, + "losses/dpo": 0.09309644997119904, + "losses/sft": 1.8377737998962402, + "losses/total": 0.09309644997119904, + "ref_logps/chosen": -29.45509910583496, + "ref_logps/rejected": -36.688255310058594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.974682331085205, + "rewards/margins": 1.9090458154678345, + "rewards/rejected": -3.883728265762329, + "step": 2494 + }, + { + "epoch": 2.36, + "grad_norm": 14.402860641479492, + "learning_rate": 1.1927247289261979e-07, + "logps/chosen": -49.971622467041016, + "logps/rejected": -88.38668823242188, + "loss": 0.1399, + "losses/dpo": 0.10670387744903564, + "losses/sft": 2.0274529457092285, + "losses/total": 0.10670387744903564, + "ref_logps/chosen": -33.17326354980469, + "ref_logps/rejected": -46.11180114746094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6798359155654907, + "rewards/margins": 2.54765248298645, + "rewards/rejected": -4.2274885177612305, + "step": 2495 + }, + { + "epoch": 2.36, + "grad_norm": 17.647789001464844, + "learning_rate": 1.1909758656873033e-07, + "logps/chosen": -61.177093505859375, + "logps/rejected": -96.60371398925781, + "loss": 0.167, + "losses/dpo": 0.1981060802936554, + "losses/sft": 2.3658909797668457, + "losses/total": 0.1981060802936554, + "ref_logps/chosen": -37.59406661987305, + "ref_logps/rejected": -48.2691650390625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3583028316497803, + "rewards/margins": 2.475151777267456, + "rewards/rejected": -4.833454608917236, + "step": 2496 + }, + { + "epoch": 2.36, + "grad_norm": 39.10746765136719, + "learning_rate": 1.1892270024484084e-07, + "logps/chosen": -65.44963073730469, + "logps/rejected": -86.73764038085938, + "loss": 0.3805, + "losses/dpo": 0.4502178430557251, + "losses/sft": 2.1416373252868652, + "losses/total": 0.4502178430557251, + "ref_logps/chosen": -40.37435531616211, + "ref_logps/rejected": -44.7657356262207, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5075275897979736, + "rewards/margins": 1.6896629333496094, + "rewards/rejected": -4.197190284729004, + "step": 2497 + }, + { + "epoch": 2.36, + "grad_norm": 18.289409637451172, + "learning_rate": 1.1874781392095138e-07, + "logps/chosen": -48.01121520996094, + "logps/rejected": -77.09693908691406, + "loss": 0.1928, + "losses/dpo": 0.1729419082403183, + "losses/sft": 2.5575506687164307, + "losses/total": 0.1729419082403183, + "ref_logps/chosen": -31.58893585205078, + "ref_logps/rejected": -40.06235885620117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.642228364944458, + "rewards/margins": 2.061229705810547, + "rewards/rejected": -3.703458070755005, + "step": 2498 + }, + { + "epoch": 2.36, + "grad_norm": 34.7858772277832, + "learning_rate": 1.185729275970619e-07, + "logps/chosen": -54.143707275390625, + "logps/rejected": -83.10661315917969, + "loss": 0.396, + "losses/dpo": 0.031737472862005234, + "losses/sft": 2.1121714115142822, + "losses/total": 0.031737472862005234, + "ref_logps/chosen": -33.59892654418945, + "ref_logps/rejected": -42.253868103027344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.054478168487549, + "rewards/margins": 2.030796527862549, + "rewards/rejected": -4.085274696350098, + "step": 2499 + }, + { + "epoch": 2.36, + "grad_norm": 32.00473403930664, + "learning_rate": 1.1839804127317243e-07, + "logps/chosen": -60.17320251464844, + "logps/rejected": -80.67930603027344, + "loss": 0.3692, + "losses/dpo": 0.31357210874557495, + "losses/sft": 2.699254274368286, + "losses/total": 0.31357210874557495, + "ref_logps/chosen": -37.89751434326172, + "ref_logps/rejected": -40.754493713378906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2275688648223877, + "rewards/margins": 1.7649123668670654, + "rewards/rejected": -3.992481231689453, + "step": 2500 + }, + { + "epoch": 2.36, + "grad_norm": 18.617389678955078, + "learning_rate": 1.1822315494928297e-07, + "logps/chosen": -59.37726593017578, + "logps/rejected": -73.59318542480469, + "loss": 0.2353, + "losses/dpo": 0.34349995851516724, + "losses/sft": 1.656891107559204, + "losses/total": 0.34349995851516724, + "ref_logps/chosen": -43.758975982666016, + "ref_logps/rejected": -36.68318176269531, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5618293285369873, + "rewards/margins": 2.1291706562042236, + "rewards/rejected": -3.69100022315979, + "step": 2501 + }, + { + "epoch": 2.36, + "grad_norm": 33.56885528564453, + "learning_rate": 1.180482686253935e-07, + "logps/chosen": -54.31251525878906, + "logps/rejected": -69.59628295898438, + "loss": 0.3615, + "losses/dpo": 0.1370834857225418, + "losses/sft": 1.7437453269958496, + "losses/total": 0.1370834857225418, + "ref_logps/chosen": -35.061424255371094, + "ref_logps/rejected": -36.689937591552734, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.925109624862671, + "rewards/margins": 1.3655250072479248, + "rewards/rejected": -3.2906343936920166, + "step": 2502 + }, + { + "epoch": 2.36, + "grad_norm": 22.351438522338867, + "learning_rate": 1.1787338230150402e-07, + "logps/chosen": -64.93675231933594, + "logps/rejected": -88.02465057373047, + "loss": 0.2347, + "losses/dpo": 0.2633363604545593, + "losses/sft": 2.7023799419403076, + "losses/total": 0.2633363604545593, + "ref_logps/chosen": -41.82102966308594, + "ref_logps/rejected": -45.46723175048828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.311573028564453, + "rewards/margins": 1.944169044494629, + "rewards/rejected": -4.25574254989624, + "step": 2503 + }, + { + "epoch": 2.36, + "grad_norm": 17.059335708618164, + "learning_rate": 1.1769849597761455e-07, + "logps/chosen": -58.19172668457031, + "logps/rejected": -88.29953002929688, + "loss": 0.152, + "losses/dpo": 0.15331706404685974, + "losses/sft": 1.6098928451538086, + "losses/total": 0.15331706404685974, + "ref_logps/chosen": -39.69989776611328, + "ref_logps/rejected": -43.158870697021484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.849183201789856, + "rewards/margins": 2.6648826599121094, + "rewards/rejected": -4.514065742492676, + "step": 2504 + }, + { + "epoch": 2.37, + "grad_norm": 10.571551322937012, + "learning_rate": 1.1752360965372507e-07, + "logps/chosen": -47.77638626098633, + "logps/rejected": -90.98786163330078, + "loss": 0.1173, + "losses/dpo": 0.07016374170780182, + "losses/sft": 1.7941679954528809, + "losses/total": 0.07016374170780182, + "ref_logps/chosen": -31.9508113861084, + "ref_logps/rejected": -45.91461181640625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5825573205947876, + "rewards/margins": 2.92476749420166, + "rewards/rejected": -4.507324695587158, + "step": 2505 + }, + { + "epoch": 2.37, + "grad_norm": 13.991881370544434, + "learning_rate": 1.173487233298356e-07, + "logps/chosen": -51.79463195800781, + "logps/rejected": -89.44017028808594, + "loss": 0.1979, + "losses/dpo": 0.3450295627117157, + "losses/sft": 1.1797661781311035, + "losses/total": 0.3450295627117157, + "ref_logps/chosen": -32.96784210205078, + "ref_logps/rejected": -46.98945236206055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.882678747177124, + "rewards/margins": 2.3623924255371094, + "rewards/rejected": -4.2450714111328125, + "step": 2506 + }, + { + "epoch": 2.37, + "grad_norm": 36.70252990722656, + "learning_rate": 1.1717383700594612e-07, + "logps/chosen": -58.41950225830078, + "logps/rejected": -90.22645568847656, + "loss": 0.275, + "losses/dpo": 0.510208785533905, + "losses/sft": 2.7526657581329346, + "losses/total": 0.510208785533905, + "ref_logps/chosen": -35.70545196533203, + "ref_logps/rejected": -42.26300048828125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2714052200317383, + "rewards/margins": 2.524940013885498, + "rewards/rejected": -4.7963457107543945, + "step": 2507 + }, + { + "epoch": 2.37, + "grad_norm": 16.828462600708008, + "learning_rate": 1.1699895068205666e-07, + "logps/chosen": -65.06836700439453, + "logps/rejected": -99.44084167480469, + "loss": 0.1796, + "losses/dpo": 0.18887639045715332, + "losses/sft": 1.9328739643096924, + "losses/total": 0.18887639045715332, + "ref_logps/chosen": -41.1427001953125, + "ref_logps/rejected": -54.88560485839844, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.392566442489624, + "rewards/margins": 2.062957286834717, + "rewards/rejected": -4.455523490905762, + "step": 2508 + }, + { + "epoch": 2.37, + "grad_norm": 29.831985473632812, + "learning_rate": 1.1682406435816719e-07, + "logps/chosen": -65.11944580078125, + "logps/rejected": -83.46836853027344, + "loss": 0.3405, + "losses/dpo": 0.06395786255598068, + "losses/sft": 1.8726327419281006, + "losses/total": 0.06395786255598068, + "ref_logps/chosen": -42.787269592285156, + "ref_logps/rejected": -41.564571380615234, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.233217477798462, + "rewards/margins": 1.9571622610092163, + "rewards/rejected": -4.190380096435547, + "step": 2509 + }, + { + "epoch": 2.37, + "grad_norm": 19.849464416503906, + "learning_rate": 1.1664917803427771e-07, + "logps/chosen": -48.45545196533203, + "logps/rejected": -65.52117919921875, + "loss": 0.2649, + "losses/dpo": 0.23613230884075165, + "losses/sft": 2.165217399597168, + "losses/total": 0.23613230884075165, + "ref_logps/chosen": -31.89284896850586, + "ref_logps/rejected": -30.07781219482422, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6562602519989014, + "rewards/margins": 1.8880764245986938, + "rewards/rejected": -3.5443365573883057, + "step": 2510 + }, + { + "epoch": 2.37, + "grad_norm": 31.271276473999023, + "learning_rate": 1.1647429171038824e-07, + "logps/chosen": -60.59358596801758, + "logps/rejected": -74.07872772216797, + "loss": 0.3818, + "losses/dpo": 0.19057197868824005, + "losses/sft": 1.6542092561721802, + "losses/total": 0.19057197868824005, + "ref_logps/chosen": -39.452354431152344, + "ref_logps/rejected": -34.966094970703125, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1141233444213867, + "rewards/margins": 1.7971405982971191, + "rewards/rejected": -3.9112634658813477, + "step": 2511 + }, + { + "epoch": 2.37, + "grad_norm": 29.318376541137695, + "learning_rate": 1.1629940538649877e-07, + "logps/chosen": -68.43071746826172, + "logps/rejected": -93.36837768554688, + "loss": 0.2093, + "losses/dpo": 0.2968805730342865, + "losses/sft": 2.332024097442627, + "losses/total": 0.2968805730342865, + "ref_logps/chosen": -46.62525939941406, + "ref_logps/rejected": -48.09784698486328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.180546283721924, + "rewards/margins": 2.3465075492858887, + "rewards/rejected": -4.5270538330078125, + "step": 2512 + }, + { + "epoch": 2.37, + "grad_norm": 25.011404037475586, + "learning_rate": 1.1612451906260929e-07, + "logps/chosen": -78.1155014038086, + "logps/rejected": -82.64877319335938, + "loss": 0.2416, + "losses/dpo": 0.14893746376037598, + "losses/sft": 2.395493984222412, + "losses/total": 0.14893746376037598, + "ref_logps/chosen": -56.980194091796875, + "ref_logps/rejected": -40.27091979980469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1135311126708984, + "rewards/margins": 2.124253988265991, + "rewards/rejected": -4.237785339355469, + "step": 2513 + }, + { + "epoch": 2.37, + "grad_norm": 29.251052856445312, + "learning_rate": 1.1594963273871983e-07, + "logps/chosen": -70.18621063232422, + "logps/rejected": -106.8014907836914, + "loss": 0.292, + "losses/dpo": 0.35300523042678833, + "losses/sft": 2.156134843826294, + "losses/total": 0.35300523042678833, + "ref_logps/chosen": -48.75346374511719, + "ref_logps/rejected": -60.8981819152832, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1432740688323975, + "rewards/margins": 2.447057008743286, + "rewards/rejected": -4.590331077575684, + "step": 2514 + }, + { + "epoch": 2.37, + "grad_norm": 22.282623291015625, + "learning_rate": 1.1577474641483036e-07, + "logps/chosen": -55.873451232910156, + "logps/rejected": -77.88690185546875, + "loss": 0.292, + "losses/dpo": 0.42617663741111755, + "losses/sft": 1.9727500677108765, + "losses/total": 0.42617663741111755, + "ref_logps/chosen": -38.03065490722656, + "ref_logps/rejected": -36.25072479248047, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7842793464660645, + "rewards/margins": 2.379338026046753, + "rewards/rejected": -4.163617134094238, + "step": 2515 + }, + { + "epoch": 2.38, + "grad_norm": 19.478694915771484, + "learning_rate": 1.1559986009094088e-07, + "logps/chosen": -46.964141845703125, + "logps/rejected": -89.1031494140625, + "loss": 0.1336, + "losses/dpo": 0.23474769294261932, + "losses/sft": 2.006344795227051, + "losses/total": 0.23474769294261932, + "ref_logps/chosen": -28.179340362548828, + "ref_logps/rejected": -41.634239196777344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8784804344177246, + "rewards/margins": 2.868411064147949, + "rewards/rejected": -4.746891498565674, + "step": 2516 + }, + { + "epoch": 2.38, + "grad_norm": 16.617446899414062, + "learning_rate": 1.1542497376705142e-07, + "logps/chosen": -40.45554733276367, + "logps/rejected": -87.22796630859375, + "loss": 0.1389, + "losses/dpo": 0.10199884325265884, + "losses/sft": 0.8668092489242554, + "losses/total": 0.10199884325265884, + "ref_logps/chosen": -27.659257888793945, + "ref_logps/rejected": -44.386199951171875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2796292304992676, + "rewards/margins": 3.004546642303467, + "rewards/rejected": -4.284175872802734, + "step": 2517 + }, + { + "epoch": 2.38, + "grad_norm": 25.946596145629883, + "learning_rate": 1.1525008744316193e-07, + "logps/chosen": -58.72694396972656, + "logps/rejected": -87.80030822753906, + "loss": 0.1976, + "losses/dpo": 0.2447998821735382, + "losses/sft": 2.1436166763305664, + "losses/total": 0.2447998821735382, + "ref_logps/chosen": -40.694984436035156, + "ref_logps/rejected": -45.388946533203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8031957149505615, + "rewards/margins": 2.4379405975341797, + "rewards/rejected": -4.24113655090332, + "step": 2518 + }, + { + "epoch": 2.38, + "grad_norm": 28.655118942260742, + "learning_rate": 1.1507520111927246e-07, + "logps/chosen": -71.76898193359375, + "logps/rejected": -90.77214050292969, + "loss": 0.2743, + "losses/dpo": 0.09876863658428192, + "losses/sft": 1.8171237707138062, + "losses/total": 0.09876863658428192, + "ref_logps/chosen": -46.837554931640625, + "ref_logps/rejected": -47.927669525146484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4931435585021973, + "rewards/margins": 1.7913038730621338, + "rewards/rejected": -4.284447193145752, + "step": 2519 + }, + { + "epoch": 2.38, + "grad_norm": 18.40703773498535, + "learning_rate": 1.14900314795383e-07, + "logps/chosen": -69.4997329711914, + "logps/rejected": -88.21411895751953, + "loss": 0.2095, + "losses/dpo": 0.06394433975219727, + "losses/sft": 1.9794204235076904, + "losses/total": 0.06394433975219727, + "ref_logps/chosen": -46.442657470703125, + "ref_logps/rejected": -41.188011169433594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3057076930999756, + "rewards/margins": 2.3969035148620605, + "rewards/rejected": -4.702611446380615, + "step": 2520 + }, + { + "epoch": 2.38, + "grad_norm": 44.416439056396484, + "learning_rate": 1.1472542847149353e-07, + "logps/chosen": -72.70677947998047, + "logps/rejected": -84.92475891113281, + "loss": 0.4243, + "losses/dpo": 0.20675459504127502, + "losses/sft": 1.9799474477767944, + "losses/total": 0.20675459504127502, + "ref_logps/chosen": -47.42877960205078, + "ref_logps/rejected": -46.17876052856445, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5278000831604004, + "rewards/margins": 1.3468000888824463, + "rewards/rejected": -3.874600410461426, + "step": 2521 + }, + { + "epoch": 2.38, + "grad_norm": 35.40708923339844, + "learning_rate": 1.1455054214760405e-07, + "logps/chosen": -57.54240417480469, + "logps/rejected": -79.65693664550781, + "loss": 0.2964, + "losses/dpo": 0.25444450974464417, + "losses/sft": 2.3195719718933105, + "losses/total": 0.25444450974464417, + "ref_logps/chosen": -39.25010299682617, + "ref_logps/rejected": -40.364768981933594, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8292301893234253, + "rewards/margins": 2.0999863147735596, + "rewards/rejected": -3.9292166233062744, + "step": 2522 + }, + { + "epoch": 2.38, + "grad_norm": 26.783090591430664, + "learning_rate": 1.1437565582371459e-07, + "logps/chosen": -55.58489990234375, + "logps/rejected": -88.6790771484375, + "loss": 0.3058, + "losses/dpo": 0.5128049850463867, + "losses/sft": 3.0865347385406494, + "losses/total": 0.5128049850463867, + "ref_logps/chosen": -31.529624938964844, + "ref_logps/rejected": -46.409820556640625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4055278301239014, + "rewards/margins": 1.8213980197906494, + "rewards/rejected": -4.226925849914551, + "step": 2523 + }, + { + "epoch": 2.38, + "grad_norm": 18.745254516601562, + "learning_rate": 1.1420076949982512e-07, + "logps/chosen": -55.39482879638672, + "logps/rejected": -86.84278106689453, + "loss": 0.2169, + "losses/dpo": 0.22749954462051392, + "losses/sft": 2.523874521255493, + "losses/total": 0.22749954462051392, + "ref_logps/chosen": -35.19559860229492, + "ref_logps/rejected": -48.01429748535156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.019923210144043, + "rewards/margins": 1.8629255294799805, + "rewards/rejected": -3.8828487396240234, + "step": 2524 + }, + { + "epoch": 2.38, + "grad_norm": 36.80396270751953, + "learning_rate": 1.1402588317593563e-07, + "logps/chosen": -68.5572509765625, + "logps/rejected": -93.6121826171875, + "loss": 0.3319, + "losses/dpo": 0.9290879964828491, + "losses/sft": 2.5073792934417725, + "losses/total": 0.9290879964828491, + "ref_logps/chosen": -42.86677551269531, + "ref_logps/rejected": -47.364646911621094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5690479278564453, + "rewards/margins": 2.055706024169922, + "rewards/rejected": -4.624753952026367, + "step": 2525 + }, + { + "epoch": 2.39, + "grad_norm": 14.319046974182129, + "learning_rate": 1.1385099685204616e-07, + "logps/chosen": -44.667320251464844, + "logps/rejected": -78.54658508300781, + "loss": 0.1266, + "losses/dpo": 0.06307297945022583, + "losses/sft": 1.4635499715805054, + "losses/total": 0.06307297945022583, + "ref_logps/chosen": -31.404142379760742, + "ref_logps/rejected": -39.424076080322266, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3263180255889893, + "rewards/margins": 2.585933208465576, + "rewards/rejected": -3.9122512340545654, + "step": 2526 + }, + { + "epoch": 2.39, + "grad_norm": 21.520751953125, + "learning_rate": 1.136761105281567e-07, + "logps/chosen": -47.26658248901367, + "logps/rejected": -66.99307250976562, + "loss": 0.1981, + "losses/dpo": 0.07307786494493484, + "losses/sft": 1.6557444334030151, + "losses/total": 0.07307786494493484, + "ref_logps/chosen": -33.730133056640625, + "ref_logps/rejected": -29.53289031982422, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3536449670791626, + "rewards/margins": 2.3923730850219727, + "rewards/rejected": -3.7460179328918457, + "step": 2527 + }, + { + "epoch": 2.39, + "grad_norm": 15.640666961669922, + "learning_rate": 1.1350122420426722e-07, + "logps/chosen": -65.79432678222656, + "logps/rejected": -97.02033996582031, + "loss": 0.1134, + "losses/dpo": 0.10471897572278976, + "losses/sft": 1.8932609558105469, + "losses/total": 0.10471897572278976, + "ref_logps/chosen": -43.85139846801758, + "ref_logps/rejected": -49.76023864746094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1942920684814453, + "rewards/margins": 2.5317177772521973, + "rewards/rejected": -4.726009845733643, + "step": 2528 + }, + { + "epoch": 2.39, + "grad_norm": 13.496094703674316, + "learning_rate": 1.1332633788037775e-07, + "logps/chosen": -57.23871612548828, + "logps/rejected": -91.66221618652344, + "loss": 0.1418, + "losses/dpo": 0.062215834856033325, + "losses/sft": 1.5759575366973877, + "losses/total": 0.062215834856033325, + "ref_logps/chosen": -39.60386657714844, + "ref_logps/rejected": -44.322872161865234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7634848356246948, + "rewards/margins": 2.970449686050415, + "rewards/rejected": -4.73393440246582, + "step": 2529 + }, + { + "epoch": 2.39, + "grad_norm": 28.31764793395996, + "learning_rate": 1.1315145155648829e-07, + "logps/chosen": -60.44569396972656, + "logps/rejected": -83.93952941894531, + "loss": 0.2322, + "losses/dpo": 0.05488099902868271, + "losses/sft": 1.9449849128723145, + "losses/total": 0.05488099902868271, + "ref_logps/chosen": -39.794559478759766, + "ref_logps/rejected": -40.099056243896484, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0651135444641113, + "rewards/margins": 2.3189339637756348, + "rewards/rejected": -4.384047508239746, + "step": 2530 + }, + { + "epoch": 2.39, + "grad_norm": 29.370267868041992, + "learning_rate": 1.1297656523259881e-07, + "logps/chosen": -51.62361145019531, + "logps/rejected": -90.83674621582031, + "loss": 0.2629, + "losses/dpo": 0.23380114138126373, + "losses/sft": 1.6566338539123535, + "losses/total": 0.23380114138126373, + "ref_logps/chosen": -33.09646224975586, + "ref_logps/rejected": -47.39875411987305, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8527146577835083, + "rewards/margins": 2.4910855293273926, + "rewards/rejected": -4.3438005447387695, + "step": 2531 + }, + { + "epoch": 2.39, + "grad_norm": 24.555252075195312, + "learning_rate": 1.1280167890870932e-07, + "logps/chosen": -72.76583099365234, + "logps/rejected": -95.81634521484375, + "loss": 0.2592, + "losses/dpo": 0.2598763108253479, + "losses/sft": 2.5177524089813232, + "losses/total": 0.2598763108253479, + "ref_logps/chosen": -45.730281829833984, + "ref_logps/rejected": -50.23405456542969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.703554630279541, + "rewards/margins": 1.8546749353408813, + "rewards/rejected": -4.558229923248291, + "step": 2532 + }, + { + "epoch": 2.39, + "grad_norm": 21.70107650756836, + "learning_rate": 1.1262679258481986e-07, + "logps/chosen": -71.17634582519531, + "logps/rejected": -101.7744140625, + "loss": 0.1884, + "losses/dpo": 0.10532114654779434, + "losses/sft": 3.040187120437622, + "losses/total": 0.10532114654779434, + "ref_logps/chosen": -40.8497200012207, + "ref_logps/rejected": -45.84651184082031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0326626300811768, + "rewards/margins": 2.560127019882202, + "rewards/rejected": -5.592789649963379, + "step": 2533 + }, + { + "epoch": 2.39, + "grad_norm": 24.463098526000977, + "learning_rate": 1.1245190626093039e-07, + "logps/chosen": -52.83331298828125, + "logps/rejected": -75.98428344726562, + "loss": 0.2238, + "losses/dpo": 0.24562160670757294, + "losses/sft": 1.7588242292404175, + "losses/total": 0.24562160670757294, + "ref_logps/chosen": -34.70185852050781, + "ref_logps/rejected": -36.12361526489258, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.813145637512207, + "rewards/margins": 2.1729211807250977, + "rewards/rejected": -3.9860668182373047, + "step": 2534 + }, + { + "epoch": 2.39, + "grad_norm": 40.53074264526367, + "learning_rate": 1.1227701993704091e-07, + "logps/chosen": -61.78066635131836, + "logps/rejected": -79.13165283203125, + "loss": 0.3981, + "losses/dpo": 0.37873753905296326, + "losses/sft": 1.501401424407959, + "losses/total": 0.37873753905296326, + "ref_logps/chosen": -38.14493179321289, + "ref_logps/rejected": -38.703392028808594, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3635733127593994, + "rewards/margins": 1.6792532205581665, + "rewards/rejected": -4.0428266525268555, + "step": 2535 + }, + { + "epoch": 2.39, + "grad_norm": 34.54536819458008, + "learning_rate": 1.1210213361315145e-07, + "logps/chosen": -60.59071731567383, + "logps/rejected": -83.66757202148438, + "loss": 0.3721, + "losses/dpo": 0.22701193392276764, + "losses/sft": 2.5241289138793945, + "losses/total": 0.22701193392276764, + "ref_logps/chosen": -34.511104583740234, + "ref_logps/rejected": -36.30790710449219, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6079609394073486, + "rewards/margins": 2.1280055046081543, + "rewards/rejected": -4.735966682434082, + "step": 2536 + }, + { + "epoch": 2.4, + "grad_norm": 27.279024124145508, + "learning_rate": 1.1192724728926198e-07, + "logps/chosen": -72.88031005859375, + "logps/rejected": -92.48606872558594, + "loss": 0.2682, + "losses/dpo": 0.3463936448097229, + "losses/sft": 2.2874464988708496, + "losses/total": 0.3463936448097229, + "ref_logps/chosen": -48.92679977416992, + "ref_logps/rejected": -46.500892639160156, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3953514099121094, + "rewards/margins": 2.2031667232513428, + "rewards/rejected": -4.598518371582031, + "step": 2537 + }, + { + "epoch": 2.4, + "grad_norm": 26.769601821899414, + "learning_rate": 1.117523609653725e-07, + "logps/chosen": -46.56377410888672, + "logps/rejected": -75.41265869140625, + "loss": 0.287, + "losses/dpo": 0.284978985786438, + "losses/sft": 2.4260573387145996, + "losses/total": 0.284978985786438, + "ref_logps/chosen": -27.320091247558594, + "ref_logps/rejected": -36.27096176147461, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9243682622909546, + "rewards/margins": 1.9898014068603516, + "rewards/rejected": -3.9141695499420166, + "step": 2538 + }, + { + "epoch": 2.4, + "grad_norm": 25.439987182617188, + "learning_rate": 1.1157747464148302e-07, + "logps/chosen": -75.16964721679688, + "logps/rejected": -103.0256118774414, + "loss": 0.2039, + "losses/dpo": 0.23099377751350403, + "losses/sft": 2.4316399097442627, + "losses/total": 0.23099377751350403, + "ref_logps/chosen": -49.023441314697266, + "ref_logps/rejected": -54.69520568847656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.614621162414551, + "rewards/margins": 2.2184195518493652, + "rewards/rejected": -4.833040714263916, + "step": 2539 + }, + { + "epoch": 2.4, + "grad_norm": 19.881114959716797, + "learning_rate": 1.1140258831759356e-07, + "logps/chosen": -56.24695587158203, + "logps/rejected": -99.95590209960938, + "loss": 0.2281, + "losses/dpo": 0.3772699236869812, + "losses/sft": 1.3900045156478882, + "losses/total": 0.3772699236869812, + "ref_logps/chosen": -38.65867614746094, + "ref_logps/rejected": -57.306697845458984, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7588279247283936, + "rewards/margins": 2.506092071533203, + "rewards/rejected": -4.264919757843018, + "step": 2540 + }, + { + "epoch": 2.4, + "grad_norm": 11.859535217285156, + "learning_rate": 1.1122770199370408e-07, + "logps/chosen": -57.50248718261719, + "logps/rejected": -87.93421173095703, + "loss": 0.1335, + "losses/dpo": 0.13757172226905823, + "losses/sft": 2.3896234035491943, + "losses/total": 0.13757172226905823, + "ref_logps/chosen": -37.72437286376953, + "ref_logps/rejected": -43.9356575012207, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9778120517730713, + "rewards/margins": 2.422043800354004, + "rewards/rejected": -4.399855613708496, + "step": 2541 + }, + { + "epoch": 2.4, + "grad_norm": 29.46007537841797, + "learning_rate": 1.1105281566981461e-07, + "logps/chosen": -51.233314514160156, + "logps/rejected": -77.33616638183594, + "loss": 0.3359, + "losses/dpo": 0.34855157136917114, + "losses/sft": 1.9801949262619019, + "losses/total": 0.34855157136917114, + "ref_logps/chosen": -33.40945053100586, + "ref_logps/rejected": -40.61573791503906, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7823864221572876, + "rewards/margins": 1.8896558284759521, + "rewards/rejected": -3.6720423698425293, + "step": 2542 + }, + { + "epoch": 2.4, + "grad_norm": 21.368791580200195, + "learning_rate": 1.1087792934592515e-07, + "logps/chosen": -62.33343505859375, + "logps/rejected": -101.64625549316406, + "loss": 0.1896, + "losses/dpo": 0.18004998564720154, + "losses/sft": 3.0143661499023438, + "losses/total": 0.18004998564720154, + "ref_logps/chosen": -38.726539611816406, + "ref_logps/rejected": -51.92256164550781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.360690116882324, + "rewards/margins": 2.6116790771484375, + "rewards/rejected": -4.972369194030762, + "step": 2543 + }, + { + "epoch": 2.4, + "grad_norm": 35.730865478515625, + "learning_rate": 1.1070304302203567e-07, + "logps/chosen": -70.31576538085938, + "logps/rejected": -100.08964538574219, + "loss": 0.4345, + "losses/dpo": 1.000927448272705, + "losses/sft": 3.285482168197632, + "losses/total": 1.000927448272705, + "ref_logps/chosen": -41.02024841308594, + "ref_logps/rejected": -49.754337310791016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9295520782470703, + "rewards/margins": 2.1039786338806152, + "rewards/rejected": -5.0335307121276855, + "step": 2544 + }, + { + "epoch": 2.4, + "grad_norm": 18.468238830566406, + "learning_rate": 1.105281566981462e-07, + "logps/chosen": -41.055992126464844, + "logps/rejected": -68.91433715820312, + "loss": 0.2171, + "losses/dpo": 0.2452457845211029, + "losses/sft": 1.721042275428772, + "losses/total": 0.2452457845211029, + "ref_logps/chosen": -22.163066864013672, + "ref_logps/rejected": -31.338363647460938, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8892927169799805, + "rewards/margins": 1.8683040142059326, + "rewards/rejected": -3.757596969604492, + "step": 2545 + }, + { + "epoch": 2.4, + "grad_norm": 22.803457260131836, + "learning_rate": 1.1035327037425674e-07, + "logps/chosen": -53.94634246826172, + "logps/rejected": -80.46607971191406, + "loss": 0.2559, + "losses/dpo": 0.16401897370815277, + "losses/sft": 1.861817717552185, + "losses/total": 0.16401897370815277, + "ref_logps/chosen": -36.254173278808594, + "ref_logps/rejected": -39.521995544433594, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7692170143127441, + "rewards/margins": 2.3251914978027344, + "rewards/rejected": -4.0944085121154785, + "step": 2546 + }, + { + "epoch": 2.41, + "grad_norm": 38.432125091552734, + "learning_rate": 1.1017838405036725e-07, + "logps/chosen": -57.14134216308594, + "logps/rejected": -79.35484313964844, + "loss": 0.3582, + "losses/dpo": 0.15578432381153107, + "losses/sft": 1.6802306175231934, + "losses/total": 0.15578432381153107, + "ref_logps/chosen": -41.46296310424805, + "ref_logps/rejected": -43.66193771362305, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5678377151489258, + "rewards/margins": 2.001453161239624, + "rewards/rejected": -3.56929087638855, + "step": 2547 + }, + { + "epoch": 2.41, + "grad_norm": 25.972978591918945, + "learning_rate": 1.1000349772647778e-07, + "logps/chosen": -49.720951080322266, + "logps/rejected": -93.64736938476562, + "loss": 0.1612, + "losses/dpo": 0.15485922992229462, + "losses/sft": 1.4682108163833618, + "losses/total": 0.15485922992229462, + "ref_logps/chosen": -30.325763702392578, + "ref_logps/rejected": -45.82988739013672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.939518690109253, + "rewards/margins": 2.8422303199768066, + "rewards/rejected": -4.7817487716674805, + "step": 2548 + }, + { + "epoch": 2.41, + "grad_norm": 18.726795196533203, + "learning_rate": 1.0982861140258832e-07, + "logps/chosen": -61.978515625, + "logps/rejected": -96.72171020507812, + "loss": 0.197, + "losses/dpo": 0.20206576585769653, + "losses/sft": 2.011671781539917, + "losses/total": 0.20206576585769653, + "ref_logps/chosen": -39.43273162841797, + "ref_logps/rejected": -50.971038818359375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2545785903930664, + "rewards/margins": 2.320488929748535, + "rewards/rejected": -4.575067520141602, + "step": 2549 + }, + { + "epoch": 2.41, + "grad_norm": 14.803045272827148, + "learning_rate": 1.0965372507869884e-07, + "logps/chosen": -51.179534912109375, + "logps/rejected": -93.59822082519531, + "loss": 0.1145, + "losses/dpo": 0.12413838505744934, + "losses/sft": 2.185044288635254, + "losses/total": 0.12413838505744934, + "ref_logps/chosen": -32.79823684692383, + "ref_logps/rejected": -47.055362701416016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.838130235671997, + "rewards/margins": 2.816154956817627, + "rewards/rejected": -4.654285430908203, + "step": 2550 + }, + { + "epoch": 2.41, + "grad_norm": 20.38247299194336, + "learning_rate": 1.0947883875480937e-07, + "logps/chosen": -59.00342559814453, + "logps/rejected": -91.65235900878906, + "loss": 0.205, + "losses/dpo": 0.1265772134065628, + "losses/sft": 2.0871572494506836, + "losses/total": 0.1265772134065628, + "ref_logps/chosen": -40.99570846557617, + "ref_logps/rejected": -49.13435745239258, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.800771713256836, + "rewards/margins": 2.4510293006896973, + "rewards/rejected": -4.251801013946533, + "step": 2551 + }, + { + "epoch": 2.41, + "grad_norm": 24.816736221313477, + "learning_rate": 1.0930395243091991e-07, + "logps/chosen": -44.99361038208008, + "logps/rejected": -75.62315368652344, + "loss": 0.2588, + "losses/dpo": 0.06230252981185913, + "losses/sft": 1.9967135190963745, + "losses/total": 0.06230252981185913, + "ref_logps/chosen": -25.542572021484375, + "ref_logps/rejected": -37.40641784667969, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9451038837432861, + "rewards/margins": 1.8765697479248047, + "rewards/rejected": -3.821673631668091, + "step": 2552 + }, + { + "epoch": 2.41, + "grad_norm": 29.295747756958008, + "learning_rate": 1.0912906610703043e-07, + "logps/chosen": -52.34647750854492, + "logps/rejected": -74.05122375488281, + "loss": 0.3451, + "losses/dpo": 0.5134259462356567, + "losses/sft": 2.4235899448394775, + "losses/total": 0.5134259462356567, + "ref_logps/chosen": -34.62861633300781, + "ref_logps/rejected": -40.31925964355469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7717859745025635, + "rewards/margins": 1.6014108657836914, + "rewards/rejected": -3.373196840286255, + "step": 2553 + }, + { + "epoch": 2.41, + "grad_norm": 20.89582061767578, + "learning_rate": 1.0895417978314095e-07, + "logps/chosen": -46.52728271484375, + "logps/rejected": -73.043701171875, + "loss": 0.2388, + "losses/dpo": 0.17445340752601624, + "losses/sft": 1.5419580936431885, + "losses/total": 0.17445340752601624, + "ref_logps/chosen": -30.312349319458008, + "ref_logps/rejected": -37.24897384643555, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6214933395385742, + "rewards/margins": 1.957979679107666, + "rewards/rejected": -3.5794732570648193, + "step": 2554 + }, + { + "epoch": 2.41, + "grad_norm": 32.36873245239258, + "learning_rate": 1.0877929345925147e-07, + "logps/chosen": -45.970191955566406, + "logps/rejected": -62.24394989013672, + "loss": 0.4179, + "losses/dpo": 0.23007695376873016, + "losses/sft": 1.738694190979004, + "losses/total": 0.23007695376873016, + "ref_logps/chosen": -27.8089599609375, + "ref_logps/rejected": -31.732833862304688, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8161232471466064, + "rewards/margins": 1.234989047050476, + "rewards/rejected": -3.051112174987793, + "step": 2555 + }, + { + "epoch": 2.41, + "grad_norm": 18.925806045532227, + "learning_rate": 1.0860440713536201e-07, + "logps/chosen": -59.07218933105469, + "logps/rejected": -83.79288482666016, + "loss": 0.1978, + "losses/dpo": 0.10686750710010529, + "losses/sft": 2.293520212173462, + "losses/total": 0.10686750710010529, + "ref_logps/chosen": -41.20150375366211, + "ref_logps/rejected": -45.5233268737793, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7870683670043945, + "rewards/margins": 2.0398879051208496, + "rewards/rejected": -3.826956272125244, + "step": 2556 + }, + { + "epoch": 2.41, + "grad_norm": 26.95958137512207, + "learning_rate": 1.0842952081147254e-07, + "logps/chosen": -64.685302734375, + "logps/rejected": -84.84998321533203, + "loss": 0.2623, + "losses/dpo": 0.3503202795982361, + "losses/sft": 2.1921966075897217, + "losses/total": 0.3503202795982361, + "ref_logps/chosen": -39.75926971435547, + "ref_logps/rejected": -39.21153259277344, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.492602586746216, + "rewards/margins": 2.071241617202759, + "rewards/rejected": -4.563844203948975, + "step": 2557 + }, + { + "epoch": 2.42, + "grad_norm": 27.554433822631836, + "learning_rate": 1.0825463448758306e-07, + "logps/chosen": -55.231040954589844, + "logps/rejected": -94.85967254638672, + "loss": 0.2275, + "losses/dpo": 0.5521403551101685, + "losses/sft": 2.0651941299438477, + "losses/total": 0.5521403551101685, + "ref_logps/chosen": -35.30305480957031, + "ref_logps/rejected": -46.768211364746094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9927988052368164, + "rewards/margins": 2.816347599029541, + "rewards/rejected": -4.809145927429199, + "step": 2558 + }, + { + "epoch": 2.42, + "grad_norm": 30.89105987548828, + "learning_rate": 1.080797481636936e-07, + "logps/chosen": -63.46617889404297, + "logps/rejected": -88.59207916259766, + "loss": 0.2614, + "losses/dpo": 0.18831908702850342, + "losses/sft": 1.423599123954773, + "losses/total": 0.18831908702850342, + "ref_logps/chosen": -41.77220916748047, + "ref_logps/rejected": -43.777137756347656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1693973541259766, + "rewards/margins": 2.3120970726013184, + "rewards/rejected": -4.481494426727295, + "step": 2559 + }, + { + "epoch": 2.42, + "grad_norm": 26.34918785095215, + "learning_rate": 1.0790486183980413e-07, + "logps/chosen": -63.056236267089844, + "logps/rejected": -89.27275085449219, + "loss": 0.2281, + "losses/dpo": 0.29373589158058167, + "losses/sft": 2.6011059284210205, + "losses/total": 0.29373589158058167, + "ref_logps/chosen": -42.57627868652344, + "ref_logps/rejected": -49.83995819091797, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0479960441589355, + "rewards/margins": 1.8952829837799072, + "rewards/rejected": -3.9432787895202637, + "step": 2560 + }, + { + "epoch": 2.42, + "grad_norm": 23.103620529174805, + "learning_rate": 1.0772997551591464e-07, + "logps/chosen": -64.57014465332031, + "logps/rejected": -99.75311279296875, + "loss": 0.208, + "losses/dpo": 0.22294288873672485, + "losses/sft": 2.720385789871216, + "losses/total": 0.22294288873672485, + "ref_logps/chosen": -41.784584045410156, + "ref_logps/rejected": -52.6777229309082, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2785568237304688, + "rewards/margins": 2.4289824962615967, + "rewards/rejected": -4.7075395584106445, + "step": 2561 + }, + { + "epoch": 2.42, + "grad_norm": 33.543739318847656, + "learning_rate": 1.0755508919202518e-07, + "logps/chosen": -65.54817199707031, + "logps/rejected": -69.56430053710938, + "loss": 0.4232, + "losses/dpo": 0.22556215524673462, + "losses/sft": 2.6789019107818604, + "losses/total": 0.22556215524673462, + "ref_logps/chosen": -40.86676025390625, + "ref_logps/rejected": -32.30731964111328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4681406021118164, + "rewards/margins": 1.2575569152832031, + "rewards/rejected": -3.7256975173950195, + "step": 2562 + }, + { + "epoch": 2.42, + "grad_norm": 26.381847381591797, + "learning_rate": 1.073802028681357e-07, + "logps/chosen": -60.17300033569336, + "logps/rejected": -82.67431640625, + "loss": 0.2909, + "losses/dpo": 0.23626123368740082, + "losses/sft": 2.309133768081665, + "losses/total": 0.23626123368740082, + "ref_logps/chosen": -39.730655670166016, + "ref_logps/rejected": -44.660614013671875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.044234275817871, + "rewards/margins": 1.757136344909668, + "rewards/rejected": -3.80137038230896, + "step": 2563 + }, + { + "epoch": 2.42, + "grad_norm": 18.88750457763672, + "learning_rate": 1.0720531654424623e-07, + "logps/chosen": -55.77342224121094, + "logps/rejected": -91.54100036621094, + "loss": 0.2088, + "losses/dpo": 0.19664618372917175, + "losses/sft": 2.034778594970703, + "losses/total": 0.19664618372917175, + "ref_logps/chosen": -37.26781463623047, + "ref_logps/rejected": -49.17311096191406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8505603075027466, + "rewards/margins": 2.386228084564209, + "rewards/rejected": -4.236788272857666, + "step": 2564 + }, + { + "epoch": 2.42, + "grad_norm": 24.47402572631836, + "learning_rate": 1.0703043022035677e-07, + "logps/chosen": -69.35269927978516, + "logps/rejected": -87.98463439941406, + "loss": 0.2034, + "losses/dpo": 0.23828518390655518, + "losses/sft": 2.5770516395568848, + "losses/total": 0.23828518390655518, + "ref_logps/chosen": -44.25381088256836, + "ref_logps/rejected": -43.23814392089844, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5098886489868164, + "rewards/margins": 1.9647605419158936, + "rewards/rejected": -4.474648952484131, + "step": 2565 + }, + { + "epoch": 2.42, + "grad_norm": 28.16697883605957, + "learning_rate": 1.068555438964673e-07, + "logps/chosen": -58.12694549560547, + "logps/rejected": -96.69146728515625, + "loss": 0.2996, + "losses/dpo": 0.7032175064086914, + "losses/sft": 2.742940664291382, + "losses/total": 0.7032175064086914, + "ref_logps/chosen": -36.491485595703125, + "ref_logps/rejected": -49.2968635559082, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.163546085357666, + "rewards/margins": 2.5759146213531494, + "rewards/rejected": -4.739460468292236, + "step": 2566 + }, + { + "epoch": 2.42, + "grad_norm": 22.47705078125, + "learning_rate": 1.0668065757257782e-07, + "logps/chosen": -55.596065521240234, + "logps/rejected": -90.7059326171875, + "loss": 0.2267, + "losses/dpo": 0.09340497851371765, + "losses/sft": 1.8198113441467285, + "losses/total": 0.09340497851371765, + "ref_logps/chosen": -34.655731201171875, + "ref_logps/rejected": -43.12435531616211, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0940330028533936, + "rewards/margins": 2.6641249656677246, + "rewards/rejected": -4.758157730102539, + "step": 2567 + }, + { + "epoch": 2.42, + "grad_norm": 15.867330551147461, + "learning_rate": 1.0650577124868835e-07, + "logps/chosen": -57.36811828613281, + "logps/rejected": -103.16275787353516, + "loss": 0.1051, + "losses/dpo": 0.054004017263650894, + "losses/sft": 1.3831908702850342, + "losses/total": 0.054004017263650894, + "ref_logps/chosen": -38.60383224487305, + "ref_logps/rejected": -53.714534759521484, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8764283657073975, + "rewards/margins": 3.0683937072753906, + "rewards/rejected": -4.944822311401367, + "step": 2568 + }, + { + "epoch": 2.43, + "grad_norm": 15.959575653076172, + "learning_rate": 1.0633088492479887e-07, + "logps/chosen": -62.00328826904297, + "logps/rejected": -103.62792205810547, + "loss": 0.1124, + "losses/dpo": 0.1275884211063385, + "losses/sft": 2.610710382461548, + "losses/total": 0.1275884211063385, + "ref_logps/chosen": -39.876678466796875, + "ref_logps/rejected": -56.215484619140625, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2126615047454834, + "rewards/margins": 2.5285825729370117, + "rewards/rejected": -4.741243839263916, + "step": 2569 + }, + { + "epoch": 2.43, + "grad_norm": 22.52553939819336, + "learning_rate": 1.061559986009094e-07, + "logps/chosen": -53.69630813598633, + "logps/rejected": -77.34857177734375, + "loss": 0.2359, + "losses/dpo": 0.0458427369594574, + "losses/sft": 1.746375560760498, + "losses/total": 0.0458427369594574, + "ref_logps/chosen": -37.15336608886719, + "ref_logps/rejected": -40.85997009277344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.654294490814209, + "rewards/margins": 1.994565725326538, + "rewards/rejected": -3.648860216140747, + "step": 2570 + }, + { + "epoch": 2.43, + "grad_norm": 31.48843765258789, + "learning_rate": 1.0598111227701994e-07, + "logps/chosen": -55.549407958984375, + "logps/rejected": -86.17488098144531, + "loss": 0.3064, + "losses/dpo": 0.3240250051021576, + "losses/sft": 2.280369997024536, + "losses/total": 0.3240250051021576, + "ref_logps/chosen": -39.13319396972656, + "ref_logps/rejected": -46.76850128173828, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6416208744049072, + "rewards/margins": 2.2990169525146484, + "rewards/rejected": -3.9406378269195557, + "step": 2571 + }, + { + "epoch": 2.43, + "grad_norm": 20.905593872070312, + "learning_rate": 1.0580622595313046e-07, + "logps/chosen": -68.0166015625, + "logps/rejected": -99.62675476074219, + "loss": 0.1966, + "losses/dpo": 0.25101208686828613, + "losses/sft": 2.055720567703247, + "losses/total": 0.25101208686828613, + "ref_logps/chosen": -44.68172836303711, + "ref_logps/rejected": -50.23406219482422, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3334875106811523, + "rewards/margins": 2.6057815551757812, + "rewards/rejected": -4.939269065856934, + "step": 2572 + }, + { + "epoch": 2.43, + "grad_norm": 15.05843448638916, + "learning_rate": 1.0563133962924099e-07, + "logps/chosen": -68.5745849609375, + "logps/rejected": -104.69305419921875, + "loss": 0.1654, + "losses/dpo": 0.09332914650440216, + "losses/sft": 2.0536437034606934, + "losses/total": 0.09332914650440216, + "ref_logps/chosen": -45.46392059326172, + "ref_logps/rejected": -57.96839141845703, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.311066150665283, + "rewards/margins": 2.3613998889923096, + "rewards/rejected": -4.672466278076172, + "step": 2573 + }, + { + "epoch": 2.43, + "grad_norm": 30.56990623474121, + "learning_rate": 1.0545645330535152e-07, + "logps/chosen": -60.13212203979492, + "logps/rejected": -73.24850463867188, + "loss": 0.3406, + "losses/dpo": 0.3165338635444641, + "losses/sft": 2.2543745040893555, + "losses/total": 0.3165338635444641, + "ref_logps/chosen": -39.25490951538086, + "ref_logps/rejected": -37.137939453125, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.087721347808838, + "rewards/margins": 1.523335337638855, + "rewards/rejected": -3.6110568046569824, + "step": 2574 + }, + { + "epoch": 2.43, + "grad_norm": 37.013343811035156, + "learning_rate": 1.0528156698146204e-07, + "logps/chosen": -63.27751922607422, + "logps/rejected": -78.61744689941406, + "loss": 0.3501, + "losses/dpo": 0.6447678208351135, + "losses/sft": 2.375950574874878, + "losses/total": 0.6447678208351135, + "ref_logps/chosen": -40.818180084228516, + "ref_logps/rejected": -38.092140197753906, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.245934009552002, + "rewards/margins": 1.8065966367721558, + "rewards/rejected": -4.052530765533447, + "step": 2575 + }, + { + "epoch": 2.43, + "grad_norm": 22.064970016479492, + "learning_rate": 1.0510668065757257e-07, + "logps/chosen": -53.35215377807617, + "logps/rejected": -93.520263671875, + "loss": 0.2302, + "losses/dpo": 0.2972893714904785, + "losses/sft": 2.070923089981079, + "losses/total": 0.2972893714904785, + "ref_logps/chosen": -32.72959899902344, + "ref_logps/rejected": -48.75401306152344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0622551441192627, + "rewards/margins": 2.414369821548462, + "rewards/rejected": -4.476624965667725, + "step": 2576 + }, + { + "epoch": 2.43, + "grad_norm": 23.87879753112793, + "learning_rate": 1.049317943336831e-07, + "logps/chosen": -53.36554718017578, + "logps/rejected": -73.52658081054688, + "loss": 0.2544, + "losses/dpo": 0.09178003668785095, + "losses/sft": 1.391703724861145, + "losses/total": 0.09178003668785095, + "ref_logps/chosen": -34.63602828979492, + "ref_logps/rejected": -33.592864990234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.872951626777649, + "rewards/margins": 2.12041974067688, + "rewards/rejected": -3.9933714866638184, + "step": 2577 + }, + { + "epoch": 2.43, + "grad_norm": 24.76368522644043, + "learning_rate": 1.0475690800979363e-07, + "logps/chosen": -55.37023162841797, + "logps/rejected": -80.47941589355469, + "loss": 0.22, + "losses/dpo": 0.23654115200042725, + "losses/sft": 2.030259132385254, + "losses/total": 0.23654115200042725, + "ref_logps/chosen": -38.04815673828125, + "ref_logps/rejected": -41.07991409301758, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7322070598602295, + "rewards/margins": 2.2077434062957764, + "rewards/rejected": -3.939950466156006, + "step": 2578 + }, + { + "epoch": 2.44, + "grad_norm": 15.50444221496582, + "learning_rate": 1.0458202168590416e-07, + "logps/chosen": -51.69697952270508, + "logps/rejected": -95.99156951904297, + "loss": 0.1231, + "losses/dpo": 0.11117083579301834, + "losses/sft": 2.241940975189209, + "losses/total": 0.11117083579301834, + "ref_logps/chosen": -30.247873306274414, + "ref_logps/rejected": -48.499053955078125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1449108123779297, + "rewards/margins": 2.6043403148651123, + "rewards/rejected": -4.749251365661621, + "step": 2579 + }, + { + "epoch": 2.44, + "grad_norm": 22.83673095703125, + "learning_rate": 1.0440713536201469e-07, + "logps/chosen": -65.80226135253906, + "logps/rejected": -99.94749450683594, + "loss": 0.1994, + "losses/dpo": 0.07799456268548965, + "losses/sft": 2.1029629707336426, + "losses/total": 0.07799456268548965, + "ref_logps/chosen": -38.49122619628906, + "ref_logps/rejected": -48.85552978515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7311038970947266, + "rewards/margins": 2.378091812133789, + "rewards/rejected": -5.109195709228516, + "step": 2580 + }, + { + "epoch": 2.44, + "grad_norm": 19.489704132080078, + "learning_rate": 1.0423224903812522e-07, + "logps/chosen": -57.17464828491211, + "logps/rejected": -94.33549499511719, + "loss": 0.2041, + "losses/dpo": 0.11986584216356277, + "losses/sft": 2.0400967597961426, + "losses/total": 0.11986584216356277, + "ref_logps/chosen": -37.687808990478516, + "ref_logps/rejected": -48.437835693359375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9486839771270752, + "rewards/margins": 2.6410815715789795, + "rewards/rejected": -4.589765548706055, + "step": 2581 + }, + { + "epoch": 2.44, + "grad_norm": 30.269742965698242, + "learning_rate": 1.0405736271423574e-07, + "logps/chosen": -66.55439758300781, + "logps/rejected": -91.45822143554688, + "loss": 0.3001, + "losses/dpo": 0.36642366647720337, + "losses/sft": 1.6556113958358765, + "losses/total": 0.36642366647720337, + "ref_logps/chosen": -37.89095687866211, + "ref_logps/rejected": -43.350677490234375, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8663437366485596, + "rewards/margins": 1.9444106817245483, + "rewards/rejected": -4.810754776000977, + "step": 2582 + }, + { + "epoch": 2.44, + "grad_norm": 31.75640296936035, + "learning_rate": 1.0388247639034626e-07, + "logps/chosen": -60.32160568237305, + "logps/rejected": -93.30657196044922, + "loss": 0.3227, + "losses/dpo": 0.5739339590072632, + "losses/sft": 2.398925542831421, + "losses/total": 0.5739339590072632, + "ref_logps/chosen": -38.183006286621094, + "ref_logps/rejected": -53.49128341674805, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.213860034942627, + "rewards/margins": 1.7676692008972168, + "rewards/rejected": -3.9815292358398438, + "step": 2583 + }, + { + "epoch": 2.44, + "grad_norm": 28.80004119873047, + "learning_rate": 1.037075900664568e-07, + "logps/chosen": -61.17211151123047, + "logps/rejected": -76.71539306640625, + "loss": 0.3286, + "losses/dpo": 0.16467228531837463, + "losses/sft": 2.1150360107421875, + "losses/total": 0.16467228531837463, + "ref_logps/chosen": -38.43989944458008, + "ref_logps/rejected": -36.76949691772461, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.273221492767334, + "rewards/margins": 1.7213687896728516, + "rewards/rejected": -3.9945902824401855, + "step": 2584 + }, + { + "epoch": 2.44, + "grad_norm": 16.38739776611328, + "learning_rate": 1.0353270374256733e-07, + "logps/chosen": -52.807159423828125, + "logps/rejected": -105.39630889892578, + "loss": 0.1077, + "losses/dpo": 0.20742735266685486, + "losses/sft": 1.9841108322143555, + "losses/total": 0.20742735266685486, + "ref_logps/chosen": -37.28129196166992, + "ref_logps/rejected": -54.11236572265625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5525871515274048, + "rewards/margins": 3.5758073329925537, + "rewards/rejected": -5.12839412689209, + "step": 2585 + }, + { + "epoch": 2.44, + "grad_norm": 16.015092849731445, + "learning_rate": 1.0335781741867785e-07, + "logps/chosen": -61.46519470214844, + "logps/rejected": -107.0567398071289, + "loss": 0.1243, + "losses/dpo": 0.03934570401906967, + "losses/sft": 1.920172929763794, + "losses/total": 0.03934570401906967, + "ref_logps/chosen": -39.61329650878906, + "ref_logps/rejected": -55.37855529785156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1851894855499268, + "rewards/margins": 2.982628583908081, + "rewards/rejected": -5.167818069458008, + "step": 2586 + }, + { + "epoch": 2.44, + "grad_norm": 21.995683670043945, + "learning_rate": 1.0318293109478839e-07, + "logps/chosen": -37.99213409423828, + "logps/rejected": -69.28343963623047, + "loss": 0.2505, + "losses/dpo": 0.2805306613445282, + "losses/sft": 1.5321217775344849, + "losses/total": 0.2805306613445282, + "ref_logps/chosen": -22.738601684570312, + "ref_logps/rejected": -33.14030075073242, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.525353193283081, + "rewards/margins": 2.088960647583008, + "rewards/rejected": -3.614313840866089, + "step": 2587 + }, + { + "epoch": 2.44, + "grad_norm": 27.576040267944336, + "learning_rate": 1.0300804477089892e-07, + "logps/chosen": -47.47115707397461, + "logps/rejected": -82.44940185546875, + "loss": 0.2467, + "losses/dpo": 0.24677318334579468, + "losses/sft": 1.7984297275543213, + "losses/total": 0.24677318334579468, + "ref_logps/chosen": -24.82701301574707, + "ref_logps/rejected": -38.341651916503906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2644143104553223, + "rewards/margins": 2.146359920501709, + "rewards/rejected": -4.410774230957031, + "step": 2588 + }, + { + "epoch": 2.44, + "grad_norm": 15.27244758605957, + "learning_rate": 1.0283315844700943e-07, + "logps/chosen": -50.437843322753906, + "logps/rejected": -84.30396270751953, + "loss": 0.1588, + "losses/dpo": 0.2607237994670868, + "losses/sft": 2.5970253944396973, + "losses/total": 0.2607237994670868, + "ref_logps/chosen": -31.924461364746094, + "ref_logps/rejected": -41.21435546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8513380289077759, + "rewards/margins": 2.45762300491333, + "rewards/rejected": -4.308960914611816, + "step": 2589 + }, + { + "epoch": 2.45, + "grad_norm": 16.532649993896484, + "learning_rate": 1.0265827212311996e-07, + "logps/chosen": -65.29950714111328, + "logps/rejected": -98.13487243652344, + "loss": 0.1633, + "losses/dpo": 0.10464746505022049, + "losses/sft": 2.533601999282837, + "losses/total": 0.10464746505022049, + "ref_logps/chosen": -37.745697021484375, + "ref_logps/rejected": -46.759918212890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7553815841674805, + "rewards/margins": 2.3821136951446533, + "rewards/rejected": -5.137495517730713, + "step": 2590 + }, + { + "epoch": 2.45, + "grad_norm": 18.9774227142334, + "learning_rate": 1.024833857992305e-07, + "logps/chosen": -75.89419555664062, + "logps/rejected": -92.86912536621094, + "loss": 0.146, + "losses/dpo": 0.1274433583021164, + "losses/sft": 2.581130266189575, + "losses/total": 0.1274433583021164, + "ref_logps/chosen": -49.07698440551758, + "ref_logps/rejected": -42.79907989501953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.681720495223999, + "rewards/margins": 2.3252837657928467, + "rewards/rejected": -5.007004261016846, + "step": 2591 + }, + { + "epoch": 2.45, + "grad_norm": 28.013385772705078, + "learning_rate": 1.0230849947534102e-07, + "logps/chosen": -56.53223419189453, + "logps/rejected": -87.07548522949219, + "loss": 0.2968, + "losses/dpo": 0.7539942264556885, + "losses/sft": 2.2622296810150146, + "losses/total": 0.7539942264556885, + "ref_logps/chosen": -36.85771560668945, + "ref_logps/rejected": -43.74913787841797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9674524068832397, + "rewards/margins": 2.3651814460754395, + "rewards/rejected": -4.332633972167969, + "step": 2592 + }, + { + "epoch": 2.45, + "grad_norm": 17.80710220336914, + "learning_rate": 1.0213361315145155e-07, + "logps/chosen": -73.81526184082031, + "logps/rejected": -98.49223327636719, + "loss": 0.1632, + "losses/dpo": 0.07690103352069855, + "losses/sft": 2.688950300216675, + "losses/total": 0.07690103352069855, + "ref_logps/chosen": -49.22899627685547, + "ref_logps/rejected": -50.03904724121094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4586265087127686, + "rewards/margins": 2.3866920471191406, + "rewards/rejected": -4.845318794250488, + "step": 2593 + }, + { + "epoch": 2.45, + "grad_norm": 33.03223419189453, + "learning_rate": 1.0195872682756209e-07, + "logps/chosen": -54.94720458984375, + "logps/rejected": -76.68730163574219, + "loss": 0.3758, + "losses/dpo": 0.06294659525156021, + "losses/sft": 1.5065737962722778, + "losses/total": 0.06294659525156021, + "ref_logps/chosen": -34.556373596191406, + "ref_logps/rejected": -38.32640838623047, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.039083242416382, + "rewards/margins": 1.7970057725906372, + "rewards/rejected": -3.8360891342163086, + "step": 2594 + }, + { + "epoch": 2.45, + "grad_norm": 23.080896377563477, + "learning_rate": 1.0178384050367261e-07, + "logps/chosen": -69.1246109008789, + "logps/rejected": -86.28018951416016, + "loss": 0.204, + "losses/dpo": 0.3689577281475067, + "losses/sft": 1.9399421215057373, + "losses/total": 0.3689577281475067, + "ref_logps/chosen": -50.673126220703125, + "ref_logps/rejected": -43.97283172607422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8451478481292725, + "rewards/margins": 2.385587453842163, + "rewards/rejected": -4.2307353019714355, + "step": 2595 + }, + { + "epoch": 2.45, + "grad_norm": 31.5340576171875, + "learning_rate": 1.0160895417978313e-07, + "logps/chosen": -59.139892578125, + "logps/rejected": -81.00654602050781, + "loss": 0.344, + "losses/dpo": 0.5528707504272461, + "losses/sft": 3.7191176414489746, + "losses/total": 0.5528707504272461, + "ref_logps/chosen": -35.80817413330078, + "ref_logps/rejected": -39.8519287109375, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.333172082901001, + "rewards/margins": 1.7822892665863037, + "rewards/rejected": -4.115461349487305, + "step": 2596 + }, + { + "epoch": 2.45, + "grad_norm": 31.34276008605957, + "learning_rate": 1.0143406785589366e-07, + "logps/chosen": -64.4171142578125, + "logps/rejected": -86.96149444580078, + "loss": 0.2704, + "losses/dpo": 0.1981174349784851, + "losses/sft": 1.7797414064407349, + "losses/total": 0.1981174349784851, + "ref_logps/chosen": -43.85118865966797, + "ref_logps/rejected": -44.20526885986328, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0565929412841797, + "rewards/margins": 2.2190299034118652, + "rewards/rejected": -4.275622844696045, + "step": 2597 + }, + { + "epoch": 2.45, + "grad_norm": 24.233430862426758, + "learning_rate": 1.0125918153200419e-07, + "logps/chosen": -60.92336654663086, + "logps/rejected": -73.82623291015625, + "loss": 0.2384, + "losses/dpo": 0.2107107937335968, + "losses/sft": 1.8149641752243042, + "losses/total": 0.2107107937335968, + "ref_logps/chosen": -40.020103454589844, + "ref_logps/rejected": -32.9814567565918, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0903263092041016, + "rewards/margins": 1.9941506385803223, + "rewards/rejected": -4.084476947784424, + "step": 2598 + }, + { + "epoch": 2.45, + "grad_norm": 25.469482421875, + "learning_rate": 1.0108429520811472e-07, + "logps/chosen": -75.62583923339844, + "logps/rejected": -85.56845092773438, + "loss": 0.1917, + "losses/dpo": 0.10696591436862946, + "losses/sft": 2.345310926437378, + "losses/total": 0.10696591436862946, + "ref_logps/chosen": -50.843589782714844, + "ref_logps/rejected": -40.18869400024414, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.478224754333496, + "rewards/margins": 2.059751510620117, + "rewards/rejected": -4.537976264953613, + "step": 2599 + }, + { + "epoch": 2.46, + "grad_norm": 17.674009323120117, + "learning_rate": 1.0090940888422526e-07, + "logps/chosen": -55.87786102294922, + "logps/rejected": -95.49864959716797, + "loss": 0.1263, + "losses/dpo": 0.13038519024848938, + "losses/sft": 2.023768901824951, + "losses/total": 0.13038519024848938, + "ref_logps/chosen": -33.11652374267578, + "ref_logps/rejected": -46.888946533203125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2761332988739014, + "rewards/margins": 2.584836959838867, + "rewards/rejected": -4.860970497131348, + "step": 2600 + }, + { + "epoch": 2.46, + "grad_norm": 27.673967361450195, + "learning_rate": 1.0073452256033578e-07, + "logps/chosen": -73.77913665771484, + "logps/rejected": -82.05780029296875, + "loss": 0.1891, + "losses/dpo": 0.26632291078567505, + "losses/sft": 2.3219902515411377, + "losses/total": 0.26632291078567505, + "ref_logps/chosen": -51.91088104248047, + "ref_logps/rejected": -38.380714416503906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.186825752258301, + "rewards/margins": 2.1808829307556152, + "rewards/rejected": -4.367708206176758, + "step": 2601 + }, + { + "epoch": 2.46, + "grad_norm": 26.718460083007812, + "learning_rate": 1.0055963623644631e-07, + "logps/chosen": -50.28725051879883, + "logps/rejected": -88.12123107910156, + "loss": 0.2317, + "losses/dpo": 0.5363413095474243, + "losses/sft": 2.115255117416382, + "losses/total": 0.5363413095474243, + "ref_logps/chosen": -31.593265533447266, + "ref_logps/rejected": -43.55470275878906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.869398832321167, + "rewards/margins": 2.587254524230957, + "rewards/rejected": -4.456653118133545, + "step": 2602 + }, + { + "epoch": 2.46, + "grad_norm": 25.41333770751953, + "learning_rate": 1.0038474991255685e-07, + "logps/chosen": -47.423858642578125, + "logps/rejected": -92.43830108642578, + "loss": 0.1706, + "losses/dpo": 0.36899334192276, + "losses/sft": 1.9994312524795532, + "losses/total": 0.36899334192276, + "ref_logps/chosen": -30.479087829589844, + "ref_logps/rejected": -48.30529022216797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6944770812988281, + "rewards/margins": 2.7188243865966797, + "rewards/rejected": -4.413301467895508, + "step": 2603 + }, + { + "epoch": 2.46, + "grad_norm": 24.383352279663086, + "learning_rate": 1.0020986358866736e-07, + "logps/chosen": -55.91067123413086, + "logps/rejected": -92.13478088378906, + "loss": 0.1873, + "losses/dpo": 0.21596641838550568, + "losses/sft": 2.039780616760254, + "losses/total": 0.21596641838550568, + "ref_logps/chosen": -32.247535705566406, + "ref_logps/rejected": -46.783451080322266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.366313934326172, + "rewards/margins": 2.168818473815918, + "rewards/rejected": -4.53513240814209, + "step": 2604 + }, + { + "epoch": 2.46, + "grad_norm": 19.391090393066406, + "learning_rate": 1.0003497726477789e-07, + "logps/chosen": -53.40163040161133, + "logps/rejected": -83.4697265625, + "loss": 0.2159, + "losses/dpo": 0.04920261353254318, + "losses/sft": 2.0974574089050293, + "losses/total": 0.04920261353254318, + "ref_logps/chosen": -35.043418884277344, + "ref_logps/rejected": -41.9373664855957, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.835821270942688, + "rewards/margins": 2.3174145221710205, + "rewards/rejected": -4.15323543548584, + "step": 2605 + }, + { + "epoch": 2.46, + "grad_norm": 26.59425163269043, + "learning_rate": 9.986009094088841e-08, + "logps/chosen": -46.27521514892578, + "logps/rejected": -64.21771240234375, + "loss": 0.3517, + "losses/dpo": 0.4971241056919098, + "losses/sft": 1.7221955060958862, + "losses/total": 0.4971241056919098, + "ref_logps/chosen": -27.062803268432617, + "ref_logps/rejected": -29.00020980834961, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9212415218353271, + "rewards/margins": 1.600508689880371, + "rewards/rejected": -3.5217502117156982, + "step": 2606 + }, + { + "epoch": 2.46, + "grad_norm": 33.43109130859375, + "learning_rate": 9.968520461699895e-08, + "logps/chosen": -76.14991760253906, + "logps/rejected": -97.0946044921875, + "loss": 0.2625, + "losses/dpo": 0.17947545647621155, + "losses/sft": 1.9435949325561523, + "losses/total": 0.17947545647621155, + "ref_logps/chosen": -45.67512893676758, + "ref_logps/rejected": -44.292972564697266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0474791526794434, + "rewards/margins": 2.232684373855591, + "rewards/rejected": -5.280163764953613, + "step": 2607 + }, + { + "epoch": 2.46, + "grad_norm": 29.819128036499023, + "learning_rate": 9.951031829310948e-08, + "logps/chosen": -60.859375, + "logps/rejected": -78.77252197265625, + "loss": 0.2921, + "losses/dpo": 0.21985659003257751, + "losses/sft": 1.9781346321105957, + "losses/total": 0.21985659003257751, + "ref_logps/chosen": -41.72977828979492, + "ref_logps/rejected": -42.71349334716797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9129598140716553, + "rewards/margins": 1.6929433345794678, + "rewards/rejected": -3.605903148651123, + "step": 2608 + }, + { + "epoch": 2.46, + "grad_norm": 31.35969352722168, + "learning_rate": 9.933543196922e-08, + "logps/chosen": -68.019287109375, + "logps/rejected": -86.71482849121094, + "loss": 0.404, + "losses/dpo": 0.45526280999183655, + "losses/sft": 2.063840389251709, + "losses/total": 0.45526280999183655, + "ref_logps/chosen": -41.12223815917969, + "ref_logps/rejected": -43.22393035888672, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6897048950195312, + "rewards/margins": 1.6593852043151855, + "rewards/rejected": -4.349089622497559, + "step": 2609 + }, + { + "epoch": 2.46, + "grad_norm": 32.93818283081055, + "learning_rate": 9.916054564533054e-08, + "logps/chosen": -65.99555969238281, + "logps/rejected": -87.19364929199219, + "loss": 0.2947, + "losses/dpo": 0.05186900496482849, + "losses/sft": 1.5554454326629639, + "losses/total": 0.05186900496482849, + "ref_logps/chosen": -46.03703689575195, + "ref_logps/rejected": -46.50900650024414, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9958527088165283, + "rewards/margins": 2.0726115703582764, + "rewards/rejected": -4.068464279174805, + "step": 2610 + }, + { + "epoch": 2.47, + "grad_norm": 20.314306259155273, + "learning_rate": 9.898565932144105e-08, + "logps/chosen": -49.21299362182617, + "logps/rejected": -86.56928253173828, + "loss": 0.1739, + "losses/dpo": 0.24736928939819336, + "losses/sft": 2.3513190746307373, + "losses/total": 0.24736928939819336, + "ref_logps/chosen": -29.592952728271484, + "ref_logps/rejected": -38.746429443359375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9620039463043213, + "rewards/margins": 2.820281505584717, + "rewards/rejected": -4.782285213470459, + "step": 2611 + }, + { + "epoch": 2.47, + "grad_norm": 26.15523910522461, + "learning_rate": 9.881077299755158e-08, + "logps/chosen": -61.650848388671875, + "logps/rejected": -92.20849609375, + "loss": 0.3197, + "losses/dpo": 0.10007618367671967, + "losses/sft": 1.8990036249160767, + "losses/total": 0.10007618367671967, + "ref_logps/chosen": -39.098411560058594, + "ref_logps/rejected": -44.057594299316406, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.255244016647339, + "rewards/margins": 2.5598459243774414, + "rewards/rejected": -4.815090179443359, + "step": 2612 + }, + { + "epoch": 2.47, + "grad_norm": 31.91933822631836, + "learning_rate": 9.863588667366212e-08, + "logps/chosen": -59.646156311035156, + "logps/rejected": -69.28634643554688, + "loss": 0.3912, + "losses/dpo": 0.6779725551605225, + "losses/sft": 2.219099521636963, + "losses/total": 0.6779725551605225, + "ref_logps/chosen": -39.468257904052734, + "ref_logps/rejected": -34.59103775024414, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.017789840698242, + "rewards/margins": 1.4517407417297363, + "rewards/rejected": -3.4695305824279785, + "step": 2613 + }, + { + "epoch": 2.47, + "grad_norm": 15.879584312438965, + "learning_rate": 9.846100034977264e-08, + "logps/chosen": -57.7434196472168, + "logps/rejected": -81.94081115722656, + "loss": 0.1865, + "losses/dpo": 0.3243858218193054, + "losses/sft": 2.1442439556121826, + "losses/total": 0.3243858218193054, + "ref_logps/chosen": -35.57036590576172, + "ref_logps/rejected": -36.337947845458984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2173056602478027, + "rewards/margins": 2.3429813385009766, + "rewards/rejected": -4.560286521911621, + "step": 2614 + }, + { + "epoch": 2.47, + "grad_norm": 27.229232788085938, + "learning_rate": 9.828611402588317e-08, + "logps/chosen": -57.22346496582031, + "logps/rejected": -92.30690002441406, + "loss": 0.2385, + "losses/dpo": 0.048968106508255005, + "losses/sft": 1.6596750020980835, + "losses/total": 0.048968106508255005, + "ref_logps/chosen": -37.21009826660156, + "ref_logps/rejected": -47.969364166259766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.001336097717285, + "rewards/margins": 2.432417869567871, + "rewards/rejected": -4.433753967285156, + "step": 2615 + }, + { + "epoch": 2.47, + "grad_norm": 19.067716598510742, + "learning_rate": 9.811122770199371e-08, + "logps/chosen": -54.6844482421875, + "logps/rejected": -95.34724426269531, + "loss": 0.1926, + "losses/dpo": 0.05017391964793205, + "losses/sft": 1.4284586906433105, + "losses/total": 0.05017391964793205, + "ref_logps/chosen": -36.69255065917969, + "ref_logps/rejected": -48.41150665283203, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.799189805984497, + "rewards/margins": 2.8943839073181152, + "rewards/rejected": -4.693573951721191, + "step": 2616 + }, + { + "epoch": 2.47, + "grad_norm": 33.83940124511719, + "learning_rate": 9.793634137810424e-08, + "logps/chosen": -51.00293731689453, + "logps/rejected": -67.70549011230469, + "loss": 0.3807, + "losses/dpo": 0.34572577476501465, + "losses/sft": 1.9972656965255737, + "losses/total": 0.34572577476501465, + "ref_logps/chosen": -30.170448303222656, + "ref_logps/rejected": -34.406410217285156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.083249092102051, + "rewards/margins": 1.2466588020324707, + "rewards/rejected": -3.3299078941345215, + "step": 2617 + }, + { + "epoch": 2.47, + "grad_norm": 17.854434967041016, + "learning_rate": 9.776145505421475e-08, + "logps/chosen": -57.60009002685547, + "logps/rejected": -89.62419128417969, + "loss": 0.1826, + "losses/dpo": 0.082145094871521, + "losses/sft": 1.8801279067993164, + "losses/total": 0.082145094871521, + "ref_logps/chosen": -41.04216003417969, + "ref_logps/rejected": -45.35902404785156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6557927131652832, + "rewards/margins": 2.770723819732666, + "rewards/rejected": -4.426516532897949, + "step": 2618 + }, + { + "epoch": 2.47, + "grad_norm": 39.21747970581055, + "learning_rate": 9.758656873032529e-08, + "logps/chosen": -70.81407165527344, + "logps/rejected": -74.14389038085938, + "loss": 0.3921, + "losses/dpo": 0.4029538333415985, + "losses/sft": 1.9141440391540527, + "losses/total": 0.4029538333415985, + "ref_logps/chosen": -47.187129974365234, + "ref_logps/rejected": -36.16504669189453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.362694501876831, + "rewards/margins": 1.4351894855499268, + "rewards/rejected": -3.797883987426758, + "step": 2619 + }, + { + "epoch": 2.47, + "grad_norm": 33.028900146484375, + "learning_rate": 9.741168240643581e-08, + "logps/chosen": -69.96118927001953, + "logps/rejected": -98.61248016357422, + "loss": 0.3076, + "losses/dpo": 0.31740495562553406, + "losses/sft": 1.7501455545425415, + "losses/total": 0.31740495562553406, + "ref_logps/chosen": -44.765323638916016, + "ref_logps/rejected": -47.95121383666992, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5195865631103516, + "rewards/margins": 2.546539783477783, + "rewards/rejected": -5.066126346588135, + "step": 2620 + }, + { + "epoch": 2.47, + "grad_norm": 43.19218444824219, + "learning_rate": 9.723679608254634e-08, + "logps/chosen": -70.31956481933594, + "logps/rejected": -97.96747589111328, + "loss": 0.3709, + "losses/dpo": 0.26920267939567566, + "losses/sft": 2.482292890548706, + "losses/total": 0.26920267939567566, + "ref_logps/chosen": -45.48460388183594, + "ref_logps/rejected": -50.64182662963867, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4834964275360107, + "rewards/margins": 2.249068260192871, + "rewards/rejected": -4.732564926147461, + "step": 2621 + }, + { + "epoch": 2.48, + "grad_norm": 17.518869400024414, + "learning_rate": 9.706190975865686e-08, + "logps/chosen": -54.61959457397461, + "logps/rejected": -91.7092056274414, + "loss": 0.2001, + "losses/dpo": 0.10851458460092545, + "losses/sft": 2.083570718765259, + "losses/total": 0.10851458460092545, + "ref_logps/chosen": -37.579498291015625, + "ref_logps/rejected": -50.20752716064453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7040092945098877, + "rewards/margins": 2.4461588859558105, + "rewards/rejected": -4.150168418884277, + "step": 2622 + }, + { + "epoch": 2.48, + "grad_norm": 21.378183364868164, + "learning_rate": 9.68870234347674e-08, + "logps/chosen": -52.947940826416016, + "logps/rejected": -71.7638931274414, + "loss": 0.2138, + "losses/dpo": 0.1524897813796997, + "losses/sft": 2.109269857406616, + "losses/total": 0.1524897813796997, + "ref_logps/chosen": -32.70970916748047, + "ref_logps/rejected": -32.23436737060547, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.023822784423828, + "rewards/margins": 1.929129719734192, + "rewards/rejected": -3.9529523849487305, + "step": 2623 + }, + { + "epoch": 2.48, + "grad_norm": 27.453723907470703, + "learning_rate": 9.671213711087793e-08, + "logps/chosen": -59.317291259765625, + "logps/rejected": -81.62176513671875, + "loss": 0.253, + "losses/dpo": 0.1010105311870575, + "losses/sft": 2.640120506286621, + "losses/total": 0.1010105311870575, + "ref_logps/chosen": -42.9470329284668, + "ref_logps/rejected": -43.72551727294922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.637026071548462, + "rewards/margins": 2.1525988578796387, + "rewards/rejected": -3.7896246910095215, + "step": 2624 + }, + { + "epoch": 2.48, + "grad_norm": 32.864959716796875, + "learning_rate": 9.653725078698844e-08, + "logps/chosen": -48.011375427246094, + "logps/rejected": -68.41641998291016, + "loss": 0.398, + "losses/dpo": 0.38649439811706543, + "losses/sft": 1.7968426942825317, + "losses/total": 0.38649439811706543, + "ref_logps/chosen": -29.747228622436523, + "ref_logps/rejected": -32.32157516479492, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8264145851135254, + "rewards/margins": 1.7830703258514404, + "rewards/rejected": -3.6094846725463867, + "step": 2625 + }, + { + "epoch": 2.48, + "grad_norm": 35.30326843261719, + "learning_rate": 9.636236446309898e-08, + "logps/chosen": -64.98481750488281, + "logps/rejected": -89.75151824951172, + "loss": 0.3121, + "losses/dpo": 0.2173253893852234, + "losses/sft": 2.1693167686462402, + "losses/total": 0.2173253893852234, + "ref_logps/chosen": -42.099998474121094, + "ref_logps/rejected": -49.113101959228516, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2884809970855713, + "rewards/margins": 1.7753610610961914, + "rewards/rejected": -4.063841819763184, + "step": 2626 + }, + { + "epoch": 2.48, + "grad_norm": 13.172626495361328, + "learning_rate": 9.618747813920951e-08, + "logps/chosen": -64.95667266845703, + "logps/rejected": -95.32521057128906, + "loss": 0.1226, + "losses/dpo": 0.14937573671340942, + "losses/sft": 1.979300618171692, + "losses/total": 0.14937573671340942, + "ref_logps/chosen": -45.157470703125, + "ref_logps/rejected": -49.7628173828125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9799200296401978, + "rewards/margins": 2.576319694519043, + "rewards/rejected": -4.556240081787109, + "step": 2627 + }, + { + "epoch": 2.48, + "grad_norm": 46.55250930786133, + "learning_rate": 9.601259181532003e-08, + "logps/chosen": -68.29072570800781, + "logps/rejected": -87.84684753417969, + "loss": 0.4144, + "losses/dpo": 0.7455044984817505, + "losses/sft": 2.2339420318603516, + "losses/total": 0.7455044984817505, + "ref_logps/chosen": -42.974945068359375, + "ref_logps/rejected": -46.17321014404297, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5315780639648438, + "rewards/margins": 1.6357853412628174, + "rewards/rejected": -4.16736364364624, + "step": 2628 + }, + { + "epoch": 2.48, + "grad_norm": 35.92108917236328, + "learning_rate": 9.583770549143057e-08, + "logps/chosen": -62.43308639526367, + "logps/rejected": -70.52333068847656, + "loss": 0.3736, + "losses/dpo": 0.18482337892055511, + "losses/sft": 1.562248945236206, + "losses/total": 0.18482337892055511, + "ref_logps/chosen": -38.28158187866211, + "ref_logps/rejected": -32.70833969116211, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4151501655578613, + "rewards/margins": 1.3663485050201416, + "rewards/rejected": -3.781498670578003, + "step": 2629 + }, + { + "epoch": 2.48, + "grad_norm": 27.783422470092773, + "learning_rate": 9.56628191675411e-08, + "logps/chosen": -56.60566711425781, + "logps/rejected": -82.60130310058594, + "loss": 0.3281, + "losses/dpo": 0.22605109214782715, + "losses/sft": 1.8187787532806396, + "losses/total": 0.22605109214782715, + "ref_logps/chosen": -37.79899215698242, + "ref_logps/rejected": -44.11222839355469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8806679248809814, + "rewards/margins": 1.9682402610778809, + "rewards/rejected": -3.8489081859588623, + "step": 2630 + }, + { + "epoch": 2.48, + "grad_norm": 28.24785614013672, + "learning_rate": 9.548793284365162e-08, + "logps/chosen": -44.786865234375, + "logps/rejected": -81.65611267089844, + "loss": 0.2532, + "losses/dpo": 0.40884193778038025, + "losses/sft": 2.1563503742218018, + "losses/total": 0.40884193778038025, + "ref_logps/chosen": -27.189533233642578, + "ref_logps/rejected": -44.9838752746582, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7597332000732422, + "rewards/margins": 1.9074907302856445, + "rewards/rejected": -3.6672239303588867, + "step": 2631 + }, + { + "epoch": 2.49, + "grad_norm": 6.792993068695068, + "learning_rate": 9.531304651976215e-08, + "logps/chosen": -51.781490325927734, + "logps/rejected": -101.36820983886719, + "loss": 0.0638, + "losses/dpo": 0.09195427596569061, + "losses/sft": 1.6872437000274658, + "losses/total": 0.09195427596569061, + "ref_logps/chosen": -33.46580123901367, + "ref_logps/rejected": -50.52802658081055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.831568717956543, + "rewards/margins": 3.2524495124816895, + "rewards/rejected": -5.084018230438232, + "step": 2632 + }, + { + "epoch": 2.49, + "grad_norm": 26.637428283691406, + "learning_rate": 9.513816019587268e-08, + "logps/chosen": -58.11437225341797, + "logps/rejected": -88.03756713867188, + "loss": 0.2448, + "losses/dpo": 0.10347309708595276, + "losses/sft": 2.7814109325408936, + "losses/total": 0.10347309708595276, + "ref_logps/chosen": -34.66980743408203, + "ref_logps/rejected": -45.40983581542969, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3444571495056152, + "rewards/margins": 1.918316125869751, + "rewards/rejected": -4.262773036956787, + "step": 2633 + }, + { + "epoch": 2.49, + "grad_norm": 24.358135223388672, + "learning_rate": 9.49632738719832e-08, + "logps/chosen": -61.41733169555664, + "logps/rejected": -99.54876708984375, + "loss": 0.2404, + "losses/dpo": 0.16507381200790405, + "losses/sft": 2.5139079093933105, + "losses/total": 0.16507381200790405, + "ref_logps/chosen": -38.32343292236328, + "ref_logps/rejected": -55.90496063232422, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.309389591217041, + "rewards/margins": 2.0549912452697754, + "rewards/rejected": -4.364380836486816, + "step": 2634 + }, + { + "epoch": 2.49, + "grad_norm": 21.77463150024414, + "learning_rate": 9.478838754809374e-08, + "logps/chosen": -45.654022216796875, + "logps/rejected": -88.64103698730469, + "loss": 0.2014, + "losses/dpo": 0.24121077358722687, + "losses/sft": 2.3094232082366943, + "losses/total": 0.24121077358722687, + "ref_logps/chosen": -29.739646911621094, + "ref_logps/rejected": -46.01719284057617, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5914379358291626, + "rewards/margins": 2.6709470748901367, + "rewards/rejected": -4.262385368347168, + "step": 2635 + }, + { + "epoch": 2.49, + "grad_norm": 30.00931739807129, + "learning_rate": 9.461350122420427e-08, + "logps/chosen": -52.32457733154297, + "logps/rejected": -72.18357849121094, + "loss": 0.3245, + "losses/dpo": 0.15869125723838806, + "losses/sft": 1.7572306394577026, + "losses/total": 0.15869125723838806, + "ref_logps/chosen": -33.57781982421875, + "ref_logps/rejected": -38.852386474609375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8746757507324219, + "rewards/margins": 1.458443284034729, + "rewards/rejected": -3.3331191539764404, + "step": 2636 + }, + { + "epoch": 2.49, + "grad_norm": 18.77487564086914, + "learning_rate": 9.443861490031479e-08, + "logps/chosen": -39.521217346191406, + "logps/rejected": -74.77930450439453, + "loss": 0.2339, + "losses/dpo": 0.12782526016235352, + "losses/sft": 1.8056789636611938, + "losses/total": 0.12782526016235352, + "ref_logps/chosen": -28.591285705566406, + "ref_logps/rejected": -38.67945098876953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0929930210113525, + "rewards/margins": 2.5169925689697266, + "rewards/rejected": -3.609985828399658, + "step": 2637 + }, + { + "epoch": 2.49, + "grad_norm": 25.57986831665039, + "learning_rate": 9.426372857642532e-08, + "logps/chosen": -57.35826110839844, + "logps/rejected": -80.22688293457031, + "loss": 0.3642, + "losses/dpo": 0.3260922133922577, + "losses/sft": 1.7510254383087158, + "losses/total": 0.3260922133922577, + "ref_logps/chosen": -34.019405364990234, + "ref_logps/rejected": -41.680458068847656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.333885669708252, + "rewards/margins": 1.520756483078003, + "rewards/rejected": -3.854642391204834, + "step": 2638 + }, + { + "epoch": 2.49, + "grad_norm": 21.60886573791504, + "learning_rate": 9.408884225253584e-08, + "logps/chosen": -64.35485076904297, + "logps/rejected": -79.99525451660156, + "loss": 0.2163, + "losses/dpo": 0.06006474047899246, + "losses/sft": 2.2289817333221436, + "losses/total": 0.06006474047899246, + "ref_logps/chosen": -47.54611587524414, + "ref_logps/rejected": -40.913818359375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6808732748031616, + "rewards/margins": 2.2272701263427734, + "rewards/rejected": -3.9081432819366455, + "step": 2639 + }, + { + "epoch": 2.49, + "grad_norm": 22.581619262695312, + "learning_rate": 9.391395592864637e-08, + "logps/chosen": -51.91297912597656, + "logps/rejected": -88.46798706054688, + "loss": 0.3111, + "losses/dpo": 0.7317584156990051, + "losses/sft": 2.3876187801361084, + "losses/total": 0.7317584156990051, + "ref_logps/chosen": -32.81362533569336, + "ref_logps/rejected": -47.68986511230469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.909935474395752, + "rewards/margins": 2.167876720428467, + "rewards/rejected": -4.077812194824219, + "step": 2640 + }, + { + "epoch": 2.49, + "grad_norm": 17.824546813964844, + "learning_rate": 9.37390696047569e-08, + "logps/chosen": -66.83539581298828, + "logps/rejected": -86.10513305664062, + "loss": 0.1963, + "losses/dpo": 0.05992724001407623, + "losses/sft": 2.7904083728790283, + "losses/total": 0.05992724001407623, + "ref_logps/chosen": -45.83329772949219, + "ref_logps/rejected": -44.290245056152344, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1002094745635986, + "rewards/margins": 2.0812788009643555, + "rewards/rejected": -4.181488037109375, + "step": 2641 + }, + { + "epoch": 2.49, + "grad_norm": 33.15191650390625, + "learning_rate": 9.356418328086744e-08, + "logps/chosen": -50.72719192504883, + "logps/rejected": -81.42460632324219, + "loss": 0.3367, + "losses/dpo": 0.31758102774620056, + "losses/sft": 1.3459575176239014, + "losses/total": 0.31758102774620056, + "ref_logps/chosen": -32.37253952026367, + "ref_logps/rejected": -47.08313751220703, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.835465431213379, + "rewards/margins": 1.5986818075180054, + "rewards/rejected": -3.4341471195220947, + "step": 2642 + }, + { + "epoch": 2.5, + "grad_norm": 18.970203399658203, + "learning_rate": 9.338929695697796e-08, + "logps/chosen": -62.68254852294922, + "logps/rejected": -97.68290710449219, + "loss": 0.1524, + "losses/dpo": 0.2148987352848053, + "losses/sft": 2.4488165378570557, + "losses/total": 0.2148987352848053, + "ref_logps/chosen": -41.273277282714844, + "ref_logps/rejected": -47.41449737548828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1409270763397217, + "rewards/margins": 2.8859143257141113, + "rewards/rejected": -5.026841163635254, + "step": 2643 + }, + { + "epoch": 2.5, + "grad_norm": 16.483991622924805, + "learning_rate": 9.321441063308849e-08, + "logps/chosen": -56.09375762939453, + "logps/rejected": -82.86587524414062, + "loss": 0.1589, + "losses/dpo": 0.14035722613334656, + "losses/sft": 1.6219528913497925, + "losses/total": 0.14035722613334656, + "ref_logps/chosen": -39.73691940307617, + "ref_logps/rejected": -41.95567321777344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6356831789016724, + "rewards/margins": 2.455336809158325, + "rewards/rejected": -4.091020107269287, + "step": 2644 + }, + { + "epoch": 2.5, + "grad_norm": 23.772016525268555, + "learning_rate": 9.303952430919903e-08, + "logps/chosen": -45.77806091308594, + "logps/rejected": -71.56695556640625, + "loss": 0.291, + "losses/dpo": 0.3063003718852997, + "losses/sft": 1.6803115606307983, + "losses/total": 0.3063003718852997, + "ref_logps/chosen": -29.976591110229492, + "ref_logps/rejected": -39.285186767578125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5801470279693604, + "rewards/margins": 1.6480300426483154, + "rewards/rejected": -3.228177070617676, + "step": 2645 + }, + { + "epoch": 2.5, + "grad_norm": 21.726228713989258, + "learning_rate": 9.286463798530954e-08, + "logps/chosen": -53.46764373779297, + "logps/rejected": -83.09614562988281, + "loss": 0.2369, + "losses/dpo": 0.22365626692771912, + "losses/sft": 1.9487669467926025, + "losses/total": 0.22365626692771912, + "ref_logps/chosen": -33.389488220214844, + "ref_logps/rejected": -41.57721710205078, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0078158378601074, + "rewards/margins": 2.1440773010253906, + "rewards/rejected": -4.15189266204834, + "step": 2646 + }, + { + "epoch": 2.5, + "grad_norm": 19.15696144104004, + "learning_rate": 9.268975166142006e-08, + "logps/chosen": -55.86904525756836, + "logps/rejected": -96.14935302734375, + "loss": 0.199, + "losses/dpo": 0.0832444280385971, + "losses/sft": 2.557432174682617, + "losses/total": 0.0832444280385971, + "ref_logps/chosen": -35.647789001464844, + "ref_logps/rejected": -49.84021759033203, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.022125720977783, + "rewards/margins": 2.60878849029541, + "rewards/rejected": -4.630914211273193, + "step": 2647 + }, + { + "epoch": 2.5, + "grad_norm": 19.90987777709961, + "learning_rate": 9.25148653375306e-08, + "logps/chosen": -58.71638107299805, + "logps/rejected": -79.5850601196289, + "loss": 0.1919, + "losses/dpo": 0.09517229348421097, + "losses/sft": 2.3693816661834717, + "losses/total": 0.09517229348421097, + "ref_logps/chosen": -41.172203063964844, + "ref_logps/rejected": -42.81840515136719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7544176578521729, + "rewards/margins": 1.9222476482391357, + "rewards/rejected": -3.6766653060913086, + "step": 2648 + }, + { + "epoch": 2.5, + "grad_norm": 19.189653396606445, + "learning_rate": 9.233997901364113e-08, + "logps/chosen": -48.83949279785156, + "logps/rejected": -82.58723449707031, + "loss": 0.2218, + "losses/dpo": 0.28850340843200684, + "losses/sft": 1.326021432876587, + "losses/total": 0.28850340843200684, + "ref_logps/chosen": -32.46562957763672, + "ref_logps/rejected": -41.25714874267578, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6373860836029053, + "rewards/margins": 2.4956226348876953, + "rewards/rejected": -4.1330084800720215, + "step": 2649 + }, + { + "epoch": 2.5, + "grad_norm": 14.481467247009277, + "learning_rate": 9.216509268975166e-08, + "logps/chosen": -60.59910583496094, + "logps/rejected": -90.18119812011719, + "loss": 0.153, + "losses/dpo": 0.04702283442020416, + "losses/sft": 2.3079562187194824, + "losses/total": 0.04702283442020416, + "ref_logps/chosen": -38.13494873046875, + "ref_logps/rejected": -44.97789001464844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.246415615081787, + "rewards/margins": 2.2739148139953613, + "rewards/rejected": -4.520330429077148, + "step": 2650 + }, + { + "epoch": 2.5, + "grad_norm": 27.231679916381836, + "learning_rate": 9.19902063658622e-08, + "logps/chosen": -53.07997131347656, + "logps/rejected": -84.53821563720703, + "loss": 0.2813, + "losses/dpo": 0.15879404544830322, + "losses/sft": 1.921830415725708, + "losses/total": 0.15879404544830322, + "ref_logps/chosen": -32.77253723144531, + "ref_logps/rejected": -41.605411529541016, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0307438373565674, + "rewards/margins": 2.26253604888916, + "rewards/rejected": -4.293279647827148, + "step": 2651 + }, + { + "epoch": 2.5, + "grad_norm": 32.732086181640625, + "learning_rate": 9.181532004197272e-08, + "logps/chosen": -58.05944061279297, + "logps/rejected": -66.1270751953125, + "loss": 0.3329, + "losses/dpo": 0.5357910990715027, + "losses/sft": 1.9927314519882202, + "losses/total": 0.5357910990715027, + "ref_logps/chosen": -39.23957061767578, + "ref_logps/rejected": -32.11646270751953, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8819873332977295, + "rewards/margins": 1.5190739631652832, + "rewards/rejected": -3.4010612964630127, + "step": 2652 + }, + { + "epoch": 2.51, + "grad_norm": 18.582731246948242, + "learning_rate": 9.164043371808323e-08, + "logps/chosen": -62.617984771728516, + "logps/rejected": -90.97095489501953, + "loss": 0.2075, + "losses/dpo": 0.2849293351173401, + "losses/sft": 2.2467405796051025, + "losses/total": 0.2849293351173401, + "ref_logps/chosen": -41.20893096923828, + "ref_logps/rejected": -47.07484436035156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1409051418304443, + "rewards/margins": 2.248706340789795, + "rewards/rejected": -4.38961124420166, + "step": 2653 + }, + { + "epoch": 2.51, + "grad_norm": 13.544334411621094, + "learning_rate": 9.146554739419376e-08, + "logps/chosen": -63.693058013916016, + "logps/rejected": -98.46592712402344, + "loss": 0.1346, + "losses/dpo": 0.22762365639209747, + "losses/sft": 2.903892755508423, + "losses/total": 0.22762365639209747, + "ref_logps/chosen": -39.51471710205078, + "ref_logps/rejected": -48.387489318847656, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4178342819213867, + "rewards/margins": 2.590010643005371, + "rewards/rejected": -5.007844924926758, + "step": 2654 + }, + { + "epoch": 2.51, + "grad_norm": 31.12428092956543, + "learning_rate": 9.12906610703043e-08, + "logps/chosen": -51.019309997558594, + "logps/rejected": -82.58992767333984, + "loss": 0.4647, + "losses/dpo": 0.11014776676893234, + "losses/sft": 2.443131923675537, + "losses/total": 0.11014776676893234, + "ref_logps/chosen": -29.98655128479004, + "ref_logps/rejected": -43.18667221069336, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.103276014328003, + "rewards/margins": 1.8370500802993774, + "rewards/rejected": -3.940326452255249, + "step": 2655 + }, + { + "epoch": 2.51, + "grad_norm": 21.865400314331055, + "learning_rate": 9.111577474641482e-08, + "logps/chosen": -58.064613342285156, + "logps/rejected": -79.7762451171875, + "loss": 0.2234, + "losses/dpo": 0.20587067306041718, + "losses/sft": 1.972915530204773, + "losses/total": 0.20587067306041718, + "ref_logps/chosen": -37.52762985229492, + "ref_logps/rejected": -41.9873046875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0536983013153076, + "rewards/margins": 1.7251962423324585, + "rewards/rejected": -3.7788944244384766, + "step": 2656 + }, + { + "epoch": 2.51, + "grad_norm": 20.920183181762695, + "learning_rate": 9.094088842252535e-08, + "logps/chosen": -70.60042572021484, + "logps/rejected": -90.38031005859375, + "loss": 0.2386, + "losses/dpo": 0.2561028003692627, + "losses/sft": 2.2593626976013184, + "losses/total": 0.2561028003692627, + "ref_logps/chosen": -45.15994644165039, + "ref_logps/rejected": -46.44893264770508, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5440478324890137, + "rewards/margins": 1.849090337753296, + "rewards/rejected": -4.3931379318237305, + "step": 2657 + }, + { + "epoch": 2.51, + "grad_norm": 15.381429672241211, + "learning_rate": 9.076600209863589e-08, + "logps/chosen": -48.48783493041992, + "logps/rejected": -85.4599380493164, + "loss": 0.1252, + "losses/dpo": 0.16669030487537384, + "losses/sft": 1.9194079637527466, + "losses/total": 0.16669030487537384, + "ref_logps/chosen": -32.741188049316406, + "ref_logps/rejected": -43.610870361328125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5746644735336304, + "rewards/margins": 2.6102418899536133, + "rewards/rejected": -4.184906482696533, + "step": 2658 + }, + { + "epoch": 2.51, + "grad_norm": 21.14702033996582, + "learning_rate": 9.059111577474642e-08, + "logps/chosen": -48.063499450683594, + "logps/rejected": -81.52400207519531, + "loss": 0.3334, + "losses/dpo": 0.15759655833244324, + "losses/sft": 1.1650971174240112, + "losses/total": 0.15759655833244324, + "ref_logps/chosen": -31.4576473236084, + "ref_logps/rejected": -42.72325134277344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6605851650238037, + "rewards/margins": 2.2194900512695312, + "rewards/rejected": -3.880075216293335, + "step": 2659 + }, + { + "epoch": 2.51, + "grad_norm": 20.476564407348633, + "learning_rate": 9.041622945085694e-08, + "logps/chosen": -57.79673767089844, + "logps/rejected": -87.91827392578125, + "loss": 0.183, + "losses/dpo": 0.048319194465875626, + "losses/sft": 2.017071485519409, + "losses/total": 0.048319194465875626, + "ref_logps/chosen": -38.672584533691406, + "ref_logps/rejected": -45.02015686035156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9124152660369873, + "rewards/margins": 2.3773961067199707, + "rewards/rejected": -4.289811134338379, + "step": 2660 + }, + { + "epoch": 2.51, + "grad_norm": 19.473453521728516, + "learning_rate": 9.024134312696747e-08, + "logps/chosen": -45.143959045410156, + "logps/rejected": -69.37344360351562, + "loss": 0.2067, + "losses/dpo": 0.21326187252998352, + "losses/sft": 1.9567234516143799, + "losses/total": 0.21326187252998352, + "ref_logps/chosen": -30.596092224121094, + "ref_logps/rejected": -35.80542755126953, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4547865390777588, + "rewards/margins": 1.902014970779419, + "rewards/rejected": -3.3568012714385986, + "step": 2661 + }, + { + "epoch": 2.51, + "grad_norm": 38.11095428466797, + "learning_rate": 9.006645680307799e-08, + "logps/chosen": -59.61377716064453, + "logps/rejected": -80.72429656982422, + "loss": 0.4039, + "losses/dpo": 0.7609592080116272, + "losses/sft": 1.9940719604492188, + "losses/total": 0.7609592080116272, + "ref_logps/chosen": -35.80747985839844, + "ref_logps/rejected": -40.880069732666016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.380629301071167, + "rewards/margins": 1.603793740272522, + "rewards/rejected": -3.9844231605529785, + "step": 2662 + }, + { + "epoch": 2.51, + "grad_norm": 14.014473915100098, + "learning_rate": 8.989157047918852e-08, + "logps/chosen": -53.547298431396484, + "logps/rejected": -102.56118774414062, + "loss": 0.1184, + "losses/dpo": 0.1557226926088333, + "losses/sft": 2.2434537410736084, + "losses/total": 0.1557226926088333, + "ref_logps/chosen": -39.66558074951172, + "ref_logps/rejected": -57.48854446411133, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3881717920303345, + "rewards/margins": 3.1190924644470215, + "rewards/rejected": -4.507264137268066, + "step": 2663 + }, + { + "epoch": 2.52, + "grad_norm": 20.364011764526367, + "learning_rate": 8.971668415529906e-08, + "logps/chosen": -73.42503356933594, + "logps/rejected": -92.31561279296875, + "loss": 0.1569, + "losses/dpo": 0.20702314376831055, + "losses/sft": 2.134514093399048, + "losses/total": 0.20702314376831055, + "ref_logps/chosen": -51.78997802734375, + "ref_logps/rejected": -46.695194244384766, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1635055541992188, + "rewards/margins": 2.398536443710327, + "rewards/rejected": -4.562042236328125, + "step": 2664 + }, + { + "epoch": 2.52, + "grad_norm": 31.631244659423828, + "learning_rate": 8.954179783140958e-08, + "logps/chosen": -67.80427551269531, + "logps/rejected": -77.00192260742188, + "loss": 0.3346, + "losses/dpo": 0.2949151396751404, + "losses/sft": 2.028554677963257, + "losses/total": 0.2949151396751404, + "ref_logps/chosen": -48.55534362792969, + "ref_logps/rejected": -43.106842041015625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.924892544746399, + "rewards/margins": 1.464616298675537, + "rewards/rejected": -3.3895087242126465, + "step": 2665 + }, + { + "epoch": 2.52, + "grad_norm": 16.61313247680664, + "learning_rate": 8.936691150752011e-08, + "logps/chosen": -46.049842834472656, + "logps/rejected": -78.18761444091797, + "loss": 0.1422, + "losses/dpo": 0.06558723002672195, + "losses/sft": 1.3075509071350098, + "losses/total": 0.06558723002672195, + "ref_logps/chosen": -29.03466033935547, + "ref_logps/rejected": -37.812522888183594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7015185356140137, + "rewards/margins": 2.3359904289245605, + "rewards/rejected": -4.037508964538574, + "step": 2666 + }, + { + "epoch": 2.52, + "grad_norm": 18.713289260864258, + "learning_rate": 8.919202518363065e-08, + "logps/chosen": -57.84708786010742, + "logps/rejected": -93.6070556640625, + "loss": 0.178, + "losses/dpo": 0.10467052459716797, + "losses/sft": 1.7599592208862305, + "losses/total": 0.10467052459716797, + "ref_logps/chosen": -41.1086311340332, + "ref_logps/rejected": -49.096893310546875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.673845887184143, + "rewards/margins": 2.777170181274414, + "rewards/rejected": -4.451015949249268, + "step": 2667 + }, + { + "epoch": 2.52, + "grad_norm": 16.586545944213867, + "learning_rate": 8.901713885974116e-08, + "logps/chosen": -71.38506317138672, + "logps/rejected": -106.7919692993164, + "loss": 0.1381, + "losses/dpo": 0.042315613478422165, + "losses/sft": 1.8805639743804932, + "losses/total": 0.042315613478422165, + "ref_logps/chosen": -47.56238555908203, + "ref_logps/rejected": -55.274505615234375, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.382267475128174, + "rewards/margins": 2.769479513168335, + "rewards/rejected": -5.15174674987793, + "step": 2668 + }, + { + "epoch": 2.52, + "grad_norm": 21.321243286132812, + "learning_rate": 8.884225253585169e-08, + "logps/chosen": -56.752079010009766, + "logps/rejected": -91.5252685546875, + "loss": 0.1944, + "losses/dpo": 0.13461831212043762, + "losses/sft": 2.2940287590026855, + "losses/total": 0.13461831212043762, + "ref_logps/chosen": -37.261627197265625, + "ref_logps/rejected": -46.732460021972656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.949044942855835, + "rewards/margins": 2.5302371978759766, + "rewards/rejected": -4.479282379150391, + "step": 2669 + }, + { + "epoch": 2.52, + "grad_norm": 26.792482376098633, + "learning_rate": 8.866736621196221e-08, + "logps/chosen": -56.12199020385742, + "logps/rejected": -73.48761749267578, + "loss": 0.3073, + "losses/dpo": 0.16398343443870544, + "losses/sft": 2.3927910327911377, + "losses/total": 0.16398343443870544, + "ref_logps/chosen": -34.62025451660156, + "ref_logps/rejected": -34.842750549316406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1501736640930176, + "rewards/margins": 1.714313268661499, + "rewards/rejected": -3.8644871711730957, + "step": 2670 + }, + { + "epoch": 2.52, + "grad_norm": 22.86536979675293, + "learning_rate": 8.849247988807275e-08, + "logps/chosen": -58.844024658203125, + "logps/rejected": -111.99443054199219, + "loss": 0.2107, + "losses/dpo": 0.1297515630722046, + "losses/sft": 1.7009419202804565, + "losses/total": 0.1297515630722046, + "ref_logps/chosen": -41.70057678222656, + "ref_logps/rejected": -70.78482055664062, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7143445014953613, + "rewards/margins": 2.406615972518921, + "rewards/rejected": -4.120960235595703, + "step": 2671 + }, + { + "epoch": 2.52, + "grad_norm": 17.73824119567871, + "learning_rate": 8.831759356418328e-08, + "logps/chosen": -49.567779541015625, + "logps/rejected": -93.88105773925781, + "loss": 0.1715, + "losses/dpo": 0.4700128436088562, + "losses/sft": 2.277287483215332, + "losses/total": 0.4700128436088562, + "ref_logps/chosen": -33.02092742919922, + "ref_logps/rejected": -49.19862747192383, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.654685139656067, + "rewards/margins": 2.8135576248168945, + "rewards/rejected": -4.468242645263672, + "step": 2672 + }, + { + "epoch": 2.52, + "grad_norm": 31.842103958129883, + "learning_rate": 8.81427072402938e-08, + "logps/chosen": -56.82010269165039, + "logps/rejected": -84.50248718261719, + "loss": 0.316, + "losses/dpo": 0.45291897654533386, + "losses/sft": 1.5067979097366333, + "losses/total": 0.45291897654533386, + "ref_logps/chosen": -33.516632080078125, + "ref_logps/rejected": -40.809226989746094, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.3303468227386475, + "rewards/margins": 2.0389795303344727, + "rewards/rejected": -4.369326591491699, + "step": 2673 + }, + { + "epoch": 2.53, + "grad_norm": 25.733028411865234, + "learning_rate": 8.796782091640434e-08, + "logps/chosen": -45.055824279785156, + "logps/rejected": -68.95246124267578, + "loss": 0.3068, + "losses/dpo": 0.18253163993358612, + "losses/sft": 1.8108981847763062, + "losses/total": 0.18253163993358612, + "ref_logps/chosen": -27.325088500976562, + "ref_logps/rejected": -33.991943359375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.773073434829712, + "rewards/margins": 1.7229784727096558, + "rewards/rejected": -3.496051788330078, + "step": 2674 + }, + { + "epoch": 2.53, + "grad_norm": 24.55254554748535, + "learning_rate": 8.779293459251486e-08, + "logps/chosen": -62.67566680908203, + "logps/rejected": -98.48059844970703, + "loss": 0.1498, + "losses/dpo": 0.051351405680179596, + "losses/sft": 1.9819999933242798, + "losses/total": 0.051351405680179596, + "ref_logps/chosen": -38.56451416015625, + "ref_logps/rejected": -48.215606689453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4111151695251465, + "rewards/margins": 2.6153836250305176, + "rewards/rejected": -5.026498794555664, + "step": 2675 + }, + { + "epoch": 2.53, + "grad_norm": 15.96773910522461, + "learning_rate": 8.761804826862538e-08, + "logps/chosen": -59.49025344848633, + "logps/rejected": -101.77914428710938, + "loss": 0.1423, + "losses/dpo": 0.08034539967775345, + "losses/sft": 2.561227560043335, + "losses/total": 0.08034539967775345, + "ref_logps/chosen": -37.52191162109375, + "ref_logps/rejected": -56.185550689697266, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.196834087371826, + "rewards/margins": 2.36252498626709, + "rewards/rejected": -4.559359550476074, + "step": 2676 + }, + { + "epoch": 2.53, + "grad_norm": 21.04685401916504, + "learning_rate": 8.744316194473592e-08, + "logps/chosen": -49.12049102783203, + "logps/rejected": -87.42820739746094, + "loss": 0.1864, + "losses/dpo": 0.14930391311645508, + "losses/sft": 1.7568103075027466, + "losses/total": 0.14930391311645508, + "ref_logps/chosen": -29.447824478149414, + "ref_logps/rejected": -41.89769744873047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.96726655960083, + "rewards/margins": 2.585784435272217, + "rewards/rejected": -4.553050994873047, + "step": 2677 + }, + { + "epoch": 2.53, + "grad_norm": 30.19243621826172, + "learning_rate": 8.726827562084645e-08, + "logps/chosen": -63.23504638671875, + "logps/rejected": -87.2403335571289, + "loss": 0.2948, + "losses/dpo": 0.17851522564888, + "losses/sft": 2.166452169418335, + "losses/total": 0.17851522564888, + "ref_logps/chosen": -41.614871978759766, + "ref_logps/rejected": -45.09661865234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.162017822265625, + "rewards/margins": 2.052354335784912, + "rewards/rejected": -4.214372158050537, + "step": 2678 + }, + { + "epoch": 2.53, + "grad_norm": 17.124975204467773, + "learning_rate": 8.709338929695697e-08, + "logps/chosen": -52.939613342285156, + "logps/rejected": -80.61222839355469, + "loss": 0.1665, + "losses/dpo": 0.2314153015613556, + "losses/sft": 1.5011587142944336, + "losses/total": 0.2314153015613556, + "ref_logps/chosen": -31.999927520751953, + "ref_logps/rejected": -33.60721969604492, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0939688682556152, + "rewards/margins": 2.606532573699951, + "rewards/rejected": -4.700501441955566, + "step": 2679 + }, + { + "epoch": 2.53, + "grad_norm": 26.842926025390625, + "learning_rate": 8.691850297306751e-08, + "logps/chosen": -53.796592712402344, + "logps/rejected": -80.64501190185547, + "loss": 0.2724, + "losses/dpo": 0.22778701782226562, + "losses/sft": 1.9416782855987549, + "losses/total": 0.22778701782226562, + "ref_logps/chosen": -33.74138641357422, + "ref_logps/rejected": -41.23072052001953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.005521059036255, + "rewards/margins": 1.9359074831008911, + "rewards/rejected": -3.9414284229278564, + "step": 2680 + }, + { + "epoch": 2.53, + "grad_norm": 14.247771263122559, + "learning_rate": 8.674361664917804e-08, + "logps/chosen": -60.75342559814453, + "logps/rejected": -100.37091064453125, + "loss": 0.0971, + "losses/dpo": 0.07497790455818176, + "losses/sft": 1.9749066829681396, + "losses/total": 0.07497790455818176, + "ref_logps/chosen": -36.79957962036133, + "ref_logps/rejected": -47.23640441894531, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3953840732574463, + "rewards/margins": 2.9180665016174316, + "rewards/rejected": -5.313450813293457, + "step": 2681 + }, + { + "epoch": 2.53, + "grad_norm": 25.20327377319336, + "learning_rate": 8.656873032528855e-08, + "logps/chosen": -60.66781997680664, + "logps/rejected": -85.57939147949219, + "loss": 0.256, + "losses/dpo": 0.4076347053050995, + "losses/sft": 2.5626718997955322, + "losses/total": 0.4076347053050995, + "ref_logps/chosen": -38.370948791503906, + "ref_logps/rejected": -44.13459396362305, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.229686975479126, + "rewards/margins": 1.914792776107788, + "rewards/rejected": -4.144479751586914, + "step": 2682 + }, + { + "epoch": 2.53, + "grad_norm": 21.946439743041992, + "learning_rate": 8.639384400139909e-08, + "logps/chosen": -51.11888122558594, + "logps/rejected": -80.06268310546875, + "loss": 0.1651, + "losses/dpo": 0.15955320000648499, + "losses/sft": 1.7931838035583496, + "losses/total": 0.15955320000648499, + "ref_logps/chosen": -36.465118408203125, + "ref_logps/rejected": -40.24945068359375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4653767347335815, + "rewards/margins": 2.515946626663208, + "rewards/rejected": -3.981323480606079, + "step": 2683 + }, + { + "epoch": 2.53, + "grad_norm": 22.575918197631836, + "learning_rate": 8.621895767750962e-08, + "logps/chosen": -53.03132629394531, + "logps/rejected": -88.59447479248047, + "loss": 0.1848, + "losses/dpo": 0.223446786403656, + "losses/sft": 2.388427257537842, + "losses/total": 0.223446786403656, + "ref_logps/chosen": -31.093069076538086, + "ref_logps/rejected": -40.585906982421875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1938254833221436, + "rewards/margins": 2.6070315837860107, + "rewards/rejected": -4.8008575439453125, + "step": 2684 + }, + { + "epoch": 2.54, + "grad_norm": 25.52772331237793, + "learning_rate": 8.604407135362014e-08, + "logps/chosen": -53.61585998535156, + "logps/rejected": -75.51603698730469, + "loss": 0.3007, + "losses/dpo": 0.28772467374801636, + "losses/sft": 3.019272804260254, + "losses/total": 0.28772467374801636, + "ref_logps/chosen": -32.3934440612793, + "ref_logps/rejected": -36.954978942871094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.122241735458374, + "rewards/margins": 1.733864188194275, + "rewards/rejected": -3.8561058044433594, + "step": 2685 + }, + { + "epoch": 2.54, + "grad_norm": 23.419788360595703, + "learning_rate": 8.586918502973067e-08, + "logps/chosen": -51.07176208496094, + "logps/rejected": -74.2645263671875, + "loss": 0.2598, + "losses/dpo": 0.22374330461025238, + "losses/sft": 2.271658420562744, + "losses/total": 0.22374330461025238, + "ref_logps/chosen": -30.589229583740234, + "ref_logps/rejected": -34.38256072998047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0482535362243652, + "rewards/margins": 1.9399421215057373, + "rewards/rejected": -3.9881958961486816, + "step": 2686 + }, + { + "epoch": 2.54, + "grad_norm": 28.787511825561523, + "learning_rate": 8.56942987058412e-08, + "logps/chosen": -58.16155242919922, + "logps/rejected": -75.07455444335938, + "loss": 0.3659, + "losses/dpo": 0.2110963761806488, + "losses/sft": 2.0601718425750732, + "losses/total": 0.2110963761806488, + "ref_logps/chosen": -34.536338806152344, + "ref_logps/rejected": -38.25844955444336, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3625218868255615, + "rewards/margins": 1.3190889358520508, + "rewards/rejected": -3.6816108226776123, + "step": 2687 + }, + { + "epoch": 2.54, + "grad_norm": 34.83714294433594, + "learning_rate": 8.551941238195173e-08, + "logps/chosen": -51.61262512207031, + "logps/rejected": -82.34982299804688, + "loss": 0.319, + "losses/dpo": 0.04639798402786255, + "losses/sft": 1.6721293926239014, + "losses/total": 0.04639798402786255, + "ref_logps/chosen": -36.343528747558594, + "ref_logps/rejected": -46.852760314941406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5269098281860352, + "rewards/margins": 2.022796630859375, + "rewards/rejected": -3.549706220626831, + "step": 2688 + }, + { + "epoch": 2.54, + "grad_norm": 35.27690887451172, + "learning_rate": 8.534452605806224e-08, + "logps/chosen": -72.50088500976562, + "logps/rejected": -93.8785400390625, + "loss": 0.3794, + "losses/dpo": 0.2738884687423706, + "losses/sft": 2.269841432571411, + "losses/total": 0.2738884687423706, + "ref_logps/chosen": -44.61711120605469, + "ref_logps/rejected": -46.61103057861328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7883777618408203, + "rewards/margins": 1.9383740425109863, + "rewards/rejected": -4.726752281188965, + "step": 2689 + }, + { + "epoch": 2.54, + "grad_norm": 22.528913497924805, + "learning_rate": 8.516963973417278e-08, + "logps/chosen": -57.423797607421875, + "logps/rejected": -78.01254272460938, + "loss": 0.2056, + "losses/dpo": 0.29599159955978394, + "losses/sft": 2.3086752891540527, + "losses/total": 0.29599159955978394, + "ref_logps/chosen": -38.14057159423828, + "ref_logps/rejected": -36.20295715332031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9283225536346436, + "rewards/margins": 2.2526357173919678, + "rewards/rejected": -4.180958271026611, + "step": 2690 + }, + { + "epoch": 2.54, + "grad_norm": 41.20805740356445, + "learning_rate": 8.499475341028331e-08, + "logps/chosen": -54.827720642089844, + "logps/rejected": -71.26068878173828, + "loss": 0.4547, + "losses/dpo": 0.1558263748884201, + "losses/sft": 1.9702070951461792, + "losses/total": 0.1558263748884201, + "ref_logps/chosen": -35.821510314941406, + "ref_logps/rejected": -35.55598449707031, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.900620937347412, + "rewards/margins": 1.6698490381240845, + "rewards/rejected": -3.570470094680786, + "step": 2691 + }, + { + "epoch": 2.54, + "grad_norm": 36.05692672729492, + "learning_rate": 8.481986708639384e-08, + "logps/chosen": -64.08729553222656, + "logps/rejected": -89.35417938232422, + "loss": 0.3963, + "losses/dpo": 0.3824594020843506, + "losses/sft": 1.5775214433670044, + "losses/total": 0.3824594020843506, + "ref_logps/chosen": -38.27870178222656, + "ref_logps/rejected": -46.996070861816406, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.580859422683716, + "rewards/margins": 1.654951572418213, + "rewards/rejected": -4.23581075668335, + "step": 2692 + }, + { + "epoch": 2.54, + "grad_norm": 28.538545608520508, + "learning_rate": 8.464498076250437e-08, + "logps/chosen": -58.3211669921875, + "logps/rejected": -86.10749053955078, + "loss": 0.3228, + "losses/dpo": 0.28001561760902405, + "losses/sft": 2.0562100410461426, + "losses/total": 0.28001561760902405, + "ref_logps/chosen": -35.05115509033203, + "ref_logps/rejected": -45.59699630737305, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3270010948181152, + "rewards/margins": 1.7240484952926636, + "rewards/rejected": -4.051049709320068, + "step": 2693 + }, + { + "epoch": 2.54, + "grad_norm": 30.47183609008789, + "learning_rate": 8.44700944386149e-08, + "logps/chosen": -65.7847900390625, + "logps/rejected": -85.92161560058594, + "loss": 0.2933, + "losses/dpo": 0.21794924139976501, + "losses/sft": 2.74277925491333, + "losses/total": 0.21794924139976501, + "ref_logps/chosen": -43.84074401855469, + "ref_logps/rejected": -44.49517059326172, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.194404125213623, + "rewards/margins": 1.9482409954071045, + "rewards/rejected": -4.142644882202148, + "step": 2694 + }, + { + "epoch": 2.54, + "grad_norm": 22.02395248413086, + "learning_rate": 8.429520811472543e-08, + "logps/chosen": -70.81207275390625, + "logps/rejected": -84.12870788574219, + "loss": 0.2327, + "losses/dpo": 0.28347504138946533, + "losses/sft": 2.1498355865478516, + "losses/total": 0.28347504138946533, + "ref_logps/chosen": -46.21925354003906, + "ref_logps/rejected": -40.54079055786133, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4592814445495605, + "rewards/margins": 1.8995107412338257, + "rewards/rejected": -4.358792304992676, + "step": 2695 + }, + { + "epoch": 2.55, + "grad_norm": 14.962983131408691, + "learning_rate": 8.412032179083595e-08, + "logps/chosen": -52.652095794677734, + "logps/rejected": -108.85685729980469, + "loss": 0.1631, + "losses/dpo": 0.18968872725963593, + "losses/sft": 2.0677170753479004, + "losses/total": 0.18968872725963593, + "ref_logps/chosen": -32.3492546081543, + "ref_logps/rejected": -63.92298126220703, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0302841663360596, + "rewards/margins": 2.463104248046875, + "rewards/rejected": -4.4933881759643555, + "step": 2696 + }, + { + "epoch": 2.55, + "grad_norm": 38.845951080322266, + "learning_rate": 8.394543546694648e-08, + "logps/chosen": -60.15574645996094, + "logps/rejected": -88.33204650878906, + "loss": 0.3749, + "losses/dpo": 0.0761902928352356, + "losses/sft": 2.1660962104797363, + "losses/total": 0.0761902928352356, + "ref_logps/chosen": -39.53703308105469, + "ref_logps/rejected": -44.3358154296875, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.061871290206909, + "rewards/margins": 2.337751865386963, + "rewards/rejected": -4.399622917175293, + "step": 2697 + }, + { + "epoch": 2.55, + "grad_norm": 30.22638702392578, + "learning_rate": 8.3770549143057e-08, + "logps/chosen": -57.35099411010742, + "logps/rejected": -77.09319305419922, + "loss": 0.3672, + "losses/dpo": 0.3272969722747803, + "losses/sft": 1.9871859550476074, + "losses/total": 0.3272969722747803, + "ref_logps/chosen": -38.44696044921875, + "ref_logps/rejected": -37.48995590209961, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8904036283493042, + "rewards/margins": 2.069920539855957, + "rewards/rejected": -3.960324287414551, + "step": 2698 + }, + { + "epoch": 2.55, + "grad_norm": 24.346715927124023, + "learning_rate": 8.359566281916754e-08, + "logps/chosen": -57.777427673339844, + "logps/rejected": -80.12771606445312, + "loss": 0.2426, + "losses/dpo": 0.2551974356174469, + "losses/sft": 1.7932424545288086, + "losses/total": 0.2551974356174469, + "ref_logps/chosen": -37.71232223510742, + "ref_logps/rejected": -39.012168884277344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0065104961395264, + "rewards/margins": 2.105043888092041, + "rewards/rejected": -4.111554145812988, + "step": 2699 + }, + { + "epoch": 2.55, + "grad_norm": 17.088525772094727, + "learning_rate": 8.342077649527807e-08, + "logps/chosen": -54.74834442138672, + "logps/rejected": -87.44673156738281, + "loss": 0.201, + "losses/dpo": 0.4395669400691986, + "losses/sft": 2.2243034839630127, + "losses/total": 0.4395669400691986, + "ref_logps/chosen": -31.777137756347656, + "ref_logps/rejected": -38.3187255859375, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2971205711364746, + "rewards/margins": 2.6156809329986572, + "rewards/rejected": -4.912801265716553, + "step": 2700 + }, + { + "epoch": 2.55, + "grad_norm": 16.262378692626953, + "learning_rate": 8.32458901713886e-08, + "logps/chosen": -56.27473449707031, + "logps/rejected": -72.77891540527344, + "loss": 0.2048, + "losses/dpo": 0.1984620839357376, + "losses/sft": 1.8290061950683594, + "losses/total": 0.1984620839357376, + "ref_logps/chosen": -38.23634719848633, + "ref_logps/rejected": -34.218109130859375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8038387298583984, + "rewards/margins": 2.052241802215576, + "rewards/rejected": -3.8560807704925537, + "step": 2701 + }, + { + "epoch": 2.55, + "grad_norm": 21.502120971679688, + "learning_rate": 8.307100384749912e-08, + "logps/chosen": -53.68284606933594, + "logps/rejected": -87.90415954589844, + "loss": 0.2486, + "losses/dpo": 0.0498664416372776, + "losses/sft": 1.9409465789794922, + "losses/total": 0.0498664416372776, + "ref_logps/chosen": -32.84949493408203, + "ref_logps/rejected": -40.87139129638672, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0833353996276855, + "rewards/margins": 2.6199421882629395, + "rewards/rejected": -4.703277587890625, + "step": 2702 + }, + { + "epoch": 2.55, + "grad_norm": 24.44669532775879, + "learning_rate": 8.289611752360965e-08, + "logps/chosen": -56.875633239746094, + "logps/rejected": -86.78715515136719, + "loss": 0.2103, + "losses/dpo": 0.15162293612957, + "losses/sft": 2.292759895324707, + "losses/total": 0.15162293612957, + "ref_logps/chosen": -36.75770568847656, + "ref_logps/rejected": -43.673057556152344, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0117931365966797, + "rewards/margins": 2.299616813659668, + "rewards/rejected": -4.311409950256348, + "step": 2703 + }, + { + "epoch": 2.55, + "grad_norm": 11.615337371826172, + "learning_rate": 8.272123119972017e-08, + "logps/chosen": -50.485042572021484, + "logps/rejected": -96.719482421875, + "loss": 0.0889, + "losses/dpo": 0.13951162993907928, + "losses/sft": 2.236678123474121, + "losses/total": 0.13951162993907928, + "ref_logps/chosen": -34.31647872924805, + "ref_logps/rejected": -52.05599594116211, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.616856575012207, + "rewards/margins": 2.849492073059082, + "rewards/rejected": -4.466348648071289, + "step": 2704 + }, + { + "epoch": 2.55, + "grad_norm": 20.754470825195312, + "learning_rate": 8.25463448758307e-08, + "logps/chosen": -53.029170989990234, + "logps/rejected": -69.87765502929688, + "loss": 0.2586, + "losses/dpo": 0.19815683364868164, + "losses/sft": 2.0173227787017822, + "losses/total": 0.19815683364868164, + "ref_logps/chosen": -37.79646301269531, + "ref_logps/rejected": -35.85797119140625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5232707262039185, + "rewards/margins": 1.8786981105804443, + "rewards/rejected": -3.4019687175750732, + "step": 2705 + }, + { + "epoch": 2.56, + "grad_norm": 21.824260711669922, + "learning_rate": 8.237145855194124e-08, + "logps/chosen": -55.73167419433594, + "logps/rejected": -80.55992126464844, + "loss": 0.247, + "losses/dpo": 0.10748860239982605, + "losses/sft": 2.1623952388763428, + "losses/total": 0.10748860239982605, + "ref_logps/chosen": -35.19187545776367, + "ref_logps/rejected": -39.41991424560547, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0539801120758057, + "rewards/margins": 2.0600204467773438, + "rewards/rejected": -4.11400032043457, + "step": 2706 + }, + { + "epoch": 2.56, + "grad_norm": 19.404788970947266, + "learning_rate": 8.219657222805176e-08, + "logps/chosen": -63.99925231933594, + "logps/rejected": -88.44627380371094, + "loss": 0.171, + "losses/dpo": 0.2757548987865448, + "losses/sft": 2.140808343887329, + "losses/total": 0.2757548987865448, + "ref_logps/chosen": -43.223228454589844, + "ref_logps/rejected": -45.635955810546875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0776021480560303, + "rewards/margins": 2.203429698944092, + "rewards/rejected": -4.281031608581543, + "step": 2707 + }, + { + "epoch": 2.56, + "grad_norm": 19.738277435302734, + "learning_rate": 8.202168590416229e-08, + "logps/chosen": -59.29216766357422, + "logps/rejected": -83.80810546875, + "loss": 0.193, + "losses/dpo": 0.11343620717525482, + "losses/sft": 1.7162336111068726, + "losses/total": 0.11343620717525482, + "ref_logps/chosen": -40.56107711791992, + "ref_logps/rejected": -40.640689849853516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8731088638305664, + "rewards/margins": 2.4436323642730713, + "rewards/rejected": -4.316740989685059, + "step": 2708 + }, + { + "epoch": 2.56, + "grad_norm": 18.394798278808594, + "learning_rate": 8.184679958027283e-08, + "logps/chosen": -48.421905517578125, + "logps/rejected": -86.56251525878906, + "loss": 0.1685, + "losses/dpo": 0.2300102710723877, + "losses/sft": 2.1626460552215576, + "losses/total": 0.2300102710723877, + "ref_logps/chosen": -29.724613189697266, + "ref_logps/rejected": -41.89875793457031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8697292804718018, + "rewards/margins": 2.596646308898926, + "rewards/rejected": -4.466375350952148, + "step": 2709 + }, + { + "epoch": 2.56, + "grad_norm": 20.218198776245117, + "learning_rate": 8.167191325638335e-08, + "logps/chosen": -59.41455841064453, + "logps/rejected": -82.58963775634766, + "loss": 0.2338, + "losses/dpo": 0.23479968309402466, + "losses/sft": 2.1468422412872314, + "losses/total": 0.23479968309402466, + "ref_logps/chosen": -40.77599334716797, + "ref_logps/rejected": -38.94643020629883, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.863856554031372, + "rewards/margins": 2.5004639625549316, + "rewards/rejected": -4.364320755004883, + "step": 2710 + }, + { + "epoch": 2.56, + "grad_norm": 28.48883819580078, + "learning_rate": 8.149702693249387e-08, + "logps/chosen": -65.20120239257812, + "logps/rejected": -91.61866760253906, + "loss": 0.2368, + "losses/dpo": 0.13447558879852295, + "losses/sft": 2.661884069442749, + "losses/total": 0.13447558879852295, + "ref_logps/chosen": -40.47429656982422, + "ref_logps/rejected": -43.73830795288086, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4726903438568115, + "rewards/margins": 2.3153457641601562, + "rewards/rejected": -4.788036346435547, + "step": 2711 + }, + { + "epoch": 2.56, + "grad_norm": 25.97309684753418, + "learning_rate": 8.13221406086044e-08, + "logps/chosen": -77.37887573242188, + "logps/rejected": -94.94210052490234, + "loss": 0.2287, + "losses/dpo": 0.4562962055206299, + "losses/sft": 2.44805645942688, + "losses/total": 0.4562962055206299, + "ref_logps/chosen": -48.929874420166016, + "ref_logps/rejected": -44.67776870727539, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.844900131225586, + "rewards/margins": 2.1815333366394043, + "rewards/rejected": -5.02643346786499, + "step": 2712 + }, + { + "epoch": 2.56, + "grad_norm": 24.289073944091797, + "learning_rate": 8.114725428471493e-08, + "logps/chosen": -74.5272445678711, + "logps/rejected": -101.11073303222656, + "loss": 0.2716, + "losses/dpo": 0.0493089035153389, + "losses/sft": 2.828669786453247, + "losses/total": 0.0493089035153389, + "ref_logps/chosen": -46.1937255859375, + "ref_logps/rejected": -50.59699249267578, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.83335280418396, + "rewards/margins": 2.218021869659424, + "rewards/rejected": -5.051374435424805, + "step": 2713 + }, + { + "epoch": 2.56, + "grad_norm": 16.502212524414062, + "learning_rate": 8.097236796082546e-08, + "logps/chosen": -57.70896911621094, + "logps/rejected": -88.69378662109375, + "loss": 0.1388, + "losses/dpo": 0.13913074135780334, + "losses/sft": 2.0559241771698, + "losses/total": 0.13913074135780334, + "ref_logps/chosen": -37.42401885986328, + "ref_logps/rejected": -39.46044921875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0284950733184814, + "rewards/margins": 2.89483904838562, + "rewards/rejected": -4.923334121704102, + "step": 2714 + }, + { + "epoch": 2.56, + "grad_norm": 17.884679794311523, + "learning_rate": 8.0797481636936e-08, + "logps/chosen": -50.73571014404297, + "logps/rejected": -82.34507751464844, + "loss": 0.1387, + "losses/dpo": 0.05677824094891548, + "losses/sft": 1.0485986471176147, + "losses/total": 0.05677824094891548, + "ref_logps/chosen": -33.30804443359375, + "ref_logps/rejected": -36.03643035888672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7427667379379272, + "rewards/margins": 2.8880972862243652, + "rewards/rejected": -4.630864143371582, + "step": 2715 + }, + { + "epoch": 2.56, + "grad_norm": 16.64714813232422, + "learning_rate": 8.062259531304652e-08, + "logps/chosen": -53.00878143310547, + "logps/rejected": -89.87242889404297, + "loss": 0.1451, + "losses/dpo": 0.20994940400123596, + "losses/sft": 2.305251359939575, + "losses/total": 0.20994940400123596, + "ref_logps/chosen": -32.36865997314453, + "ref_logps/rejected": -44.373199462890625, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.064012050628662, + "rewards/margins": 2.4859113693237305, + "rewards/rejected": -4.549923419952393, + "step": 2716 + }, + { + "epoch": 2.57, + "grad_norm": 30.641056060791016, + "learning_rate": 8.044770898915705e-08, + "logps/chosen": -51.4459114074707, + "logps/rejected": -80.82252502441406, + "loss": 0.3217, + "losses/dpo": 0.32456889748573303, + "losses/sft": 2.557231903076172, + "losses/total": 0.32456889748573303, + "ref_logps/chosen": -29.376659393310547, + "ref_logps/rejected": -40.18138122558594, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.206925392150879, + "rewards/margins": 1.8571892976760864, + "rewards/rejected": -4.064114570617676, + "step": 2717 + }, + { + "epoch": 2.57, + "grad_norm": 29.355276107788086, + "learning_rate": 8.027282266526756e-08, + "logps/chosen": -53.154075622558594, + "logps/rejected": -82.87979888916016, + "loss": 0.2962, + "losses/dpo": 0.3268861174583435, + "losses/sft": 2.273249387741089, + "losses/total": 0.3268861174583435, + "ref_logps/chosen": -34.71628189086914, + "ref_logps/rejected": -44.28424072265625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8437790870666504, + "rewards/margins": 2.0157763957977295, + "rewards/rejected": -3.859555721282959, + "step": 2718 + }, + { + "epoch": 2.57, + "grad_norm": 37.05657958984375, + "learning_rate": 8.00979363413781e-08, + "logps/chosen": -74.75926208496094, + "logps/rejected": -105.12036895751953, + "loss": 0.2862, + "losses/dpo": 0.18447448313236237, + "losses/sft": 2.504068613052368, + "losses/total": 0.18447448313236237, + "ref_logps/chosen": -44.20771026611328, + "ref_logps/rejected": -52.926090240478516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0551552772521973, + "rewards/margins": 2.1642727851867676, + "rewards/rejected": -5.219428062438965, + "step": 2719 + }, + { + "epoch": 2.57, + "grad_norm": 28.68370819091797, + "learning_rate": 7.992305001748863e-08, + "logps/chosen": -62.89386749267578, + "logps/rejected": -86.82422637939453, + "loss": 0.2742, + "losses/dpo": 0.29928988218307495, + "losses/sft": 2.10127592086792, + "losses/total": 0.29928988218307495, + "ref_logps/chosen": -41.423397064208984, + "ref_logps/rejected": -39.5869255065918, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1470468044281006, + "rewards/margins": 2.576683521270752, + "rewards/rejected": -4.723730564117432, + "step": 2720 + }, + { + "epoch": 2.57, + "grad_norm": 30.622920989990234, + "learning_rate": 7.974816369359915e-08, + "logps/chosen": -58.17392349243164, + "logps/rejected": -87.57852935791016, + "loss": 0.2811, + "losses/dpo": 0.3850134313106537, + "losses/sft": 2.2896618843078613, + "losses/total": 0.3850134313106537, + "ref_logps/chosen": -35.89087677001953, + "ref_logps/rejected": -43.804901123046875, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.228304624557495, + "rewards/margins": 2.1490578651428223, + "rewards/rejected": -4.3773627281188965, + "step": 2721 + }, + { + "epoch": 2.57, + "grad_norm": 26.471567153930664, + "learning_rate": 7.957327736970969e-08, + "logps/chosen": -56.93463897705078, + "logps/rejected": -79.65208435058594, + "loss": 0.2862, + "losses/dpo": 0.439805805683136, + "losses/sft": 2.0052285194396973, + "losses/total": 0.439805805683136, + "ref_logps/chosen": -36.83883285522461, + "ref_logps/rejected": -39.14894104003906, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.009580373764038, + "rewards/margins": 2.04073429107666, + "rewards/rejected": -4.050314903259277, + "step": 2722 + }, + { + "epoch": 2.57, + "grad_norm": 40.461177825927734, + "learning_rate": 7.939839104582022e-08, + "logps/chosen": -60.62139129638672, + "logps/rejected": -85.19065856933594, + "loss": 0.3285, + "losses/dpo": 0.18352824449539185, + "losses/sft": 1.780651330947876, + "losses/total": 0.18352824449539185, + "ref_logps/chosen": -34.476436614990234, + "ref_logps/rejected": -40.504791259765625, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.614495277404785, + "rewards/margins": 1.854091763496399, + "rewards/rejected": -4.4685869216918945, + "step": 2723 + }, + { + "epoch": 2.57, + "grad_norm": 17.81596565246582, + "learning_rate": 7.922350472193074e-08, + "logps/chosen": -52.41864013671875, + "logps/rejected": -87.20779418945312, + "loss": 0.1905, + "losses/dpo": 0.20663711428642273, + "losses/sft": 1.5303996801376343, + "losses/total": 0.20663711428642273, + "ref_logps/chosen": -34.06294250488281, + "ref_logps/rejected": -42.91332244873047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8355698585510254, + "rewards/margins": 2.5938773155212402, + "rewards/rejected": -4.429447174072266, + "step": 2724 + }, + { + "epoch": 2.57, + "grad_norm": 26.453439712524414, + "learning_rate": 7.904861839804127e-08, + "logps/chosen": -65.81513977050781, + "logps/rejected": -90.457275390625, + "loss": 0.2071, + "losses/dpo": 0.09725739806890488, + "losses/sft": 2.0027050971984863, + "losses/total": 0.09725739806890488, + "ref_logps/chosen": -41.82478332519531, + "ref_logps/rejected": -43.45936584472656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3990349769592285, + "rewards/margins": 2.300755739212036, + "rewards/rejected": -4.699790954589844, + "step": 2725 + }, + { + "epoch": 2.57, + "grad_norm": 36.95955276489258, + "learning_rate": 7.88737320741518e-08, + "logps/chosen": -59.159141540527344, + "logps/rejected": -93.71883392333984, + "loss": 0.2399, + "losses/dpo": 0.1004643589258194, + "losses/sft": 2.725231885910034, + "losses/total": 0.1004643589258194, + "ref_logps/chosen": -35.653419494628906, + "ref_logps/rejected": -44.3263053894043, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.350572109222412, + "rewards/margins": 2.5886809825897217, + "rewards/rejected": -4.939253330230713, + "step": 2726 + }, + { + "epoch": 2.58, + "grad_norm": 19.50714874267578, + "learning_rate": 7.869884575026232e-08, + "logps/chosen": -61.15166473388672, + "logps/rejected": -90.53507995605469, + "loss": 0.2022, + "losses/dpo": 0.587996780872345, + "losses/sft": 2.068999767303467, + "losses/total": 0.587996780872345, + "ref_logps/chosen": -40.30133819580078, + "ref_logps/rejected": -44.567108154296875, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0850329399108887, + "rewards/margins": 2.5117645263671875, + "rewards/rejected": -4.596797943115234, + "step": 2727 + }, + { + "epoch": 2.58, + "grad_norm": 20.398759841918945, + "learning_rate": 7.852395942637286e-08, + "logps/chosen": -56.559783935546875, + "logps/rejected": -87.5220718383789, + "loss": 0.1689, + "losses/dpo": 0.07520067691802979, + "losses/sft": 2.4023282527923584, + "losses/total": 0.07520067691802979, + "ref_logps/chosen": -36.88642120361328, + "ref_logps/rejected": -42.54554748535156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9673360586166382, + "rewards/margins": 2.53031587600708, + "rewards/rejected": -4.497652053833008, + "step": 2728 + }, + { + "epoch": 2.58, + "grad_norm": 23.103666305541992, + "learning_rate": 7.834907310248339e-08, + "logps/chosen": -56.202880859375, + "logps/rejected": -94.07498168945312, + "loss": 0.2416, + "losses/dpo": 0.537917971611023, + "losses/sft": 1.7297097444534302, + "losses/total": 0.537917971611023, + "ref_logps/chosen": -37.405799865722656, + "ref_logps/rejected": -48.333030700683594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8797078132629395, + "rewards/margins": 2.6944875717163086, + "rewards/rejected": -4.57419490814209, + "step": 2729 + }, + { + "epoch": 2.58, + "grad_norm": 35.609649658203125, + "learning_rate": 7.817418677859391e-08, + "logps/chosen": -48.1756706237793, + "logps/rejected": -83.10713195800781, + "loss": 0.5478, + "losses/dpo": 0.2470855414867401, + "losses/sft": 1.257510781288147, + "losses/total": 0.2470855414867401, + "ref_logps/chosen": -29.944290161132812, + "ref_logps/rejected": -43.08921813964844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8231382369995117, + "rewards/margins": 2.1786537170410156, + "rewards/rejected": -4.001791954040527, + "step": 2730 + }, + { + "epoch": 2.58, + "grad_norm": 29.666091918945312, + "learning_rate": 7.799930045470445e-08, + "logps/chosen": -59.72986602783203, + "logps/rejected": -74.16252899169922, + "loss": 0.3179, + "losses/dpo": 0.16925600171089172, + "losses/sft": 1.5958210229873657, + "losses/total": 0.16925600171089172, + "ref_logps/chosen": -37.968505859375, + "ref_logps/rejected": -34.77012634277344, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1761364936828613, + "rewards/margins": 1.763104796409607, + "rewards/rejected": -3.9392411708831787, + "step": 2731 + }, + { + "epoch": 2.58, + "grad_norm": 16.219690322875977, + "learning_rate": 7.782441413081496e-08, + "logps/chosen": -46.803672790527344, + "logps/rejected": -86.58584594726562, + "loss": 0.1355, + "losses/dpo": 0.18694616854190826, + "losses/sft": 1.8560783863067627, + "losses/total": 0.18694616854190826, + "ref_logps/chosen": -32.62672805786133, + "ref_logps/rejected": -43.35367202758789, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.417694330215454, + "rewards/margins": 2.9055228233337402, + "rewards/rejected": -4.323217391967773, + "step": 2732 + }, + { + "epoch": 2.58, + "grad_norm": 28.264787673950195, + "learning_rate": 7.764952780692549e-08, + "logps/chosen": -55.259429931640625, + "logps/rejected": -88.17562866210938, + "loss": 0.2243, + "losses/dpo": 0.37570884823799133, + "losses/sft": 2.372860908508301, + "losses/total": 0.37570884823799133, + "ref_logps/chosen": -35.449188232421875, + "ref_logps/rejected": -42.4696159362793, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9810240268707275, + "rewards/margins": 2.5895774364471436, + "rewards/rejected": -4.570601463317871, + "step": 2733 + }, + { + "epoch": 2.58, + "grad_norm": 19.84027671813965, + "learning_rate": 7.747464148303602e-08, + "logps/chosen": -57.22923278808594, + "logps/rejected": -75.40006256103516, + "loss": 0.1557, + "losses/dpo": 0.08267943561077118, + "losses/sft": 1.6729727983474731, + "losses/total": 0.08267943561077118, + "ref_logps/chosen": -41.01884460449219, + "ref_logps/rejected": -37.75150680541992, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6210386753082275, + "rewards/margins": 2.14381742477417, + "rewards/rejected": -3.7648563385009766, + "step": 2734 + }, + { + "epoch": 2.58, + "grad_norm": 35.98508071899414, + "learning_rate": 7.729975515914655e-08, + "logps/chosen": -66.03498840332031, + "logps/rejected": -90.2000732421875, + "loss": 0.3154, + "losses/dpo": 0.667961061000824, + "losses/sft": 2.4656476974487305, + "losses/total": 0.667961061000824, + "ref_logps/chosen": -47.4753532409668, + "ref_logps/rejected": -49.57347869873047, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8559632301330566, + "rewards/margins": 2.206695556640625, + "rewards/rejected": -4.062658786773682, + "step": 2735 + }, + { + "epoch": 2.58, + "grad_norm": 20.190582275390625, + "learning_rate": 7.712486883525708e-08, + "logps/chosen": -69.85173034667969, + "logps/rejected": -105.54653930664062, + "loss": 0.1527, + "losses/dpo": 0.2493601143360138, + "losses/sft": 1.9513732194900513, + "losses/total": 0.2493601143360138, + "ref_logps/chosen": -44.38006591796875, + "ref_logps/rejected": -50.512794494628906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.547166347503662, + "rewards/margins": 2.9562084674835205, + "rewards/rejected": -5.503375053405762, + "step": 2736 + }, + { + "epoch": 2.58, + "grad_norm": 28.619308471679688, + "learning_rate": 7.69499825113676e-08, + "logps/chosen": -55.38597106933594, + "logps/rejected": -93.68559265136719, + "loss": 0.2459, + "losses/dpo": 0.07029101997613907, + "losses/sft": 2.2137067317962646, + "losses/total": 0.07029101997613907, + "ref_logps/chosen": -35.886619567871094, + "ref_logps/rejected": -49.30084228515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9499354362487793, + "rewards/margins": 2.488539934158325, + "rewards/rejected": -4.438475131988525, + "step": 2737 + }, + { + "epoch": 2.59, + "grad_norm": 16.033260345458984, + "learning_rate": 7.677509618747815e-08, + "logps/chosen": -65.22747039794922, + "logps/rejected": -92.58159637451172, + "loss": 0.1134, + "losses/dpo": 0.16272282600402832, + "losses/sft": 2.371382713317871, + "losses/total": 0.16272282600402832, + "ref_logps/chosen": -41.89468002319336, + "ref_logps/rejected": -41.7523078918457, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3332786560058594, + "rewards/margins": 2.749650716781616, + "rewards/rejected": -5.082929611206055, + "step": 2738 + }, + { + "epoch": 2.59, + "grad_norm": 27.556373596191406, + "learning_rate": 7.660020986358866e-08, + "logps/chosen": -57.74921798706055, + "logps/rejected": -76.01809692382812, + "loss": 0.2424, + "losses/dpo": 0.07248252630233765, + "losses/sft": 3.415224075317383, + "losses/total": 0.07248252630233765, + "ref_logps/chosen": -33.27150344848633, + "ref_logps/rejected": -32.11667251586914, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4477713108062744, + "rewards/margins": 1.942371129989624, + "rewards/rejected": -4.390142440795898, + "step": 2739 + }, + { + "epoch": 2.59, + "grad_norm": 31.120080947875977, + "learning_rate": 7.642532353969918e-08, + "logps/chosen": -64.96121215820312, + "logps/rejected": -95.09601593017578, + "loss": 0.3059, + "losses/dpo": 0.16930629312992096, + "losses/sft": 2.241840362548828, + "losses/total": 0.16930629312992096, + "ref_logps/chosen": -41.666969299316406, + "ref_logps/rejected": -47.29644012451172, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3294241428375244, + "rewards/margins": 2.450533390045166, + "rewards/rejected": -4.7799577713012695, + "step": 2740 + }, + { + "epoch": 2.59, + "grad_norm": 31.812015533447266, + "learning_rate": 7.625043721580972e-08, + "logps/chosen": -58.548728942871094, + "logps/rejected": -84.16853332519531, + "loss": 0.2639, + "losses/dpo": 0.4040836989879608, + "losses/sft": 2.3799593448638916, + "losses/total": 0.4040836989879608, + "ref_logps/chosen": -36.088958740234375, + "ref_logps/rejected": -36.762855529785156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2459769248962402, + "rewards/margins": 2.494590997695923, + "rewards/rejected": -4.740568161010742, + "step": 2741 + }, + { + "epoch": 2.59, + "grad_norm": 24.292409896850586, + "learning_rate": 7.607555089192025e-08, + "logps/chosen": -53.20740509033203, + "logps/rejected": -72.34294128417969, + "loss": 0.2668, + "losses/dpo": 0.457363098859787, + "losses/sft": 1.9364184141159058, + "losses/total": 0.457363098859787, + "ref_logps/chosen": -30.19186019897461, + "ref_logps/rejected": -31.325294494628906, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3015542030334473, + "rewards/margins": 1.8002103567123413, + "rewards/rejected": -4.101764678955078, + "step": 2742 + }, + { + "epoch": 2.59, + "grad_norm": 29.611892700195312, + "learning_rate": 7.590066456803077e-08, + "logps/chosen": -62.867698669433594, + "logps/rejected": -86.58934020996094, + "loss": 0.2814, + "losses/dpo": 0.3269091844558716, + "losses/sft": 1.7992746829986572, + "losses/total": 0.3269091844558716, + "ref_logps/chosen": -38.58263397216797, + "ref_logps/rejected": -42.31630325317383, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.428506851196289, + "rewards/margins": 1.9987969398498535, + "rewards/rejected": -4.427303791046143, + "step": 2743 + }, + { + "epoch": 2.59, + "grad_norm": 34.87808609008789, + "learning_rate": 7.572577824414131e-08, + "logps/chosen": -58.59577941894531, + "logps/rejected": -67.25249481201172, + "loss": 0.4421, + "losses/dpo": 0.6237035989761353, + "losses/sft": 2.1839563846588135, + "losses/total": 0.6237035989761353, + "ref_logps/chosen": -37.07997131347656, + "ref_logps/rejected": -31.63884735107422, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.151581287384033, + "rewards/margins": 1.4097837209701538, + "rewards/rejected": -3.5613648891448975, + "step": 2744 + }, + { + "epoch": 2.59, + "grad_norm": 14.540555000305176, + "learning_rate": 7.555089192025184e-08, + "logps/chosen": -52.120262145996094, + "logps/rejected": -81.17356872558594, + "loss": 0.1524, + "losses/dpo": 0.12189151346683502, + "losses/sft": 2.514629364013672, + "losses/total": 0.12189151346683502, + "ref_logps/chosen": -33.45191955566406, + "ref_logps/rejected": -37.32148361206055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8668346405029297, + "rewards/margins": 2.5183732509613037, + "rewards/rejected": -4.3852081298828125, + "step": 2745 + }, + { + "epoch": 2.59, + "grad_norm": 26.12554931640625, + "learning_rate": 7.537600559636235e-08, + "logps/chosen": -51.76294708251953, + "logps/rejected": -77.05157470703125, + "loss": 0.3377, + "losses/dpo": 0.31474778056144714, + "losses/sft": 2.283543109893799, + "losses/total": 0.31474778056144714, + "ref_logps/chosen": -34.063941955566406, + "ref_logps/rejected": -38.45088195800781, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7699003219604492, + "rewards/margins": 2.090169906616211, + "rewards/rejected": -3.86007022857666, + "step": 2746 + }, + { + "epoch": 2.59, + "grad_norm": 26.18435287475586, + "learning_rate": 7.520111927247289e-08, + "logps/chosen": -63.85174560546875, + "logps/rejected": -88.06969451904297, + "loss": 0.3193, + "losses/dpo": 0.0951785072684288, + "losses/sft": 1.9448986053466797, + "losses/total": 0.0951785072684288, + "ref_logps/chosen": -39.32052230834961, + "ref_logps/rejected": -45.76739501953125, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.453122138977051, + "rewards/margins": 1.7771079540252686, + "rewards/rejected": -4.230230331420898, + "step": 2747 + }, + { + "epoch": 2.59, + "grad_norm": 27.94809913635254, + "learning_rate": 7.502623294858342e-08, + "logps/chosen": -48.2830924987793, + "logps/rejected": -84.45472717285156, + "loss": 0.3111, + "losses/dpo": 0.6783143877983093, + "losses/sft": 2.2958736419677734, + "losses/total": 0.6783143877983093, + "ref_logps/chosen": -31.046344757080078, + "ref_logps/rejected": -42.30171203613281, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7236746549606323, + "rewards/margins": 2.4916274547576904, + "rewards/rejected": -4.215302467346191, + "step": 2748 + }, + { + "epoch": 2.6, + "grad_norm": 23.95720100402832, + "learning_rate": 7.485134662469394e-08, + "logps/chosen": -68.78559112548828, + "logps/rejected": -87.39158630371094, + "loss": 0.2563, + "losses/dpo": 0.3453255295753479, + "losses/sft": 2.2026615142822266, + "losses/total": 0.3453255295753479, + "ref_logps/chosen": -45.66846466064453, + "ref_logps/rejected": -46.090431213378906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3117127418518066, + "rewards/margins": 1.8184019327163696, + "rewards/rejected": -4.130114555358887, + "step": 2749 + }, + { + "epoch": 2.6, + "grad_norm": 24.049285888671875, + "learning_rate": 7.467646030080447e-08, + "logps/chosen": -52.744720458984375, + "logps/rejected": -85.0572509765625, + "loss": 0.2297, + "losses/dpo": 0.04395807906985283, + "losses/sft": 2.03694486618042, + "losses/total": 0.04395807906985283, + "ref_logps/chosen": -33.33172607421875, + "ref_logps/rejected": -40.23902893066406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.941299557685852, + "rewards/margins": 2.540522813796997, + "rewards/rejected": -4.481822490692139, + "step": 2750 + }, + { + "epoch": 2.6, + "grad_norm": 30.164949417114258, + "learning_rate": 7.450157397691501e-08, + "logps/chosen": -40.5528564453125, + "logps/rejected": -55.17308044433594, + "loss": 0.3725, + "losses/dpo": 0.36549150943756104, + "losses/sft": 1.51241934299469, + "losses/total": 0.36549150943756104, + "ref_logps/chosen": -27.111711502075195, + "ref_logps/rejected": -28.69839096069336, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3441150188446045, + "rewards/margins": 1.303354024887085, + "rewards/rejected": -2.6474692821502686, + "step": 2751 + }, + { + "epoch": 2.6, + "grad_norm": 23.710203170776367, + "learning_rate": 7.432668765302553e-08, + "logps/chosen": -56.65016174316406, + "logps/rejected": -88.03787994384766, + "loss": 0.2623, + "losses/dpo": 0.4416307210922241, + "losses/sft": 2.197373151779175, + "losses/total": 0.4416307210922241, + "ref_logps/chosen": -29.66163444519043, + "ref_logps/rejected": -42.691070556640625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6988525390625, + "rewards/margins": 1.8358286619186401, + "rewards/rejected": -4.53468132019043, + "step": 2752 + }, + { + "epoch": 2.6, + "grad_norm": 23.91667366027832, + "learning_rate": 7.415180132913605e-08, + "logps/chosen": -52.05896759033203, + "logps/rejected": -88.61227416992188, + "loss": 0.2444, + "losses/dpo": 0.24901221692562103, + "losses/sft": 2.4008641242980957, + "losses/total": 0.24901221692562103, + "ref_logps/chosen": -31.663232803344727, + "ref_logps/rejected": -46.722476959228516, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0395736694335938, + "rewards/margins": 2.1494064331054688, + "rewards/rejected": -4.1889801025390625, + "step": 2753 + }, + { + "epoch": 2.6, + "grad_norm": 31.838598251342773, + "learning_rate": 7.397691500524659e-08, + "logps/chosen": -59.711021423339844, + "logps/rejected": -83.3906478881836, + "loss": 0.3189, + "losses/dpo": 0.08191178739070892, + "losses/sft": 1.777084469795227, + "losses/total": 0.08191178739070892, + "ref_logps/chosen": -38.904083251953125, + "ref_logps/rejected": -41.59431076049805, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.080693244934082, + "rewards/margins": 2.09894061088562, + "rewards/rejected": -4.179634094238281, + "step": 2754 + }, + { + "epoch": 2.6, + "grad_norm": 18.359704971313477, + "learning_rate": 7.380202868135711e-08, + "logps/chosen": -53.37405014038086, + "logps/rejected": -80.7494125366211, + "loss": 0.2247, + "losses/dpo": 0.14001595973968506, + "losses/sft": 1.684976577758789, + "losses/total": 0.14001595973968506, + "ref_logps/chosen": -35.51100158691406, + "ref_logps/rejected": -40.45469665527344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7863047122955322, + "rewards/margins": 2.24316668510437, + "rewards/rejected": -4.0294718742370605, + "step": 2755 + }, + { + "epoch": 2.6, + "grad_norm": 14.300028800964355, + "learning_rate": 7.362714235746764e-08, + "logps/chosen": -67.7360610961914, + "logps/rejected": -108.64254760742188, + "loss": 0.1267, + "losses/dpo": 0.11517482995986938, + "losses/sft": 1.967227816581726, + "losses/total": 0.11517482995986938, + "ref_logps/chosen": -46.20024871826172, + "ref_logps/rejected": -55.243709564208984, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1535816192626953, + "rewards/margins": 3.186302661895752, + "rewards/rejected": -5.3398847579956055, + "step": 2756 + }, + { + "epoch": 2.6, + "grad_norm": 15.87247371673584, + "learning_rate": 7.345225603357818e-08, + "logps/chosen": -74.22026824951172, + "logps/rejected": -102.53128814697266, + "loss": 0.1413, + "losses/dpo": 0.09355618804693222, + "losses/sft": 2.145048141479492, + "losses/total": 0.09355618804693222, + "ref_logps/chosen": -49.551414489746094, + "ref_logps/rejected": -54.68934631347656, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4668853282928467, + "rewards/margins": 2.3173089027404785, + "rewards/rejected": -4.784193992614746, + "step": 2757 + }, + { + "epoch": 2.6, + "grad_norm": 25.568130493164062, + "learning_rate": 7.32773697096887e-08, + "logps/chosen": -59.52566909790039, + "logps/rejected": -92.83795166015625, + "loss": 0.3106, + "losses/dpo": 0.6710569262504578, + "losses/sft": 1.9152323007583618, + "losses/total": 0.6710569262504578, + "ref_logps/chosen": -32.93742752075195, + "ref_logps/rejected": -43.587005615234375, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6588244438171387, + "rewards/margins": 2.266270160675049, + "rewards/rejected": -4.9250946044921875, + "step": 2758 + }, + { + "epoch": 2.61, + "grad_norm": 35.34548568725586, + "learning_rate": 7.310248338579923e-08, + "logps/chosen": -55.91728591918945, + "logps/rejected": -80.59571075439453, + "loss": 0.3067, + "losses/dpo": 0.14326077699661255, + "losses/sft": 2.452972173690796, + "losses/total": 0.14326077699661255, + "ref_logps/chosen": -37.061195373535156, + "ref_logps/rejected": -44.04087829589844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8856089115142822, + "rewards/margins": 1.769874095916748, + "rewards/rejected": -3.6554832458496094, + "step": 2759 + }, + { + "epoch": 2.61, + "grad_norm": 44.63969802856445, + "learning_rate": 7.292759706190975e-08, + "logps/chosen": -70.88312530517578, + "logps/rejected": -78.41619873046875, + "loss": 0.6211, + "losses/dpo": 0.39397814869880676, + "losses/sft": 1.8053189516067505, + "losses/total": 0.39397814869880676, + "ref_logps/chosen": -46.417572021484375, + "ref_logps/rejected": -42.08167266845703, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.4465560913085938, + "rewards/margins": 1.1868970394134521, + "rewards/rejected": -3.633452892303467, + "step": 2760 + }, + { + "epoch": 2.61, + "grad_norm": 19.750978469848633, + "learning_rate": 7.275271073802028e-08, + "logps/chosen": -66.57269287109375, + "logps/rejected": -101.32249450683594, + "loss": 0.1487, + "losses/dpo": 0.20585063099861145, + "losses/sft": 1.814422845840454, + "losses/total": 0.20585063099861145, + "ref_logps/chosen": -44.48317337036133, + "ref_logps/rejected": -52.13437271118164, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2089526653289795, + "rewards/margins": 2.709860324859619, + "rewards/rejected": -4.9188127517700195, + "step": 2761 + }, + { + "epoch": 2.61, + "grad_norm": 23.789087295532227, + "learning_rate": 7.25778244141308e-08, + "logps/chosen": -59.16730499267578, + "logps/rejected": -89.41313171386719, + "loss": 0.1899, + "losses/dpo": 0.07216180115938187, + "losses/sft": 1.716038703918457, + "losses/total": 0.07216180115938187, + "ref_logps/chosen": -39.268524169921875, + "ref_logps/rejected": -44.30385971069336, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9898784160614014, + "rewards/margins": 2.5210492610931396, + "rewards/rejected": -4.510927677154541, + "step": 2762 + }, + { + "epoch": 2.61, + "grad_norm": 20.332326889038086, + "learning_rate": 7.240293809024135e-08, + "logps/chosen": -58.525840759277344, + "logps/rejected": -90.68019104003906, + "loss": 0.205, + "losses/dpo": 0.11882972717285156, + "losses/sft": 2.0520684719085693, + "losses/total": 0.11882972717285156, + "ref_logps/chosen": -41.305606842041016, + "ref_logps/rejected": -46.05442428588867, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7220234870910645, + "rewards/margins": 2.740553140640259, + "rewards/rejected": -4.462576866149902, + "step": 2763 + }, + { + "epoch": 2.61, + "grad_norm": 25.217487335205078, + "learning_rate": 7.222805176635187e-08, + "logps/chosen": -40.85335922241211, + "logps/rejected": -67.79035949707031, + "loss": 0.2726, + "losses/dpo": 0.32088738679885864, + "losses/sft": 1.9489741325378418, + "losses/total": 0.32088738679885864, + "ref_logps/chosen": -26.03641128540039, + "ref_logps/rejected": -32.96977615356445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4816948175430298, + "rewards/margins": 2.000363349914551, + "rewards/rejected": -3.482058525085449, + "step": 2764 + }, + { + "epoch": 2.61, + "grad_norm": 11.892221450805664, + "learning_rate": 7.20531654424624e-08, + "logps/chosen": -58.42539978027344, + "logps/rejected": -97.72834777832031, + "loss": 0.0853, + "losses/dpo": 0.04946961626410484, + "losses/sft": 2.365954637527466, + "losses/total": 0.04946961626410484, + "ref_logps/chosen": -34.785057067871094, + "ref_logps/rejected": -45.975440979003906, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.364034652709961, + "rewards/margins": 2.811256170272827, + "rewards/rejected": -5.175291061401367, + "step": 2765 + }, + { + "epoch": 2.61, + "grad_norm": 19.934789657592773, + "learning_rate": 7.187827911857292e-08, + "logps/chosen": -56.808448791503906, + "logps/rejected": -83.65318298339844, + "loss": 0.2082, + "losses/dpo": 0.024278657510876656, + "losses/sft": 2.5071146488189697, + "losses/total": 0.024278657510876656, + "ref_logps/chosen": -36.50736618041992, + "ref_logps/rejected": -40.96238708496094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0301082134246826, + "rewards/margins": 2.238971710205078, + "rewards/rejected": -4.26908016204834, + "step": 2766 + }, + { + "epoch": 2.61, + "grad_norm": 16.937761306762695, + "learning_rate": 7.170339279468346e-08, + "logps/chosen": -68.17940521240234, + "logps/rejected": -105.52655029296875, + "loss": 0.1581, + "losses/dpo": 0.28504469990730286, + "losses/sft": 2.2126517295837402, + "losses/total": 0.28504469990730286, + "ref_logps/chosen": -47.13844680786133, + "ref_logps/rejected": -58.97942352294922, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.104095935821533, + "rewards/margins": 2.550617218017578, + "rewards/rejected": -4.6547136306762695, + "step": 2767 + }, + { + "epoch": 2.61, + "grad_norm": 31.370891571044922, + "learning_rate": 7.152850647079397e-08, + "logps/chosen": -42.02412033081055, + "logps/rejected": -80.32880401611328, + "loss": 0.2981, + "losses/dpo": 0.09788006544113159, + "losses/sft": 1.3362228870391846, + "losses/total": 0.09788006544113159, + "ref_logps/chosen": -26.39270782470703, + "ref_logps/rejected": -42.535484313964844, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5631415843963623, + "rewards/margins": 2.216191053390503, + "rewards/rejected": -3.779332160949707, + "step": 2768 + }, + { + "epoch": 2.61, + "grad_norm": 22.922773361206055, + "learning_rate": 7.13536201469045e-08, + "logps/chosen": -47.84135818481445, + "logps/rejected": -77.00415802001953, + "loss": 0.2842, + "losses/dpo": 0.09399730712175369, + "losses/sft": 1.5567306280136108, + "losses/total": 0.09399730712175369, + "ref_logps/chosen": -30.130083084106445, + "ref_logps/rejected": -42.87702178955078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7711275815963745, + "rewards/margins": 1.6415858268737793, + "rewards/rejected": -3.4127135276794434, + "step": 2769 + }, + { + "epoch": 2.62, + "grad_norm": 16.778785705566406, + "learning_rate": 7.117873382301504e-08, + "logps/chosen": -46.029396057128906, + "logps/rejected": -102.69303894042969, + "loss": 0.1253, + "losses/dpo": 0.048510029911994934, + "losses/sft": 1.8755203485488892, + "losses/total": 0.048510029911994934, + "ref_logps/chosen": -25.625782012939453, + "ref_logps/rejected": -48.27287292480469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0403614044189453, + "rewards/margins": 3.401655912399292, + "rewards/rejected": -5.442017555236816, + "step": 2770 + }, + { + "epoch": 2.62, + "grad_norm": 21.509645462036133, + "learning_rate": 7.100384749912557e-08, + "logps/chosen": -51.85521697998047, + "logps/rejected": -81.54478454589844, + "loss": 0.2322, + "losses/dpo": 0.18543429672718048, + "losses/sft": 1.5244815349578857, + "losses/total": 0.18543429672718048, + "ref_logps/chosen": -31.410240173339844, + "ref_logps/rejected": -40.685272216796875, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.044497489929199, + "rewards/margins": 2.0414538383483887, + "rewards/rejected": -4.085951328277588, + "step": 2771 + }, + { + "epoch": 2.62, + "grad_norm": 18.010059356689453, + "learning_rate": 7.082896117523609e-08, + "logps/chosen": -51.89374542236328, + "logps/rejected": -82.58511352539062, + "loss": 0.2133, + "losses/dpo": 0.39312419295310974, + "losses/sft": 2.261950731277466, + "losses/total": 0.39312419295310974, + "ref_logps/chosen": -33.64170837402344, + "ref_logps/rejected": -37.86516571044922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8252036571502686, + "rewards/margins": 2.6467905044555664, + "rewards/rejected": -4.471994400024414, + "step": 2772 + }, + { + "epoch": 2.62, + "grad_norm": 17.342485427856445, + "learning_rate": 7.065407485134663e-08, + "logps/chosen": -71.14315795898438, + "logps/rejected": -91.72359466552734, + "loss": 0.1973, + "losses/dpo": 0.16242167353630066, + "losses/sft": 2.4318346977233887, + "losses/total": 0.16242167353630066, + "ref_logps/chosen": -44.687644958496094, + "ref_logps/rejected": -45.36669158935547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.645550489425659, + "rewards/margins": 1.9901399612426758, + "rewards/rejected": -4.635690689086914, + "step": 2773 + }, + { + "epoch": 2.62, + "grad_norm": 31.10148048400879, + "learning_rate": 7.047918852745716e-08, + "logps/chosen": -63.20632553100586, + "logps/rejected": -75.41357421875, + "loss": 0.3465, + "losses/dpo": 0.5767205953598022, + "losses/sft": 2.44036602973938, + "losses/total": 0.5767205953598022, + "ref_logps/chosen": -39.650474548339844, + "ref_logps/rejected": -41.25980758666992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3555850982666016, + "rewards/margins": 1.0597914457321167, + "rewards/rejected": -3.415376663208008, + "step": 2774 + }, + { + "epoch": 2.62, + "grad_norm": 21.61287498474121, + "learning_rate": 7.030430220356767e-08, + "logps/chosen": -54.62089157104492, + "logps/rejected": -89.15480041503906, + "loss": 0.26, + "losses/dpo": 0.5773274898529053, + "losses/sft": 1.0455708503723145, + "losses/total": 0.5773274898529053, + "ref_logps/chosen": -37.89388656616211, + "ref_logps/rejected": -48.036537170410156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6727001667022705, + "rewards/margins": 2.43912672996521, + "rewards/rejected": -4.1118268966674805, + "step": 2775 + }, + { + "epoch": 2.62, + "grad_norm": 12.734532356262207, + "learning_rate": 7.012941587967821e-08, + "logps/chosen": -50.53339385986328, + "logps/rejected": -81.3243179321289, + "loss": 0.1239, + "losses/dpo": 0.09055262804031372, + "losses/sft": 1.4335606098175049, + "losses/total": 0.09055262804031372, + "ref_logps/chosen": -33.74474334716797, + "ref_logps/rejected": -37.206478118896484, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6788654327392578, + "rewards/margins": 2.7329187393188477, + "rewards/rejected": -4.4117841720581055, + "step": 2776 + }, + { + "epoch": 2.62, + "grad_norm": 12.526095390319824, + "learning_rate": 6.995452955578873e-08, + "logps/chosen": -54.03600311279297, + "logps/rejected": -92.27059936523438, + "loss": 0.1018, + "losses/dpo": 0.1050320714712143, + "losses/sft": 2.236359119415283, + "losses/total": 0.1050320714712143, + "ref_logps/chosen": -37.53479766845703, + "ref_logps/rejected": -49.787750244140625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.650120496749878, + "rewards/margins": 2.5981643199920654, + "rewards/rejected": -4.248284816741943, + "step": 2777 + }, + { + "epoch": 2.62, + "grad_norm": 27.205617904663086, + "learning_rate": 6.977964323189926e-08, + "logps/chosen": -68.04354858398438, + "logps/rejected": -85.22676086425781, + "loss": 0.2743, + "losses/dpo": 0.23525092005729675, + "losses/sft": 2.535377264022827, + "losses/total": 0.23525092005729675, + "ref_logps/chosen": -43.5046501159668, + "ref_logps/rejected": -41.47509765625, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.453890562057495, + "rewards/margins": 1.9212758541107178, + "rewards/rejected": -4.375166416168213, + "step": 2778 + }, + { + "epoch": 2.62, + "grad_norm": 26.89873504638672, + "learning_rate": 6.96047569080098e-08, + "logps/chosen": -58.988155364990234, + "logps/rejected": -82.1798095703125, + "loss": 0.2435, + "losses/dpo": 0.3528088629245758, + "losses/sft": 2.489513635635376, + "losses/total": 0.3528088629245758, + "ref_logps/chosen": -37.62434768676758, + "ref_logps/rejected": -40.86037826538086, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.136380910873413, + "rewards/margins": 1.9955627918243408, + "rewards/rejected": -4.131943702697754, + "step": 2779 + }, + { + "epoch": 2.63, + "grad_norm": 14.128700256347656, + "learning_rate": 6.942987058412032e-08, + "logps/chosen": -61.918033599853516, + "logps/rejected": -90.81793212890625, + "loss": 0.1436, + "losses/dpo": 0.19881035387516022, + "losses/sft": 1.9981014728546143, + "losses/total": 0.19881035387516022, + "ref_logps/chosen": -43.85242462158203, + "ref_logps/rejected": -47.815494537353516, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8065611124038696, + "rewards/margins": 2.493682861328125, + "rewards/rejected": -4.300244331359863, + "step": 2780 + }, + { + "epoch": 2.63, + "grad_norm": 17.454092025756836, + "learning_rate": 6.925498426023085e-08, + "logps/chosen": -65.04185485839844, + "logps/rejected": -89.77005767822266, + "loss": 0.1662, + "losses/dpo": 0.16697481274604797, + "losses/sft": 1.6025989055633545, + "losses/total": 0.16697481274604797, + "ref_logps/chosen": -45.2525634765625, + "ref_logps/rejected": -48.22856140136719, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9789294004440308, + "rewards/margins": 2.1752207279205322, + "rewards/rejected": -4.154150009155273, + "step": 2781 + }, + { + "epoch": 2.63, + "grad_norm": 23.606170654296875, + "learning_rate": 6.908009793634136e-08, + "logps/chosen": -51.40668869018555, + "logps/rejected": -77.44808959960938, + "loss": 0.2499, + "losses/dpo": 0.06719326227903366, + "losses/sft": 2.4284608364105225, + "losses/total": 0.06719326227903366, + "ref_logps/chosen": -30.66219139099121, + "ref_logps/rejected": -32.262874603271484, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0744497776031494, + "rewards/margins": 2.4440712928771973, + "rewards/rejected": -4.518521308898926, + "step": 2782 + }, + { + "epoch": 2.63, + "grad_norm": 35.55862045288086, + "learning_rate": 6.89052116124519e-08, + "logps/chosen": -57.03900146484375, + "logps/rejected": -86.1204833984375, + "loss": 0.453, + "losses/dpo": 0.8028914928436279, + "losses/sft": 2.2781355381011963, + "losses/total": 0.8028914928436279, + "ref_logps/chosen": -31.307281494140625, + "ref_logps/rejected": -47.18540573120117, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.573172092437744, + "rewards/margins": 1.3203353881835938, + "rewards/rejected": -3.893507719039917, + "step": 2783 + }, + { + "epoch": 2.63, + "grad_norm": 25.089380264282227, + "learning_rate": 6.873032528856243e-08, + "logps/chosen": -49.195579528808594, + "logps/rejected": -76.71953582763672, + "loss": 0.2077, + "losses/dpo": 0.31144824624061584, + "losses/sft": 2.3947620391845703, + "losses/total": 0.31144824624061584, + "ref_logps/chosen": -29.806447982788086, + "ref_logps/rejected": -33.67094039916992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.938913106918335, + "rewards/margins": 2.3659462928771973, + "rewards/rejected": -4.304859161376953, + "step": 2784 + }, + { + "epoch": 2.63, + "grad_norm": 24.925973892211914, + "learning_rate": 6.855543896467295e-08, + "logps/chosen": -62.95465850830078, + "logps/rejected": -75.41535949707031, + "loss": 0.2814, + "losses/dpo": 0.2505493760108948, + "losses/sft": 1.8324346542358398, + "losses/total": 0.2505493760108948, + "ref_logps/chosen": -43.36646270751953, + "ref_logps/rejected": -35.1422233581543, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9588196277618408, + "rewards/margins": 2.0684938430786133, + "rewards/rejected": -4.027313232421875, + "step": 2785 + }, + { + "epoch": 2.63, + "grad_norm": 36.95057678222656, + "learning_rate": 6.838055264078349e-08, + "logps/chosen": -59.67194747924805, + "logps/rejected": -85.59119415283203, + "loss": 0.2985, + "losses/dpo": 0.39724084734916687, + "losses/sft": 2.0060245990753174, + "losses/total": 0.39724084734916687, + "ref_logps/chosen": -38.386375427246094, + "ref_logps/rejected": -44.242923736572266, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.128556728363037, + "rewards/margins": 2.006269931793213, + "rewards/rejected": -4.13482666015625, + "step": 2786 + }, + { + "epoch": 2.63, + "grad_norm": 11.359419822692871, + "learning_rate": 6.820566631689402e-08, + "logps/chosen": -49.66535186767578, + "logps/rejected": -87.66726684570312, + "loss": 0.1295, + "losses/dpo": 0.1356457620859146, + "losses/sft": 1.7585225105285645, + "losses/total": 0.1356457620859146, + "ref_logps/chosen": -31.357789993286133, + "ref_logps/rejected": -42.3443603515625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.830756425857544, + "rewards/margins": 2.7015347480773926, + "rewards/rejected": -4.532291412353516, + "step": 2787 + }, + { + "epoch": 2.63, + "grad_norm": 17.60601234436035, + "learning_rate": 6.803077999300455e-08, + "logps/chosen": -55.6259765625, + "logps/rejected": -82.13127136230469, + "loss": 0.2109, + "losses/dpo": 0.14678038656711578, + "losses/sft": 1.7931146621704102, + "losses/total": 0.14678038656711578, + "ref_logps/chosen": -37.12821578979492, + "ref_logps/rejected": -44.43461227416992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8497759103775024, + "rewards/margins": 1.9198906421661377, + "rewards/rejected": -3.7696664333343506, + "step": 2788 + }, + { + "epoch": 2.63, + "grad_norm": 23.402408599853516, + "learning_rate": 6.785589366911507e-08, + "logps/chosen": -53.22080993652344, + "logps/rejected": -80.79360961914062, + "loss": 0.2426, + "losses/dpo": 0.07986205071210861, + "losses/sft": 1.734291911125183, + "losses/total": 0.07986205071210861, + "ref_logps/chosen": -31.365550994873047, + "ref_logps/rejected": -39.95213317871094, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.185525894165039, + "rewards/margins": 1.898621916770935, + "rewards/rejected": -4.084147930145264, + "step": 2789 + }, + { + "epoch": 2.63, + "grad_norm": 18.517105102539062, + "learning_rate": 6.76810073452256e-08, + "logps/chosen": -63.89511489868164, + "logps/rejected": -76.96041870117188, + "loss": 0.1742, + "losses/dpo": 0.12401974201202393, + "losses/sft": 2.07351016998291, + "losses/total": 0.12401974201202393, + "ref_logps/chosen": -43.70576858520508, + "ref_logps/rejected": -34.34823226928711, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.018934726715088, + "rewards/margins": 2.242283821105957, + "rewards/rejected": -4.261219024658203, + "step": 2790 + }, + { + "epoch": 2.64, + "grad_norm": 28.592937469482422, + "learning_rate": 6.750612102133612e-08, + "logps/chosen": -72.5536117553711, + "logps/rejected": -108.85317993164062, + "loss": 0.3279, + "losses/dpo": 0.13034898042678833, + "losses/sft": 2.3115530014038086, + "losses/total": 0.13034898042678833, + "ref_logps/chosen": -46.214637756347656, + "ref_logps/rejected": -54.15791320800781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6338977813720703, + "rewards/margins": 2.8356289863586426, + "rewards/rejected": -5.469526290893555, + "step": 2791 + }, + { + "epoch": 2.64, + "grad_norm": 16.924577713012695, + "learning_rate": 6.733123469744666e-08, + "logps/chosen": -65.43248748779297, + "logps/rejected": -96.6303939819336, + "loss": 0.1932, + "losses/dpo": 0.2527864873409271, + "losses/sft": 1.8339967727661133, + "losses/total": 0.2527864873409271, + "ref_logps/chosen": -42.82279586791992, + "ref_logps/rejected": -53.38432312011719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2609691619873047, + "rewards/margins": 2.063638210296631, + "rewards/rejected": -4.3246073722839355, + "step": 2792 + }, + { + "epoch": 2.64, + "grad_norm": 34.77162170410156, + "learning_rate": 6.715634837355719e-08, + "logps/chosen": -60.72416687011719, + "logps/rejected": -77.06990814208984, + "loss": 0.3762, + "losses/dpo": 0.4854225516319275, + "losses/sft": 2.3445606231689453, + "losses/total": 0.4854225516319275, + "ref_logps/chosen": -37.12005615234375, + "ref_logps/rejected": -36.52295684814453, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3604109287261963, + "rewards/margins": 1.6942837238311768, + "rewards/rejected": -4.054694652557373, + "step": 2793 + }, + { + "epoch": 2.64, + "grad_norm": 30.284433364868164, + "learning_rate": 6.698146204966771e-08, + "logps/chosen": -72.28326416015625, + "logps/rejected": -96.45884704589844, + "loss": 0.2627, + "losses/dpo": 0.37639346718788147, + "losses/sft": 1.7413197755813599, + "losses/total": 0.37639346718788147, + "ref_logps/chosen": -47.78852844238281, + "ref_logps/rejected": -47.57475662231445, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4494733810424805, + "rewards/margins": 2.4389357566833496, + "rewards/rejected": -4.88840913772583, + "step": 2794 + }, + { + "epoch": 2.64, + "grad_norm": 21.282243728637695, + "learning_rate": 6.680657572577825e-08, + "logps/chosen": -54.01810073852539, + "logps/rejected": -71.20744323730469, + "loss": 0.2563, + "losses/dpo": 0.33943137526512146, + "losses/sft": 2.083404064178467, + "losses/total": 0.33943137526512146, + "ref_logps/chosen": -36.095115661621094, + "ref_logps/rejected": -33.575401306152344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7922985553741455, + "rewards/margins": 1.9709055423736572, + "rewards/rejected": -3.7632040977478027, + "step": 2795 + }, + { + "epoch": 2.64, + "grad_norm": 31.79059600830078, + "learning_rate": 6.663168940188877e-08, + "logps/chosen": -60.04279708862305, + "logps/rejected": -81.55189514160156, + "loss": 0.3565, + "losses/dpo": 0.6937614679336548, + "losses/sft": 1.5668808221817017, + "losses/total": 0.6937614679336548, + "ref_logps/chosen": -37.2183952331543, + "ref_logps/rejected": -44.807838439941406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.282440423965454, + "rewards/margins": 1.3919651508331299, + "rewards/rejected": -3.674405574798584, + "step": 2796 + }, + { + "epoch": 2.64, + "grad_norm": 20.556947708129883, + "learning_rate": 6.645680307799929e-08, + "logps/chosen": -60.282859802246094, + "logps/rejected": -89.69305419921875, + "loss": 0.2597, + "losses/dpo": 0.2210417240858078, + "losses/sft": 2.085726499557495, + "losses/total": 0.2210417240858078, + "ref_logps/chosen": -37.647403717041016, + "ref_logps/rejected": -46.58287811279297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.263545513153076, + "rewards/margins": 2.047470808029175, + "rewards/rejected": -4.31101655960083, + "step": 2797 + }, + { + "epoch": 2.64, + "grad_norm": 23.896883010864258, + "learning_rate": 6.628191675410982e-08, + "logps/chosen": -53.391578674316406, + "logps/rejected": -82.85118865966797, + "loss": 0.1982, + "losses/dpo": 0.3769487738609314, + "losses/sft": 1.865683913230896, + "losses/total": 0.3769487738609314, + "ref_logps/chosen": -32.8033561706543, + "ref_logps/rejected": -36.25730895996094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0588221549987793, + "rewards/margins": 2.6005661487579346, + "rewards/rejected": -4.659388542175293, + "step": 2798 + }, + { + "epoch": 2.64, + "grad_norm": 20.717090606689453, + "learning_rate": 6.610703043022036e-08, + "logps/chosen": -44.62938690185547, + "logps/rejected": -86.13666534423828, + "loss": 0.198, + "losses/dpo": 0.32698315382003784, + "losses/sft": 1.6401935815811157, + "losses/total": 0.32698315382003784, + "ref_logps/chosen": -29.89868927001953, + "ref_logps/rejected": -41.154136657714844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.473069429397583, + "rewards/margins": 3.02518367767334, + "rewards/rejected": -4.498252868652344, + "step": 2799 + }, + { + "epoch": 2.64, + "grad_norm": 31.287242889404297, + "learning_rate": 6.593214410633088e-08, + "logps/chosen": -66.99874877929688, + "logps/rejected": -83.04682159423828, + "loss": 0.4646, + "losses/dpo": 0.29867488145828247, + "losses/sft": 2.061471462249756, + "losses/total": 0.29867488145828247, + "ref_logps/chosen": -41.9615364074707, + "ref_logps/rejected": -43.84099578857422, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.503721237182617, + "rewards/margins": 1.4168610572814941, + "rewards/rejected": -3.9205825328826904, + "step": 2800 + }, + { + "epoch": 2.64, + "grad_norm": 20.528745651245117, + "learning_rate": 6.575725778244141e-08, + "logps/chosen": -61.36975860595703, + "logps/rejected": -82.8834457397461, + "loss": 0.1926, + "losses/dpo": 0.13481679558753967, + "losses/sft": 2.364569902420044, + "losses/total": 0.13481679558753967, + "ref_logps/chosen": -38.4459114074707, + "ref_logps/rejected": -39.14704132080078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2923851013183594, + "rewards/margins": 2.081256151199341, + "rewards/rejected": -4.373641014099121, + "step": 2801 + }, + { + "epoch": 2.65, + "grad_norm": 30.4796085357666, + "learning_rate": 6.558237145855195e-08, + "logps/chosen": -70.28099060058594, + "logps/rejected": -87.71537017822266, + "loss": 0.3285, + "losses/dpo": 0.19485768675804138, + "losses/sft": 2.2904770374298096, + "losses/total": 0.19485768675804138, + "ref_logps/chosen": -47.85303497314453, + "ref_logps/rejected": -45.99406051635742, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.242794990539551, + "rewards/margins": 1.929335594177246, + "rewards/rejected": -4.172130584716797, + "step": 2802 + }, + { + "epoch": 2.65, + "grad_norm": 22.948678970336914, + "learning_rate": 6.540748513466246e-08, + "logps/chosen": -66.8612060546875, + "logps/rejected": -85.68760681152344, + "loss": 0.274, + "losses/dpo": 0.12161294370889664, + "losses/sft": 2.5325162410736084, + "losses/total": 0.12161294370889664, + "ref_logps/chosen": -39.060401916503906, + "ref_logps/rejected": -38.514678955078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.780081033706665, + "rewards/margins": 1.9372124671936035, + "rewards/rejected": -4.717293739318848, + "step": 2803 + }, + { + "epoch": 2.65, + "grad_norm": 29.647390365600586, + "learning_rate": 6.523259881077299e-08, + "logps/chosen": -45.707313537597656, + "logps/rejected": -62.48001480102539, + "loss": 0.4069, + "losses/dpo": 0.22883006930351257, + "losses/sft": 1.4940004348754883, + "losses/total": 0.22883006930351257, + "ref_logps/chosen": -30.91168975830078, + "ref_logps/rejected": -33.89064025878906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4795620441436768, + "rewards/margins": 1.37937593460083, + "rewards/rejected": -2.858937978744507, + "step": 2804 + }, + { + "epoch": 2.65, + "grad_norm": 13.591337203979492, + "learning_rate": 6.505771248688352e-08, + "logps/chosen": -56.497833251953125, + "logps/rejected": -83.28926849365234, + "loss": 0.1288, + "losses/dpo": 0.09809619188308716, + "losses/sft": 1.549083948135376, + "losses/total": 0.09809619188308716, + "ref_logps/chosen": -41.87842559814453, + "ref_logps/rejected": -43.66329574584961, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4619405269622803, + "rewards/margins": 2.5006566047668457, + "rewards/rejected": -3.962597131729126, + "step": 2805 + }, + { + "epoch": 2.65, + "grad_norm": 17.058685302734375, + "learning_rate": 6.488282616299405e-08, + "logps/chosen": -57.05895233154297, + "logps/rejected": -105.78681182861328, + "loss": 0.1118, + "losses/dpo": 0.08921144902706146, + "losses/sft": 2.2917540073394775, + "losses/total": 0.08921144902706146, + "ref_logps/chosen": -36.44682312011719, + "ref_logps/rejected": -56.53832244873047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0612125396728516, + "rewards/margins": 2.863636016845703, + "rewards/rejected": -4.924848556518555, + "step": 2806 + }, + { + "epoch": 2.65, + "grad_norm": 24.081478118896484, + "learning_rate": 6.470793983910458e-08, + "logps/chosen": -67.83106994628906, + "logps/rejected": -102.46758270263672, + "loss": 0.2318, + "losses/dpo": 0.38107314705848694, + "losses/sft": 2.170287847518921, + "losses/total": 0.38107314705848694, + "ref_logps/chosen": -43.02593994140625, + "ref_logps/rejected": -57.54987335205078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4805126190185547, + "rewards/margins": 2.0112593173980713, + "rewards/rejected": -4.491771697998047, + "step": 2807 + }, + { + "epoch": 2.65, + "grad_norm": 41.07267761230469, + "learning_rate": 6.453305351521512e-08, + "logps/chosen": -56.034263610839844, + "logps/rejected": -74.24307250976562, + "loss": 0.422, + "losses/dpo": 0.7111890912055969, + "losses/sft": 2.3105666637420654, + "losses/total": 0.7111890912055969, + "ref_logps/chosen": -36.072364807128906, + "ref_logps/rejected": -38.674285888671875, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9961903095245361, + "rewards/margins": 1.5606892108917236, + "rewards/rejected": -3.5568795204162598, + "step": 2808 + }, + { + "epoch": 2.65, + "grad_norm": 18.853776931762695, + "learning_rate": 6.435816719132564e-08, + "logps/chosen": -61.940704345703125, + "logps/rejected": -97.80964660644531, + "loss": 0.1706, + "losses/dpo": 0.09961122274398804, + "losses/sft": 2.048339605331421, + "losses/total": 0.09961122274398804, + "ref_logps/chosen": -40.614463806152344, + "ref_logps/rejected": -55.26185989379883, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1326239109039307, + "rewards/margins": 2.122154712677002, + "rewards/rejected": -4.254778861999512, + "step": 2809 + }, + { + "epoch": 2.65, + "grad_norm": 18.57338523864746, + "learning_rate": 6.418328086743615e-08, + "logps/chosen": -58.30417251586914, + "logps/rejected": -106.75396728515625, + "loss": 0.1749, + "losses/dpo": 0.11486898362636566, + "losses/sft": 2.597902536392212, + "losses/total": 0.11486898362636566, + "ref_logps/chosen": -38.71113586425781, + "ref_logps/rejected": -59.33735656738281, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.959303617477417, + "rewards/margins": 2.782358407974243, + "rewards/rejected": -4.74166202545166, + "step": 2810 + }, + { + "epoch": 2.65, + "grad_norm": 17.233592987060547, + "learning_rate": 6.400839454354669e-08, + "logps/chosen": -55.027259826660156, + "logps/rejected": -78.33998107910156, + "loss": 0.1425, + "losses/dpo": 0.08399466425180435, + "losses/sft": 1.9579206705093384, + "losses/total": 0.08399466425180435, + "ref_logps/chosen": -37.600929260253906, + "ref_logps/rejected": -34.71483612060547, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7426331043243408, + "rewards/margins": 2.6198816299438477, + "rewards/rejected": -4.362514495849609, + "step": 2811 + }, + { + "epoch": 2.66, + "grad_norm": 26.96534538269043, + "learning_rate": 6.383350821965722e-08, + "logps/chosen": -54.305416107177734, + "logps/rejected": -74.97158813476562, + "loss": 0.305, + "losses/dpo": 0.39059174060821533, + "losses/sft": 1.5081398487091064, + "losses/total": 0.39059174060821533, + "ref_logps/chosen": -31.655078887939453, + "ref_logps/rejected": -34.95521926879883, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.265033721923828, + "rewards/margins": 1.7366034984588623, + "rewards/rejected": -4.001636981964111, + "step": 2812 + }, + { + "epoch": 2.66, + "grad_norm": 16.062406539916992, + "learning_rate": 6.365862189576775e-08, + "logps/chosen": -57.12964630126953, + "logps/rejected": -101.63590240478516, + "loss": 0.0998, + "losses/dpo": 0.059631701558828354, + "losses/sft": 2.075115442276001, + "losses/total": 0.059631701558828354, + "ref_logps/chosen": -36.83037567138672, + "ref_logps/rejected": -46.61079788208008, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0299272537231445, + "rewards/margins": 3.4725828170776367, + "rewards/rejected": -5.502510070800781, + "step": 2813 + }, + { + "epoch": 2.66, + "grad_norm": 12.354023933410645, + "learning_rate": 6.348373557187827e-08, + "logps/chosen": -65.61959838867188, + "logps/rejected": -94.36149597167969, + "loss": 0.1151, + "losses/dpo": 0.20401525497436523, + "losses/sft": 2.2358593940734863, + "losses/total": 0.20401525497436523, + "ref_logps/chosen": -46.7528076171875, + "ref_logps/rejected": -49.06699752807617, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.88667893409729, + "rewards/margins": 2.642770767211914, + "rewards/rejected": -4.529449462890625, + "step": 2814 + }, + { + "epoch": 2.66, + "grad_norm": 34.59256362915039, + "learning_rate": 6.330884924798881e-08, + "logps/chosen": -61.12478256225586, + "logps/rejected": -85.32469177246094, + "loss": 0.2882, + "losses/dpo": 0.08062613755464554, + "losses/sft": 2.407926321029663, + "losses/total": 0.08062613755464554, + "ref_logps/chosen": -40.063072204589844, + "ref_logps/rejected": -45.3946533203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.106171131134033, + "rewards/margins": 1.8868328332901, + "rewards/rejected": -3.993004083633423, + "step": 2815 + }, + { + "epoch": 2.66, + "grad_norm": 14.632702827453613, + "learning_rate": 6.313396292409934e-08, + "logps/chosen": -53.33955001831055, + "logps/rejected": -83.21797180175781, + "loss": 0.1651, + "losses/dpo": 0.2551000118255615, + "losses/sft": 2.0775253772735596, + "losses/total": 0.2551000118255615, + "ref_logps/chosen": -37.003536224365234, + "ref_logps/rejected": -46.398101806640625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6336013078689575, + "rewards/margins": 2.0483856201171875, + "rewards/rejected": -3.6819865703582764, + "step": 2816 + }, + { + "epoch": 2.66, + "grad_norm": 21.115245819091797, + "learning_rate": 6.295907660020985e-08, + "logps/chosen": -54.940757751464844, + "logps/rejected": -87.11189270019531, + "loss": 0.2, + "losses/dpo": 0.08007544279098511, + "losses/sft": 2.2354846000671387, + "losses/total": 0.08007544279098511, + "ref_logps/chosen": -33.57972717285156, + "ref_logps/rejected": -42.48321533203125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1361031532287598, + "rewards/margins": 2.326765537261963, + "rewards/rejected": -4.462868690490723, + "step": 2817 + }, + { + "epoch": 2.66, + "grad_norm": 24.38103485107422, + "learning_rate": 6.278419027632039e-08, + "logps/chosen": -53.5403938293457, + "logps/rejected": -70.36354064941406, + "loss": 0.3236, + "losses/dpo": 0.4314962923526764, + "losses/sft": 1.524219274520874, + "losses/total": 0.4314962923526764, + "ref_logps/chosen": -33.54059600830078, + "ref_logps/rejected": -35.4715461730957, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9999796152114868, + "rewards/margins": 1.489220142364502, + "rewards/rejected": -3.489199638366699, + "step": 2818 + }, + { + "epoch": 2.66, + "grad_norm": 13.823323249816895, + "learning_rate": 6.260930395243091e-08, + "logps/chosen": -52.77983856201172, + "logps/rejected": -78.38325500488281, + "loss": 0.1391, + "losses/dpo": 0.06568809598684311, + "losses/sft": 1.9338798522949219, + "losses/total": 0.06568809598684311, + "ref_logps/chosen": -32.31357955932617, + "ref_logps/rejected": -33.70201873779297, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.046626091003418, + "rewards/margins": 2.4214982986450195, + "rewards/rejected": -4.4681243896484375, + "step": 2819 + }, + { + "epoch": 2.66, + "grad_norm": 26.558530807495117, + "learning_rate": 6.243441762854144e-08, + "logps/chosen": -47.277793884277344, + "logps/rejected": -95.91194152832031, + "loss": 0.3083, + "losses/dpo": 0.09661128371953964, + "losses/sft": 1.964137077331543, + "losses/total": 0.09661128371953964, + "ref_logps/chosen": -28.412307739257812, + "ref_logps/rejected": -50.79615783691406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8865485191345215, + "rewards/margins": 2.6250295639038086, + "rewards/rejected": -4.511578559875488, + "step": 2820 + }, + { + "epoch": 2.66, + "grad_norm": 24.625911712646484, + "learning_rate": 6.225953130465198e-08, + "logps/chosen": -53.90091323852539, + "logps/rejected": -79.53909301757812, + "loss": 0.2804, + "losses/dpo": 0.41780102252960205, + "losses/sft": 1.9195451736450195, + "losses/total": 0.41780102252960205, + "ref_logps/chosen": -34.44166564941406, + "ref_logps/rejected": -38.059112548828125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9459251165390015, + "rewards/margins": 2.202073574066162, + "rewards/rejected": -4.147998809814453, + "step": 2821 + }, + { + "epoch": 2.66, + "grad_norm": 45.21463394165039, + "learning_rate": 6.20846449807625e-08, + "logps/chosen": -64.99295043945312, + "logps/rejected": -77.49156188964844, + "loss": 0.3923, + "losses/dpo": 0.5282461047172546, + "losses/sft": 1.621016502380371, + "losses/total": 0.5282461047172546, + "ref_logps/chosen": -45.96197509765625, + "ref_logps/rejected": -41.91083526611328, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9030970335006714, + "rewards/margins": 1.6549763679504395, + "rewards/rejected": -3.5580732822418213, + "step": 2822 + }, + { + "epoch": 2.67, + "grad_norm": 33.73280334472656, + "learning_rate": 6.190975865687303e-08, + "logps/chosen": -68.11634063720703, + "logps/rejected": -94.88563537597656, + "loss": 0.3487, + "losses/dpo": 0.04625094681978226, + "losses/sft": 1.466246485710144, + "losses/total": 0.04625094681978226, + "ref_logps/chosen": -40.46844482421875, + "ref_logps/rejected": -49.142311096191406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.76478910446167, + "rewards/margins": 1.8095433712005615, + "rewards/rejected": -4.5743327140808105, + "step": 2823 + }, + { + "epoch": 2.67, + "grad_norm": 25.91737174987793, + "learning_rate": 6.173487233298356e-08, + "logps/chosen": -71.0357437133789, + "logps/rejected": -91.23492431640625, + "loss": 0.3423, + "losses/dpo": 0.03390391170978546, + "losses/sft": 1.9826462268829346, + "losses/total": 0.03390391170978546, + "ref_logps/chosen": -48.21184539794922, + "ref_logps/rejected": -46.81634521484375, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2823901176452637, + "rewards/margins": 2.1594676971435547, + "rewards/rejected": -4.44185733795166, + "step": 2824 + }, + { + "epoch": 2.67, + "grad_norm": 32.87423324584961, + "learning_rate": 6.155998600909408e-08, + "logps/chosen": -66.14747619628906, + "logps/rejected": -84.25035095214844, + "loss": 0.3121, + "losses/dpo": 0.27095383405685425, + "losses/sft": 2.4791924953460693, + "losses/total": 0.27095383405685425, + "ref_logps/chosen": -40.39013671875, + "ref_logps/rejected": -40.88392639160156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5757346153259277, + "rewards/margins": 1.7609083652496338, + "rewards/rejected": -4.336643218994141, + "step": 2825 + }, + { + "epoch": 2.67, + "grad_norm": 22.894607543945312, + "learning_rate": 6.138509968520461e-08, + "logps/chosen": -59.413307189941406, + "logps/rejected": -95.7797622680664, + "loss": 0.1864, + "losses/dpo": 0.08187133818864822, + "losses/sft": 2.186403512954712, + "losses/total": 0.08187133818864822, + "ref_logps/chosen": -37.663307189941406, + "ref_logps/rejected": -46.59320068359375, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.174999952316284, + "rewards/margins": 2.7436561584472656, + "rewards/rejected": -4.918656349182129, + "step": 2826 + }, + { + "epoch": 2.67, + "grad_norm": 15.680797576904297, + "learning_rate": 6.121021336131515e-08, + "logps/chosen": -55.5383186340332, + "logps/rejected": -89.84370422363281, + "loss": 0.1492, + "losses/dpo": 0.2526828944683075, + "losses/sft": 2.1954054832458496, + "losses/total": 0.2526828944683075, + "ref_logps/chosen": -34.80877685546875, + "ref_logps/rejected": -45.156471252441406, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0729541778564453, + "rewards/margins": 2.3957691192626953, + "rewards/rejected": -4.468723297119141, + "step": 2827 + }, + { + "epoch": 2.67, + "grad_norm": 13.249262809753418, + "learning_rate": 6.103532703742567e-08, + "logps/chosen": -45.910484313964844, + "logps/rejected": -83.5672607421875, + "loss": 0.1422, + "losses/dpo": 0.2480028122663498, + "losses/sft": 1.4911564588546753, + "losses/total": 0.2480028122663498, + "ref_logps/chosen": -31.42209243774414, + "ref_logps/rejected": -43.59312057495117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4488391876220703, + "rewards/margins": 2.548574924468994, + "rewards/rejected": -3.9974141120910645, + "step": 2828 + }, + { + "epoch": 2.67, + "grad_norm": 28.451623916625977, + "learning_rate": 6.08604407135362e-08, + "logps/chosen": -54.65733337402344, + "logps/rejected": -69.976806640625, + "loss": 0.3127, + "losses/dpo": 0.296722948551178, + "losses/sft": 1.2962543964385986, + "losses/total": 0.296722948551178, + "ref_logps/chosen": -36.54468536376953, + "ref_logps/rejected": -34.26911163330078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.811265230178833, + "rewards/margins": 1.7595045566558838, + "rewards/rejected": -3.5707693099975586, + "step": 2829 + }, + { + "epoch": 2.67, + "grad_norm": 11.739123344421387, + "learning_rate": 6.068555438964672e-08, + "logps/chosen": -51.673057556152344, + "logps/rejected": -89.33514404296875, + "loss": 0.1661, + "losses/dpo": 0.3497820794582367, + "losses/sft": 1.7831809520721436, + "losses/total": 0.3497820794582367, + "ref_logps/chosen": -33.37078857421875, + "ref_logps/rejected": -47.4122314453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8302268981933594, + "rewards/margins": 2.3620643615722656, + "rewards/rejected": -4.192291259765625, + "step": 2830 + }, + { + "epoch": 2.67, + "grad_norm": 24.126773834228516, + "learning_rate": 6.051066806575725e-08, + "logps/chosen": -49.084659576416016, + "logps/rejected": -73.62368774414062, + "loss": 0.3012, + "losses/dpo": 0.44974058866500854, + "losses/sft": 1.5654181241989136, + "losses/total": 0.44974058866500854, + "ref_logps/chosen": -33.50338363647461, + "ref_logps/rejected": -40.60755920410156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5581278800964355, + "rewards/margins": 1.7434853315353394, + "rewards/rejected": -3.3016133308410645, + "step": 2831 + }, + { + "epoch": 2.67, + "grad_norm": 31.02278709411621, + "learning_rate": 6.033578174186779e-08, + "logps/chosen": -59.54981231689453, + "logps/rejected": -81.86512756347656, + "loss": 0.2806, + "losses/dpo": 0.250593900680542, + "losses/sft": 2.108398675918579, + "losses/total": 0.250593900680542, + "ref_logps/chosen": -40.206787109375, + "ref_logps/rejected": -40.99497604370117, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.934302806854248, + "rewards/margins": 2.15271258354187, + "rewards/rejected": -4.087015628814697, + "step": 2832 + }, + { + "epoch": 2.68, + "grad_norm": 35.50676345825195, + "learning_rate": 6.01608954179783e-08, + "logps/chosen": -60.410980224609375, + "logps/rejected": -84.25567626953125, + "loss": 0.4141, + "losses/dpo": 0.9166530966758728, + "losses/sft": 2.4444801807403564, + "losses/total": 0.9166530966758728, + "ref_logps/chosen": -38.02571487426758, + "ref_logps/rejected": -43.96631622314453, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2385263442993164, + "rewards/margins": 1.7904090881347656, + "rewards/rejected": -4.028935432434082, + "step": 2833 + }, + { + "epoch": 2.68, + "grad_norm": 18.488130569458008, + "learning_rate": 5.998600909408884e-08, + "logps/chosen": -53.45903015136719, + "logps/rejected": -96.04625701904297, + "loss": 0.1785, + "losses/dpo": 0.09850554168224335, + "losses/sft": 2.2874128818511963, + "losses/total": 0.09850554168224335, + "ref_logps/chosen": -36.234947204589844, + "ref_logps/rejected": -50.32763671875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7224082946777344, + "rewards/margins": 2.849454402923584, + "rewards/rejected": -4.571862697601318, + "step": 2834 + }, + { + "epoch": 2.68, + "grad_norm": 24.639921188354492, + "learning_rate": 5.981112277019937e-08, + "logps/chosen": -57.561161041259766, + "logps/rejected": -96.45553588867188, + "loss": 0.2501, + "losses/dpo": 0.1356874406337738, + "losses/sft": 2.844369888305664, + "losses/total": 0.1356874406337738, + "ref_logps/chosen": -34.99861145019531, + "ref_logps/rejected": -50.56623840332031, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2562549114227295, + "rewards/margins": 2.332674503326416, + "rewards/rejected": -4.588929176330566, + "step": 2835 + }, + { + "epoch": 2.68, + "grad_norm": 16.97640609741211, + "learning_rate": 5.963623644630989e-08, + "logps/chosen": -53.053009033203125, + "logps/rejected": -96.39279174804688, + "loss": 0.1831, + "losses/dpo": 0.2519689202308655, + "losses/sft": 2.359346866607666, + "losses/total": 0.2519689202308655, + "ref_logps/chosen": -34.11677932739258, + "ref_logps/rejected": -51.430335998535156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8936229944229126, + "rewards/margins": 2.6026225090026855, + "rewards/rejected": -4.496245384216309, + "step": 2836 + }, + { + "epoch": 2.68, + "grad_norm": 22.97124481201172, + "learning_rate": 5.946135012242042e-08, + "logps/chosen": -58.60469055175781, + "logps/rejected": -86.85208892822266, + "loss": 0.1883, + "losses/dpo": 0.19095231592655182, + "losses/sft": 2.2699618339538574, + "losses/total": 0.19095231592655182, + "ref_logps/chosen": -36.17578125, + "ref_logps/rejected": -43.6269645690918, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2428905963897705, + "rewards/margins": 2.0796217918395996, + "rewards/rejected": -4.322512626647949, + "step": 2837 + }, + { + "epoch": 2.68, + "grad_norm": 13.44681453704834, + "learning_rate": 5.928646379853095e-08, + "logps/chosen": -60.35519790649414, + "logps/rejected": -97.99239349365234, + "loss": 0.1259, + "losses/dpo": 0.1397063136100769, + "losses/sft": 1.6836830377578735, + "losses/total": 0.1397063136100769, + "ref_logps/chosen": -42.25647735595703, + "ref_logps/rejected": -51.19499969482422, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8098721504211426, + "rewards/margins": 2.8698675632476807, + "rewards/rejected": -4.679739952087402, + "step": 2838 + }, + { + "epoch": 2.68, + "grad_norm": 25.163185119628906, + "learning_rate": 5.9111577474641484e-08, + "logps/chosen": -56.017662048339844, + "logps/rejected": -73.30323791503906, + "loss": 0.2781, + "losses/dpo": 0.09470625221729279, + "losses/sft": 1.7585679292678833, + "losses/total": 0.09470625221729279, + "ref_logps/chosen": -35.404571533203125, + "ref_logps/rejected": -35.85308074951172, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.061309576034546, + "rewards/margins": 1.6837064027786255, + "rewards/rejected": -3.7450156211853027, + "step": 2839 + }, + { + "epoch": 2.68, + "grad_norm": 18.755857467651367, + "learning_rate": 5.893669115075201e-08, + "logps/chosen": -52.348114013671875, + "logps/rejected": -86.44564819335938, + "loss": 0.2045, + "losses/dpo": 0.29966428875923157, + "losses/sft": 2.5235798358917236, + "losses/total": 0.29966428875923157, + "ref_logps/chosen": -37.292572021484375, + "ref_logps/rejected": -49.929527282714844, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.505554437637329, + "rewards/margins": 2.1460580825805664, + "rewards/rejected": -3.6516122817993164, + "step": 2840 + }, + { + "epoch": 2.68, + "grad_norm": 15.73888874053955, + "learning_rate": 5.8761804826862536e-08, + "logps/chosen": -63.68634033203125, + "logps/rejected": -88.6485595703125, + "loss": 0.1394, + "losses/dpo": 0.16902875900268555, + "losses/sft": 1.6753729581832886, + "losses/total": 0.16902875900268555, + "ref_logps/chosen": -45.48716735839844, + "ref_logps/rejected": -44.093048095703125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8199174404144287, + "rewards/margins": 2.6356332302093506, + "rewards/rejected": -4.455550670623779, + "step": 2841 + }, + { + "epoch": 2.68, + "grad_norm": 19.015249252319336, + "learning_rate": 5.858691850297306e-08, + "logps/chosen": -60.6126708984375, + "logps/rejected": -75.68568420410156, + "loss": 0.1691, + "losses/dpo": 0.32353365421295166, + "losses/sft": 1.8243657350540161, + "losses/total": 0.32353365421295166, + "ref_logps/chosen": -45.986183166503906, + "ref_logps/rejected": -38.06608963012695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.462648630142212, + "rewards/margins": 2.2993111610412598, + "rewards/rejected": -3.7619595527648926, + "step": 2842 + }, + { + "epoch": 2.68, + "grad_norm": 26.87492561340332, + "learning_rate": 5.8412032179083594e-08, + "logps/chosen": -47.727516174316406, + "logps/rejected": -100.16867065429688, + "loss": 0.2355, + "losses/dpo": 0.6074020862579346, + "losses/sft": 2.9043121337890625, + "losses/total": 0.6074020862579346, + "ref_logps/chosen": -31.07196807861328, + "ref_logps/rejected": -55.296180725097656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6655550003051758, + "rewards/margins": 2.8216946125030518, + "rewards/rejected": -4.487249374389648, + "step": 2843 + }, + { + "epoch": 2.69, + "grad_norm": 30.815824508666992, + "learning_rate": 5.823714585519412e-08, + "logps/chosen": -54.60810852050781, + "logps/rejected": -84.23872375488281, + "loss": 0.3046, + "losses/dpo": 0.07891845703125, + "losses/sft": 2.2529456615448, + "losses/total": 0.07891845703125, + "ref_logps/chosen": -33.00050354003906, + "ref_logps/rejected": -39.17915725708008, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1607604026794434, + "rewards/margins": 2.3451969623565674, + "rewards/rejected": -4.50595760345459, + "step": 2844 + }, + { + "epoch": 2.69, + "grad_norm": 26.856121063232422, + "learning_rate": 5.8062259531304646e-08, + "logps/chosen": -61.337127685546875, + "logps/rejected": -88.42852783203125, + "loss": 0.3975, + "losses/dpo": 0.7157143950462341, + "losses/sft": 2.255458354949951, + "losses/total": 0.7157143950462341, + "ref_logps/chosen": -39.10514831542969, + "ref_logps/rejected": -49.1973762512207, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.223198413848877, + "rewards/margins": 1.6999170780181885, + "rewards/rejected": -3.9231152534484863, + "step": 2845 + }, + { + "epoch": 2.69, + "grad_norm": 12.56474494934082, + "learning_rate": 5.788737320741518e-08, + "logps/chosen": -43.921138763427734, + "logps/rejected": -80.57787322998047, + "loss": 0.1118, + "losses/dpo": 0.1052473857998848, + "losses/sft": 1.0585225820541382, + "losses/total": 0.1052473857998848, + "ref_logps/chosen": -30.996601104736328, + "ref_logps/rejected": -40.49279022216797, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2924540042877197, + "rewards/margins": 2.7160539627075195, + "rewards/rejected": -4.00850772857666, + "step": 2846 + }, + { + "epoch": 2.69, + "grad_norm": 23.446834564208984, + "learning_rate": 5.771248688352571e-08, + "logps/chosen": -75.3303451538086, + "logps/rejected": -99.12146759033203, + "loss": 0.231, + "losses/dpo": 0.17752321064472198, + "losses/sft": 2.1565511226654053, + "losses/total": 0.17752321064472198, + "ref_logps/chosen": -47.45655059814453, + "ref_logps/rejected": -48.7686767578125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7873799800872803, + "rewards/margins": 2.2478995323181152, + "rewards/rejected": -5.035279750823975, + "step": 2847 + }, + { + "epoch": 2.69, + "grad_norm": 18.20689582824707, + "learning_rate": 5.753760055963623e-08, + "logps/chosen": -52.07342529296875, + "logps/rejected": -78.18859100341797, + "loss": 0.1835, + "losses/dpo": 0.09994463622570038, + "losses/sft": 1.597030520439148, + "losses/total": 0.09994463622570038, + "ref_logps/chosen": -37.117149353027344, + "ref_logps/rejected": -39.53816604614258, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4956278800964355, + "rewards/margins": 2.369415283203125, + "rewards/rejected": -3.8650431632995605, + "step": 2848 + }, + { + "epoch": 2.69, + "grad_norm": 12.670719146728516, + "learning_rate": 5.736271423574676e-08, + "logps/chosen": -38.847774505615234, + "logps/rejected": -80.98487854003906, + "loss": 0.129, + "losses/dpo": 0.19369924068450928, + "losses/sft": 1.7729889154434204, + "losses/total": 0.19369924068450928, + "ref_logps/chosen": -28.20480728149414, + "ref_logps/rejected": -43.106536865234375, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0642967224121094, + "rewards/margins": 2.7235376834869385, + "rewards/rejected": -3.787834405899048, + "step": 2849 + }, + { + "epoch": 2.69, + "grad_norm": 19.287364959716797, + "learning_rate": 5.7187827911857295e-08, + "logps/chosen": -57.862728118896484, + "logps/rejected": -91.24444580078125, + "loss": 0.1451, + "losses/dpo": 0.10392201691865921, + "losses/sft": 1.9286572933197021, + "losses/total": 0.10392201691865921, + "ref_logps/chosen": -40.45824432373047, + "ref_logps/rejected": -46.64256286621094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7404483556747437, + "rewards/margins": 2.719740152359009, + "rewards/rejected": -4.460188865661621, + "step": 2850 + }, + { + "epoch": 2.69, + "grad_norm": 18.712642669677734, + "learning_rate": 5.7012941587967815e-08, + "logps/chosen": -58.4813346862793, + "logps/rejected": -97.10137176513672, + "loss": 0.1882, + "losses/dpo": 0.29807591438293457, + "losses/sft": 1.7365291118621826, + "losses/total": 0.29807591438293457, + "ref_logps/chosen": -39.16337585449219, + "ref_logps/rejected": -51.623409271240234, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.931795597076416, + "rewards/margins": 2.616000175476074, + "rewards/rejected": -4.547796249389648, + "step": 2851 + }, + { + "epoch": 2.69, + "grad_norm": 32.193443298339844, + "learning_rate": 5.683805526407835e-08, + "logps/chosen": -59.583824157714844, + "logps/rejected": -75.64508819580078, + "loss": 0.3345, + "losses/dpo": 0.35838064551353455, + "losses/sft": 2.1990456581115723, + "losses/total": 0.35838064551353455, + "ref_logps/chosen": -39.830238342285156, + "ref_logps/rejected": -39.453948974609375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9753586053848267, + "rewards/margins": 1.6437549591064453, + "rewards/rejected": -3.6191134452819824, + "step": 2852 + }, + { + "epoch": 2.69, + "grad_norm": 21.557411193847656, + "learning_rate": 5.666316894018887e-08, + "logps/chosen": -52.954158782958984, + "logps/rejected": -79.87135314941406, + "loss": 0.2439, + "losses/dpo": 0.28020960092544556, + "losses/sft": 2.0722146034240723, + "losses/total": 0.28020960092544556, + "ref_logps/chosen": -33.953643798828125, + "ref_logps/rejected": -40.866371154785156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9000515937805176, + "rewards/margins": 2.0004472732543945, + "rewards/rejected": -3.900498867034912, + "step": 2853 + }, + { + "epoch": 2.69, + "grad_norm": 22.58413314819336, + "learning_rate": 5.6488282616299405e-08, + "logps/chosen": -57.0494384765625, + "logps/rejected": -80.87075805664062, + "loss": 0.2772, + "losses/dpo": 0.06300322711467743, + "losses/sft": 2.2173423767089844, + "losses/total": 0.06300322711467743, + "ref_logps/chosen": -32.17005920410156, + "ref_logps/rejected": -34.647682189941406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.487938404083252, + "rewards/margins": 2.134368896484375, + "rewards/rejected": -4.622306823730469, + "step": 2854 + }, + { + "epoch": 2.7, + "grad_norm": 17.20001792907715, + "learning_rate": 5.631339629240993e-08, + "logps/chosen": -68.9234390258789, + "logps/rejected": -94.60698699951172, + "loss": 0.1944, + "losses/dpo": 0.11728018522262573, + "losses/sft": 2.2426631450653076, + "losses/total": 0.11728018522262573, + "ref_logps/chosen": -46.50959014892578, + "ref_logps/rejected": -49.548004150390625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.241384983062744, + "rewards/margins": 2.2645134925842285, + "rewards/rejected": -4.505898475646973, + "step": 2855 + }, + { + "epoch": 2.7, + "grad_norm": 23.192480087280273, + "learning_rate": 5.613850996852046e-08, + "logps/chosen": -54.01043701171875, + "logps/rejected": -75.58695983886719, + "loss": 0.2488, + "losses/dpo": 0.3014264404773712, + "losses/sft": 1.4761443138122559, + "losses/total": 0.3014264404773712, + "ref_logps/chosen": -36.561363220214844, + "ref_logps/rejected": -37.017738342285156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7449076175689697, + "rewards/margins": 2.1120147705078125, + "rewards/rejected": -3.856922149658203, + "step": 2856 + }, + { + "epoch": 2.7, + "grad_norm": 19.56039047241211, + "learning_rate": 5.596362364463099e-08, + "logps/chosen": -49.30674743652344, + "logps/rejected": -83.095703125, + "loss": 0.1686, + "losses/dpo": 0.3388398289680481, + "losses/sft": 1.8534343242645264, + "losses/total": 0.3388398289680481, + "ref_logps/chosen": -35.508235931396484, + "ref_logps/rejected": -44.92726135253906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3798513412475586, + "rewards/margins": 2.4369935989379883, + "rewards/rejected": -3.816844940185547, + "step": 2857 + }, + { + "epoch": 2.7, + "grad_norm": 21.908437728881836, + "learning_rate": 5.578873732074151e-08, + "logps/chosen": -60.04517364501953, + "logps/rejected": -78.47991943359375, + "loss": 0.3128, + "losses/dpo": 0.4070974588394165, + "losses/sft": 1.322322964668274, + "losses/total": 0.4070974588394165, + "ref_logps/chosen": -39.38600158691406, + "ref_logps/rejected": -40.62982177734375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.065917491912842, + "rewards/margins": 1.7190923690795898, + "rewards/rejected": -3.7850098609924316, + "step": 2858 + }, + { + "epoch": 2.7, + "grad_norm": 17.502845764160156, + "learning_rate": 5.561385099685204e-08, + "logps/chosen": -62.451568603515625, + "logps/rejected": -91.19190216064453, + "loss": 0.1822, + "losses/dpo": 0.09059447795152664, + "losses/sft": 2.5058538913726807, + "losses/total": 0.09059447795152664, + "ref_logps/chosen": -42.02899169921875, + "ref_logps/rejected": -44.19563674926758, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0422580242156982, + "rewards/margins": 2.6573684215545654, + "rewards/rejected": -4.699626445770264, + "step": 2859 + }, + { + "epoch": 2.7, + "grad_norm": 31.259292602539062, + "learning_rate": 5.5438964672962574e-08, + "logps/chosen": -55.31084442138672, + "logps/rejected": -83.58088684082031, + "loss": 0.3248, + "losses/dpo": 0.25898510217666626, + "losses/sft": 1.929835557937622, + "losses/total": 0.25898510217666626, + "ref_logps/chosen": -35.6456413269043, + "ref_logps/rejected": -46.37433624267578, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.966520071029663, + "rewards/margins": 1.7541351318359375, + "rewards/rejected": -3.7206552028656006, + "step": 2860 + }, + { + "epoch": 2.7, + "grad_norm": 15.265108108520508, + "learning_rate": 5.52640783490731e-08, + "logps/chosen": -62.39719772338867, + "logps/rejected": -97.02392578125, + "loss": 0.1688, + "losses/dpo": 0.08139872550964355, + "losses/sft": 2.539510726928711, + "losses/total": 0.08139872550964355, + "ref_logps/chosen": -39.57550048828125, + "ref_logps/rejected": -46.76481628417969, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2821693420410156, + "rewards/margins": 2.743741512298584, + "rewards/rejected": -5.0259108543396, + "step": 2861 + }, + { + "epoch": 2.7, + "grad_norm": 25.72768783569336, + "learning_rate": 5.5089192025183626e-08, + "logps/chosen": -62.99943923950195, + "logps/rejected": -101.4157485961914, + "loss": 0.2289, + "losses/dpo": 0.444305956363678, + "losses/sft": 2.4931867122650146, + "losses/total": 0.444305956363678, + "ref_logps/chosen": -38.98030090332031, + "ref_logps/rejected": -54.532142639160156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.40191388130188, + "rewards/margins": 2.286447048187256, + "rewards/rejected": -4.688361167907715, + "step": 2862 + }, + { + "epoch": 2.7, + "grad_norm": 28.17253303527832, + "learning_rate": 5.491430570129416e-08, + "logps/chosen": -67.42326354980469, + "logps/rejected": -98.20608520507812, + "loss": 0.2794, + "losses/dpo": 0.2669195234775543, + "losses/sft": 2.1227869987487793, + "losses/total": 0.2669195234775543, + "ref_logps/chosen": -42.004920959472656, + "ref_logps/rejected": -54.77240753173828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5418338775634766, + "rewards/margins": 1.8015345335006714, + "rewards/rejected": -4.3433685302734375, + "step": 2863 + }, + { + "epoch": 2.7, + "grad_norm": 15.804169654846191, + "learning_rate": 5.4739419377404684e-08, + "logps/chosen": -54.603328704833984, + "logps/rejected": -87.33549499511719, + "loss": 0.135, + "losses/dpo": 0.17943991720676422, + "losses/sft": 2.426722764968872, + "losses/total": 0.17943991720676422, + "ref_logps/chosen": -32.791717529296875, + "ref_logps/rejected": -40.36088562011719, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.181161403656006, + "rewards/margins": 2.516299247741699, + "rewards/rejected": -4.697461128234863, + "step": 2864 + }, + { + "epoch": 2.71, + "grad_norm": 27.17184066772461, + "learning_rate": 5.4564533053515217e-08, + "logps/chosen": -60.46577453613281, + "logps/rejected": -81.92391967773438, + "loss": 0.2859, + "losses/dpo": 0.20735451579093933, + "losses/sft": 2.2000725269317627, + "losses/total": 0.20735451579093933, + "ref_logps/chosen": -39.78398895263672, + "ref_logps/rejected": -40.30161666870117, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.068178653717041, + "rewards/margins": 2.0940513610839844, + "rewards/rejected": -4.162229537963867, + "step": 2865 + }, + { + "epoch": 2.71, + "grad_norm": 20.945539474487305, + "learning_rate": 5.4389646729625736e-08, + "logps/chosen": -53.99884033203125, + "logps/rejected": -77.11982727050781, + "loss": 0.2387, + "losses/dpo": 0.12420934438705444, + "losses/sft": 2.1732325553894043, + "losses/total": 0.12420934438705444, + "ref_logps/chosen": -36.194007873535156, + "ref_logps/rejected": -37.71255111694336, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.780483603477478, + "rewards/margins": 2.1602444648742676, + "rewards/rejected": -3.940728187561035, + "step": 2866 + }, + { + "epoch": 2.71, + "grad_norm": 15.74340534210205, + "learning_rate": 5.421476040573627e-08, + "logps/chosen": -72.09315490722656, + "logps/rejected": -87.51078796386719, + "loss": 0.1616, + "losses/dpo": 0.15017655491828918, + "losses/sft": 2.0582592487335205, + "losses/total": 0.15017655491828918, + "ref_logps/chosen": -52.09022903442383, + "ref_logps/rejected": -46.710243225097656, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0002927780151367, + "rewards/margins": 2.0797617435455322, + "rewards/rejected": -4.080055236816406, + "step": 2867 + }, + { + "epoch": 2.71, + "grad_norm": 24.89318084716797, + "learning_rate": 5.40398740818468e-08, + "logps/chosen": -57.139404296875, + "logps/rejected": -96.89073944091797, + "loss": 0.1922, + "losses/dpo": 0.1747281700372696, + "losses/sft": 2.065976858139038, + "losses/total": 0.1747281700372696, + "ref_logps/chosen": -37.90196990966797, + "ref_logps/rejected": -48.73188781738281, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.923743724822998, + "rewards/margins": 2.892141819000244, + "rewards/rejected": -4.815885543823242, + "step": 2868 + }, + { + "epoch": 2.71, + "grad_norm": 30.109760284423828, + "learning_rate": 5.386498775795732e-08, + "logps/chosen": -68.16104125976562, + "logps/rejected": -94.56878662109375, + "loss": 0.2708, + "losses/dpo": 0.4975349009037018, + "losses/sft": 2.309370994567871, + "losses/total": 0.4975349009037018, + "ref_logps/chosen": -39.060020446777344, + "ref_logps/rejected": -44.24861145019531, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.910101890563965, + "rewards/margins": 2.121915102005005, + "rewards/rejected": -5.032016754150391, + "step": 2869 + }, + { + "epoch": 2.71, + "grad_norm": 29.30947494506836, + "learning_rate": 5.369010143406785e-08, + "logps/chosen": -62.62954330444336, + "logps/rejected": -79.90357971191406, + "loss": 0.3123, + "losses/dpo": 0.2233206331729889, + "losses/sft": 2.1770215034484863, + "losses/total": 0.2233206331729889, + "ref_logps/chosen": -39.58228302001953, + "ref_logps/rejected": -37.34901428222656, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3047261238098145, + "rewards/margins": 1.950730800628662, + "rewards/rejected": -4.255456924438477, + "step": 2870 + }, + { + "epoch": 2.71, + "grad_norm": 19.843786239624023, + "learning_rate": 5.3515215110178385e-08, + "logps/chosen": -58.096275329589844, + "logps/rejected": -85.56560516357422, + "loss": 0.1962, + "losses/dpo": 0.15028533339500427, + "losses/sft": 1.9977455139160156, + "losses/total": 0.15028533339500427, + "ref_logps/chosen": -36.27116394042969, + "ref_logps/rejected": -42.68719482421875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.182511329650879, + "rewards/margins": 2.105329751968384, + "rewards/rejected": -4.287840843200684, + "step": 2871 + }, + { + "epoch": 2.71, + "grad_norm": 38.27719497680664, + "learning_rate": 5.334032878628891e-08, + "logps/chosen": -53.89379119873047, + "logps/rejected": -83.87779235839844, + "loss": 0.4339, + "losses/dpo": 0.5076555609703064, + "losses/sft": 2.490772008895874, + "losses/total": 0.5076555609703064, + "ref_logps/chosen": -33.37085723876953, + "ref_logps/rejected": -45.197574615478516, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.052293300628662, + "rewards/margins": 1.815728783607483, + "rewards/rejected": -3.8680224418640137, + "step": 2872 + }, + { + "epoch": 2.71, + "grad_norm": 13.567605972290039, + "learning_rate": 5.316544246239944e-08, + "logps/chosen": -58.44673538208008, + "logps/rejected": -91.78823852539062, + "loss": 0.1496, + "losses/dpo": 0.04928778111934662, + "losses/sft": 1.6096879243850708, + "losses/total": 0.04928778111934662, + "ref_logps/chosen": -40.113067626953125, + "ref_logps/rejected": -44.49457550048828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8333666324615479, + "rewards/margins": 2.896000385284424, + "rewards/rejected": -4.729366779327393, + "step": 2873 + }, + { + "epoch": 2.71, + "grad_norm": 28.203121185302734, + "learning_rate": 5.299055613850997e-08, + "logps/chosen": -61.545440673828125, + "logps/rejected": -88.96922302246094, + "loss": 0.26, + "losses/dpo": 0.15058162808418274, + "losses/sft": 1.7406771183013916, + "losses/total": 0.15058162808418274, + "ref_logps/chosen": -39.28202819824219, + "ref_logps/rejected": -45.285858154296875, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2263410091400146, + "rewards/margins": 2.141995429992676, + "rewards/rejected": -4.368336200714111, + "step": 2874 + }, + { + "epoch": 2.71, + "grad_norm": 23.05544090270996, + "learning_rate": 5.2815669814620495e-08, + "logps/chosen": -61.41276550292969, + "logps/rejected": -80.32254791259766, + "loss": 0.3129, + "losses/dpo": 0.16426196694374084, + "losses/sft": 2.6184425354003906, + "losses/total": 0.16426196694374084, + "ref_logps/chosen": -39.670345306396484, + "ref_logps/rejected": -43.33232116699219, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1742424964904785, + "rewards/margins": 1.524780035018921, + "rewards/rejected": -3.6990227699279785, + "step": 2875 + }, + { + "epoch": 2.72, + "grad_norm": 23.09227180480957, + "learning_rate": 5.264078349073102e-08, + "logps/chosen": -74.49089050292969, + "logps/rejected": -90.31794738769531, + "loss": 0.1765, + "losses/dpo": 0.2602127194404602, + "losses/sft": 1.8905940055847168, + "losses/total": 0.2602127194404602, + "ref_logps/chosen": -51.54088592529297, + "ref_logps/rejected": -46.022525787353516, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2950005531311035, + "rewards/margins": 2.1345412731170654, + "rewards/rejected": -4.429542064666748, + "step": 2876 + }, + { + "epoch": 2.72, + "grad_norm": 22.58933448791504, + "learning_rate": 5.246589716684155e-08, + "logps/chosen": -57.41387939453125, + "logps/rejected": -69.24373626708984, + "loss": 0.2936, + "losses/dpo": 0.4625975489616394, + "losses/sft": 1.7165277004241943, + "losses/total": 0.4625975489616394, + "ref_logps/chosen": -43.284934997558594, + "ref_logps/rejected": -36.15578842163086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4128944873809814, + "rewards/margins": 1.8959007263183594, + "rewards/rejected": -3.308795213699341, + "step": 2877 + }, + { + "epoch": 2.72, + "grad_norm": 16.131208419799805, + "learning_rate": 5.229101084295208e-08, + "logps/chosen": -48.987396240234375, + "logps/rejected": -83.63958740234375, + "loss": 0.1453, + "losses/dpo": 0.0889534279704094, + "losses/sft": 1.8422859907150269, + "losses/total": 0.0889534279704094, + "ref_logps/chosen": -32.04210662841797, + "ref_logps/rejected": -41.86195373535156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6945290565490723, + "rewards/margins": 2.4832351207733154, + "rewards/rejected": -4.177763938903809, + "step": 2878 + }, + { + "epoch": 2.72, + "grad_norm": 48.708351135253906, + "learning_rate": 5.211612451906261e-08, + "logps/chosen": -56.94121551513672, + "logps/rejected": -69.2843017578125, + "loss": 0.4896, + "losses/dpo": 0.645794689655304, + "losses/sft": 2.6366021633148193, + "losses/total": 0.645794689655304, + "ref_logps/chosen": -33.79083251953125, + "ref_logps/rejected": -30.095840454101562, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3150386810302734, + "rewards/margins": 1.6038072109222412, + "rewards/rejected": -3.9188458919525146, + "step": 2879 + }, + { + "epoch": 2.72, + "grad_norm": 29.57944679260254, + "learning_rate": 5.194123819517313e-08, + "logps/chosen": -58.34238815307617, + "logps/rejected": -90.1668472290039, + "loss": 0.3344, + "losses/dpo": 0.44302046298980713, + "losses/sft": 1.897892713546753, + "losses/total": 0.44302046298980713, + "ref_logps/chosen": -38.493927001953125, + "ref_logps/rejected": -51.56311798095703, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9848463535308838, + "rewards/margins": 1.8755269050598145, + "rewards/rejected": -3.8603732585906982, + "step": 2880 + }, + { + "epoch": 2.72, + "grad_norm": 23.48588752746582, + "learning_rate": 5.1766351871283664e-08, + "logps/chosen": -82.89336395263672, + "logps/rejected": -102.16773986816406, + "loss": 0.2056, + "losses/dpo": 0.36578232049942017, + "losses/sft": 2.3649439811706543, + "losses/total": 0.36578232049942017, + "ref_logps/chosen": -58.64131164550781, + "ref_logps/rejected": -53.85184097290039, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4252052307128906, + "rewards/margins": 2.4063849449157715, + "rewards/rejected": -4.831589698791504, + "step": 2881 + }, + { + "epoch": 2.72, + "grad_norm": 19.96122169494629, + "learning_rate": 5.1591465547394196e-08, + "logps/chosen": -42.434059143066406, + "logps/rejected": -64.46273803710938, + "loss": 0.2572, + "losses/dpo": 0.35179752111434937, + "losses/sft": 2.2044763565063477, + "losses/total": 0.35179752111434937, + "ref_logps/chosen": -25.632247924804688, + "ref_logps/rejected": -28.573287963867188, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6801811456680298, + "rewards/margins": 1.9087638854980469, + "rewards/rejected": -3.588944911956787, + "step": 2882 + }, + { + "epoch": 2.72, + "grad_norm": 28.02341651916504, + "learning_rate": 5.1416579223504716e-08, + "logps/chosen": -65.40029907226562, + "logps/rejected": -77.53459167480469, + "loss": 0.2917, + "losses/dpo": 0.5614790916442871, + "losses/sft": 1.9322682619094849, + "losses/total": 0.5614790916442871, + "ref_logps/chosen": -44.492366790771484, + "ref_logps/rejected": -40.451541900634766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0907931327819824, + "rewards/margins": 1.6175122261047363, + "rewards/rejected": -3.7083053588867188, + "step": 2883 + }, + { + "epoch": 2.72, + "grad_norm": 18.873231887817383, + "learning_rate": 5.124169289961525e-08, + "logps/chosen": -53.775421142578125, + "logps/rejected": -82.19868469238281, + "loss": 0.1725, + "losses/dpo": 0.04656333848834038, + "losses/sft": 1.74667227268219, + "losses/total": 0.04656333848834038, + "ref_logps/chosen": -38.103355407714844, + "ref_logps/rejected": -41.982879638671875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.567206859588623, + "rewards/margins": 2.454373359680176, + "rewards/rejected": -4.021579742431641, + "step": 2884 + }, + { + "epoch": 2.72, + "grad_norm": 15.36976432800293, + "learning_rate": 5.1066806575725774e-08, + "logps/chosen": -48.32014465332031, + "logps/rejected": -80.34864044189453, + "loss": 0.1677, + "losses/dpo": 0.20992562174797058, + "losses/sft": 1.912480354309082, + "losses/total": 0.20992562174797058, + "ref_logps/chosen": -31.82781410217285, + "ref_logps/rejected": -39.51568603515625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6492328643798828, + "rewards/margins": 2.4340627193450928, + "rewards/rejected": -4.083295822143555, + "step": 2885 + }, + { + "epoch": 2.73, + "grad_norm": 20.57769012451172, + "learning_rate": 5.0891920251836306e-08, + "logps/chosen": -79.54473876953125, + "logps/rejected": -93.16749572753906, + "loss": 0.2527, + "losses/dpo": 0.2616657614707947, + "losses/sft": 2.74359393119812, + "losses/total": 0.2616657614707947, + "ref_logps/chosen": -50.75069046020508, + "ref_logps/rejected": -48.24676513671875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8794045448303223, + "rewards/margins": 1.6126682758331299, + "rewards/rejected": -4.492073059082031, + "step": 2886 + }, + { + "epoch": 2.73, + "grad_norm": 23.890918731689453, + "learning_rate": 5.071703392794683e-08, + "logps/chosen": -59.0823974609375, + "logps/rejected": -79.6904525756836, + "loss": 0.2393, + "losses/dpo": 0.30438071489334106, + "losses/sft": 2.2941267490386963, + "losses/total": 0.30438071489334106, + "ref_logps/chosen": -38.12369155883789, + "ref_logps/rejected": -38.59575653076172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0958709716796875, + "rewards/margins": 2.013599395751953, + "rewards/rejected": -4.109470367431641, + "step": 2887 + }, + { + "epoch": 2.73, + "grad_norm": 14.827569961547852, + "learning_rate": 5.054214760405736e-08, + "logps/chosen": -39.25969314575195, + "logps/rejected": -79.71253967285156, + "loss": 0.1596, + "losses/dpo": 0.3309593200683594, + "losses/sft": 1.6219087839126587, + "losses/total": 0.3309593200683594, + "ref_logps/chosen": -28.565309524536133, + "ref_logps/rejected": -40.868919372558594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0694384574890137, + "rewards/margins": 2.814924478530884, + "rewards/rejected": -3.8843631744384766, + "step": 2888 + }, + { + "epoch": 2.73, + "grad_norm": 21.80104637145996, + "learning_rate": 5.036726128016789e-08, + "logps/chosen": -62.342769622802734, + "logps/rejected": -94.05638122558594, + "loss": 0.1875, + "losses/dpo": 0.24158428609371185, + "losses/sft": 3.037687063217163, + "losses/total": 0.24158428609371185, + "ref_logps/chosen": -36.985023498535156, + "ref_logps/rejected": -47.24561309814453, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5357747077941895, + "rewards/margins": 2.1453018188476562, + "rewards/rejected": -4.681077003479004, + "step": 2889 + }, + { + "epoch": 2.73, + "grad_norm": 24.7808837890625, + "learning_rate": 5.019237495627842e-08, + "logps/chosen": -49.591758728027344, + "logps/rejected": -75.19595336914062, + "loss": 0.3314, + "losses/dpo": 0.09084201604127884, + "losses/sft": 2.1008388996124268, + "losses/total": 0.09084201604127884, + "ref_logps/chosen": -27.8034610748291, + "ref_logps/rejected": -37.023075103759766, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.1788296699523926, + "rewards/margins": 1.6384576559066772, + "rewards/rejected": -3.8172874450683594, + "step": 2890 + }, + { + "epoch": 2.73, + "grad_norm": 24.186092376708984, + "learning_rate": 5.001748863238894e-08, + "logps/chosen": -53.1131591796875, + "logps/rejected": -79.9860610961914, + "loss": 0.2231, + "losses/dpo": 0.12335456907749176, + "losses/sft": 1.8055753707885742, + "losses/total": 0.12335456907749176, + "ref_logps/chosen": -33.05149841308594, + "ref_logps/rejected": -38.446929931640625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0061657428741455, + "rewards/margins": 2.147747755050659, + "rewards/rejected": -4.153913497924805, + "step": 2891 + }, + { + "epoch": 2.73, + "grad_norm": 25.13157081604004, + "learning_rate": 4.9842602308499475e-08, + "logps/chosen": -52.623966217041016, + "logps/rejected": -80.99105072021484, + "loss": 0.3294, + "losses/dpo": 0.14942845702171326, + "losses/sft": 2.0872185230255127, + "losses/total": 0.14942845702171326, + "ref_logps/chosen": -35.113525390625, + "ref_logps/rejected": -42.599098205566406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7510440349578857, + "rewards/margins": 2.088151454925537, + "rewards/rejected": -3.839195489883423, + "step": 2892 + }, + { + "epoch": 2.73, + "grad_norm": 22.575241088867188, + "learning_rate": 4.966771598461e-08, + "logps/chosen": -64.890625, + "logps/rejected": -83.49400329589844, + "loss": 0.2308, + "losses/dpo": 0.09400902688503265, + "losses/sft": 2.699190378189087, + "losses/total": 0.09400902688503265, + "ref_logps/chosen": -45.70062255859375, + "ref_logps/rejected": -41.647560119628906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.919000267982483, + "rewards/margins": 2.26564359664917, + "rewards/rejected": -4.1846442222595215, + "step": 2893 + }, + { + "epoch": 2.73, + "grad_norm": 22.032365798950195, + "learning_rate": 4.949282966072053e-08, + "logps/chosen": -71.71018981933594, + "logps/rejected": -105.13894653320312, + "loss": 0.1838, + "losses/dpo": 0.09269165992736816, + "losses/sft": 2.3309507369995117, + "losses/total": 0.09269165992736816, + "ref_logps/chosen": -47.39867401123047, + "ref_logps/rejected": -56.04137420654297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.431152105331421, + "rewards/margins": 2.478605270385742, + "rewards/rejected": -4.909757137298584, + "step": 2894 + }, + { + "epoch": 2.73, + "grad_norm": 18.26418685913086, + "learning_rate": 4.931794333683106e-08, + "logps/chosen": -42.78246307373047, + "logps/rejected": -81.12641143798828, + "loss": 0.1778, + "losses/dpo": 0.14242404699325562, + "losses/sft": 2.016181707382202, + "losses/total": 0.14242404699325562, + "ref_logps/chosen": -30.147472381591797, + "ref_logps/rejected": -42.09385681152344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2634994983673096, + "rewards/margins": 2.639756679534912, + "rewards/rejected": -3.9032559394836426, + "step": 2895 + }, + { + "epoch": 2.73, + "grad_norm": 23.775630950927734, + "learning_rate": 4.9143057012941585e-08, + "logps/chosen": -66.25042724609375, + "logps/rejected": -90.31790161132812, + "loss": 0.257, + "losses/dpo": 0.29815739393234253, + "losses/sft": 2.7361183166503906, + "losses/total": 0.29815739393234253, + "ref_logps/chosen": -41.492977142333984, + "ref_logps/rejected": -47.90015411376953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4757447242736816, + "rewards/margins": 1.7660298347473145, + "rewards/rejected": -4.241774559020996, + "step": 2896 + }, + { + "epoch": 2.74, + "grad_norm": 30.72845458984375, + "learning_rate": 4.896817068905212e-08, + "logps/chosen": -56.872501373291016, + "logps/rejected": -86.8343505859375, + "loss": 0.5212, + "losses/dpo": 1.8256490230560303, + "losses/sft": 2.678504467010498, + "losses/total": 1.8256490230560303, + "ref_logps/chosen": -33.75708770751953, + "ref_logps/rejected": -42.83272933959961, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3115410804748535, + "rewards/margins": 2.088620901107788, + "rewards/rejected": -4.4001617431640625, + "step": 2897 + }, + { + "epoch": 2.74, + "grad_norm": 13.860445022583008, + "learning_rate": 4.8793284365162644e-08, + "logps/chosen": -71.68281555175781, + "logps/rejected": -101.97732543945312, + "loss": 0.1145, + "losses/dpo": 0.17511504888534546, + "losses/sft": 2.1915576457977295, + "losses/total": 0.17511504888534546, + "ref_logps/chosen": -47.659629821777344, + "ref_logps/rejected": -49.48004150390625, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4023187160491943, + "rewards/margins": 2.847410202026367, + "rewards/rejected": -5.249728679656982, + "step": 2898 + }, + { + "epoch": 2.74, + "grad_norm": 21.314437866210938, + "learning_rate": 4.861839804127317e-08, + "logps/chosen": -61.1273193359375, + "logps/rejected": -84.61996459960938, + "loss": 0.1693, + "losses/dpo": 0.17391903698444366, + "losses/sft": 2.1920340061187744, + "losses/total": 0.17391903698444366, + "ref_logps/chosen": -41.756996154785156, + "ref_logps/rejected": -42.61939239501953, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9370322227478027, + "rewards/margins": 2.2630255222320557, + "rewards/rejected": -4.2000579833984375, + "step": 2899 + }, + { + "epoch": 2.74, + "grad_norm": 31.29777717590332, + "learning_rate": 4.84435117173837e-08, + "logps/chosen": -52.42212677001953, + "logps/rejected": -68.17816162109375, + "loss": 0.5142, + "losses/dpo": 1.3692716360092163, + "losses/sft": 2.1053643226623535, + "losses/total": 1.3692716360092163, + "ref_logps/chosen": -33.53604507446289, + "ref_logps/rejected": -30.62323760986328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8886083364486694, + "rewards/margins": 1.8668839931488037, + "rewards/rejected": -3.7554922103881836, + "step": 2900 + }, + { + "epoch": 2.74, + "grad_norm": 25.538320541381836, + "learning_rate": 4.826862539349422e-08, + "logps/chosen": -53.15333557128906, + "logps/rejected": -73.51534271240234, + "loss": 0.2187, + "losses/dpo": 0.20950065553188324, + "losses/sft": 2.373840808868408, + "losses/total": 0.20950065553188324, + "ref_logps/chosen": -33.88386535644531, + "ref_logps/rejected": -33.111846923828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9269468784332275, + "rewards/margins": 2.113402843475342, + "rewards/rejected": -4.04034948348999, + "step": 2901 + }, + { + "epoch": 2.74, + "grad_norm": 20.94049835205078, + "learning_rate": 4.8093739069604754e-08, + "logps/chosen": -44.767662048339844, + "logps/rejected": -71.72140502929688, + "loss": 0.2182, + "losses/dpo": 0.2349478304386139, + "losses/sft": 1.956683874130249, + "losses/total": 0.2349478304386139, + "ref_logps/chosen": -29.535459518432617, + "ref_logps/rejected": -34.54790496826172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5232203006744385, + "rewards/margins": 2.1941304206848145, + "rewards/rejected": -3.717350721359253, + "step": 2902 + }, + { + "epoch": 2.74, + "grad_norm": 19.078367233276367, + "learning_rate": 4.7918852745715286e-08, + "logps/chosen": -51.2137451171875, + "logps/rejected": -74.24647521972656, + "loss": 0.1702, + "losses/dpo": 0.2708870768547058, + "losses/sft": 1.9555599689483643, + "losses/total": 0.2708870768547058, + "ref_logps/chosen": -32.18997573852539, + "ref_logps/rejected": -32.63280487060547, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9023771286010742, + "rewards/margins": 2.258990526199341, + "rewards/rejected": -4.161367893218994, + "step": 2903 + }, + { + "epoch": 2.74, + "grad_norm": 24.8669490814209, + "learning_rate": 4.774396642182581e-08, + "logps/chosen": -43.82025146484375, + "logps/rejected": -76.4781494140625, + "loss": 0.2336, + "losses/dpo": 0.23846878111362457, + "losses/sft": 1.27152419090271, + "losses/total": 0.23846878111362457, + "ref_logps/chosen": -30.502214431762695, + "ref_logps/rejected": -38.13578796386719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.331803798675537, + "rewards/margins": 2.5024328231811523, + "rewards/rejected": -3.8342366218566895, + "step": 2904 + }, + { + "epoch": 2.74, + "grad_norm": 20.20412826538086, + "learning_rate": 4.756908009793634e-08, + "logps/chosen": -56.2938232421875, + "logps/rejected": -98.10809326171875, + "loss": 0.177, + "losses/dpo": 0.1635667085647583, + "losses/sft": 1.756253719329834, + "losses/total": 0.1635667085647583, + "ref_logps/chosen": -35.66443634033203, + "ref_logps/rejected": -50.62413787841797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.062938690185547, + "rewards/margins": 2.685457229614258, + "rewards/rejected": -4.748395919799805, + "step": 2905 + }, + { + "epoch": 2.74, + "grad_norm": 13.31275463104248, + "learning_rate": 4.739419377404687e-08, + "logps/chosen": -74.900146484375, + "logps/rejected": -103.60855102539062, + "loss": 0.123, + "losses/dpo": 0.1407439410686493, + "losses/sft": 2.33221435546875, + "losses/total": 0.1407439410686493, + "ref_logps/chosen": -51.773983001708984, + "ref_logps/rejected": -53.90714645385742, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3126163482666016, + "rewards/margins": 2.6575241088867188, + "rewards/rejected": -4.97014045715332, + "step": 2906 + }, + { + "epoch": 2.75, + "grad_norm": 46.8740348815918, + "learning_rate": 4.7219307450157396e-08, + "logps/chosen": -64.64573669433594, + "logps/rejected": -62.52058410644531, + "loss": 0.4977, + "losses/dpo": 1.0391278266906738, + "losses/sft": 2.072814702987671, + "losses/total": 1.0391278266906738, + "ref_logps/chosen": -44.504512786865234, + "ref_logps/rejected": -30.27450942993164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.014122247695923, + "rewards/margins": 1.2104852199554443, + "rewards/rejected": -3.224607467651367, + "step": 2907 + }, + { + "epoch": 2.75, + "grad_norm": 16.31307601928711, + "learning_rate": 4.704442112626792e-08, + "logps/chosen": -62.43891906738281, + "logps/rejected": -89.83383178710938, + "loss": 0.1648, + "losses/dpo": 0.12543222308158875, + "losses/sft": 2.3578948974609375, + "losses/total": 0.12543222308158875, + "ref_logps/chosen": -42.93967056274414, + "ref_logps/rejected": -46.32426452636719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9499249458312988, + "rewards/margins": 2.4010322093963623, + "rewards/rejected": -4.350956916809082, + "step": 2908 + }, + { + "epoch": 2.75, + "grad_norm": 18.701644897460938, + "learning_rate": 4.686953480237845e-08, + "logps/chosen": -65.79672241210938, + "logps/rejected": -68.97135925292969, + "loss": 0.1877, + "losses/dpo": 0.269913911819458, + "losses/sft": 1.9493757486343384, + "losses/total": 0.269913911819458, + "ref_logps/chosen": -49.953006744384766, + "ref_logps/rejected": -30.98887062072754, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5843713283538818, + "rewards/margins": 2.213876724243164, + "rewards/rejected": -3.798248052597046, + "step": 2909 + }, + { + "epoch": 2.75, + "grad_norm": 15.259963035583496, + "learning_rate": 4.669464847848898e-08, + "logps/chosen": -56.39866638183594, + "logps/rejected": -98.07179260253906, + "loss": 0.1472, + "losses/dpo": 0.21368534862995148, + "losses/sft": 2.2640364170074463, + "losses/total": 0.21368534862995148, + "ref_logps/chosen": -35.53032684326172, + "ref_logps/rejected": -52.81883239746094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0868337154388428, + "rewards/margins": 2.438462257385254, + "rewards/rejected": -4.525296211242676, + "step": 2910 + }, + { + "epoch": 2.75, + "grad_norm": 23.00604820251465, + "learning_rate": 4.651976215459951e-08, + "logps/chosen": -55.955814361572266, + "logps/rejected": -82.95382690429688, + "loss": 0.1878, + "losses/dpo": 0.12088857591152191, + "losses/sft": 1.6624360084533691, + "losses/total": 0.12088857591152191, + "ref_logps/chosen": -38.27434539794922, + "ref_logps/rejected": -40.77610778808594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7681469917297363, + "rewards/margins": 2.449625015258789, + "rewards/rejected": -4.217772006988525, + "step": 2911 + }, + { + "epoch": 2.75, + "grad_norm": 28.070449829101562, + "learning_rate": 4.634487583071003e-08, + "logps/chosen": -64.6600570678711, + "logps/rejected": -86.61955261230469, + "loss": 0.2678, + "losses/dpo": 0.11538442224264145, + "losses/sft": 1.9681587219238281, + "losses/total": 0.11538442224264145, + "ref_logps/chosen": -39.92943572998047, + "ref_logps/rejected": -42.636722564697266, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.473062038421631, + "rewards/margins": 1.9252204895019531, + "rewards/rejected": -4.398282527923584, + "step": 2912 + }, + { + "epoch": 2.75, + "grad_norm": 15.563338279724121, + "learning_rate": 4.6169989506820565e-08, + "logps/chosen": -52.811279296875, + "logps/rejected": -81.2079086303711, + "loss": 0.1518, + "losses/dpo": 0.15597154200077057, + "losses/sft": 1.8160310983657837, + "losses/total": 0.15597154200077057, + "ref_logps/chosen": -34.96300506591797, + "ref_logps/rejected": -38.81499099731445, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7848271131515503, + "rewards/margins": 2.454464912414551, + "rewards/rejected": -4.239291667938232, + "step": 2913 + }, + { + "epoch": 2.75, + "grad_norm": 24.842069625854492, + "learning_rate": 4.59951031829311e-08, + "logps/chosen": -58.31574630737305, + "logps/rejected": -89.58419799804688, + "loss": 0.2135, + "losses/dpo": 0.35399264097213745, + "losses/sft": 2.2470216751098633, + "losses/total": 0.35399264097213745, + "ref_logps/chosen": -38.719276428222656, + "ref_logps/rejected": -47.51642990112305, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9596470594406128, + "rewards/margins": 2.247130870819092, + "rewards/rejected": -4.206777572631836, + "step": 2914 + }, + { + "epoch": 2.75, + "grad_norm": 25.518869400024414, + "learning_rate": 4.5820216859041617e-08, + "logps/chosen": -60.399696350097656, + "logps/rejected": -80.20470428466797, + "loss": 0.2861, + "losses/dpo": 0.3874371647834778, + "losses/sft": 2.599649667739868, + "losses/total": 0.3874371647834778, + "ref_logps/chosen": -36.75235366821289, + "ref_logps/rejected": -40.736690521240234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.364734411239624, + "rewards/margins": 1.5820667743682861, + "rewards/rejected": -3.94680118560791, + "step": 2915 + }, + { + "epoch": 2.75, + "grad_norm": 24.60268211364746, + "learning_rate": 4.564533053515215e-08, + "logps/chosen": -50.21382141113281, + "logps/rejected": -69.26856994628906, + "loss": 0.3713, + "losses/dpo": 0.8575896620750427, + "losses/sft": 1.8644320964813232, + "losses/total": 0.8575896620750427, + "ref_logps/chosen": -31.54848289489746, + "ref_logps/rejected": -31.33245849609375, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8665341138839722, + "rewards/margins": 1.927076816558838, + "rewards/rejected": -3.7936110496520996, + "step": 2916 + }, + { + "epoch": 2.75, + "grad_norm": 24.392568588256836, + "learning_rate": 4.5470444211262675e-08, + "logps/chosen": -63.81717300415039, + "logps/rejected": -78.88933563232422, + "loss": 0.3482, + "losses/dpo": 0.45321568846702576, + "losses/sft": 1.9605830907821655, + "losses/total": 0.45321568846702576, + "ref_logps/chosen": -43.75624084472656, + "ref_logps/rejected": -37.84739685058594, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0060930252075195, + "rewards/margins": 2.0981009006500244, + "rewards/rejected": -4.104194164276123, + "step": 2917 + }, + { + "epoch": 2.76, + "grad_norm": 19.76136016845703, + "learning_rate": 4.529555788737321e-08, + "logps/chosen": -57.73360824584961, + "logps/rejected": -89.73143005371094, + "loss": 0.1611, + "losses/dpo": 0.1569322943687439, + "losses/sft": 2.173398494720459, + "losses/total": 0.1569322943687439, + "ref_logps/chosen": -34.68782043457031, + "ref_logps/rejected": -44.75007629394531, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3045785427093506, + "rewards/margins": 2.1935575008392334, + "rewards/rejected": -4.498136043548584, + "step": 2918 + }, + { + "epoch": 2.76, + "grad_norm": 41.902217864990234, + "learning_rate": 4.5120671563483733e-08, + "logps/chosen": -79.74898529052734, + "logps/rejected": -94.09258270263672, + "loss": 0.4207, + "losses/dpo": 0.5761823058128357, + "losses/sft": 2.102543354034424, + "losses/total": 0.5761823058128357, + "ref_logps/chosen": -52.83766174316406, + "ref_logps/rejected": -46.913352966308594, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6911330223083496, + "rewards/margins": 2.026789903640747, + "rewards/rejected": -4.717922687530518, + "step": 2919 + }, + { + "epoch": 2.76, + "grad_norm": 16.969646453857422, + "learning_rate": 4.494578523959426e-08, + "logps/chosen": -73.46839904785156, + "logps/rejected": -108.63909912109375, + "loss": 0.1134, + "losses/dpo": 0.25589433312416077, + "losses/sft": 2.5820610523223877, + "losses/total": 0.25589433312416077, + "ref_logps/chosen": -47.53834533691406, + "ref_logps/rejected": -46.04529571533203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.593005895614624, + "rewards/margins": 3.6663739681243896, + "rewards/rejected": -6.259379863739014, + "step": 2920 + }, + { + "epoch": 2.76, + "grad_norm": 24.645444869995117, + "learning_rate": 4.477089891570479e-08, + "logps/chosen": -63.86751937866211, + "logps/rejected": -88.18708801269531, + "loss": 0.316, + "losses/dpo": 0.4615601897239685, + "losses/sft": 2.747189521789551, + "losses/total": 0.4615601897239685, + "ref_logps/chosen": -40.57049560546875, + "ref_logps/rejected": -46.73444366455078, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.329702854156494, + "rewards/margins": 1.8155622482299805, + "rewards/rejected": -4.145264625549316, + "step": 2921 + }, + { + "epoch": 2.76, + "grad_norm": 22.891401290893555, + "learning_rate": 4.4596012591815324e-08, + "logps/chosen": -56.50829315185547, + "logps/rejected": -83.11656951904297, + "loss": 0.2162, + "losses/dpo": 0.28885069489479065, + "losses/sft": 1.9294531345367432, + "losses/total": 0.28885069489479065, + "ref_logps/chosen": -32.53710174560547, + "ref_logps/rejected": -37.08204650878906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3971192836761475, + "rewards/margins": 2.2063326835632324, + "rewards/rejected": -4.603451728820801, + "step": 2922 + }, + { + "epoch": 2.76, + "grad_norm": 41.41575622558594, + "learning_rate": 4.4421126267925844e-08, + "logps/chosen": -61.92214584350586, + "logps/rejected": -82.36434936523438, + "loss": 0.4097, + "losses/dpo": 0.1857387274503708, + "losses/sft": 1.9243720769882202, + "losses/total": 0.1857387274503708, + "ref_logps/chosen": -40.911468505859375, + "ref_logps/rejected": -44.80412673950195, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.101067543029785, + "rewards/margins": 1.6549553871154785, + "rewards/rejected": -3.7560229301452637, + "step": 2923 + }, + { + "epoch": 2.76, + "grad_norm": 20.074983596801758, + "learning_rate": 4.4246239944036376e-08, + "logps/chosen": -47.114009857177734, + "logps/rejected": -78.469970703125, + "loss": 0.2168, + "losses/dpo": 0.13913625478744507, + "losses/sft": 1.5697959661483765, + "losses/total": 0.13913625478744507, + "ref_logps/chosen": -32.79225158691406, + "ref_logps/rejected": -37.911094665527344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4321759939193726, + "rewards/margins": 2.623711585998535, + "rewards/rejected": -4.055887699127197, + "step": 2924 + }, + { + "epoch": 2.76, + "grad_norm": 38.66976547241211, + "learning_rate": 4.40713536201469e-08, + "logps/chosen": -59.42203903198242, + "logps/rejected": -84.69801330566406, + "loss": 0.3001, + "losses/dpo": 0.09689392149448395, + "losses/sft": 2.3049609661102295, + "losses/total": 0.09689392149448395, + "ref_logps/chosen": -36.72865295410156, + "ref_logps/rejected": -37.53166198730469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.269338369369507, + "rewards/margins": 2.4472970962524414, + "rewards/rejected": -4.716635704040527, + "step": 2925 + }, + { + "epoch": 2.76, + "grad_norm": 14.12325668334961, + "learning_rate": 4.389646729625743e-08, + "logps/chosen": -62.84257507324219, + "logps/rejected": -102.05506896972656, + "loss": 0.1012, + "losses/dpo": 0.036279600113630295, + "losses/sft": 2.1291112899780273, + "losses/total": 0.036279600113630295, + "ref_logps/chosen": -41.30048370361328, + "ref_logps/rejected": -52.78526306152344, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1542088985443115, + "rewards/margins": 2.7727713584899902, + "rewards/rejected": -4.926980018615723, + "step": 2926 + }, + { + "epoch": 2.76, + "grad_norm": 28.979934692382812, + "learning_rate": 4.372158097236796e-08, + "logps/chosen": -63.19284439086914, + "logps/rejected": -84.02626037597656, + "loss": 0.3178, + "losses/dpo": 0.2483692318201065, + "losses/sft": 1.6956597566604614, + "losses/total": 0.2483692318201065, + "ref_logps/chosen": -37.93545913696289, + "ref_logps/rejected": -39.247894287109375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5257389545440674, + "rewards/margins": 1.95209801197052, + "rewards/rejected": -4.477836608886719, + "step": 2927 + }, + { + "epoch": 2.76, + "grad_norm": 30.049333572387695, + "learning_rate": 4.3546694648478486e-08, + "logps/chosen": -50.92780303955078, + "logps/rejected": -73.13703918457031, + "loss": 0.2777, + "losses/dpo": 0.24642544984817505, + "losses/sft": 1.7158806324005127, + "losses/total": 0.24642544984817505, + "ref_logps/chosen": -34.705142974853516, + "ref_logps/rejected": -38.05579376220703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6222660541534424, + "rewards/margins": 1.8858588933944702, + "rewards/rejected": -3.508125066757202, + "step": 2928 + }, + { + "epoch": 2.77, + "grad_norm": 25.350297927856445, + "learning_rate": 4.337180832458902e-08, + "logps/chosen": -53.08549499511719, + "logps/rejected": -70.6810302734375, + "loss": 0.3429, + "losses/dpo": 0.27654868364334106, + "losses/sft": 2.1185340881347656, + "losses/total": 0.27654868364334106, + "ref_logps/chosen": -33.332984924316406, + "ref_logps/rejected": -33.37082290649414, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9752511978149414, + "rewards/margins": 1.7557696104049683, + "rewards/rejected": -3.731020927429199, + "step": 2929 + }, + { + "epoch": 2.77, + "grad_norm": 31.370952606201172, + "learning_rate": 4.3196922000699545e-08, + "logps/chosen": -70.31005859375, + "logps/rejected": -95.06546020507812, + "loss": 0.3223, + "losses/dpo": 0.8907086253166199, + "losses/sft": 2.5566813945770264, + "losses/total": 0.8907086253166199, + "ref_logps/chosen": -45.06493377685547, + "ref_logps/rejected": -45.993873596191406, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.524512529373169, + "rewards/margins": 2.382645845413208, + "rewards/rejected": -4.907158374786377, + "step": 2930 + }, + { + "epoch": 2.77, + "grad_norm": 24.798593521118164, + "learning_rate": 4.302203567681007e-08, + "logps/chosen": -41.98005676269531, + "logps/rejected": -79.919677734375, + "loss": 0.2423, + "losses/dpo": 0.16114863753318787, + "losses/sft": 1.46969735622406, + "losses/total": 0.16114863753318787, + "ref_logps/chosen": -26.345916748046875, + "ref_logps/rejected": -37.114524841308594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.563414216041565, + "rewards/margins": 2.717101573944092, + "rewards/rejected": -4.280515670776367, + "step": 2931 + }, + { + "epoch": 2.77, + "grad_norm": 22.61949920654297, + "learning_rate": 4.28471493529206e-08, + "logps/chosen": -65.4260025024414, + "logps/rejected": -100.01286315917969, + "loss": 0.2688, + "losses/dpo": 0.15575724840164185, + "losses/sft": 1.9339677095413208, + "losses/total": 0.15575724840164185, + "ref_logps/chosen": -46.596397399902344, + "ref_logps/rejected": -56.88796615600586, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.882960557937622, + "rewards/margins": 2.429529905319214, + "rewards/rejected": -4.312490463256836, + "step": 2932 + }, + { + "epoch": 2.77, + "grad_norm": 25.711580276489258, + "learning_rate": 4.267226302903112e-08, + "logps/chosen": -50.37397384643555, + "logps/rejected": -67.489013671875, + "loss": 0.2586, + "losses/dpo": 0.2976170778274536, + "losses/sft": 2.023776054382324, + "losses/total": 0.2976170778274536, + "ref_logps/chosen": -36.26109313964844, + "ref_logps/rejected": -31.36077308654785, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4112882614135742, + "rewards/margins": 2.20153546333313, + "rewards/rejected": -3.612823724746704, + "step": 2933 + }, + { + "epoch": 2.77, + "grad_norm": 28.1121768951416, + "learning_rate": 4.2497376705141655e-08, + "logps/chosen": -62.54951477050781, + "logps/rejected": -89.92970275878906, + "loss": 0.1856, + "losses/dpo": 0.26242735981941223, + "losses/sft": 2.670637845993042, + "losses/total": 0.26242735981941223, + "ref_logps/chosen": -41.26399612426758, + "ref_logps/rejected": -44.05115509033203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.128551959991455, + "rewards/margins": 2.4593024253845215, + "rewards/rejected": -4.587854385375977, + "step": 2934 + }, + { + "epoch": 2.77, + "grad_norm": 19.07452392578125, + "learning_rate": 4.232249038125219e-08, + "logps/chosen": -51.450740814208984, + "logps/rejected": -78.8673095703125, + "loss": 0.2404, + "losses/dpo": 0.10036514699459076, + "losses/sft": 1.9438000917434692, + "losses/total": 0.10036514699459076, + "ref_logps/chosen": -32.28302001953125, + "ref_logps/rejected": -37.61000061035156, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9167721271514893, + "rewards/margins": 2.2089591026306152, + "rewards/rejected": -4.125731468200684, + "step": 2935 + }, + { + "epoch": 2.77, + "grad_norm": 24.527149200439453, + "learning_rate": 4.214760405736271e-08, + "logps/chosen": -55.682861328125, + "logps/rejected": -83.66441345214844, + "loss": 0.3108, + "losses/dpo": 0.12902814149856567, + "losses/sft": 2.277679681777954, + "losses/total": 0.12902814149856567, + "ref_logps/chosen": -34.76591110229492, + "ref_logps/rejected": -38.99586868286133, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0916948318481445, + "rewards/margins": 2.37515926361084, + "rewards/rejected": -4.466854095458984, + "step": 2936 + }, + { + "epoch": 2.77, + "grad_norm": 14.175972938537598, + "learning_rate": 4.197271773347324e-08, + "logps/chosen": -55.338077545166016, + "logps/rejected": -91.476806640625, + "loss": 0.1644, + "losses/dpo": 0.18473079800605774, + "losses/sft": 1.5740883350372314, + "losses/total": 0.18473079800605774, + "ref_logps/chosen": -33.325950622558594, + "ref_logps/rejected": -48.24712371826172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2012124061584473, + "rewards/margins": 2.121755838394165, + "rewards/rejected": -4.322968482971191, + "step": 2937 + }, + { + "epoch": 2.77, + "grad_norm": 30.652555465698242, + "learning_rate": 4.179783140958377e-08, + "logps/chosen": -59.166534423828125, + "logps/rejected": -92.97552490234375, + "loss": 0.294, + "losses/dpo": 0.3652971684932709, + "losses/sft": 2.5270166397094727, + "losses/total": 0.3652971684932709, + "ref_logps/chosen": -36.86262512207031, + "ref_logps/rejected": -48.15949249267578, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.230390787124634, + "rewards/margins": 2.2512118816375732, + "rewards/rejected": -4.481602668762207, + "step": 2938 + }, + { + "epoch": 2.78, + "grad_norm": 13.035469055175781, + "learning_rate": 4.16229450856943e-08, + "logps/chosen": -59.53193664550781, + "logps/rejected": -108.38406372070312, + "loss": 0.0777, + "losses/dpo": 0.056367017328739166, + "losses/sft": 2.0953269004821777, + "losses/total": 0.056367017328739166, + "ref_logps/chosen": -36.9190559387207, + "ref_logps/rejected": -55.25392532348633, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2612881660461426, + "rewards/margins": 3.051725387573242, + "rewards/rejected": -5.313013553619385, + "step": 2939 + }, + { + "epoch": 2.78, + "grad_norm": 33.68614196777344, + "learning_rate": 4.144805876180482e-08, + "logps/chosen": -66.39884948730469, + "logps/rejected": -81.25732421875, + "loss": 0.303, + "losses/dpo": 0.11013530939817429, + "losses/sft": 2.38643479347229, + "losses/total": 0.11013530939817429, + "ref_logps/chosen": -43.77596664428711, + "ref_logps/rejected": -39.08415222167969, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2622883319854736, + "rewards/margins": 1.9550285339355469, + "rewards/rejected": -4.217316627502441, + "step": 2940 + }, + { + "epoch": 2.78, + "grad_norm": 19.431079864501953, + "learning_rate": 4.127317243791535e-08, + "logps/chosen": -50.00224304199219, + "logps/rejected": -82.65943908691406, + "loss": 0.193, + "losses/dpo": 0.08156921714544296, + "losses/sft": 1.507349967956543, + "losses/total": 0.08156921714544296, + "ref_logps/chosen": -34.5750617980957, + "ref_logps/rejected": -45.306766510009766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5427186489105225, + "rewards/margins": 2.1925485134124756, + "rewards/rejected": -3.735267162322998, + "step": 2941 + }, + { + "epoch": 2.78, + "grad_norm": 21.818389892578125, + "learning_rate": 4.109828611402588e-08, + "logps/chosen": -69.96769714355469, + "logps/rejected": -92.95931243896484, + "loss": 0.2367, + "losses/dpo": 0.412746787071228, + "losses/sft": 1.8832069635391235, + "losses/total": 0.412746787071228, + "ref_logps/chosen": -42.7349853515625, + "ref_logps/rejected": -44.36425018310547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.723271369934082, + "rewards/margins": 2.136235237121582, + "rewards/rejected": -4.859506607055664, + "step": 2942 + }, + { + "epoch": 2.78, + "grad_norm": 22.550331115722656, + "learning_rate": 4.0923399790136414e-08, + "logps/chosen": -65.5389404296875, + "logps/rejected": -95.45225524902344, + "loss": 0.2065, + "losses/dpo": 0.1375802606344223, + "losses/sft": 2.6348206996917725, + "losses/total": 0.1375802606344223, + "ref_logps/chosen": -40.30699157714844, + "ref_logps/rejected": -50.36699676513672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.523195266723633, + "rewards/margins": 1.9853311777114868, + "rewards/rejected": -4.50852632522583, + "step": 2943 + }, + { + "epoch": 2.78, + "grad_norm": 27.819787979125977, + "learning_rate": 4.0748513466246933e-08, + "logps/chosen": -59.16255569458008, + "logps/rejected": -91.4408187866211, + "loss": 0.2817, + "losses/dpo": 0.21156972646713257, + "losses/sft": 2.3053195476531982, + "losses/total": 0.21156972646713257, + "ref_logps/chosen": -33.958587646484375, + "ref_logps/rejected": -44.284446716308594, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.520397186279297, + "rewards/margins": 2.1952402591705322, + "rewards/rejected": -4.715637683868408, + "step": 2944 + }, + { + "epoch": 2.78, + "grad_norm": 32.6779670715332, + "learning_rate": 4.0573627142357466e-08, + "logps/chosen": -64.88963317871094, + "logps/rejected": -87.98614501953125, + "loss": 0.2971, + "losses/dpo": 0.05765041336417198, + "losses/sft": 1.792264699935913, + "losses/total": 0.05765041336417198, + "ref_logps/chosen": -42.00370788574219, + "ref_logps/rejected": -46.18706512451172, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2885916233062744, + "rewards/margins": 1.8913166522979736, + "rewards/rejected": -4.179908275604248, + "step": 2945 + }, + { + "epoch": 2.78, + "grad_norm": 28.0327205657959, + "learning_rate": 4.0398740818468e-08, + "logps/chosen": -58.09992218017578, + "logps/rejected": -97.98863220214844, + "loss": 0.2519, + "losses/dpo": 0.2719775438308716, + "losses/sft": 2.390690565109253, + "losses/total": 0.2719775438308716, + "ref_logps/chosen": -32.832759857177734, + "ref_logps/rejected": -50.393028259277344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5267162322998047, + "rewards/margins": 2.232844352722168, + "rewards/rejected": -4.759560585021973, + "step": 2946 + }, + { + "epoch": 2.78, + "grad_norm": 23.211795806884766, + "learning_rate": 4.0223854494578524e-08, + "logps/chosen": -59.732635498046875, + "logps/rejected": -89.6614990234375, + "loss": 0.2501, + "losses/dpo": 0.14845877885818481, + "losses/sft": 1.9149870872497559, + "losses/total": 0.14845877885818481, + "ref_logps/chosen": -39.32978820800781, + "ref_logps/rejected": -50.03449249267578, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0402846336364746, + "rewards/margins": 1.9224166870117188, + "rewards/rejected": -3.9627015590667725, + "step": 2947 + }, + { + "epoch": 2.78, + "grad_norm": 16.810693740844727, + "learning_rate": 4.004896817068905e-08, + "logps/chosen": -55.253021240234375, + "logps/rejected": -95.67185974121094, + "loss": 0.1394, + "losses/dpo": 0.26655498147010803, + "losses/sft": 1.4401518106460571, + "losses/total": 0.26655498147010803, + "ref_logps/chosen": -35.22863006591797, + "ref_logps/rejected": -48.77995300292969, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.002439022064209, + "rewards/margins": 2.686751127243042, + "rewards/rejected": -4.689189910888672, + "step": 2948 + }, + { + "epoch": 2.78, + "grad_norm": 25.708545684814453, + "learning_rate": 3.9874081846799576e-08, + "logps/chosen": -55.23566436767578, + "logps/rejected": -83.1280746459961, + "loss": 0.2876, + "losses/dpo": 0.08939340710639954, + "losses/sft": 1.760914921760559, + "losses/total": 0.08939340710639954, + "ref_logps/chosen": -37.5933952331543, + "ref_logps/rejected": -44.07594680786133, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7642266750335693, + "rewards/margins": 2.1409859657287598, + "rewards/rejected": -3.905212879180908, + "step": 2949 + }, + { + "epoch": 2.79, + "grad_norm": 18.22446632385254, + "learning_rate": 3.969919552291011e-08, + "logps/chosen": -35.238712310791016, + "logps/rejected": -66.91839599609375, + "loss": 0.2692, + "losses/dpo": 0.10803490877151489, + "losses/sft": 1.8808940649032593, + "losses/total": 0.10803490877151489, + "ref_logps/chosen": -24.548784255981445, + "ref_logps/rejected": -32.27580642700195, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0689928531646729, + "rewards/margins": 2.395265817642212, + "rewards/rejected": -3.4642586708068848, + "step": 2950 + }, + { + "epoch": 2.79, + "grad_norm": 24.135835647583008, + "learning_rate": 3.9524309199020634e-08, + "logps/chosen": -48.49653244018555, + "logps/rejected": -72.26859283447266, + "loss": 0.2484, + "losses/dpo": 0.1773497313261032, + "losses/sft": 1.6088541746139526, + "losses/total": 0.1773497313261032, + "ref_logps/chosen": -32.05012893676758, + "ref_logps/rejected": -35.045982360839844, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6446402072906494, + "rewards/margins": 2.0776209831237793, + "rewards/rejected": -3.7222609519958496, + "step": 2951 + }, + { + "epoch": 2.79, + "grad_norm": 16.43662452697754, + "learning_rate": 3.934942287513116e-08, + "logps/chosen": -61.54220199584961, + "logps/rejected": -94.9558334350586, + "loss": 0.1711, + "losses/dpo": 0.08148401975631714, + "losses/sft": 2.9786620140075684, + "losses/total": 0.08148401975631714, + "ref_logps/chosen": -39.89582824707031, + "ref_logps/rejected": -46.92484664916992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1646370887756348, + "rewards/margins": 2.6384620666503906, + "rewards/rejected": -4.803099155426025, + "step": 2952 + }, + { + "epoch": 2.79, + "grad_norm": 16.742591857910156, + "learning_rate": 3.917453655124169e-08, + "logps/chosen": -50.09156036376953, + "logps/rejected": -75.986328125, + "loss": 0.2142, + "losses/dpo": 0.18750856816768646, + "losses/sft": 2.043002128601074, + "losses/total": 0.18750856816768646, + "ref_logps/chosen": -33.27346420288086, + "ref_logps/rejected": -37.96230697631836, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.681809902191162, + "rewards/margins": 2.120591640472412, + "rewards/rejected": -3.802401542663574, + "step": 2953 + }, + { + "epoch": 2.79, + "grad_norm": 14.109723091125488, + "learning_rate": 3.8999650227352225e-08, + "logps/chosen": -70.32637023925781, + "logps/rejected": -127.77806091308594, + "loss": 0.1114, + "losses/dpo": 0.16246438026428223, + "losses/sft": 3.349597215652466, + "losses/total": 0.16246438026428223, + "ref_logps/chosen": -46.12102127075195, + "ref_logps/rejected": -70.36228942871094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.420535087585449, + "rewards/margins": 3.3210411071777344, + "rewards/rejected": -5.741575717926025, + "step": 2954 + }, + { + "epoch": 2.79, + "grad_norm": 16.419734954833984, + "learning_rate": 3.8824763903462745e-08, + "logps/chosen": -58.6666374206543, + "logps/rejected": -82.16273498535156, + "loss": 0.1733, + "losses/dpo": 0.19920210540294647, + "losses/sft": 1.8356988430023193, + "losses/total": 0.19920210540294647, + "ref_logps/chosen": -35.208919525146484, + "ref_logps/rejected": -37.94212341308594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3457717895507812, + "rewards/margins": 2.076289176940918, + "rewards/rejected": -4.422060966491699, + "step": 2955 + }, + { + "epoch": 2.79, + "grad_norm": 38.09503936767578, + "learning_rate": 3.864987757957328e-08, + "logps/chosen": -64.63117980957031, + "logps/rejected": -83.3462905883789, + "loss": 0.4737, + "losses/dpo": 0.19685395061969757, + "losses/sft": 3.2698616981506348, + "losses/total": 0.19685395061969757, + "ref_logps/chosen": -37.46269226074219, + "ref_logps/rejected": -34.36955261230469, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.716848611831665, + "rewards/margins": 2.1808247566223145, + "rewards/rejected": -4.8976731300354, + "step": 2956 + }, + { + "epoch": 2.79, + "grad_norm": 24.42434310913086, + "learning_rate": 3.84749912556838e-08, + "logps/chosen": -53.592613220214844, + "logps/rejected": -71.39588928222656, + "loss": 0.3187, + "losses/dpo": 0.16775572299957275, + "losses/sft": 2.2685396671295166, + "losses/total": 0.16775572299957275, + "ref_logps/chosen": -32.739906311035156, + "ref_logps/rejected": -33.00611877441406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.085270643234253, + "rewards/margins": 1.7537059783935547, + "rewards/rejected": -3.8389763832092285, + "step": 2957 + }, + { + "epoch": 2.79, + "grad_norm": 27.867319107055664, + "learning_rate": 3.830010493179433e-08, + "logps/chosen": -46.6761360168457, + "logps/rejected": -76.2200927734375, + "loss": 0.3352, + "losses/dpo": 0.37638312578201294, + "losses/sft": 1.684161901473999, + "losses/total": 0.37638312578201294, + "ref_logps/chosen": -30.49056625366211, + "ref_logps/rejected": -40.442466735839844, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6185567378997803, + "rewards/margins": 1.9592053890228271, + "rewards/rejected": -3.5777623653411865, + "step": 2958 + }, + { + "epoch": 2.79, + "grad_norm": 33.544219970703125, + "learning_rate": 3.812521860790486e-08, + "logps/chosen": -65.1119613647461, + "logps/rejected": -85.15020751953125, + "loss": 0.3739, + "losses/dpo": 0.2625933885574341, + "losses/sft": 2.014401435852051, + "losses/total": 0.2625933885574341, + "ref_logps/chosen": -45.40331268310547, + "ref_logps/rejected": -47.31351089477539, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9708645343780518, + "rewards/margins": 1.81280517578125, + "rewards/rejected": -3.783669948577881, + "step": 2959 + }, + { + "epoch": 2.8, + "grad_norm": 42.03447723388672, + "learning_rate": 3.795033228401539e-08, + "logps/chosen": -59.89862060546875, + "logps/rejected": -72.97151184082031, + "loss": 0.4636, + "losses/dpo": 0.20353052020072937, + "losses/sft": 1.6133798360824585, + "losses/total": 0.20353052020072937, + "ref_logps/chosen": -40.813438415527344, + "ref_logps/rejected": -38.648433685302734, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9085184335708618, + "rewards/margins": 1.5237895250320435, + "rewards/rejected": -3.432307720184326, + "step": 2960 + }, + { + "epoch": 2.8, + "grad_norm": 8.90991497039795, + "learning_rate": 3.777544596012592e-08, + "logps/chosen": -54.34622573852539, + "logps/rejected": -94.96410369873047, + "loss": 0.0769, + "losses/dpo": 0.07265409082174301, + "losses/sft": 2.0861217975616455, + "losses/total": 0.07265409082174301, + "ref_logps/chosen": -37.764564514160156, + "ref_logps/rejected": -48.592872619628906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6581659317016602, + "rewards/margins": 2.978957176208496, + "rewards/rejected": -4.637123107910156, + "step": 2961 + }, + { + "epoch": 2.8, + "grad_norm": 29.597333908081055, + "learning_rate": 3.7600559636236446e-08, + "logps/chosen": -45.32130432128906, + "logps/rejected": -75.54582977294922, + "loss": 0.3432, + "losses/dpo": 0.17707788944244385, + "losses/sft": 1.8706631660461426, + "losses/total": 0.17707788944244385, + "ref_logps/chosen": -28.261106491088867, + "ref_logps/rejected": -38.920448303222656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7060202360153198, + "rewards/margins": 1.9565176963806152, + "rewards/rejected": -3.6625380516052246, + "step": 2962 + }, + { + "epoch": 2.8, + "grad_norm": 19.86264419555664, + "learning_rate": 3.742567331234697e-08, + "logps/chosen": -54.12257766723633, + "logps/rejected": -88.94450378417969, + "loss": 0.163, + "losses/dpo": 0.0474557988345623, + "losses/sft": 2.0292882919311523, + "losses/total": 0.0474557988345623, + "ref_logps/chosen": -35.720436096191406, + "ref_logps/rejected": -42.519832611083984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8402135372161865, + "rewards/margins": 2.8022539615631104, + "rewards/rejected": -4.642467975616455, + "step": 2963 + }, + { + "epoch": 2.8, + "grad_norm": 33.89479064941406, + "learning_rate": 3.7250786988457504e-08, + "logps/chosen": -69.56108093261719, + "logps/rejected": -84.26187133789062, + "loss": 0.4123, + "losses/dpo": 0.42647796869277954, + "losses/sft": 2.724936008453369, + "losses/total": 0.42647796869277954, + "ref_logps/chosen": -45.12382125854492, + "ref_logps/rejected": -39.894134521484375, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4437255859375, + "rewards/margins": 1.9930481910705566, + "rewards/rejected": -4.436773777008057, + "step": 2964 + }, + { + "epoch": 2.8, + "grad_norm": 24.20273780822754, + "learning_rate": 3.707590066456802e-08, + "logps/chosen": -57.218040466308594, + "logps/rejected": -90.99298095703125, + "loss": 0.25, + "losses/dpo": 0.1578081250190735, + "losses/sft": 1.372912883758545, + "losses/total": 0.1578081250190735, + "ref_logps/chosen": -36.720550537109375, + "ref_logps/rejected": -47.78184509277344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0497493743896484, + "rewards/margins": 2.271364212036133, + "rewards/rejected": -4.321113586425781, + "step": 2965 + }, + { + "epoch": 2.8, + "grad_norm": 18.949331283569336, + "learning_rate": 3.6901014340678556e-08, + "logps/chosen": -74.29554748535156, + "logps/rejected": -115.29342651367188, + "loss": 0.1696, + "losses/dpo": 0.07121007144451141, + "losses/sft": 2.2247562408447266, + "losses/total": 0.07121007144451141, + "ref_logps/chosen": -47.42454528808594, + "ref_logps/rejected": -64.8714599609375, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6870999336242676, + "rewards/margins": 2.3550961017608643, + "rewards/rejected": -5.042195796966553, + "step": 2966 + }, + { + "epoch": 2.8, + "grad_norm": 22.12060546875, + "learning_rate": 3.672612801678909e-08, + "logps/chosen": -52.25728225708008, + "logps/rejected": -88.52669525146484, + "loss": 0.1735, + "losses/dpo": 0.3398432731628418, + "losses/sft": 2.1218903064727783, + "losses/total": 0.3398432731628418, + "ref_logps/chosen": -32.640262603759766, + "ref_logps/rejected": -42.928184509277344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.96170175075531, + "rewards/margins": 2.5981497764587402, + "rewards/rejected": -4.55985164642334, + "step": 2967 + }, + { + "epoch": 2.8, + "grad_norm": 29.145397186279297, + "learning_rate": 3.6551241692899614e-08, + "logps/chosen": -60.1705436706543, + "logps/rejected": -102.53408813476562, + "loss": 0.2463, + "losses/dpo": 0.6488381028175354, + "losses/sft": 1.8082618713378906, + "losses/total": 0.6488381028175354, + "ref_logps/chosen": -39.983062744140625, + "ref_logps/rejected": -56.66352081298828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0187482833862305, + "rewards/margins": 2.568307876586914, + "rewards/rejected": -4.5870561599731445, + "step": 2968 + }, + { + "epoch": 2.8, + "grad_norm": 45.12929916381836, + "learning_rate": 3.637635536901014e-08, + "logps/chosen": -77.0869140625, + "logps/rejected": -83.4021987915039, + "loss": 0.4624, + "losses/dpo": 1.125467300415039, + "losses/sft": 2.4663610458374023, + "losses/total": 1.125467300415039, + "ref_logps/chosen": -47.52934265136719, + "ref_logps/rejected": -40.71036911010742, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.955756664276123, + "rewards/margins": 1.3134262561798096, + "rewards/rejected": -4.269183158874512, + "step": 2969 + }, + { + "epoch": 2.8, + "grad_norm": 33.336448669433594, + "learning_rate": 3.620146904512067e-08, + "logps/chosen": -67.48674011230469, + "logps/rejected": -104.83739471435547, + "loss": 0.2856, + "losses/dpo": 0.5604097843170166, + "losses/sft": 2.52388858795166, + "losses/total": 0.5604097843170166, + "ref_logps/chosen": -45.390689849853516, + "ref_logps/rejected": -58.22635269165039, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2096049785614014, + "rewards/margins": 2.4514989852905273, + "rewards/rejected": -4.661104202270508, + "step": 2970 + }, + { + "epoch": 2.81, + "grad_norm": 37.19539260864258, + "learning_rate": 3.60265827212312e-08, + "logps/chosen": -64.23799896240234, + "logps/rejected": -72.99726867675781, + "loss": 0.5128, + "losses/dpo": 0.291890025138855, + "losses/sft": 2.6363255977630615, + "losses/total": 0.291890025138855, + "ref_logps/chosen": -40.86454772949219, + "ref_logps/rejected": -40.237030029296875, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3373451232910156, + "rewards/margins": 0.938678503036499, + "rewards/rejected": -3.2760238647460938, + "step": 2971 + }, + { + "epoch": 2.81, + "grad_norm": 36.78740692138672, + "learning_rate": 3.585169639734173e-08, + "logps/chosen": -51.64378356933594, + "logps/rejected": -76.26048278808594, + "loss": 0.4277, + "losses/dpo": 0.7999288439750671, + "losses/sft": 1.7480086088180542, + "losses/total": 0.7999288439750671, + "ref_logps/chosen": -31.028268814086914, + "ref_logps/rejected": -38.83718490600586, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.061551570892334, + "rewards/margins": 1.6807782649993896, + "rewards/rejected": -3.7423300743103027, + "step": 2972 + }, + { + "epoch": 2.81, + "grad_norm": 26.65422821044922, + "learning_rate": 3.567681007345225e-08, + "logps/chosen": -61.479705810546875, + "logps/rejected": -96.65974426269531, + "loss": 0.2269, + "losses/dpo": 0.13492417335510254, + "losses/sft": 2.4486684799194336, + "losses/total": 0.13492417335510254, + "ref_logps/chosen": -42.64320373535156, + "ref_logps/rejected": -52.526371002197266, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.883650779724121, + "rewards/margins": 2.529686689376831, + "rewards/rejected": -4.413337230682373, + "step": 2973 + }, + { + "epoch": 2.81, + "grad_norm": 25.414209365844727, + "learning_rate": 3.550192374956278e-08, + "logps/chosen": -66.7447509765625, + "logps/rejected": -90.71731567382812, + "loss": 0.3875, + "losses/dpo": 0.10386046022176743, + "losses/sft": 1.5561758279800415, + "losses/total": 0.10386046022176743, + "ref_logps/chosen": -45.63017654418945, + "ref_logps/rejected": -47.700557708740234, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.1114578247070312, + "rewards/margins": 2.190218687057495, + "rewards/rejected": -4.3016767501831055, + "step": 2974 + }, + { + "epoch": 2.81, + "grad_norm": 31.883968353271484, + "learning_rate": 3.5327037425673315e-08, + "logps/chosen": -51.42143249511719, + "logps/rejected": -58.91716003417969, + "loss": 0.4093, + "losses/dpo": 0.3280029296875, + "losses/sft": 1.6561799049377441, + "losses/total": 0.3280029296875, + "ref_logps/chosen": -31.118148803710938, + "ref_logps/rejected": -26.301490783691406, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0303287506103516, + "rewards/margins": 1.231238603591919, + "rewards/rejected": -3.2615673542022705, + "step": 2975 + }, + { + "epoch": 2.81, + "grad_norm": 18.696861267089844, + "learning_rate": 3.5152151101783834e-08, + "logps/chosen": -62.46991729736328, + "logps/rejected": -110.99563598632812, + "loss": 0.1321, + "losses/dpo": 0.14700421690940857, + "losses/sft": 1.9267423152923584, + "losses/total": 0.14700421690940857, + "ref_logps/chosen": -41.423683166503906, + "ref_logps/rejected": -58.88855743408203, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1046228408813477, + "rewards/margins": 3.1060850620269775, + "rewards/rejected": -5.210708141326904, + "step": 2976 + }, + { + "epoch": 2.81, + "grad_norm": 32.2402229309082, + "learning_rate": 3.497726477789437e-08, + "logps/chosen": -57.33751678466797, + "logps/rejected": -98.6733169555664, + "loss": 0.2619, + "losses/dpo": 0.09131017327308655, + "losses/sft": 2.0977137088775635, + "losses/total": 0.09131017327308655, + "ref_logps/chosen": -35.32010269165039, + "ref_logps/rejected": -51.638694763183594, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2017414569854736, + "rewards/margins": 2.501720905303955, + "rewards/rejected": -4.703462600708008, + "step": 2977 + }, + { + "epoch": 2.81, + "grad_norm": 20.025590896606445, + "learning_rate": 3.48023784540049e-08, + "logps/chosen": -54.157447814941406, + "logps/rejected": -86.65713500976562, + "loss": 0.1588, + "losses/dpo": 0.2667253017425537, + "losses/sft": 1.8768854141235352, + "losses/total": 0.2667253017425537, + "ref_logps/chosen": -39.21525573730469, + "ref_logps/rejected": -44.985740661621094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4942193031311035, + "rewards/margins": 2.6729202270507812, + "rewards/rejected": -4.167139530181885, + "step": 2978 + }, + { + "epoch": 2.81, + "grad_norm": 35.18009567260742, + "learning_rate": 3.4627492130115425e-08, + "logps/chosen": -60.42721176147461, + "logps/rejected": -101.02044677734375, + "loss": 0.2923, + "losses/dpo": 0.49971938133239746, + "losses/sft": 2.3838436603546143, + "losses/total": 0.49971938133239746, + "ref_logps/chosen": -36.92533874511719, + "ref_logps/rejected": -52.1884765625, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.350187301635742, + "rewards/margins": 2.5330095291137695, + "rewards/rejected": -4.883196830749512, + "step": 2979 + }, + { + "epoch": 2.81, + "grad_norm": 18.57384490966797, + "learning_rate": 3.445260580622595e-08, + "logps/chosen": -62.590736389160156, + "logps/rejected": -87.78916931152344, + "loss": 0.1762, + "losses/dpo": 0.04849696531891823, + "losses/sft": 1.909379482269287, + "losses/total": 0.04849696531891823, + "ref_logps/chosen": -42.78089904785156, + "ref_logps/rejected": -41.12408447265625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9809837341308594, + "rewards/margins": 2.6855244636535645, + "rewards/rejected": -4.666508197784424, + "step": 2980 + }, + { + "epoch": 2.81, + "grad_norm": 24.224576950073242, + "learning_rate": 3.427771948233648e-08, + "logps/chosen": -57.722599029541016, + "logps/rejected": -90.49748992919922, + "loss": 0.2004, + "losses/dpo": 0.23663319647312164, + "losses/sft": 1.8057953119277954, + "losses/total": 0.23663319647312164, + "ref_logps/chosen": -36.739173889160156, + "ref_logps/rejected": -46.24281311035156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0983424186706543, + "rewards/margins": 2.327125310897827, + "rewards/rejected": -4.425467491149902, + "step": 2981 + }, + { + "epoch": 2.82, + "grad_norm": 16.721393585205078, + "learning_rate": 3.410283315844701e-08, + "logps/chosen": -50.42713165283203, + "logps/rejected": -97.43521118164062, + "loss": 0.1399, + "losses/dpo": 0.1284208744764328, + "losses/sft": 2.3800570964813232, + "losses/total": 0.1284208744764328, + "ref_logps/chosen": -37.059600830078125, + "ref_logps/rejected": -50.89272689819336, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3367533683776855, + "rewards/margins": 3.3174943923950195, + "rewards/rejected": -4.654247760772705, + "step": 2982 + }, + { + "epoch": 2.82, + "grad_norm": 18.984500885009766, + "learning_rate": 3.3927946834557535e-08, + "logps/chosen": -55.87278747558594, + "logps/rejected": -93.3619384765625, + "loss": 0.2242, + "losses/dpo": 0.06402628868818283, + "losses/sft": 2.0018904209136963, + "losses/total": 0.06402628868818283, + "ref_logps/chosen": -37.17364501953125, + "ref_logps/rejected": -48.63155746459961, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8699142932891846, + "rewards/margins": 2.6031241416931152, + "rewards/rejected": -4.473038196563721, + "step": 2983 + }, + { + "epoch": 2.82, + "grad_norm": 18.34654426574707, + "learning_rate": 3.375306051066806e-08, + "logps/chosen": -54.484527587890625, + "logps/rejected": -98.38478088378906, + "loss": 0.1873, + "losses/dpo": 0.060594167560338974, + "losses/sft": 2.0047285556793213, + "losses/total": 0.060594167560338974, + "ref_logps/chosen": -32.541080474853516, + "ref_logps/rejected": -48.39060592651367, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.194344997406006, + "rewards/margins": 2.805072546005249, + "rewards/rejected": -4.999417304992676, + "step": 2984 + }, + { + "epoch": 2.82, + "grad_norm": 19.110803604125977, + "learning_rate": 3.3578174186778594e-08, + "logps/chosen": -59.22347640991211, + "logps/rejected": -93.82707977294922, + "loss": 0.1775, + "losses/dpo": 0.06492815911769867, + "losses/sft": 2.290909767150879, + "losses/total": 0.06492815911769867, + "ref_logps/chosen": -38.76340103149414, + "ref_logps/rejected": -47.39862060546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0460081100463867, + "rewards/margins": 2.5968379974365234, + "rewards/rejected": -4.64284610748291, + "step": 2985 + }, + { + "epoch": 2.82, + "grad_norm": 14.808489799499512, + "learning_rate": 3.3403287862889126e-08, + "logps/chosen": -54.92408752441406, + "logps/rejected": -97.23770141601562, + "loss": 0.1103, + "losses/dpo": 0.056466784328222275, + "losses/sft": 2.340762138366699, + "losses/total": 0.056466784328222275, + "ref_logps/chosen": -36.700828552246094, + "ref_logps/rejected": -51.280738830566406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8223259449005127, + "rewards/margins": 2.7733707427978516, + "rewards/rejected": -4.595696926116943, + "step": 2986 + }, + { + "epoch": 2.82, + "grad_norm": 28.081100463867188, + "learning_rate": 3.3228401538999646e-08, + "logps/chosen": -67.30946350097656, + "logps/rejected": -85.007568359375, + "loss": 0.2949, + "losses/dpo": 0.2029864341020584, + "losses/sft": 2.2702720165252686, + "losses/total": 0.2029864341020584, + "ref_logps/chosen": -40.41299819946289, + "ref_logps/rejected": -39.83832550048828, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.689646005630493, + "rewards/margins": 1.8272786140441895, + "rewards/rejected": -4.516924858093262, + "step": 2987 + }, + { + "epoch": 2.82, + "grad_norm": 26.84359359741211, + "learning_rate": 3.305351521511018e-08, + "logps/chosen": -71.78427124023438, + "logps/rejected": -83.98118591308594, + "loss": 0.257, + "losses/dpo": 0.09366091340780258, + "losses/sft": 2.0670480728149414, + "losses/total": 0.09366091340780258, + "ref_logps/chosen": -47.53776550292969, + "ref_logps/rejected": -41.261619567871094, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4246511459350586, + "rewards/margins": 1.8473058938980103, + "rewards/rejected": -4.271956920623779, + "step": 2988 + }, + { + "epoch": 2.82, + "grad_norm": 22.925477981567383, + "learning_rate": 3.2878628891220704e-08, + "logps/chosen": -57.029563903808594, + "logps/rejected": -101.94001007080078, + "loss": 0.2671, + "losses/dpo": 0.19132955372333527, + "losses/sft": 2.351469039916992, + "losses/total": 0.19132955372333527, + "ref_logps/chosen": -32.821189880371094, + "ref_logps/rejected": -54.15678405761719, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.42083740234375, + "rewards/margins": 2.357485294342041, + "rewards/rejected": -4.778322219848633, + "step": 2989 + }, + { + "epoch": 2.82, + "grad_norm": 15.527331352233887, + "learning_rate": 3.270374256733123e-08, + "logps/chosen": -59.65545654296875, + "logps/rejected": -92.93575286865234, + "loss": 0.1897, + "losses/dpo": 0.20035426318645477, + "losses/sft": 1.5696300268173218, + "losses/total": 0.20035426318645477, + "ref_logps/chosen": -37.63036346435547, + "ref_logps/rejected": -48.36811065673828, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.202509880065918, + "rewards/margins": 2.25425386428833, + "rewards/rejected": -4.456763744354248, + "step": 2990 + }, + { + "epoch": 2.82, + "grad_norm": 34.997982025146484, + "learning_rate": 3.252885624344176e-08, + "logps/chosen": -53.74292755126953, + "logps/rejected": -73.917236328125, + "loss": 0.4323, + "losses/dpo": 0.14197325706481934, + "losses/sft": 1.5360971689224243, + "losses/total": 0.14197325706481934, + "ref_logps/chosen": -34.2266731262207, + "ref_logps/rejected": -40.91029739379883, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9516255855560303, + "rewards/margins": 1.3490679264068604, + "rewards/rejected": -3.3006935119628906, + "step": 2991 + }, + { + "epoch": 2.83, + "grad_norm": 24.652267456054688, + "learning_rate": 3.235396991955229e-08, + "logps/chosen": -56.34262466430664, + "logps/rejected": -77.86944580078125, + "loss": 0.2396, + "losses/dpo": 0.33956053853034973, + "losses/sft": 2.0223419666290283, + "losses/total": 0.33956053853034973, + "ref_logps/chosen": -35.858177185058594, + "ref_logps/rejected": -34.715274810791016, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.048444986343384, + "rewards/margins": 2.2669718265533447, + "rewards/rejected": -4.3154168128967285, + "step": 2992 + }, + { + "epoch": 2.83, + "grad_norm": 20.87251853942871, + "learning_rate": 3.217908359566282e-08, + "logps/chosen": -75.1152114868164, + "logps/rejected": -113.919677734375, + "loss": 0.1501, + "losses/dpo": 0.040977198630571365, + "losses/sft": 2.842691659927368, + "losses/total": 0.040977198630571365, + "ref_logps/chosen": -48.413238525390625, + "ref_logps/rejected": -61.866790771484375, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6701972484588623, + "rewards/margins": 2.5350914001464844, + "rewards/rejected": -5.205288410186768, + "step": 2993 + }, + { + "epoch": 2.83, + "grad_norm": 21.815921783447266, + "learning_rate": 3.2004197271773347e-08, + "logps/chosen": -59.997947692871094, + "logps/rejected": -87.62559509277344, + "loss": 0.2309, + "losses/dpo": 0.41971027851104736, + "losses/sft": 1.6817584037780762, + "losses/total": 0.41971027851104736, + "ref_logps/chosen": -40.48411560058594, + "ref_logps/rejected": -46.842369079589844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.951383113861084, + "rewards/margins": 2.1269400119781494, + "rewards/rejected": -4.0783233642578125, + "step": 2994 + }, + { + "epoch": 2.83, + "grad_norm": 26.691635131835938, + "learning_rate": 3.182931094788387e-08, + "logps/chosen": -61.08686828613281, + "logps/rejected": -94.94357299804688, + "loss": 0.303, + "losses/dpo": 0.11571475863456726, + "losses/sft": 2.303887128829956, + "losses/total": 0.11571475863456726, + "ref_logps/chosen": -39.1475830078125, + "ref_logps/rejected": -49.49618911743164, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1939287185668945, + "rewards/margins": 2.350809335708618, + "rewards/rejected": -4.544737815856934, + "step": 2995 + }, + { + "epoch": 2.83, + "grad_norm": 26.821645736694336, + "learning_rate": 3.1654424623994405e-08, + "logps/chosen": -55.198997497558594, + "logps/rejected": -75.54061889648438, + "loss": 0.281, + "losses/dpo": 0.4803215265274048, + "losses/sft": 2.131777048110962, + "losses/total": 0.4803215265274048, + "ref_logps/chosen": -33.7226676940918, + "ref_logps/rejected": -35.69221878051758, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1476328372955322, + "rewards/margins": 1.8372067213058472, + "rewards/rejected": -3.984839677810669, + "step": 2996 + }, + { + "epoch": 2.83, + "grad_norm": 19.113996505737305, + "learning_rate": 3.1479538300104924e-08, + "logps/chosen": -61.06296157836914, + "logps/rejected": -98.74284362792969, + "loss": 0.1654, + "losses/dpo": 0.18672429025173187, + "losses/sft": 2.309131145477295, + "losses/total": 0.18672429025173187, + "ref_logps/chosen": -40.02710723876953, + "ref_logps/rejected": -52.59275817871094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.103585720062256, + "rewards/margins": 2.511423110961914, + "rewards/rejected": -4.61500883102417, + "step": 2997 + }, + { + "epoch": 2.83, + "grad_norm": 22.21666717529297, + "learning_rate": 3.130465197621546e-08, + "logps/chosen": -46.25444793701172, + "logps/rejected": -73.90019226074219, + "loss": 0.2209, + "losses/dpo": 0.35864341259002686, + "losses/sft": 2.213911294937134, + "losses/total": 0.35864341259002686, + "ref_logps/chosen": -29.645763397216797, + "ref_logps/rejected": -36.79693603515625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6608679294586182, + "rewards/margins": 2.0494577884674072, + "rewards/rejected": -3.7103259563446045, + "step": 2998 + }, + { + "epoch": 2.83, + "grad_norm": 22.629308700561523, + "learning_rate": 3.112976565232599e-08, + "logps/chosen": -57.04795837402344, + "logps/rejected": -71.7969970703125, + "loss": 0.2753, + "losses/dpo": 0.2649767994880676, + "losses/sft": 1.9152793884277344, + "losses/total": 0.2649767994880676, + "ref_logps/chosen": -40.077362060546875, + "ref_logps/rejected": -38.61092758178711, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6970595121383667, + "rewards/margins": 1.6215471029281616, + "rewards/rejected": -3.3186066150665283, + "step": 2999 + }, + { + "epoch": 2.83, + "grad_norm": 29.257476806640625, + "learning_rate": 3.0954879328436515e-08, + "logps/chosen": -64.85026550292969, + "logps/rejected": -99.04454803466797, + "loss": 0.2222, + "losses/dpo": 0.2852153182029724, + "losses/sft": 2.0283477306365967, + "losses/total": 0.2852153182029724, + "ref_logps/chosen": -37.707908630371094, + "ref_logps/rejected": -47.53254318237305, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.714235782623291, + "rewards/margins": 2.436964273452759, + "rewards/rejected": -5.151200294494629, + "step": 3000 + } + ], + "logging_steps": 1.0, + "max_steps": 3177, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}